# Reinforcement Learning - Deep Q Network

In [1]:
import numpy as np
import random
import torch
import copy
from torch import nn
import torch.nn.functional as F
from torch import optim
from tqdm import tqdm as _tqdm
import os
import pickle

def tqdm(*args, **kwargs):
    return _tqdm(*args, **kwargs, mininterval=1) 

%matplotlib inline
import matplotlib.pyplot as plt
import sys
import time

# !pip install gym==0.10.8

import gym
import pandas as pd


assert sys.version_info[:3] >= (3, 6, 0), "Make sure you have Python 3.6 installed!"

In [2]:
class QNetwork(nn.Module):
    
    def __init__(self, num_hidden=128, observation_space=4, action_space=2):
        nn.Module.__init__(self)
        self.l1 = nn.Linear(observation_space, num_hidden)
        self.l2 = nn.Linear(num_hidden, action_space)
        self.relu = nn.ReLU()

    def forward(self, x):
        out = self.relu(self.l1(x))
        out = self.l2(out)

        return out

In [3]:
class ReplayMemory:
    
    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []

    def push(self, transition):
        self.memory.append(transition)
        if len(self.memory) > self.capacity:
            self.memory.pop(0)

    def sample(self, batch_size):
        batch = random.sample(self.memory, batch_size)
        return batch

    def __len__(self):
        return len(self.memory)

In [4]:
class EpsilonGreedyPolicy(object):
    """
    A simple epsilon greedy policy.
    """
    def __init__(self, Q, epsilon):
        self.Q = Q
        self.epsilon = epsilon
    
    def sample_action(self, obs, device):
        """
        This method takes a state as input and returns an action sampled from this policy.  

        Args:
            obs: current state

        Returns:
            An action (int).
        """
        if random.random() < self.epsilon:
            action = random.randint(0, 1)
        else:
            with torch.no_grad():
                action = torch.argmax(self.Q(torch.Tensor(obs).to(device))).item()
        return int(action)
        
    def set_epsilon(self, epsilon):
        self.epsilon = epsilon

In [5]:
def get_epsilon(it, start=1, end=0.05, length=1000):
    if it < length:
        epsilon =  - it/(1000/0.95)
    else:
        epsilon = 0.05
    return epsilon

def smooth(x, N):
    cumsum = np.cumsum(np.insert(x, 0, 0)) 
    return (cumsum[N:] - cumsum[:-N]) / float(N)

def compute_q_vals(Q, states, actions, device):
    """
    This method returns Q values for given state action pairs.
    
    Args:
        Q: Q-net
        states: a tensor of states. Shape: batch_size x obs_dim
        actions: a tensor of actions. Shape: Shape: batch_size x 1

    Returns:
        A torch tensor filled with Q values. Shape: batch_size x 1.
    """
    QValues = Q(states.to(device))[torch.arange(len(actions)).to(device), actions.squeeze().to(device)]
    return QValues.unsqueeze(dim=1)

def compute_targets(Q, rewards, next_states, dones, discount_factor, target_net, device):
    """
    This method returns targets (values towards which Q-values should move).
    
    Args:
        Q: Q-net
        rewards: a tensor of actions. Shape: Shape: batch_size x 1
        next_states: a tensor of states. Shape: batch_size x obs_dim
        dones: a tensor of boolean done flags (indicates if next_state is terminal) Shape: batch_size x 1
        discount_factor: discount
    Returns:
        A torch tensor filled with target values. Shape: batch_size x 1.
    """
    maxQs = torch.max(target_net(next_states.to(device)), 1).values
    if isinstance(dones, torch.BoolTensor):
        targets = rewards.to(device) + discount_factor * maxQs.unsqueeze(dim=1).to(device) * (~dones).to(device)
    else:
        targets = rewards.to(device) + discount_factor * maxQs.unsqueeze(dim=1).to(device) * (1 - dones).to(device)
    return targets
    
def train(Q, memory, optimizer, batch_size, discount_factor, target_net, device, error_clipping=True):
    # don't learn without some decent experience
    if len(memory) < batch_size:
        return None, None

    # random transition batch is taken from experience replay memory
    transitions = memory.sample(batch_size)
    
    # transition is a list of 4-tuples, instead we want 4 vectors (as torch.Tensor's)
    state, action, reward, next_state, done = zip(*transitions)
    
    # convert to PyTorch and define types
    state = torch.tensor(state, dtype=torch.float)
    action = torch.tensor(action, dtype=torch.int64)[:, None]  # Need 64 bit to use them as index
    next_state = torch.tensor(next_state, dtype=torch.float)
    reward = torch.tensor(reward, dtype=torch.float)[:, None]
    done = torch.tensor(done, dtype=torch.uint8)[:, None]  # Boolean
    
    # compute the q value
    q_val = compute_q_vals(Q, state, action, device)
    max_q = torch.max(torch.abs(q_val)).item()

    with torch.no_grad():  # Don't compute gradient info for the target (semi-gradient)
        target = compute_targets(Q, reward, next_state, done, discount_factor, target_net, device)
    
    # loss is measured from error between current and newly expected Q values
    loss = F.smooth_l1_loss(q_val, target)
    
    if error_clipping:
        loss = torch.clamp(loss, min=-1, max=1)

    # backpropagation of loss to Neural Network (PyTorch magic)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    return loss.item(), max_q

In [6]:

def run_episodes(train, Q, policy, memory, env, num_episodes, batch_size, discount_factor, learn_rate, device, rendering_criterium=999999, use_target_net=10, error_clipping=True):
    optimizer = optim.Adam(Q.parameters(), learn_rate)
    if use_target_net:
        TARGET_UPDATE = use_target_net
        target_net = copy.deepcopy(Q)
        target_net.eval()
    else:
        target_net = Q
    
    global_steps = 0  # Count the steps (do not reset at episode start, to compute epsilon)
    episode_scores = []  #
    weights = []
    max_Qs = []
    rendering = False
    for i in tqdm(range(num_episodes)):
        state = env.reset()
        
        max_Qs_episode = []
        score = 0
        while True:
            policy.set_epsilon(get_epsilon(global_steps))
            action = policy.sample_action(state, device)
            state_next, reward, done, _ = env.step(action)                

            loss, max_Q_batch = train(Q, memory, optimizer, batch_size, discount_factor, target_net, device, error_clipping=error_clipping)

            max_Qs_episode.append(max_Q_batch)

            global_steps += 1
            score += reward

            if rendering:
                env.render()
                time.sleep(0.05)

            memory.push((state, action, reward, state_next, done))
            state = state_next

            if done:
                episode_scores.append(score)

                if all([q != None for q in max_Qs_episode]):
                    max_Qs.append(max(max_Qs_episode))
                else:
                    max_Qs.append(None)
                    
                weights.append(copy.deepcopy(Q).state_dict().copy())

                if score > rendering_criterium:
                    rendering = True
                else:
                    rendering = False
                break
                
            if use_target_net and (i % TARGET_UPDATE == 0):
                target_net.load_state_dict(Q.state_dict())
            
        env.close()

    return episode_scores, weights, max_Qs

In [110]:
cuda_ = "cuda:0"
device = torch.device(cuda_ if torch.cuda.is_available() else "cpu")

results = []
all_episode_weights = []

print(device)

# baseline tuning to env --------------------------------------------  

test_envs = ["CartPole-v1", "Acrobot-v1", "MountainCar-v0"]
runs_per_setup = 15
num_episodes = 1000

# NO Tricks
for error_clipping in [False]:
    for use_target in [0]:       
        for memory_size in [64]:                   
    
            for env in test_envs:                                          # 3
                for i in range(runs_per_setup):                            # 15
                    i += 3
                    
                    # tuning these params
                    for discount_factor in [0.99, 0.8, 0.9]:               # 3       
                        for learn_rate in [1e-3, 1e-4]:                    # 2
                            for num_hidden in [128, 256]:                  # 2
                                                                           
                                if memory_size < 64:
                                    batch_size = memory_size
                                else:
                                    batch_size = 64

                                seed = i # same seed for different setups, different for every run
                                new_env = gym.envs.make(env)
                                memory = ReplayMemory(memory_size)

                                random.seed(seed)
                                torch.manual_seed(seed)
                                new_env.seed(seed)

                                setup = {"env":env, "batch_size":batch_size, "use_target":use_target, "num_hidden":num_hidden, "learn_rate":learn_rate, "memory_size":memory_size, "discount_factor":discount_factor, "error_clipping":error_clipping, "run":i}

                                try:
                                    # Continious obs space
                                    Q_net = QNetwork(num_hidden, observation_space=new_env.observation_space.low.shape[0], action_space=new_env.action_space.n).to(device)
                                    obs_space = new_env.observation_space.low.shape[0]
                                except:
                                    # Discrete obs space
                                    Q_net = QNetwork(num_hidden, observation_space=new_env.observation_space.n, action_space=new_env.action_space.n).to(device)
                                    obs_space = new_env.observation_space.n

                                policy = EpsilonGreedyPolicy(Q_net, 0.05)
                                episode_scores, episode_weights, max_Qs_episodes = run_episodes(train, Q_net, policy, memory, new_env, num_episodes, batch_size, discount_factor, learn_rate, device, use_target_net=use_target, error_clipping=error_clipping)        

                                # # L2 norm calculation
                                l2_norms=[]
                                for weights in episode_weights:
                                    all_weights = []
                                    for layer in weights.keys():
                                        all_weights.append(weights[layer].flatten())
                                    all_weights = torch.cat(all_weights, dim=0)
                                    l2_norms.append(torch.norm(((all_weights)), 2, -1))

                                # # Save results
                                results.append({**setup, **{"scores":episode_scores, "l2_norms":l2_norms, "maxQs": max_Qs_episodes, "soft_convergence_thresh":(1/(1-discount_factor))}})
                                df = pd.DataFrame(results)
                                df.to_csv(f"baseline_tune_results.csv")



  0%|          | 0/1000 [00:00<?, ?it/s]cpu
 14%|█▎        | 135/1000 [00:02<00:14, 61.33it/s]


KeyboardInterrupt: 

In [8]:
import glob
from collections import defaultdict

all_df = pd.DataFrame()
runs_per_setup = 10

print("Assure all files are here")
for fn in glob.glob("baseline*"):
    print(fn)
    df = pd.read_csv(fn)
    all_df = all_df.append(df, ignore_index=True)

baseline_df = pd.DataFrame(columns=['discount_factor', 'learn_rate', 'num_hidden',  "avg_max_score", "avg_max_qs", "env", "num_runs", "scores", "q_values"])
for discount_factor in [0.99, 0.8, 0.9]:               # 3       
    for learn_rate in [1e-3, 1e-4]:                    # 2
        for num_hidden in [128, 256]:                  # 2
            for env in ["CartPole-v1", "Acrobot-v1", "MountainCar-v0"]:

                a = all_df.loc[(all_df['discount_factor'] == discount_factor) & (all_df['learn_rate'] == learn_rate) & (all_df['num_hidden'] == num_hidden) & (all_df['env'] == env)]

                scores = []
                max_scores = []
                q_values = []
                max_qs = []
                for index, row in a.iterrows():
                    scores.append(eval(row["scores"]))
                    max_scores.append(max(eval(row["scores"])))
                    q_values.append(eval(row["maxQs"])[10:])
                    max_qs.append(max(eval(row["maxQs"])[10:]))
                avg_max_score = sum(max_scores)/len(max_scores)
                avg_max_qs = sum(max_qs)/len(max_qs)

                baseline_df = baseline_df.append({'discount_factor':discount_factor, 'learn_rate':learn_rate, 'num_hidden':num_hidden, "scores":scores, "avg_max_score":avg_max_score, "q_values":q_values, "avg_max_qs":avg_max_qs, "env":env, "num_runs":len(a)}, ignore_index=True)

best_setups = {}
best_baseline_params = {}

for env in ["CartPole-v1", "Acrobot-v1", "MountainCar-v0"]:
    env_results = baseline_df.loc[baseline_df['env'] == env]
    best_setups[env] = env_results.iloc[int(env_results['avg_max_score'].argmax())]
    best_baseline_params[env] = env_results.iloc[int(env_results['avg_max_score'].argmax())][["discount_factor", "learn_rate", "num_hidden"]]

for env in ["CartPole-v1", "Acrobot-v1", "MountainCar-v0"]:
    print(env)
    # print(best_setups[env][["discount_factor", "learn_rate", "num_hidden"]])
    print(best_baseline_params[env]["discount_factor"])

Assure all files are here
baseline_tune_results_longxiang.csv
baseline_tune_results_Alex.csv
baseline_tune_results_Jan.csv
baseline_tune_results_Fab.csv
baseline_tune_results_ALL.csv
CartPole-v1
0.8
Acrobot-v1
0.99
MountainCar-v0
0.99


In [9]:
cuda_ = "cuda:0"
device = torch.device(cuda_ if torch.cuda.is_available() else "cpu")

results = []
all_episode_weights = []

print(device)

# Trick tuning --------------------------------------------  

test_envs = ["CartPole-v1", "Acrobot-v1", "MountainCar-v0"]
runs_per_setup = 15
num_episodes = 1000

for env in test_envs:                                               # 3
    # set best baseline paramaters per env
    discount_factor = best_baseline_params[env]["discount_factor"]
    num_hidden = best_baseline_params[env]["num_hidden"]
    learn_rate = best_baseline_params[env]["learn_rate"]

    learn_rate = float(learn_rate)
    num_hidden = int(num_hidden)
    discount_factor = float(discount_factor)
    
    for i in range(runs_per_setup):                                 # 15
        i += 3
        for error_clipping in [False, True]:                        # 2
            for use_target in [0, 25, 250, 2500]:                   # 4
                for memory_size in [64, 5000, 10000, 20000]:        # 4
                                                                    
                                
                    if memory_size < 64:
                        batch_size = memory_size
                    else:
                        batch_size = 64

                    seed = i # same seed for different setups, different for every run
                    new_env = gym.envs.make(env)
                    memory = ReplayMemory(memory_size)

                    random.seed(seed)
                    torch.manual_seed(seed)
                    new_env.seed(seed)

                    setup = {"env":env, "batch_size":batch_size, "use_target":use_target, "num_hidden":num_hidden, "learn_rate":learn_rate, "memory_size":memory_size, "discount_factor":discount_factor, "error_clipping":error_clipping, "run":i}

                    print(setup)

                    try:
                        # Continious obs space
                        Q_net = QNetwork(num_hidden, observation_space=int(new_env.observation_space.low.shape[0]), action_space=new_env.action_space.n).to(device)
                        obs_space = new_env.observation_space.low.shape[0]
                    except:
                    #     # Discrete obs space
                        Q_net = QNetwork(num_hidden, observation_space=new_env.observation_space.n, action_space=new_env.action_space.n).to(device)
                        obs_space = new_env.observation_space.n 
                      
                    policy = EpsilonGreedyPolicy(Q_net, 0.05)
                    episode_scores, episode_weights, max_Qs_episodes = run_episodes(train, Q_net, policy, memory, new_env, num_episodes, batch_size, discount_factor, learn_rate, device, use_target_net=use_target, error_clipping=error_clipping)        

                    # # L2 norm calculation
                    l2_norms=[]
                    for weights in episode_weights:
                        all_weights = []
                        for layer in weights.keys():
                            all_weights.append(weights[layer].flatten())
                        all_weights = torch.cat(all_weights, dim=0)
                        l2_norms.append(torch.norm(((all_weights)), 2, -1))

                    # # Save results
                    results.append({**setup, **{"scores":episode_scores, "l2_norms":l2_norms, "maxQs": max_Qs_episodes, "soft_convergence_thresh":(1/(1-discount_factor))}})
                    df = pd.DataFrame(results)
                    df.to_csv(f"trick_tune_results.csv")


batch_size': 64, 'use_target': 25, 'num_hidden': 128, 'learn_rate': 0.001, 'memory_size': 64, 'discount_factor': 0.8, 'run': 1}
100%|██████████| 1000/1000 [02:19<00:00,  7.18it/s]
  0%|          | 0/1000 [00:00<?, ?it/s]{'env': 'CartPole-v1', 'batch_size': 64, 'use_target': 25, 'num_hidden': 128, 'learn_rate': 0.001, 'memory_size': 5000, 'discount_factor': 0.8, 'run': 1}
100%|██████████| 1000/1000 [03:01<00:00,  5.52it/s]
  0%|          | 0/1000 [00:00<?, ?it/s]{'env': 'CartPole-v1', 'batch_size': 64, 'use_target': 25, 'num_hidden': 128, 'learn_rate': 0.001, 'memory_size': 10000, 'discount_factor': 0.8, 'run': 1}
100%|██████████| 1000/1000 [03:09<00:00,  3.70it/s]
  0%|          | 0/1000 [00:00<?, ?it/s]{'env': 'CartPole-v1', 'batch_size': 64, 'use_target': 25, 'num_hidden': 128, 'learn_rate': 0.001, 'memory_size': 20000, 'discount_factor': 0.8, 'run': 1}
100%|██████████| 1000/1000 [03:01<00:00,  6.23it/s]
  0%|          | 0/1000 [00:00<?, ?it/s]{'env': 'CartPole-v1', 'batch_size': 64,

In [30]:
all_df = pd.DataFrame()
runs_per_setup = 10

print("Assure all files are here")
for fn in glob.glob("trick*"):
    print(fn)
    df = pd.read_csv(fn)
    if 'error_clipping' not in df:
        df["error_clipping"] = [int(i/4)%2 for i in range(len(df))]
    all_df = all_df.append(df, ignore_index=True)

trick_df = pd.DataFrame(columns=["error_clipping", "memory_size", "use_target",  "avg_max_score", "avg_max_qs", "env", "num_runs", "scores", "q_values", "error_clipping_bool", "memory_size_bool", "use_target_bool", "soft_div_thresh"])
for error_clipping in [False, True]:                        # 2
    for use_target in [0, 25, 250, 2500]: 
        for memory_size in [64, 5000, 10000, 20000]:        # 4
            for env in ["CartPole-v1", "Acrobot-v1", "MountainCar-v0"]:

                a = all_df.loc[(all_df['use_target'] == use_target) & (all_df['error_clipping'] == error_clipping) & (all_df['memory_size'] == memory_size) & (all_df['env'] == env)]

                scores = []
                max_scores = []
                q_values = []
                max_qs = []
                for index, row in a.iterrows():
                    scores.append(eval(row["scores"]))
                    max_scores.append(max(eval(row["scores"])))
                    q_values.append(eval(row["maxQs"])[10:])
                    max_qs.append(max(eval(row["maxQs"])[10:]))
                    discount = row["discount_factor"]
                
                if len(max_scores):
                    avg_max_score = sum(max_scores)/len(max_scores)
                    avg_max_qs = sum(max_qs)/len(max_qs)

                trick_df = trick_df.append({'use_target':use_target, 'error_clipping':error_clipping, 'memory_size':memory_size, "scores":scores, "avg_max_score":avg_max_score, "q_values":q_values, "avg_max_qs":avg_max_qs, "env":env, "num_runs":len(a), 'use_target_bool':bool(use_target), 'error_clipping_bool':bool(error_clipping), 'memory_size_bool':bool(memory_size-64), "soft_div_thresh":(1/(1-discount))}, ignore_index=True)

best_setups = {}

for env in ["CartPole-v1", "Acrobot-v1", "MountainCar-v0"]:
    env_results = trick_df.loc[trick_df['env'] == env]
    best_setups[env] = env_results.iloc[int(env_results['avg_max_score'].argmax())]

for env in ["CartPole-v1", "Acrobot-v1", "MountainCar-v0"]:
    print(env)
    print(best_setups[env][["use_target", "error_clipping", "memory_size"]])

trick_df[["env", "use_target", "error_clipping", "memory_size", "avg_max_score", "avg_max_qs", "num_runs", "soft_div_thresh"]].sort_values(by=['env', 'avg_max_score', "avg_max_qs"], ascending=[True, False, True]).to_html("overview_trick_tune.html")

best_q_setups = pd.DataFrame(columns=["error_clipping", "memory_size", "use_target",  "avg_max_score", "avg_max_qs", "soft_div_thresh", "env", "num_runs", "scores", "q_values", "error_clipping_bool", "memory_size_bool", "use_target_bool"])

for env in ["CartPole-v1", "Acrobot-v1", "MountainCar-v0"]:
    for error_clipping in [True, False]:
        for memory_size in [True, False]:
            for use_target in [True, False]:

                sub_df = trick_df.loc[(trick_df['use_target_bool'] == use_target) & (trick_df['error_clipping_bool'] == error_clipping) & (trick_df['memory_size_bool'] == memory_size) & (trick_df['env'] == env)]

                no_soft_divergence = sub_df.loc[(sub_df["avg_max_qs"] <= sub_df["soft_div_thresh"])]

                if len(no_soft_divergence):
                    best_q_setups = best_q_setups.append(no_soft_divergence.iloc[int(no_soft_divergence['avg_max_score'].argmax())])

                else:
                    best_q_setups = best_q_setups.append(sub_df.iloc[int(sub_df['avg_max_qs'].argmin())])


                # sub_df[["env", "use_target", "error_clipping", "memory_size", "avg_max_score", "avg_max_qs", "num_runs"]].sort_values(by=['env', 'avg_max_score', "avg_max_qs"], ascending=[True, False, True]).to_html(f"results_{env}_clip_{error_clipping}_mem_{memory_size}_targ_{use_target}.html")

best_q_setups[["env", "use_target", "error_clipping", "memory_size", "avg_max_score", "avg_max_qs", "num_runs", "soft_div_thresh"]].sort_values(by=['env', 'avg_max_score', "avg_max_qs"], ascending=[True, False, True]).to_html("best_q_setups.html")
best_q_setups.sort_values(by=['env', 'avg_max_score', "avg_max_qs"], ascending=[True, False, True]).to_csv("best_q_setups.csv")

# # NOTES ===========================================================
# Everything averaged over 10 runs
# Error clipping never positive effect on max score or max q-values

# For every env clear best setup if we'd allow slight soft conv exceedance 
# (upper paragraph in every section below)

# If we'd not allow slight soft conv exceedance, 
# Acrobot and Cartpole take big hit on performance
# less of a hit for Mountaincar, but performance already quite sucks
# (lower paragraph in every section below) 

# Acrobot -------------------------------------------------------

# <Almost> Best performing setup (+ close to soft conv limit): 
# use_target: 0, error_clipping: False, memory_size: 10000
# avg_max_score: -61.8
# avg_max_qs: 124.9 <<<< close to soft conv limit (100) <<<<<<<<<<<<<<<

# Best performing setup: 
# use_target: 0, error_clipping: False, memory_size: 5000
# avg_max_score: -61.7
# avg_max_qs: 230.686954

# Best performing setup with avg Q-values under the soft conv limit: 
# use_target: 2500, error_clipping: False, memory_size: 64
# avg_max_score: -161.2
# avg_max_qs: 63.101186

# NO TRICKS 
# Acrobot-v1	0	False	64	-65.6	786.503452	10

# CartPole -------------------------------------------------------

# max performing setup, with best results in terms of avg_max_qs: 
# use_target: 25, error_clipping: True/False, memory_size: 10000
# avg_max_score: 500
# avg_max_qs: 6.9 <<<< close to soft conv limit (5) <<<

# Side note: several best performing setups: 
# various setups unaffected by mem_size, error clipping, target size in [0, 25]
# avg_max_score: 500

# Best performing setup with avg Q-values under the soft conv limit: 
# use_target: 2500, error_clipping: True/False, memory_size: 64
# avg_max_score: 65.2 <<<<< big hit on performance
# avg_max_qs: 2.48

# MountainCar -------------------------------------------------------

# Best performing setup: 
# use_target: 0, error_clipping: False, memory_size: 64 <<<<< NO TRICKS
# avg_max_score: 	-135.1
# avg_max_qs: 107.268062 <<< Very close to soft conv limit (100) <<<<<<<<<<<<<<<

# Best performing setup with avg Q-values under the soft conv limit: 
# use_target: 250, error_clipping: False, memory_size: 20000
# avg_max_score: -147.6
# avg_max_qs: 22.273739






Assure all files are here
trick_tune_results_Alex_2.csv
trick_tune_results_Fabrizio.csv
trick_tune_results_Alex.csv
trick_tune_results_Fabrizio_2.csv
trick_tune_results_Longxiang_2.csv
trick_tune_results_Jan_2.csv
trick_tune_results_Longxiang.csv
trick_tune_results_Jan.csv
CartPole-v1
use_target            0
error_clipping    False
memory_size          64
Name: 0, dtype: object
Acrobot-v1
use_target            0
error_clipping    False
memory_size        5000
Name: 4, dtype: object
MountainCar-v0
use_target            0
error_clipping    False
memory_size          64
Name: 2, dtype: object


In [10]:
cuda_ = "cuda:0"
device = torch.device(cuda_ if torch.cuda.is_available() else "cpu")

results = []
all_episode_weights = []

print(device)

# validation runs of best setups --------------------------------------------  

test_envs = ["CartPole-v1", "Acrobot-v1", "MountainCar-v0"]
runs_per_setup = 100
num_episodes = 1000

work_divider = {"J":150, "L":200, "F":300, "A":400}

# Who are you?
name = "J"

assert name in work_divider, f"name should be one of {list(work_divider.keys())}"

for i in range(runs_per_setup):                             
    for env in test_envs:                                        
        
        # set best baseline paramaters
        discount_factor = best_baseline_params[env]["discount_factor"]
        num_hidden = best_baseline_params[env]["num_hidden"]
        learn_rate = best_baseline_params[env]["learn_rate"]

        learn_rate = float(learn_rate)
        num_hidden = int(num_hidden)
        discount_factor = float(discount_factor)

        for error_clipping_bool in [False, True]:                        
            for use_target_bool in [False, True]:           
                for memory_size_bool in [False, True]:          
                                                               
                    # set best trick params
                    error_clipping = best_q_setups.loc[(best_q_setups['use_target_bool'] == use_target_bool) & (best_q_setups['error_clipping_bool'] == error_clipping_bool) & (best_q_setups['memory_size_bool'] == memory_size_bool) & (best_q_setups['env'] == env)]["error_clipping"].item()

                    memory_size = best_q_setups.loc[(best_q_setups['use_target_bool'] == use_target_bool) & (best_q_setups['error_clipping_bool'] == error_clipping_bool) & (best_q_setups['memory_size_bool'] == memory_size_bool) & (best_q_setups['env'] == env)]["memory_size"].item()
                    
                    use_target = best_q_setups.loc[(best_q_setups['use_target_bool'] == use_target_bool) & (best_q_setups['error_clipping_bool'] == error_clipping_bool) & (best_q_setups['memory_size_bool'] == memory_size_bool) & (best_q_setups['env'] == env)]["use_target"].item()

                    if memory_size < 64:
                        batch_size = memory_size
                    else:
                        batch_size = 64

                    seed = i + work_divider[name] # same seed for different setups, different for every run
                    new_env = gym.envs.make(env)
                    memory = ReplayMemory(memory_size)

                    random.seed(seed)
                    torch.manual_seed(seed)
                    new_env.seed(seed)

                    setup = {"env":env, "batch_size":batch_size, "use_target":use_target, "num_hidden":num_hidden, "learn_rate":learn_rate, "memory_size":memory_size, "discount_factor":discount_factor, "error_clipping":error_clipping, "run":seed}

                    print(setup)

                    try:
                        # Continious obs space
                        Q_net = QNetwork(num_hidden, observation_space=int(new_env.observation_space.low.shape[0]), action_space=new_env.action_space.n).to(device)
                        obs_space = new_env.observation_space.low.shape[0]
                    except:
                    #     # Discrete obs space
                        Q_net = QNetwork(num_hidden, observation_space=new_env.observation_space.n, action_space=new_env.action_space.n).to(device)
                        obs_space = new_env.observation_space.n 
                      
                    policy = EpsilonGreedyPolicy(Q_net, 0.05)
                    episode_scores, episode_weights, max_Qs_episodes = run_episodes(train, Q_net, policy, memory, new_env, num_episodes, batch_size, discount_factor, learn_rate, device, use_target_net=use_target, error_clipping=error_clipping)        

                    # # L2 norm calculation
                    l2_norms=[]
                    for weights in episode_weights:
                        all_weights = []
                        for layer in weights.keys():
                            all_weights.append(weights[layer].flatten())
                        all_weights = torch.cat(all_weights, dim=0)
                        l2_norms.append(torch.norm(((all_weights)), 2, -1))

                    # # Save results
                    results.append({**setup, **{"scores":episode_scores, "l2_norms":l2_norms, "maxQs": max_Qs_episodes, "soft_convergence_thresh":(1/(1-discount_factor))}})
                    df = pd.DataFrame(results)
                    df.to_csv(f"validation_results_{name}_2.csv")



scount_factor': 0.99, 'error_clipping': False, 'run': 150}
100%|██████████| 1000/1000 [03:46<00:00,  4.42it/s]
  0%|          | 0/1000 [00:00<?, ?it/s][33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
{'env': 'MountainCar-v0', 'batch_size': 64, 'use_target': 250, 'num_hidden': 128, 'learn_rate': 0.0001, 'memory_size': 20000, 'discount_factor': 0.99, 'error_clipping': False, 'run': 150}
100%|██████████| 1000/1000 [03:51<00:00,  4.32it/s]
  0%|          | 0/1000 [00:00<?, ?it/s][33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
{'env': 'MountainCar-v0', 'batch_size': 64, 'use_target': 0, 'num_hidden': 128, 'learn_rate': 0.0001, 'memory_size': 64, 'discount_factor': 0.99, 'error_clipping': True, 'run': 150}
100%|██████████| 1000/1000 [03:54<00:00,  4.39it/s]
  0%|          | 0/1000 [00:00<?, ?it/s][33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please

KeyboardInterrupt: 

In [38]:
all_df = pd.DataFrame()
runs_per_setup = 10

print("Assure all files are here")
for fn in glob.glob("validation_results*.csv"):
    print(fn)
    df = pd.read_csv(fn)
    all_df = all_df.append(df, ignore_index=True)

# all_df = all_df.append(best_q_setups, ignore_index=True)

for fn in glob.glob("trick*.csv"):
    print(fn)
    df = pd.read_csv(fn)
    if 'error_clipping' not in df:
        df["error_clipping"] = [int(i/4)%2 for i in range(len(df))]
    all_df = all_df.append(df, ignore_index=True)

validation_df = pd.DataFrame(columns=["error_clipping", "memory_size", "use_target",  "avg_max_score", "avg_max_qs", "env", "num_runs", "scores", "q_values", "error_clipping_bool", "memory_size_bool", "use_target_bool", "soft_div_thresh"])

for error_clipping_bool in [False, True]:                        
    for use_target_bool in [False, True]:           
        for memory_size_bool in [False, True]:          
            for env in ["CartPole-v1", "Acrobot-v1", "MountainCar-v0"]:
                                                        
                # set best trick params
                error_clipping = best_q_setups.loc[(best_q_setups['use_target_bool'] == use_target_bool) & (best_q_setups['error_clipping_bool'] == error_clipping_bool) & (best_q_setups['memory_size_bool'] == memory_size_bool) & (best_q_setups['env'] == env)]["error_clipping"].item()

                memory_size = best_q_setups.loc[(best_q_setups['use_target_bool'] == use_target_bool) & (best_q_setups['error_clipping_bool'] == error_clipping_bool) & (best_q_setups['memory_size_bool'] == memory_size_bool) & (best_q_setups['env'] == env)]["memory_size"].item()
                
                use_target = best_q_setups.loc[(best_q_setups['use_target_bool'] == use_target_bool) & (best_q_setups['error_clipping_bool'] == error_clipping_bool) & (best_q_setups['memory_size_bool'] == memory_size_bool) & (best_q_setups['env'] == env)]["use_target"].item()


                a = all_df.loc[(all_df['use_target'] == use_target) & (all_df['error_clipping'] == error_clipping) & (all_df['memory_size'] == memory_size) & (all_df['env'] == env)]
                
                if not len(a):
                    continue

                scores = []
                max_scores = []
                q_values = []
                max_qs = []
                for index, row in a.iterrows():
                    scores.append(eval(row["scores"]))
                    max_scores.append(max(eval(row["scores"])))
                    q_values.append(eval(row["maxQs"])[10:])
                    max_qs.append(max(eval(row["maxQs"])[10:]))
                    discount = row["discount_factor"]
                
                if len(max_scores):
                    avg_max_score = sum(max_scores)/len(max_scores)
                    avg_max_qs = sum(max_qs)/len(max_qs)

                validation_df = validation_df.append({'use_target':use_target, 'error_clipping':error_clipping, 'memory_size':memory_size, "scores":scores, "avg_max_score":avg_max_score, "q_values":q_values, "avg_max_qs":avg_max_qs, "env":env, "num_runs":len(a), 'use_target_bool':bool(use_target), 'error_clipping_bool':bool(error_clipping), 'memory_size_bool':bool(memory_size-64), "soft_div_thresh":(1/(1-discount))}, ignore_index=True)


validation_df[["env", "use_target", "error_clipping", "memory_size", "avg_max_score", "avg_max_qs", "num_runs", "soft_div_thresh"]].sort_values(by=['env', 'avg_max_score', "avg_max_qs"], ascending=[True, False, True]).to_html("validation_df.html")

validation_df.sort_values(by=['env', 'avg_max_score', "avg_max_qs"], ascending=[True, False, True]).to_csv("validation_df.csv")



Assure all files are here
validation_results_J.csv
validation_results_J_2.csv
trick_tune_results_Alex_2.csv
trick_tune_results_Fabrizio.csv
trick_tune_results_Alex.csv
trick_tune_results_Fabrizio_2.csv
trick_tune_results_Longxiang_2.csv
trick_tune_results_Jan_2.csv
trick_tune_results_Longxiang.csv
trick_tune_results_Jan.csv
