In [1]:
## Imports

import gym 
import numpy as np

from function_approximators.function_approximators import NeuralNetwork, LinearModel, DecisionTree, RandomForest, SupportVectorRegressor, KNeighboursRegressor, GaussianProcess, OnlineGaussianProcess
from sklearn.gaussian_process.kernels import RBF
from sklearn.metrics.pairwise import rbf_kernel

from utils.train_utils import train, solve, train_time

from agents.agents import DQNAgent, LinearAgent, FQIAgent, OnlineGaussianProccessAgent
import operator


In [2]:
## Environment

function_approximators = [NeuralNetwork, LinearModel, DecisionTree, RandomForest, SupportVectorRegressor, KNeighboursRegressor, GaussianProcess, OnlineGaussianProcess]

agents = [DQNAgent, LinearAgent, *[FQIAgent]*5, OnlineGaussianProccessAgent]

RENDER = False
env = gym.make("LunarLander-v2")
environment = "lunarlander"

In [3]:
# DQN Config
CONFIG_DQN = {
    "episode_length": 500,
    "max_timesteps": 200000,
    "max_time": 30 * 60,
    "eval_freq": 10000, 
    "eval_episodes": 5,
    "learning_rate": 0.0015,
    "hidden_size": (256,128),
    "target_update_freq": 100,
    "batch_size": 64,
    "gamma": 0.99,
    "buffer_capacity": int(1e6),
    "plot_loss": False,
    "epsilon": 1,
    "max_deduct": 0.95,
    "decay": 0.1,
    "lr_step_size": 1000,
    "lr_gamma": 0.99,
    "max_steps": 500,
    "non_param": False,
}

# Linear Config
CONFIG_LINEAR = {
    "episode_length": 500,
    "max_timesteps": 200000,
    "max_time": 30 * 60,
    "eval_freq": 10000, 
    "eval_episodes": 5,
    "learning_rate": 0.02,
    "target_update_freq": 50,
    "batch_size": 64,
    "gamma": 0.99,
    "buffer_capacity": int(1e5),
    "plot_loss": False,
    "epsilon": 1,
    "max_steps": 500,
    "poly_degree": 4,
    "max_deduct": 0.95,
    "decay": 0.1,
    "lr_step_size": 1000,
    "lr_gamma": 0.99,
    "non_param": False,
}

# Decision Tree Config
CONFIG_DT = {
    "episode_length": 500,
    "max_timesteps": 200000,
    "max_time": 30 * 60,
    "eval_freq": 10000, 
    "eval_episodes": 5,
    "model_save_freq": 0,
    "model_save_capacity": 0,
    "update_freq": 100,
    "batch_size": 128,
    "gamma": 0.99,
    "buffer_capacity": int(1e6),
    "epsilon": 1,
    "max_deduct": 0.9,
    "decay": 0.4,
    "max_steps": 500,
    "non_param": True,
    "model_params": {"criterion":"mse","max_depth": 20, "min_samples_split": 20, "min_samples_leaf": 5},
    "feature_names": ["Cart Position", "Cart Velocity", "Pole Angle", "Pole Angular Velocity", "Action: Push Left", "Action: Push Right"],
    "plot_name": "dt_depth=8",
}

# Random Forest Config
CONFIG_RF = {
    "episode_length": 500,
    "max_timesteps": 200000,
    "max_time": 30 * 1000,
    "eval_freq": 10000, 
    "eval_episodes": 5,
    "model_save_freq": 0,
    "model_save_capacity": 0,
    "update_freq": 100,
    "batch_size": 128,
    "gamma": 0.99,
    "buffer_capacity": int(1e6),
    "epsilon": 1,
    "max_deduct": 0.9,
    "decay": 0.4,
    "max_steps": 500,
    "non_param": True,
    "model_params": {"n_estimators": 10,"max_depth": 20, "min_samples_split": 20, "min_samples_leaf": 5},
}

# Support Vector Regressor Config
CONFIG_SVR = {
    "episode_length": 500,
    "max_timesteps": 50000,
    "max_time": 30 * 1000,
    "eval_freq": 2500, 
    "eval_episodes": 10,
    "model_save_freq": 0,
    "model_save_capacity": 0,
    "update_freq": 200,
    "batch_size": 128,
    "gamma": 0.99,
    "buffer_capacity": int(1e6),
    "epsilon": 1,
    "max_deduct": 0.95,
    "decay": 0.3,
    "max_steps": 500,
    "non_param": True,
    "model_params": {"kernel":"rbf", "degree": 2, "C": 1.2},
}


# K-Neighbors Regressor Config
CONFIG_KNR = {
    "episode_length": 500,
    "max_timesteps": 50000,
    "max_time": 30 * 1000,
    "eval_freq": 5000, 
    "eval_episodes": 5,
    "model_save_freq": 0,
    "model_save_capacity": 0,
    "update_freq": 100,
    "batch_size": 128,
    "gamma": 0.99,
    "buffer_capacity": int(1e6),
    "epsilon": 1,
    "max_deduct": 0.93,
    "decay": 0.4,
    "max_steps": 500,
    "non_param": True,
    "model_params": {"n_neighbors":10, "weights": "distance", "algorithm": "auto", "leaf_size": 30},
}

# Gaussian Process Config
CONFIG_GP = {
    "episode_length": 500,
    "max_timesteps": 50000,
    "max_time": 30 * 1000,
    "eval_freq": 10000, 
    "eval_episodes": 5,
    "model_save_freq": 0,
    "model_save_capacity": 0,
    "update_freq": 100,
    "batch_size": 128,
    "gamma": 0.99,
    "buffer_capacity": int(1e6),
    "epsilon": 1,
    "max_deduct": 0.9,
    "decay": 0.4,
    "max_steps": 500,
    "non_param": True,
    "model_params": {"alpha": 1e-10, "normalize_y": False, "kernel":  RBF(length_scale=0.3, length_scale_bounds="fixed")},
}

# Online Gaussian Process Config
CONFIG_GP_Online = {
    "episode_length": 500,
    "max_timesteps": 50000,
    "max_time": 30 * 1000,
    "eval_freq": 10000, 
    "eval_episodes": 5,
    "gamma": 0.99,
    "buffer_capacity": int(1e6),
    "batch_size": 32,
    "epsilon": 1,
    "max_deduct": 0.93,
    "decay": 0.4,
    "max_steps": 500,
    "non_param": True,
    "model_params": {"sigma_0": 0.3, "init": 0, "kernel":  rbf_kernel, "epsilon_tol": 0.075, "basis_limit": 1000},
}

CONFIGS = [CONFIG_DQN, CONFIG_LINEAR, CONFIG_DT, CONFIG_RF, CONFIG_SVR, CONFIG_KNR, CONFIG_GP, CONFIG_GP_Online]
onlines = [False, False, False, False, False, False, False, True]
models = ["Neural Network", "Linear Model", "Decision Tree", "Random Forest", "Support Vectors", "K-Neighbours", "Gaussian Process", "Gaussian Process Online"]

In [4]:
## Performance Evaluation

returns = []
train_returns = []
train_times = []
n_seeds=11

j=4
for i in range(n_seeds):
    print(f"\n Run: {i+1} \n")
    r, _, t, times = train(env, 
            CONFIGS[j], 
            fa=function_approximators[j], 
            agent = agents[j], 
            render=RENDER,
            online=onlines[j],
            threshold = 0.2)
    env.close()
    returns.append(r)
    train_returns.append(t)
    train_times.append(times)
    


  1%|          | 404/50000 [00:00<00:13, 3559.81it/s]


 Run: 1 



  5%|▌         | 2603/50000 [00:06<00:35, 1334.26it/s]

Evaluation at timestep 2603 returned a mean returns of -98.6351366753515
Epsilon = 0.8419833333333333
Replay Buffer count: 899


 10%|█         | 5057/50000 [00:16<02:06, 355.75it/s]

Evaluation at timestep 5057 returned a mean returns of -32.189830203053084
Epsilon = 0.6854233333333333
Replay Buffer count: 1630


 15%|█▌        | 7635/50000 [00:30<03:10, 222.96it/s]

Evaluation at timestep 7635 returned a mean returns of -55.48689743098798
Epsilon = 0.52918
Replay Buffer count: 2219


 20%|██        | 10128/50000 [00:52<05:18, 125.10it/s]

Evaluation at timestep 10128 returned a mean returns of -30.529813716463124
Epsilon = 0.36812333333333336
Replay Buffer count: 2814


 26%|██▌       | 12878/50000 [01:23<06:05, 101.67it/s]

Evaluation at timestep 12878 returned a mean returns of 14.76060906443245
Epsilon = 0.21606000000000003
Replay Buffer count: 3354


 30%|███       | 15039/50000 [01:53<07:26, 78.37it/s]

Evaluation at timestep 15039 returned a mean returns of -16.83055963201993
Epsilon = 0.06095666666666666
Replay Buffer count: 3805


 35%|███▌      | 17529/50000 [02:35<09:26, 57.33it/s]

Evaluation at timestep 17529 returned a mean returns of -7.529410282472616
Epsilon = 0.050000000000000044
Replay Buffer count: 4317


 40%|████      | 20094/50000 [03:25<08:17, 60.07it/s]

Evaluation at timestep 20094 returned a mean returns of 1.7883750597641483
Epsilon = 0.050000000000000044
Replay Buffer count: 4746


 46%|████▌     | 22911/50000 [04:39<11:42, 38.57it/s]

Evaluation at timestep 22911 returned a mean returns of -0.021092603363531026
Epsilon = 0.050000000000000044
Replay Buffer count: 5156


 50%|█████     | 25054/50000 [05:34<09:57, 41.76it/s]

Evaluation at timestep 25054 returned a mean returns of 61.6261133418823
Epsilon = 0.050000000000000044
Replay Buffer count: 5489


 56%|█████▌    | 27782/50000 [07:01<11:28, 32.26it/s]

Evaluation at timestep 27782 returned a mean returns of 71.30701684637992
Epsilon = 0.050000000000000044
Replay Buffer count: 5862


 60%|██████    | 30048/50000 [08:19<10:45, 30.90it/s]

Evaluation at timestep 30048 returned a mean returns of 135.88219426721577
Epsilon = 0.050000000000000044
Replay Buffer count: 6069


 65%|██████▌   | 32597/50000 [09:55<10:46, 26.91it/s]

Evaluation at timestep 32597 returned a mean returns of 60.392442202472054
Epsilon = 0.050000000000000044
Replay Buffer count: 6341


 70%|███████   | 35017/50000 [11:28<09:38, 25.88it/s]

Evaluation at timestep 35017 returned a mean returns of 124.329245203934
Epsilon = 0.050000000000000044
Replay Buffer count: 6560


 76%|███████▌  | 37917/50000 [13:35<08:54, 22.60it/s]

Evaluation at timestep 37917 returned a mean returns of 107.97010140360555
Epsilon = 0.050000000000000044
Replay Buffer count: 6725


 80%|████████  | 40162/50000 [15:13<07:15, 22.58it/s]

Evaluation at timestep 40162 returned a mean returns of 43.45245807371996
Epsilon = 0.050000000000000044
Replay Buffer count: 6957


 86%|████████▌ | 42973/50000 [17:24<05:39, 20.67it/s]

Evaluation at timestep 42973 returned a mean returns of 99.6300001102436
Epsilon = 0.050000000000000044
Replay Buffer count: 7112


 90%|█████████ | 45125/50000 [19:13<04:22, 18.57it/s]

Evaluation at timestep 45125 returned a mean returns of 120.54943452370279
Epsilon = 0.050000000000000044
Replay Buffer count: 7291


 95%|█████████▌| 47618/50000 [21:16<01:52, 21.11it/s]

Evaluation at timestep 47618 returned a mean returns of 93.82107401080873
Epsilon = 0.050000000000000044
Replay Buffer count: 7433


50205it [23:36, 35.44it/s]
  1%|          | 484/50000 [00:00<00:13, 3808.45it/s]

Evaluation at timestep 50205 returned a mean returns of 69.57562090381623
Epsilon = 0.050000000000000044
Replay Buffer count: 7604

 Run: 2 



  5%|▌         | 2710/50000 [00:03<02:29, 317.36it/s] 

Evaluation at timestep 2560 returned a mean returns of -231.281429335382
Epsilon = 0.8435033333333333
Replay Buffer count: 827


 10%|█         | 5082/50000 [00:12<01:34, 477.66it/s]

Evaluation at timestep 5082 returned a mean returns of -82.41309810085572
Epsilon = 0.6884633333333333
Replay Buffer count: 1600


 15%|█▌        | 7538/50000 [00:25<02:40, 264.69it/s]

Evaluation at timestep 7538 returned a mean returns of -170.105379813353
Epsilon = 0.53127
Replay Buffer count: 2251


 20%|██        | 10044/50000 [00:47<04:45, 139.77it/s]

Evaluation at timestep 10044 returned a mean returns of -16.179471683853325
Epsilon = 0.3764833333333334
Replay Buffer count: 2852


 25%|██▌       | 12653/50000 [01:17<06:45, 92.11it/s]

Evaluation at timestep 12653 returned a mean returns of -49.68762489400869
Epsilon = 0.23031000000000001
Replay Buffer count: 3360


 31%|███       | 15396/50000 [01:55<07:55, 72.70it/s]

Evaluation at timestep 15396 returned a mean returns of 15.415807964112183
Epsilon = 0.056586666666666674
Replay Buffer count: 3737


 35%|███▌      | 17570/50000 [02:26<07:35, 71.19it/s]

Evaluation at timestep 17570 returned a mean returns of -9.771878200811798
Epsilon = 0.050000000000000044
Replay Buffer count: 4197


 40%|████      | 20012/50000 [03:17<10:10, 49.15it/s]

Evaluation at timestep 20012 returned a mean returns of 28.2315562387093
Epsilon = 0.050000000000000044
Replay Buffer count: 4640


 45%|████▌     | 22658/50000 [04:17<11:27, 39.76it/s]

Evaluation at timestep 22658 returned a mean returns of -96.08029106938332
Epsilon = 0.050000000000000044
Replay Buffer count: 5111


 50%|█████     | 25086/50000 [05:20<11:07, 37.30it/s]

Evaluation at timestep 25086 returned a mean returns of 43.31850837349885
Epsilon = 0.050000000000000044
Replay Buffer count: 5404


 55%|█████▌    | 27609/50000 [06:37<12:09, 30.70it/s]

Evaluation at timestep 27609 returned a mean returns of 71.82604072338914
Epsilon = 0.050000000000000044
Replay Buffer count: 5763


 60%|██████    | 30009/50000 [07:57<12:03, 27.65it/s]

Evaluation at timestep 30009 returned a mean returns of 36.67991317988685
Epsilon = 0.050000000000000044
Replay Buffer count: 6059


 66%|██████▌   | 32951/50000 [09:37<09:28, 30.01it/s]

Evaluation at timestep 32951 returned a mean returns of 139.60819654923444
Epsilon = 0.050000000000000044
Replay Buffer count: 6326


 70%|███████   | 35077/50000 [11:01<09:46, 25.44it/s]

Evaluation at timestep 35077 returned a mean returns of 34.02372335566116
Epsilon = 0.050000000000000044
Replay Buffer count: 6445


 75%|███████▌  | 37582/50000 [12:38<07:34, 27.30it/s]

Evaluation at timestep 37582 returned a mean returns of 64.55347101521258
Epsilon = 0.050000000000000044
Replay Buffer count: 6705


 80%|████████  | 40161/50000 [14:29<06:07, 26.77it/s]

Evaluation at timestep 40161 returned a mean returns of 105.09240377266235
Epsilon = 0.050000000000000044
Replay Buffer count: 6944


 85%|████████▌ | 42536/50000 [16:21<05:21, 23.22it/s]

Evaluation at timestep 42536 returned a mean returns of 108.30373670162595
Epsilon = 0.050000000000000044
Replay Buffer count: 7124


 90%|█████████ | 45049/50000 [18:24<03:56, 20.92it/s]

Evaluation at timestep 45049 returned a mean returns of 43.32838866324019
Epsilon = 0.050000000000000044
Replay Buffer count: 7271


 95%|█████████▌| 47693/50000 [20:34<01:48, 21.32it/s]

Evaluation at timestep 47693 returned a mean returns of 99.9714670648718
Epsilon = 0.050000000000000044
Replay Buffer count: 7450


50196it [22:50, 36.63it/s]
  1%|          | 441/50000 [00:00<00:14, 3316.73it/s]

Evaluation at timestep 50196 returned a mean returns of 114.97046178606358
Epsilon = 0.050000000000000044
Replay Buffer count: 7614

 Run: 3 



  5%|▌         | 2598/50000 [00:05<00:32, 1436.82it/s]

Evaluation at timestep 2598 returned a mean returns of -82.33375941443225
Epsilon = 0.8435666666666667
Replay Buffer count: 882


 10%|█         | 5050/50000 [00:14<01:40, 448.29it/s]

Evaluation at timestep 5050 returned a mean returns of -60.756635995761144
Epsilon = 0.68764
Replay Buffer count: 1523


 15%|█▌        | 7554/50000 [00:29<03:06, 227.51it/s]

Evaluation at timestep 7554 returned a mean returns of -38.17886477617883
Epsilon = 0.5274066666666667
Replay Buffer count: 2124


 20%|██        | 10026/50000 [00:50<04:26, 150.19it/s]

Evaluation at timestep 10026 returned a mean returns of -16.937185600573397
Epsilon = 0.39668666666666674
Replay Buffer count: 2740


 26%|██▌       | 12884/50000 [01:19<05:59, 103.38it/s]

Evaluation at timestep 12884 returned a mean returns of -114.3506557783199
Epsilon = 0.21567999999999998
Replay Buffer count: 3411


 30%|███       | 15148/50000 [01:55<07:54, 73.40it/s]

Evaluation at timestep 15148 returned a mean returns of -91.11502724308882
Epsilon = 0.05734666666666677
Replay Buffer count: 3781


 35%|███▌      | 17582/50000 [02:37<08:31, 63.40it/s]

Evaluation at timestep 17582 returned a mean returns of 11.739509275663533
Epsilon = 0.050000000000000044
Replay Buffer count: 4137


 40%|████      | 20117/50000 [03:27<09:56, 50.12it/s]

Evaluation at timestep 20117 returned a mean returns of -82.20555866468065
Epsilon = 0.050000000000000044
Replay Buffer count: 4606


 46%|████▌     | 22957/50000 [04:31<11:04, 40.70it/s]

Evaluation at timestep 22957 returned a mean returns of -92.72294435408787
Epsilon = 0.050000000000000044
Replay Buffer count: 5100


 50%|█████     | 25199/50000 [05:34<10:58, 37.67it/s]

Evaluation at timestep 25199 returned a mean returns of 131.1760892818691
Epsilon = 0.050000000000000044
Replay Buffer count: 5465


 55%|█████▌    | 27689/50000 [06:53<11:56, 31.12it/s]

Evaluation at timestep 27689 returned a mean returns of 66.00967625653291
Epsilon = 0.050000000000000044
Replay Buffer count: 5729


 60%|██████    | 30031/50000 [08:07<09:57, 33.43it/s]

Evaluation at timestep 30031 returned a mean returns of 173.19114903243525
Epsilon = 0.050000000000000044
Replay Buffer count: 5951


 65%|██████▌   | 32626/50000 [09:43<09:37, 30.06it/s]

Evaluation at timestep 32626 returned a mean returns of 117.19075710291543
Epsilon = 0.050000000000000044
Replay Buffer count: 6220


 71%|███████   | 35290/50000 [11:30<09:47, 25.03it/s]

Evaluation at timestep 35290 returned a mean returns of 206.51400071234116
Epsilon = 0.050000000000000044
Replay Buffer count: 6456


 75%|███████▌  | 37520/50000 [13:01<08:29, 24.50it/s]

Evaluation at timestep 37520 returned a mean returns of 81.2062366814063
Epsilon = 0.050000000000000044
Replay Buffer count: 6687


 80%|████████  | 40079/50000 [14:44<06:13, 26.59it/s]

Evaluation at timestep 40079 returned a mean returns of 197.15637857938825
Epsilon = 0.050000000000000044
Replay Buffer count: 6840


 86%|████████▌ | 42773/50000 [16:49<05:33, 21.67it/s]

Evaluation at timestep 42773 returned a mean returns of 197.40543333952235
Epsilon = 0.050000000000000044
Replay Buffer count: 7036


 91%|█████████ | 45360/50000 [18:51<03:41, 20.99it/s]

Evaluation at timestep 45360 returned a mean returns of 48.25475790194096
Epsilon = 0.050000000000000044
Replay Buffer count: 7205


 95%|█████████▌| 47543/50000 [20:40<02:05, 19.64it/s]

Evaluation at timestep 47543 returned a mean returns of 66.23299224127469
Epsilon = 0.050000000000000044
Replay Buffer count: 7456


50225it [23:01, 36.35it/s]
  1%|          | 422/50000 [00:00<00:12, 3950.24it/s]

Evaluation at timestep 50225 returned a mean returns of 111.82409057615793
Epsilon = 0.050000000000000044
Replay Buffer count: 7610

 Run: 4 



  5%|▌         | 2623/50000 [00:04<03:52, 204.06it/s] 

Evaluation at timestep 2519 returned a mean returns of -110.24880009277015
Epsilon = 0.8455933333333333
Replay Buffer count: 878


 10%|█         | 5091/50000 [00:10<01:40, 447.27it/s]

Evaluation at timestep 5091 returned a mean returns of -115.12903456696196
Epsilon = 0.6837133333333334
Replay Buffer count: 1625


 15%|█▌        | 7593/50000 [00:26<03:15, 216.84it/s]

Evaluation at timestep 7593 returned a mean returns of -43.59637790002252
Epsilon = 0.52633
Replay Buffer count: 2292


 20%|██        | 10238/50000 [00:47<05:00, 132.29it/s]

Evaluation at timestep 10238 returned a mean returns of -104.43534039529113
Epsilon = 0.38326000000000005
Replay Buffer count: 2973


 26%|██▌       | 12766/50000 [01:19<07:25, 83.58it/s]

Evaluation at timestep 12766 returned a mean returns of -54.68576911610143
Epsilon = 0.22315333333333343
Replay Buffer count: 3504


 30%|███       | 15043/50000 [01:52<07:44, 75.20it/s]

Evaluation at timestep 15043 returned a mean returns of -51.566024237995094
Epsilon = 0.06444000000000005
Replay Buffer count: 3931


 35%|███▌      | 17579/50000 [02:37<09:50, 54.90it/s]

Evaluation at timestep 17579 returned a mean returns of -179.09281880875108
Epsilon = 0.050000000000000044
Replay Buffer count: 4459


 40%|████      | 20062/50000 [03:32<09:30, 52.46it/s]

Evaluation at timestep 20062 returned a mean returns of -1.1443313559136232
Epsilon = 0.050000000000000044
Replay Buffer count: 4864


 45%|████▌     | 22601/50000 [04:39<11:07, 41.07it/s]

Evaluation at timestep 22601 returned a mean returns of 73.39382713179396
Epsilon = 0.050000000000000044
Replay Buffer count: 5224


 50%|█████     | 25158/50000 [05:52<12:16, 33.75it/s]

Evaluation at timestep 25158 returned a mean returns of -106.22131750558054
Epsilon = 0.050000000000000044
Replay Buffer count: 5570


 55%|█████▌    | 27555/50000 [07:06<11:21, 32.91it/s]

Evaluation at timestep 27555 returned a mean returns of -37.111505916785994
Epsilon = 0.050000000000000044
Replay Buffer count: 5799


 60%|██████    | 30041/50000 [08:28<10:20, 32.14it/s]

Evaluation at timestep 30041 returned a mean returns of 104.64880733747937
Epsilon = 0.050000000000000044
Replay Buffer count: 6068


 65%|██████▌   | 32737/50000 [10:11<10:46, 26.70it/s]

Evaluation at timestep 32737 returned a mean returns of 234.2724969713907
Epsilon = 0.050000000000000044
Replay Buffer count: 6312


 70%|███████   | 35011/50000 [11:40<09:14, 27.04it/s]

Evaluation at timestep 35011 returned a mean returns of 120.91047004665462
Epsilon = 0.050000000000000044
Replay Buffer count: 6459


 76%|███████▌  | 37791/50000 [13:36<08:08, 24.99it/s]

Evaluation at timestep 37791 returned a mean returns of 198.72445772368803
Epsilon = 0.050000000000000044
Replay Buffer count: 6667


 81%|████████  | 40412/50000 [15:30<06:34, 24.33it/s]

Evaluation at timestep 40412 returned a mean returns of 201.74032198383057
Epsilon = 0.050000000000000044
Replay Buffer count: 6821


 86%|████████▌ | 42806/50000 [17:20<05:18, 22.62it/s]

Evaluation at timestep 42806 returned a mean returns of 171.6733074138637
Epsilon = 0.050000000000000044
Replay Buffer count: 6935


 90%|█████████ | 45168/50000 [19:14<03:48, 21.13it/s]

Evaluation at timestep 45168 returned a mean returns of 88.4292590257491
Epsilon = 0.050000000000000044
Replay Buffer count: 7103


 96%|█████████▌| 47842/50000 [21:23<01:37, 22.07it/s]

Evaluation at timestep 47842 returned a mean returns of 189.83148328003347
Epsilon = 0.050000000000000044
Replay Buffer count: 7210


50009it [23:14, 35.85it/s]
  1%|          | 446/50000 [00:00<00:13, 3810.68it/s]

Evaluation at timestep 50009 returned a mean returns of 197.24330903749143
Epsilon = 0.050000000000000044
Replay Buffer count: 7375

 Run: 5 



  5%|▌         | 2685/50000 [00:05<04:49, 163.40it/s] 

Evaluation at timestep 2585 returned a mean returns of -69.8443294853237
Epsilon = 0.8450866666666667
Replay Buffer count: 910


 10%|█         | 5077/50000 [00:15<01:38, 456.17it/s]

Evaluation at timestep 5077 returned a mean returns of -45.11795628345868
Epsilon = 0.6913766666666666
Replay Buffer count: 1640


 15%|█▌        | 7527/50000 [00:31<03:09, 223.79it/s]

Evaluation at timestep 7527 returned a mean returns of -49.87865368685157
Epsilon = 0.5294966666666667
Replay Buffer count: 2316


 20%|██        | 10011/50000 [00:54<05:05, 130.93it/s]

Evaluation at timestep 10011 returned a mean returns of -105.03263420429064
Epsilon = 0.3943433333333334
Replay Buffer count: 2837


 25%|██▌       | 12653/50000 [01:24<06:00, 103.64it/s]

Evaluation at timestep 12653 returned a mean returns of 18.919203619632125
Epsilon = 0.23031000000000001
Replay Buffer count: 3199


 31%|███       | 15333/50000 [01:59<07:01, 82.31it/s]

Evaluation at timestep 15333 returned a mean returns of -146.06937533393634
Epsilon = 0.06057666666666672
Replay Buffer count: 3677


 35%|███▌      | 17671/50000 [02:37<07:45, 69.48it/s]

Evaluation at timestep 17671 returned a mean returns of -15.011577934379291
Epsilon = 0.050000000000000044
Replay Buffer count: 4114


 40%|████      | 20246/50000 [03:27<09:14, 53.63it/s]

Evaluation at timestep 20246 returned a mean returns of 31.04312128158475
Epsilon = 0.050000000000000044
Replay Buffer count: 4532


 45%|████▌     | 22631/50000 [04:19<09:49, 46.41it/s]

Evaluation at timestep 22631 returned a mean returns of -70.43086490178962
Epsilon = 0.050000000000000044
Replay Buffer count: 4881


 50%|█████     | 25017/50000 [05:18<09:56, 41.88it/s]

Evaluation at timestep 25017 returned a mean returns of -5.4582755714093665
Epsilon = 0.050000000000000044
Replay Buffer count: 5290


 55%|█████▌    | 27620/50000 [06:32<11:13, 33.22it/s]

Evaluation at timestep 27620 returned a mean returns of -17.016980032117544
Epsilon = 0.050000000000000044
Replay Buffer count: 5599


 60%|██████    | 30177/50000 [07:55<11:25, 28.91it/s]

Evaluation at timestep 30177 returned a mean returns of 3.3210427347059706
Epsilon = 0.050000000000000044
Replay Buffer count: 5875


 65%|██████▌   | 32548/50000 [09:18<11:08, 26.10it/s]

Evaluation at timestep 32548 returned a mean returns of 11.14487671316027
Epsilon = 0.050000000000000044
Replay Buffer count: 6231


 70%|███████   | 35161/50000 [11:01<10:32, 23.46it/s]

Evaluation at timestep 35161 returned a mean returns of 155.6624521505724
Epsilon = 0.050000000000000044
Replay Buffer count: 6492


 76%|███████▌  | 37801/50000 [12:48<08:16, 24.59it/s]

Evaluation at timestep 37801 returned a mean returns of 9.29756123694623
Epsilon = 0.050000000000000044
Replay Buffer count: 6688


 80%|████████  | 40119/50000 [14:23<06:32, 25.16it/s]

Evaluation at timestep 40119 returned a mean returns of 88.15519040594958
Epsilon = 0.050000000000000044
Replay Buffer count: 6836


 86%|████████▌ | 42824/50000 [16:29<05:21, 22.29it/s]

Evaluation at timestep 42824 returned a mean returns of 86.39328828981847
Epsilon = 0.050000000000000044
Replay Buffer count: 7075


 90%|█████████ | 45184/50000 [18:24<04:01, 19.96it/s]

Evaluation at timestep 45184 returned a mean returns of 57.84928112032657
Epsilon = 0.050000000000000044
Replay Buffer count: 7210


 95%|█████████▌| 47522/50000 [20:12<01:45, 23.49it/s]

Evaluation at timestep 47522 returned a mean returns of 48.28409746426006
Epsilon = 0.050000000000000044
Replay Buffer count: 7313


50329it [22:34, 37.16it/s]
  1%|          | 408/50000 [00:00<00:12, 4017.71it/s]

Evaluation at timestep 50329 returned a mean returns of 160.53561662257192
Epsilon = 0.050000000000000044
Replay Buffer count: 7434

 Run: 6 



  5%|▌         | 2617/50000 [00:05<05:13, 151.05it/s] 

Evaluation at timestep 2517 returned a mean returns of -128.35804614654634
Epsilon = 0.8467966666666666
Replay Buffer count: 876


 10%|█         | 5111/50000 [00:15<01:44, 429.84it/s]

Evaluation at timestep 5111 returned a mean returns of -66.47372399773676
Epsilon = 0.6850433333333333
Replay Buffer count: 1590


 15%|█▌        | 7610/50000 [00:30<03:08, 224.29it/s]

Evaluation at timestep 7610 returned a mean returns of -34.13059640779092
Epsilon = 0.52633
Replay Buffer count: 2269


 20%|██        | 10085/50000 [00:52<04:38, 143.57it/s]

Evaluation at timestep 10085 returned a mean returns of -75.68197142101565
Epsilon = 0.39295
Replay Buffer count: 2834


 26%|██▌       | 12829/50000 [01:25<06:25, 96.36it/s]

Evaluation at timestep 12829 returned a mean returns of -21.114674256147623
Epsilon = 0.21916333333333338
Replay Buffer count: 3383


 30%|███       | 15111/50000 [01:57<07:20, 79.19it/s]

Evaluation at timestep 15111 returned a mean returns of -32.59824120118242
Epsilon = 0.07203999999999999
Replay Buffer count: 3834


 35%|███▌      | 17519/50000 [02:35<07:49, 69.23it/s]

Evaluation at timestep 17519 returned a mean returns of -53.669737691327725
Epsilon = 0.050000000000000044
Replay Buffer count: 4246


 40%|████      | 20119/50000 [03:26<09:11, 54.16it/s]

Evaluation at timestep 20119 returned a mean returns of -175.89386959080534
Epsilon = 0.050000000000000044
Replay Buffer count: 4816


 45%|████▌     | 22538/50000 [04:26<09:28, 48.29it/s]

Evaluation at timestep 22538 returned a mean returns of 142.27700837301015
Epsilon = 0.050000000000000044
Replay Buffer count: 5257


 50%|█████     | 25051/50000 [05:38<11:23, 36.53it/s]

Evaluation at timestep 25051 returned a mean returns of -29.03797795552841
Epsilon = 0.050000000000000044
Replay Buffer count: 5586


 55%|█████▌    | 27505/50000 [06:53<10:43, 34.95it/s]

Evaluation at timestep 27505 returned a mean returns of -25.080869786369284
Epsilon = 0.050000000000000044
Replay Buffer count: 5886


 60%|██████    | 30011/50000 [08:21<11:58, 27.82it/s]

Evaluation at timestep 30011 returned a mean returns of -21.299754586086635
Epsilon = 0.050000000000000044
Replay Buffer count: 6122


 65%|██████▌   | 32617/50000 [09:56<11:01, 26.28it/s]

Evaluation at timestep 32617 returned a mean returns of 27.509052855525756
Epsilon = 0.050000000000000044
Replay Buffer count: 6415


 70%|███████   | 35022/50000 [11:36<10:07, 24.66it/s]

Evaluation at timestep 35022 returned a mean returns of 91.2541354910319
Epsilon = 0.050000000000000044
Replay Buffer count: 6642


 75%|███████▌  | 37555/50000 [13:29<09:32, 21.75it/s]

Evaluation at timestep 37555 returned a mean returns of 41.520946695964675
Epsilon = 0.050000000000000044
Replay Buffer count: 6959


 80%|████████  | 40055/50000 [15:23<07:34, 21.89it/s]

Evaluation at timestep 40055 returned a mean returns of -21.228049705505576
Epsilon = 0.050000000000000044
Replay Buffer count: 7287


 85%|████████▌ | 42700/50000 [17:34<05:35, 21.77it/s]

Evaluation at timestep 42700 returned a mean returns of 47.35062797679147
Epsilon = 0.050000000000000044
Replay Buffer count: 7482


 90%|█████████ | 45171/50000 [19:52<04:40, 17.20it/s]

Evaluation at timestep 45171 returned a mean returns of 134.72835768271338
Epsilon = 0.050000000000000044
Replay Buffer count: 7667


 95%|█████████▌| 47510/50000 [21:54<01:54, 21.82it/s]

Evaluation at timestep 47510 returned a mean returns of 30.553273228883384
Epsilon = 0.050000000000000044
Replay Buffer count: 7867


50109it [24:22, 34.26it/s]
  1%|          | 464/50000 [00:00<00:13, 3689.82it/s]

Evaluation at timestep 50109 returned a mean returns of -5.551201417089338
Epsilon = 0.050000000000000044
Replay Buffer count: 8001

 Run: 7 



  5%|▌         | 2535/50000 [00:04<00:33, 1396.75it/s]

Evaluation at timestep 2535 returned a mean returns of -115.25514999275126
Epsilon = 0.8443266666666667
Replay Buffer count: 887


 10%|█         | 5092/50000 [00:14<01:25, 525.05it/s]

Evaluation at timestep 5092 returned a mean returns of -23.143890199304927
Epsilon = 0.68916
Replay Buffer count: 1515


 15%|█▌        | 7535/50000 [00:28<02:23, 296.36it/s]

Evaluation at timestep 7535 returned a mean returns of -101.93608308185387
Epsilon = 0.53469
Replay Buffer count: 2178


 20%|██        | 10116/50000 [00:49<04:15, 156.18it/s]

Evaluation at timestep 10116 returned a mean returns of -252.3491628730473
Epsilon = 0.37990333333333326
Replay Buffer count: 2802


 26%|██▌       | 12933/50000 [01:21<06:12, 99.51it/s]

Evaluation at timestep 12933 returned a mean returns of 116.54133071849684
Epsilon = 0.21257666666666675
Replay Buffer count: 3457


 30%|███       | 15056/50000 [01:53<08:00, 72.79it/s]

Evaluation at timestep 15056 returned a mean returns of -37.46298467541851
Epsilon = 0.056460000000000066
Replay Buffer count: 3878


 35%|███▌      | 17537/50000 [02:32<07:31, 71.91it/s]

Evaluation at timestep 17537 returned a mean returns of -122.2232751429629
Epsilon = 0.050000000000000044
Replay Buffer count: 4328


 40%|████      | 20109/50000 [03:24<09:36, 51.84it/s]

Evaluation at timestep 20109 returned a mean returns of 6.674533520757606
Epsilon = 0.050000000000000044
Replay Buffer count: 4749


 46%|████▌     | 22868/50000 [04:31<10:49, 41.76it/s]

Evaluation at timestep 22868 returned a mean returns of 106.7604648896525
Epsilon = 0.050000000000000044
Replay Buffer count: 5216


 50%|█████     | 25043/50000 [05:36<10:58, 37.88it/s]

Evaluation at timestep 25043 returned a mean returns of 92.66687035765749
Epsilon = 0.050000000000000044
Replay Buffer count: 5527


 56%|█████▌    | 27772/50000 [07:05<12:31, 29.56it/s]

Evaluation at timestep 27772 returned a mean returns of 154.6965423960663
Epsilon = 0.050000000000000044
Replay Buffer count: 5881


 61%|██████    | 30305/50000 [08:29<10:26, 31.45it/s]

Evaluation at timestep 30305 returned a mean returns of 49.1976244580477
Epsilon = 0.050000000000000044
Replay Buffer count: 6180


 66%|██████▌   | 32857/50000 [10:09<10:09, 28.12it/s]

Evaluation at timestep 32857 returned a mean returns of 117.1723874288465
Epsilon = 0.050000000000000044
Replay Buffer count: 6464


 70%|███████   | 35182/50000 [11:47<10:12, 24.18it/s]

Evaluation at timestep 35182 returned a mean returns of 98.24246755712902
Epsilon = 0.050000000000000044
Replay Buffer count: 6689


 75%|███████▌  | 37596/50000 [13:33<09:05, 22.74it/s]

Evaluation at timestep 37596 returned a mean returns of 139.0685677009449
Epsilon = 0.050000000000000044
Replay Buffer count: 6910


 80%|████████  | 40137/50000 [15:25<06:37, 24.83it/s]

Evaluation at timestep 40137 returned a mean returns of 168.40174970124295
Epsilon = 0.050000000000000044
Replay Buffer count: 7069


 85%|████████▌ | 42576/50000 [17:29<06:08, 20.17it/s]

Evaluation at timestep 42576 returned a mean returns of 217.09711027325358
Epsilon = 0.050000000000000044
Replay Buffer count: 7261


 90%|█████████ | 45195/50000 [19:41<04:10, 19.22it/s]

Evaluation at timestep 45195 returned a mean returns of 219.5928295200918
Epsilon = 0.050000000000000044
Replay Buffer count: 7482


 95%|█████████▌| 47510/50000 [21:46<02:05, 19.81it/s]

Evaluation at timestep 47510 returned a mean returns of 76.70002952522319
Epsilon = 0.050000000000000044
Replay Buffer count: 7654


50162it [24:22, 34.30it/s]
  1%|          | 428/50000 [00:00<00:11, 4228.74it/s]

Evaluation at timestep 50162 returned a mean returns of 92.43670520760698
Epsilon = 0.050000000000000044
Replay Buffer count: 7839

 Run: 8 



  5%|▌         | 2611/50000 [00:05<04:32, 174.05it/s] 

Evaluation at timestep 2529 returned a mean returns of -50.39080134109437
Epsilon = 0.8451500000000001
Replay Buffer count: 822


 10%|█         | 5135/50000 [00:12<05:50, 127.83it/s]

Evaluation at timestep 5030 returned a mean returns of -289.18260675524255
Epsilon = 0.6894133333333333
Replay Buffer count: 1468


 15%|█▌        | 7706/50000 [00:27<07:39, 92.00it/s] 

Evaluation at timestep 7587 returned a mean returns of 34.31203831368459
Epsilon = 0.5317766666666667
Replay Buffer count: 2125


 20%|██        | 10037/50000 [00:46<04:22, 152.10it/s]

Evaluation at timestep 10037 returned a mean returns of 6.582689297489149
Epsilon = 0.37426666666666675
Replay Buffer count: 2678


 26%|██▌       | 12850/50000 [01:13<06:17, 98.47it/s]

Evaluation at timestep 12850 returned a mean returns of -42.65719116085651
Epsilon = 0.21783333333333332
Replay Buffer count: 3300


 30%|███       | 15026/50000 [01:45<07:35, 76.71it/s]

Evaluation at timestep 15026 returned a mean returns of 6.111499324823281
Epsilon = 0.08001999999999998
Replay Buffer count: 3748


 35%|███▌      | 17702/50000 [02:29<08:15, 65.20it/s]

Evaluation at timestep 17702 returned a mean returns of -240.3119733974712
Epsilon = 0.050000000000000044
Replay Buffer count: 4194


 41%|████      | 20283/50000 [03:20<08:52, 55.76it/s]

Evaluation at timestep 20283 returned a mean returns of 131.17966953725838
Epsilon = 0.050000000000000044
Replay Buffer count: 4565


 45%|████▌     | 22674/50000 [04:12<09:18, 48.89it/s]

Evaluation at timestep 22674 returned a mean returns of -53.2332285146983
Epsilon = 0.050000000000000044
Replay Buffer count: 4934


 50%|█████     | 25061/50000 [05:15<10:24, 39.92it/s]

Evaluation at timestep 25061 returned a mean returns of 200.05851028198288
Epsilon = 0.050000000000000044
Replay Buffer count: 5318


 56%|█████▌    | 27927/50000 [06:37<10:00, 36.75it/s]

Evaluation at timestep 27927 returned a mean returns of 105.02246280751399
Epsilon = 0.050000000000000044
Replay Buffer count: 5726


 60%|██████    | 30106/50000 [07:52<10:07, 32.77it/s]

Evaluation at timestep 30106 returned a mean returns of 79.64359787355335
Epsilon = 0.050000000000000044
Replay Buffer count: 5935


 65%|██████▌   | 32504/50000 [09:16<09:19, 31.28it/s]

Evaluation at timestep 32504 returned a mean returns of 28.92843569349087
Epsilon = 0.050000000000000044
Replay Buffer count: 6146


 71%|███████   | 35290/50000 [11:03<08:47, 27.89it/s]

Evaluation at timestep 35290 returned a mean returns of 155.74880961471368
Epsilon = 0.050000000000000044
Replay Buffer count: 6375


 75%|███████▌  | 37737/50000 [12:40<07:26, 27.49it/s]

Evaluation at timestep 37737 returned a mean returns of 140.571498267623
Epsilon = 0.050000000000000044
Replay Buffer count: 6582


 81%|████████  | 40267/50000 [14:30<06:23, 25.38it/s]

Evaluation at timestep 40267 returned a mean returns of 90.4750753020847
Epsilon = 0.050000000000000044
Replay Buffer count: 6759


 86%|████████▌ | 42979/50000 [16:37<05:13, 22.37it/s]

Evaluation at timestep 42979 returned a mean returns of 142.6318894931808
Epsilon = 0.050000000000000044
Replay Buffer count: 6887


 91%|█████████ | 45284/50000 [18:18<03:29, 22.47it/s]

Evaluation at timestep 45284 returned a mean returns of 150.17361421133256
Epsilon = 0.050000000000000044
Replay Buffer count: 7132


 96%|█████████▌| 47773/50000 [20:28<01:50, 20.11it/s]

Evaluation at timestep 47773 returned a mean returns of 126.08126549936293
Epsilon = 0.050000000000000044
Replay Buffer count: 7248


50363it [22:40, 37.03it/s]
  1%|          | 492/50000 [00:00<00:12, 3865.61it/s]

Evaluation at timestep 50363 returned a mean returns of 218.64029008970311
Epsilon = 0.050000000000000044
Replay Buffer count: 7412

 Run: 9 



  5%|▌         | 2590/50000 [00:04<00:37, 1275.59it/s]

Evaluation at timestep 2590 returned a mean returns of -191.17903925035301
Epsilon = 0.8456566666666667
Replay Buffer count: 880


 10%|█         | 5135/50000 [00:13<06:18, 118.61it/s]

Evaluation at timestep 5017 returned a mean returns of -168.18733737161028
Epsilon = 0.6890333333333334
Replay Buffer count: 1559


 16%|█▌        | 7897/50000 [00:25<04:53, 143.43it/s]

Evaluation at timestep 7796 returned a mean returns of -534.9313460532451
Epsilon = 0.53792
Replay Buffer count: 2251


 20%|██        | 10113/50000 [00:45<04:20, 153.17it/s]

Evaluation at timestep 10113 returned a mean returns of -43.52167887510885
Epsilon = 0.3911766666666666
Replay Buffer count: 2807


 25%|██▌       | 12680/50000 [01:11<06:10, 100.86it/s]

Evaluation at timestep 12680 returned a mean returns of -201.81065345993414
Epsilon = 0.21612333333333333
Replay Buffer count: 3362


 30%|███       | 15136/50000 [01:44<07:23, 78.64it/s]

Evaluation at timestep 15136 returned a mean returns of -129.10855921712948
Epsilon = 0.05456000000000005
Replay Buffer count: 3911


 36%|███▌      | 17793/50000 [02:34<10:15, 52.30it/s]

Evaluation at timestep 17793 returned a mean returns of 46.41458959549722
Epsilon = 0.050000000000000044
Replay Buffer count: 4487


 40%|████      | 20058/50000 [03:22<10:23, 48.01it/s]

Evaluation at timestep 20058 returned a mean returns of -5.839457601157948
Epsilon = 0.050000000000000044
Replay Buffer count: 4949


 45%|████▌     | 22700/50000 [04:31<11:29, 39.57it/s]

Evaluation at timestep 22700 returned a mean returns of 42.78777417419495
Epsilon = 0.050000000000000044
Replay Buffer count: 5417


 50%|█████     | 25181/50000 [05:49<12:44, 32.46it/s]

Evaluation at timestep 25181 returned a mean returns of 76.91519376395365
Epsilon = 0.050000000000000044
Replay Buffer count: 5745


 55%|█████▌    | 27523/50000 [07:01<10:30, 35.68it/s]

Evaluation at timestep 27523 returned a mean returns of 106.79775530912933
Epsilon = 0.050000000000000044
Replay Buffer count: 5993


 60%|██████    | 30193/50000 [08:39<11:41, 28.22it/s]

Evaluation at timestep 30193 returned a mean returns of 30.656801017956383
Epsilon = 0.050000000000000044
Replay Buffer count: 6258


 65%|██████▌   | 32550/50000 [10:12<12:37, 23.05it/s]

Evaluation at timestep 32550 returned a mean returns of 29.146412949883462
Epsilon = 0.050000000000000044
Replay Buffer count: 6534


 70%|███████   | 35181/50000 [12:00<10:49, 22.81it/s]

Evaluation at timestep 35181 returned a mean returns of 124.65020977335249
Epsilon = 0.050000000000000044
Replay Buffer count: 6792


 76%|███████▌  | 37790/50000 [13:58<09:14, 22.02it/s]

Evaluation at timestep 37790 returned a mean returns of 87.46675899019655
Epsilon = 0.050000000000000044
Replay Buffer count: 7061


 81%|████████  | 40375/50000 [16:02<07:57, 20.15it/s]

Evaluation at timestep 40375 returned a mean returns of 158.25468004447382
Epsilon = 0.050000000000000044
Replay Buffer count: 7226


 85%|████████▌ | 42748/50000 [18:01<06:20, 19.06it/s]

Evaluation at timestep 42748 returned a mean returns of 104.58065341236897
Epsilon = 0.050000000000000044
Replay Buffer count: 7411


 90%|█████████ | 45102/50000 [19:58<03:45, 21.75it/s]

Evaluation at timestep 45102 returned a mean returns of 182.32480764748627
Epsilon = 0.050000000000000044
Replay Buffer count: 7622


 95%|█████████▌| 47657/50000 [22:21<02:04, 18.88it/s]

Evaluation at timestep 47657 returned a mean returns of 188.68761315485
Epsilon = 0.050000000000000044
Replay Buffer count: 7758


50242it [24:48, 33.76it/s]
  1%|          | 339/50000 [00:00<00:14, 3380.93it/s]

Evaluation at timestep 50242 returned a mean returns of 71.2963947108969
Epsilon = 0.050000000000000044
Replay Buffer count: 7939

 Run: 10 



  5%|▌         | 2594/50000 [00:06<05:23, 146.46it/s] 

Evaluation at timestep 2519 returned a mean returns of -120.83780587278767
Epsilon = 0.8476833333333333
Replay Buffer count: 878


 10%|█         | 5114/50000 [00:14<01:34, 473.67it/s]

Evaluation at timestep 5114 returned a mean returns of -150.2167975535956
Epsilon = 0.6870700000000001
Replay Buffer count: 1621


 15%|█▌        | 7632/50000 [00:29<02:56, 240.67it/s]

Evaluation at timestep 7632 returned a mean returns of -98.49881645539861
Epsilon = 0.52576
Replay Buffer count: 2229


 20%|██        | 10225/50000 [00:51<04:51, 136.67it/s]

Evaluation at timestep 10225 returned a mean returns of 14.498442607615164
Epsilon = 0.38408333333333333
Replay Buffer count: 2732


 26%|██▌       | 12865/50000 [01:20<06:03, 102.17it/s]

Evaluation at timestep 12865 returned a mean returns of 53.69813310366698
Epsilon = 0.21194333333333337
Replay Buffer count: 3280


 30%|███       | 15166/50000 [01:55<07:54, 73.41it/s]

Evaluation at timestep 15166 returned a mean returns of 25.130975173191533
Epsilon = 0.0711533333333334
Replay Buffer count: 3667


 35%|███▌      | 17743/50000 [02:32<07:09, 75.10it/s]

Evaluation at timestep 17743 returned a mean returns of -27.61730670920694
Epsilon = 0.050000000000000044
Replay Buffer count: 4092


 40%|████      | 20020/50000 [03:20<09:45, 51.20it/s]

Evaluation at timestep 20020 returned a mean returns of 41.781073002594326
Epsilon = 0.050000000000000044
Replay Buffer count: 4543


 45%|████▌     | 22645/50000 [04:17<10:09, 44.87it/s]

Evaluation at timestep 22645 returned a mean returns of 72.74973466014102
Epsilon = 0.050000000000000044
Replay Buffer count: 4935


 51%|█████     | 25378/50000 [05:32<11:07, 36.88it/s]

Evaluation at timestep 25378 returned a mean returns of 72.28064660879586
Epsilon = 0.050000000000000044
Replay Buffer count: 5382


 55%|█████▌    | 27655/50000 [06:41<10:34, 35.22it/s]

Evaluation at timestep 27655 returned a mean returns of 98.77675111599204
Epsilon = 0.050000000000000044
Replay Buffer count: 5633


 60%|██████    | 30065/50000 [07:58<10:12, 32.55it/s]

Evaluation at timestep 30065 returned a mean returns of 18.182535737410667
Epsilon = 0.050000000000000044
Replay Buffer count: 5888


 66%|██████▌   | 32756/50000 [09:36<10:39, 26.99it/s]

Evaluation at timestep 32756 returned a mean returns of 94.40438093746442
Epsilon = 0.050000000000000044
Replay Buffer count: 6174


 70%|███████   | 35151/50000 [11:09<09:58, 24.79it/s]

Evaluation at timestep 35151 returned a mean returns of 159.95938931388972
Epsilon = 0.050000000000000044
Replay Buffer count: 6396


 75%|███████▌  | 37616/50000 [12:48<08:11, 25.17it/s]

Evaluation at timestep 37616 returned a mean returns of 205.46645579029695
Epsilon = 0.050000000000000044
Replay Buffer count: 6562


 80%|████████  | 40039/50000 [14:30<06:49, 24.30it/s]

Evaluation at timestep 40039 returned a mean returns of 119.71004003319335
Epsilon = 0.050000000000000044
Replay Buffer count: 6746


 85%|████████▌ | 42723/50000 [16:24<04:48, 25.21it/s]

Evaluation at timestep 42723 returned a mean returns of 125.16773336595163
Epsilon = 0.050000000000000044
Replay Buffer count: 6932


 90%|█████████ | 45230/50000 [18:28<03:40, 21.60it/s]

Evaluation at timestep 45230 returned a mean returns of 158.40533659003597
Epsilon = 0.050000000000000044
Replay Buffer count: 7088


 95%|█████████▌| 47601/50000 [20:25<02:01, 19.79it/s]

Evaluation at timestep 47601 returned a mean returns of 48.984313937882234
Epsilon = 0.050000000000000044
Replay Buffer count: 7258


50242it [22:38, 36.98it/s]
  1%|          | 433/50000 [00:00<00:11, 4167.15it/s]

Evaluation at timestep 50242 returned a mean returns of 114.68489325895933
Epsilon = 0.050000000000000044
Replay Buffer count: 7444

 Run: 11 



  5%|▌         | 2500/50000 [00:04<00:40, 1176.91it/s]

Evaluation at timestep 2500 returned a mean returns of -294.02071701902025
Epsilon = 0.8473033333333333
Replay Buffer count: 918


 10%|█         | 5096/50000 [00:15<08:33, 87.39it/s] 

Evaluation at timestep 5006 returned a mean returns of -65.96229234731182
Epsilon = 0.6897933333333333
Replay Buffer count: 1556


 15%|█▌        | 7605/50000 [00:30<03:15, 217.39it/s]

Evaluation at timestep 7605 returned a mean returns of -22.473344171209536
Epsilon = 0.5275333333333334
Replay Buffer count: 2158


 20%|██        | 10014/50000 [00:51<04:44, 140.31it/s]

Evaluation at timestep 10014 returned a mean returns of 5.086463317078981
Epsilon = 0.38060000000000005
Replay Buffer count: 2715


 25%|██▌       | 12630/50000 [01:21<06:34, 94.81it/s]

Evaluation at timestep 12630 returned a mean returns of -56.78548813602405
Epsilon = 0.21150000000000002
Replay Buffer count: 3381


 31%|███       | 15422/50000 [02:05<08:13, 70.00it/s]

Evaluation at timestep 15422 returned a mean returns of -72.03325870600906
Epsilon = 0.054306666666666614
Replay Buffer count: 3804


 36%|███▌      | 17853/50000 [02:50<08:36, 62.24it/s]

Evaluation at timestep 17853 returned a mean returns of 24.05488723968302
Epsilon = 0.050000000000000044
Replay Buffer count: 4076


 40%|████      | 20012/50000 [03:36<09:41, 51.58it/s]

Evaluation at timestep 20012 returned a mean returns of 106.22875435419418
Epsilon = 0.050000000000000044
Replay Buffer count: 4502


 45%|████▌     | 22537/50000 [04:35<10:06, 45.27it/s]

Evaluation at timestep 22537 returned a mean returns of 133.0567064551012
Epsilon = 0.050000000000000044
Replay Buffer count: 4840


 50%|█████     | 25110/50000 [05:34<08:51, 46.80it/s]

Evaluation at timestep 25110 returned a mean returns of 47.240144051802076
Epsilon = 0.050000000000000044
Replay Buffer count: 5213


 55%|█████▌    | 27681/50000 [06:47<09:42, 38.34it/s]

Evaluation at timestep 27681 returned a mean returns of 105.53119747683087
Epsilon = 0.050000000000000044
Replay Buffer count: 5525


 60%|██████    | 30025/50000 [08:02<09:39, 34.49it/s]

Evaluation at timestep 30025 returned a mean returns of 109.91300068836375
Epsilon = 0.050000000000000044
Replay Buffer count: 5662


 65%|██████▌   | 32557/50000 [09:27<10:10, 28.58it/s]

Evaluation at timestep 32557 returned a mean returns of 188.86304757871724
Epsilon = 0.050000000000000044
Replay Buffer count: 6000


 70%|███████   | 35071/50000 [10:52<07:31, 33.04it/s]

Evaluation at timestep 35071 returned a mean returns of 99.40226621898822
Epsilon = 0.050000000000000044
Replay Buffer count: 6152


 76%|███████▌  | 37999/50000 [12:44<07:26, 26.86it/s]

Evaluation at timestep 37999 returned a mean returns of 113.47015991869124
Epsilon = 0.050000000000000044
Replay Buffer count: 6355


 81%|████████  | 40305/50000 [14:10<05:36, 28.84it/s]

Evaluation at timestep 40305 returned a mean returns of 69.22241474571824
Epsilon = 0.050000000000000044
Replay Buffer count: 6530


 86%|████████▌ | 42907/50000 [16:00<04:16, 27.70it/s]

Evaluation at timestep 42907 returned a mean returns of 151.29378428047409
Epsilon = 0.050000000000000044
Replay Buffer count: 6787


 90%|█████████ | 45102/50000 [17:40<03:19, 24.58it/s]

Evaluation at timestep 45102 returned a mean returns of 133.4277269274316
Epsilon = 0.050000000000000044
Replay Buffer count: 6944


 96%|█████████▌| 47954/50000 [19:58<01:36, 21.21it/s]

Evaluation at timestep 47954 returned a mean returns of 110.9214778921075
Epsilon = 0.050000000000000044
Replay Buffer count: 7081


50209it [21:45, 38.45it/s]

Evaluation at timestep 50209 returned a mean returns of 102.44194615791693
Epsilon = 0.050000000000000044
Replay Buffer count: 7254





In [7]:
with open(f'{environment}_eval_{models[j]}.csv', 'ab') as eval:
    for i in range(n_seeds):
        np.savetxt(eval, [returns[i]], delimiter=',')

In [8]:
with open(f'{environment}_train_{models[j]}.csv', 'ab') as train:
    for i in range(n_seeds):
        np.savetxt(train, [train_returns[i]], delimiter=',')
        np.savetxt(train, [train_times[i]], delimiter=',')

In [10]:
## Sample Efficiency Evaluation

n_eps = []
n_steps = []
not_solved = []
n_seeds=10

j=4
for i in range(n_seeds):
    print(f"\n Run: {i+1} \n")
    s, e, n = solve(env, 
            CONFIGS[j], 
            fa=function_approximators[j], 
            agent = agents[j],
            target_return=100,
            op=operator.ge, 
            render=RENDER,
            online=onlines[j],
            threshold=0.25)
    env.close()
    n_eps.append(e)
    n_steps.append(s)
    not_solved.append(n)


 Run: 1 

Ep. timesteps: 500
Total timesteps: 10455
Total episodes: 78
Evaluation mean return: 114.72874752385272

 Run: 2 

Ep. timesteps: 500
Total timesteps: 6306
Total episodes: 54
Evaluation mean return: 143.54222720984495

 Run: 3 

Ep. timesteps: 494
Total timesteps: 11736
Total episodes: 72
Evaluation mean return: 159.6750638685724

 Run: 4 

Ep. timesteps: 500
Total timesteps: 16944
Total episodes: 93
Evaluation mean return: 175.46025593824072

 Run: 5 

Ep. timesteps: 387
Total timesteps: 9434
Total episodes: 74
Evaluation mean return: 279.08728930953407

 Run: 6 

Ep. timesteps: 244
Total timesteps: 14255
Total episodes: 87
Evaluation mean return: 302.6887018206717

 Run: 7 

Ep. timesteps: 500
Total timesteps: 14544
Total episodes: 81
Evaluation mean return: 130.73198234143658

 Run: 8 

Ep. timesteps: 384
Total timesteps: 15448
Total episodes: 89
Evaluation mean return: 200.9778050945896

 Run: 9 

Ep. timesteps: 452
Total timesteps: 9004
Total episodes: 68
Evaluation mea

In [11]:
with open(f'{environment}_se_{models[j]}.csv', 'ab') as se:
    np.savetxt(se, [n_eps], delimiter=',')
    np.savetxt(se, [n_steps], delimiter=',')
    np.savetxt(se, [not_solved], delimiter=',')

In [22]:
mean_eps = np.mean(n_eps)
std_eps = np.std(n_eps)
print(f"Average n_eps: {mean_eps}")
print(f"Std n_eps: {std_eps}")
print(f"St.error n_eps: {std_eps/np.sqrt(n_seeds)}")

mean_steps = np.mean(n_steps)
std_steps = np.std(n_steps)
print(f"Average n_steps: {mean_steps}0")
print(f"Std n_steps: {std_steps}")
print(f"St.error n_steps: {std_steps/np.sqrt(n_seeds)}")

print(f"Not solved: {np.sum(not_solved)} runs")

Average n_eps: 21.633333333333333
Std n_eps: 27.45721924902245
St.error n_eps: 5.0129794496848845
Average n_steps: 900.60
Std n_steps: 1383.9435826651315
St.error n_steps: 252.67237284673604
Not solved: 3 runs


In [27]:
## Training time

times = []
for j in range(5,8):
        time = train_time(env, 
                CONFIGS[j], 
                fa=function_approximators[j], 
                agent = agents[j],
                online=onlines[j],
                threshold=0.2)
        env.close()
        times.append(time)

print(time)

50013it [09:16, 89.90it/s]
  1%|          | 347/50000 [00:00<00:16, 2995.31it/s]

-189.0375427386612


50030it [43:20, 19.24it/s]
  0%|          | 0/50000 [00:00<?, ?it/s]

-65.36838185707568


50078it [1:09:02, 12.09it/s]

-205.56159420668013
4142.282765388489





In [28]:
with open(f'{environment}_times.csv', 'ab') as t:
    np.savetxt(t, [times], delimiter=',')