In [1]:
import gym 
import numpy as np
from custom_envs.gridworlds import WindyGridworldEnv, SimpleGridworldEnv
from custom_envs.mountain_car import MountainCarEnv

from function_approximators.function_approximators import NeuralNetwork, LinearModel, DecisionTree, RandomForest, SupportVectorRegressor, KNeighboursRegressor, GaussianProcess, OnlineGaussianProcess
from sklearn.gaussian_process.kernels import RBF, ConstantKernel, RationalQuadratic, Matern
from sklearn.metrics.pairwise import rbf_kernel, chi2_kernel, laplacian_kernel 

from utils.train_utils import train, solve, train_time
from utils.plot_utils import plot_returns

from agents.ad_agents import DQNAgent, LinearAgent, FQIAgent, OnlineGaussianProccessAgent

import operator


In [2]:
function_approximators = [NeuralNetwork, LinearModel, DecisionTree, RandomForest, SupportVectorRegressor, KNeighboursRegressor, GaussianProcess, OnlineGaussianProcess]

agents = [DQNAgent, LinearAgent, *[FQIAgent]*5, OnlineGaussianProccessAgent]

RENDER = False
# env = gym.make("CartPole-v1")
# env = gym.make("Acrobot-v1")
# env = gym.make("MountainCar-v0")
# env = WindyGridworldEnv()
# env = gym.make("LunarLander-v2")
env = SimpleGridworldEnv()
# env = MountainCarEnv()

In [3]:
# DQN Config
CONFIG_DQN = {
    "episode_length": 50,
    "max_timesteps": 5000,
    "max_time": 30 * 60,
    "eval_freq": 250, 
    "eval_episodes": 10,
    "learning_rate": 0.00075,
    "hidden_size": (32,32),
    "target_update_freq": 50,
    "batch_size": 32,
    "gamma": 0.99,
    "buffer_capacity": int(1e6),
    "plot_loss": False,
    "epsilon": 1,
    "max_deduct": 0.97,
    "decay": 0.25,
    "lr_step_size": 250,
    "lr_gamma": 0.95,
    "max_steps": 50,
    "non_param": False,
}

# Linear Config
CONFIG_LINEAR = {
    "episode_length": 50,
    "max_timesteps": 5000,
    "max_time": 30 * 60,
    "eval_freq": 250, 
    "eval_episodes": 10,
    "learning_rate": 0.02,
    "target_update_freq": 20,
    "batch_size": 32,
    "gamma": 0.99,
    "buffer_capacity": int(1e7),
    "plot_loss": False,
    "epsilon": 1,
    "max_steps": 50,
    "poly_degree": 1,
    "max_deduct": 0.97,
    "decay": 0.5,
    "lr_step_size": 250,
    "lr_gamma": 0.99,
    "non_param": False,
}

# Decision Tree Config
CONFIG_DT = {
    "episode_length": 50,
    "max_timesteps": 5000,
    "max_time": 30 * 60,
    "eval_freq": 250, 
    "eval_episodes": 10,
    "model_save_freq": 250,
    "model_save_capacity": 20,
    "update_freq": 1,
    "batch_size": 512,
    "gamma": 0.99,
    "buffer_capacity": int(1e6),
    "epsilon": 1,
    "max_deduct": 0.95,
    "decay": 0.4,
    "max_steps": 50,
    "non_param": True,
    "model_params": {"criterion":"mse","max_depth": 15, "min_samples_split": 20, "min_samples_leaf": 5},
    "feature_names": ["Cart Position", "Cart Velocity", "Pole Angle", "Pole Angular Velocity", "Action: Push Left", "Action: Push Right"],
    "plot_name": "dt_depth=8",
}

# Random Forest Config
CONFIG_RF = {
    "episode_length": 50,
    "max_timesteps": 5000,
    "max_time": 30 * 60,
    "eval_freq": 250, 
    "eval_episodes": 5,
    "model_save_freq": 250,
    "model_save_capacity": 20,
    "update_freq": 5,
    "batch_size": 512,
    "gamma": 0.99,
    "buffer_capacity": int(1e6),
    "epsilon": 1,
    "max_deduct": 0.95,
    "decay": 0.2,
    "max_steps": 50,
    "non_param": True,
    "model_params": {"n_estimators": 5,"max_depth": 15, "min_samples_split": 20, "min_samples_leaf": 5},
}

# Support Vector Regressor Config
CONFIG_SVR = {
    "episode_length": 50,
    "max_timesteps": 5000,
    "max_time": 30 * 60,
    "eval_freq": 250, 
    "eval_episodes": 10,
    "model_save_freq": 250,
    "model_save_capacity": 20,
    "update_freq": 1,
    "batch_size": 256,
    "gamma": 0.99,
    "buffer_capacity": int(1e6),
    "epsilon": 1,
    "max_deduct": 0.95,
    "decay": 0.3,
    "max_steps": 50,
    "non_param": True,
    "model_params": {"kernel":"rbf", "degree": 2, "C": 3},
}


# K-Neighbors Regressor Config
CONFIG_KNR = {
    "episode_length": 50,
    "max_timesteps": 5000,
    "max_time": 30 * 60,
    "eval_freq": 250, 
    "eval_episodes": 10,
    "model_save_freq": 250,
    "model_save_capacity": 20,
    "update_freq": 1,
    "batch_size": 256,
    "gamma": 0.99,
    "buffer_capacity": int(1e6),
    "epsilon": 1,
    "max_deduct": 0.95,
    "decay": 0.3,
    "max_steps": 50,
    "non_param": True,
    "model_params": {"n_neighbors":7, "weights": "distance", "algorithm": "auto", "leaf_size": 30},
}

# Gaussian Process Config
CONFIG_GP = {
    "episode_length": 50,
    "max_timesteps": 5000,
    "max_time": 30 * 60,
    "eval_freq": 250, 
    "eval_episodes": 10,
    "model_save_freq": 250,
    "model_save_capacity": 20,
    "update_freq": 10,
    "batch_size": 512,
    "gamma": 0.99,
    "buffer_capacity": int(1e6),
    "epsilon": 1,
    "max_deduct": 0.95,
    "decay": 0.3,
    "max_steps": 50,
    "non_param": True,
    "model_params": {"alpha": 1e-10, "normalize_y": False, "kernel":  RBF(length_scale=0.5, length_scale_bounds="fixed")},
}

# Online Gaussian Process Config
CONFIG_GP_Online = {
    "episode_length": 50,
    "max_timesteps": 5000,
    "max_time": 30 * 60,
    "eval_freq": 250, 
    "eval_episodes": 10,
    "gamma": 0.99,
    "buffer_capacity": int(1e6),
    "batch_size": 32,
    "epsilon": 1,
    "max_deduct": 0.95,
    "decay": 0.3,
    "max_steps": 50,
    "non_param": True,
    "model_params": {"sigma_0": 0.5, "init":-10, "kernel":  rbf_kernel, "epsilon_tol": 0.085, "basis_limit": 1000},
}

CONFIGS = [CONFIG_DQN, CONFIG_LINEAR, CONFIG_DT, CONFIG_RF, CONFIG_SVR, CONFIG_KNR, CONFIG_GP, CONFIG_GP_Online]
onlines = [False, False, False, False, False, False, False, True]

In [4]:
returns = []
train_returns = []
train_times = []
n_seeds=10

j=7
for i in range(n_seeds):
    print(f"\n Run: {i+1} \n")
    r, _, t, times = train(env, 
            CONFIGS[j], 
            fa=function_approximators[j], 
            agent = agents[j], 
            render=RENDER,
            online=onlines[j])
    env.close()
    returns.append(r)
    train_returns.append(t)
    train_times.append(times)
    


  1%|          | 50/5000 [00:00<00:14, 347.34it/s]
 Run: 1 

  5%|▌         | 263/5000 [00:01<00:23, 203.96it/s]Evaluation at timestep 263 returned a mean returns of -50.0
Epsilon = 0.8651
Support Points = 264
 10%|█         | 524/5000 [00:04<00:43, 103.23it/s]Evaluation at timestep 524 returned a mean returns of -50.0
Epsilon = 0.6998
Support Points = 517
 15%|█▌        | 759/5000 [00:07<00:44, 96.14it/s]Evaluation at timestep 759 returned a mean returns of -5.999999999999999
Epsilon = 0.5313333333333333
Support Points = 650
 20%|██        | 1011/5000 [00:10<00:34, 114.74it/s]Evaluation at timestep 1011 returned a mean returns of -5.999999999999999
Epsilon = 0.373
Support Points = 681
 25%|██▌       | 1257/5000 [00:12<00:51, 72.12it/s] Evaluation at timestep 1250 returned a mean returns of -5.999999999999999
Epsilon = 0.21340000000000003
Support Points = 700
 30%|███       | 1522/5000 [00:15<00:42, 81.83it/s] Evaluation at timestep 1508 returned a mean returns of -5.999999999999999
Ep

In [5]:
with open(f'simplegrid_eval_Gaussian Process Online.csv', 'ab') as eval:
    for i in range(n_seeds):
        np.savetxt(eval, [returns[i]], delimiter=',')

In [6]:
with open(f'simplegrid_train_Gaussian Process Online.csv', 'ab') as train:
    for i in range(n_seeds):
        np.savetxt(train, [train_returns[i]], delimiter=',')
        np.savetxt(train, [train_times[i]], delimiter=',')

In [23]:
n_eps = []
n_steps = []
not_solved = []
n_seeds=30

j=7
for i in range(n_seeds):
    print(f"\n Run: {i+1} \n")
    s, e, n = solve(env, 
            CONFIGS[j], 
            fa=function_approximators[j], 
            agent = agents[j],
            target_return=-6,
            op=operator.ge, 
            render=RENDER,
            online=onlines[j])
    env.close()
    n_eps.append(e)
    n_steps.append(s)
    not_solved.append(n)


 Run: 1 


 Run: 2 


 Run: 3 


 Run: 4 


 Run: 5 


 Run: 6 


 Run: 7 

Ep. timesteps: 6
Total timesteps: 220
Total episodes: 11
Evaluation mean return: -5.999999999999999

 Run: 8 


 Run: 9 


 Run: 10 


 Run: 11 


 Run: 12 


 Run: 13 


 Run: 14 


 Run: 15 

Ep. timesteps: 6
Total timesteps: 338
Total episodes: 18
Evaluation mean return: -5.999999999999999

 Run: 16 

Ep. timesteps: 6
Total timesteps: 242
Total episodes: 14
Evaluation mean return: -5.999999999999999

 Run: 17 

Ep. timesteps: 6
Total timesteps: 541
Total episodes: 28
Evaluation mean return: -5.999999999999999

 Run: 18 


 Run: 19 


 Run: 20 

Ep. timesteps: 6
Total timesteps: 317
Total episodes: 17
Evaluation mean return: -5.999999999999999

 Run: 21 

Ep. timesteps: 6
Total timesteps: 120
Total episodes: 6
Evaluation mean return: -5.999999999999999

 Run: 22 

Ep. timesteps: 6
Total timesteps: 245
Total episodes: 14
Evaluation mean return: -5.999999999999999

 Run: 23 


 Run: 24 


 Run: 25 

Ep. timest

In [24]:
with open(f'simplegrid_se-6_Gaussian Process (Online).csv', 'ab') as se:
    np.savetxt(se, [n_eps], delimiter=',')
    np.savetxt(se, [n_steps], delimiter=',')
    np.savetxt(se, [not_solved], delimiter=',')

In [25]:
mean_eps = np.mean(n_eps)
std_eps = np.std(n_eps)
print(f"Average n_eps: {mean_eps}")
print(f"Std n_eps: {std_eps}")
print(f"St.error n_eps: {std_eps/np.sqrt(n_seeds)}")

mean_steps = np.mean(n_steps)
std_steps = np.std(n_steps)
print(f"Average n_steps: {mean_steps}0")
print(f"Std n_steps: {std_steps}")
print(f"St.error n_steps: {std_steps/np.sqrt(n_seeds)}")

print(f"Not solved: {np.sum(not_solved)} runs")

Average n_eps: 75.9
Std n_eps: 38.85987647947431
St.error n_eps: 7.09481030989084
Average n_steps: 1500.83333333333330
Std n_steps: 780.0819650666347
St.error n_steps: 142.42282965665092
Not solved: 21 runs


In [32]:
times = []
for j in range(8):
        time = train_time(env, 
                CONFIGS[j], 
                fa=function_approximators[j], 
                agent = agents[j],
                online=onlines[j])
        env.close()
        times.append(time)

print(time)

100%|██████████| 2000/2000 [00:02<00:00, 707.38it/s]
  9%|▊         | 173/2000 [00:00<00:01, 1579.17it/s]-6
2004it [00:01, 1275.31it/s]                          
 27%|██▋       | 544/2000 [00:00<00:00, 5067.62it/s]-6
2004it [00:06, 304.55it/s]
 26%|██▋       | 528/2000 [00:00<00:00, 3781.12it/s]-20
2009it [00:26, 76.57it/s]
 14%|█▍        | 284/2000 [00:00<00:00, 1796.62it/s]-10
2003it [00:34, 57.24it/s]
  0%|          | 0/2000 [00:00<?, ?it/s]-20
2005it [01:24, 23.72it/s]
  0%|          | 0/2000 [00:00<?, ?it/s]-6
2017it [00:43, 46.75it/s]
  1%|          | 20/2000 [00:00<00:10, 188.37it/s]-20
100%|██████████| 2000/2000 [00:18<00:00, 108.86it/s]-20
18.38461923599243



In [33]:
with open(f'simplegrid_times.csv', 'ab') as t:
    np.savetxt(t, [times], delimiter=',')