In [1]:
## Imports

import numpy as np
from custom_envs.gridworlds import WindyGridworldEnv

from function_approximators.function_approximators import NeuralNetwork, LinearModel, DecisionTree, RandomForest, SupportVectorRegressor, KNeighboursRegressor, GaussianProcess, OnlineGaussianProcess
from sklearn.gaussian_process.kernels import RBF
from sklearn.metrics.pairwise import rbf_kernel

from utils.train_utils import train, solve, train_time
from agents.agents import DQNAgent, LinearAgent, FQIAgent, OnlineGaussianProccessAgent
import operator


In [2]:
## Environment

function_approximators = [NeuralNetwork, LinearModel, DecisionTree, RandomForest, SupportVectorRegressor, KNeighboursRegressor, GaussianProcess, OnlineGaussianProcess]

agents = [DQNAgent, LinearAgent, *[FQIAgent]*5, OnlineGaussianProccessAgent]

RENDER = False
# env = HybridGridworldEnv()
env = WindyGridworldEnv()

environment = "windygrid"

In [3]:
## Configuration Files

# DQN Config
CONFIG_DQN = {
    "episode_length": 500,
    "max_timesteps": 20000,
    "max_time": 30 * 60,
    "eval_freq": 1000, 
    "eval_episodes": 10,
    "learning_rate": 0.0007,
    "hidden_size": (64,64),
    "target_update_freq": 200,
    "batch_size": 32,
    "gamma": 0.99,
    "buffer_capacity": int(1e6),
    "plot_loss": False,
    "epsilon": 1,
    "max_deduct": 0.97,
    "decay": 0.3,
    "lr_step_size": 1000,
    "lr_gamma": 0.95,
    "max_steps": 500,
    "non_param": False,
}

# Linear Config
CONFIG_LINEAR = {
    "episode_length": 500,
    "max_timesteps": 20000,
    "max_time": 30 * 60,
    "eval_freq": 1000, 
    "eval_episodes": 10,
    "learning_rate": 0.02,
    "target_update_freq": 50,
    "batch_size": 32,
    "gamma": 0.99,
    "buffer_capacity": int(1e7),
    "plot_loss": False,
    "epsilon": 1,
    "max_steps": 500,
    "poly_degree": 1,
    "max_deduct": 0.97,
    "decay": 0.5,
    "lr_step_size": 1000,
    "lr_gamma": 0.99,
    "non_param": False,
}

# Decision Tree Config
CONFIG_DT = {
    "episode_length": 500,
    "max_timesteps": 20000,
    "max_time": 30 * 60,
    "eval_freq": 1000, 
    "eval_episodes": 10,
    "model_save_freq": 1000,
    "model_save_capacity": 20,
    "update_freq": 1,
    "batch_size": 128,
    "gamma": 0.99,
    "buffer_capacity": int(1e6),
    "epsilon": 1,
    "max_deduct": 0.9,
    "decay": 0.4,
    "max_steps": 500,
    "non_param": True,
    "model_params": {"criterion":"mse","max_depth": 100, "min_samples_split": 2, "min_samples_leaf": 1},
    "feature_names": ["Cart Position", "Cart Velocity", "Pole Angle", "Pole Angular Velocity", "Action: Push Left", "Action: Push Right"],
    "plot_name": "dt_depth=8",
}

# Random Forest Config
CONFIG_RF = {
    "episode_length": 500,
    "max_timesteps": 20000,
    "max_time": 30 * 60,
    "eval_freq": 1000, 
    "eval_episodes": 10,
    "model_save_freq": 5000,
    "model_save_capacity": 20,
    "update_freq": 1,
    "batch_size": 128,
    "gamma": 0.99,
    "buffer_capacity": int(1e6),
    "epsilon": 1,
    "max_deduct": 0.9,
    "decay": 0.4,
    "max_steps": 500,
    "non_param": True,
    "model_params": {"n_estimators": 5,"max_depth": 100, "min_samples_split": 2, "min_samples_leaf": 1},
}

# Support Vector Regressor Config
CONFIG_SVR = {
    "episode_length": 500,
    "max_timesteps": 20000,
    "max_time": 30 * 60,
    "eval_freq": 1000, 
    "eval_episodes": 1,
    "model_save_freq": 1000,
    "model_save_capacity": 20,
    "update_freq": 1,
    "batch_size": 64,
    "gamma": 0.99,
    "buffer_capacity": 512,
    "epsilon": 1,
    "max_deduct": 0.95,
    "decay": 0.4,
    "max_steps": 500,
    "non_param": True,
    "model_params": {"kernel":"rbf", "degree": 2, "C": 1},
}


# K-Neighbors Regressor Config
CONFIG_KNR = {
    "episode_length": 500,
    "max_timesteps": 20000,
    "max_time": 30 * 60,
    "eval_freq": 1000, 
    "eval_episodes": 10,
    "model_save_freq": 1000,
    "model_save_capacity": 20,
    "update_freq": 1,
    "batch_size": 64,
    "gamma": 0.99,
    "buffer_capacity": int(1e6),
    "epsilon": 1,
    "max_deduct": 0.9,
    "decay": 0.4,
    "max_steps": 500,
    "non_param": True,
    "model_params": {"n_neighbors":5, "weights": "distance", "algorithm": "auto", "leaf_size": 30},
}

# Gaussian Process Config
CONFIG_GP = {
    "episode_length": 500,
    "max_timesteps": 20000,
    "max_time": 30 * 60,
    "eval_freq": 1000, 
    "eval_episodes": 10,
    "model_save_freq": 1000,
    "model_save_capacity": 20,
    "update_freq": 10,
    "batch_size": 64,
    "gamma": 0.99,
    "buffer_capacity": int(1e6),
    "epsilon": 1,
    "max_deduct": 0.9,
    "decay": 0.4,
    "max_steps": 500,
    "non_param": True,
    "model_params": {"alpha": 1e-10, "normalize_y": False, "kernel":  RBF(length_scale=0.5, length_scale_bounds="fixed")},
}

# Online Gaussian Process Config
CONFIG_GP_Online = {
    "episode_length": 500,
    "max_timesteps": 20000,
    "max_time": 30 * 60,
    "eval_freq": 1000, 
    "eval_episodes": 10,
    "gamma": 0.99,
    "buffer_capacity": int(1e6),
    "batch_size": 32,
    "epsilon": 1,
    "max_deduct": 0.9,
    "decay": 0.4,
    "max_steps": 500,
    "non_param": True,
    "model_params": {"sigma_0": 0.5, "init":-200, "kernel":  rbf_kernel, "epsilon_tol": 0.045, "basis_limit": 1000},
}

CONFIGS = [CONFIG_DQN, CONFIG_LINEAR, CONFIG_DT, CONFIG_RF, CONFIG_SVR, CONFIG_KNR, CONFIG_GP, CONFIG_GP_Online]
onlines = [False, False, False, False, False, False, False, True]
models = ["Neural Network", "Linear Model", "Decision Tree", "Random Forest", "Support Vectors", "K-Neighbours", "Gaussian Process", "Gaussian Process Online"]

In [5]:
## Performance Evaluation

returns = []
train_returns = []
train_times = []
n_seeds=30

j=2
for i in range(n_seeds):
    print(f"\n Run: {i+1} \n")
    r, _, t, times = train(env, 
            CONFIGS[j], 
            fa=function_approximators[j], 
            agent = agents[j], 
            render=RENDER,
            online=onlines[j],
            threshold=0)
    env.close()
    returns.append(r)
    train_returns.append(t)
    train_times.append(times)
    


  0%|          | 0/20000 [00:00<?, ?it/s]


 Run: 1 



  5%|▌         | 1000/20000 [00:05<01:31, 206.95it/s]

Evaluation at timestep 1000 returned a mean returns of -500.0
Epsilon = 0.94375
Replay Buffer count: 1220


 10%|█         | 2034/20000 [00:14<02:32, 117.80it/s]

Evaluation at timestep 2034 returned a mean returns of -15.9999
Epsilon = 0.7872625
Replay Buffer count: 1490


 16%|█▌        | 3145/20000 [00:28<03:20, 84.19it/s]

Evaluation at timestep 3145 returned a mean returns of -15.9999
Epsilon = 0.6629499999999999
Replay Buffer count: 1720


 20%|██        | 4025/20000 [00:39<03:35, 74.02it/s]

Evaluation at timestep 4025 returned a mean returns of -15.9999
Epsilon = 0.5505625
Replay Buffer count: 1800


 25%|██▌       | 5002/20000 [00:53<03:33, 70.13it/s]

Evaluation at timestep 5002 returned a mean returns of -15.9999
Epsilon = 0.439975
Replay Buffer count: 1900


 30%|███       | 6010/20000 [01:08<03:21, 69.26it/s]

Evaluation at timestep 6010 returned a mean returns of -15.9999
Epsilon = 0.3257874999999999
Replay Buffer count: 1920


 35%|███▌      | 7002/20000 [01:23<03:14, 66.80it/s]

Evaluation at timestep 7002 returned a mean returns of -15.9999
Epsilon = 0.21429999999999993
Replay Buffer count: 1960


 40%|████      | 8005/20000 [01:38<03:01, 66.06it/s]

Evaluation at timestep 8005 returned a mean returns of -15.9999
Epsilon = 0.10123749999999998
Replay Buffer count: 1960


 45%|████▌     | 9005/20000 [01:54<02:47, 65.75it/s]

Evaluation at timestep 9005 returned a mean returns of -15.9999
Epsilon = 0.09999999999999998
Replay Buffer count: 1960


 50%|█████     | 10006/20000 [02:09<02:34, 64.58it/s]

Evaluation at timestep 10006 returned a mean returns of -15.9999
Epsilon = 0.09999999999999998
Replay Buffer count: 1960


 51%|█████     | 10117/20000 [02:12<02:08, 76.62it/s]


KeyboardInterrupt: 

In [5]:
with open(f'windygrid_eval_{models[j]}.csv', 'ab') as eval:
    for i in range(n_seeds):
        np.savetxt(eval, [returns[i]], delimiter=',')

In [6]:
with open(f'windygrid_train_{models[j]}.csv', 'ab') as train:
    for i in range(n_seeds):
        np.savetxt(train, [train_returns[i]], delimiter=',')
        np.savetxt(train, [train_times[i]], delimiter=',')

In [5]:
## Sample Efficiency Evaluation

n_eps = []
n_steps = []
not_solved = []
n_seeds=30

j=3
for i in range(n_seeds):
    print(f"\n Run: {i+1} \n")
    s, e, n = solve(env, 
            CONFIGS[j], 
            fa=function_approximators[j], 
            agent = agents[j],
            target_return=-16,
            op=operator.ge, 
            render=RENDER,
            online=onlines[j],
            threshold=0)
    env.close()
    n_eps.append(e)
    n_steps.append(s)
    not_solved.append(n)


 Run: 1 

Ep. timesteps: 16
Total timesteps: 2135
Total episodes: 6
Evaluation mean return: -15.9999

 Run: 2 

Ep. timesteps: 15
Total timesteps: 4444
Total episodes: 10
Evaluation mean return: -14.999900000000002

 Run: 3 

Ep. timesteps: 16
Total timesteps: 3372
Total episodes: 8
Evaluation mean return: -15.9999

 Run: 4 

Ep. timesteps: 16
Total timesteps: 4397
Total episodes: 10
Evaluation mean return: -15.9999

 Run: 5 


 Run: 6 


 Run: 7 

Ep. timesteps: 16
Total timesteps: 1966
Total episodes: 5
Evaluation mean return: -15.9999

 Run: 8 

Ep. timesteps: 16
Total timesteps: 1745
Total episodes: 4
Evaluation mean return: -15.9999

 Run: 9 

Ep. timesteps: 16
Total timesteps: 2573
Total episodes: 6
Evaluation mean return: -15.9999

 Run: 10 

Ep. timesteps: 16
Total timesteps: 1587
Total episodes: 4
Evaluation mean return: -15.9999

 Run: 11 

Ep. timesteps: 15
Total timesteps: 1629
Total episodes: 4
Evaluation mean return: -14.999900000000002

 Run: 12 

Ep. timesteps: 16
Tota

In [6]:
with open(f'{environment}_se_{models[j]}.csv', 'ab') as se:
    np.savetxt(se, [n_eps], delimiter=',')
    np.savetxt(se, [n_steps], delimiter=',')
    np.savetxt(se, [not_solved], delimiter=',')

In [22]:
mean_eps = np.mean(n_eps)
std_eps = np.std(n_eps)
print(f"Average n_eps: {mean_eps}")
print(f"Std n_eps: {std_eps}")
print(f"St.error n_eps: {std_eps/np.sqrt(n_seeds)}")

mean_steps = np.mean(n_steps)
std_steps = np.std(n_steps)
print(f"Average n_steps: {mean_steps}0")
print(f"Std n_steps: {std_steps}")
print(f"St.error n_steps: {std_steps/np.sqrt(n_seeds)}")

print(f"Not solved: {np.sum(not_solved)} runs")

Average n_eps: 21.633333333333333
Std n_eps: 27.45721924902245
St.error n_eps: 5.0129794496848845
Average n_steps: 900.60
Std n_steps: 1383.9435826651315
St.error n_steps: 252.67237284673604
Not solved: 3 runs


In [6]:
## Training time

times = []
for j in range(2,8):
        time = train_time(env, 
                CONFIGS[j], 
                fa=function_approximators[j], 
                agent = agents[j],
                online=onlines[j],
                threshold=0)
        env.close()
        times.append(time)

print(time)

20466it [00:33, 608.90it/s]
  0%|          | 0/20000 [00:00<?, ?it/s]

-500


100%|██████████| 20000/20000 [02:58<00:00, 111.84it/s]
  0%|          | 0/20000 [00:00<?, ?it/s]

-500


100%|██████████| 20000/20000 [00:24<00:00, 807.87it/s]
  0%|          | 0/20000 [00:00<?, ?it/s]

-500


20036it [02:14, 149.31it/s]
  0%|          | 0/20000 [00:00<?, ?it/s]

-500


20130it [00:25, 785.00it/s]
  0%|          | 0/20000 [00:00<?, ?it/s]

-499.9996000000001


100%|██████████| 20000/20000 [04:19<00:00, 77.06it/s]

-500
259.5355157852173





In [7]:
with open(f'{environment}_times.csv', 'ab') as t:
    np.savetxt(t, [times], delimiter=',')