In [1]:
import random

import gym
import numpy as np

from custom_envs.gridworlds import WindyGridworldEnv, SimpleGridworldEnv
from function_approximators.function_approximators import NeuralNetwork, LinearModel, DecisionTree, RandomForest, ExtraTrees, SupportVectorRegressor
from train_utils import train
from agents.agents import DQNAgent, LinearAgent, NonParametricAgent


In [2]:
# Model classes and the agent class used with each one. The two lists are
# index-aligned: the experiment cells pick entry k from both lists together.
function_approximators = [
    NeuralNetwork,
    LinearModel,
    DecisionTree,
    RandomForest,
    ExtraTrees,
    SupportVectorRegressor,
]
agents = [DQNAgent, LinearAgent] + [NonParametricAgent] * 4

# Forwarded to train() as its `render` argument.
RENDER = False

# Environment shared by every experiment cell. Alternatives to swap in:
#   gym.make("CartPole-v1")
#   gym.make("Acrobot-v1")
#   gym.make("MountainCar-v0")
#   gym.make("LunarLander-v2")
#   SimpleGridworldEnv()
env = WindyGridworldEnv()

In [5]:

# Decision-tree experiment: run train() once per seed on the shared `env`
# with function_approximators[2] / agents[2], then report the mean and
# standard deviation of the last entry of each run's first return value.

# Decision Tree Config
CONFIG = {
    "episode_length": 200,
    "max_timesteps": 20000,
    "max_time": 30 * 60,
    "eval_freq": 1000,
    "eval_episodes": 10,
    "model_save_freq": 1000,
    "model_save_capacity": 20,
    "batch_size": 128,
    "gamma": 0.99,
    "buffer_capacity": int(1e7),
    "epsilon": 1,
    "max_deduct": 0.95,
    "decay": 0.5,
    "max_steps": 200,
    "non_param": True,
    "model_params": {"max_depth": 100, "min_samples_split": 20, "min_samples_leaf": 5},
}

returns = []
n_seeds = 3

for seed in range(n_seeds):
    # Seed Python's and NumPy's global RNGs so each run is repeatable and the
    # runs differ from one another.
    # NOTE(review): RNGs owned by the environment or by other libraries used
    # inside train() are not covered here -- seed those in train() if needed.
    random.seed(seed)
    np.random.seed(seed)

    print(f"\n Run: {seed+1} \n")
    r, _ = train(
        env,
        CONFIG,
        fa=function_approximators[2],
        agent=agents[2],
        render=RENDER,
    )
    # Keep only the final entry of the run's result sequence.
    # presumably `r` is the history of evaluation returns -- confirm in train_utils.
    returns.append(r[-1])

# Close the shared environment once, after the last run, so that no run in
# this cell trains on an environment that has already been closed.
env.close()

print(f"Mean returns: {np.mean(returns)}")
print(f"Std returns: {np.std(returns)}")

  1%|          | 200/20000 [00:00<00:18, 1076.70it/s]
 Run: 1 

  5%|▌         | 1000/20000 [00:02<00:27, 693.95it/s]Evaluation at timestep 1000 returned a mean returns of -200.0
Epsilon = 0.924
 10%|█         | 2000/20000 [00:04<00:34, 517.13it/s]Evaluation at timestep 2000 returned a mean returns of -200.0
Epsilon = 0.829
 15%|█▌        | 3000/20000 [00:08<00:42, 399.44it/s]Evaluation at timestep 3000 returned a mean returns of -200.0
Epsilon = 0.734
 20%|██        | 4000/20000 [00:12<00:48, 330.29it/s]Evaluation at timestep 4000 returned a mean returns of -200.0
Epsilon = 0.639
 25%|██▌       | 5000/20000 [00:18<00:57, 263.02it/s]Evaluation at timestep 5000 returned a mean returns of -200.0
Epsilon = 0.544
 30%|███       | 6000/20000 [00:24<01:01, 228.19it/s]Evaluation at timestep 6000 returned a mean returns of -200.0
Epsilon = 0.44900000000000007
 35%|███▌      | 7000/20000 [00:31<01:05, 199.45it/s]Evaluation at timestep 7000 returned a mean returns of -200.0
Epsilon = 0.354
 40%|

KeyboardInterrupt: 

In [None]:
# DQN experiment: run train() once per seed on the shared `env` with
# function_approximators[0] / agents[0], then report the mean and standard
# deviation of the last entry of each run's first return value.

# DQN Config
CONFIG = {
    "episode_length": 200,
    "max_timesteps": 20000,
    "max_time": 30 * 60,
    "eval_freq": 1000,
    "eval_episodes": 10,
    "learning_rate": 0.001,
    "hidden_size": (16,32),
    "target_update_freq": 200,
    "batch_size": 32,
    "gamma": 0.99,
    "buffer_capacity": int(1e7),
    "plot_loss": False,
    "epsilon": 1,
    "max_deduct": 0.97,
    "decay": 0.5,
    "lr_step_size": 1000,
    "lr_gamma": 0.99,
    "max_steps": 200,
    "non_param": False,
}

returns = []
n_seeds = 3

for seed in range(n_seeds):
    # Seed Python's and NumPy's global RNGs so each run is repeatable and the
    # runs differ from one another.
    # NOTE(review): RNGs owned by the environment or by other libraries used
    # inside train() (e.g. a deep-learning framework) are not covered here --
    # seed those in train() if needed.
    random.seed(seed)
    np.random.seed(seed)

    print(f"\n Run: {seed+1} \n")
    r, _ = train(
        env,
        CONFIG,
        fa=function_approximators[0],
        agent=agents[0],
        render=RENDER,
    )
    # Keep only the final entry of the run's result sequence.
    # presumably `r` is the history of evaluation returns -- confirm in train_utils.
    returns.append(r[-1])

# Close the shared environment once, after the last run, so that no run in
# this cell trains on an environment that has already been closed.
env.close()

print(f"Mean returns: {np.mean(returns)}")
print(f"Std returns: {np.std(returns)}")


In [3]:
# Linear-model experiment: run train() once per seed on the shared `env` with
# function_approximators[1] / agents[1], then report the mean and standard
# deviation of the last entry of each run's first return value.

# Linear Config
CONFIG = {
    "episode_length": 200,
    "max_timesteps": 20000,
    "max_time": 30 * 60,
    "eval_freq": 1000,
    "eval_episodes": 5,
    "learning_rate": 0.005,
    "target_update_freq": 200,
    "batch_size": 32,
    "gamma": 0.99,
    "buffer_capacity": int(1e7),
    "plot_loss": False,
    "epsilon": 1,
    "max_steps": 200,
    "poly_degree": 1,
    "max_deduct": 0.95,
    "decay": 0.5,
    "lr_step_size": 1000,
    "lr_gamma": 0.99,
    "non_param": False,
}

returns = []
n_seeds = 3

for seed in range(n_seeds):
    # Seed Python's and NumPy's global RNGs so each run is repeatable and the
    # runs differ from one another.
    # NOTE(review): RNGs owned by the environment or by other libraries used
    # inside train() are not covered here -- seed those in train() if needed.
    random.seed(seed)
    np.random.seed(seed)

    print(f"\n Run: {seed+1} \n")
    r, _ = train(
        env,
        CONFIG,
        fa=function_approximators[1],
        agent=agents[1],
        render=RENDER,
    )
    # Keep only the final entry of the run's result sequence.
    # presumably `r` is the history of evaluation returns -- confirm in train_utils.
    returns.append(r[-1])

# Close the shared environment once, after the last run, so that no run in
# this cell trains on an environment that has already been closed.
env.close()

print(f"Mean returns: {np.mean(returns)}")
print(f"Std returns: {np.std(returns)}")

  0%|          | 0/20000 [00:00<?, ?it/s]
 Run: 1 

q_loss: 0.6585001945495605
  5%|▌         | 1072/20000 [00:03<00:43, 431.99it/s]q_loss: 0.5454514026641846
Evaluation at timestep 1072 returned a mean returns of -5.0
Epsilon = 0.911745
Learning rate = 0.00495
 10%|█         | 2003/20000 [00:04<00:33, 543.53it/s]Evaluation at timestep 2003 returned a mean returns of -5.0
Epsilon = 0.81323
Learning rate = 0.00495
q_loss: 0.024255715310573578
 15%|█▌        | 3063/20000 [00:07<00:43, 389.01it/s]Evaluation at timestep 3014 returned a mean returns of -5.0
Epsilon = 0.71557
Learning rate = 0.0049005
q_loss: 0.00022447688388638198
 20%|██        | 4069/20000 [00:09<00:38, 412.51it/s]Evaluation at timestep 4013 returned a mean returns of -5.0
Epsilon = 0.620285
Learning rate = 0.004851495
q_loss: 7.4903960012306925e-06
 25%|██▌       | 5061/20000 [00:11<00:35, 419.79it/s]Evaluation at timestep 5002 returned a mean returns of -5.0
Epsilon = 0.5270900000000001
Learning rate = 0.00480298005
q_l

KeyboardInterrupt: 