In [2]:
import numpy as np
import random
import torch
from collections import defaultdict
import matplotlib.pyplot as plt
from dqn import DQNAgent

In [3]:
def set_seed(seed):
    """Seed every random number generator this notebook relies on.

    The same integer is handed, in order, to Python's ``random`` module,
    NumPy's legacy global generator and PyTorch.

    Args:
        seed: Integer seed applied to all three libraries.
    """
    seeders = (random.seed, np.random.seed, torch.manual_seed)
    for apply_seed in seeders:
        apply_seed(seed)

In [4]:
# Seeds iterated by the experiment loops: every swept setting is trained once per
# entry, and the per-seed reward histories are then averaged together.
SEEDS = [23, 42, 2025]

In [5]:
def average_by_position(dict1, dict2, dict3, *more_dicts):
    """Average several dicts entry-by-entry after sorting each one by key.

    Entries are matched by their *position* in key-sorted order, not by key
    equality, so the dicts may use different keys. The keys of ``dict1`` label
    the output and its length fixes how many positions are averaged; entries
    of the other dicts beyond that length are ignored.

    Args:
        dict1: Mapping whose sorted keys become the keys of the result.
        dict2: Second mapping to average.
        dict3: Third mapping to average.
        *more_dicts: Optional additional mappings (e.g. extra seeds).

    Returns:
        dict: ``{key: {'mean': ..., 'std': ...}}`` for every key of ``dict1``,
        where the statistics are ``np.mean`` / ``np.std`` (population std)
        over the values found at that position in every input dict.

    Raises:
        IndexError: If any dict after the first has fewer entries than
            ``dict1``, so some position would have no value to average.
    """
    # Sort dict1 once; it supplies both the output keys and the first column.
    first_sorted = sorted(dict1.items())
    keys = [k for k, _ in first_sorted]

    columns = [[v for _, v in first_sorted]]
    for arg_pos, other in enumerate((dict2, dict3, *more_dicts), start=2):
        vals = [v for _, v in sorted(other.items())]
        if len(vals) < len(keys):
            # Fail early with context instead of a bare "list index out of range".
            raise IndexError(
                f"dict argument #{arg_pos} has {len(vals)} entries but dict1 "
                f"has {len(keys)}; cannot average by position"
            )
        columns.append(vals)

    result = {}
    for i, k in enumerate(keys):
        # i-th value from every dict, in argument order.
        values = [col[i] for col in columns]
        result[k] = {
            'mean': np.mean(values),
            'std': np.std(values)
        }

    return result

In [6]:
#############################
# HARD UPDATE
#############################
# Sweep the epsilon-decay factor for a DQN agent on Acrobot-v1, training once per
# seed in SEEDS and averaging the reward histories across seeds.
# NOTE(review): tau=1.0 with update_freq=1000 presumably means a periodic hard
# copy of the target network — confirm against dqn.DQNAgent.

eps_decay_list = [0.99, 0.995, 0.999]
# NOTE(review): the three grids below are not referenced anywhere in this cell.
lrs = [3e-3, 1e-3, 3e-4, 1e-4]
bs = [32, 64, 128]
num_layer=[2,3,4]

results_dict = {}  # per-setting seed-averaged train/eval curves
times = []         # total_time returned by every individual run
agents1 = []       # every trained agent, in run order (setting-major, then seed)

for x in eps_decay_list:
    all_train_rewards = []
    all_eval_rewards = []
    for seed in SEEDS:
        # Label the run with the parameter actually being swept (epsilon decay),
        # not the learning rate, which is fixed at 3e-4 below.
        print("eps_decay = ", x, "seed = ", seed)
        set_seed(seed)

        dqn_agent = DQNAgent(
            env_name="Acrobot-v1",
            hidden_dim=128,
            episodes=1400,
            batch_size=64,
            num_steps=500,
            num_layers=4,
            learning_rate=3e-4,
            eval_interval=5,
            epsilon=1.0,
            tau=1.0,
            gamma=0.99,
            epsilon_min=0.05,
            epsilon_decay=x,
            update_freq=1000
        )

        # The histories are passed to average_by_position below, which calls
        # .items() on them, so they must be dict-like.
        rewards_history, eval_rewards_history, total_time = dqn_agent.train_with_seed(seed=seed)

        all_train_rewards.append(rewards_history)
        all_eval_rewards.append(eval_rewards_history)
        agents1.append(dqn_agent)
        times.append(total_time)

    # One history per seed. Unpacking the whole list (rather than indexing
    # [0], [1], [2]) means a change to SEEDS cannot silently drop a run.
    avg_train_rewards = average_by_position(*all_train_rewards)
    avg_eval_rewards = average_by_position(*all_eval_rewards)
    print("eval: ", len(avg_eval_rewards))

    # NOTE(review): the key prefix says "LR" although x is an epsilon-decay value.
    # It is left unchanged because the save cells interpolate this key into the
    # output filenames.
    results_dict[f"LR={x}"] = {
        "avg_train_rewards": avg_train_rewards,
        "avg_eval_rewards": avg_eval_rewards
    }

LR =  0.99 seed =  23
LR =  0.99 seed =  42
LR =  0.99 seed =  2025
eval:  280
LR =  0.995 seed =  23
LR =  0.995 seed =  42
LR =  0.995 seed =  2025
eval:  280
LR =  0.999 seed =  23
LR =  0.999 seed =  42
LR =  0.999 seed =  2025
eval:  280


In [9]:
# One line per hard-update agent, in the order the runs were appended to agents1:
# its best_reward_mean followed by its best_reward_std.
for idx, agent in enumerate(agents1):
    print(agent.best_reward_mean, "std", agent.best_reward_std)

-65.8 std 3.7094473981982814
-64.1 std 5.156549233741495
-68.7 std 6.466065264130884
-66.95 std 3.6806928695559487
-64.3 std 4.371498598878878
-66.7 std 6.805145112339633
-66.8 std 7.865112840894274
-67.2 std 7.858753081755401
-65.45 std 6.224748990923249


In [None]:
# Write the seed-averaged evaluation curves of the hard-update sweep to disk, one
# .npy file per results_dict entry (the entry key is embedded in the filename).
# The saved value is a plain dict, so NumPy pickles it; reading it back requires
# np.load(..., allow_pickle=True).
for eps_decay in results_dict:
    results = results_dict[eps_decay]
    filename = f'mc_avg_eval_rewards_{eps_decay}_hard_update_300_4.npy'
    np.save(file=filename, arr=results["avg_eval_rewards"], allow_pickle=True)

In [None]:
# Write the seed-averaged training curves of the hard-update sweep to disk, one
# .npy file per results_dict entry (the entry key is embedded in the filename).
# The saved value is a plain dict, so NumPy pickles it; reading it back requires
# np.load(..., allow_pickle=True).
for eps_decay in results_dict:
    results = results_dict[eps_decay]
    filename = f'mc_avg_train_rewards_{eps_decay}_hard_update_300_4.npy'
    np.save(file=filename, arr=results["avg_train_rewards"], allow_pickle=True)

In [10]:
#############################
# SOFT UPDATE
#############################
# Sweep the epsilon-decay factor for a DQN agent on MountainCar-v0, training once
# per seed in SEEDS and averaging the reward histories across seeds.
# NOTE(review): tau=0.05 with no update_freq presumably selects a soft (Polyak)
# target-network update — confirm against dqn.DQNAgent.

eps_decay_list = [0.99]
# NOTE(review): the three grids below are not referenced anywhere in this cell.
lrs = [3e-3, 1e-3, 3e-4, 1e-4]
bs = [32, 64, 128]
num_layers = [2,3,4]

tresults = {}  # per-setting seed-averaged train/eval curves
times2 = []    # total_time returned by every individual run
agents2 = []   # every trained agent, in run order (setting-major, then seed)

for x in eps_decay_list:
    all_train_rewards = []
    all_eval_rewards = []
    for seed in SEEDS:
        # Label the run with the parameter actually being swept (epsilon decay),
        # not the learning rate, which is fixed at 3e-4 below.
        print("eps_decay = ", x, "seed = ", seed)
        set_seed(seed)

        dqn_agent = DQNAgent(
            env_name="MountainCar-v0",
            hidden_dim=128,
            episodes=1300,
            batch_size=64,
            num_steps=200,
            num_layers=3,
            learning_rate=3e-4,
            eval_interval=5,
            epsilon=1.0,
            tau=0.05,
            gamma=0.99,
            epsilon_min=0.05,
            epsilon_decay=x
        )

        # The histories are passed to average_by_position below, which calls
        # .items() on them, so they must be dict-like.
        rewards_history, eval_rewards_history, total_time = dqn_agent.train_with_seed(seed=seed)

        all_train_rewards.append(rewards_history)
        all_eval_rewards.append(eval_rewards_history)
        agents2.append(dqn_agent)
        times2.append(total_time)

    # One history per seed. Unpacking the whole list (rather than indexing
    # [0], [1], [2]) means a change to SEEDS cannot silently drop a run.
    avg_train_rewards = average_by_position(*all_train_rewards)
    avg_eval_rewards = average_by_position(*all_eval_rewards)
    print("eval: ", len(avg_eval_rewards))

    # NOTE(review): the key prefix says "LR" although x is an epsilon-decay value.
    # It is left unchanged because the save cells interpolate this key into the
    # output filenames.
    tresults[f"LR={x}"] = {
        "avg_train_rewards": avg_train_rewards,
        "avg_eval_rewards": avg_eval_rewards
    }

LR =  0.99 seed =  23
LR =  0.99 seed =  42
LR =  0.99 seed =  2025
eval:  260


In [11]:
# One line per soft-update agent, in the order the runs were appended to agents2:
# its best_reward_mean followed by its best_reward_std.
for idx, agent in enumerate(agents2):
    print(agent.best_reward_mean, "std", agent.best_reward_std)

-99.5 std 11.390785749894517
-103.0 std 11.117553687749837
-103.65 std 10.135457562438905


In [None]:
# Write the seed-averaged evaluation curves of the soft-update sweep to disk, one
# .npy file per tresults entry. The saved value is a plain dict, so NumPy pickles
# it; reading it back requires np.load(..., allow_pickle=True).
for eps_decay, results in tresults.items():
    # Result keys have the form "<label>=<value>". Use the value part so the
    # filename names the epsilon-decay that produced the data rather than a
    # hard-coded 0.995, and so multiple settings do not overwrite one file.
    eps_value = eps_decay.split("=", 1)[-1]
    filename = f'mountain_Car_eps_{eps_value}_soft.npy'
    np.save(filename, results["avg_eval_rewards"], allow_pickle=True)

In [None]:
# Write the seed-averaged training curves of the soft-update sweep to disk, one
# .npy file per tresults entry (the entry key is embedded in the filename).
# The saved value is a plain dict, so NumPy pickles it; reading it back requires
# np.load(..., allow_pickle=True).
for eps_decay in tresults:
    results = tresults[eps_decay]
    filename = f'mc_avg_train_rewards_{eps_decay}_soft_update_4.npy'
    np.save(file=filename, arr=results["avg_train_rewards"], allow_pickle=True)