In [None]:
import numpy as np
from collections import namedtuple, deque
import datetime
import gym
import glob
import io
import base64
import matplotlib.pyplot as plt
import tensorflow as tf
from PIL import Image
import tensorflow_probability as tfp
import seaborn as sbn
import os

In [None]:
# Mount Google Drive at /content/drive (requires the Colab-only `google.colab`
# module); the PATH used later for saving plots and configs lives under this mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## One-Step Actor-Critic Algorithm

**Actor-Critic methods** learn both a policy $\pi(a|s;\theta)$ and a state-value function $v(s;w)$ simultaneously. The policy is referred to as the actor that suggests actions given a state. The estimated value function is referred to as the critic. It evaluates actions taken by the actor based on the given policy. In this exercise, both functions are approximated by feedforward neural networks. 

- The policy network is parametrized by $\theta$ - it takes a state $s$ as input and outputs the probabilities $\pi(a|s;\theta)\ \forall\ a$
- The value network is parametrized by $w$ - it takes a state $s$ as input and outputs a scalar value associated with the state, i.e., $v(s;w)$
- The single step TD error can be defined as follows:
$$\delta_t  = R_{t+1} + \gamma v(s_{t+1};w) - v(s_t;w)$$
- The loss function to be minimized at every step ($L_{tot}^{(t)}$) is a summation of two terms, as follows:
$$L_{tot}^{(t)} = L_{actor}^{(t)} + L_{critic}^{(t)}$$
where,
$$L_{actor}^{(t)} = -\log\pi(a_t|s_t; \theta)\delta_t$$
$$L_{critic}^{(t)} = \delta_t^2$$
- **NOTE: Here, weights of the first two hidden layers are shared by the policy and the value network**
    - First two hidden layer sizes: [1024, 512]
    - Output size of policy network: 2 (Softmax activation)
    - Output size of value network: 1 (Linear activation)

<!-- $$\pi(a|s;\theta) = \phi_{\theta}(a,s)$$ -->

### Actor-Critic Network

In [None]:
# Default hyperparameters. These are module-level globals that the model, agent
# and training loop read at run time; the experiment cells further down
# reassign them before each set of runs.
NUM_HIDDEN = 2  # number of shared hidden layers built by ActorCriticModel
HIDDEN_SIZES = [1024, 512]  # width of each shared hidden layer (indexed 0..NUM_HIDDEN-1)
LR = 1e-4  # learning rate passed to the Adam optimizer in Agent
NUM_EPISODES = 1800  # episodes per training run in run_ac_net
AVG_NUM = 50  # window length used for the running-average reward curves
SOLVE_CUTOFF = 195  # 100-episode average reward that must be exceeded to count as solved

class ActorCriticModel(tf.keras.Model):
    """
    Policy and value networks built on a shared stack of hidden layers.

    The number and widths of the hidden layers are taken from the module-level
    NUM_HIDDEN and HIDDEN_SIZES at construction time.
    """
    def __init__(self, action_size):
        super().__init__()

        # Shared trunk: NUM_HIDDEN fully connected ReLU layers.
        self.fcls = [
            tf.keras.layers.Dense(HIDDEN_SIZES[idx], activation='relu')
            for idx in range(NUM_HIDDEN)
        ]

        # Policy head: one softmax probability per action.
        self.pi_out = tf.keras.layers.Dense(action_size, activation='softmax')
        # Value head: a single linear unit giving the state-value estimate.
        self.v_out = tf.keras.layers.Dense(1)

    def call(self, state):
        """
        Return the policy distribution and the state-value for the given state
        """
        features = state
        for layer in self.fcls:
            features = layer(features)

        policy = self.pi_out(features)
        value = self.v_out(features)
        return policy, value

### Agent Class



In [None]:
class Agent:
    """
    One-step actor-critic agent.

    Wraps an ActorCriticModel (shared hidden layers, policy head and value
    head) together with an Adam optimizer, and exposes action sampling and a
    per-transition update.
    """
    def __init__(self, action_size, gamma=0.99):
        """
        action_size: number of discrete actions (size of the policy output)
        gamma: discount factor used in the TD error
        """
        self.gamma = gamma
        self.ac_model = ActorCriticModel(action_size=action_size)
        # compile() is used here only to attach the optimizer to the model;
        # the update itself is done manually in learn().
        self.ac_model.compile(tf.keras.optimizers.Adam(learning_rate=LR))

    def sample_action(self, state):
        """
        Given a state, compute the policy distribution over all actions and sample one action

        The state is expected to be a batch containing a single state (callers
        reshape observations to (1, -1)); the sampled action is returned as a
        Python int.
        """
        pi, _ = self.ac_model(state)

        action_probabilities = tfp.distributions.Categorical(probs=pi)
        sample = action_probabilities.sample()
        return int(sample.numpy()[0])

    def actor_loss(self, action, pi, delta):
        """
        Compute Actor Loss: -log pi(a|s) * delta for the action that was taken
        """
        return -tf.math.log(pi[0, action]) * delta

    def critic_loss(self, delta):
        """
        Critic loss aims to minimize TD error (squared TD error)
        """
        return delta**2

    @tf.function
    def learn(self, state, action, reward, next_state, done):
        """
        For a given transition (s,a,s',r) update the parameters by computing the
        gradient of the total loss (actor loss + critic loss)

        done: whether next_state ended the episode. The value of a terminal
        state is zero by definition, so the bootstrap term is dropped when
        done is true instead of using the network's estimate for next_state.
        """
        # The tape is used for exactly one gradient() call, so it does not
        # need to be persistent (a persistent tape keeps its resources alive
        # until it is explicitly deleted).
        with tf.GradientTape() as tape:
            pi, V_s = self.ac_model(state)
            _, V_s_next = self.ac_model(next_state)

            V_s = tf.squeeze(V_s)
            V_s_next = tf.squeeze(V_s_next)

            # 1.0 for a non-terminal transition, 0.0 for a terminal one.
            not_terminal = 1.0 - tf.cast(done, V_s_next.dtype)

            # One-step TD error: r + gamma * v(s') - v(s), with v(s') = 0 at termination.
            delta = reward + (self.gamma * not_terminal * V_s_next) - V_s
            loss_a = self.actor_loss(action, pi, delta)
            loss_c = self.critic_loss(delta)
            loss_total = loss_a + loss_c

        gradient = tape.gradient(loss_total, self.ac_model.trainable_variables)
        self.ac_model.optimizer.apply_gradients(zip(gradient, self.ac_model.trainable_variables))


### Code to train the agent

In [None]:
def run_ac_net(env, tf_seed=42):
    """
    Do 1 run of the AC-agent in an environment for NUM_EPISODES episodes.

    env: gym environment using the old API (reset() returns the observation,
         step() returns a 4-tuple)
    tf_seed: seed passed to tf.keras.utils.set_random_seed before the agent is built

    Returns (reward_list, step_list, ep_count):
        reward_list: total reward of every episode
        step_list: number of steps of every episode
        ep_count: first episode at which the average reward over the last 100
                  episodes exceeded SOLVE_CUTOFF, or NUM_EPISODES if that
                  never happened
    """
    # Reset and seed BEFORE constructing the agent so that everything created
    # for this run (layers, initializers, sampling) comes after the seed is set.
    tf.compat.v1.reset_default_graph()
    tf.keras.utils.set_random_seed(tf_seed)
    agent = Agent(action_size=env.action_space.n)

    solve_window = 100  # number of consecutive episodes averaged for the solve check

    reward_list = []
    step_list = []
    begin_time = datetime.datetime.now()
    ep_count = NUM_EPISODES

    for ep in range(1, NUM_EPISODES + 1):
        state = env.reset().reshape(1,-1)
        done = False
        ep_rew = 0
        steps = 0
        while not done:
            action = agent.sample_action(state) ##Sample Action
            next_state, reward, done, _ = env.step(action) ##Take action
            next_state = next_state.reshape(1,-1)
            ep_rew += reward  ##Updating episode reward
            agent.learn(state, action, reward, next_state, done) ##Update Parameters
            state = next_state ##Updating State
            steps += 1
        reward_list.append(ep_rew)
        step_list.append(steps)

        if ep % 10 == 0:
            avg_rew = np.mean(reward_list[-10:])
            print('Episode ', ep, 'Reward %f' % ep_rew, 'Average Reward %f' % avg_rew)

        # Check the solve criterion after every episode, but only once a full
        # window of episodes exists. (The previous `if ep % 100:` test was
        # truthy for every episode EXCEPT multiples of 100 and also averaged
        # partial windows during the first 99 episodes.)
        if ep >= solve_window:
            avg_100 = np.mean(reward_list[-solve_window:])
            if avg_100 > SOLVE_CUTOFF:
                ep_count = min(ep_count, ep) # keep track of number of episodes to solve environment

    time_taken = datetime.datetime.now() - begin_time
    print(time_taken)
    return reward_list, step_list, ep_count

In [None]:
def rolling_average(reward_list, avg_num=None):
    """
    Running average of a sequence of rewards.

    Element i of the result is the mean of the last `avg_num` values up to and
    including index i; for the first entries, where fewer than `avg_num`
    values exist, the mean of all values seen so far is used.

    reward_list: 1-D sequence of numbers (list or array)
    avg_num: window length; when omitted, the current value of the
             module-level AVG_NUM is used (looked up at call time, so it
             matches the value shown in the plot titles)

    Returns a float numpy array with the same length as reward_list.
    Raises ValueError if avg_num is smaller than 1.
    """
    if avg_num is None:
        avg_num = AVG_NUM
    if avg_num < 1:
        raise ValueError('avg_num must be a positive integer')

    # Always work in floating point: integer inputs must not truncate the means.
    rewards = np.asarray(reward_list, dtype=float)
    ends = np.arange(1, rewards.shape[0] + 1)   # exclusive end of each window
    starts = np.maximum(ends - avg_num, 0)      # inclusive start of each window

    # prefix[k] is the sum of the first k rewards, so every window sum is a
    # difference of two prefix sums (O(n) overall instead of O(n * avg_num)).
    prefix = np.concatenate(([0.0], np.cumsum(rewards)))
    return (prefix[ends] - prefix[starts]) / (ends - starts)

In [None]:
def do_multiple_runs(env_name, num_runs):
    """
    Train a fresh agent `num_runs` times on the environment `env_name`.

    A single environment is created, seeded once with 42 and reused by all
    runs; every run calls run_ac_net with its default TensorFlow seed.

    Returns (mean_rewards, var_rewards, comp_list, reward_lists, step_lists):
        mean_rewards: per-episode mean reward across runs
        var_rewards: per-episode spread across runs. NOTE: despite the name,
                     this is the standard deviation (square root of the
                     variance), kept as is for compatibility with callers.
        comp_list: per-run episode count returned by run_ac_net
        reward_lists: per-run list of episode rewards
        step_lists: per-run list of episode step counts (as ints)
    """
    reward_lists, step_lists, comp_list = [], [], []
    env = gym.make(env_name)

    try:
        env.seed(42)
        for _ in range(num_runs):
            run_rewards, run_steps, run_ep_count = run_ac_net(env)
            reward_lists.append(run_rewards)
            step_lists.append(list(map(int, run_steps)))
            comp_list.append(run_ep_count)
    finally:
        # Release the environment even if a run fails part-way through.
        env.close()

    mean_rewards = np.mean(reward_lists, axis=0)
    var_rewards = np.std(reward_lists, axis=0)

    return mean_rewards, var_rewards, comp_list, reward_lists, step_lists

## Plotting Functions and Other Helpers

In [None]:
# Root output directory on the mounted Google Drive; every experiment saves its
# plots and config in a sub-directory of it. The string already ends with '/',
# and the code that uses it appends '/' + fname, so joined paths contain a
# double slash.
PATH = '/content/drive/MyDrive/Colab Notebooks/CS6700/Assignments/Assignment2/AC1/'

def gen_plots_and_save_data(fname, data, mean_rewards, var_rewards, reward_lists, step_lists):
    """
    Plot the results of a set of runs and save plots and config under PATH/fname.

    fname: name of the experiment sub-directory (created if it does not exist)
    data: iterable of strings, written one per line to config.txt
    mean_rewards: per-episode mean reward across runs
    var_rewards: per-episode spread of the reward across runs
    reward_lists: per-run lists of episode rewards
    step_lists: per-run lists of episode step counts

    Files written: mean_reward.png, var_reward.png, sc_fig.png (running-average
    reward of up to the first 3 runs), st_fig.png (steps per episode of up to
    the first 3 runs) and config.txt.
    """
    out_dir = PATH+'/'+fname
    # Make sure the target directory exists so saving cannot fail on a missing folder.
    os.makedirs(out_dir, exist_ok=True)

    run_count = len(reward_lists)

    m_fig, m_ax = plt.subplots(figsize=(10,10))
    v_fig, v_ax = plt.subplots(figsize=(10,10))
    # Report the actual number of runs instead of a hard-coded 10.
    m_ax.set_title(f'Mean Reward over {run_count} runs')
    m_ax.set_xlabel('Episodes')
    m_ax.set_ylabel('Mean Reward')
    v_ax.set_title(f'Variance of Reward over {run_count} runs')
    v_ax.set_xlabel('Episodes')
    v_ax.set_ylabel('Variance')

    sbn.lineplot(data=mean_rewards, ax=m_ax)
    sbn.lineplot(data=var_rewards, ax=v_ax)

    m_fig.savefig(out_dir+'/mean_reward.png')
    v_fig.savefig(out_dir+'/var_reward.png')

    sc_fig, sc_ax = plt.subplots(figsize=(10,10))
    st_fig, st_ax = plt.subplots(figsize=(10,10))
    sc_ax.set_title(f'Running average of previous {AVG_NUM} rewards')
    sc_ax.set_xlabel('Episodes')
    sc_ax.set_ylabel('Reward')
    st_ax.set_title('Number of steps curve')
    st_ax.set_xlabel('Episodes')
    st_ax.set_ylabel('Steps')

    # Only the first (at most) three runs are drawn to keep the curves readable.
    for idx in range(min(run_count, 3)):
        label = f'run {idx+1}'
        score_list = rolling_average(reward_lists[idx])
        step_list = step_lists[idx]
        sbn.lineplot(data=score_list, label=label, ax=sc_ax)
        sbn.lineplot(data=step_list, label=label, ax=st_ax)

    sc_fig.savefig(out_dir+'/sc_fig.png')
    st_fig.savefig(out_dir+'/st_fig.png')

    with open(out_dir+'/'+'config.txt', 'w') as f:
        for s in data:
            f.write(s + '\n')

### Experiments

In [None]:
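# NOTE: STEP = 0 is only a label that is written to the saved config and the output directory name; it does not affect training, which is always one-step actor-critic.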

#### CartPole

In [None]:
# Experiment: CartPole-v1, 1 hidden layer [1024], learning rate 1e-4.
# The constants below are module-level globals read by the model and training code.
NUM_HIDDEN = 1
HIDDEN_SIZES = [1024]
LR = 1e-4
NUM_EPISODES = 1000
AVG_NUM = 50
ENV_NAME = 'CartPole-v1'
SOLVE_CUTOFF = 195.0
STEP = 0  # label only: recorded in the config and directory name

# ENV is not passed on; do_multiple_runs builds its own environment from ENV_NAME.
ENV = gym.make(ENV_NAME)
mean_rewards, var_rewards, comp_list, reward_lists, step_lists = do_multiple_runs(env_name=ENV_NAME, num_runs=10)

data = [f'Number of Hidden Layers - {NUM_HIDDEN}', f'Hidden Sizes - {HIDDEN_SIZES}', f'Learning Rate - {LR}',
        f'Run number of episodes to solve - {comp_list}', f'Mean number of episodes to solve- {np.mean(comp_list)}',
        f'Number of episodes - {NUM_EPISODES}', f'Environment Name - {ENV_NAME}', f'NSTEP = {STEP}']

fname = f'hl={NUM_HIDDEN}_hs={HIDDEN_SIZES}_lr={LR}_env={ENV_NAME}_numep={NUM_EPISODES}_nstep={STEP}'
# makedirs(exist_ok=True): re-running this cell after the long training must not
# fail with FileExistsError (or on a missing parent) before the results are saved.
os.makedirs(PATH+'/'+fname, exist_ok=True)
gen_plots_and_save_data(fname, data, mean_rewards, var_rewards, reward_lists, step_lists)

In [None]:
# Experiment: CartPole-v1, 2 hidden layers [512, 512], learning rate 0.5e-4.
# The constants below are module-level globals read by the model and training code.
NUM_HIDDEN = 2
HIDDEN_SIZES = [512, 512]
LR = 0.5e-4
NUM_EPISODES = 1000
AVG_NUM = 50
ENV_NAME = 'CartPole-v1'
SOLVE_CUTOFF = 195.0
STEP = 0  # label only: recorded in the config and directory name

# ENV is not passed on; do_multiple_runs builds its own environment from ENV_NAME.
ENV = gym.make(ENV_NAME)
mean_rewards, var_rewards, comp_list, reward_lists, step_lists = do_multiple_runs(env_name=ENV_NAME, num_runs=10)

data = [f'Number of Hidden Layers - {NUM_HIDDEN}', f'Hidden Sizes - {HIDDEN_SIZES}', f'Learning Rate - {LR}',
        f'Run number of episodes to solve - {comp_list}', f'Mean number of episodes to solve- {np.mean(comp_list)}',
        f'Number of episodes - {NUM_EPISODES}', f'Environment Name - {ENV_NAME}', f'NSTEP = {STEP}']

fname = f'hl={NUM_HIDDEN}_hs={HIDDEN_SIZES}_lr={LR}_env={ENV_NAME}_numep={NUM_EPISODES}_nstep={STEP}'
# makedirs(exist_ok=True): re-running this cell after the long training must not
# fail with FileExistsError (or on a missing parent) before the results are saved.
os.makedirs(PATH+'/'+fname, exist_ok=True)
gen_plots_and_save_data(fname, data, mean_rewards, var_rewards, reward_lists, step_lists)

In [None]:
# Experiment: CartPole-v1, 2 hidden layers [512, 512], learning rate 1e-4.
# The constants below are module-level globals read by the model and training code.
NUM_HIDDEN = 2
HIDDEN_SIZES = [512, 512]
LR = 1e-4
NUM_EPISODES = 1000
AVG_NUM = 50
ENV_NAME = 'CartPole-v1'
SOLVE_CUTOFF = 195.0
STEP = 0  # label only: recorded in the config and directory name

# ENV is not passed on; do_multiple_runs builds its own environment from ENV_NAME.
ENV = gym.make(ENV_NAME)
mean_rewards, var_rewards, comp_list, reward_lists, step_lists = do_multiple_runs(env_name=ENV_NAME, num_runs=10)

data = [f'Number of Hidden Layers - {NUM_HIDDEN}', f'Hidden Sizes - {HIDDEN_SIZES}', f'Learning Rate - {LR}',
        f'Run number of episodes to solve - {comp_list}', f'Mean number of episodes to solve- {np.mean(comp_list)}',
        f'Number of episodes - {NUM_EPISODES}', f'Environment Name - {ENV_NAME}', f'NSTEP = {STEP}']

fname = f'hl={NUM_HIDDEN}_hs={HIDDEN_SIZES}_lr={LR}_env={ENV_NAME}_numep={NUM_EPISODES}_nstep={STEP}'
# makedirs(exist_ok=True): re-running this cell after the long training must not
# fail with FileExistsError (or on a missing parent) before the results are saved.
os.makedirs(PATH+'/'+fname, exist_ok=True)
gen_plots_and_save_data(fname, data, mean_rewards, var_rewards, reward_lists, step_lists)

In [None]:
# Experiment: CartPole-v1, 2 hidden layers [512, 512], learning rate 2e-4.
# The constants below are module-level globals read by the model and training code.
NUM_HIDDEN = 2
HIDDEN_SIZES = [512, 512]
LR = 2e-4
NUM_EPISODES = 1000
AVG_NUM = 50
ENV_NAME = 'CartPole-v1'
SOLVE_CUTOFF = 195.0
STEP = 0  # label only: recorded in the config and directory name

# ENV is not passed on; do_multiple_runs builds its own environment from ENV_NAME.
ENV = gym.make(ENV_NAME)
mean_rewards, var_rewards, comp_list, reward_lists, step_lists = do_multiple_runs(env_name=ENV_NAME, num_runs=10)

data = [f'Number of Hidden Layers - {NUM_HIDDEN}', f'Hidden Sizes - {HIDDEN_SIZES}', f'Learning Rate - {LR}',
        f'Run number of episodes to solve - {comp_list}', f'Mean number of episodes to solve- {np.mean(comp_list)}',
        f'Number of episodes - {NUM_EPISODES}', f'Environment Name - {ENV_NAME}', f'NSTEP = {STEP}']

fname = f'hl={NUM_HIDDEN}_hs={HIDDEN_SIZES}_lr={LR}_env={ENV_NAME}_numep={NUM_EPISODES}_nstep={STEP}'
# makedirs(exist_ok=True): re-running this cell after the long training must not
# fail with FileExistsError (or on a missing parent) before the results are saved.
os.makedirs(PATH+'/'+fname, exist_ok=True)
gen_plots_and_save_data(fname, data, mean_rewards, var_rewards, reward_lists, step_lists)

In [None]:
# Experiment: CartPole-v1, 2 hidden layers [1024, 512], learning rate 1e-4.
# The constants below are module-level globals read by the model and training code.
NUM_HIDDEN = 2
HIDDEN_SIZES = [1024, 512]
LR = 1e-4
NUM_EPISODES = 1000
AVG_NUM = 50
ENV_NAME = 'CartPole-v1'
SOLVE_CUTOFF = 195.0
STEP = 0  # label only: recorded in the config and directory name

# ENV is not passed on; do_multiple_runs builds its own environment from ENV_NAME.
ENV = gym.make(ENV_NAME)
mean_rewards, var_rewards, comp_list, reward_lists, step_lists = do_multiple_runs(env_name=ENV_NAME, num_runs=10)

data = [f'Number of Hidden Layers - {NUM_HIDDEN}', f'Hidden Sizes - {HIDDEN_SIZES}', f'Learning Rate - {LR}',
        f'Run number of episodes to solve - {comp_list}', f'Mean number of episodes to solve- {np.mean(comp_list)}',
        f'Number of episodes - {NUM_EPISODES}', f'Environment Name - {ENV_NAME}', f'NSTEP = {STEP}']

fname = f'hl={NUM_HIDDEN}_hs={HIDDEN_SIZES}_lr={LR}_env={ENV_NAME}_numep={NUM_EPISODES}_nstep={STEP}'
# makedirs(exist_ok=True): re-running this cell after the long training must not
# fail with FileExistsError (or on a missing parent) before the results are saved.
os.makedirs(PATH+'/'+fname, exist_ok=True)
gen_plots_and_save_data(fname, data, mean_rewards, var_rewards, reward_lists, step_lists)

In [None]:
# Experiment: CartPole-v1, 2 hidden layers [1024, 1024], learning rate 0.5e-4.
# The constants below are module-level globals read by the model and training code.
NUM_HIDDEN = 2
HIDDEN_SIZES = [1024, 1024]
LR = 0.5e-4
NUM_EPISODES = 1000
AVG_NUM = 50
ENV_NAME = 'CartPole-v1'
SOLVE_CUTOFF = 195.0
STEP = 0  # label only: recorded in the config and directory name

# ENV is not passed on; do_multiple_runs builds its own environment from ENV_NAME.
ENV = gym.make(ENV_NAME)
mean_rewards, var_rewards, comp_list, reward_lists, step_lists = do_multiple_runs(env_name=ENV_NAME, num_runs=10)

data = [f'Number of Hidden Layers - {NUM_HIDDEN}', f'Hidden Sizes - {HIDDEN_SIZES}', f'Learning Rate - {LR}',
        f'Run number of episodes to solve - {comp_list}', f'Mean number of episodes to solve- {np.mean(comp_list)}',
        f'Number of episodes - {NUM_EPISODES}', f'Environment Name - {ENV_NAME}', f'NSTEP = {STEP}']

fname = f'hl={NUM_HIDDEN}_hs={HIDDEN_SIZES}_lr={LR}_env={ENV_NAME}_numep={NUM_EPISODES}_nstep={STEP}'
# makedirs(exist_ok=True): re-running this cell after the long training must not
# fail with FileExistsError (or on a missing parent) before the results are saved.
os.makedirs(PATH+'/'+fname, exist_ok=True)
gen_plots_and_save_data(fname, data, mean_rewards, var_rewards, reward_lists, step_lists)

In [None]:
# Experiment: CartPole-v1, 2 hidden layers [1024, 1024], learning rate 1e-4.
# The constants below are module-level globals read by the model and training code.
NUM_HIDDEN = 2
HIDDEN_SIZES = [1024, 1024]
LR = 1e-4
NUM_EPISODES = 1000
AVG_NUM = 50
ENV_NAME = 'CartPole-v1'
SOLVE_CUTOFF = 195.0
STEP = 0  # label only: recorded in the config and directory name

# ENV is not passed on; do_multiple_runs builds its own environment from ENV_NAME.
ENV = gym.make(ENV_NAME)
mean_rewards, var_rewards, comp_list, reward_lists, step_lists = do_multiple_runs(env_name=ENV_NAME, num_runs=10)

data = [f'Number of Hidden Layers - {NUM_HIDDEN}', f'Hidden Sizes - {HIDDEN_SIZES}', f'Learning Rate - {LR}',
        f'Run number of episodes to solve - {comp_list}', f'Mean number of episodes to solve- {np.mean(comp_list)}',
        f'Number of episodes - {NUM_EPISODES}', f'Environment Name - {ENV_NAME}', f'NSTEP = {STEP}']

fname = f'hl={NUM_HIDDEN}_hs={HIDDEN_SIZES}_lr={LR}_env={ENV_NAME}_numep={NUM_EPISODES}_nstep={STEP}'
# makedirs(exist_ok=True): re-running this cell after the long training must not
# fail with FileExistsError (or on a missing parent) before the results are saved.
os.makedirs(PATH+'/'+fname, exist_ok=True)
gen_plots_and_save_data(fname, data, mean_rewards, var_rewards, reward_lists, step_lists)

In [None]:
# Experiment: CartPole-v1, 2 hidden layers [1024, 1024], learning rate 2e-4.
# The constants below are module-level globals read by the model and training code.
NUM_HIDDEN = 2
HIDDEN_SIZES = [1024, 1024]
LR = 2e-4
NUM_EPISODES = 1000
AVG_NUM = 50
ENV_NAME = 'CartPole-v1'
SOLVE_CUTOFF = 195.0
STEP = 0  # label only: recorded in the config and directory name

# ENV is not passed on; do_multiple_runs builds its own environment from ENV_NAME.
ENV = gym.make(ENV_NAME)
mean_rewards, var_rewards, comp_list, reward_lists, step_lists = do_multiple_runs(env_name=ENV_NAME, num_runs=10)

data = [f'Number of Hidden Layers - {NUM_HIDDEN}', f'Hidden Sizes - {HIDDEN_SIZES}', f'Learning Rate - {LR}',
        f'Run number of episodes to solve - {comp_list}', f'Mean number of episodes to solve- {np.mean(comp_list)}',
        f'Number of episodes - {NUM_EPISODES}', f'Environment Name - {ENV_NAME}', f'NSTEP = {STEP}']

fname = f'hl={NUM_HIDDEN}_hs={HIDDEN_SIZES}_lr={LR}_env={ENV_NAME}_numep={NUM_EPISODES}_nstep={STEP}'
# makedirs(exist_ok=True): re-running this cell after the long training must not
# fail with FileExistsError (or on a missing parent) before the results are saved.
os.makedirs(PATH+'/'+fname, exist_ok=True)
gen_plots_and_save_data(fname, data, mean_rewards, var_rewards, reward_lists, step_lists)

In [None]:
# Experiment: CartPole-v1, 3 hidden layers [512, 1024, 512], learning rate 1e-4.
# The constants below are module-level globals read by the model and training code.
NUM_HIDDEN = 3
HIDDEN_SIZES = [512, 1024, 512]
LR = 1e-4
NUM_EPISODES = 1000
AVG_NUM = 50
ENV_NAME = 'CartPole-v1'
SOLVE_CUTOFF = 195.0
STEP = 0  # label only: recorded in the config and directory name

# ENV is not passed on; do_multiple_runs builds its own environment from ENV_NAME.
ENV = gym.make(ENV_NAME)
mean_rewards, var_rewards, comp_list, reward_lists, step_lists = do_multiple_runs(env_name=ENV_NAME, num_runs=10)

data = [f'Number of Hidden Layers - {NUM_HIDDEN}', f'Hidden Sizes - {HIDDEN_SIZES}', f'Learning Rate - {LR}',
        f'Run number of episodes to solve - {comp_list}', f'Mean number of episodes to solve- {np.mean(comp_list)}',
        f'Number of episodes - {NUM_EPISODES}', f'Environment Name - {ENV_NAME}', f'NSTEP = {STEP}']

fname = f'hl={NUM_HIDDEN}_hs={HIDDEN_SIZES}_lr={LR}_env={ENV_NAME}_numep={NUM_EPISODES}_nstep={STEP}'
# makedirs(exist_ok=True): re-running this cell after the long training must not
# fail with FileExistsError (or on a missing parent) before the results are saved.
os.makedirs(PATH+'/'+fname, exist_ok=True)
gen_plots_and_save_data(fname, data, mean_rewards, var_rewards, reward_lists, step_lists)

#### Acrobot

In [None]:
# Experiment: Acrobot-v1, 1 hidden layer [1024], learning rate 1e-4.
# The constants below are module-level globals read by the model and training code.
NUM_HIDDEN = 1
HIDDEN_SIZES = [1024]
LR = 1e-4
NUM_EPISODES = 1000
AVG_NUM = 50
ENV_NAME = 'Acrobot-v1'
SOLVE_CUTOFF = -100.0
STEP = 0  # label only: recorded in the config and directory name

# ENV is not passed on; do_multiple_runs builds its own environment from ENV_NAME.
ENV = gym.make(ENV_NAME)
mean_rewards, var_rewards, comp_list, reward_lists, step_lists = do_multiple_runs(env_name=ENV_NAME, num_runs=10)

data = [f'Number of Hidden Layers - {NUM_HIDDEN}', f'Hidden Sizes - {HIDDEN_SIZES}', f'Learning Rate - {LR}',
        f'Run number of episodes to solve - {comp_list}', f'Mean number of episodes to solve- {np.mean(comp_list)}',
        f'Number of episodes - {NUM_EPISODES}', f'Environment Name - {ENV_NAME}', f'NSTEP = {STEP}']

fname = f'hl={NUM_HIDDEN}_hs={HIDDEN_SIZES}_lr={LR}_env={ENV_NAME}_numep={NUM_EPISODES}_nstep={STEP}'
# makedirs(exist_ok=True): re-running this cell after the long training must not
# fail with FileExistsError (or on a missing parent) before the results are saved.
os.makedirs(PATH+'/'+fname, exist_ok=True)
gen_plots_and_save_data(fname, data, mean_rewards, var_rewards, reward_lists, step_lists)

In [None]:
# Experiment: Acrobot-v1, 2 hidden layers [512, 512], learning rate 0.5e-4.
# The constants below are module-level globals read by the model and training code.
NUM_HIDDEN = 2
HIDDEN_SIZES = [512, 512]
LR = 0.5e-4
NUM_EPISODES = 1000
AVG_NUM = 50
ENV_NAME = 'Acrobot-v1'
SOLVE_CUTOFF = -100.0
STEP = 0  # label only: recorded in the config and directory name

# ENV is not passed on; do_multiple_runs builds its own environment from ENV_NAME.
ENV = gym.make(ENV_NAME)
mean_rewards, var_rewards, comp_list, reward_lists, step_lists = do_multiple_runs(env_name=ENV_NAME, num_runs=10)

data = [f'Number of Hidden Layers - {NUM_HIDDEN}', f'Hidden Sizes - {HIDDEN_SIZES}', f'Learning Rate - {LR}',
        f'Run number of episodes to solve - {comp_list}', f'Mean number of episodes to solve- {np.mean(comp_list)}',
        f'Number of episodes - {NUM_EPISODES}', f'Environment Name - {ENV_NAME}', f'NSTEP = {STEP}']

fname = f'hl={NUM_HIDDEN}_hs={HIDDEN_SIZES}_lr={LR}_env={ENV_NAME}_numep={NUM_EPISODES}_nstep={STEP}'
# makedirs(exist_ok=True): re-running this cell after the long training must not
# fail with FileExistsError (or on a missing parent) before the results are saved.
os.makedirs(PATH+'/'+fname, exist_ok=True)
gen_plots_and_save_data(fname, data, mean_rewards, var_rewards, reward_lists, step_lists)

In [None]:
# Experiment: Acrobot-v1, 2 hidden layers [512, 512], learning rate 1e-4.
# The constants below are module-level globals read by the model and training code.
NUM_HIDDEN = 2
HIDDEN_SIZES = [512, 512]
LR = 1e-4
NUM_EPISODES = 1000
AVG_NUM = 50
ENV_NAME = 'Acrobot-v1'
SOLVE_CUTOFF = -100.0
STEP = 0  # label only: recorded in the config and directory name

# ENV is not passed on; do_multiple_runs builds its own environment from ENV_NAME.
ENV = gym.make(ENV_NAME)
mean_rewards, var_rewards, comp_list, reward_lists, step_lists = do_multiple_runs(env_name=ENV_NAME, num_runs=10)

data = [f'Number of Hidden Layers - {NUM_HIDDEN}', f'Hidden Sizes - {HIDDEN_SIZES}', f'Learning Rate - {LR}',
        f'Run number of episodes to solve - {comp_list}', f'Mean number of episodes to solve- {np.mean(comp_list)}',
        f'Number of episodes - {NUM_EPISODES}', f'Environment Name - {ENV_NAME}', f'NSTEP = {STEP}']

fname = f'hl={NUM_HIDDEN}_hs={HIDDEN_SIZES}_lr={LR}_env={ENV_NAME}_numep={NUM_EPISODES}_nstep={STEP}'
# makedirs(exist_ok=True): re-running this cell after the long training must not
# fail with FileExistsError (or on a missing parent) before the results are saved.
os.makedirs(PATH+'/'+fname, exist_ok=True)
gen_plots_and_save_data(fname, data, mean_rewards, var_rewards, reward_lists, step_lists)

In [None]:
# Experiment: Acrobot-v1, 2 hidden layers [512, 512], learning rate 2e-4.
# The constants below are module-level globals read by the model and training code.
NUM_HIDDEN = 2
HIDDEN_SIZES = [512, 512]
LR = 2e-4
NUM_EPISODES = 1000
AVG_NUM = 50
ENV_NAME = 'Acrobot-v1'
SOLVE_CUTOFF = -100.0
STEP = 0  # label only: recorded in the config and directory name

# ENV is not passed on; do_multiple_runs builds its own environment from ENV_NAME.
ENV = gym.make(ENV_NAME)
mean_rewards, var_rewards, comp_list, reward_lists, step_lists = do_multiple_runs(env_name=ENV_NAME, num_runs=10)

data = [f'Number of Hidden Layers - {NUM_HIDDEN}', f'Hidden Sizes - {HIDDEN_SIZES}', f'Learning Rate - {LR}',
        f'Run number of episodes to solve - {comp_list}', f'Mean number of episodes to solve- {np.mean(comp_list)}',
        f'Number of episodes - {NUM_EPISODES}', f'Environment Name - {ENV_NAME}', f'NSTEP = {STEP}']

fname = f'hl={NUM_HIDDEN}_hs={HIDDEN_SIZES}_lr={LR}_env={ENV_NAME}_numep={NUM_EPISODES}_nstep={STEP}'
# makedirs(exist_ok=True): re-running this cell after the long training must not
# fail with FileExistsError (or on a missing parent) before the results are saved.
os.makedirs(PATH+'/'+fname, exist_ok=True)
gen_plots_and_save_data(fname, data, mean_rewards, var_rewards, reward_lists, step_lists)

In [None]:
# Experiment: Acrobot-v1, 2 hidden layers [1024, 512], learning rate 1e-4.
# The constants below are module-level globals read by the model and training code.
NUM_HIDDEN = 2
HIDDEN_SIZES = [1024, 512]
LR = 1e-4
NUM_EPISODES = 1000
AVG_NUM = 50
ENV_NAME = 'Acrobot-v1'
SOLVE_CUTOFF = -100.0
STEP = 0  # label only: recorded in the config and directory name

# ENV is not passed on; do_multiple_runs builds its own environment from ENV_NAME.
ENV = gym.make(ENV_NAME)
mean_rewards, var_rewards, comp_list, reward_lists, step_lists = do_multiple_runs(env_name=ENV_NAME, num_runs=10)

data = [f'Number of Hidden Layers - {NUM_HIDDEN}', f'Hidden Sizes - {HIDDEN_SIZES}', f'Learning Rate - {LR}',
        f'Run number of episodes to solve - {comp_list}', f'Mean number of episodes to solve- {np.mean(comp_list)}',
        f'Number of episodes - {NUM_EPISODES}', f'Environment Name - {ENV_NAME}', f'NSTEP = {STEP}']

fname = f'hl={NUM_HIDDEN}_hs={HIDDEN_SIZES}_lr={LR}_env={ENV_NAME}_numep={NUM_EPISODES}_nstep={STEP}'
# makedirs(exist_ok=True): re-running this cell after the long training must not
# fail with FileExistsError (or on a missing parent) before the results are saved.
os.makedirs(PATH+'/'+fname, exist_ok=True)
gen_plots_and_save_data(fname, data, mean_rewards, var_rewards, reward_lists, step_lists)

In [None]:
# Experiment: Acrobot-v1, 2 hidden layers [1024, 1024], learning rate 0.5e-4.
# The constants below are module-level globals read by the model and training code.
NUM_HIDDEN = 2
HIDDEN_SIZES = [1024, 1024]
LR = 0.5e-4
NUM_EPISODES = 1000
AVG_NUM = 50
ENV_NAME = 'Acrobot-v1'
SOLVE_CUTOFF = -100.0
STEP = 0  # label only: recorded in the config and directory name

# ENV is not passed on; do_multiple_runs builds its own environment from ENV_NAME.
ENV = gym.make(ENV_NAME)
mean_rewards, var_rewards, comp_list, reward_lists, step_lists = do_multiple_runs(env_name=ENV_NAME, num_runs=10)

data = [f'Number of Hidden Layers - {NUM_HIDDEN}', f'Hidden Sizes - {HIDDEN_SIZES}', f'Learning Rate - {LR}',
        f'Run number of episodes to solve - {comp_list}', f'Mean number of episodes to solve- {np.mean(comp_list)}',
        f'Number of episodes - {NUM_EPISODES}', f'Environment Name - {ENV_NAME}', f'NSTEP = {STEP}']

fname = f'hl={NUM_HIDDEN}_hs={HIDDEN_SIZES}_lr={LR}_env={ENV_NAME}_numep={NUM_EPISODES}_nstep={STEP}'
# makedirs(exist_ok=True): re-running this cell after the long training must not
# fail with FileExistsError (or on a missing parent) before the results are saved.
os.makedirs(PATH+'/'+fname, exist_ok=True)
gen_plots_and_save_data(fname, data, mean_rewards, var_rewards, reward_lists, step_lists)

In [None]:
# Experiment: Acrobot-v1, 2 hidden layers [1024, 1024], learning rate 1e-4.
# The constants below are module-level globals read by the model and training code.
NUM_HIDDEN = 2
HIDDEN_SIZES = [1024, 1024]
LR = 1e-4
NUM_EPISODES = 1000
AVG_NUM = 50
ENV_NAME = 'Acrobot-v1'
SOLVE_CUTOFF = -100.0
STEP = 0  # label only: recorded in the config and directory name

# ENV is not passed on; do_multiple_runs builds its own environment from ENV_NAME.
ENV = gym.make(ENV_NAME)
mean_rewards, var_rewards, comp_list, reward_lists, step_lists = do_multiple_runs(env_name=ENV_NAME, num_runs=10)

data = [f'Number of Hidden Layers - {NUM_HIDDEN}', f'Hidden Sizes - {HIDDEN_SIZES}', f'Learning Rate - {LR}',
        f'Run number of episodes to solve - {comp_list}', f'Mean number of episodes to solve- {np.mean(comp_list)}',
        f'Number of episodes - {NUM_EPISODES}', f'Environment Name - {ENV_NAME}', f'NSTEP = {STEP}']

fname = f'hl={NUM_HIDDEN}_hs={HIDDEN_SIZES}_lr={LR}_env={ENV_NAME}_numep={NUM_EPISODES}_nstep={STEP}'
# makedirs(exist_ok=True): re-running this cell after the long training must not
# fail with FileExistsError (or on a missing parent) before the results are saved.
os.makedirs(PATH+'/'+fname, exist_ok=True)
gen_plots_and_save_data(fname, data, mean_rewards, var_rewards, reward_lists, step_lists)

In [None]:
# Experiment: Acrobot-v1, 2 hidden layers [1024, 1024], learning rate 2e-4.
# The constants below are module-level globals read by the model and training code.
NUM_HIDDEN = 2
HIDDEN_SIZES = [1024, 1024]
LR = 2e-4
NUM_EPISODES = 1000
AVG_NUM = 50
ENV_NAME = 'Acrobot-v1'
SOLVE_CUTOFF = -100.0
STEP = 0  # label only: recorded in the config and directory name

# ENV is not passed on; do_multiple_runs builds its own environment from ENV_NAME.
ENV = gym.make(ENV_NAME)
mean_rewards, var_rewards, comp_list, reward_lists, step_lists = do_multiple_runs(env_name=ENV_NAME, num_runs=10)

data = [f'Number of Hidden Layers - {NUM_HIDDEN}', f'Hidden Sizes - {HIDDEN_SIZES}', f'Learning Rate - {LR}',
        f'Run number of episodes to solve - {comp_list}', f'Mean number of episodes to solve- {np.mean(comp_list)}',
        f'Number of episodes - {NUM_EPISODES}', f'Environment Name - {ENV_NAME}', f'NSTEP = {STEP}']

fname = f'hl={NUM_HIDDEN}_hs={HIDDEN_SIZES}_lr={LR}_env={ENV_NAME}_numep={NUM_EPISODES}_nstep={STEP}'
# makedirs(exist_ok=True): re-running this cell after the long training must not
# fail with FileExistsError (or on a missing parent) before the results are saved.
os.makedirs(PATH+'/'+fname, exist_ok=True)
gen_plots_and_save_data(fname, data, mean_rewards, var_rewards, reward_lists, step_lists)

In [None]:
# Experiment: Acrobot-v1, 3 hidden layers [512, 1024, 512], learning rate 1e-4.
# The constants below are module-level globals read by the model and training code.
NUM_HIDDEN = 3
HIDDEN_SIZES = [512, 1024, 512]
LR = 1e-4
NUM_EPISODES = 1000
AVG_NUM = 50
ENV_NAME = 'Acrobot-v1'
SOLVE_CUTOFF = -100.0
STEP = 0  # label only: recorded in the config and directory name

# ENV is not passed on; do_multiple_runs builds its own environment from ENV_NAME.
ENV = gym.make(ENV_NAME)
mean_rewards, var_rewards, comp_list, reward_lists, step_lists = do_multiple_runs(env_name=ENV_NAME, num_runs=10)

data = [f'Number of Hidden Layers - {NUM_HIDDEN}', f'Hidden Sizes - {HIDDEN_SIZES}', f'Learning Rate - {LR}',
        f'Run number of episodes to solve - {comp_list}', f'Mean number of episodes to solve- {np.mean(comp_list)}',
        f'Number of episodes - {NUM_EPISODES}', f'Environment Name - {ENV_NAME}', f'NSTEP = {STEP}']

fname = f'hl={NUM_HIDDEN}_hs={HIDDEN_SIZES}_lr={LR}_env={ENV_NAME}_numep={NUM_EPISODES}_nstep={STEP}'
# makedirs(exist_ok=True): re-running this cell after the long training must not
# fail with FileExistsError (or on a missing parent) before the results are saved.
os.makedirs(PATH+'/'+fname, exist_ok=True)
gen_plots_and_save_data(fname, data, mean_rewards, var_rewards, reward_lists, step_lists)

#### MountainCar

In [None]:
# Experiment: MountainCar-v0, 2 hidden layers [1024, 512], learning rate 1e-4, 600 episodes.
# The constants below are module-level globals read by the model and training code.
NUM_HIDDEN = 2
HIDDEN_SIZES = [1024, 512]
LR = 1e-4
NUM_EPISODES = 600
AVG_NUM = 50
ENV_NAME = 'MountainCar-v0'
SOLVE_CUTOFF = -125
STEP = 0  # label only: recorded in the config and directory name

# ENV is not passed on; do_multiple_runs builds its own environment from ENV_NAME.
ENV = gym.make(ENV_NAME)
mean_rewards, var_rewards, comp_list, reward_lists, step_lists = do_multiple_runs(env_name=ENV_NAME, num_runs=10)

data = [f'Number of Hidden Layers - {NUM_HIDDEN}', f'Hidden Sizes - {HIDDEN_SIZES}', f'Learning Rate - {LR}',
        f'Run number of episodes to solve - {comp_list}', f'Mean number of episodes to solve- {np.mean(comp_list)}',
        f'Number of episodes - {NUM_EPISODES}', f'Environment Name - {ENV_NAME}', f'NSTEP = {STEP}']

fname = f'hl={NUM_HIDDEN}_hs={HIDDEN_SIZES}_lr={LR}_env={ENV_NAME}_numep={NUM_EPISODES}_nstep={STEP}'
# makedirs(exist_ok=True): re-running this cell after the long training must not
# fail with FileExistsError (or on a missing parent) before the results are saved.
os.makedirs(PATH+'/'+fname, exist_ok=True)
gen_plots_and_save_data(fname, data, mean_rewards, var_rewards, reward_lists, step_lists)