# <center> CS6700: Reinforcement Learning
# <center> Programming Assignment 2: DQN and AC
## <center> Submitted By:
## <center> Gautham Govind A: EE19B022
## <center> Vishnu Vinod : CS19B048 

This notebook focuses on the first set of tasks based on Deep Q-Networks (DQNs). The key objective is to train DQNs on 3 different classic control environments, benchmark their performance, and use hyperparameter tuning to optimize that performance.

We start with importing necessary libraries:

In [None]:
import numpy as np
import random
import torch
import torch.nn as nn  
import torch.nn.functional as F
from collections import namedtuple, deque
import torch.optim as optim
import datetime
import gym
from gym.wrappers.record_video import RecordVideo
import glob
import io
import base64
import matplotlib.pyplot as plt
from IPython.display import HTML
#from pyvirtualdisplay import Display
import tensorflow as tf
from IPython import display as ipythondisplay
from PIL import Image
import tensorflow_probability as tfp
from scipy.special import softmax

Since we are training a neural network, using a GPU can speed up the training process. So we use one if available:

In [None]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

The two key components of the DQN approach are:
- Q-Network
- Replay memory

### Q-Network

The Q-Network learns the action values for each state. It takes a particular state as input and outputs the action values corresponding to each action. The hyperparameters associated with the Q-Network are:
- Architecture: The neural network architecture itself can be thought of as a hyperparameter.
- Learning rate: The learning rate of the network is also a hyperparameter.

Note that since the network architecture itself may change, we need to write separate class definitions for each architecture. For now, the architecture given in the tutorial is used:

In [None]:
class QNetwork1(nn.Module):
    """Q-network with two hidden ReLU layers.

    Takes a batch of state vectors and returns one action-value estimate per
    action for each state.
    """

    def __init__(self, state_size, action_size, seed, fc1_units=128, fc2_units=64):
        """Initialize parameters and build model.
        Params
        ======
            state_size (int): Dimension of each state
            action_size (int): Dimension of each action
            seed (int): Random seed
            fc1_units (int): Number of nodes in first hidden layer
            fc2_units (int): Number of nodes in second hidden layer
        """
        super().__init__()
        # torch.manual_seed seeds PyTorch's global RNG (so the layer weights
        # created below are reproducible) and returns the Generator object.
        self.seed = torch.manual_seed(seed)
        # The hidden sizes now come from the constructor arguments instead of
        # being hard-coded; the defaults reproduce the original 128/64 layout.
        self.fc1 = nn.Linear(state_size, fc1_units)
        self.fc2 = nn.Linear(fc1_units, fc2_units)
        self.fc3 = nn.Linear(fc2_units, action_size)

    # Forward pass of the network
    def forward(self, state):
        """Return the action values for `state` (no activation on the output)."""
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        return self.fc3(x)

In [None]:
class QNetwork2(nn.Module):
    """Q-network with three hidden ReLU layers.

    Takes a batch of state vectors and returns one action-value estimate per
    action for each state.
    """

    def __init__(self, state_size, action_size, seed, fc1_units=128, fc2_units=64):
        """Initialize parameters and build model.
        Params
        ======
            state_size (int): Dimension of each state
            action_size (int): Dimension of each action
            seed (int): Random seed
            fc1_units (int): Number of nodes in first hidden layer
            fc2_units (int): Number of nodes in the second and third hidden layers
        """
        super().__init__()
        # torch.manual_seed seeds PyTorch's global RNG (so the layer weights
        # created below are reproducible) and returns the Generator object.
        self.seed = torch.manual_seed(seed)
        # The hidden sizes now come from the constructor arguments instead of
        # being hard-coded; the defaults reproduce the original 128/64/64 layout.
        self.fc1 = nn.Linear(state_size, fc1_units)
        self.fc2 = nn.Linear(fc1_units, fc2_units)
        self.fc3 = nn.Linear(fc2_units, fc2_units)
        self.fc4 = nn.Linear(fc2_units, action_size)

    # Forward pass of the network
    def forward(self, state):
        """Return the action values for `state` (no activation on the output)."""
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        return self.fc4(x)

In [None]:
class QNetwork3(nn.Module):
    """Wider Q-network with three hidden ReLU layers.

    Takes a batch of state vectors and returns one action-value estimate per
    action for each state.
    """

    def __init__(self, state_size, action_size, seed, fc1_units=256, fc2_units=128):
        """Initialize parameters and build model.
        Params
        ======
            state_size (int): Dimension of each state
            action_size (int): Dimension of each action
            seed (int): Random seed
            fc1_units (int): Number of nodes in first hidden layer
            fc2_units (int): Number of nodes in the second and third hidden layers
        """
        super().__init__()
        # torch.manual_seed seeds PyTorch's global RNG (so the layer weights
        # created below are reproducible) and returns the Generator object.
        self.seed = torch.manual_seed(seed)
        # The previous signature advertised 128/64 defaults but always built
        # 256/128/128 layers. The defaults now state the sizes that were
        # actually used, and explicit arguments are honored.
        self.fc1 = nn.Linear(state_size, fc1_units)
        self.fc2 = nn.Linear(fc1_units, fc2_units)
        self.fc3 = nn.Linear(fc2_units, fc2_units)
        self.fc4 = nn.Linear(fc2_units, action_size)

    # Forward pass of the network
    def forward(self, state):
        """Return the action values for `state` (no activation on the output)."""
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        return self.fc4(x)

### Replay Memory

The replay memory stores past experiences and samples them instead of using new experiences directly so as to break correlation. We have two hyperparameters associated with the replay memory:
- Buffer size: This represents the maximum number of past experiences that are stored in the replay buffer.
- Batch size: This represents the number of experiences that are sampled from the buffer for each training iteration.

Note that, unlike the Q-Network, the replay memory has no "architecture" to vary, so a single class definition suffices; its hyperparameters are changed simply by initializing the class with the required values.

In [None]:
# Replay memory to store past experiences

class ReplayBuffer:
    """Fixed-capacity store of past transitions with uniform random sampling.

    Once `buffer_size` transitions are stored, adding a new one discards the
    oldest (the storage is a `deque` with `maxlen=buffer_size`).
    """

    def __init__(self, action_size, buffer_size, batch_size, seed):
        '''Initialize a ReplayBuffer object.

        Params
        ======
            action_size (int): dimension of each action
            buffer_size (int): maximum size of buffer
            batch_size (int): size of each training batch
            seed (int): random seed
        '''
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple(
            "Experience",
            field_names=["state", "action", "reward", "next_state", "done"],
        )
        # Seed Python's global `random` module, which `sample` draws from.
        # random.seed() returns None, so the seed itself is stored separately
        # (previously `self.seed` was always None).
        random.seed(seed)
        self.seed = seed

    # Add a new experience to memory
    def add(self, state, action, reward, next_state, done):
        """Append one transition, evicting the oldest if the buffer is full."""
        self.memory.append(self.experience(state, action, reward, next_state, done))

    @staticmethod
    def _stack(experiences, field):
        """Stack one field of every sampled experience row-wise into an array."""
        return np.vstack([getattr(e, field) for e in experiences])

    # Sampling a batch of past experiences from memory
    def sample(self):
        """Draw `batch_size` distinct transitions uniformly at random.

        Returns a tuple `(states, actions, rewards, next_states, dones)` of
        tensors on the module-level `device`, each with one row per sampled
        transition. `actions` is an integer (long) tensor; the others are
        float, with `dones` encoded as 0.0 / 1.0.

        Raises ValueError (from `random.sample`) if fewer than `batch_size`
        transitions are stored.
        """
        experiences = random.sample(self.memory, k=self.batch_size)

        states = torch.from_numpy(self._stack(experiences, "state")).float().to(device)
        actions = torch.from_numpy(self._stack(experiences, "action")).long().to(device)
        rewards = torch.from_numpy(self._stack(experiences, "reward")).float().to(device)
        next_states = torch.from_numpy(self._stack(experiences, "next_state")).float().to(device)
        # Booleans are cast to uint8 first so they convert cleanly to 0.0 / 1.0.
        dones = torch.from_numpy(
            self._stack(experiences, "done").astype(np.uint8)
        ).float().to(device)

        return (states, actions, rewards, next_states, dones)

    # Return the current size of internal memory
    def __len__(self):
        return len(self.memory)

We next write a class which brings together Q-Network and Replay memory and allows for the training of the agent. We also now have the hyperparameters:
- Update frequency of target network
- Gradient truncation limit
- Discount factor
- Control Parameter

We now define all the hyperparameters in one location for easier accessibility:

In [None]:
# Default hyperparameters. These are module-level globals: DQNAgent reads LR,
# BUFFER_SIZE and BATCH_SIZE when it is constructed, and BATCH_SIZE,
# UPDATE_FREQ, GAMMA and GRAD_CLIP while it trains. Re-assigning them in a
# later cell therefore changes the behaviour of subsequent training runs.
LR = 5e-4               # Learning rate
BUFFER_SIZE = int(1e5)  # Replay buffer size 
BATCH_SIZE = 64         # Training batch size
UPDATE_FREQ = 20        # Update frequency of target network 
GAMMA = 0.99            # Discount factor
GRAD_CLIP = 1.0         # The gradient is clipped between (-GRAD_CLIP and GRAD_CLIP)

In [None]:
class DQNAgent():
    """DQN agent: an online Q-network, a target Q-network and a replay buffer.

    Hyperparameters (LR, BUFFER_SIZE, BATCH_SIZE, UPDATE_FREQ, GAMMA,
    GRAD_CLIP) and `device` are read from module-level globals.
    """

    def __init__(self, state_size, action_size, q_network, seed):
        """Build the agent around an already-constructed Q-network.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): number of discrete actions
            q_network (nn.Module): network used as the online (local) network
            seed (int): random seed for Python's `random` module
        """
        import copy  # local import: only needed to clone the target network

        # Environment-related parameters
        self.state_size = state_size
        self.action_size = action_size
        # random.seed() returns None, so keep the seed value itself.
        random.seed(seed)
        self.seed = seed

        # Defining the local network and the target network as well as the optimizer.
        # nn.Module.to() returns the module itself, so calling it twice on
        # `q_network` made local and target the SAME object (the periodic
        # target update was then a no-op). The target must be an independent
        # copy that only changes when it is explicitly synchronised.
        self.qnetwork_local = q_network.to(device)
        self.qnetwork_target = copy.deepcopy(self.qnetwork_local)
        self.qnetwork_target.eval()
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Defining the replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)

        # Keeping track of timesteps as this is needed for tracking update frequency of target network
        self.t_step = 0

    # Method for an agent's step
    # The agent first stores the new experience in replay memory
    # Then replay memory is sampled and the agent learns
    def step(self, state, action, reward, next_state, done):
        """Record one transition, learn from a sampled batch, sync the target."""

        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # If enough samples are available in memory, get random subset and learn
        if len(self.memory) >= BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

        # Updating the target network after every UPDATE_FREQ steps
        self.t_step = (self.t_step + 1) % UPDATE_FREQ
        if self.t_step == 0:
            self.qnetwork_target.load_state_dict(self.qnetwork_local.state_dict())

    # Choosing an action given a state
    # Epsilon-greedy policy is adopted
    def act(self, state, eps=0.):
        """Return an action for `state` (a NumPy array) under an eps-greedy policy.

        With probability `eps` a uniformly random action is returned,
        otherwise the action with the highest online-network value.
        """

        state = torch.from_numpy(state).float().unsqueeze(0).to(device)

        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    # Learning of the agent
    def learn(self, experiences, gamma):
        """Run one gradient step on a batch of sampled experiences.

        Params
        ======
            experiences (tuple): (states, actions, rewards, next_states, dones) tensors
            gamma (float): discount factor
        """

        # Sampled experiences
        states, actions, rewards, next_states, dones = experiences

        # Q-value prediction for the next state from the target network
        # (no gradient flows through the target)
        with torch.no_grad():
            Q_targets_next = self.qnetwork_target(next_states).max(1)[0].unsqueeze(1)

        # Bootstrapped target for the current state; (1 - dones) removes the
        # bootstrap term for terminal transitions
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)

        # Optimize
        self.optimizer.zero_grad()
        loss.backward()

        # Gradient clipping (element-wise, to [-GRAD_CLIP, GRAD_CLIP])
        for param in self.qnetwork_local.parameters():
            param.grad.data.clamp_(-GRAD_CLIP, GRAD_CLIP)

        self.optimizer.step()

Next, we define a DQNSolve() class which takes in the environment and solves it using the DQN agent:

In [None]:
class DQNSolve():
    """Trains a DQNAgent on `env` using the CartPole stopping criterion."""

    def __init__(self, env, nn_arch, seed = 1):
        """Create the Q-network selected by `nn_arch` and wrap it in an agent.

        Params
        ======
            env: environment exposing `observation_space`, `action_space`,
                `reset()` and `step(action)`
            nn_arch (str): "q1", "q2" or "q3" (QNetwork1/2/3)
            seed (int): seed passed to the network and the agent

        Raises ValueError for an unknown `nn_arch` (previously this surfaced
        as an AttributeError on `self.q_network`).
        """
        self.state_shape = env.observation_space.shape[0]
        self.action_shape = env.action_space.n
        self.env = env

        architectures = {"q1": QNetwork1, "q2": QNetwork2, "q3": QNetwork3}
        if nn_arch not in architectures:
            raise ValueError(
                "Unknown nn_arch {!r}; expected one of {}".format(nn_arch, sorted(architectures))
            )
        self.q_network = architectures[nn_arch](self.state_shape, self.action_shape, seed)

        self.agent = DQNAgent(self.state_shape, self.action_shape, self.q_network, seed)

    def solve(self, n_episodes=10000, max_t = 500, eps_start=1.0, eps_end=0.01, eps_decay=0.995):
        """Train until the stopping criterion is met or `n_episodes` elapse.

        Assumes the classic Gym API: `reset()` returns the observation and
        `step()` returns `(next_state, reward, done, info)`.

        Returns `[episodes_run, rewards, steps]` where `rewards` and `steps`
        hold the total reward and the number of steps of every episode.
        """
        rewards = []
        steps = []
        # Rolling window of the most recent 100 episode scores
        scores_window = deque(maxlen=100)
        # Initialize epsilon
        eps = eps_start
        # Keeps the return value defined when n_episodes == 0
        i_episode = 0

        for i_episode in range(1, n_episodes+1):

            state = self.env.reset()
            score = 0
            # Keeps `t` defined when max_t == 0
            t = -1
            for t in range(max_t):
                action = self.agent.act(state, eps)
                next_state, reward, done, _ = self.env.step(action)
                self.agent.step(state, action, reward, next_state, done)
                state = next_state
                score += reward
                if done:
                    break

            # Save most recent score. `t` is the 0-based index of the last
            # step, so the number of steps taken is t + 1 (this version used
            # to record `t`, unlike the Acrobot / Mountain Car versions).
            n_steps = t + 1
            scores_window.append(score)
            rewards.append(score)
            steps.append(n_steps)

            # Decrease epsilon
            eps = max(eps_end, eps_decay*eps)

            avg_score = np.mean(scores_window)
            print('\rEpisode {}\tAverage Score (prev 100): {:.2f}\tSteps:{}'.format(i_episode, avg_score, n_steps), end="")

            # NOTE(review): 195 is the "solved" threshold Gym documents for
            # CartPole-v0; this notebook creates CartPole-v1, whose documented
            # threshold is 475 — confirm that 195 is intentional.
            if avg_score >= 195.0 and i_episode > 100:
                print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode, avg_score))
                break

        return [i_episode, rewards, steps]


We write a function for solving the environment ten times and recording number of episodes and average rewards:

In [None]:
from google.colab import files

In [None]:
def analyze_variant(nn_arch, var_count, n_runs=10):
    """Train `n_runs` independent agents on the global `env` and plot averages.

    Run `i` uses seed `i`. Runs that finish early are padded with their final
    value so that all runs can be averaged episode by episode. Two plots are
    saved as "cart_variant_<var_count>_{reward,step}_plot.png", shown, and
    downloaded through `google.colab.files`.

    Params
    ======
        nn_arch (str): architecture key understood by DQNSolve
        var_count: identifier used in the output file names
        n_runs (int): number of independent runs to average (default 10)
    """
    if n_runs < 1:
        raise ValueError("n_runs must be at least 1")

    print("Running variant with hyperparameters: LR: {}, BUFFER_SIZE: {}, BATCH_SIZE: {}, UPDATE_FREQ: {}, GAMMA: {}, GRAD_CLIP: {}".format(LR, BUFFER_SIZE, BATCH_SIZE, UPDATE_FREQ, GAMMA, GRAD_CLIP))
    print("Using network architecture: ", nn_arch)
    run_episode_counts = []
    run_rewards = []
    run_steps = []

    # Averaging over n_runs runs
    for i in range(n_runs):
        print("Performing Run ", i+1)
        dq_solver = DQNSolve(env, nn_arch, seed = i)
        epi_count, rewards, steps = dq_solver.solve()
        run_episode_counts.append(epi_count)
        run_rewards.append(rewards)
        run_steps.append(steps)

    avg_num_episodes = np.mean(run_episode_counts)
    longest_run = np.max(run_episode_counts)

    def pad_and_average(per_run_values):
        # Extend every run to the longest run by repeating its last value,
        # then average across runs for each episode index.
        padded = np.ones((n_runs, longest_run))
        for row, (count, values) in enumerate(zip(run_episode_counts, per_run_values)):
            padded[row, :count] = values
            padded[row, count:] = values[-1]
        return np.mean(padded, axis = 0)

    avg_rewards = pad_and_average(run_rewards)
    avg_steps = pad_and_average(run_steps)

    print("Average number of episodes taken to solve the environment:", avg_num_episodes)

    # Plotting average rewards
    reward_file = "cart_variant_{}_reward_plot.png".format(var_count)
    plt.plot(np.arange(avg_rewards.shape[0]), avg_rewards)
    plt.xlabel("Episode count")
    plt.ylabel("Total reward ")
    plt.title("Total reward per episode (averaged over {} runs)".format(n_runs))
    plt.savefig(reward_file)
    plt.show()
    files.download(reward_file)

    # Plotting average steps
    step_file = "cart_variant_{}_step_plot.png".format(var_count)
    plt.plot(np.arange(avg_steps.shape[0]), avg_steps)
    plt.xlabel("Episode count")
    plt.ylabel("Steps ")
    plt.title("Steps per episode (averaged over {} runs)".format(n_runs))
    plt.savefig(step_file)
    plt.show()
    files.download(step_file)
  

## CartPole

In [None]:
env = gym.make('CartPole-v1')

### First trial

In [None]:
LR = 5e-4               # Learning rate
BUFFER_SIZE = int(1e5)  # Replay buffer size 
BATCH_SIZE = 64         # Training batch size
UPDATE_FREQ = 20        # Update frequency of target network 
GAMMA = 0.99            # Discount factor
GRAD_CLIP = 1.0         # The gradient is clipped between (-GRAD_CLIP and GRAD_CLIP)

In [None]:
analyze_variant("q1", 1)

### Second Trial

In [None]:
LR = 5e-4               # Learning rate
BUFFER_SIZE = int(1e5)  # Replay buffer size 
BATCH_SIZE = 64         # Training batch size
UPDATE_FREQ = 20        # Update frequency of target network 
GAMMA = 0.99            # Discount factor
GRAD_CLIP = 1.0         # The gradient is clipped between (-GRAD_CLIP and GRAD_CLIP)

In [None]:
analyze_variant("q2", 2)

### Third Trial

In [None]:
LR = 5e-4               # Learning rate
BUFFER_SIZE = int(1e5)  # Replay buffer size 
BATCH_SIZE = 64         # Training batch size
UPDATE_FREQ = 20        # Update frequency of target network 
GAMMA = 0.9            # Discount factor
GRAD_CLIP = 1.0         # The gradient is clipped between (-GRAD_CLIP and GRAD_CLIP)

In [None]:
analyze_variant("q1", 3)

### Fourth Trial

In [None]:
LR = 5e-4               # Learning rate
BUFFER_SIZE = int(1e5)  # Replay buffer size 
BATCH_SIZE = 128         # Training batch size
UPDATE_FREQ = 20        # Update frequency of target network 
GAMMA = 0.9            # Discount factor
GRAD_CLIP = 1.0         # The gradient is clipped between (-GRAD_CLIP and GRAD_CLIP)

In [None]:
analyze_variant("q1", 4)

### Fifth Trial

In [None]:
LR = 5e-4               # Learning rate
BUFFER_SIZE = int(1e5)  # Replay buffer size 
BATCH_SIZE = 128         # Training batch size
UPDATE_FREQ = 40        # Update frequency of target network 
GAMMA = 0.9            # Discount factor
GRAD_CLIP = 1.0         # The gradient is clipped between (-GRAD_CLIP and GRAD_CLIP)

In [None]:
analyze_variant("q1", 5)

### Sixth Trial

In [None]:
LR = 5e-4              # Learning rate
BUFFER_SIZE = int(1e5)  # Replay buffer size 
BATCH_SIZE = 128         # Training batch size
UPDATE_FREQ = 20        # Update frequency of target network 
GAMMA = 0.9            # Discount factor
GRAD_CLIP = 100.0         # The gradient is clipped between (-GRAD_CLIP and GRAD_CLIP)

In [None]:
analyze_variant("q2", 6)

### Seventh Trial

In [None]:
LR = 7.5e-4               # Learning rate
BUFFER_SIZE = int(1e5)  # Replay buffer size 
BATCH_SIZE = 128        # Training batch size
UPDATE_FREQ = 20        # Update frequency of target network 
GAMMA = 0.9            # Discount factor
GRAD_CLIP = 100.0         # The gradient is clipped between (-GRAD_CLIP and GRAD_CLIP)

In [None]:
analyze_variant("q2", 7)

### Eighth Trial

In [None]:
LR = 7.5e-4               # Learning rate
BUFFER_SIZE = int(1e5)  # Replay buffer size 
BATCH_SIZE = 256         # Training batch size
UPDATE_FREQ = 20        # Update frequency of target network 
GAMMA = 0.9            # Discount factor
GRAD_CLIP = 100.0         # The gradient is clipped between (-GRAD_CLIP and GRAD_CLIP)

In [None]:
# Eighth trial: use variant id 8 so the plots saved for the seventh trial
# (cart_variant_7_*.png) are not overwritten.
analyze_variant("q1", 8)

## Acrobot

We define DQNSolve() with the conditions for the acrobot environment:

In [None]:
class DQNSolve():
    """Trains a DQNAgent on `env` using the Acrobot stopping criterion."""

    def __init__(self, env, nn_arch, seed = 1):
        """Create the Q-network selected by `nn_arch` and wrap it in an agent.

        Params
        ======
            env: environment exposing `observation_space`, `action_space`,
                `reset()` and `step(action)`
            nn_arch (str): "q1", "q2" or "q3" (QNetwork1/2/3)
            seed (int): seed passed to the network and the agent

        Raises ValueError for an unknown `nn_arch` (previously this surfaced
        as an AttributeError on `self.q_network`).
        """
        self.state_shape = env.observation_space.shape[0]
        self.action_shape = env.action_space.n
        self.env = env

        architectures = {"q1": QNetwork1, "q2": QNetwork2, "q3": QNetwork3}
        if nn_arch not in architectures:
            raise ValueError(
                "Unknown nn_arch {!r}; expected one of {}".format(nn_arch, sorted(architectures))
            )
        self.q_network = architectures[nn_arch](self.state_shape, self.action_shape, seed)

        self.agent = DQNAgent(self.state_shape, self.action_shape, self.q_network, seed)

    def solve(self, n_episodes=10000, max_t = 500, eps_start=1.0, eps_end=0.01, eps_decay=0.995):
        """Train until the stopping criterion is met or `n_episodes` elapse.

        Training stops once more than 100 episodes have run and the mean
        score of the last 100 episodes is at least -100.

        Assumes the classic Gym API: `reset()` returns the observation and
        `step()` returns `(next_state, reward, done, info)`.

        Returns `[episodes_run, rewards, steps]` where `rewards` and `steps`
        hold the total reward and the number of steps of every episode.
        """
        rewards = []
        steps = []
        # Rolling window of the most recent 100 episode scores
        scores_window = deque(maxlen=100)
        # Initialize epsilon
        eps = eps_start
        # Keeps the return value defined when n_episodes == 0
        i_episode = 0

        for i_episode in range(1, n_episodes+1):

            state = self.env.reset()
            score = 0
            # Keeps `t` defined when max_t == 0
            t = -1
            for t in range(max_t):
                action = self.agent.act(state, eps)
                next_state, reward, done, _ = self.env.step(action)
                self.agent.step(state, action, reward, next_state, done)
                state = next_state
                score += reward
                if done:
                    break

            # Save most recent score; `t` is 0-based, so t + 1 steps were taken
            n_steps = t + 1
            scores_window.append(score)
            rewards.append(score)
            steps.append(n_steps)

            # Decrease epsilon
            eps = max(eps_end, eps_decay*eps)

            avg_score = np.mean(scores_window)
            print('\rEpisode {}\tAverage Score (prev 100): {:.2f}\tSteps:{}'.format(i_episode, avg_score, n_steps), end="")

            if avg_score >= -100.0 and i_episode > 100:
                print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode, avg_score))
                break

        return [i_episode, rewards, steps]

In [None]:
def analyze_variant(nn_arch, var_count, n_runs=10):
    """Train `n_runs` independent agents on the global `env` and plot averages.

    Run `i` uses seed `i`. Runs that finish early are padded with their final
    value so that all runs can be averaged episode by episode. Two plots are
    saved as "acro_variant_<var_count>_{reward,step}_plot.png" and shown.

    Params
    ======
        nn_arch (str): architecture key understood by DQNSolve
        var_count: identifier used in the output file names
        n_runs (int): number of independent runs to average (default 10)
    """
    if n_runs < 1:
        raise ValueError("n_runs must be at least 1")

    print("Running variant with hyperparameters: LR: {}, BUFFER_SIZE: {}, BATCH_SIZE: {}, UPDATE_FREQ: {}, GAMMA: {}, GRAD_CLIP: {}".format(LR, BUFFER_SIZE, BATCH_SIZE, UPDATE_FREQ, GAMMA, GRAD_CLIP))
    print("Using network architecture: ", nn_arch)
    run_episode_counts = []
    run_rewards = []
    run_steps = []

    # Averaging over n_runs runs
    for i in range(n_runs):
        print("Performing Run ", i+1)
        dq_solver = DQNSolve(env, nn_arch, seed = i)
        epi_count, rewards, steps = dq_solver.solve()
        run_episode_counts.append(epi_count)
        run_rewards.append(rewards)
        run_steps.append(steps)

    avg_num_episodes = np.mean(run_episode_counts)
    longest_run = np.max(run_episode_counts)

    def pad_and_average(per_run_values):
        # Extend every run to the longest run by repeating its last value,
        # then average across runs for each episode index.
        padded = np.ones((n_runs, longest_run))
        for row, (count, values) in enumerate(zip(run_episode_counts, per_run_values)):
            padded[row, :count] = values
            padded[row, count:] = values[-1]
        return np.mean(padded, axis = 0)

    avg_rewards = pad_and_average(run_rewards)
    avg_steps = pad_and_average(run_steps)

    print("Average number of episodes taken to solve the environment:", avg_num_episodes)

    # Plotting average rewards
    plt.plot(np.arange(avg_rewards.shape[0]), avg_rewards)
    plt.xlabel("Episode count")
    plt.ylabel("Total reward ")
    plt.title("Total reward per episode (averaged over {} runs)".format(n_runs))
    plt.savefig("acro_variant_{}_reward_plot.png".format(var_count))
    plt.show()

    # Plotting average steps
    plt.plot(np.arange(avg_steps.shape[0]), avg_steps)
    plt.xlabel("Episode count")
    plt.ylabel("Steps ")
    plt.title("Steps per episode (averaged over {} runs)".format(n_runs))
    plt.savefig("acro_variant_{}_step_plot.png".format(var_count))
    plt.show()

In [None]:
env = gym.make('Acrobot-v1')

### First trial

In [None]:
LR = 5e-4               # Learning rate
BUFFER_SIZE = int(1e5)  # Replay buffer size 
BATCH_SIZE = 64         # Training batch size
UPDATE_FREQ = 20        # Update frequency of target network 
GAMMA = 0.99            # Discount factor
GRAD_CLIP = 1.0         # The gradient is clipped between (-GRAD_CLIP and GRAD_CLIP)

In [None]:
analyze_variant("q1", 1)

### Second Trial

In [None]:
LR = 5e-4               # Learning rate
BUFFER_SIZE = int(1e5)  # Replay buffer size 
BATCH_SIZE = 64         # Training batch size
UPDATE_FREQ = 40        # Update frequency of target network 
GAMMA = 0.99            # Discount factor
GRAD_CLIP = 1.0         # The gradient is clipped between (-GRAD_CLIP and GRAD_CLIP)

In [None]:
analyze_variant("q2", 2)

### Third trial

In [None]:
LR = 5e-4               # Learning rate
BUFFER_SIZE = int(1e5)  # Replay buffer size 
BATCH_SIZE = 64         # Training batch size
UPDATE_FREQ = 40        # Update frequency of target network 
GAMMA = 0.99            # Discount factor
GRAD_CLIP = 1.0         # The gradient is clipped between (-GRAD_CLIP and GRAD_CLIP)

In [None]:
analyze_variant("q1", 3)

### Fourth trial

In [None]:
LR = 10e-4               # Learning rate
BUFFER_SIZE = int(1e5)  # Replay buffer size 
BATCH_SIZE = 64         # Training batch size
UPDATE_FREQ = 40        # Update frequency of target network 
GAMMA = 0.99            # Discount factor
GRAD_CLIP = 1.0         # The gradient is clipped between (-GRAD_CLIP and GRAD_CLIP)

In [None]:
analyze_variant("q2", 4)

### Fifth trial

In [None]:
LR = 5e-4               # Learning rate
BUFFER_SIZE = int(1e5)  # Replay buffer size 
BATCH_SIZE = 128         # Training batch size
UPDATE_FREQ = 40        # Update frequency of target network 
GAMMA = 0.999            # Discount factor
GRAD_CLIP = 1.0         # The gradient is clipped between (-GRAD_CLIP and GRAD_CLIP)

In [None]:
analyze_variant("q1", 5)

### Sixth trial

In [None]:
LR = 5e-4               # Learning rate
BUFFER_SIZE = int(1e5)  # Replay buffer size 
BATCH_SIZE = 128         # Training batch size
UPDATE_FREQ = 40        # Update frequency of target network 
GAMMA = 0.99            # Discount factor
GRAD_CLIP = 1.0         # The gradient is clipped between (-GRAD_CLIP and GRAD_CLIP)

In [None]:
analyze_variant("q1", 6)

### Seventh trial

In [None]:
LR = 5e-4               # Learning rate
BUFFER_SIZE = int(1e5)  # Replay buffer size 
BATCH_SIZE = 128         # Training batch size
UPDATE_FREQ = 40        # Update frequency of target network 
GAMMA = 0.99            # Discount factor
GRAD_CLIP = 100.0         # The gradient is clipped between (-GRAD_CLIP and GRAD_CLIP)

In [None]:
analyze_variant("q1", 7)

### Eighth trial

In [None]:
LR = 5e-4               # Learning rate
BUFFER_SIZE = int(1e5)  # Replay buffer size 
BATCH_SIZE = 256         # Training batch size
UPDATE_FREQ = 40        # Update frequency of target network 
GAMMA = 0.99            # Discount factor
GRAD_CLIP = 100.0         # The gradient is clipped between (-GRAD_CLIP and GRAD_CLIP)

In [None]:
# Eighth trial: variant id 8 keeps the saved plot names in step with the
# trial numbering (the previous id, 9, skipped 8).
analyze_variant("q1", 8)

## Mountain Car

In [None]:
class DQNSolve():
    """Trains a DQNAgent on `env` using the Mountain Car stopping criterion."""

    def __init__(self, env, nn_arch, seed = 1):
        """Create the Q-network selected by `nn_arch` and wrap it in an agent.

        Params
        ======
            env: environment exposing `observation_space`, `action_space`,
                `reset()` and `step(action)`
            nn_arch (str): "q1", "q2" or "q3" (QNetwork1/2/3)
            seed (int): seed passed to the network and the agent

        Raises ValueError for an unknown `nn_arch` (previously this surfaced
        as an AttributeError on `self.q_network`).
        """
        self.state_shape = env.observation_space.shape[0]
        self.action_shape = env.action_space.n
        self.env = env

        architectures = {"q1": QNetwork1, "q2": QNetwork2, "q3": QNetwork3}
        if nn_arch not in architectures:
            raise ValueError(
                "Unknown nn_arch {!r}; expected one of {}".format(nn_arch, sorted(architectures))
            )
        self.q_network = architectures[nn_arch](self.state_shape, self.action_shape, seed)

        self.agent = DQNAgent(self.state_shape, self.action_shape, self.q_network, seed)

    def solve(self, n_episodes=10000, max_t = 500, eps_start=1.0, eps_end=0.01, eps_decay=0.995):
        """Train until the stopping criterion is met or `n_episodes` elapse.

        Training stops after the first episode that ends (`done`) in fewer
        than 200 steps.

        Assumes the classic Gym API: `reset()` returns the observation and
        `step()` returns `(next_state, reward, done, info)`.

        Returns `[episodes_run, rewards, steps]` where `rewards` and `steps`
        hold the total reward and the number of steps of every episode.
        """
        rewards = []
        steps = []
        # Rolling window of the most recent 100 episode scores (reporting only)
        scores_window = deque(maxlen=100)
        # Initialize epsilon
        eps = eps_start
        # Keeps the return value defined when n_episodes == 0
        i_episode = 0

        for i_episode in range(1, n_episodes+1):

            state = self.env.reset()
            score = 0
            # Keep `t` and `done` defined when max_t == 0
            t, done = -1, False
            for t in range(max_t):
                action = self.agent.act(state, eps)
                next_state, reward, done, _ = self.env.step(action)
                self.agent.step(state, action, reward, next_state, done)
                state = next_state
                score += reward
                if done:
                    break

            # Save most recent score; `t` is 0-based, so t + 1 steps were taken
            n_steps = t + 1
            scores_window.append(score)
            rewards.append(score)
            steps.append(n_steps)

            # Decrease epsilon
            eps = max(eps_end, eps_decay*eps)

            print('\rEpisode {}\tAverage Score (prev 100): {:.2f}\tSteps:{}'.format(i_episode, np.mean(scores_window), n_steps), end="")

            # NOTE(review): presumably t == 199 corresponds to the environment's
            # own 200-step time limit, so an earlier `done` means the goal was
            # reached — confirm against the environment's step limit.
            if done and t < 199:
                print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)))
                break

        return [i_episode, rewards, steps]

In [None]:
def analyze_variant(nn_arch, var_count, n_runs=10):
    """Train `n_runs` independent agents on the global `env` and plot averages.

    Run `i` uses seed `i`. Runs that finish early are padded with their final
    value so that all runs can be averaged episode by episode. Two plots are
    saved as "mount_variant_<var_count>_{reward,step}_plot.png" and shown.

    Params
    ======
        nn_arch (str): architecture key understood by DQNSolve
        var_count: identifier used in the output file names
        n_runs (int): number of independent runs to average (default 10)
    """
    if n_runs < 1:
        raise ValueError("n_runs must be at least 1")

    print("Running variant with hyperparameters: LR: {}, BUFFER_SIZE: {}, BATCH_SIZE: {}, UPDATE_FREQ: {}, GAMMA: {}, GRAD_CLIP: {}".format(LR, BUFFER_SIZE, BATCH_SIZE, UPDATE_FREQ, GAMMA, GRAD_CLIP))
    print("Using network architecture: ", nn_arch)
    run_episode_counts = []
    run_rewards = []
    run_steps = []

    # Averaging over n_runs runs
    for i in range(n_runs):
        print("Performing Run ", i+1)
        dq_solver = DQNSolve(env, nn_arch, seed = i)
        epi_count, rewards, steps = dq_solver.solve()
        run_episode_counts.append(epi_count)
        run_rewards.append(rewards)
        run_steps.append(steps)

    avg_num_episodes = np.mean(run_episode_counts)
    longest_run = np.max(run_episode_counts)

    def pad_and_average(per_run_values):
        # Extend every run to the longest run by repeating its last value,
        # then average across runs for each episode index.
        padded = np.ones((n_runs, longest_run))
        for row, (count, values) in enumerate(zip(run_episode_counts, per_run_values)):
            padded[row, :count] = values
            padded[row, count:] = values[-1]
        return np.mean(padded, axis = 0)

    avg_rewards = pad_and_average(run_rewards)
    avg_steps = pad_and_average(run_steps)

    print("Average number of episodes taken to solve the environment:", avg_num_episodes)

    # Plotting average rewards
    plt.plot(np.arange(avg_rewards.shape[0]), avg_rewards)
    plt.xlabel("Episode count")
    plt.ylabel("Total reward ")
    plt.title("Total reward per episode (averaged over {} runs)".format(n_runs))
    plt.savefig("mount_variant_{}_reward_plot.png".format(var_count))
    plt.show()

    # Plotting average steps
    plt.plot(np.arange(avg_steps.shape[0]), avg_steps)
    plt.xlabel("Episode count")
    plt.ylabel("Steps ")
    plt.title("Steps per episode (averaged over {} runs)".format(n_runs))
    plt.savefig("mount_variant_{}_step_plot.png".format(var_count))
    plt.show()

In [None]:
env = gym.make('MountainCar-v0')

### First trial

In [None]:
LR = 5e-4               # Learning rate
BUFFER_SIZE = int(1e5)  # Replay buffer size 
BATCH_SIZE = 64         # Training batch size
UPDATE_FREQ = 20        # Update frequency of target network 
GAMMA = 0.99            # Discount factor
GRAD_CLIP = 1.0         # The gradient is clipped between (-GRAD_CLIP and GRAD_CLIP)

In [None]:
analyze_variant("q1", 1)

### Second trial

In [None]:
LR = 5e-4               # Learning rate
BUFFER_SIZE = int(1e5)  # Replay buffer size 
BATCH_SIZE = 64         # Training batch size
UPDATE_FREQ = 20        # Update frequency of target network 
GAMMA = 0.99            # Discount factor
GRAD_CLIP = 1.0         # The gradient is clipped between (-GRAD_CLIP and GRAD_CLIP)

In [None]:
# Second trial: use variant id 2 so the plots saved for the first trial
# (mount_variant_1_*.png) are not overwritten.
analyze_variant("q2", 2)

### Third trial

In [None]:
LR = 5e-4               # Learning rate
BUFFER_SIZE = int(1e5)  # Replay buffer size 
BATCH_SIZE = 64         # Training batch size
UPDATE_FREQ = 20        # Update frequency of target network 
GAMMA = 0.99            # Discount factor
GRAD_CLIP = 1.0         # The gradient is clipped between (-GRAD_CLIP and GRAD_CLIP)

In [None]:
analyze_variant("q3", 3)

### Fourth trial

In [None]:
LR = 5e-4               # Learning rate
BUFFER_SIZE = int(1e5)  # Replay buffer size 
BATCH_SIZE = 128         # Training batch size
UPDATE_FREQ = 20        # Update frequency of target network 
GAMMA = 0.99            # Discount factor
GRAD_CLIP = 1.0         # The gradient is clipped between (-GRAD_CLIP and GRAD_CLIP)

In [None]:
analyze_variant("q3", 4)

### Fifth trial

In [None]:
LR = 5e-4               # Learning rate
BUFFER_SIZE = int(1e5)  # Replay buffer size 
BATCH_SIZE = 128         # Training batch size
UPDATE_FREQ = 40        # Update frequency of target network 
GAMMA = 0.99            # Discount factor
GRAD_CLIP = 1.0         # The gradient is clipped between (-GRAD_CLIP and GRAD_CLIP)

In [None]:
analyze_variant("q3", 5)

### Sixth trial

In [None]:
LR = 5e-4               # Learning rate
BUFFER_SIZE = int(1e5)  # Replay buffer size 
BATCH_SIZE = 128         # Training batch size
UPDATE_FREQ = 20        # Update frequency of target network 
GAMMA = 0.99            # Discount factor
GRAD_CLIP = 100.0         # The gradient is clipped between (-GRAD_CLIP and GRAD_CLIP)

In [None]:
analyze_variant("q3", 6)

### Seventh trial

In [None]:
LR = 10e-4               # Learning rate
BUFFER_SIZE = int(1e5)  # Replay buffer size 
BATCH_SIZE = 64         # Training batch size
UPDATE_FREQ = 20        # Update frequency of target network 
GAMMA = 0.9            # Discount factor
GRAD_CLIP = 100.0         # The gradient is clipped between (-GRAD_CLIP and GRAD_CLIP)

In [None]:
analyze_variant("q3", 7)