In [1]:
# IMPORT NECESSARY LIBRARIES

import gym
import math
import random
import numpy as np
import matplotlib
import tensorflow as tf
from itertools import count
import matplotlib.pyplot as plt
from collections import namedtuple
from tensorflow.keras import Sequential, Model
from tensorflow.keras.layers import Flatten, Conv2D, Dense, Input
%matplotlib inline


In [2]:
# GET THE ENVIRONMENT

# Initialize the FrozenLake-v0 gym environment
env = gym.make("FrozenLake-v0")

# Get the number of possible actions
n_outputs = env.action_space.n


### NEURAL NETWORK MODEL

In [3]:
# Function to return the neural network model
# The input to the network is the state of the environment
# It should return the q-value for all possible actions

def get_model(name):
  model = Sequential(name=name)
  model.add(Input(1, name="Input"))
  model.add(Dense(24, activation='relu', name="Hidden1"))
  model.add(Dense(32, activation='relu', name="Hidden2"))
  model.add(Dense(n_outputs, activation='linear', name="Output"))
  return model


### REPLAY MEMORY

In [4]:
# Creating a namedtuple class Experience that will hold the state, action, next state and reward
Experience = namedtuple('Experience', ('state', 'action', 'next_state', 'reward'))


In [5]:
# Class ReplayMemory to add experience to our class instance and
# sample memory 
class ReplayMemory():

  # __init__ function to initialize the memory capacity
  def __init__ (self, capacity):
    self.memory_capacity = capacity
    self.memory = []
    self.push_count = 0

  # Method to add memory instance
  def push(self, experience):
    if len(self.memory) < self.memory_capacity:
      self.memory.append(experience)
    else:
      self.memory[self.push_count % self.capacity] = experience
    self.push_count+=1

  # Method to sample from memory
  def sample(self, batch_size):
    return random.sample(self.memory, batch_size)




### ACTION SELECTION - EPSILON GREEDY POLICY

In [6]:
# Define a function epsilon greedy that takes as parameter, the neural network,
# state and episode number
def epsilon_greedy(policy_network, state, episode):

  # Compute a random threshold value
  threshold = np.random.uniform(0,1)

  # Compute the epsilon value (exploration rate) based on the equation in the isntructions
  epsilon = eps_min + (eps_max-eps_min) * np.exp(-eps_decay_rate*episode)

  # Check if the threshold value is lower than the updated epsilon
  if threshold < epsilon:

      # Take a random action
      action = env.action_space.sample()

  # Else take the best action for that state by selecting the one that gives the
  # highest q-value when given as input to the neural network
  else:
      q_value = tf.reshape(tf.convert_to_tensor(state), shape=(-1,1) )
      action = np.argmax(policy_network(q_value))

  # Return the action
  return action


### ALGORITHM PARAMETERS

In [7]:
# Set the batch size
batch_size = 256

# Set the discount rate
gamma = 0.99

# Set the minimum discount rate
eps_min = 0.01

# Set the maximum discount rate
eps_max = 1

# Set the epsilon decay rate
eps_decay_rate = 0.001

# Set the number of updates to the policy network after
# which we update the target network
target_update = 100

# Set capacity of replay memory
memory_size = 100000

# Set learning rate of policy network
lr = 0.001

# Set the number of episodes
num_episodes = 1000

# Warm-up steps - the number of experiences to store before
# model training begins
# It has to be atleast one more than the batch size
warm_up_steps = batch_size*2

# Set the number of steps in each episode
steps_per_episode = 100


### THE DQN - TRAINING

In [8]:
# Create an instance of the ReplayMemory class by specifying the 
# memory capacity
memory = ReplayMemory(memory_size)

In [9]:
# Initialize the policy network by calling the get_model function
policy_network = get_model(name='Policy_Network')

# Initialize the target network by calling the get_model function
target_network = get_model(name='Target_Network')

# Set the target network weights equal to that of the policy network
target_network.set_weights(policy_network.get_weights())


In [10]:
# Define the optimizer to train the model
optimizer = tf.keras.optimizers.Adam(learning_rate=lr)

# Define the loss function of the model
loss_fn = tf.keras.losses.MeanSquaredError()


In [11]:
# Variable to store the total number of steps in all episodes
global_step_count = 0


In [12]:
# Loop over all the episodes
for episode in range(1,num_episodes+1):

  # Get the initial state of the environment  
  state = env.reset()

  # Initialize a counter to update the target network
  update_count = 0

  # Loop over the maximium number of steps in the environment
  for step in range(steps_per_episode):

    # Increment the total step count variable
    global_step_count+=1

    # Get the action based on the epsilon greedy policy
    action = epsilon_greedy(policy_network, state, episode)

    # Get the next state, reward, whether or not the episode is done 
    # by calling the step method with the action
    next_state, reward, done, info = env.step(action)

    # Push this experience to the replay memory by calling the push method 
    # of the ReplayMemory class with the appropriate parameters
    memory.push(Experience(state, action, next_state, reward))

    # Update the current state of the environment
    state = next_state

    # Check if the total number of steps is greater than the warm-up steps to begin training
    if global_step_count > warm_up_steps:

      # Increment the counter to update the target network
      update_count+=1

      # Sample experiences from the memory
      experiences = np.asarray(memory.sample(batch_size))

      # Split the experience into a list of states, actions, next_states and rewards
      states, actions, next_states, rewards = experiences[:,0].tolist(), experiences[:,1].tolist(), experiences[:,2].tolist(), experiences[:,3].tolist()

      # Compute the target q-values from the target network by passing the next states 
      next_q_values = [tf.cast(tf.math.reduce_max(target_network(tf.reshape(tf.convert_to_tensor(next_states[i]), shape=(-1,1)))), dtype=tf.float32) for i in range(batch_size)]
      # next_q_values = [max(target_network(tf.reshape(tf.convert_to_tensor(next_states[i]), shape=(-1,1)))[0].numpy())  for i in range(batch_size)]
      target_q_values = [(next_q_values[i]*gamma)+ rewards[i] for i in range(batch_size)] 

      # Initialize a gradient tape
      with tf.GradientTape() as tape:

        # Get the predicted q-values from the policy network by passing the current states and the specific action from memory
        predicted_q_values = [policy_network(tf.reshape(tf.convert_to_tensor(states[i]), shape=(-1,1)))[0][int(actions[i])] for i in range(batch_size)]
        
        # Compute the loss between the target q-values and the predicted q-values
        loss = loss_fn(target_q_values, predicted_q_values)

      # Get the gradients wrt the policy network
      gradients = tape.gradient(loss, policy_network.trainable_weights)

      # Update the weights of the policy network
      optimizer.apply_gradients(zip(gradients, policy_network.trainable_weights))
    
    # If the update counter has reached the update threshold defined as target_update above
    if update_count%target_update==0:

      # Update the weights of the target network with that of the policy network
      print("Target Policy Updated after ", update_count, "updates to the policy network")
      target_network.set_weights(policy_network.get_weights())

    # If done is True, then end the episode
    if done==True:
      break



Target Policy Updated after epsiode 1
Target Policy Updated after epsiode 1
Target Policy Updated after epsiode 1
Target Policy Updated after epsiode 1
Target Policy Updated after epsiode 2
Target Policy Updated after epsiode 2
Target Policy Updated after epsiode 2
Target Policy Updated after epsiode 3
Target Policy Updated after epsiode 3
Target Policy Updated after epsiode 3
Target Policy Updated after epsiode 3
Target Policy Updated after epsiode 3
Target Policy Updated after epsiode 3
Target Policy Updated after epsiode 3
Target Policy Updated after epsiode 3
Target Policy Updated after epsiode 3
Target Policy Updated after epsiode 3
Target Policy Updated after epsiode 3
Target Policy Updated after epsiode 3
Target Policy Updated after epsiode 3
Target Policy Updated after epsiode 3
Target Policy Updated after epsiode 3
Target Policy Updated after epsiode 4
Target Policy Updated after epsiode 4
Target Policy Updated after epsiode 4
Target Policy Updated after epsiode 4
Target Polic

### PREDICTION

In [16]:
# Get the initial state of the environemnt
state = env.reset()

# Set done as False
done = False

# Initialize a counter variable to keep track of the number of steps
# before the agent reached the goal or falls into a hole
count = 0

# Loop over the episode
while done!=True:

  # Get the action for the given state from the policy network
  action = np.argmax(policy_network(tf.reshape(tf.convert_to_tensor(state), shape=(-1,1))))

  # Get the next state and reward from the environment for the particular action taken
  next_state, reward, done, info = env.step(action) 

  # Print the state and action taken
  print("State:", state, "\tAction:", action)

  # Update the current state
  state = next_state

  # Increment the step counter
  count+=1

# Check if the agent has reached the goal
if state ==15:
  print("Goal reached after",count, "steps\n")
else:
  print("Oops, fell into the hole!\n")

env.render()

State: 0 	Action: 2
State: 0 	Action: 2
State: 0 	Action: 2
State: 4 	Action: 1
State: 4 	Action: 1
Oops, fell into the hole!

  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
