# Assignment 5

In [None]:
!pip install gym torch numpy

 #### Environment Setup
- Initializing the **CartPole-v1** environment using the `gym` library (OpenAI Gym; `gymnasium` is its maintained fork).
- Retrieving and printing the state and action space sizes of the environment.


In [None]:
import gym
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
from collections import deque
import matplotlib.pyplot as plt

# Create the CartPole-v1 environment, asking for RGB-array rendering.
env = gym.make("CartPole-v1", render_mode="rgb_array")

# Sizes used later to build the networks: number of discrete actions and
# length of the observation vector.
action_size = env.action_space.n
state_size = env.observation_space.shape[0]

for label, value in (("State size:", state_size), ("Action size:", action_size)):
    print(label, value)

#### Build the DQN Model
- Creating a neural network class DQN with 4 fully connected layers.
- The network takes a state vector of length `state_size` as input and outputs `action_size` Q-values (one per action).


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Fully connected (linear) layers, with ReLU used as the activation function
class DQN(nn.Module):
    """Feed-forward Q-network: state vector in, one Q-value per action out.

    The network is a stack of four linear layers. The first three are each
    followed by a ReLU; the last one is left without an activation so that
    its outputs are unbounded Q-value estimates.

    Args:
        state_size: Number of input features.
        action_size: Number of outputs (one Q-value per action).
        hidden_size: Width of every hidden layer.
    """

    def __init__(self, state_size, action_size, hidden_size=64):
        super().__init__()

        # Input layer, two hidden-to-hidden layers, then the output layer.
        self.fc1 = nn.Linear(state_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, hidden_size)
        self.fc4 = nn.Linear(hidden_size, action_size)

    def forward(self, x):
        """Return the Q-values computed for the input tensor `x`."""
        hidden = x
        for layer in (self.fc1, self.fc2, self.fc3):
            hidden = F.relu(layer(hidden))
        # No activation on the final layer.
        return self.fc4(hidden)

#### Implement the Replay Buffer
- Creating a ReplayBuffer class using Python’s deque.
- Implementing methods to add new experiences (state, action, reward, next state, done flag) and to sample a random batch of experiences.

In [None]:
import random
from collections import deque

class ReplayBuffer:
    """Bounded experience store built on `collections.deque`.

    Args:
        capacity: Maximum number of experiences kept; passed to the deque as
            `maxlen`, so appending to a full buffer discards the oldest entry.
    """

    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def add(self, state, action, reward, nxt_state, comp):
        """Append one experience tuple (state, action, reward, next state, done flag)."""
        self.buffer.append((state, action, reward, nxt_state, comp))

    def sample(self, batch_size):
        """Draw `batch_size` distinct experiences at random.

        Returns:
            A 5-tuple of NumPy arrays (states, actions, rewards, next_states,
            dones), each holding one field of the sampled experiences.
        """
        transitions = random.sample(self.buffer, batch_size)
        # zip(*...) regroups the sampled tuples field by field.
        states, actions, rewards, next_states, dones = (
            np.array(column) for column in zip(*transitions)
        )
        return states, actions, rewards, next_states, dones

    def __len__(self):
        """Number of experiences currently stored."""
        return len(self.buffer)

#### Defining the DQN Agent

- Initializing the main and target networks, optimizer, and epsilon variables in **DQNAgent**.
- Implementing methods for choosing an action (**get_action**) and updating the target network (**update_target_network**).
- Adding a train method in DQNAgent to sample from the buffer and train the model.
- Using mean squared error (MSE) as the loss function.

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random

class DQNAgent:
    """DQN agent holding a main network, a target network and an epsilon-greedy policy.

    Args:
        state_dim: Length of the state vector fed to the networks.
        action_dim: Number of discrete actions (number of Q-values produced).
        hidden_dim: Hidden layer width passed to `DQN`.
        lr: Learning rate of the Adam optimizer.
        gamma: Discount factor applied to the next-state value in the target.
        epsilon: Initial probability of picking a random action.
        epsilon_min: Lower bound that epsilon is never decayed below.
        epsilon_decay: Factor epsilon is multiplied by on every `update_epsilon` call.
    """

    def __init__(self, state_dim, action_dim, hidden_dim=64, lr=0.001, gamma=0.99, epsilon=1.0, epsilon_min=0.01, epsilon_decay=0.995):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay

        # The main network is the one being optimized; the target network
        # starts as an exact copy of it and is only refreshed through
        # update_target_network().
        self.main_network = DQN(state_dim, action_dim, hidden_dim)
        self.target_network = DQN(state_dim, action_dim, hidden_dim)
        self.update_target_network()

        # Adam optimizes the main network only; the loss is mean squared error.
        self.optimizer = optim.Adam(self.main_network.parameters(), lr=lr)
        self.loss_fn = nn.MSELoss()

    def get_action(self, state):
        """Choose an action with the epsilon-greedy policy.

        With probability `epsilon` a uniformly random action is returned
        (exploration); otherwise the action with the largest Q-value from the
        main network is returned (exploitation). As epsilon decays, the agent
        explores less and exploits more.

        Args:
            state: A single state (array-like of length `state_dim`).

        Returns:
            The chosen action index as a Python int.
        """
        if random.random() < self.epsilon:
            return random.randrange(self.action_dim)

        # Add a leading batch dimension of 1 before the forward pass.
        state_tensor = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0)
        with torch.no_grad():
            q_values = self.main_network(state_tensor)
        return torch.argmax(q_values).item()

    def update_target_network(self):
        """Copy the main network's weights into the target network."""
        self.target_network.load_state_dict(self.main_network.state_dict())

    def update_epsilon(self):
        """Multiply epsilon by `epsilon_decay`, never going below `epsilon_min`."""
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

    def train(self, replay_buffer, batch_size):
        """Run one optimization step on a batch sampled from `replay_buffer`.

        Does nothing while the buffer holds fewer than `batch_size`
        experiences. Otherwise it performs one gradient step on the main
        network and then decays epsilon once, so epsilon decays per training
        step rather than per episode.

        Args:
            replay_buffer: Object supporting `len()` and
                `sample(batch_size)` returning (states, actions, rewards,
                next_states, dones).
            batch_size: Number of experiences to sample.
        """
        if len(replay_buffer) < batch_size:
            return

        # Sample a batch of experiences from the replay buffer.
        states, actions, rewards, next_states, dones = replay_buffer.sample(batch_size)

        # Convert to tensors with explicit dtypes; boolean done flags become 0.0 / 1.0.
        states = torch.as_tensor(states, dtype=torch.float32)
        actions = torch.as_tensor(actions, dtype=torch.int64).unsqueeze(1)
        rewards = torch.as_tensor(rewards, dtype=torch.float32)
        next_states = torch.as_tensor(next_states, dtype=torch.float32)
        dones = torch.as_tensor(dones, dtype=torch.float32)

        # Q(s, a) from the main network for the actions actually taken.
        # squeeze(1) drops only the gathered column, so a batch of size 1
        # keeps shape (1,) and matches the target's shape.
        q_values = self.main_network(states).gather(1, actions).squeeze(1)

        # Target: r + gamma * max_a' Q_target(s', a'), with the bootstrap term
        # zeroed where the done flag is set.
        with torch.no_grad():
            next_q_values = self.target_network(next_states).max(1)[0]
            target_q_values = rewards + self.gamma * next_q_values * (1 - dones)

        loss = self.loss_fn(q_values, target_q_values)

        # Backpropagate, take an optimizer step, then decay epsilon.
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        self.update_epsilon()

#### Training the DQN Agent
- Train the agent over 500 episodes (`n_eps = 500`).

In [None]:
import gym
import torch
import numpy as np
import matplotlib.pyplot as plt

# Hyperparameters
hidden_dim = 64
lr = 0.001
gamma = 0.99
epsilon = 1.0
epsilon_min = 0.01
epsilon_decay = 0.995
capacity = 10000
batch_size = 64
n_eps = 500


def _reset_env(env):
    """Reset `env` and return only the initial observation.

    Newer gym/gymnasium releases return `(observation, info)` from `reset()`,
    while older gym releases return the bare observation; both are accepted.
    """
    result = env.reset()
    if isinstance(result, tuple) and len(result) == 2 and isinstance(result[1], dict):
        return result[0]
    return result


def _step_env(env, action):
    """Step `env` and return `(next_state, reward, terminated, episode_over)`.

    Newer gym/gymnasium releases return five values
    `(obs, reward, terminated, truncated, info)`; older gym releases return
    four `(obs, reward, done, info)`. For the four-value form the single
    `done` flag is used for both `terminated` and `episode_over`.
    """
    result = env.step(action)
    if len(result) == 5:
        next_state, reward, terminated, truncated, _ = result
        return next_state, reward, terminated, (terminated or truncated)
    next_state, reward, done, _ = result
    return next_state, reward, done, done


# Initialising the agent and the replay buffer
agent = DQNAgent(state_size, action_size, hidden_dim, lr, gamma, epsilon, epsilon_min, epsilon_decay)
replay_buffer = ReplayBuffer(capacity)

# List to keep track of the total reward of every episode
rewards_list = []
for episode in range(n_eps):
    state = _reset_env(env)
    total_reward = 0

    while True:
        # Ask the agent for an action (epsilon-greedy)
        action = agent.get_action(state)

        # Take the action in the environment and collect the reward
        next_state, reward, terminated, episode_over = _step_env(env, action)
        total_reward += reward

        # Store the experience. The stored flag is `terminated`, so an episode
        # that merely hit the time limit (truncated) still bootstraps from the
        # next state's value during training.
        replay_buffer.add(state, action, reward, next_state, terminated)

        agent.train(replay_buffer, batch_size)
        state = next_state

        if episode_over:
            break

    # Record the episode's total reward
    rewards_list.append(total_reward)
    # Sync the target network every 10 episodes
    if episode % 10 == 0:
        agent.update_target_network()
    # Log progress every 50 episodes
    if episode % 50 == 0:
        print(f"Episode {episode}, Total Reward: {total_reward}, Epsilon: {agent.epsilon:.3f}")

env.close()

#### Visualizing Training Performance
Plotting rewards across episodes to observe the learning curve.

In [None]:
import matplotlib.pyplot as plt

# Learning curve: total reward obtained in each training episode.
fig, ax = plt.subplots(figsize=(16, 8))
ax.plot(rewards_list)
ax.set_xlabel("Episode")
ax.set_ylabel("Reward")
ax.set_title("DQN Training Rewards")
plt.show()

#### References

- https://www.geeksforgeeks.org/q-learning-in-python/
- https://www.geeksforgeeks.org/deep-q-learning/
- https://www.geeksforgeeks.org/how-are-neural-networks-used-in-deep-q-learning/
- https://huggingface.co/learn/deep-rl-course/en/unit3/deep-q-algorithm