In [3]:
#!pip install pygame
import gym
import numpy as np
import torch
from torch import nn, optim
import ptan
import pygame

class CartPoleEnv(gym.Env):
    def __init__(self):
        super(CartPoleEnv, self).__init__()
        self.env = gym.make("CartPole-v1")
        self.action_space = self.env.action_space
        self.observation_space = self.env.observation_space

    def reset(self):
        return self.env.reset()

    def step(self, action):
        next_obs, reward, _is_done, _, _ = self.env.step(action)
        return next_obs, reward, _is_done, {}





Deep Reinforcement Learning with PyTorch and PTAN: A Practical Guide

Title: Deep Reinforcement Learning with PyTorch and PTAN: A Practical Guide
Introduction
Reinforcement Learning (RL) is a powerful paradigm in machine learning, and its application to deep neural networks has led to significant advancements. In this blog post, we'll explore a practical implementation of a Deep Q-Network (DQN) using PyTorch and PTAN (PyTorch Agent Net) and discuss the benefits of deploying this architecture.

Environment Setup
To get started, we'll create a custom RL environment using the CIFAR-10 dataset. The environment provides a simplified interface for training an agent to perform tasks related to image classification. It utilizes PyTorch's DataLoader for efficient data handling.

In [7]:

# class DQN(nn.Module):
#     def __init__(self, input_size, n_actions):
#         super(DQN, self).__init__()

#         self.fc_layers = nn.Sequential(
#             nn.Linear(input_size, 128),
#             nn.ReLU(),
#             nn.Linear(128, n_actions)
#         )

#     def forward(self, x):
#         print("model forward")
#         x = self.fc_layers(x)

#         return x
# class LinearAlgebraActionSelector(ptan.actions.ActionSelector):
#     def __call__(self, q_values):
#         # Round the output to obtain integer predictions
#         #q_values = torch.round(q_values)
#         print(f"Running Action selector {np.argmax(q_values)}")
#         #action = np.argmax(q_values)  # Corrected method name
#         return q_values





DQN Architecture
The DQN is a convolutional neural network (CNN) designed to process image observations. The architecture includes convolutional layers followed by fully connected layers. PyTorch's neural network module is employed for constructing the model.

In [2]:
# Example of using the CartPoleEnv
env = CartPoleEnv()

# Reset the environment
obs = env.reset()
print(env.action_space)
# Sample random actions for 10 steps
for _ in range(10):
    action = env.action_space.sample()
    next_obs, reward, _is_done, _= env.step(action)
    
    print(f"Action: {action}, Reward: {reward}, Done: {_is_done} Obs: {next_obs}")

Discrete(2)
Action: 1, Reward: 1.0, Done: False Obs: [ 0.0504558   0.23710015 -0.01167121 -0.2672277 ]
Action: 1, Reward: 1.0, Done: False Obs: [ 0.0551978   0.43238673 -0.01701577 -0.56356883]
Action: 1, Reward: 1.0, Done: False Obs: [ 0.06384553  0.62774324 -0.02828714 -0.8615636 ]
Action: 1, Reward: 1.0, Done: False Obs: [ 0.0764004   0.82323873 -0.04551842 -1.1630049 ]
Action: 0, Reward: 1.0, Done: False Obs: [ 0.09286518  0.6287381  -0.06877851 -0.8849339 ]
Action: 0, Reward: 1.0, Done: False Obs: [ 0.10543993  0.434614   -0.08647719 -0.61464113]
Action: 1, Reward: 1.0, Done: False Obs: [ 0.11413222  0.63083106 -0.09877001 -0.9332595 ]
Action: 1, Reward: 1.0, Done: False Obs: [ 0.12674883  0.8271369  -0.1174352  -1.2552743 ]
Action: 1, Reward: 1.0, Done: False Obs: [ 0.14329158  1.0235502  -0.1425407  -1.5823116 ]
Action: 1, Reward: 1.0, Done: False Obs: [ 0.16376257  1.2200516  -0.17418692 -1.9158397 ]


  if not isinstance(terminated, (bool, np.bool8)):


PTAN: PyTorch Agent Net
PTAN is a high-level library that simplifies the implementation of reinforcement learning algorithms. It provides useful abstractions for experience replay buffers, action selectors, and agent interfaces. The DQNAgent class from PTAN is utilized to connect the DQN model with the RL environment.

In [6]:
# Define a simple Experience class for illustration purposes
class Experience:
    def __init__(self, state, action, reward, done, next_state):
        self.state = state
        self.action = action
        self.reward = reward
        self.done = done
        self.next_state = next_state

In [4]:

import torch
import numpy as np








In [6]:

# Get the size of the observation space for CartPole
obs_size = env.observation_space.shape[0]

# Create CartPole environment
env = CartPoleEnv()

# Neural network and optimizer
net = DQN(obs_size, env.action_space.n)
target_net = ptan.agent.TargetNet(net)
selector = ptan.actions.ArgmaxActionSelector()
epsilon_greedy = ptan.actions.EpsilonGreedyActionSelector(epsilon=0.1, selector=selector)
# action_selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=0.1)
agent = ptan.agent.DQNAgent(net, action_selector=epsilon_greedy, preprocessor=ptan.agent.float32_preprocessor)

# Experience source
exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=0.99, steps_count=1)

# Experience buffer
buffer = ptan.experience.ExperienceReplayBuffer(exp_source, buffer_size=1000)

# Loss function and optimizer
loss_func = nn.MSELoss()
optimizer = optim.Adam(net.parameters(), lr=1e-3)

best_reward = float('-inf')  # Initialize with negative infinity or other appropriate value
best_model_path = "best_model.pth"  # Define the path where you want to save the best model
checkpoint_interval = 5
current_reward = 0.0
prev_loss = float('-inf')


  from .autonotebook import tqdm as notebook_tqdm


In [38]:
# env = gym.make("CartPole-v1", render_mode="human")
# env.reset()
# # Get the size of the observation space for CartPole
# obs_size = env.observation_space.shape[0]
# net = DQN(obs_size, env.action_space.n)
# target_net = ptan.agent.TargetNet(net)
# #selector = ptan.actions.ArgmaxActionSelector()
# selector = LinearAlgebraActionSelector()
# #epsilon_greedy = ptan.actions.EpsilonGreedyActionSelector(epsilon=0.1, selector=selector)
# agent = ptan.agent.DQNAgent(net, action_selector=selector, preprocessor=float32_preprocessor)
# # Experience source
# exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=0.99, steps_count=1)

# # Experience buffer
# buffer = ptan.experience.ExperienceReplayBuffer(exp_source, buffer_size=1000)
# next_obs, reward, _is_done, _, _ = env.step(env.action_space.sample())
# while True:
#     env.render()
    
#     # Wrap the state in a batch-like structure
    
    
#     action = agent([next_obs])
#     next_obs, reward, done, _, _ = env.step(action[0].item())  # Extract the action value from the tensor
#     print(reward,done)
#     # exp_source.append(state=state, action=action, reward=reward, last_state=nstate, done=done)
#     if done:
#         break

# env.close()

In [9]:
print(buffer)

<ptan.experience.ExperienceReplayBuffer object at 0x00000273C1597390>


In [5]:
def float32_preprocessor(states):
    # Assuming states is a list of arrays with varying lengths
    max_len = max(len(state[0]) if hasattr(state[0], '__len__') and len(state[0]) > 0 else 1 for state in states)
    print(f"max_len {max_len}")
    p_states = []

    for state in states:
        observation_state = state[0]
        print(f"observation state {observation_state}")
        if hasattr(observation_state, '__len__') and len(observation_state) > 0:
            padded_state = torch.nn.functional.pad(torch.tensor(observation_state, dtype=torch.float32), (0, max_len - len(observation_state)))
            print(f"pad state up{padded_state.shape }")
        else:
            # Handle scalar values (e.g., NumPy float) by repeating them to the fixed length
            padded_state = torch.tensor([observation_state] * max_len, dtype=torch.float32)
            padded_state= padded_state.expand(4)
            print(f"pad state dow : {padded_state.shape}")
        p_states.append(padded_state)
        print(p_states)
    # Stack the padded states along a new dimension
        np_states = np.stack(p_states, axis=0)

    print(f" np_states shape {np_states.shape}")
    return torch.tensor(np_states)


def is_done(_, last_state):
    return last_state is None  # Example condition, adjust as needed

class DQN(nn.Module):
    def __init__(self, input_size, n_actions):
        super(DQN, self).__init__()

        self.fc_layers = nn.Sequential(
            nn.Linear(input_size, 128),
            nn.ReLU(),
            nn.Linear(128, n_actions)
        )

    def forward(self, x):
        print("model forward")
        x = self.fc_layers(x)

        return x
# Create CartPole environment
env = CartPoleEnv()

# Neural network and optimizer
obs_size = env.observation_space.shape[0]
print(env.action_space)
net = DQN(obs_size, env.action_space.n)
target_net = ptan.agent.TargetNet(net)
selector = ptan.actions.ArgmaxActionSelector()
epsilon_greedy = ptan.actions.EpsilonGreedyActionSelector(epsilon=0.1, selector=selector)
agent = ptan.agent.DQNAgent(net, action_selector=epsilon_greedy, preprocessor=float32_preprocessor)

# Experience source
exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=0.99, steps_count=10)#,vectorized=True, is_done=is_done)

# Experience buffer
buffer = ptan.experience.ExperienceReplayBuffer(exp_source, buffer_size=1000)

# Populate the buffer with 100 experiences
buffer.populate(3)


Discrete(2)
max_len 4
observation state [-0.04818577  0.00036734  0.04510041  0.02190833]
pad state uptorch.Size([4])
[tensor([-0.0482,  0.0004,  0.0451,  0.0219])]
 np_states shape (1, 4)
model forward
max_len 1
observation state -0.04817841947078705
pad state dow : torch.Size([4])
[tensor([-0.0482, -0.0482, -0.0482, -0.0482])]
 np_states shape (1, 4)
model forward
max_len 1
observation state -0.044282130897045135
pad state dow : torch.Size([4])
[tensor([-0.0443, -0.0443, -0.0443, -0.0443])]
 np_states shape (1, 4)
model forward
max_len 1
observation state -0.036496978253126144
pad state dow : torch.Size([4])
[tensor([-0.0365, -0.0365, -0.0365, -0.0365])]
 np_states shape (1, 4)
model forward
max_len 1
observation state -0.02482120506465435
pad state dow : torch.Size([4])
[tensor([-0.0248, -0.0248, -0.0248, -0.0248])]
 np_states shape (1, 4)
model forward
max_len 1
observation state -0.009251383133232594
pad state dow : torch.Size([4])
[tensor([-0.0093, -0.0093, -0.0093, -0.0093])]
 n

  if not isinstance(terminated, (bool, np.bool8)):


In [59]:

#buffer.populate(1)

    # Get batch from the buffer
batch = buffer.sample(1)
for experience in batch:
        print(dir(experience))
        print(experience.state)
        print(experience.action)
        print(experience.reward)
        print(experience.last_state)

['__add__', '__class__', '__class_getitem__', '__contains__', '__delattr__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__getnewargs__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__match_args__', '__module__', '__mul__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__rmul__', '__setattr__', '__sizeof__', '__slots__', '__str__', '__subclasshook__', '_asdict', '_field_defaults', '_fields', '_make', '_replace', 'action', 'count', 'index', 'last_state', 'reward', 'state']
[-0.03356702 -0.4147404   0.01723967  0.6291015 ]
0
6.793465209301
None


In [6]:
# Loss function and optimizer
loss_func = nn.MSELoss()
optimizer = optim.Adam(net.parameters(), lr=1e-3)

best_reward = float('-inf')  # Initialize with negative infinity or other appropriate value
best_model_path = "best_model.pth"  # Define the path where you want to save the best model
checkpoint_interval = 5
current_reward = 0.0
prev_loss = float('-inf')

  from .autonotebook import tqdm as notebook_tqdm


Training Loop
The training loop involves interacting with the environment, collecting experiences, and updating the DQN model. We utilize an experience replay buffer to store and sample experiences, enabling more stable and efficient learning.


Saving the Best Model
To ensure that the best-performing model is saved during training, we track the highest achieved reward and save the model's state dictionary accordingly.

In [7]:
def unpack_batch(batch):
    states, actions, rewards, next_states = [], [], [], []

    for experience in batch:
        print(experience.state)

        

        
        # if experience.last_state == None:
        #     print("One or more lists are empty, skipping...")
        #     continue  # You might want to return something meaningful or raise an exception here

        # Check if the lengths of states, actions, rewards, and next_states are consistent
        if len(states) != len(actions) or len(states) != len(rewards) or len(states):# != len(next_states):
            print("Inconsistent lengths of states, actions, rewards, or next_states, skipping...")
            continue  # You might want to return something meaningful or raise an exception here
        states.extend(experience.state)  # Use extend instead of append
        actions.append(experience.action)
        rewards.extend([experience.reward])  # Use extend instead of append
        #next_states.extend(experience.last_state)  # Use extend instead of append

    # Convert lists to PyTorch tensors
    states_v = torch.tensor(states, dtype=torch.float32)
    actions_v = torch.tensor(actions)
    rewards_v = torch.tensor(rewards, dtype=torch.float32)
    next_states_v = torch.tensor(next_states, dtype=torch.float32)

    return states_v, actions_v, rewards_v, states_v

In [8]:
# Call the unpack_batch function
batch_result = unpack_batch(batch)

# Check if the result is not None before using the unpacked values
if batch_result is not None:
    states_v, actions_v, rewards_v, next_states_v = batch_result
    # Use states_v, actions_v, rewards_v, next_states_v in your further processing or training
else:
    print("Skipping batch due to unpack_batch returning None.")


NameError: name 'batch' is not defined

In [9]:


# Training loop
for step in range(1000):  # You may want to adjust the number of steps
    buffer.populate(1)

    # Get batch from the buffer
    batch = buffer.sample(1)
    states_v, actions_v, rewards_v, next_states_v = unpack_batch(batch)

    print(f"train loop state{states_v},act{ actions_v}, reward{rewards_v}, ns{next_states_v}")
    print(f"train loop state{states_v.shape},act{ actions_v}, reward{rewards_v}, ns{next_states_v}")
    # Zero gradients
    optimizer.zero_grad()
    if states_v is not None:
        # Forward pass
        expected_state_dimension = 4  # Adjust this based on your expected state dimension
        if len(states_v) == expected_state_dimension:
            q_values = net(states_v)
        else:
            print("skipping")

    # # Get target Q-values
    # target_q_values = target_net.target_model(next_states_v).max(1)[0]

    # # Use view(-1) to ensure dones_mask is 1-dimensional
    # dones_mask = dones_mask.view(-1)

    # # Ensure target_q_values is also 1-dimensional
    # target_q_values = target_q_values.view(-1)

    # # Detach target_q_values
    # target_q_values = target_q_values.detach()

    # # Calculate TD error
    # expected_q_values = rewards_v + target_q_values * 0.99
    # loss = loss_func(q_values, expected_q_values)

#     # Backward pass
#     loss.backward()

#     # Optimize
#     optimizer.step()

#     if step % 10 == 0:
#         target_net.sync()
    
#     if prev_loss > loss.item():
#         prev_loss = loss.item()
#         print(f"\n----------------------------Loss {loss.item()}------------\n")
#         if step % checkpoint_interval == 0:
#             # Check the performance and save the model if it's the best so far
#             if current_reward > best_reward:
#                 best_reward = current_reward
#                 torch.save(net.state_dict(), best_model_path)
#                 print(f"\n----------------------------checkpoint saved ------------\n")

# # After training, you can use the trained model for inference.
# For example, you can run the trained agent in the environment:
# env.close()


[-0.04428213  0.38925767  0.04041436 -0.53418946]
train loop statetensor([-0.0443,  0.3893,  0.0404, -0.5342]),acttensor([1]), rewardtensor([8.6483]), nstensor([-0.0443,  0.3893,  0.0404, -0.5342])
train loop statetorch.Size([4]),acttensor([1]), rewardtensor([8.6483]), nstensor([-0.0443,  0.3893,  0.0404, -0.5342])
model forward
[-0.03649698  0.58378863  0.02973057 -0.8138691 ]
train loop statetensor([-0.0365,  0.5838,  0.0297, -0.8139]),acttensor([1]), rewardtensor([7.7255]), nstensor([-0.0365,  0.5838,  0.0297, -0.8139])
train loop statetorch.Size([4]),acttensor([1]), rewardtensor([7.7255]), nstensor([-0.0365,  0.5838,  0.0297, -0.8139])
model forward
[-0.04817842  0.19481446  0.04553857 -0.25621074]
train loop statetensor([-0.0482,  0.1948,  0.0455, -0.2562]),acttensor([1]), rewardtensor([9.5618]), nstensor([-0.0482,  0.1948,  0.0455, -0.2562])
train loop statetorch.Size([4]),acttensor([1]), rewardtensor([9.5618]), nstensor([-0.0482,  0.1948,  0.0455, -0.2562])
model forward
[ 0.010

  states_v = torch.tensor(states, dtype=torch.float32)


ValueError: expected sequence of length 4 at dim 1 (got 0)

Benefits of Deployment
1. Stability and Efficiency:
The use of experience replay buffers enhances stability during training by reducing the impact of correlated experiences.
Efficient learning is achieved through the replay buffer, allowing the model to revisit and learn from past experiences.
2. Customizable Environments:
The modular design of the custom environment allows for easy adaptation to various tasks beyond image classification.
Customization provides flexibility for addressing specific problem domains.
3. PyTorch and PTAN Integration:
Leveraging PyTorch for building the DQN model provides a seamless experience for researchers and practitioners familiar with the PyTorch ecosystem.
PTAN simplifies the RL implementation, making it more accessible and reducing the boilerplate code.
4. Best Model Selection:
The implementation includes a mechanism for tracking the best-performing model during training, ensuring that the model with the highest reward is saved.
5. Inference with Trained Model:
The trained model can be easily loaded for inference, allowing users to deploy the RL agent in real-world scenarios.
Conclusion
In this blog post, we've walked through the implementation of a Deep Q-Network using PyTorch and PTAN, highlighting the benefits of this architecture. This example serves as a foundation for understanding and applying deep reinforcement learning techniques to custom environments. The flexibility, stability, and efficiency provided by this deployment make it a valuable tool for a wide range of RL applications.

To access the complete code and resources, please refer to the GitHub repository linked [here].