In [1]:
# Importing libraries
import gymnasium as gym
import numpy as np
from itertools import count
from collections import namedtuple
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
env = gym.make('CartPole-v1')

In [4]:
class PPOActor(nn.Module):
    def __init__(self):
        super(PPOActor, self).__init__()
        self.fc1 = nn.Linear(4, 128) # 4 parameters in the observation space
        self.fc2 = nn.Linear(128, 2) # 2 possible actions

        self.saved_actions = []
        self.rewards = []
    
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x


# Predicts value functions NOT state-value functions
class PPOCritic(nn.Module):
    def __init__(self):
        super(PPOCritic, self).__init__()
        self.fc1 = nn.Linear(4, 128) 
        self.fc2 = nn.Linear(128, 1) 
    
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x


In [5]:
def select_action(state):
    state = torch.from_numpy(state).float().to(device)
    probs = F.softmax(actor(state), dim=-1)
    m = Categorical(probs)
    action = m.sample()
    state_value = critic(state)
    actor.saved_actions.append((probs[action].detach(), state_value, state, action.detach()))
    return action.item()

In [6]:
def compute_gae(returns, state_values):
    advantages = torch.zeros_like(returns)
    gae = 0

    for t in reversed(range(len(returns))):
        delta = returns[t] - state_values[t].squeeze()
        gae = delta + 0.99 * gae_lambda * gae
        advantages[t] = gae

In [7]:
def finish_episode():
    # Calculating losses and performing backprop
    R = 0
    saved_actions = actor.saved_actions
    returns = []
    epsilon = 0.2
    num_epochs = 10

    for r in actor.rewards[::-1]:
        R = r + 0.99 * R # Gamma is 0.99
        returns.insert(0, R)
    returns = torch.tensor(returns, device=device)
    returns = (returns - returns.mean()) / (returns.std() + eps)

    old_probs, state_values, states, actions = zip(*saved_actions)

    old_probs = torch.stack(old_probs).to(device)
    state_values = torch.stack(state_values).to(device)
    states = torch.stack(states).to(device)
    actions = torch.stack(actions).to(device)

    advantages = returns - state_values.squeeze()

    for epoch in range(num_epochs):

        new_probs = F.softmax(actor(states)).gather(1, actions.unsqueeze(-1)).squeeze()

        ratios = new_probs / old_probs
        #ratios = torch.exp(new_probs - old_probs)

        surr1 = ratios * advantages
        surr2 = torch.clamp(ratios, 1 - epsilon, 1 + epsilon) * advantages

        actor_loss = -torch.min(surr1, surr2).mean()
        #actor_loss = -surr1.mean()

        actor_optimizer.zero_grad()
        actor_loss.backward(retain_graph=True)
        actor_optimizer.step()

        if epoch == num_epochs - 1:
            critic_loss = F.smooth_l1_loss(state_values.squeeze(), returns)
            
            critic_optimizer.zero_grad()
            critic_loss.backward(retain_graph=False)
            critic_optimizer.step()

    del actor.rewards[:]
    del actor.saved_actions[:]

In [8]:
actor = PPOActor().to(device=device)
critic = PPOCritic().to(device)
actor_optimizer = optim.Adam(actor.parameters(), lr=2e-3)
critic_optimizer = optim.Adam(critic.parameters(), lr=3e-2)
eps = np.finfo(np.float32).eps.item()

In [9]:
def train():
    running_reward = 10
    for i_episode in count():
        state, info = env.reset()
        ep_reward = 0
        for t in range(1, 10000):
            action = select_action(state)
            state, reward, terminated, truncated, info = env.step(action)
            actor.rewards.append(reward)
            ep_reward += reward
            if terminated or truncated:
                break

        running_reward = 0.05 * ep_reward + (1-0.05) * running_reward
        finish_episode()
        if i_episode % 10 == 0:
            print(f"Episode {i_episode} Reward: {ep_reward:.2f} Average reward: {running_reward:.2f}")

        if running_reward > env.spec.reward_threshold:
            print(f"Solved, running reward is now {running_reward} and the last episode runs to {t} timesteps")
            break

In [10]:
train()

  new_probs = F.softmax(actor(states)).gather(1, actions.unsqueeze(-1)).squeeze()


Episode 0 Reward: 15.00 Average reward: 10.25
Episode 10 Reward: 22.00 Average reward: 14.34
Episode 20 Reward: 29.00 Average reward: 22.44
Episode 30 Reward: 79.00 Average reward: 41.62
Episode 40 Reward: 258.00 Average reward: 92.47
Episode 50 Reward: 149.00 Average reward: 131.69
Episode 60 Reward: 352.00 Average reward: 172.57
Episode 70 Reward: 303.00 Average reward: 247.13
Episode 80 Reward: 317.00 Average reward: 257.16
Episode 90 Reward: 500.00 Average reward: 314.66
Episode 100 Reward: 314.00 Average reward: 310.15
Episode 110 Reward: 449.00 Average reward: 350.29
Episode 120 Reward: 357.00 Average reward: 336.76
Episode 130 Reward: 286.00 Average reward: 345.20
Episode 140 Reward: 323.00 Average reward: 368.60
Episode 150 Reward: 500.00 Average reward: 414.24
Episode 160 Reward: 186.00 Average reward: 350.70


KeyboardInterrupt: 