In [1]:
# Importing libraries
import gymnasium as gym
import numpy as np
from itertools import count
from collections import namedtuple
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

In [2]:
torch.autograd.set_detect_anomaly(True)

<torch.autograd.anomaly_mode.set_detect_anomaly at 0x7fac946e7070>

In [3]:
SavedAction = namedtuple('SavedAction', ['log_prob', 'state_value', 'state', 'action'])

In [4]:
env = gym.make('CartPole-v1')

In [5]:
class PPOActor(nn.Module):
    def __init__(self):
        super(PPOActor, self).__init__()
        self.fc1 = nn.Linear(4, 128) # 4 parameters in the observation space
        self.fc2 = nn.Linear(128, 2) # 2 possible actions

        self.saved_actions = []
        self.rewards = []
    
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x


# Predicts value functions NOT state-value functions
class PPOCritic(nn.Module):
    def __init__(self):
        super(PPOCritic, self).__init__()
        self.fc1 = nn.Linear(4, 128) 
        self.fc2 = nn.Linear(128, 1) 
    
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x


In [6]:
def select_action(state):
    state = torch.from_numpy(state).float()
    probs = F.softmax(actor(state), dim=-1)
    m = Categorical(probs)
    action = m.sample()
    actor.saved_actions.append((probs[action], state, action.detach()))
    return action.item()

In [7]:
def finish_episode():
    # Calculating losses and performing backprop
    R = 0
    saved_actions = actor.saved_actions
    returns = []
    epsilon = 3
    num_epochs = 10

    for r in actor.rewards[::-1]:
        R = r + 0.99 * R # Gamma is 0.99
        returns.insert(0, R)
    returns = torch.tensor(returns)
    returns = (returns - returns.mean()) / (returns.std() + eps)

    for epoch in range(num_epochs):
        actor_losses = []
        critic_losses = []

        for (old_prob, state, action), R in zip(saved_actions, returns):
            advantage = R - critic(state).item()

            #if epoch == 0:
            #    log_prob = old_log_prob
            #    old_log_prob = 0
            #else:
            #    probs = F.softmax(actor(state), dim=-1)
            #    m = Categorical(probs)
            #    log_prob = m.log_prob(action)


            # Calculate the probability ratio
            ratio = actor(state)[action] / old_prob.detach()

            # Calculate the clipped and unclipped surrogate losses
            surr1 = ratio * advantage
            surr2 = torch.clamp(ratio, 1.0 - epsilon, 1.0 + epsilon) * advantage

            actor_losses.append(-torch.min(surr1, surr2))
            
            critic_losses.append(F.smooth_l1_loss(critic(state), torch.tensor([R])))

        actor_optimizer.zero_grad()
        actor_loss = torch.stack(actor_losses).mean()
        actor_loss.backward()
        actor_optimizer.step()

        critic_optimizer.zero_grad()
        critic_loss = torch.stack(critic_losses).mean()
        critic_loss.backward()
        critic_optimizer.step()

    del actor.rewards[:]
    del actor.saved_actions[:]

In [8]:
actor = PPOActor()
critic = PPOCritic()
actor_optimizer = optim.Adam(actor.parameters(), lr=3e-2)
critic_optimizer = optim.Adam(critic.parameters(), lr=4e-1)
eps = np.finfo(np.float32).eps.item()

In [9]:
def train():
    running_reward = 10
    for i_episode in count():
        state, info = env.reset()
        ep_reward = 0
        for t in range(1, 10000):
            action = select_action(state)
            state, reward, terminated, truncated, info = env.step(action)
            actor.rewards.append(reward)
            ep_reward += reward
            if terminated or truncated:
                break

        running_reward = 0.05 * ep_reward + (1-0.05) * running_reward
        finish_episode()
        if i_episode % 10 == 0:
            print(f"Episode {i_episode} Reward: {ep_reward:.2f} Average reward: {running_reward:.2f}")

        if running_reward > env.spec.reward_threshold:
            print(f"Solved, running reward is now {running_reward} and the last episode runs to {t} timesteps")
            break

In [10]:
train()

Episode 0 Reward: 14.00 Average reward: 10.20
Episode 10 Reward: 32.00 Average reward: 19.69
Episode 20 Reward: 13.00 Average reward: 23.91
Episode 30 Reward: 89.00 Average reward: 35.70
Episode 40 Reward: 94.00 Average reward: 52.50
Episode 50 Reward: 35.00 Average reward: 53.82
Episode 60 Reward: 61.00 Average reward: 64.78
Episode 70 Reward: 70.00 Average reward: 56.31
Episode 80 Reward: 52.00 Average reward: 63.12
Episode 90 Reward: 44.00 Average reward: 57.62
Episode 100 Reward: 104.00 Average reward: 67.56
Episode 110 Reward: 23.00 Average reward: 52.20
Episode 120 Reward: 39.00 Average reward: 44.76
Episode 130 Reward: 18.00 Average reward: 41.90
Episode 140 Reward: 74.00 Average reward: 36.89
Episode 150 Reward: 26.00 Average reward: 48.89
Episode 160 Reward: 14.00 Average reward: 38.56
Episode 170 Reward: 24.00 Average reward: 29.25
Episode 180 Reward: 16.00 Average reward: 24.62


KeyboardInterrupt: 