In [1]:
# Source: https://youtu.be/hlv79rcHws0
# Proximal Policy Optimization (PPO) is Easy With PyTorch | Full PPO Tutorial

In [2]:
import os
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions.categorical import Categorical

import gym

In [3]:
class PPOMemory:
    def __init__(self, batch_size):
        self.states = []
        self.actions = []
        self.probs = []
        self.vals = []
        self.rewards = []
        self.dones = []
        
        self.batch_size = batch_size
    
    def generate_batchs(self):
        n_states = len(self.states)
        
        batch_start = np.arrange(0, n_states, self.batch_size)
        indices = np.arange(n_states, dtype=np.int64)
        indices = np.random.shuffle(indices)
        
        batches = [indices[i:i+self.batch_size] for i in batch_start]
        
        return np.array(self.states),\
               np.array(self.actions),\
               np.array(self.probs),\
               np.array(self.rewards),\
               np.array(self.dones),\
               batches
        # return batches
    
    def store_memory(self, state, action, probs, vals, reward, done):
        self.states.append(state)
        self.actions.append(action)
        self.probs.append(probs)
        self.vals.append(vals)
        self.rewards.append(reward)
        self.dones.append(done)
    
    def clear_memory(self):
        self.states = []
        self.actions = []
        self.probs = []
        self.vals = []
        self.rewards = []
        self.dones = []

In [26]:
class ActorNetwork(nn.Module):
    def __init__(self, n_actions, n_observations, fc1_dim=256, fc2_dim=256):
        super().__init__()
        self.actor = nn.Sequential(
            nn.Linear(n_observations, fc1_dim),
            nn.ReLU(),
            nn.Linear(fc1_dim, fc2_dim),
            nn.ReLU(),
            nn.Linear(fc2_dim, n_actions),
            nn.Softmax(dim=-1)
        )
    
    def forward(self, state):
        dist = self.actor(state)
        dist = Categorical(dist)
        
        return dist

In [18]:
class CriticNetwork(nn.Module):
    def __init__(self, n_observations, fc1_dim=256, fc2_dim=256):
        super().__init__()
        self.critic = nn.Sequential(
            nn.Linear(n_observations, fc1_dim),
            nn.ReLU(),
            nn.Linear(fc1_dim, fc2_dim),
            nn.ReLU(),
            nn.Linear(fc2_dim, 1),
        )
    
    def forward(self, state):
        value = self.critic(state)
        return value

In [27]:
ACTOR_LR = 1e-3
CRITIC_LR = 1e-3

In [58]:
class Agent:
    def __init__(
        self, n_actions, n_observations,
        gamma=0.99, alpha=0.0003, gae_lambda=0.95,
        policy_clip=0.2, batch_size=64, max_steps=2048,
        n_epochs=10
    ):
        self.gamma = gamma
        self.policy_clip = policy_clip
        self.n_epochs = n_epochs
        self.gae_lambda = gae_lambda
        
        # TODO: add alpha, gamma
        self.actor = ActorNetwork(n_actions, n_observations)
        self.critic = CriticNetwork(n_observations)
        self.memory = PPOMemory(batch_size)
    
    def remember(self, state, action, log_prob, value, reward, done):
        """Store transitions in one step

        Args:
            state (_type_): _description_
            action (_type_): _description_
            log_prob (_type_): _description_
            value (_type_): _description_
            reward (_type_): _description_
            done (function): _description_
        """
        self.memory.store_memory(state, action, log_prob, value, reward, done)
    
    def choose_action(self, observation):
        state = torch.tensor([observation], dtype=torch.float)
        
        # action distribution from a given state
        dist = self.actor(state)
        # the value of the state
        value = self.critic(state)
        
        action = dist.sample()
        
        log_prob = torch.squeeze(dist.log_prob(action)).item()
        action = torch.squeeze(action).item()
        value = torch.squeeze(value).item()
        
        return action, log_prob, value

    # def compute_advantage(self, rewards):
    #     n_rewards = len(rewards)
    #     advantages = np.zeros(n_rewards, dtype=torch.float32)
        
    #     for t in range(n_rewards - 1):
    #         discount = 1
    #         a_t = 0
            
    #         for k in range(t, n_rewards - 1):
    #             pass        

    def learn(self):
        for _ in range(self.n_epochs):
            states, actions, probs, values, rewards, dones, batches = self.memory.generate_batchs()
            
            advantage = np.zeros(len(rewards), dtype=np.float32)

            for t in range(len(rewards)-1):
                discount = 1
                a_t = 0
                for k in range(t, len(rewards)-1):
                    a_t += discount*(rewards[k] + self.gamma*values[k+1]*\
                            (1-int(dones[k])) - values[k])
                    discount *= self.gamma*self.gae_lambda
                advantage[t] = a_t
            advantage = torch.tensor(advantage)
            
            for batch in batches:
                states = torch.tensor(states[batch], dtype=torch.float)
                probs = torch.tensor(probs[batch], dtype=torch.float)
                actions = torch.tensor(actions[batch], dtype=torch.float)
                
                old_probs = probs
                
                dist = self.actor(states)
                critic_value = self.critic(states)
                critic_value = torch.squeeze(critic_value)
                
                new_probs = dist.log_prob(actions)
                prob_ratio = new_probs.exp() / old_probs.exp()
                
                weighted_probs = prob_ratio * advantage[batch]
                
                policy_clipped = torch.clamp(prob_ratio, min=1-self.policy_clip, max=1+self.policy_clip)
                weighted_clipped_ratio = policy_clipped * advantage[batch]
                actor_loss = -torch.min(weighted_probs, weighted_clipped_ratio)
                
                # TODO: this loss difference from the loss in paper
                
                returns = advantage[batch] + values[batch]
                
                # TODO: where is this loss
                critic_loss = (returns - critic_value)**2
                critic_loss = critic_loss.mean()
                
                # TODO: where is this
                total_loss = actor_loss + 0.5*critic_loss
                
                # actor_optimizer.zero_grad()
                # critic_optimizer.zero_grad()
                # total_loss.backward()
                # actor_optimizer.step()
                # critic_optimizer.step()
        
        self.memory.clear_memory()

In [59]:
env = gym.make("CartPole-v1")

In [60]:
env.reward_range

(-inf, inf)

In [61]:
N = 20
batch_size = 5
n_epochs = 5
alpha = 0.0003
n_games = 300
best_score = env.reward_range[0]
score_history = []

learn_iters = 0

In [62]:
n_actions = env.action_space.n
n_observations = env.observation_space.shape[0]

In [63]:
nn.Linear(n_observations, 128)

Linear(in_features=4, out_features=128, bias=True)

In [64]:
agent = Agent(
    n_actions=n_actions,
    n_observations=n_observations,
    batch_size=batch_size, n_epochs=n_epochs
)

In [65]:
actor_optimizer = optim.Adam(agent.actor.parameters(), lr=ACTOR_LR)
critic_optimizer = optim.Adam(agent.critic.parameters(), lr=CRITIC_LR)

In [67]:
for i in range(n_games):
    state, _ = env.reset()
    done = False
    episode_score = 0
    n_steps = 0
    
    while not done:
        action, log_prob, value = agent.choose_action(state)
        new_state, reward, done, truncated, info = env.step(action)
        
        episode_score += reward
        n_steps += 1
        
        agent.remember(state, action, log_prob, value, reward, done)
        
        if n_steps % N == 0:
            agent.learn()
            learn_iters += 1
        
        state = new_state
    
    score_history.append(episode_score)
    avg_score = np.mean(score_history[-100:])
    
    if avg_score > best_score:
        best_score = avg_score

AttributeError: module 'numpy' has no attribute 'arrange'

In [None]:
: