In [1]:
#Install a pip package in the current Jupyter kernel
import sys
!{sys.executable} -m pip install torch
!{sys.executable} -m pip install gym --upgrade
!{sys.executable} -m pip install gym[classic_control] --upgrade
!{sys.executable} -m pip install moviepy==1.0.3 --upgrade
!{sys.executable} -m pip install ffmpeg --upgrade




In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical
import numpy as np
import gym

class ActorCritic(nn.Module):
    def __init__(self, input_dim, output_dim, hidden_dim=16):
        super(ActorCritic, self).__init__()
        self.actor = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim),
            nn.Softmax(dim=-1)
        )
        self.critic = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1)
        )

    def forward(self, state):
        action_probs = self.actor(state)
        state_value = self.critic(state)
        return action_probs, state_value

def compute_returns(rewards, gamma):
    R = 0
    returns = []
    for step in reversed(range(len(rewards))):
        R = rewards[step] + gamma * R
        returns.insert(0, R)
    return returns



  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def play(steps_per_batch, model, env, gamma, max_steps_before_done = 1024):
    states, actions, log_probs, returns, rewards = [], [], [], [], []
    
    t = 0
    n_eps = 0
    while t < steps_per_batch:
        state, _ = env.reset()
        done = False
        eps_rewards = []
        
        cur_t = 0
        while done is False and cur_t < max_steps_before_done:
            state = torch.FloatTensor(state).unsqueeze(0)
            action_probs, _ = model(state)
            distribution = Categorical(action_probs)

            action = distribution.sample()
            log_prob = distribution.log_prob(action)

            next_state, reward, done, _, _ = env.step(action.item())

            eps_rewards.append(reward)
            log_probs.append(log_prob)
            # if done: print(reward)

            states.append(state)
            actions.append(action)
            
            state = next_state
            cur_t += 1

        returns.extend(compute_returns(eps_rewards, gamma))
        rewards.append(eps_rewards)
        t += len(returns)
        n_eps += 1

    return states, actions, log_probs, returns, rewards, n_eps

In [10]:
from statistics import mean 
import copy

def train_ppo(
        env_name="CartPole-v1", gamma=0.99, lr=5e-4,
        clip_epsilon=0.2, epochs=1001, num_updates=100,
        steps_per_batch=512, updates_per_epoch=5):
    env = gym.make(env_name)
    input_dim = env.observation_space.shape[0]
    output_dim = env.action_space.n

    old_models = []
    model = ActorCritic(input_dim, output_dim, hidden_dim = 32)
    policy_optimizer = optim.Adam(model.actor.parameters(), lr=lr)
    value_optimizer = optim.Adam(model.critic.parameters(), lr=lr)
        
    for epoch in range(epochs):
        states, actions, log_probs, returns, rewards, n_eps = play(steps_per_batch, model, env, gamma)
    
        states = torch.stack(states) 
        actions = torch.stack(actions)
        returns = torch.FloatTensor(returns).unsqueeze(1)
        log_probs = torch.FloatTensor(log_probs).unsqueeze(1)

        for _ in range(updates_per_epoch):
            values = model.critic(states).squeeze().unsqueeze(1)
            
            cur_action_probs = model.actor(states)
            dist = Categorical(cur_action_probs)
            cur_log_probs = dist.log_prob(actions).squeeze().unsqueeze(1)

            advantage = (returns - values.detach()).squeeze().unsqueeze(1)
            advantage = (advantage - advantage.mean()) / (advantage.std() + 1e-5)
            

            ratio = (cur_log_probs - log_probs.detach()).exp()
            surr1 = ratio * advantage
            surr2 = torch.clamp(ratio, 1 - clip_epsilon, 1 + clip_epsilon) * advantage
            policy_loss = -torch.min(surr1, surr2).mean()

            policy_optimizer.zero_grad()
            policy_loss.backward(retain_graph=True)
            policy_optimizer.step()
  
            value_loss = nn.MSELoss(reduction="mean")(returns, values)

            value_optimizer.zero_grad()
            value_loss.backward()
            value_optimizer.step()

        if epoch % num_updates == 0:
            print("Epoch: {}, Eps count: {} Reawrds: {:.4f} Policy Loss: {:.4f}, Value Loss: {:.4f}".format(epoch, n_eps, mean(sum(r) for r in rewards), policy_loss.item(), value_loss.item()))
            old_model = copy.deepcopy(model)
            old_models.append(old_model)
    env.close()
    return old_models

models = train_ppo()

Epoch: 0, Eps count: 7 Reawrds: 18.8571 Policy Loss: -0.0018, Value Loss: 135.9920
Epoch: 100, Eps count: 3 Reawrds: 135.0000 Policy Loss: -0.0016, Value Loss: 2086.0027
Epoch: 200, Eps count: 2 Reawrds: 238.5000 Policy Loss: -0.0009, Value Loss: 2603.6438
Epoch: 300, Eps count: 1 Reawrds: 1002.0000 Policy Loss: -0.0004, Value Loss: 2608.9373
Epoch: 400, Eps count: 2 Reawrds: 699.5000 Policy Loss: -0.0012, Value Loss: 983.3181
Epoch: 500, Eps count: 1 Reawrds: 1024.0000 Policy Loss: -0.0008, Value Loss: 516.5413
Epoch: 600, Eps count: 1 Reawrds: 1024.0000 Policy Loss: -0.0008, Value Loss: 405.1033
Epoch: 700, Eps count: 1 Reawrds: 1024.0000 Policy Loss: -0.0002, Value Loss: 387.0038
Epoch: 800, Eps count: 1 Reawrds: 761.0000 Policy Loss: -0.0005, Value Loss: 509.6077
Epoch: 900, Eps count: 1 Reawrds: 1024.0000 Policy Loss: -0.0000, Value Loss: 392.6066
Epoch: 1000, Eps count: 1 Reawrds: 1024.0000 Policy Loss: -0.0001, Value Loss: 384.0493


In [12]:
from gym.wrappers import RecordVideo
env = RecordVideo(gym.make('CartPole-v1', render_mode="rgb_array"), "./video", episode_trigger = lambda x: True)

for model in models:
    state, _ = env.reset()
    state = torch.FloatTensor(state)
    total_reward = 0
    cur_t = 0
    done = False
    while done is False:
        state = torch.FloatTensor(state).unsqueeze(0)        
        action_probs, _ = model(state)
        # Deterministic action selection use the line below
        #action = torch.argmax(action_probs.squeeze())
        
        distribution = Categorical(action_probs)
        action = distribution.sample()
        
        next_state, reward, done, _, _ = env.step(action.item())

        cur_t += 1
        state = next_state
env.close()

Moviepy - Building video /Users/hiya/Code/GPT_Learning/video/rl-video-episode-0.mp4.
Moviepy - Writing video /Users/hiya/Code/GPT_Learning/video/rl-video-episode-0.mp4



                                                                                

Moviepy - Done !
Moviepy - video ready /Users/hiya/Code/GPT_Learning/video/rl-video-episode-0.mp4




Moviepy - Building video /Users/hiya/Code/GPT_Learning/video/rl-video-episode-1.mp4.
Moviepy - Writing video /Users/hiya/Code/GPT_Learning/video/rl-video-episode-1.mp4



                                                                                

Moviepy - Done !
Moviepy - video ready /Users/hiya/Code/GPT_Learning/video/rl-video-episode-1.mp4
Moviepy - Building video /Users/hiya/Code/GPT_Learning/video/rl-video-episode-2.mp4.
Moviepy - Writing video /Users/hiya/Code/GPT_Learning/video/rl-video-episode-2.mp4



                                                                                

Moviepy - Done !
Moviepy - video ready /Users/hiya/Code/GPT_Learning/video/rl-video-episode-2.mp4
Moviepy - Building video /Users/hiya/Code/GPT_Learning/video/rl-video-episode-3.mp4.
Moviepy - Writing video /Users/hiya/Code/GPT_Learning/video/rl-video-episode-3.mp4



                                                                                

Moviepy - Done !
Moviepy - video ready /Users/hiya/Code/GPT_Learning/video/rl-video-episode-3.mp4
Moviepy - Building video /Users/hiya/Code/GPT_Learning/video/rl-video-episode-4.mp4.
Moviepy - Writing video /Users/hiya/Code/GPT_Learning/video/rl-video-episode-4.mp4



                                                                                

Moviepy - Done !
Moviepy - video ready /Users/hiya/Code/GPT_Learning/video/rl-video-episode-4.mp4
Moviepy - Building video /Users/hiya/Code/GPT_Learning/video/rl-video-episode-5.mp4.
Moviepy - Writing video /Users/hiya/Code/GPT_Learning/video/rl-video-episode-5.mp4



                                                                                

Moviepy - Done !
Moviepy - video ready /Users/hiya/Code/GPT_Learning/video/rl-video-episode-5.mp4
Moviepy - Building video /Users/hiya/Code/GPT_Learning/video/rl-video-episode-6.mp4.
Moviepy - Writing video /Users/hiya/Code/GPT_Learning/video/rl-video-episode-6.mp4



                                                                                

Moviepy - Done !
Moviepy - video ready /Users/hiya/Code/GPT_Learning/video/rl-video-episode-6.mp4
Moviepy - Building video /Users/hiya/Code/GPT_Learning/video/rl-video-episode-7.mp4.
Moviepy - Writing video /Users/hiya/Code/GPT_Learning/video/rl-video-episode-7.mp4



                                                                                

Moviepy - Done !
Moviepy - video ready /Users/hiya/Code/GPT_Learning/video/rl-video-episode-7.mp4
Moviepy - Building video /Users/hiya/Code/GPT_Learning/video/rl-video-episode-8.mp4.
Moviepy - Writing video /Users/hiya/Code/GPT_Learning/video/rl-video-episode-8.mp4



                                                                                

Moviepy - Done !
Moviepy - video ready /Users/hiya/Code/GPT_Learning/video/rl-video-episode-8.mp4
Moviepy - Building video /Users/hiya/Code/GPT_Learning/video/rl-video-episode-9.mp4.
Moviepy - Writing video /Users/hiya/Code/GPT_Learning/video/rl-video-episode-9.mp4



                                                                                

Moviepy - Done !
Moviepy - video ready /Users/hiya/Code/GPT_Learning/video/rl-video-episode-9.mp4
Moviepy - Building video /Users/hiya/Code/GPT_Learning/video/rl-video-episode-10.mp4.
Moviepy - Writing video /Users/hiya/Code/GPT_Learning/video/rl-video-episode-10.mp4



                                                                                

Moviepy - Done !
Moviepy - video ready /Users/hiya/Code/GPT_Learning/video/rl-video-episode-10.mp4
