In [1]:
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
%matplotlib inline
from itertools import count
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

import gym
import imageio
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [2]:
# Hyperparameters and run configuration
OUTPUT_GRAPH = False
MAX_EPISODE = 3000
DISPLAY_REWARD_THRESHOLD = 200  # renders environment if total episode reward is greater than this threshold
MAX_EP_STEPS = 1000   # maximum time step in one episode
RENDER = False  # rendering wastes time
GAMMA = 0.9     # reward discount in TD error
LR_A = 0.001    # learning rate for actor
LR_C = 0.01     # learning rate for critic
SEED = 1        # single seed shared by the environment and torch

env = gym.make('CartPole-v0')
env.seed(SEED)  # reproducible
# Seeding the environment alone is not enough: network initialisation and
# action sampling draw from torch's RNG, so seed that as well.
torch.manual_seed(SEED)
# Work with the raw environment (strips the wrappers gym.make adds).
env = env.unwrapped

N_F = env.observation_space.shape[0]  # number of state features
N_A = env.action_space.n              # number of discrete actions

In [3]:
class Actor(nn.Module):
    """Policy network producing a categorical distribution over actions.

    Layout: ``Linear(n_features, n_hidden)`` followed by ReLU, then
    ``Linear(n_hidden, n_actions)`` followed by a softmax over the last
    dimension.
    """

    def __init__(self, n_features, n_actions, n_hidden=20) -> None:
        super().__init__()
        self.l1 = nn.Linear(n_features, n_hidden)
        self.l2 = nn.Linear(n_hidden, n_actions)

    def forward(self, x):
        """Return a ``Categorical`` built from the softmax of the output layer."""
        hidden = self.l1(x).relu()
        action_probs = self.l2(hidden).softmax(dim=-1)
        return Categorical(action_probs)

In [4]:
# Smoke test: instantiate the actor and run one forward pass on a dummy state.
actor = Actor(N_F, N_A)
c = actor(torch.tensor([1, 2, 3, 4], dtype=torch.float32))

In [5]:
class Critic(nn.Module):
    """State-value network producing one scalar estimate per input state.

    Layout: ``Linear(n_features, n_hiddens)`` followed by ReLU, then
    ``Linear(n_hiddens, 1)``. ``n_actions`` is accepted so the constructor
    mirrors ``Actor``'s call shape, but the network does not use it.
    """

    def __init__(self, n_features, n_actions, n_hiddens=20) -> None:
        super().__init__()
        self.l1 = nn.Linear(n_features, n_hiddens)
        self.l2 = nn.Linear(n_hiddens, 1)

    def forward(self, x):
        """Return the value estimate; the last dimension of the output has size 1."""
        hidden = self.l1(x).relu()
        return self.l2(hidden)
        

In [6]:
# Smoke test: instantiate the critic and display its value for a dummy state.
cr = Critic(N_F, N_A)
cr(torch.tensor([1, 2, 3, 4], dtype=torch.float32))

tensor([1.1868], grad_fn=<AddBackward0>)

In [7]:
def compute_returns(next_value, rewards, masks, gamma=0.99):
    """Compute discounted returns for a trajectory, bootstrapped from ``next_value``.

    Walking backwards through the trajectory, each return is
    ``rewards[t] + gamma * R[t+1] * masks[t]``, where ``R`` past the final
    step is ``next_value``. A mask of 0 cuts off everything after that step.

    Args:
        next_value: value used as the return following the last step.
        rewards: per-step rewards, indexable by step.
        masks: per-step multipliers, indexed in parallel with ``rewards``.
        gamma: discount factor.

    Returns:
        A list with one return per step, in the same order as ``rewards``.
    """
    n_steps = len(rewards)
    discounted = [None] * n_steps
    running = next_value
    # Fill from the last step to the first so each entry can reuse its successor.
    for t in range(n_steps - 1, -1, -1):
        running = rewards[t] + gamma * running * masks[t]
        discounted[t] = running
    return discounted

In [12]:
def trainIters(actor, critic, n_iters, gamma=0.99):
    """Train an actor-critic pair, applying one gradient update per episode.

    Each iteration rolls out one full episode with actions sampled from the
    actor, computes discounted returns, and then updates the actor with the
    policy-gradient loss (advantage treated as a constant) and the critic
    with the mean squared advantage.

    Relies on the notebook-level globals ``env``, ``device``, ``LR_A`` and
    ``LR_C``. An episode only ends when the environment reports ``done``;
    this function imposes no step limit of its own. The environment is
    closed once all iterations are finished.

    Args:
        actor: module mapping a state tensor to an action distribution.
        critic: module mapping a state tensor to a value estimate.
        n_iters: number of episodes to run.
        gamma: discount factor forwarded to ``compute_returns``. Defaults to
            0.99 (the value previously used implicitly); pass the notebook's
            ``GAMMA`` to use the configured one.
    """
    optimizerA = optim.Adam(actor.parameters(), lr=LR_A)
    optimizerC = optim.Adam(critic.parameters(), lr=LR_C)
    for episode in range(n_iters):
        # Reset exactly once: a second reset would leave `state` describing
        # an episode the environment is no longer in.
        state = env.reset()
        log_probs = []
        values = []
        rewards = []
        masks = []

        for i in count():
            # env.render()
            state = torch.as_tensor(state, dtype=torch.float32, device=device)
            dist, value = actor(state), critic(state)
            action = dist.sample()
            next_state, reward, done, _ = env.step(action.item())

            log_probs.append(dist.log_prob(action).unsqueeze(0))
            values.append(value)
            rewards.append(torch.tensor([reward], dtype=torch.float, device=device))
            # The mask is 0 on the terminal step so nothing is bootstrapped past it.
            masks.append(torch.tensor([0.0 if done else 1.0], dtype=torch.float, device=device))

            state = next_state

            if done:
                print('Iteration: {}, Score: {}'.format(episode, i))
                break

        # The bootstrap value is only a target, so no autograd graph is needed.
        # Because the rollout stops only on `done`, the final mask is 0 and
        # this value is multiplied away in the last return.
        with torch.no_grad():
            next_state = torch.as_tensor(next_state, dtype=torch.float32, device=device)
            next_value = critic(next_state)

        returns = compute_returns(next_value, rewards, masks, gamma=gamma)
        log_probs = torch.cat(log_probs)
        returns = torch.cat(returns).detach()
        values = torch.cat(values)

        advantage = returns - values

        # Detach the advantage in the actor loss so policy gradients do not
        # flow into the critic.
        actor_loss = -(log_probs * advantage.detach()).mean()
        critic_loss = advantage.pow(2).mean()

        optimizerA.zero_grad()
        optimizerC.zero_grad()
        actor_loss.backward()
        critic_loss.backward()
        optimizerA.step()
        optimizerC.step()

    # torch.save(actor, 'model/actor.pkl')
    # torch.save(critic, 'model/critic.pkl')
    env.close()
                

In [14]:
# Build fresh actor and critic networks on the selected device (this rebinds
# `actor` from the earlier smoke-test cell) and train them for 1000 episodes.
actor = Actor(N_F, N_A).to(device)
critic = Critic(N_F, N_A).to(device)
trainIters(actor, critic, 1000)

Iteration: 0, Score: 10
Iteration: 1, Score: 15
Iteration: 2, Score: 14
Iteration: 3, Score: 10
Iteration: 4, Score: 18
Iteration: 5, Score: 13
Iteration: 6, Score: 35
Iteration: 7, Score: 10
Iteration: 8, Score: 31
Iteration: 9, Score: 14
Iteration: 10, Score: 22
Iteration: 11, Score: 25
Iteration: 12, Score: 27
Iteration: 13, Score: 8
Iteration: 14, Score: 22
Iteration: 15, Score: 11
Iteration: 16, Score: 10
Iteration: 17, Score: 19
Iteration: 18, Score: 54
Iteration: 19, Score: 15
Iteration: 20, Score: 9
Iteration: 21, Score: 33
Iteration: 22, Score: 34
Iteration: 23, Score: 16
Iteration: 24, Score: 25
Iteration: 25, Score: 22
Iteration: 26, Score: 17
Iteration: 27, Score: 8
Iteration: 28, Score: 25
Iteration: 29, Score: 15
Iteration: 30, Score: 13
Iteration: 31, Score: 34
Iteration: 32, Score: 14
Iteration: 33, Score: 20
Iteration: 34, Score: 15
Iteration: 35, Score: 34
Iteration: 36, Score: 24
Iteration: 37, Score: 23
Iteration: 38, Score: 15
Iteration: 39, Score: 17
Iteration: 40