In [1]:
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
%matplotlib inline

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

import gym
import imageio
# Use the first CUDA GPU when PyTorch can see one; otherwise stay on the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

In [2]:
# Two separate instances of the same environment id: `env` and `eval_env`.
env_id = "CartPole-v1"
env, eval_env = (gym.make(env_id) for _ in range(2))

# Sizes read from the environment's spaces: length of the first observation
# axis and the number of discrete actions.
s_size = env.observation_space.shape[0]
a_size = env.action_space.n


In [3]:
# Draw one random sample from each space to see what the values look like.
for label, space in (
    ("Sample Observation", env.observation_space),
    ("Sample Action", env.action_space),
):
    print(label, space.sample())

Sample Observation [ 4.9990320e-01 -9.0856687e+37 -3.7153059e-01  2.6389975e+38]
Sample Action 0


In [4]:
class Policy(nn.Module):
    """Two-layer softmax policy over a discrete action set.

    Args:
        s_size: Number of input features (size of one state vector).
        a_size: Number of discrete actions (size of the output distribution).
        h_size: Width of the single hidden layer.
    """

    def __init__(self, s_size, a_size, h_size):
        super().__init__()
        self.fc1 = nn.Linear(s_size, h_size)
        self.fc2 = nn.Linear(h_size, a_size)

    def forward(self, x):
        """Return action probabilities for ``x`` (softmax over the last axis)."""
        x = F.relu(self.fc1(x))
        x = F.softmax(self.fc2(x), dim=-1)
        return x

    def act(self, state):
        """Sample one action for a single state.

        Args:
            state: One state vector (NumPy array or any array-like accepted by
                ``torch.as_tensor``) with ``s_size`` entries.

        Returns:
            A tuple ``(action, log_prob)``: the sampled action as a Python int
            and the log-probability of that action as a CPU tensor of shape
            ``(1,)`` that is still attached to the autograd graph.
        """
        # Send the input to wherever this module's weights live instead of to a
        # notebook-level global; otherwise a policy that was never moved with
        # `.to(...)` fails with a device mismatch on a CUDA machine.
        module_device = next(self.parameters()).device
        state = torch.as_tensor(state, dtype=torch.float32, device=module_device).unsqueeze(0)
        # Call the module itself (not .forward) so registered hooks still run.
        probs = self(state).cpu()
        m = Categorical(probs)
        action = m.sample()
        return action.item(), m.log_prob(action)
        

In [5]:
# Smoke-test an untrained policy on a hand-written observation.
# The network and the tensor fed to it are both placed on `device`, so the
# direct forward pass and act() each see their tensors on a single device
# (previously the network stayed on the CPU, which breaks on a CUDA host).
p = Policy(s_size, a_size, 16).to(device)
p(torch.tensor([0.3, 0.4, 0.5, 0.3], dtype=torch.float32, device=device))
p.act(np.array([0.3, 0.4, 0.5, 0.3]))

(0, tensor([-0.4959], grad_fn=<SqueezeBackward1>))

In [18]:
def discount_rewards(rewards, gamma=0.99, eps=1e-8):
    """Turn per-step rewards into standardised, discounted reward-to-go values.

    Entry ``t`` of the intermediate return vector is
    ``sum(gamma**k * rewards[k] for k >= t)``, i.e. every reward is discounted
    from the start of the episode. The vector is then shifted to zero mean and,
    when it has any spread, scaled to unit standard deviation.

    Args:
        rewards: Sequence of per-step scalar rewards for one episode.
        gamma: Discount factor.
        eps: Standard deviations at or below this value are treated as zero,
            in which case the values are centred but not rescaled.

    Returns:
        A 1-D ``torch.float`` tensor with one entry per reward. An empty input
        yields an empty tensor.
    """
    rewards = np.asarray(rewards, dtype=np.float64)
    if rewards.size == 0:
        # Nothing to discount; mean()/std() of an empty array would be NaN.
        return torch.zeros(0, dtype=torch.float)

    weighted = rewards * np.power(float(gamma), np.arange(rewards.size))
    # Reverse, accumulate, reverse again: a suffix sum over the episode.
    returns = weighted[::-1].cumsum()[::-1]

    centred = returns - returns.mean()
    spread = returns.std()
    # A one-step episode (or constant returns) has zero spread; dividing would
    # give 0/0 = NaN and poison every gradient computed from the result.
    if spread > eps:
        centred = centred / spread
    return torch.tensor(centred, dtype=torch.float)


def reinforce(policy, optimizer, n_training_epsides, max_t, gamma, print_every):
    """Train ``policy`` with REINFORCE on the notebook-level CartPole ``env``.

    Each episode is rolled out with actions sampled from ``policy.act``. The
    learning signal is a shaped reward computed from the cart position and pole
    angle (not the environment's own reward); the environment reward is only
    used for the reported score. After the episode, one gradient step is taken
    on ``-(log_probs @ returns)`` where ``returns`` comes from
    ``discount_rewards``.

    Args:
        policy: Module exposing ``act(state) -> (action, log_prob)``.
        optimizer: Optimizer over ``policy``'s parameters.
        n_training_epsides: Number of episodes to run.
        max_t: Maximum number of steps per episode.
        gamma: Discount factor passed to ``discount_rewards``.
        print_every: Print the running average score every this many episodes.

    Returns:
        List with the summed environment reward of every episode, in order.
    """
    # The thresholds do not change between steps, so look them up once.
    cartpole = env.unwrapped
    x_limit = cartpole.x_threshold
    theta_limit = cartpole.theta_threshold_radians

    scores_deque = deque(maxlen=100)  # window for the printed running average
    scores = []
    for i_episode in range(1, n_training_epsides + 1):
        saved_log_probs = []
        rewards = []
        net_rewards = []

        # gym >= 0.26 returns (observation, info); older releases return the
        # observation on its own.
        reset_result = env.reset()
        state = reset_result[0] if isinstance(reset_result, tuple) else reset_result

        for _step in range(max_t):
            action, log_prob = policy.act(state)
            saved_log_probs.append(log_prob)

            # gym >= 0.26 splits `done` into (terminated, truncated).
            step_result = env.step(action)
            if len(step_result) == 5:
                state, reward, terminated, truncated, _ = step_result
                done = terminated or truncated
            else:
                state, reward, done, _ = step_result

            # Shaped reward: larger when the cart is near the centre and the
            # pole is near upright, relative to the failure thresholds.
            x, _, theta, _ = state
            r1 = (x_limit - abs(x)) / x_limit - 0.8
            r2 = (theta_limit - abs(theta)) / theta_limit - 0.5
            rewards.append(reward)
            net_rewards.append(r1 + r2)
            if done:
                break

        rewards_sum = sum(rewards)
        scores_deque.append(rewards_sum)
        scores.append(rewards_sum)

        # An episode with no steps (max_t == 0) has nothing to learn from.
        if saved_log_probs:
            Rs = discount_rewards(net_rewards, gamma)
            policy_loss = -torch.cat(saved_log_probs) @ Rs
            # A degenerate episode can produce NaN returns; stepping on a
            # non-finite loss would overwrite every weight with NaN.
            if torch.isfinite(policy_loss).item():
                optimizer.zero_grad()
                policy_loss.backward()
                optimizer.step()

        if i_episode % print_every == 0:
            print('Epison {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_deque)))

    return scores

In [20]:
# Every tunable setting for the CartPole run, gathered in one mapping.
cartpole_hyperparameters = dict(
    # network
    h_size=16,
    state_space=s_size,
    action_space=a_size,
    # training / evaluation schedule
    n_training_episodes=3000,
    n_evaluation_episodes=10,
    max_t=1000,
    # optimisation
    gamma=1.0,
    lr=1e-2,
    # environment
    env_id=env_id,
)

In [21]:
# Build the policy from the configured sizes and move it to `device`.
cartpole_policy = Policy(
    *(cartpole_hyperparameters[key] for key in ("state_space", "action_space", "h_size"))
).to(device)

# Adam over the policy's parameters with the configured learning rate.
cartpole_optimizer = optim.Adam(
    cartpole_policy.parameters(),
    lr=cartpole_hyperparameters["lr"],
)

In [22]:
# Run training; the final positional argument (100) is the logging interval.
scores = reinforce(
    cartpole_policy,
    cartpole_optimizer,
    *(cartpole_hyperparameters[key] for key in ("n_training_episodes", "max_t", "gamma")),
    100,
)

Epison 100	Average Score: 46.24
Epison 200	Average Score: 74.87
Epison 300	Average Score: 303.25
Epison 400	Average Score: 491.05
Epison 500	Average Score: 500.00
Epison 600	Average Score: 496.12
Epison 700	Average Score: 500.00
Epison 800	Average Score: 500.00
Epison 900	Average Score: 473.59
Epison 1000	Average Score: 500.00


KeyboardInterrupt: 