In [1]:
import gym

In [2]:
env = gym.make("CartPole-v1")

In [3]:
obs = env.reset()

In [4]:
obs

array([-0.01329987,  0.02098095,  0.00331455, -0.0332566 ])

In [5]:
# two possible actions, 0 and 1
env.action_space

Discrete(2)

In [6]:
action = 1 # accelerate to the right
obs, reward, done, info = env.step(action)
print(obs)
print(reward)
print(done)
print(info)

[-0.01288026  0.21605521  0.00264941 -0.32489191]
1.0
False
{}


In [7]:
def basic_policy(obs):
    angle = obs[2]
    return 0 if angle < 0 else 1

totals = []
for episode in range(500):
    episode_rewards = 0
    obs = env.reset()
    for step in range(1000):
        action = basic_policy(obs)
        obs, reward, done, info = env.step(action)
        episode_rewards += reward
        if done:
            break
    totals.append(episode_rewards)

In [8]:
import numpy as np

np.mean(totals), np.std(totals), np.min(totals), np.max(totals)

(42.048000000000002, 9.1449273370541331, 25.0, 68.0)

In [9]:
# Define a neural network
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F

class CartPoleNet(nn.Module):
    def __init__(self, n_inputs, n_hidden, n_outputs):
        super(CartPoleNet, self).__init__()
        self.hidden = nn.Linear(n_inputs, n_hidden)
        self.logits = nn.Linear(n_hidden, n_outputs)

    def forward(self, x):
        x = F.elu(self.hidden(x))
        # output is the probability of going left
        return F.sigmoid(self.logits(x))

net = CartPoleNet(4, 4, 1)
print(net)

CartPoleNet (
  (hidden): Linear (4 -> 4)
  (logits): Linear (4 -> 1)
)


In [10]:
print(torch.randn(4))
input = Variable(torch.randn(4))
output = net(input)
print(output)
print(1 - output)
print(torch.cat((output, 1 - output)))



 0.5551
 0.7948
-0.0617
-0.0369
[torch.FloatTensor of size 4]

Variable containing:
 0.5250
[torch.FloatTensor of size 1]

Variable containing:
 0.4750
[torch.FloatTensor of size 1]

Variable containing:
 0.5250
 0.4750
[torch.FloatTensor of size 2]



In [11]:
def get_action(prob):
    # we're not really using pytorch's stochastic reinforcement mechanism here
    # since we're calculating the gradients directly
    # so we need to use numpy's multinomial so as not to make this a stochastic tensor
    # https://discuss.pytorch.org/t/backpropagate-on-a-stochastic-variable/3496/13
    raw_prob = prob.data[0]
    # actions has the number of times each action was chosen.
    actions = np.random.multinomial(1, (raw_prob, 1 - raw_prob))
    # so we need to drop the first element since that's the number of times "left" was selected
    # if left was selected, the first element would be 1, and the second element would be 0
    # since action 0 is left, we want to keep the second element.
    action = np.delete(actions, 0)
    return torch.from_numpy(action)
    #return torch.multinomial(torch.cat((prob, 1 - prob)), 1)

for i in range(10):
    print(get_action(net(input)))


 1
[torch.LongTensor of size 1]


 1
[torch.LongTensor of size 1]


 1
[torch.LongTensor of size 1]


 1
[torch.LongTensor of size 1]


 0
[torch.LongTensor of size 1]


 0
[torch.LongTensor of size 1]


 0
[torch.LongTensor of size 1]


 1
[torch.LongTensor of size 1]


 1
[torch.LongTensor of size 1]


 1
[torch.LongTensor of size 1]



In [12]:
env.reset()
obs, award, done, info = env.step(get_action(net(input))[0])
tensor = torch.from_numpy(obs).float()
print(tensor)
net(Variable(tensor))


 0.0060
-0.1989
 0.0269
 0.2569
[torch.FloatTensor of size 4]



Variable containing:
 0.4654
[torch.FloatTensor of size 1]

In [13]:
net = CartPoleNet(4, 4, 1)
action = get_action(net(input))
print(action)

# if action is 0 (left), the target probability must be 1 since it's the probability of going left, vice versa
y = 1. - action.float()
print(y)


 0
[torch.LongTensor of size 1]


 1
[torch.FloatTensor of size 1]



In [14]:
# discount and normalize rewards
def discount_rewards(rewards, discount_rate):
    discounted_rewards = np.empty(len(rewards))
    cumulative_rewards = 0
    for step in reversed(range(len(rewards))):
        cumulative_rewards = rewards[step] + cumulative_rewards * discount_rate
        discounted_rewards[step] = cumulative_rewards
    return discounted_rewards

def discount_and_normalize_rewards(all_rewards, discount_rate):
    all_discounted_rewards = [discount_rewards(rewards, discount_rate) for rewards in all_rewards]
    flat_rewards = np.concatenate(all_discounted_rewards)
    reward_mean = flat_rewards.mean()
    reward_std = flat_rewards.std()
    #all_normalized_rewards = 
    return [(discounted_rewards - reward_mean)/reward_std for discounted_rewards in all_discounted_rewards]
    #for action, reward in zip(all_actions, all_normalized_rewards):
    #    action.reinforce(reward)

In [80]:
import torch.optim as optim
import torch.autograd as autograd

net = CartPoleNet(4, 4, 1)
env = gym.make("CartPole-v1")

# loss criterion
criterion = nn.MSELoss()

# optimizer
# the book uses lr = 0.01, but let's see if this works well
optimizer = optim.Adam(net.parameters(), lr=0.01)

n_iter = 250  # number of training iterations
n_max_steps = 1000 # max steps per episode
n_games_per_update = 10 # train the policy every 10 episodes
discount_rate = 0.95

for iteration in range(n_iter):
    all_rewards = []  # all sequences of raw rewards for each episode
    all_gradients = []   # gradients saved at each step of each episode
    for game in range(n_games_per_update):
        current_rewards = []  # all raw rewards from the current episode
        current_gradients = [] # all gradients from the current episode
        obs = env.reset()
        for step in range(n_max_steps):
            out = net(Variable(torch.from_numpy(obs).float()))  # probability to go left
            action = get_action(out)
            obs, reward, done, info = env.step(action[0])
            current_rewards.append(reward)
            y = Variable(1. - action.float())
            loss = criterion(out, y)
            loss.backward()
            current_gradients.append([p.grad.clone().data.numpy() for p in net.parameters()])
            if done:
                break
        all_rewards.append(current_rewards)
        all_gradients.append(current_gradients)
    # at this point, we have run the policy for 10 episodes, and we're ready for a 
    # policy update.
    optimizer.zero_grad()
    all_rewards = discount_and_normalize_rewards(all_rewards, discount_rate)
    for par_index, param in enumerate(net.parameters()):
        mean_gradients = np.mean(
            [reward * all_gradients[game_index][step][par_index]
                for game_index, rewards in enumerate(all_rewards)
                for step, reward in enumerate(rewards)],
            axis=0
        )
        param.grad = Variable(torch.from_numpy(mean_gradients))
    optimizer.step()
    if iteration % 10 == 0:
        print('Episode {}\tLast step: {:5d}\t'.format(
            iteration, step))

Episode 0	Last step:    28	
Episode 10	Last step:    21	
Episode 20	Last step:    31	
Episode 30	Last step:    37	
Episode 40	Last step:    12	
Episode 50	Last step:    18	
Episode 60	Last step:    14	
Episode 70	Last step:    13	
Episode 80	Last step:    17	
Episode 90	Last step:    18	
Episode 100	Last step:    28	
Episode 110	Last step:    10	
Episode 120	Last step:    17	
Episode 130	Last step:    11	
Episode 140	Last step:     7	
Episode 150	Last step:    11	
Episode 160	Last step:     9	
Episode 170	Last step:     9	
Episode 180	Last step:    11	
Episode 190	Last step:    21	
Episode 200	Last step:    14	
Episode 210	Last step:    10	
Episode 220	Last step:    16	
Episode 230	Last step:    20	
Episode 240	Last step:     9	


In [26]:
# evaluate
def evaluate(model):
    totals = []
    for episode in range(500):
        episode_rewards = 0
        obs = env.reset()
        for step in range(1000):
            out = model(Variable(torch.from_numpy(obs).float()))
            action = get_action(out)
            obs, reward, done, info = env.step(action[0])
            episode_rewards += reward
            if done:
                break
        totals.append(episode_rewards)
    return np.mean(totals), np.std(totals), np.min(totals), np.max(totals)

evaluate(net)

(10.994, 2.4927823811957595, 8.0, 27.0)

In [75]:
env = gym.make('CartPole-v1')

from itertools import count

#n_iter = 250  # number of training iterations
discount_rate = 0.95
n_max_steps = 1000 # max steps per episode

class Policy(nn.Module):
    def __init__(self):
        super(Policy, self).__init__()
        self.affine1 = nn.Linear(4, 128)
        self.affine2 = nn.Linear(128, 2)

        self.saved_actions = []
        self.rewards = []

    def forward(self, x):
        x = F.relu(self.affine1(x))
        action_scores = self.affine2(x)
        return F.softmax(action_scores)


policy = Policy()
optimizer = optim.Adam(policy.parameters(), lr=1e-2)


def select_action(state):
    state = torch.from_numpy(state).float().unsqueeze(0)
    probs = policy(Variable(state))
    action = probs.multinomial()
    policy.saved_actions.append(action)
    return action.data


def finish_episode():
    R = 0
    rewards = []
    for r in policy.rewards[::-1]:
        R = r + discount_rate * R
        rewards.insert(0, R)
    rewards = torch.Tensor(rewards)
    rewards = (rewards - rewards.mean()) / (rewards.std() + np.finfo(np.float32).eps)
    for action, r in zip(policy.saved_actions, rewards):
        action.reinforce(r)
    optimizer.zero_grad()
    autograd.backward(policy.saved_actions, [None for _ in policy.saved_actions])
    optimizer.step()
    del policy.rewards[:]
    del policy.saved_actions[:]


running_reward = 10
for i_episode in count(1):
    state = env.reset()
    for t in range(n_max_steps): # Don't infinite loop while learning
        action = select_action(state)
        state, reward, done, _ = env.step(action[0,0])
        #env.render()
        policy.rewards.append(reward)
        if done:
            break

    running_reward = running_reward * 0.99 + t * 0.01
    finish_episode()
    if i_episode % 10 == 0:
        print('Episode {}\tLast length: {:5d}\tAverage length: {:.2f}'.format(
            i_episode, t, running_reward))
    if running_reward > 200:
        print("Solved! Running reward is now {} and "
              "the last episode runs to {} time steps!".format(running_reward, t))
        break

Episode 10	Last length:    22	Average length: 12.20
Episode 20	Last length:    69	Average length: 16.16
Episode 30	Last length:    93	Average length: 24.72
Episode 40	Last length:    94	Average length: 32.71
Episode 50	Last length:   165	Average length: 50.06
Episode 60	Last length:    55	Average length: 51.92
Episode 70	Last length:   306	Average length: 59.58
Episode 80	Last length:   126	Average length: 69.65
Episode 90	Last length:    48	Average length: 73.95
Episode 100	Last length:   160	Average length: 78.44
Episode 110	Last length:   164	Average length: 85.45
Episode 120	Last length:   158	Average length: 91.52
Episode 130	Last length:   124	Average length: 94.66
Episode 140	Last length:   106	Average length: 96.83
Episode 150	Last length:    83	Average length: 92.48
Episode 160	Last length:   125	Average length: 93.00
Episode 170	Last length:   132	Average length: 99.80
Episode 180	Last length:   128	Average length: 104.77
Episode 190	Last length:   290	Average length: 112.88


In [76]:
# evaluate
def evaluate(model):
    totals = []
    for episode in range(500):
        episode_rewards = 0
        obs = env.reset()
        for step in range(1000):
            obs = torch.from_numpy(obs).float().unsqueeze(0)
            probs = model(Variable(obs))
            action = probs.multinomial()
            obs, reward, done, _ = env.step(action.data[0,0])
            episode_rewards += reward
            if done:
                break
        totals.append(episode_rewards)
    return np.mean(totals), np.std(totals), np.min(totals), np.max(totals)

evaluate(policy)

(498.05200000000002, 6.0687145261579074, 454.0, 500.0)