# Simple Reinforcement Learning in PyTorch Part 2: Vanilla Policy Gradient Agent

This tutorial contains a simple example of how to build a policy-gradient-based agent that can solve the CartPole problem with PyTorch. For the original series with TensorFlow code samples, see the [Medium post](https://medium.com/@awjuliani/super-simple-reinforcement-learning-tutorial-part-2-ded33892c724#.mtwpvfi8b).

Also take a look at the alternative implementation of a CartPole policy, based on the book Deep Reinforcement Learning Hands-On, in this repo.

In [1]:
%matplotlib inline

import gym
import numpy as np
import torch
import matplotlib.pyplot as plt

# Alias `xrange` to the built-in `range` so code written with that name keeps working.
xrange = range

In [2]:
# Create the CartPole-v0 environment that the training loop resets and steps through.
env = gym.make('CartPole-v0')

WARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.


### The Policy-Based Agent

In [3]:
def discount_rewards(r, gamma=0.99):
    """
    Compute the discounted cumulative reward for every step of an episode.

    Element ``t`` of the result is
    ``r[t] + gamma * r[t+1] + gamma**2 * r[t+2] + ...``.
    For example, with the default ``gamma=0.99`` the rewards
    ``0, 1, 2, 3, 4`` are transformed to roughly
    ``9.70, 9.80, 8.89, 6.96, 4.0``.

    Args:
        r: 1D array (or sequence) of per-step rewards.
        gamma: Discount factor applied once per step into the future.

    Returns:
        NumPy array with the same shape as ``r``. Floating-point inputs keep
        their dtype; any other dtype (e.g. integers) is promoted to float64
        so the fractional part of the discounted values is not truncated.
    """
    r = np.asarray(r)
    # An integer output buffer would silently truncate every discounted value
    # on assignment, so only reuse the input dtype when it is already floating.
    out_dtype = r.dtype if np.issubdtype(r.dtype, np.floating) else np.float64
    discounted_r = np.zeros_like(r, dtype=out_dtype)
    running_add = 0.0
    # Walk backwards so each step can reuse the discounted sum of its successors.
    for t in reversed(range(r.size)):
        running_add = running_add * gamma + r[t]
        discounted_r[t] = running_add
    return discounted_r

### Training the Agent

In [4]:
total_episodes = 3000  # Set total number of episodes to train agent on.
max_ep = 999  # Maximum number of environment steps attempted per episode.

# Policy network: maps a 4-value state to probabilities over the 2 actions.
agent = torch.nn.Sequential(
    torch.nn.Linear(4, 8),
    torch.nn.ReLU(),
    torch.nn.Linear(8, 2),
    # Normalise over the last (action) dimension explicitly. Omitting `dim`
    # is deprecated and makes PyTorch emit a warning on every first call;
    # dim=-1 is correct for both the single-state and the batched input below.
    torch.nn.Softmax(dim=-1),
)
adam = torch.optim.Adam(agent.parameters(), lr=1e-3)

total_reward = []  # Undiscounted total reward of every finished episode.

for i in range(total_episodes):
    state = env.reset()
    running_reward = 0
    state_history = []
    action_history = []
    reward_history = []
    for j in range(max_ep):
        # Probabilistically pick an action given our network outputs.
        # No gradient is tracked here: the probabilities are recomputed for
        # the whole episode in one batch right before the update.
        with torch.no_grad():
            action_prob = agent(torch.tensor(state, dtype=torch.float))
        action = np.random.choice([0, 1], p=action_prob.numpy())

        # Record the state the action was taken in, then advance the env.
        state_history.append(state)
        action_history.append(action)
        state, reward, done, info = env.step(action)
        reward_history.append(reward)
        running_reward += reward

        if done:
            # Episode finished: do one policy-gradient step on its trajectory.
            discounted_rewards = torch.tensor(
                discount_rewards(np.array(reward_history)), dtype=torch.float)
            action_prob = agent(
                torch.tensor(np.stack(state_history), dtype=torch.float))
            # Probability the policy assigned to the action taken at each step.
            chosen_prob = action_prob[np.arange(len(action_history)), action_history]
            # Minimising this raises the log-probability of actions in
            # proportion to the discounted reward that followed them.
            loss = -torch.mean(torch.log(chosen_prob) * discounted_rewards)

            adam.zero_grad()
            loss.backward()
            adam.step()

            total_reward.append(running_reward)
            break

    if i % 100 == 0:
        # Report the mean total reward over (up to) the last 100 episodes.
        print(i, np.mean(total_reward[-100:]))

0 12.0


  input = module(input)


100 23.71
200 25.7
300 24.1
400 24.53
500 27.45
600 26.23
700 32.69
800 31.85
900 43.0
1000 41.57
1100 50.14
1200 56.34
1300 60.61
1400 60.38
1500 77.05
1600 97.14
1700 118.26
1800 121.57
1900 142.01
2000 136.88
2100 151.31
2200 174.71
2300 175.93
2400 167.19
2500 176.27
2600 185.49
2700 182.37
2800 181.99
2900 187.86
