In [3]:

# code inspired by https://github.com/pytorch/examples/blob/master/reinforcement_learning/actor_critic.py

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
from torch.distributions import Categorical
import gym
from collections import namedtuple

try:
    import matplotlib.pyplot as plt
except:
    import matplotlib
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt

#Net class defines basic neural net functionality we'll use later
class Net(nn.Module):
    """Actor-critic network: one shared hidden layer feeding two heads.

    ``fc2`` (the actor head) produces one score per action, turned into
    probabilities with softmax; ``fc3`` (the critic head) produces a single
    scalar value for the input state.

    The instance also carries two per-episode buffers, ``saved_actions`` and
    ``rewards``, which start empty.

    Args:
        n_inputs: Size of the input vector. When ``None`` the module-level
            ``num_inputs`` is used, so a bare ``Net()`` keeps working.
        n_outputs: Number of actions. When ``None`` the module-level
            ``num_outputs`` is used.
        hidden_size: Width of the shared hidden layer.
    """

    def __init__(self, n_inputs=None, n_outputs=None, hidden_size=128):
        super(Net, self).__init__()
        # Fall back to the notebook's globals only when sizes are not given
        # explicitly; this keeps the zero-argument constructor compatible.
        if n_inputs is None:
            n_inputs = num_inputs
        if n_outputs is None:
            n_outputs = num_outputs

        # Affine layers (y = Wx + b). Creation order is kept as fc1, fc2, fc3
        # so a seeded run initialises the weights exactly as before.
        self.fc1 = nn.Linear(n_inputs, hidden_size)
        self.fc2 = nn.Linear(hidden_size, n_outputs)
        self.fc3 = nn.Linear(hidden_size, 1)

        # Per-episode buffers; they start empty here.
        self.saved_actions = []
        self.rewards = []

    def forward(self, x):
        """Return ``(action_probabilities, state_value)`` for input ``x``."""
        hidden = F.relu(self.fc1(x))
        action_scores = self.fc2(hidden)
        state_value = self.fc3(hidden)
        # Softmax over the last dimension turns scores into probabilities.
        return F.softmax(action_scores, dim=-1), state_value

    def num_flat_features(self, x):
        """Return the product of all dimensions of ``x`` except the first."""
        num_features = 1
        for dim in x.size()[1:]:  # skip the leading (batch) dimension
            num_features *= dim
        return num_features

def select_action(state):
    """Sample an action for ``state`` from the current policy.

    The NumPy observation is converted to a float tensor and passed through
    the module-level ``model``, which returns action probabilities and a
    state value. An action is drawn from the resulting categorical
    distribution, and its log-probability together with the state value is
    appended to ``model.saved_actions`` as a ``SavedAction``.

    Returns the sampled action as a plain Python number.
    """
    observation = torch.from_numpy(state).float()
    action_probs, value_estimate = model(observation)

    # The policy is stochastic: draw from the distribution instead of argmax.
    policy = Categorical(action_probs)
    chosen = policy.sample()

    record = SavedAction(policy.log_prob(chosen), value_estimate)
    model.saved_actions.append(record)
    return chosen.item()

def finish_episode(gamma=0.99):
    """Run one actor-critic update from the episode stored on ``model``.

    Reads ``model.rewards`` and ``model.saved_actions``, computes the
    discounted return for every step, and takes a single optimizer step on
    the sum of the policy losses (``-log_prob * advantage``) and the value
    losses (smooth L1 between the predicted state value and the return).
    Both buffers are emptied before returning.

    Relies on the module-level ``model``, ``optimizer`` and ``eps``.

    Args:
        gamma: Discount factor applied to future rewards.
    """
    if not model.rewards:
        # Nothing to learn from. Stacking empty loss lists would raise, so
        # just reset the buffers and leave the weights untouched.
        del model.saved_actions[:]
        return

    # Walk the rewards back-to-front, accumulating the discounted return.
    # Append plus one reverse avoids the quadratic cost of insert(0, ...).
    returns = []
    discounted = 0.0
    for reward in reversed(model.rewards):
        discounted = reward + gamma * discounted
        returns.append(discounted)
    returns.reverse()

    returns = torch.tensor(returns, dtype=torch.float32)
    if returns.numel() > 1:
        # Standardise the returns; eps guards against division by zero.
        returns = (returns - returns.mean()) / (returns.std() + eps)
    # For a one-step episode the sample standard deviation is NaN, which
    # would turn the loss and then every weight into NaN. The single raw
    # return is used as-is in that case.

    policy_losses = []
    value_losses = []
    for (log_prob, value), ret in zip(model.saved_actions, returns):
        # value.item() detaches the critic estimate, so the policy loss does
        # not back-propagate through the value head.
        advantage = ret - value.item()
        policy_losses.append(-log_prob * advantage)
        value_losses.append(F.smooth_l1_loss(value, ret.unsqueeze(0)))

    optimizer.zero_grad()
    loss = torch.stack(policy_losses).sum() + torch.stack(value_losses).sum()
    loss.backward()
    optimizer.step()

    # Clear the rewards and saved actions so the next episode starts fresh.
    del model.rewards[:]
    del model.saved_actions[:]


In [None]:
# Training script: build the environment, the model and the optimizer, then
# play episodes until the running reward passes the environment's threshold.

# Select the environment.
env = gym.make('CartPole-v0')
# Fixed seeds for the environment and for torch so that runs are repeatable.
# NOTE(review): env.seed(), the bare return of env.reset() and the 4-tuple from
# env.step() below look like the classic gym API -- confirm the pinned gym version.
env.seed(543)
torch.manual_seed(543)
#env = gym.make('Acrobot-v1')
# Input/output sizes are taken from the environment; Net() reads these
# module-level names when it is constructed without explicit sizes.
num_inputs = env.observation_space.shape[0]
num_outputs = env.action_space.n

# Record stored by select_action for every step: the log-probability of the
# sampled action and the model's value output for that state.
SavedAction = namedtuple('SavedAction', ['log_prob', 'value'])
model = Net()
# Use Adam as the optimizer -- probably not the most important choice, but it works well.
optimizer = optim.Adam(model.parameters(), lr=3e-2)
# float32 machine epsilon; finish_episode adds it to the standard deviation
# to avoid dividing by zero when normalising returns.
eps = np.finfo(np.float32).eps.item()
# Exponential moving average of episode rewards, started at 10.
running_reward = 10
# Episode counter (starts at 1).
i = 1
# rewards holds the running average after each episode; ep_rewards holds the
# raw per-episode totals.
rewards = []
ep_rewards = []
# Loop until the break condition below is met.
# NOTE(review): there is no upper bound on the number of episodes here.
while True:
    state, ep_reward = env.reset(), 0
    # One pass through this loop is one step of the game; the range caps an
    # episode at 9999 steps.
    for t in range(1, 10000):  # Don't infinite loop while learning
        action = select_action(state)
        # step() advances the environment and returns the next state, the
        # reward, and whether the episode has ended.
        state, reward, done, _ = env.step(action)
        # render() shows the game as the agent learns to play it.
        env.render()
        model.rewards.append(reward)
        ep_reward += reward
        if done:
            break

    ## Between episodes we update the running reward, the policy, and the value estimate.
    # Update the running reward: 5% weight on the newest episode, 95% on history.
    running_reward = 0.05 * ep_reward + (1 - 0.05) * running_reward
    rewards.append(running_reward)
    ep_rewards.append(ep_reward)
    # finish_episode does all the heavy lifting in terms of updating the model.
    finish_episode()
    ##this logging stuff is unused, ignore unless you want higher verbosity
    # log every 10 episodes
    #if i % 10 == 0:
        #print('Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}'.format(
              #i, ep_reward, running_reward))
    # Training is finished once the running reward exceeds the environment's
    # reward threshold.
    if running_reward > env.spec.reward_threshold:
        print("Training Completed")
        #print("Solved! Running reward is now {} and "
              #"the last episode runs to {} time steps!".format(running_reward, t))
        break
    # i counts completed episodes; in the code visible here it is only
    # referenced by the commented-out logging above.
    i += 1

<figure>
  <img src="Rewards.png" width="50%" alt="Rewards plot"/>
</figure>