In [3]:

# code inspired by https://github.com/pytorch/examples/blob/master/reinforcement_learning/actor_critic.py
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
from torch.distributions import Categorical
import gym
from collections import namedtuple

try:
    import matplotlib.pyplot as plt
except:
    import matplotlib
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt

class Net(nn.Module):
    """Actor-critic network with one shared hidden layer and two heads.

    The policy head (``fc2``) produces one score per action and the value
    head (``fc3``) produces a single scalar. The instance also carries two
    per-episode buffers, ``saved_actions`` and ``rewards``, which are
    filled and cleared by code outside this class.
    """

    def __init__(self, n_inputs=None, n_outputs=None, hidden_size=128):
        """Build the layers.

        Args:
            n_inputs: Size of the input vector. When ``None``, the
                module-level ``num_inputs`` is used (original behaviour).
            n_outputs: Number of actions. When ``None``, the module-level
                ``num_outputs`` is used (original behaviour).
            hidden_size: Width of the shared hidden layer.

        Raises:
            NameError: If a size is omitted and the corresponding
                module-level variable has not been defined yet.
        """
        super(Net, self).__init__()
        # Fall back to the notebook globals so existing ``Net()`` calls
        # keep working unchanged.
        if n_inputs is None:
            n_inputs = num_inputs
        if n_outputs is None:
            n_outputs = num_outputs

        # Affine layers: y = Wx + b
        self.fc1 = nn.Linear(n_inputs, hidden_size)   # shared hidden layer
        self.fc2 = nn.Linear(hidden_size, n_outputs)  # policy head
        self.fc3 = nn.Linear(hidden_size, 1)          # value head

        # Per-episode buffers, mutated by the training code.
        self.saved_actions = []
        self.rewards = []

    def forward(self, x):
        """Return ``(action_probabilities, state_value)`` for input ``x``.

        The probabilities are a softmax over the last dimension of the
        policy head's output.
        """
        x = F.relu(self.fc1(x))
        actions = self.fc2(x)
        states = self.fc3(x)
        # use softmax to get the probabilities of each action
        return F.softmax(actions, dim=-1), states

    def num_flat_features(self, x):
        """Return the product of all dimensions of ``x`` except the first."""
        size = x.size()[1:]  # all dimensions except the batch dimension
        num_features = 1
        for s in size:
            num_features *= s
        return num_features

def select_action(state):
    """Sample an action for ``state`` from the current policy.

    ``state`` is converted from a NumPy array to a float tensor and passed
    through the module-level ``model``. An action is drawn from the
    resulting categorical distribution; its log-probability and the
    model's value output are appended to ``model.saved_actions`` as a
    ``SavedAction``.

    Returns:
        The sampled action as a plain Python number.
    """
    observation = torch.from_numpy(state).float()
    action_probs, value_estimate = model(observation)

    policy = Categorical(action_probs)
    chosen = policy.sample()

    # Keep what the later update step needs for this time step.
    record = SavedAction(policy.log_prob(chosen), value_estimate)
    model.saved_actions.append(record)
    return chosen.item()

def finish_episode(gamma=0.99):
    """Run one actor-critic update from the episode stored on ``model``.

    Reads ``model.rewards`` and ``model.saved_actions``, computes discounted
    returns, builds the policy and value losses, takes one step with the
    module-level ``optimizer`` and finally empties both buffers.

    Args:
        gamma: Discount factor applied to future rewards. Defaults to the
            previously hard-coded 0.99.

    Notes:
        * If either buffer is empty there is nothing to learn from; the
          buffers are cleared and no optimizer step is taken.
        * Returns are standardised only when the episode has more than one
          step. With a single return the sample standard deviation is NaN,
          which would propagate into the loss and the parameters.
    """
    saved_actions = model.saved_actions
    episode_rewards = model.rewards

    if not episode_rewards or not saved_actions:
        del episode_rewards[:]
        del saved_actions[:]
        return

    # Accumulate returns back-to-front, then reverse once; this avoids the
    # quadratic cost of repeatedly inserting at the head of a list.
    discounted = []
    running = 0.0
    for r in reversed(episode_rewards):
        running = r + gamma * running
        discounted.append(running)
    discounted.reverse()

    # Explicit float32 so the value-loss target has a fixed dtype whatever
    # numeric type the rewards were recorded with.
    returns = torch.tensor(discounted, dtype=torch.float32)
    if returns.numel() > 1:
        returns = (returns - returns.mean()) / (returns.std() + eps)

    policy_losses = []
    value_losses = []
    for (log_prob, value), R in zip(saved_actions, returns):
        # value.item() detaches the baseline, so the policy term does not
        # back-propagate through the value head.
        advantage = R - value.item()
        policy_losses.append(-log_prob * advantage)
        value_losses.append(F.smooth_l1_loss(value, R.reshape(1)))

    optimizer.zero_grad()
    loss = torch.stack(policy_losses).sum() + torch.stack(value_losses).sum()
    loss.backward()
    optimizer.step()

    del model.rewards[:]
    del model.saved_actions[:]

In [None]:
# Training driver: build the environment and model, then run episodes until
# the smoothed episode reward beats the environment's reward threshold.
#func = 'CartPole-v0'
func = 'Acrobot-v1'
env = gym.make(func)
# Pick an arbitrary fixed seed so repeated runs give the same results.
env.seed(50)
torch.manual_seed(50)
num_inputs = env.observation_space.shape[0]
num_outputs = env.action_space.n

SavedAction = namedtuple('SavedAction', ['log_prob', 'value'])
model = Net()
optimizer = optim.Adam(model.parameters(), lr=3e-2)
eps = np.finfo(np.float32).eps.item()

# The stop test below is "smoothed reward > threshold" for every
# environment: a higher reward is better even when rewards are negative.
reward_threshold = env.spec.reward_threshold
if reward_threshold is None:
    raise ValueError(
        "Environment %r defines no reward_threshold to stop on" % func)

# Exponential moving average of episode rewards. It is seeded from the first
# episode rather than a fixed constant, because a constant start value can
# already sit on the "solved" side of the threshold for environments whose
# rewards are negative.
running_reward = None
i = 1  # episode counter
rewards = []     # smoothed reward after each episode
ep_rewards = []  # raw reward of each episode
while True:
    state, ep_reward = env.reset(), 0
    for t in range(1, 10000):  # Don't infinite loop while learning
        action = select_action(state)
        state, reward, done, _ = env.step(action)
        # Rendering every step is slow and needs a display.
        env.render()
        model.rewards.append(reward)
        ep_reward += reward
        if done:
            break

    if running_reward is None:
        running_reward = ep_reward
    else:
        running_reward = 0.05 * ep_reward + (1 - 0.05) * running_reward
    rewards.append(running_reward)
    ep_rewards.append(ep_reward)
    finish_episode()
    if running_reward > reward_threshold:
        print("Training Completed")
        break
    i += 1
    
"""if func == 'Acrobot-v1':
    title = 'Rewards for Acrobot problem'
    fig_name = 'AcrobotRewards'
else:
    title = 'Rewards for CartPole problem'
    fig_name = 'CartPoleRewards'
plt.plot(rewards)
plt.title(title)
plt.xlabel('Iterations')
plt.ylabel('Rewards')
plt.savefig(fig_name)"""

<figure>
  <img src= 'CartPoleRewards.png' width="50%" alt=""/>

</figure>

<figure>
  <img src= 'AcrobotRewards.png' width="50%" alt=""/>

</figure>