In [1]:
# Install the third-party packages this notebook needs: gym, plus Box2D
# (presumably required by the LunarLander environment created below — verify).
# torch is assumed to be preinstalled, hence the commented-out line.
!pip install gym
!pip install Box2D gym
#!pip install torch

Collecting Box2D
[?25l  Downloading https://files.pythonhosted.org/packages/a9/0b/d48d42dd9e19ce83a3fb4eee074e785b6c6ea612a2244dc2ef69427d338b/Box2D-2.3.10-cp36-cp36m-manylinux1_x86_64.whl (1.3MB)
[K     |████████████████████████████████| 1.3MB 3.3MB/s 
Installing collected packages: Box2D
Successfully installed Box2D-2.3.10


In [0]:
import easydict
import gym
import numpy as np
from itertools import count
from collections import namedtuple

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Normal
import matplotlib.pyplot as plt
# Actor-critic on LunarLanderContinuous-v2 (continuous action space).
# Adapted from the PyTorch actor-critic example:
# https://github.com/pytorch/examples/blob/master/reinforcement_learning/actor_critic.py

# Command-line parsing from the reference script is disabled; the settings are
# hard-coded in the EasyDict below instead.
#args = parser.parse_args()

args = easydict.EasyDict({
    # discount factor applied to future rewards when computing returns
    "gamma": 0.99,
    # seed passed to both the environment and torch
    "seed": 203,
    # when True, main() renders every 100th episode
    "render":False,
    # main() prints a progress line every `log_interval` episodes
    "log_interval":10
})

env = gym.make('LunarLanderContinuous-v2')

# Seed the environment and torch so that runs are repeatable. This must happen
# before the model is constructed so that weight initialisation is seeded too.
env.seed(args.seed)
torch.manual_seed(args.seed)

# One record per environment step: the log-probability of the sampled action
# and the critic's value estimate for the state it was sampled in.
SavedAction = namedtuple('SavedAction', ['log_prob', 'value'])
# Sizes of the observation and action vectors, read from the environment spaces.
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]

class Policy(nn.Module):
    """
    Actor and critic implemented as one network with a shared trunk.

    The trunk is three ReLU-activated linear layers
    (state_dim -> hidden_dim1 -> hidden_dim2 -> output_dim). Three linear
    heads sit on top of it:

    * ``action_mean``: mean of the Gaussian policy, one entry per action dim.
    * ``action_var``: spread of the Gaussian policy. Despite the attribute
      name, ``select_action`` passes this output to ``Normal(scale=...)``, so
      it acts as a standard deviation, not a variance. The attribute name is
      kept so existing state dicts still load.
    * ``value_head``: scalar estimate of the state value.

    The module also carries the per-episode buffers ``saved_actions`` and
    ``rewards`` that the training code fills and clears.

    Args:
        hidden_dim1: width of the first hidden layer.
        hidden_dim2: width of the second hidden layer.
        output_dim: width of the last trunk layer feeding all three heads.
        min_std: strictly positive lower bound applied to the returned scale.

    Raises:
        ValueError: if ``min_std`` is not strictly positive.
    """
    def __init__(self, hidden_dim1=64, hidden_dim2=32, output_dim=128, min_std=1e-6):
        super().__init__()
        if min_std <= 0:
            raise ValueError("min_std must be strictly positive")
        self.affine1 = nn.Linear(state_dim, hidden_dim1)
        self.affine2 = nn.Linear(hidden_dim1, hidden_dim2)
        self.affine3 = nn.Linear(hidden_dim2, output_dim)
        self.act1 = nn.ReLU()
        # actor's layers
        self.action_mean = nn.Linear(output_dim, action_dim)
        self.action_var = nn.Linear(output_dim, action_dim)
        # critic's layer
        self.value_head = nn.Linear(output_dim, 1)
        # floor for the policy scale, see forward()
        self.min_std = min_std
        # action & reward buffers, filled during an episode
        self.saved_actions = []
        self.rewards = []

    def forward(self, x):
        """
        Run the shared trunk and all three heads.

        Args:
            x: float tensor whose last dimension is ``state_dim``.

        Returns:
            Tuple ``(action_mean, action_scale, state_values)``. The mean and
            the scale are both multiplied by 0.5 before being returned, and
            the scale is guaranteed to be at least ``min_std``.
        """
        x = self.act1(self.affine1(x))
        x = self.act1(self.affine2(x))
        x = self.act1(self.affine3(x))

        action_mean = 0.5 * self.action_mean(x)

        # softplus is non-negative but underflows to exactly 0 in float32 for
        # very negative inputs; a zero scale makes the Normal distribution
        # invalid (NaN/inf log-probs). Clamping leaves every value above the
        # floor unchanged.
        action_scale = (0.5 * F.softplus(self.action_var(x))).clamp(min=self.min_std)

        # critic's estimate of the value of the input state
        state_values = self.value_head(x)

        return action_mean, action_scale, state_values
    
# Single global actor-critic network; select_action() and finish_episode()
# both read and mutate it (including its per-episode buffers).
model = Policy().float()
# Adam over every parameter of the network (trunk and all heads).
optimizer = optim.Adam(model.parameters(), lr=1e-4)
# float32 machine epsilon as a Python float; finish_episode() adds it to the
# standard deviation of the returns to avoid dividing by zero.
eps = np.finfo(np.float32).eps.item()

def select_action(state):
    """
    Sample a continuous action for `state` from the current policy.

    The observation is converted to a float tensor and fed through the global
    `model`, which returns the location and scale of a Normal distribution
    together with a value estimate. One action is drawn from that
    distribution; its log-probability and the value estimate are appended to
    `model.saved_actions` for use when the episode is finished.

    Args:
        state: numpy array holding the current observation.

    Returns:
        The sampled action as a numpy array.
    """
    observation = torch.from_numpy(state).float()
    mean, scale, value_estimate = model(observation)

    # Gaussian policy over the continuous action space
    policy_dist = Normal(loc=mean, scale=scale)
    sampled = policy_dist.sample()

    # remember what the training step needs for this time step
    record = SavedAction(log_prob=policy_dist.log_prob(sampled), value=value_estimate)
    model.saved_actions.append(record)

    # hand the environment a plain numpy array
    return sampled.detach().numpy()

def finish_episode():
    """
    Run one actor-critic update from the episode stored on the global `model`.

    Steps:
      1. Turn `model.rewards` into discounted returns using `args.gamma`.
      2. Whiten the returns (zero mean, unit standard deviation).
      3. For every step, form the advantage (return minus the detached value
         estimate), the policy loss (-log_prob * advantage) and the critic
         loss (smooth L1 between value estimate and return).
      4. Backpropagate the summed losses and take one `optimizer` step.
      5. Clear `model.rewards` and `model.saved_actions`.

    If either buffer is empty there is nothing to learn from; the buffers are
    cleared and no optimizer step is taken.

    Reads the module-level `model`, `optimizer`, `args` and `eps`.
    Returns None.
    """
    rewards = model.rewards
    saved_actions = model.saved_actions

    # torch.stack() below cannot handle an empty list, so bail out early.
    if not rewards or not saved_actions:
        del model.rewards[:]
        del model.saved_actions[:]
        return

    # Discounted return of every step, accumulated from the last step
    # backwards. Appending and reversing once is linear, unlike repeated
    # insert(0, ...).
    returns = []
    R = 0
    for r in reversed(rewards):
        R = args.gamma * R + r
        returns.append(R)
    returns.reverse()

    # whiten the returns
    returns = torch.tensor(returns, dtype=torch.float32)
    if returns.numel() > 1:
        returns = (returns - returns.mean()) / (returns.std() + eps)
    else:
        # The sample standard deviation of a single value is NaN, which would
        # turn the loss and then every network weight into NaN. Only centre
        # the return in that case.
        returns = returns - returns.mean()

    policy_losses = []  # actor (policy) loss per step
    value_losses = []   # critic (value) loss per step
    for (log_prob, value), step_return in zip(saved_actions, returns):
        # The advantage must not backpropagate into the critic through the
        # policy loss, hence the detach.
        advantage = step_return - value.detach()
        policy_losses.append(-log_prob * advantage)
        value_losses.append(F.smooth_l1_loss(value, step_return.expand_as(value)))

    # reset gradients
    optimizer.zero_grad()

    # sum up all the values of policy_losses and value_losses
    loss = torch.stack(policy_losses).sum() + torch.stack(value_losses).sum()

    # perform backprop
    loss.backward()
    optimizer.step()

    # reset rewards and action buffer
    del model.rewards[:]
    del model.saved_actions[:]
    
def main(episode_limit=6000, solved_threshold=200):
    """
    Train the global policy on `env` and plot the reward of every episode.

    Each episode runs for at most 2499 environment steps, stores the rewards
    on `model.rewards`, and is followed by one `finish_episode()` update. A
    progress line is printed every `args.log_interval` episodes. `env.step`
    is expected to return the 4-tuple (state, reward, done, info).

    Args:
        episode_limit: training stops after the first episode whose 1-based
            index exceeds this value, i.e. after `episode_limit + 1` episodes.
        solved_threshold: running reward above which the final message reports
            the task as solved. It does not stop training early.

    Returns:
        List with the total reward of every episode, in order.
    """
    running_reward = -100

    episodic_rewards = []
    episodes = []

    for i_episode in count(1):
        # reset environment and episode reward
        state = env.reset()
        ep_reward = 0

        for t in range(1, 2500):
            # select action from policy
            action = select_action(state)

            # take the action
            state, reward, done, _ = env.step(action)

            if args.render and i_episode % 100 == 0:
                env.render()

            model.rewards.append(reward)
            ep_reward += reward
            if done:
                break

        # Record every episode, including one cut off by the step cap: it is
        # trained on and counted in running_reward, so it belongs in the plot.
        episodes.append(i_episode)
        episodic_rewards.append(ep_reward)

        # update cumulative reward (exponential moving average)
        running_reward = 0.05 * ep_reward + (1 - 0.05) * running_reward

        # perform backprop
        finish_episode()

        # log results
        if i_episode % args.log_interval == 0:
            print('Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}'.format(
                  i_episode, ep_reward, running_reward))

        # stop after a fixed number of episodes
        if i_episode > episode_limit:
            if running_reward > solved_threshold:
                print("Solved! Running reward is now {} and "
                      "the last episode runs to {} time steps!".format(running_reward, t))
            else:
                # Do not claim success when the stop was only the episode cap.
                print("Stopped after {} episodes without reaching the solved "
                      "threshold: running reward is {} and the last episode "
                      "runs to {} time steps.".format(i_episode, running_reward, t))

            # plot episodic_rewards; plt.figure must be called to open a figure
            plt.figure()
            plt.plot(episodes, episodic_rewards)
            plt.xlabel("Episode")
            plt.ylabel("Episode reward")
            plt.title("Actor-critic training rewards")
            plt.show()
            break

    return episodic_rewards
            
# Start training only when this code is executed as the top-level script/cell,
# not when it is imported as a module.
if __name__ == '__main__':
    main()

Episode 10	Last reward: -54.27	Average reward: -148.28
Episode 20	Last reward: -248.84	Average reward: -159.68
Episode 30	Last reward: -132.31	Average reward: -169.70
Episode 40	Last reward: -306.82	Average reward: -189.38
Episode 50	Last reward: -298.98	Average reward: -190.97
Episode 60	Last reward: -52.21	Average reward: -175.61
Episode 70	Last reward: -161.41	Average reward: -203.55
Episode 80	Last reward: -139.27	Average reward: -210.64
Episode 90	Last reward: -80.62	Average reward: -206.73
Episode 100	Last reward: -152.32	Average reward: -209.79
Episode 110	Last reward: -135.64	Average reward: -166.81
Episode 120	Last reward: -270.60	Average reward: -181.45
Episode 130	Last reward: -3.26	Average reward: -171.95
Episode 140	Last reward: -250.65	Average reward: -206.45
Episode 150	Last reward: -312.92	Average reward: -188.63
Episode 160	Last reward: -183.09	Average reward: -188.15
Episode 170	Last reward: -147.20	Average reward: -168.86
Episode 180	Last reward: -201.31	Average rewa