In [3]:
import math
import random

import gym
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Categorical


from IPython.display import clear_output
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
# Run on the GPU when PyTorch reports CUDA as available, otherwise on the CPU.
# `device` stays a plain string; it is later handed to `Tensor.to(...)`.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [6]:
from common.multiprocessing_env import SubprocVecEnv

# How many environment factories are built and handed to SubprocVecEnv.
num_envs = 16
# Gym environment id passed to every gym.make call in this notebook.
env_name = "CartPole-v0"

def make_env():
    """Return a zero-argument factory that builds a new `env_name` gym env.

    Nothing is constructed here: `gym.make` only runs when the returned
    callable is invoked, so the receiver decides when the env is created.
    """
    def _build_env():
        """Create and return a fresh environment instance."""
        return gym.make(env_name)

    return _build_env

# Build `num_envs` env factories and wrap them in one vectorized env.
# NOTE(review): SubprocVecEnv presumably runs each factory in its own
# subprocess -- confirm against common.multiprocessing_env.
envs = SubprocVecEnv([make_env() for _ in range(num_envs)])

# A separate, non-vectorized env created directly in this process.
env = gym.make(env_name)

[2018-07-24 00:08:18,789] Making new env: CartPole-v0
[2018-07-24 00:08:18,792] Making new env: CartPole-v0
[2018-07-24 00:08:18,795] Making new env: CartPole-v0
[2018-07-24 00:08:18,798] Making new env: CartPole-v0
[2018-07-24 00:08:18,807] Making new env: CartPole-v0
[2018-07-24 00:08:18,802] Making new env: CartPole-v0
[2018-07-24 00:08:18,817] Making new env: CartPole-v0
[2018-07-24 00:08:18,820] Making new env: CartPole-v0
[2018-07-24 00:08:18,901] Making new env: CartPole-v0
[2018-07-24 00:08:18,836] Making new env: CartPole-v0
[2018-07-24 00:08:18,867] Making new env: CartPole-v0
[2018-07-24 00:08:18,871] Making new env: CartPole-v0
[2018-07-24 00:08:18,876] Making new env: CartPole-v0
[2018-07-24 00:08:18,838] Making new env: CartPole-v0
[2018-07-24 00:08:18,839] Making new env: CartPole-v0
[2018-07-24 00:08:18,847] Making new env: CartPole-v0
[2018-07-24 00:08:18,849] Making new env: CartPole-v0


In [7]:
class ActorCritic(nn.Module):
    """Actor-critic network with two independent heads.

    The critic and the actor are separate MLPs (no shared layers), each with
    a single hidden ReLU layer of `hidden_size` units.

    Args:
        num_inputs: Size of the input feature vector fed to both heads.
        num_outputs: Number of categories the actor assigns probabilities to.
        hidden_size: Width of the hidden layer in each head.
    """

    def __init__(self, num_inputs, num_outputs, hidden_size):
        super(ActorCritic, self).__init__()

        # Critic head: maps an input row to a single scalar value.
        self.critic = nn.Sequential(
            nn.Linear(num_inputs, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, 1))

        # Actor head: Softmax over dim=1 means the input must be batched,
        # i.e. shaped (batch, num_inputs); each output row sums to 1.
        self.actor = nn.Sequential(
            nn.Linear(num_inputs, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, num_outputs),
            nn.Softmax(dim=1))

    def forward(self, x):
        """Evaluate both heads on a batch of inputs.

        Args:
            x: Float tensor shaped (batch, num_inputs).

        Returns:
            A `(value, dist)` tuple: `value` is the critic output shaped
            (batch, 1) and `dist` is a `Categorical` built from the actor's
            per-row probabilities.
        """
        value = self.critic(x)
        probs = self.actor(x)
        # Bug fix: this previously referenced the undefined name `prods`,
        # which raised NameError on every forward pass.
        dist = Categorical(probs)
        return value, dist

In [8]:
def plot(frame_idx, rewards):
    """Replace the cell output with a fresh plot of `rewards`.

    The title reports `frame_idx` together with the most recent entry of
    `rewards`, so `rewards` must be non-empty.
    """
    clear_output(wait=True)
    plt.figure(figsize=(20, 5))
    # First slot of a 1x3 subplot grid.
    plt.subplot(1, 3, 1)
    latest_reward = rewards[-1]
    plt.title('frame %s. reward: %s' % (frame_idx, latest_reward))
    plt.plot(rewards)
    plt.show()
    
def test_env(vis=False):
    state = env.reset()
    if vis: env.render()
    done = False
    total_reward = 0
    while not done:
        state = torch.FloatTensor(state).unsqueeze(0).to(device)
        dist, _ = model(state)
        next_state, reward, done, _ = env.step(dist.sample().cpu().numpy()[0])
        state = next_state
        if vis: env.render()
        total_reward += reward
    return total_reward

In [None]:
def compute_returns(next_value, rewards, masks, gamma=0.99):
    """Compute discounted returns for a rollout, bootstrapped from `next_value`.

    Walks the rollout backwards applying
    `R[t] = rewards[t] + gamma * masks[t] * R[t + 1]`, seeded with
    `R[len(rewards)] = next_value`.

    Args:
        next_value: Bootstrap value for whatever follows the last step. No
            special-casing is needed when the last step ended an episode:
            its zero mask already multiplies `next_value` away.
        rewards: Per-step rewards, oldest first.
        masks: Per-step continuation flags, indexed like `rewards`; a
            zero/false entry marks a step that ended its episode.
        gamma: Discount factor.

    Returns:
        A list of returns in the same order as `rewards`.
    """
    R = next_value
    returns = []
    for step in reversed(range(len(rewards))):
        # A zero mask cuts off everything *after* this step (the bootstrap
        # value and later rewards); the step's own reward is always kept.
        R = rewards[step] + R * gamma * masks[step]
        # Append and reverse once at the end: insert(0, ...) on every
        # iteration shifts the whole list and is quadratic overall.
        returns.append(R)
    returns.reverse()
    return returns