In [1]:
%matplotlib inline

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import gym
import numpy as np
from collections import namedtuple
import random
from matplotlib import pyplot as plt
from IPython.display import clear_output
from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

In [7]:
# Scratch environment; in the visible cells it is only used by the reset/render check below.
# NOTE(review): 'Breakout-v1' does not match the environment ids used elsewhere in this
# notebook and does not look like a registered Gym id (Atari ids end in -v0/-v4) — confirm
# which environment this cell is meant to create.
env = gym.make('Breakout-v1')

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [8]:
# Smoke test: reset the scratch environment and draw one frame.
# NOTE(review): the output saved with this cell is a NameError ("name 'base' is not defined"),
# so render() did not succeed when it was last run — presumably a display/backend problem; confirm.
env.reset()
env.render()
# Alternative kept for reference: show the frame inline through matplotlib instead.
# render = lambda : plt.imshow(env.render(mode='rgb_array'))
# render()

NameError: name 'base' is not defined

In [None]:
def make_env(seed, env_id='Taxi-v2'):
    """Build a zero-argument factory that creates one seeded Gym environment.

    The vectorised-environment wrappers expect a list of such factories so that
    each worker can construct its own environment instance.

    The default id is 'Taxi-v2' because the rest of the notebook treats
    observations as a single discrete index: it reads ``observation_space.n``,
    one-hot encodes the index in ``make_state`` and titles the progress plot
    'Taxi-v2'. The previously hard-coded 'Acrobot-v1' has a continuous Box
    observation space, which has no ``.n`` attribute.

    Args:
        seed: Value passed to ``env.seed`` of the created environment.
        env_id: Gym environment id handed to ``gym.make``.

    Returns:
        A callable taking no arguments and returning the seeded environment.
    """
    def _func():
        env = gym.make(env_id)
        env.seed(seed)
        return env
    return _func

In [None]:
# Hyperparameters for the A2C training run; every cell below reads these module-level names.
lr = 0.01  # learning rate given to both Adam optimizers (actor and critic)
gamma = 0.99  # discount factor applied to returns and TD errors in process_rollout
num_steps = 2e6  # training stops once total_steps (summed over all workers) reaches this value
rollout_steps = 40  # environment steps collected per worker before each gradient update
hidden_layer = 50  # hidden-layer width of both the actor and the critic MLP

num_processes = 64  # number of parallel environments (one seed per environment)
entropy_start = 10.0  # entropy coefficient at total_steps == 0
entropy_end = 0  # value the entropy coefficient decays to; also its lower clamp
entropy_decay_steps = 1e6  # number of total steps over which the coefficient is linearly interpolated

value_coeff = 0.5  # multiplier applied to the value loss to form the critic loss
l1_regularization = 0  # not referenced by any cell visible in this notebook
dropout = 0  # dropout probability passed to both networks
lambd = 0.95  # lambda factor multiplying gamma in the advantage recursion of process_rollout

In [None]:
# One environment factory per worker; the worker index doubles as its seed.
env_fns = [make_env(worker_seed) for worker_seed in range(num_processes)]

# Several workers run in subprocesses, a single worker stays in the current process.
envs = SubprocVecEnv(env_fns) if len(env_fns) > 1 else DummyVecEnv(env_fns)

# Both spaces are read through `.n`, i.e. they are expected to be discrete.
input_size = envs.observation_space.n
output_size = envs.action_space.n

# Record type for a single environment transition.
Transition = namedtuple('Transition', ['state', 'action', 'reward', 'next_state', 'done'])

In [None]:
class ACTOR_MLP(nn.Module):
    """Policy network: one hidden ReLU layer followed by a linear layer of action scores.

    ``forward`` returns the raw (unnormalised) scores. The ``softmax`` and
    ``log_softmax`` modules are stored on the instance but are not applied
    inside ``forward``.

    Args:
        in_size: Number of input features after flattening.
        out_size: Number of scores produced per sample.
        hidden_size: Width of the hidden layer.
        dropout_prob: Probability used by both dropout layers.
    """

    def __init__(self, in_size, out_size, hidden_size, dropout_prob):
        super().__init__()
        # Hidden stage.
        self.lin1 = nn.Linear(in_size, hidden_size)
        self.dropout1 = nn.Dropout(dropout_prob)
        # Output stage; dropout is applied to its result as well.
        self.lin2 = nn.Linear(hidden_size, out_size)
        self.dropout2 = nn.Dropout(dropout_prob)

        # Normalisers over the score dimension, kept available for callers.
        self.softmax = nn.Softmax(dim=1)
        self.log_softmax = nn.LogSoftmax(dim=1)

    def forward(self, x):
        """Flatten every sample to a vector and return one score per action."""
        batch = x.size(0)
        flat = x.view(batch, -1)
        hidden = self.dropout1(F.relu(self.lin1(flat)))
        scores = self.lin2(hidden)
        return self.dropout2(scores)
    
class CRITIC_MLP(nn.Module):
    """Value network: one hidden ReLU layer followed by a single linear output unit.

    Args:
        in_size: Number of input features after flattening.
        hidden_size: Width of the hidden layer.
        dropout_prob: Probability used by both dropout layers.
    """

    def __init__(self, in_size, hidden_size, dropout_prob):
        super().__init__()
        # Hidden stage.
        self.lin1 = nn.Linear(in_size, hidden_size)
        self.dropout1 = nn.Dropout(dropout_prob)
        # Scalar output stage; dropout is applied to its result as well.
        self.lin2 = nn.Linear(hidden_size, 1)
        self.dropout2 = nn.Dropout(dropout_prob)

    def forward(self, x):
        """Flatten every sample to a vector and return a (batch, 1) tensor of estimates."""
        batch = x.size(0)
        flat = x.view(batch, -1)
        hidden = self.dropout1(F.relu(self.lin1(flat)))
        estimate = self.lin2(hidden)
        return self.dropout2(estimate)

def init_weights(m):
    """Initialise one module in place; intended for use with ``Module.apply``.

    Modules whose class name contains 'Conv' or 'Linear' get their weight drawn
    uniformly from ``[-b, b]`` with ``b = sqrt(6 / (fan_in + fan_out))`` (the
    Glorot/Xavier uniform bound) and their bias set to zero. Every other module
    is left untouched.

    Layers created with ``bias=False`` are supported: their missing bias is
    skipped instead of raising ``AttributeError``.

    Args:
        m: The module to initialise.
    """
    classname = m.__class__.__name__
    if classname.find('Conv') != -1:
        weight_shape = list(m.weight.data.size())
        # Weight layout is (out_channels, in_channels, *kernel); fans include the kernel area.
        fan_in = np.prod(weight_shape[1:4])
        fan_out = np.prod(weight_shape[2:4]) * weight_shape[0]
    elif classname.find('Linear') != -1:
        weight_shape = list(m.weight.data.size())
        # Weight layout is (out_features, in_features).
        fan_in = weight_shape[1]
        fan_out = weight_shape[0]
    else:
        return

    w_bound = np.sqrt(6. / (fan_in + fan_out))
    m.weight.data.uniform_(-w_bound, w_bound)
    # The bias attribute is None when the layer was built with bias=False.
    if getattr(m, 'bias', None) is not None:
        m.bias.data.fill_(0)

In [None]:
# Build the policy network and then the value network; Module.apply returns the
# module itself, so construction and weight initialisation are chained.
actor = ACTOR_MLP(input_size, output_size, hidden_layer, dropout).apply(init_weights)
critic = CRITIC_MLP(input_size, hidden_layer, dropout).apply(init_weights)

# Each network has its own Adam optimizer; both use the same learning rate.
actor_optimizer = optim.Adam(actor.parameters(), lr=lr)
critic_optimizer = optim.Adam(critic.parameters(), lr=lr)

In [None]:
def mean_std_groups(x, y, group_size):
    """Summarise two aligned 1-D arrays in consecutive groups of ``group_size`` items.

    The number of full groups is derived from ``len(x)`` and used for both
    arrays. Items left over after the last full group form one extra, shorter
    group whose mean and standard deviation are appended to the result.

    Args:
        x: NumPy array of x values.
        y: NumPy array of y values.
        group_size: Number of consecutive items per full group.

    Returns:
        Tuple ``(x_means, x_stds, y_means, y_stds)`` of 1-D arrays.
    """
    n_groups = int(len(x) / group_size)
    split_at = group_size * n_groups

    def _group_stats(series):
        # Full groups become rows of a 2-D view; the remainder is handled separately.
        grouped = series[:split_at].reshape((n_groups, group_size))
        remainder = series[split_at:]

        means = grouped.mean(axis=1)
        stds = grouped.std(axis=1)
        if len(remainder) > 0:
            means = np.concatenate([means, remainder.mean(axis=0, keepdims=True)])
            stds = np.concatenate([stds, remainder.std(axis=0, keepdims=True)])
        return means, stds

    x_means, x_stds = _group_stats(x)
    y_means, y_stds = _group_stats(y)
    return x_means, x_stds, y_means, y_stds

In [None]:
def make_state(indices):
    """One-hot encode one discrete state index per worker.

    Args:
        indices: Sequence with one column index per worker; entry ``i`` selects
            the column set to 1 in row ``i``.

    Returns:
        A float32 tensor of shape ``(num_processes, input_size)`` that is zero
        everywhere except at the selected column of each row.
    """
    one_hot = torch.zeros([num_processes, input_size], dtype=torch.float32)
    rows = list(range(num_processes))
    one_hot[rows, indices] = 1
    return one_hot

In [None]:
def process_rollout(steps):
    """Turn a rollout into batched training tensors.

    Args:
        steps: List of ``(rewards, masks, actions, policies, values)`` tuples,
            one per environment step, followed by a final entry of which only
            the fifth element (the value estimates after the last step) is used.

    Returns:
        A lazy iterable yielding five tensors — actions, policies, values,
        returns, advantages — each concatenated over time along dimension 0.
        Returns are discounted with ``gamma`` and bootstrapped from the final
        value estimates; advantages follow the Generalized Advantage Estimation
        recursion with ``gamma * lambd``. ``masks`` multiplies the carried-over
        terms at every step.
    """
    horizon = len(steps) - 1

    # The trailing entry only contributes the bootstrap value estimates.
    returns = steps[-1][4].data
    advantages = torch.zeros(num_processes, 1)

    out = []
    # Walk backwards in time so each step can reuse the running return/advantage.
    for t in range(horizon - 1, -1, -1):
        rewards, masks, actions, policies, values = steps[t]
        next_values = steps[t + 1][4]

        returns = rewards + returns * gamma * masks

        deltas = rewards + next_values.data * gamma * masks - values.data
        advantages = advantages * gamma * lambd * masks + deltas

        out.append((actions, policies, values, returns, advantages))

    # Entries were collected last-to-first; restore chronological order.
    out.reverse()

    return map(lambda field: torch.cat(field, 0), zip(*out))

In [None]:
# A2C training loop: collect `rollout_steps` steps from every worker, then do one
# actor update and one critic update on the batched rollout.
state_idx = envs.reset()

# Points for the progress plot: total step count and reward of every finished episode.
total_steps_plt = []
ep_reward_plt = []

steps = []
total_steps = 0
ep_rewards = [0.] * num_processes  # running reward of the episode in progress, per worker
render_timer = 0
plot_timer = 0
try:
    while total_steps < num_steps:
        for _ in range(rollout_steps):
            obs = make_state(state_idx)

            # network forward pass; `policies` holds raw scores of shape (workers, actions)
            policies = actor(obs)
            # dim=1 is spelled out: relying on the implicit dimension is deprecated
            probs = F.softmax(policies, dim=1)
            actions = probs.multinomial(1).data

            values = critic(obs)

            # gather env data, reset done envs and update their obs
            state_idx, rewards, dones, _ = envs.step(actions.squeeze(-1).cpu().numpy())
            # mask is 0 where an episode just ended, so nothing is carried across episodes
            masks = (1. - torch.from_numpy(np.array(dones, dtype=np.float32))).unsqueeze(1)

            total_steps += num_processes
            for i, done in enumerate(dones):
                ep_rewards[i] += rewards[i]
                if done:
                    total_steps_plt.append(total_steps)
                    ep_reward_plt.append(ep_rewards[i])
                    ep_rewards[i] = 0

            plot_timer += num_processes # time on total steps
            # Skip plotting until at least one episode has finished; otherwise
            # y_means would be empty and y_means[-1] would raise IndexError.
            if plot_timer >= 100000 and ep_reward_plt:
                clear_output()
                x_means, _, y_means, y_stds = mean_std_groups(np.array(total_steps_plt), np.array(ep_reward_plt), 10)
                fig = plt.figure()
                fig.set_size_inches(8, 6)
                plt.ticklabel_format(axis='x', style='sci', scilimits=(-2, 6))
                plt.errorbar(x_means, y_means, yerr=y_stds, ecolor='xkcd:blue', fmt='xkcd:black', capsize=5, elinewidth=1.5, mew=1.5, linewidth=1.5)
                plt.title('Training progress (%s)' % 'Taxi-v2')
                plt.xlabel('Total steps')
                plt.ylabel('Episode reward')
                plt.show()
                plot_timer = 0

                print('Mean: ' + str(y_means[-1]) + ', Std: ' + str(y_stds[-1]))

            rewards = torch.from_numpy(rewards).float().unsqueeze(1)

            steps.append((rewards, masks, actions, policies, values))

        # Value estimates for the state after the rollout, used only for bootstrapping.
        final_obs = make_state(state_idx)
        final_values = critic(final_obs)
        steps.append((None, None, None, None, final_values))

        actions, policies, values, returns, advantages = process_rollout(steps)

        # calculate action probabilities
        probs = F.softmax(policies, dim=1)
        log_probs = F.log_softmax(policies, dim=1)
        log_action_probs = log_probs.gather(1, actions.detach())

        policy_loss = (-log_action_probs * advantages.detach()).sum()
        value_loss = (.5 * (values - returns.detach()) ** 2.).sum()
        # sum of p * log p, i.e. the negative entropy; minimising it raises entropy
        entropy_loss = (log_probs * probs).sum()

        # linear interpolation from entropy_start to entropy_end, clamped at entropy_end
        entropy_coeff = max(entropy_end, entropy_start * (1 - total_steps * 1.0 / entropy_decay_steps) + entropy_end * total_steps * 1.0 / entropy_decay_steps)

        actor_loss = policy_loss + entropy_loss * entropy_coeff
        actor_loss.backward()
        actor_optimizer.step()
        actor_optimizer.zero_grad()

        critic_loss = value_loss * value_coeff
        critic_loss.backward()
        critic_optimizer.step()
        critic_optimizer.zero_grad()

        steps = []
finally:
    # Always shut the vectorised environments down, including after an interrupt
    # or an error. The environments must be rebuilt before this cell is run again.
    envs.close()

In [None]:
# torch.save(actor.state_dict(), 'a2c')

In [None]:
# Restore the actor's weights from the file 'a2c' (path relative to the working directory).
# NOTE(review): the only torch.save call for this file is commented out in the cell above,
# so this cell presumably relies on a checkpoint written by an earlier session — confirm.
actor.load_state_dict(torch.load('a2c'))

In [None]:
# Play a single rendered episode with the current actor.
# The environment must report a discrete state index, because the observation is
# used directly as the position of the 1 in a one-hot vector; 'Taxi-v2' matches
# the title of the training plot. ('Acrobot-v1' returns a float vector, which
# cannot be used as that index.)
eval_env = gym.make('Taxi-v2')
eval_state_idx = eval_env.reset()
# Inference only: no autograd graph is needed.
with torch.no_grad():
    while True:
        eval_env.render()
        eval_state = torch.zeros([input_size], dtype=torch.float32)
        eval_state[eval_state_idx] = 1

        # unsqueeze adds the batch dimension the actor expects
        policies = actor(eval_state.unsqueeze(0))
        # dim=1 is spelled out: relying on the implicit dimension is deprecated
        probs = F.softmax(policies, dim=1)
        actions = probs.multinomial(1).data
        eval_state_idx, reward, done, _ = eval_env.step(actions.item())

        if done:
            break
# Show the terminal state as well.
eval_env.render()

In [None]:
# Estimate the actor's performance: play 1000 episodes on `eval_env`, sampling
# actions from the policy, and report mean and standard deviation of the episode reward.
rews = []
# Inference only: no autograd graph is needed.
with torch.no_grad():
    for _ in range(1000):
        eval_state_idx = eval_env.reset()
        rew = 0
        while True:
            # one-hot encode the discrete state index
            eval_state = torch.zeros([input_size], dtype=torch.float32)
            eval_state[eval_state_idx] = 1

            policies = actor(eval_state.unsqueeze(0))
            # dim=1 is spelled out: relying on the implicit dimension is deprecated
            probs = F.softmax(policies, dim=1)
            actions = probs.multinomial(1).data

            eval_state_idx, reward, done, _ = eval_env.step(actions.item())

            rew += reward

            if done:
                break
        rews.append(rew)
print('Average reward: ' + str(np.mean(rews)))
print('STD: ' + str(np.std(rews)))