# Setup

## Imports

In [1]:
from tqdm import tqdm_notebook

In [2]:
import gym
import numpy as np

PyTorch modules

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.autograd import Variable
from vai.torch.utils import cuda

## Create the Environment and Problem Dimensions

In [4]:
# Gym environment shared by the training and evaluation cells below.
# NOTE(review): going by the id, observations are the Atari console RAM rather
# than screen frames - confirm against the Gym registry.
env = gym.make('Pong-ram-v0')

In [5]:
# Size of the first axis of an observation; used as the input width of the
# policy's first linear layer.
n = env.observation_space.shape[0]
# Number of actions the policy scores at every time step (output channels of
# the policy's last layer). get_action maps the sampled index to environment
# action ids 2 and 3.
a = 2

# Create Model

In [17]:
class Policy(nn.Module):
    """Network mapping one observation to per-step action scores.

    A linear layer expands the ``n``-dimensional observation into a
    64-channel sequence of length 4. Two stride-2 transposed convolutions
    then grow that sequence to length 6 and finally 10 while reducing the
    channels to ``a``, so the output has shape ``(batch, a, 10)``: one raw
    score per action for each of 10 consecutive time steps.

    Reads the module-level globals ``n`` (observation size) and ``a``
    (number of actions) at construction time.
    """

    def __init__(self):
        super().__init__()
        # Observation -> 64 channels x 4 positions (reshaped in forward).
        self.fc = nn.Linear(in_features=n, out_features=64 * 4)
        self.bn1 = nn.BatchNorm1d(num_features=64)
        # Sequence length 4 -> 6.
        self.conv1 = nn.ConvTranspose1d(
            in_channels=64, out_channels=32, kernel_size=3, stride=2,
            padding=2, output_padding=1,
        )
        self.bn2 = nn.BatchNorm1d(num_features=32)
        # Sequence length 6 -> 10, one channel per action.
        self.conv2 = nn.ConvTranspose1d(
            in_channels=32, out_channels=a, kernel_size=3, stride=2,
            padding=2, output_padding=1,
        )

    def forward(self, x):
        """Return raw (unnormalised) action scores for a batch of observations.

        Args:
            x: float tensor of shape ``(batch, n)``.

        Returns:
            Tensor of shape ``(batch, a, 10)``.
        """
        hidden = self.fc(x).view(-1, 64, 4)
        hidden = F.relu(self.bn1(hidden))
        hidden = self.conv1(hidden)
        hidden = F.relu(self.bn2(hidden))
        return self.conv2(hidden)

In [41]:
def get_policy(scores):
    """Sample one action index per time step from raw policy scores.

    Args:
        scores: 2-D tensor laid out as ``(actions, time steps)``, i.e. the
            output of ``Policy`` with the batch dimension squeezed away.

    Returns:
        Integer tensor of shape ``(time steps, 1)``; row ``t`` holds the index
        of the action drawn for step ``t``.
    """
    # One categorical distribution per time step: put time on dim 0 and
    # normalise across the action dimension. The dim is given explicitly
    # because relying on softmax's implicit dimension is deprecated.
    probs = F.softmax(torch.transpose(scores, 0, 1), dim=1)
    # num_samples is passed explicitly: it is a required argument of
    # torch.multinomial in current PyTorch releases.
    return torch.multinomial(probs, 1)

In [8]:
def get_action(x):
    """Translate a sampled policy index into an environment action id.

    Index 1 becomes action 2; any other value becomes action 3.
    """
    if x == 1:
        return 2
    return 3

In [49]:
# Instantiate the policy network that the cells below train and evaluate.
# NOTE(review): cuda() comes from vai.torch.utils; presumably it moves the
# module to the GPU when one is available - confirm.
policy = cuda(Policy())

In [50]:
# Adam over all policy parameters, using the optimizer's default settings.
optimizer = optim.Adam(policy.parameters())

In [44]:
# Number of consecutive environment steps played from a single policy
# evaluation. The training cell builds gradients of shape (a, action_size) and
# backpropagates them through the policy output, so this has to match the
# length of the sequence the policy produces.
action_size = 10

In [53]:
# Training: every epoch plays one full episode. The policy is evaluated once
# per "roll" of `action_size` steps; after each roll its parameters are nudged
# according to the reward collected during that roll.
prog_bar = tqdm_notebook(range(1000))
for epoch in prog_bar:
    # Start from the observation the environment actually returns on reset
    # rather than from a random sample of the observation space.
    obs = env.reset()
    done = False

    # Evaluate the policy once and sample a roll of `action_size` actions.
    last_obs = cuda(torch.from_numpy(obs.astype(np.float32)))
    scores = policy(Variable(last_obs).unsqueeze(0)).squeeze(0)
    actions = get_policy(scores)
    # get_policy returns a (steps, 1) column. Flatten it so the roll pairs
    # element-wise with the step indices when the gradient is built below;
    # a 2-D index would broadcast against np.arange and fill whole rows.
    action_roll = actions.data.cpu().numpy().ravel()
    time_step = 0
    total_reward = 0  # reward collected during the current roll
    epoch_reward = 0  # reward collected during the whole episode
    while not done:
        if time_step == action_size:
            # The roll is over: build the gradient with respect to `scores`.
            # Entry (action taken at step t, t) is set to -total_reward, so an
            # optimizer step raises the scores of the chosen actions after a
            # rewarding roll and lowers them after a penalised one.
            grads = np.zeros((a, action_size), np.float32)
            grads[action_roll, np.arange(action_size)] = -total_reward

            if total_reward < 0:
                # Re-score the same observation (volatile: inference only) and
                # keep pushing away from the penalised roll for as long as a
                # fresh sample still reproduces it exactly.
                new_scores = policy(Variable(last_obs, volatile=True).unsqueeze(0)).squeeze(0)
                while (get_policy(new_scores) == actions).data.all():
                    optimizer.zero_grad()
                    # The graph of `scores` is reused on every iteration,
                    # hence retain_graph=True.
                    # NOTE(review): this backpropagates through a graph built
                    # before the parameter updates - confirm this is intended.
                    scores.backward(cuda(torch.from_numpy(grads)), retain_graph=True)
                    optimizer.step()
                    new_scores = policy(Variable(last_obs, volatile=True).unsqueeze(0)).squeeze(0)
                # Final backward pass with an all-zero gradient and without
                # retain_graph; presumably this only releases the retained
                # graph - TODO confirm. optimizer.step() still runs afterwards.
                grads = np.zeros((a, action_size), np.float32)
                optimizer.zero_grad()
                scores.backward(cuda(torch.from_numpy(grads)))
                optimizer.step()
            elif total_reward > 0:
                optimizer.zero_grad()
                scores.backward(cuda(torch.from_numpy(grads)))
                optimizer.step()

            # Plan the next roll from the most recent observation.
            last_obs = cuda(torch.from_numpy(obs.astype(np.float32)))
            scores = policy(Variable(last_obs).unsqueeze(0)).squeeze(0)
            actions = get_policy(scores)
            action_roll = actions.data.cpu().numpy().ravel()
            time_step = 0
            total_reward = 0

        #env.render()
        obs, r, done, _ = env.step(get_action(action_roll[time_step]))
        total_reward += r
        # Accumulate the per-step reward; adding the running roll total here
        # would count every reward once for each remaining step of the roll.
        epoch_reward += r

        time_step += 1

    # Show the episode's total reward on the progress bar.
    prog_bar.desc = str(int(epoch_reward))

In [None]:
# Evaluation: play one rendered episode with the current policy. Nothing is
# learned here; every forward pass uses volatile (inference-only) inputs.

# Start from the observation the environment actually returns on reset rather
# than from a random sample of the observation space.
obs = env.reset()
done = False

last_obs = cuda(torch.from_numpy(obs.astype(np.float32)))
scores = policy(Variable(last_obs, volatile=True).unsqueeze(0)).squeeze(0)
actions = get_policy(scores)
# Flatten the (steps, 1) sample column into one action index per step.
action_roll = actions.data.cpu().numpy().ravel()
time_step = 0
epoch_reward = 0  # total reward of the episode
while not done:
    if time_step == action_size:
        # The current roll is used up: plan the next `action_size` steps from
        # the most recent observation.
        last_obs = cuda(torch.from_numpy(obs.astype(np.float32)))
        scores = policy(Variable(last_obs, volatile=True).unsqueeze(0)).squeeze(0)
        actions = get_policy(scores)
        action_roll = actions.data.cpu().numpy().ravel()
        time_step = 0

    env.render()
    obs, r, done, _ = env.step(get_action(action_roll[time_step]))
    # Accumulate the per-step reward so the printed value is the episode's
    # actual total, not a sum of running per-roll totals.
    epoch_reward += r

    time_step += 1

print(epoch_reward)
# Close the render window.
env.render(close=True)