In [None]:
!pip install gym pyvirtualdisplay > /dev/null 2>&1
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1
!apt-get update > /dev/null 2>&1
!apt-get install cmake > /dev/null 2>&1
!pip install — upgrade setuptools 2>&1
!pip install ez_setup > /dev/null 2>&1
!pip install gym[atari] > /dev/null 2>&1
!pip install pyvirtualdisplay
!pip install piglet
!apt-get install python-opengl -y
!apt install xvfb -y

In [None]:
# Select TensorFlow 2.x when the Colab-only magic is available; any failure is
# ignored so the cell can still run to the %load_ext line.
try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass

# Load the TensorBoard notebook extension
%load_ext tensorboard

In [None]:
!pip install tensorboardX

In [None]:

from tensorboardX import SummaryWriter 

In [None]:
import numpy as np
import gym
from gym import logger as gymlogger
gymlogger.set_level(40) #error only
from gym.wrappers import Monitor
from itertools import count

import random
import matplotlib
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical
from torch.autograd import Variable

%matplotlib inline
import math
import glob
import io
import os
import base64
from IPython.display import HTML
from IPython import display as ipythondisplay
from pyvirtualdisplay import Display
# Start a non-visible (visible=0) 1400x900 pyvirtualdisplay screen — presumably so
# the gym video recorder can render without a physical display; confirm.
# The name `display` is taken by this object, which is why IPython's display
# module is imported as `ipythondisplay` above.
display = Display(visible=0, size=(1400, 900))
display.start()


In [None]:
"""
Utility functions to enable video recording of gym environment and displaying it
To enable video, just do "env = wrap_env(env)"
"""
def show_video():
  """Embed one recorded ``video/*.mp4`` clip in the notebook output.

  The matching files are sorted by name so the same clip is chosen on every
  run (glob order is otherwise unspecified). The clip is inlined as a
  base64 ``<video>`` element. When no clip exists a message is printed
  instead.
  """
  mp4list = sorted(glob.glob('video/*.mp4'))
  if len(mp4list) > 0:
    mp4 = mp4list[0]
    # Read-only binary mode inside a context manager: the file is only read,
    # and the handle is closed even if reading fails.
    with io.open(mp4, 'rb') as video_file:
      video = video_file.read()
    encoded = base64.b64encode(video)
    ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))
  else: 
    print("Could not find video")

def wrap_env(env):
  """Return ``env`` wrapped in a gym ``Monitor`` that records into ``./video``.

  ``force=True`` is passed to the monitor, as in every call site of this
  notebook.
  """
  monitored = Monitor(env, './video', force=True)
  return monitored
    

## Random agent

In [None]:
# Random agent: play up to 300 uniformly random actions (ids 1..3) in a
# video-recorded Pong environment, then display the recording.
env = wrap_env(gym.make('Pong-v0'))
observation = env.reset()
new_observation = observation
done = False
for _ in range(300):
    new_observation, reward, done, info = env.step(random.randint(1,3))
    if done:
        # A finished episode must be reset before it can be stepped again;
        # stop here instead of stepping a terminated environment.
        break

env.close()
show_video()

In [None]:
# Mount Google Drive at /content/drive (Colab only); the checkpoint directory
# configured in the next cell lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Directory passed as `path` to every Trainer below; checkpoints are saved to and
# restored from it.
data_path = "/content/drive/My Drive/PongPolicyGrad"

## Policy Gradient model

In [None]:
class Policy_full_connect(nn.Module):
    """Fully connected policy network over flattened 80x80 frames.

    Architecture: ``Linear(n_frames*80*80, 200) -> ReLU -> Linear(200, 3)``
    followed by a softmax over the 3 action logits.

    Attributes:
        n_frames: number of stacked frames fed to the network at once.
        saved_log_probs: log-probabilities of the actions sampled while
            ``train=True``; consumed and cleared by the trainer.
        rewards: per-step rewards appended by the trainer.
    """

    def __init__(self, n_frames=1):
        super(Policy_full_connect, self).__init__()
        self.n_frames=n_frames
        self.lin1 = nn.Linear(self.n_frames*80*80, 200)
        self.lin2 = nn.Linear(200, 3)

        self.saved_log_probs = []
        self.rewards = []

    def forward(self, input):
        """Return action probabilities of shape ``(batch, 1, 3)``.

        Args:
            input: tensor whose first axis is the batch and whose remaining
                axes hold ``n_frames * 80 * 80`` values in total, e.g.
                ``(batch, n_frames, 6400)``.
        """
        # Collapse all frames of a sample into one feature vector. Without this
        # a (batch, n_frames, 6400) input with n_frames > 1 does not match
        # lin1's in_features. The singleton middle axis keeps the output shape
        # the single-frame path has always produced.
        x = input.reshape(input.size(0), 1, -1)
        x = F.relu(self.lin1(x))
        output = self.lin2(x)
        # Normalise over the action axis (the last one).
        return F.softmax(output, dim=-1)

    def select_action(self, state, train=True, return_probs=False):
        """Sample an action for ``state`` from the current policy.

        Args:
            state: numpy array of stacked, flattened frames; a batch axis is
                prepended before the forward pass.
            train: when True, the log-probability of the sampled action is
                appended to ``saved_log_probs``.
            return_probs: when True, return ``(probs, action)`` instead of just
                the action.

        Returns:
            The first entry (along the batch axis) of the sampled action tensor,
            optionally preceded by the probability tensor.
        """
        # Run on whatever device the parameters live on instead of assuming CUDA.
        device = next(self.parameters()).device
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        probs = self(state)
        m = Categorical(probs)
        action = m.sample()
        if train:
            self.saved_log_probs.append(m.log_prob(action))
        if return_probs:
            return probs, action.data[0]
        return action.data[0]

In [None]:
class Policy_conv(nn.Module):
    """Convolutional policy network over stacked 80x80 frames.

    Three ``Conv2d(kernel_size=5, stride=2) -> BatchNorm2d -> ReLU`` stages
    (16, 32, 32 channels) feed a linear head with 3 outputs. The head expects
    1568 features, i.e. 32 channels of 7x7 for an 80x80 input.

    Attributes:
        n_frames: number of stacked frames, used as the input channel count.
        saved_log_probs: log-probabilities of the actions sampled while
            ``train=True``; consumed and cleared by the trainer.
        rewards: per-step rewards appended by the trainer.
    """

    def __init__(self, n_frames=1):
        super(Policy_conv, self).__init__()
        self.n_frames=n_frames
        self.conv1 = nn.Conv2d(self.n_frames, 16, kernel_size=5, stride=2)
        self.bn1 = nn.BatchNorm2d(16)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=5, stride=2)
        self.bn2 = nn.BatchNorm2d(32)
        self.conv3 = nn.Conv2d(32, 32, kernel_size=5, stride=2)
        self.bn3 = nn.BatchNorm2d(32)
        self.head = nn.Linear(1568, 3)

        self.saved_log_probs = []
        self.rewards = []

    def forward(self, input):
        """Return action probabilities of shape ``(batch, 3)``.

        Args:
            input: tensor of shape ``(batch, n_frames, 80, 80)``.
        """
        x = F.relu(self.bn1((self.conv1(input))))
        x = F.relu(self.bn2((self.conv2(x))))
        x = F.relu(self.bn3((self.conv3(x))))
        # Flatten the feature maps per sample; the softmax turns the 3 logits
        # into the probabilities that select_action samples from.
        output = F.softmax(self.head(x.view(x.size(0), -1)), dim=1)
        return output

    def select_action(self, state, train=True, return_probs=False):
        """Sample an action for ``state`` from the current policy.

        Args:
            state: numpy array of shape ``(n_frames, 80, 80)``; a batch axis is
                prepended before the forward pass.
            train: when True, the log-probability of the sampled action is
                appended to ``saved_log_probs``.
            return_probs: when True, return ``(probs, action)`` instead of just
                the action.

        Returns:
            The first entry (along the batch axis) of the sampled action tensor,
            optionally preceded by the probability tensor.
        """
        # Run on whatever device the parameters live on instead of assuming CUDA.
        device = next(self.parameters()).device
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        probs = self(state)
        m = Categorical(probs)
        action = m.sample()
        if train:
            self.saved_log_probs.append(m.log_prob(action))
        if return_probs:
            return probs, action.data[0]
        return action.data[0]

In [None]:
class Trainer():
    """Policy-gradient training loop with checkpointing for a gym environment.

    The trainer plays episodes with ``model.select_action``, stores per-step
    rewards on ``model.rewards``, and after each episode back-propagates the
    sum of ``-log_prob * normalised_discounted_return``. Gradients accumulate
    across episodes and the optimizer steps every ``batch_size`` episodes.

    Attributes:
        env: environment exposing ``reset()`` and ``step(action)``.
        model: policy exposing ``select_action``, ``n_frames``, ``rewards`` and
            ``saved_log_probs``.
        optimizer: optimizer over the model's parameters.
        episode_nb: number of episodes played so far (restored from checkpoints).
        history: total reward of every finished episode.
        losses: loss value of every update.
        path / save_fn / restore_fn: checkpoint directory and file names.
    """

    def __init__(self, env, model, optimizer, path='', save_fn='log.pkl', restore_fn='log.pkl'):
        self.env = env
        self.model = model
        self.optimizer = optimizer

        self.episode_nb = 0
        self.history = []
        self.losses = []

        self.path = path
        self.save_fn = save_fn
        self.restore_fn = restore_fn

    def _model_device(self):
        """Return the device of the model's first parameter (CPU if it has none)."""
        parameters = getattr(self.model, 'parameters', None)
        first = next(iter(parameters()), None) if callable(parameters) else None
        return first.device if first is not None else torch.device('cpu')

    def preprocess(self, I):
        """Turn a 210x160x3 frame into a binary 80x80 image (or 6400-vector).

        Rows 35..194 are kept, every second pixel of channel 0 is taken, the
        values 144 and 109 (presumably the two background colours — confirm)
        become 0 and everything else becomes 1. The input frame is left
        untouched.

        Returns:
            float64 array of shape ``(80, 80)`` when the model's class name
            contains ``'conv'``, otherwise the same data flattened to ``(6400,)``.
        """
        # Work on a copy: the slices are views, so masking them in place would
        # also overwrite the caller's frame.
        I = np.asarray(I)[35:195:2, ::2, 0].copy()
        I[I == 144] = 0
        I[I == 109] = 0
        I[I != 0 ] = 1
        # np.float was removed in NumPy 1.24; float64 is the type it aliased.
        res = I.astype(np.float64)
        return res if 'conv' in str(self.model.__class__) else res.ravel()

    def restore(self):
        """Load model, optimizer and training logs from ``path/restore_fn`` if present."""
        restore_path = os.path.join(self.path, self.restore_fn)
        if os.path.isfile(restore_path):
            print(f"Load Policy Network parametets from {restore_path}")
            # map_location lets a checkpoint written on another device load onto
            # the device the model currently lives on.
            # NOTE: torch.load unpickles the file — only load trusted checkpoints.
            state = torch.load(restore_path, map_location=self._model_device())
            self.model.load_state_dict(state['state_dict'])
            self.optimizer.load_state_dict(state['optimizer'])
            self.episode_nb = state['episode']
            self.history = state['history']
            self.losses = state['losses']
        else:
            print('There is no checkpoint to restore!')

    def save(self):
        """Write model, optimizer and training logs to ``path/save_fn``."""
        state = {
            'episode': self.episode_nb,
            'state_dict': self.model.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'history': self.history,
            'losses': self.losses
        }
        if self.path:
            # Create the checkpoint directory on first use.
            os.makedirs(self.path, exist_ok=True)
        save_path = os.path.join(self.path, self.save_fn)
        torch.save(state, save_path)
        print(f"Saved model parametets to {save_path}")

    def update(self, batch_size, gamma=0.99):
        """Back-propagate the policy-gradient loss of the episode just played.

        Args:
            batch_size: the optimizer steps (and gradients are zeroed) only when
                ``episode_nb`` is a multiple of this value; in between,
                gradients accumulate.
            gamma: discount factor applied when accumulating returns.
        """
        if not self.model.rewards or not self.model.saved_log_probs:
            # Nothing was collected; there is no loss to build.
            del self.model.rewards[:]
            del self.model.saved_log_probs[:]
            return

        # Discounted returns, accumulated back to front and then reversed
        # (appending is linear, unlike inserting at the head of the list).
        R = 0
        returns = []
        for r in reversed(self.model.rewards):
            R = r + gamma * R
            returns.append(R)
        returns.reverse()

        # turn rewards to pytorch tensor and standardize
        rewards = torch.Tensor(returns)
        # The sample std of a single value is NaN; treat it as zero spread.
        spread = rewards.std() if rewards.numel() > 1 else torch.zeros(())
        rewards = (rewards - rewards.mean()) / (spread + np.finfo(np.float32).eps)

        loss = [-log_prob * reward
                for log_prob, reward in zip(self.model.saved_log_probs, rewards)]

        loss = torch.cat(loss).sum()
        loss.backward()
        if self.episode_nb % batch_size == 0:
                print('ep %d: policy network parameters updating...' % (self.episode_nb))
                self.optimizer.step()
                self.optimizer.zero_grad()

        self.losses.append(loss.item())
        # clean rewards and saved_actions
        del self.model.rewards[:]
        del self.model.saved_log_probs[:]

    def train(self, batch_size = 10, save_frequency = 50, max_episodes=None):
        """Restore the latest checkpoint and train.

        Args:
            batch_size: number of episodes whose gradients are accumulated
                before each optimizer step.
            save_frequency: a checkpoint is written whenever ``episode_nb`` is a
                multiple of this value.
            max_episodes: number of episodes to play in this call; ``None``
                (default) trains until interrupted.
        """
        self.restore()
        running_reward = None
        reward_sum = 0
        episodes_played = 0
        while max_episodes is None or episodes_played < max_episodes:
            episodes_played += 1
            self.episode_nb+=1
            state = self.env.reset()
            latest_states = []
            for t in range(20000):
                state = self.preprocess(state)
                # Sliding window of the last n_frames frames: drop the oldest
                # once full; on the first step pad with copies of the frame.
                if len(latest_states)==self.model.n_frames:
                    latest_states.pop(0)

                while len(latest_states)<self.model.n_frames:
                    latest_states.append(state)

                action = self.model.select_action(np.array(latest_states))
                # The policy outputs 0..2; the environment is driven with
                # action ids 1..3. Pass a plain int, not a tensor.
                action = int(action) + 1

                state, reward, done, _ = self.env.step(action)
                reward_sum += reward

                self.model.rewards.append(reward)
                if done:
                    # tracking log
                    self.history.append(reward_sum)
                    running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
                    print('resetting env. episode reward total was %f. running mean: %f' % (reward_sum, running_reward))
                    reward_sum = 0
                    break

            # use policy gradient update model weights
            self.update(batch_size)

            # Save model
            if self.episode_nb % save_frequency == 0:
                print('Saving model ...')
                self.save()
            

## Training the model

In [None]:
# CNN 1-frame model
env = gym.make('Pong-v0')
policy = Policy_conv(n_frames=1).to('cuda')
# RMSprop's running-average decay rate is `alpha`. `weight_decay` is an L2 penalty,
# and 0.99 there would pull every weight strongly toward zero on each step.
# NOTE(review): Trainer.restore() loads the optimizer state from the checkpoint;
# confirm whether hyper-parameters stored there override the ones set here.
optimizer = optim.RMSprop(policy.parameters(), lr=1e-4, alpha=0.99)
trainer = Trainer(env, policy, optimizer, path=data_path)

In [None]:
# CNN 4-frame model
policy2 = Policy_conv(n_frames=4).to('cuda')
# RMSprop's running-average decay rate is `alpha`. `weight_decay` is an L2 penalty,
# and 0.99 there would pull every weight strongly toward zero on each step.
optimizer2 = optim.RMSprop(policy2.parameters(), lr=1e-4, alpha=0.99)
trainer2 = Trainer(env, policy2, optimizer2, path=data_path, restore_fn='4frames_conv_net_3100.pkl')

In [None]:
# FCN 1-frame model
policy3 = Policy_full_connect().to('cuda')
# RMSprop's running-average decay rate is `alpha`. `weight_decay` is an L2 penalty,
# and 0.99 there would pull every weight strongly toward zero on each step.
optimizer3 = optim.RMSprop(policy3.parameters(), lr=1e-4, alpha=0.99)
trainer3 = Trainer(env, policy3, optimizer3, path=data_path, restore_fn='full_connect_3020.pkl')

In [None]:
#trainer.train(batch_size=5)
#trainer2.train(batch_size=5)
#trainer3.train(batch_size=5)

In [None]:
# Load each trainer's checkpoint (weights, optimizer state, reward history) from
# data_path; a trainer whose file is missing prints a message and stays empty.
trainer.restore()
trainer2.restore()
trainer3.restore()

## Visualizations

In [None]:
# Smoothed episode-reward curves (50-episode moving average over the first 2950
# episodes) for the three models. The average is computed locally so this cell
# does not depend on a helper defined further down the notebook.
def _moving_average(values, window):
    """Return the length-`window` moving average of `values` (empty if too short)."""
    values = np.asarray(values, dtype=float).ravel()
    if values.size < window:
        return np.empty(0)
    return np.convolve(values, np.ones(window) / float(window), mode='valid')

for curve_label, run in (('CNN 1 frame', trainer),
                         ('FCN 1 frame', trainer3),
                         ('CNN 4 frames', trainer2)):
    plt.plot(_moving_average(run.history[:2950], 50), label=curve_label)
plt.xlabel('episode')
plt.ylabel('episode reward (50-episode mean)')
plt.legend();

In [None]:
# NOTE(review): `state` is not assigned at notebook level anywhere above this cell,
# so this fails on a fresh kernel — confirm where it should come from.
pr_state = trainer.preprocess(state)

In [None]:
# Show every preprocessed frame side by side in one row.
# NOTE(review): `pr_states` is first assigned in a later cell ("Visualize feature
# maps"), so this cell fails on Restart & Run All — confirm the intended order.
frames=pr_states
fig = plt.figure(figsize=(10,4))
for i in range(len(frames)):
    plt.subplot(1, len(frames), i+1)
    plt.imshow(frames[i])
    plt.axis('off')

plt.show()

In [None]:
def get_probs(trainer, states):
  """Return the policy's action probabilities for each raw frame in ``states``.

  Every frame is preprocessed with ``trainer.preprocess``, given a leading
  axis, and passed to ``trainer.model.select_action`` with ``train=False`` so
  that no log-probabilities are recorded.

  Args:
    trainer: object exposing ``preprocess`` and ``model.select_action``.
    states: iterable of raw frames. All of them are evaluated, not only the
      first four.

  Returns:
    List with one squeezed numpy probability array per frame.
  """
  probs = []
  for raw_state in states:
    frame = trainer.preprocess(raw_state)
    # The sampled action is not needed here, only the distribution.
    prob, _ = trainer.model.select_action(np.expand_dims(frame, axis=0), train=False, return_probs=True)
    probs.append(prob.cpu().detach().numpy().squeeze())
  return probs

In [None]:
# Action probabilities of the 1-frame CNN (trainer) and the FCN (trainer3) on the
# same frames.
# NOTE(review): `states` is not assigned anywhere in the visible notebook — confirm
# where the raw frames come from.
probs_cnn = get_probs(trainer, states)
probs_fcn = get_probs(trainer3, states)

In [None]:
def running_mean(x, N):
    """Return the moving average of ``x`` over windows of ``N`` consecutive values.

    A zero is prepended before taking the cumulative sum, so each window sum
    is the difference of two cumulative totals ``N`` positions apart. The
    result has ``len(x) - N + 1`` entries, and is empty when ``x`` is shorter
    than ``N``.
    """
    padded = np.insert(x, 0, 0)
    totals = np.cumsum(padded)
    window_sums = totals[N:] - totals[:-N]
    return window_sums / float(N)

In [None]:
import matplotlib as mpl

In [None]:
# Switch matplotlib to its 'default' style sheet for the figures that follow.
mpl.style.use('default')

In [None]:
# One bar chart per frame comparing the action probabilities of the 1-frame CNN
# and the FCN.
frames=pr_states
labels=['NOOP','UP','DOWN']
x = np.arange(len(labels))
width = 0.35

fig = plt.figure(figsize=(10,2.3))
for i in range(len(frames)):
    plt.subplot(1, len(frames), i+1)
    axes = plt.gca()
    # Pin the ticks to the bar-group centres before labelling them. Labelling
    # auto-generated ticks relies on where matplotlib happens to place them.
    axes.set_xticks(x)
    axes.set_xticklabels(labels)
    axes.set_ylim([0.,1.])

    plt.bar(x - width/2 ,height=probs_cnn[i], width=width, label='CNN 1 frame')
    plt.bar(x + width/2 ,height=probs_fcn[i], width=width, label='FCN 1 frame')

    if i == 0:
        # The bars carry labels; draw the legend once so they are visible.
        axes.legend(fontsize='small')

plt.show()

In [None]:
# Visualize feature maps
activation = {}
def get_activation(name):
    """Return a forward hook that stores the layer's detached output under ``name``."""
    def hook(model, input, output):
        activation[name] = output.detach()
    return hook

# NOTE(review): `states` is not assigned anywhere in the visible notebook — confirm
# where the raw frames come from.
pr_states= np.array([trainer.preprocess(s) for s in states])

# Hook all three conv layers: the feature-map figure below reads
# activation['conv1'], ['conv2'] and ['conv3'].
hook_handles = [
    getattr(trainer.model, layer_name).register_forward_hook(get_activation(layer_name))
    for layer_name in ('conv1', 'conv2', 'conv3')
]
try:
    trainer.model.select_action(np.expand_dims(pr_states[0], axis=0), train=False, return_probs=True)
finally:
    # Remove the hooks so re-running this cell does not stack duplicates.
    for handle in hook_handles:
        handle.remove()

act = activation['conv3'].squeeze()


In [None]:
# Shape of the squeezed conv3 activation captured in the previous cell.
act.shape

## Visualize CNN features

In [None]:
# Four panels in one row: the preprocessed input frame followed by channel 11 of
# the conv1, conv2 and conv3 activations stored in `activation`.
fig = plt.figure(figsize=(10,3))
for position, caption in enumerate(('input', 'conv1', 'conv2', 'conv3'), start=1):
    plt.subplot(1, 4, position)
    axes = plt.gca()
    axes.set_xlabel(caption)
    if caption == 'input':
        image = pr_states[0]
    else:
        image = activation[caption].squeeze().cpu().detach().numpy()[11]
    plt.imshow(image)

plt.show()


In [None]:
# tensorboardX writer with default arguments; the scalars logged below are read
# back by the %tensorboard cell, which points at the `runs` directory.
tb = SummaryWriter()

In [None]:
# Per-episode reward histories of the three trainers (1-frame CNN, 4-frame CNN,
# fully connected), aliased for the logging loop below.
his_cnn_1 = trainer.history
his_cnn_4 = trainer2.history
his_fc = trainer3.history


In [None]:
# Log the three reward histories under one TensorBoard chart. At most 2900
# episodes are written, and never more than the shortest history holds, so a
# shorter run no longer raises IndexError.
n_logged = min(2900, len(his_cnn_1), len(his_cnn_4), len(his_fc))
for i in range(n_logged):
    tb.add_scalars('Reward after episode', {'CNN 1 frame': his_cnn_1[i],
                              'CNN 4 frames': his_cnn_4[i],
                              'FCN 1 frame': his_fc[i]}, i)
tb.close()

In [None]:
# NOTE(review): hard-coded PID left over from an earlier session — presumably a
# stale TensorBoard process. On a fresh runtime this PID is arbitrary; confirm
# before running.
!kill 2583

In [None]:
# Open TensorBoard inline on the `runs` directory.
%tensorboard --logdir runs

In [None]:
# savefig() needs a target file; calling it with no argument raises TypeError.
# Saving through `fig` writes the most recently created figure, even after the
# inline backend has displayed it.
# NOTE(review): the file name is a placeholder — adjust as needed.
fig.savefig('pong_policy_gradient.png', bbox_inches='tight')

In [None]:
def show(threshold, steps=10000, trainer=None):
  """Play recorded Pong episodes until one scores at least ``threshold``.

  Each attempt runs in a fresh video-recorded environment for at most
  ``steps`` steps. Its total reward is printed and the environment is closed
  before the next attempt starts. The recording is displayed after the loop.

  Args:
    threshold: minimum total reward an attempt must reach to stop retrying.
    steps: maximum number of environment steps per attempt.
    trainer: trainer whose ``preprocess`` and ``model`` drive the agent;
      defaults to the notebook-level ``trainer3``.

  Returns:
    List of the raw frames observed during the last attempt (empty when no
    attempt was needed).
  """
  if trainer is None:
    trainer = trainer3
  obs = []
  running_reward = -21
  while running_reward < threshold:
    obs = []
    running_reward = 0
    show_env = wrap_env(gym.make('Pong-v0'))
    try:
      state = show_env.reset()
      for _ in range(steps):
        # Keep a copy so the stored frame cannot be altered by later
        # in-place processing of `state`.
        obs.append(state.copy())
        state = trainer.preprocess(state)

        action = trainer.model.select_action(np.expand_dims(state,axis=0),train=False)
        # The policy outputs 0..2; the environment is driven with ids 1..3.
        action = int(action) + 1

        state, reward, done, _ = show_env.step(action)
        running_reward += reward
        if done:
          break
    finally:
      # Close every attempt's environment, not only the last one.
      show_env.close()
    print(running_reward)

  show_video()
  return obs

In [None]:
# Replay until an episode totals at least -20, and keep the raw frames of that episode.
obs=show(-20)