In [None]:
# Gym environment id of the game to train on; also used to name the output folder.
ROM_NAME = "Breakout-v0"
# Size of the network's final layer (one Q-value per action).
N_OUTPUTS = 4 # Available via game.n_actions
# Run mode for the training cell: "start" trains from scratch,
# "load_from_disk" resumes from the files saved under the output folder.
MODE = "start"

In [None]:
import random
from collections import deque

import cv2
import gym
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Prefer the first CUDA GPU when one is visible to PyTorch; otherwise run on CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

In [None]:
import pathlib
from torch.utils.tensorboard import SummaryWriter

# Folder (relative to the working directory) where training artifacts for this
# game are written; it is created up front so later saves cannot fail on a
# missing directory.
output_folder = "./" + ROM_NAME
pathlib.Path(output_folder).mkdir(parents=True, exist_ok=True)
# NOTE(review): `writer` is not referenced again in the visible part of this
# notebook — confirm whether TensorBoard logging is still intended.
writer = SummaryWriter('runs/debug')

# Helper Functions for OpenAI Gym Retro

In [None]:
class State(object):
    """A stack of preprocessed frames that forms one observation for the agent.

    Attributes:
        n_frames: number of pseudo-frames held in the state.
        frame_size: target size handed to ``cv2.resize`` as ``dsize``.
        frames: ``None`` until ``create`` is called, afterwards a numpy array
            with one preprocessed (grayscale, scaled to [0, 1], resized) frame
            per entry, oldest first.
    """

    def __init__(self, n_frames=4, frame_size=(84, 84)):
        self.n_frames = n_frames
        self.frame_size = frame_size
        self.frames = None

    def create(self, buffer):
        """Build the state from a buffer of raw RGB frames.

        Args:
            buffer: indexable sequence holding at least ``n_frames + 1`` frames
                indexed as ``frame[:, :, channel]``, oldest first. The extra
                frame is needed because every pseudo-frame combines a frame
                with its predecessor.

        Returns:
            ``self``, with ``self.frames`` populated.

        Raises:
            IndexError: if ``buffer`` holds fewer than ``n_frames + 1`` frames.
        """
        required = self.n_frames + 1
        if len(buffer) < required:
            raise IndexError(
                f"State.create needs at least {required} frames, got {len(buffer)}"
            )

        processed = []
        for i in range(self.n_frames):
            # Each pseudo-frame is the element-wise maximum of a frame and the
            # previous frame in the buffer (this is done to handle flickering).
            rolling_max = np.max([buffer[i + 1], buffer[i]], axis=0)

            # Convert RGB to luminance.
            # Note: this assumes that the RGB values output by OpenAI Gym are
            # already linear.
            luminance = (
                0.2126 * rolling_max[:, :, 0]
                + 0.7152 * rolling_max[:, :, 1]
                + 0.0722 * rolling_max[:, :, 2]
            )

            # Normalize (assuming 8-bit channel values) and resize to the target size.
            luminance = luminance / 255.
            processed.append(cv2.resize(luminance, dsize=self.frame_size))

        self.frames = np.array(processed)
        return self

    def plot(self):
        """Show the frames of the state, newest first, on a two-column grid.

        The newest frame is titled ``t`` and each older frame ``t-k``, where
        ``k`` is its age in pseudo-frames. With the default four frames this
        is a 2 x 2 grid on a 20 x 20 figure.
        """
        n = len(self.frames)
        n_cols = 2
        n_rows = max(1, (n + 1) // 2)
        fig, axs = plt.subplots(n_rows, n_cols, figsize=(20, 10 * n_rows), squeeze=False)
        for age, ax in enumerate(axs.flat):
            if age >= n:
                # Odd number of frames: hide the unused trailing panel.
                ax.axis("off")
                continue
            ax.imshow(self.frames[n - 1 - age], cmap=plt.cm.binary)
            ax.set_title("t" if age == 0 else f"t-{age}")

In [None]:
class Game(object):
    """Thin wrapper around a Gym environment that remembers the latest frame.

    Attributes:
        env: the Gym environment, or ``None`` when no environment is open.
        frame: the most recent raw frame returned by ``reset``/``step``.
        n_actions: size of the discrete action space while an environment is
            open, ``None`` otherwise.
        name: Gym environment id used by ``create``.
    """

    def __init__(self, name=ROM_NAME):
        self.env = None
        self.frame = None
        self.n_actions = None
        self.name = name

    def create(self):
        """(Re)create the environment, closing any environment already open.

        Returns:
            ``self``, so calls can be chained.
        """
        if self.env is not None:
            self.close()
        self.env = gym.envs.make(self.name)
        self.frame = self.env.reset()
        self.n_actions = self.env.action_space.n
        return self

    def get_action_meanings(self):
        """Return the action names reported by the unwrapped environment."""
        return self.env.unwrapped.get_action_meanings()

    def sample(self):
        """Return a random action sampled from the environment's action space."""
        return self.env.action_space.sample()

    def step(self, action):
        """Advance the environment by one frame and remember that frame.

        Returns:
            The ``(frame, reward, done, info)`` tuple from ``env.step``.
        """
        frame, reward, done, info = self.env.step(action)
        self.frame = frame
        return frame, reward, done, info

    def step_state(self, action, n_frames=4, frame_size=(84, 84)):
        """Repeat ``action`` for up to ``n_frames`` frames and build a ``State``.

        The buffer handed to ``State.create`` starts with the frame that
        preceded this call, followed by one frame per repeat. If the episode
        ends part-way through, the environment is not stepped again (Gym
        documents stepping a finished episode as undefined); the last frame is
        repeated so the buffer still holds ``n_frames + 1`` frames.

        Returns:
            ``(state, reward, done, info)`` where ``reward`` is the sum of the
            per-frame rewards and ``done``/``info`` come from the last
            environment step that was taken.
        """
        buffer = deque()
        buffer.append(self.frame)

        reward = 0
        done = False
        info = {}
        for _ in range(n_frames):
            if done:
                # Episode already over: pad with the final frame instead of stepping.
                buffer.append(self.frame)
                continue
            frame, r, done, info = self.step(action)
            reward += r
            buffer.append(frame)
        return State(n_frames, frame_size).create(buffer), reward, done, info

    def reset(self):
        """Reset the environment and return (and remember) its first frame."""
        frame = self.env.reset()
        self.frame = frame
        return frame

    def close(self):
        """Close the environment and clear the cached state.

        Calling ``close`` when no environment is open is a no-op.
        """
        if self.env is None:
            return
        # NOTE(review): render() is called right before close(); the reason is
        # not visible here (possibly a workaround) — confirm before removing.
        self.env.render()
        self.env.close()
        self.env = None
        self.frame = None
        self.n_actions = None

Example of the output:

In [None]:
# Get game info
# Builds the environment once and tears it down again. Note that close()
# resets game.n_actions back to None, so read it before closing if needed.
game = Game().create()
game.close()

In [None]:
# NOTE(review): this module-level `buffer` is not used by the calls below;
# step_state builds its own frame buffer internally.
buffer = deque()
# Take one random action (repeated over the frame-skip) and plot the resulting state.
game.create()
action = game.sample()
state, _, _, _ = game.step_state(action)
state.plot()
game.close()

# Set Up Network

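The following architecture was taken from the Atari DQN paper. Note: an earlier version of this text said the frames were downsampled to 168 x 168, but the code in this notebook uses 84 x 84 frames (the `State` default), which is also what the `64 * 7 * 7` input size of the first linear layer assumes.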

In [None]:
# Number of stacked frames per state; used as the input channel count of the network.
N_FRAMES = 4

class model(nn.Module):
    """Convolutional Q-network: three conv layers followed by two linear layers.

    The input is a batch of states with ``N_FRAMES`` channels; the output has
    one value per action (``N_OUTPUTS``). The ``64 * 7 * 7`` flatten size is
    what the three convolutions produce for 84 x 84 inputs.
    """

    def __init__(self):
        super().__init__()
        self.conv_1 = nn.Conv2d(in_channels=N_FRAMES, out_channels=32, kernel_size=8, stride=4)
        self.conv_2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=2)
        self.conv_3 = nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1)
        self.linear_1 = nn.Linear(in_features=64 * 7 * 7, out_features=512)
        self.linear_2 = nn.Linear(in_features=512, out_features=N_OUTPUTS)

    def forward(self, x):
        """Return the network output for a batch ``x`` of stacked frames."""
        features = x
        for conv in (self.conv_1, self.conv_2, self.conv_3):
            features = F.relu(conv(features))
        # Flatten everything except the batch dimension.
        flat = features.view(features.size(0), -1)
        hidden = F.relu(self.linear_1(flat))
        return self.linear_2(hidden)

In [None]:
def create_model(device):
    """Build the half-precision network together with its loss and optimizer.

    Args:
        device: torch device the network is moved to.

    Returns:
        ``(net, criterion, opt)``: the network, a ``SmoothL1Loss`` criterion
        and an ``RMSprop`` optimizer over the network's parameters.
    """
    network = model().half().to(device)
    loss_fn = nn.SmoothL1Loss()
    rmsprop = optim.RMSprop(network.parameters(), lr=0.00025, momentum=0.95, eps=0.01)
    return network, loss_fn, rmsprop

# Prepopulate Replay Buffer

In [None]:
class ReplayBuffer(object):
    """Bounded FIFO store of ``(state, action, reward, next_state, done)`` tensors.

    Attributes:
        replay: ``None`` until the first ``add``/``populate``, afterwards a
            ``deque`` capped at ``max_len`` transitions (oldest dropped first).
        max_len: maximum number of transitions kept.
    """

    # DeepMind 2015 used last 1 million frames, which corresponds to 250,000 states for 4-frame states
    def __init__(self, max_len=250000):
        self.replay = None
        self.max_len = max_len

    def add(self, state, action, reward, next_state, done):
        """Convert one transition to tensors and append it to the buffer.

        Args:
            state, next_state: objects exposing a ``frames`` array.
            action: integer action index.
            reward: scalar reward.
            done: whether ``next_state`` ended the episode.
        """
        # Create the storage lazily so add() also works before populate().
        if self.replay is None:
            self.replay = deque(maxlen=self.max_len)

        state_tensor = torch.tensor(state.frames, dtype=torch.half)
        action_tensor = torch.tensor([action], dtype=torch.uint8)
        reward_tensor = torch.tensor([reward], dtype=torch.half)
        next_state_tensor = torch.tensor(next_state.frames, dtype=torch.half)
        done_tensor = torch.tensor([done], dtype=torch.uint8)

        self.replay.append((state_tensor, action_tensor, reward_tensor, next_state_tensor, done_tensor))

    def populate(self, game, n_states=12500, n_frames_per_state=4):
        """Reset the buffer and fill it with transitions from random play.

        Args:
            game: object providing ``create``, ``sample``, ``reset`` and
                ``step_state(action, n_frames=...)``.
            n_states: number of transitions to record.
            n_frames_per_state: frame-skip forwarded to ``game.step_state``.

        Returns:
            ``self``, so calls can be chained.
        """
        self.replay = deque(maxlen=self.max_len)
        game.create()

        # Create initial state
        state, _, _, _ = game.step_state(game.sample(), n_frames=n_frames_per_state)

        for step in range(n_states):
            action = game.sample()
            next_state, reward, done, _ = game.step_state(action, n_frames=n_frames_per_state)

            self.add(state, action, reward, next_state, done)

            if step % 1000 == 1:
                print(f"On step {step} of replay buffer")

            if done:
                # Start a fresh episode and build its first state, so that no
                # stored transition spans two different episodes.
                _ = game.reset()
                state, _, _, _ = game.step_state(game.sample(), n_frames=n_frames_per_state)
            else:
                state = next_state

        return self

    def create_mini_batch(self, batch_size=32):
        """Sample ``batch_size`` transitions without replacement and batch them.

        Returns:
            dict with keys ``"action"``, ``"reward"``, ``"done"`` (each of
            shape ``(batch_size,)``) and ``"state"``, ``"next_state"`` (the
            per-transition tensors stacked along a new first dimension).
        """
        mini_batch = random.sample(self.replay, batch_size)
        states, actions, rewards, next_states, dones = zip(*mini_batch)

        # The scalar fields are stored as shape-(1,) tensors; concatenating
        # them keeps the batch dimension even when batch_size == 1.
        tensors = dict()
        tensors["action"] = torch.cat(actions)
        tensors["reward"] = torch.cat(rewards)
        tensors["done"] = torch.cat(dones)
        tensors["state"] = torch.stack(states)
        tensors["next_state"] = torch.stack(next_states)

        return tensors

    def __getitem__(self, key):
        """Return the stored transition tuple at ``key``."""
        return self.replay[key]

    def __setitem__(self, key, value):
        """Replace the stored transition at ``key``."""
        self.replay[key] = value
        return self.replay[key]

In [None]:
# Fill a small replay buffer (1000 transitions rather than the 12500 default)
# with random play; populate() re-creates the game environment itself.
replay_buffer = ReplayBuffer().populate(game, n_states=1000)

In [None]:
# Disabled debugging aid, kept as a string literal: it would animate the frames
# of the states stored in `replay_buffer`.
# NOTE(review): the loop bound of 12500 exceeds the 1000 transitions populated
# above — adjust it before re-enabling.
'''
%matplotlib inline
import time
from IPython import display

frame_rate = 1./60.
for i in range(12500):
    state = replay_buffer.replay[i][0].float().numpy()
    for j in range(4):
        frame = state[j]
        plt.imshow(frame, cmap=plt.cm.binary)
        display.clear_output(wait=True)
        display.display(plt.gcf())
        print(f"i {i} j {j}")
        time.sleep(frame_rate)
'''

# Training Loop

In [None]:
import random
from time import time
import pickle as pkl

class QLearning(object):
    """Deep Q-learning training loop with its bookkeeping.

    Attributes:
        output_folder: folder that ``save`` writes to, or ``None`` to disable saving.
        max_steps_per_eps: maximum number of agent steps per episode.
        n_steps: total number of agent steps to train for.
        gamma: discount factor.
        n_frames_per_state: frame-skip passed to the game when building states.
        timestep: number of agent steps taken so far (across episodes).
        epoch: number of episodes finished so far.
        losses: loss value of every optimisation step.
        rewards_episode: total reward of every finished episode.
    """

    def __init__(self, output_folder=None, max_steps_per_eps=50000, n_steps=50000000, gamma=0.99, n_frames_per_state=4):
        self.output_folder = output_folder
        self.max_steps_per_eps = max_steps_per_eps
        self.n_steps = n_steps
        self.gamma = gamma
        self.n_frames_per_state = n_frames_per_state

        self.timestep = 0
        self.epoch = 0
        self.losses = []
        self.rewards_episode = []

    def save(self, game, net, criterion, optimizer, replay, save_replay=False, suffix=""):
        """Write the training state to ``self.output_folder``.

        Pickles this object and ``game``, and saves the state dicts of ``net``,
        ``criterion`` and ``optimizer``. The replay buffer is only pickled when
        ``save_replay`` is true.

        Raises:
            ValueError: if ``self.output_folder`` is ``None``.
        """
        folder = self.output_folder
        if folder is None:
            raise ValueError("QLearning.save() requires output_folder to be set")

        with open(f"{folder}/training{suffix}.pkl", "wb") as training_file:
            pkl.dump(self, training_file)
        with open(f"{folder}/game{suffix}.pkl", "wb") as game_file:
            pkl.dump(game, game_file)
        with open(f"{folder}/net{suffix}.pth", "wb") as net_file:
            torch.save(net.state_dict(), net_file)
        with open(f"{folder}/criterion{suffix}.pth", "wb") as criterion_file:
            torch.save(criterion.state_dict(), criterion_file)
        with open(f"{folder}/optimizer{suffix}.pth", "wb") as optimizer_file:
            torch.save(optimizer.state_dict(), optimizer_file)
        # Saving the replay buffer via Pickle can lead to OOM, as Pickle creates
        # a copy of what it saves in memory.
        # What *should* be done is to not use Pickle at all, but as a stop-gap
        # measure, replay buffer saving is disabled by default.
        if save_replay:
            with open(f"{folder}/replay{suffix}.pkl", "wb") as replay_file:
                pkl.dump(replay, replay_file)

    def epsilon_schedule(self, step, n_steps=250000, eps_max=1.0, eps_min=0.1):
        """
        Linear anneal schedule

        Taken from Mnih et al. 2013

        Returns ``eps_max`` for ``step < 1``, ``eps_min`` for
        ``step >= n_steps`` and a linear interpolation in between.
        """
        if (step < 1):
            return eps_max
        # ">=" (rather than ">") also covers n_steps == 1, where the
        # interpolation below would divide by zero.
        if (step >= n_steps):
            return eps_min
        return (eps_min - eps_max) / (n_steps - 1) * (step - 1) + eps_max

    def train(self, game, net, criterion, optimizer, device, target_update_freq=10000, save_freq=500, replay=None, populate_states=12500, batch_size=32):
        """Run Q-learning until ``self.n_steps`` agent steps have been taken.

        Args:
            game: game wrapper providing ``create``, ``reset``, ``sample`` and ``step_state``.
            net: online Q-network (half precision), updated in place.
            criterion: loss applied to (predicted, target) Q-values.
            optimizer: optimizer over ``net``'s parameters.
            device: torch device the batches are moved to.
            target_update_freq: the target network is re-synced with ``net``
                whenever ``self.timestep`` is a multiple of this value.
            save_freq: save every ``save_freq`` episodes (only when
                ``self.output_folder`` is set).
            replay: existing replay buffer, or ``None`` to populate a new one
                with ``populate_states`` random transitions.
            batch_size: mini-batch size for each optimisation step.

        Returns:
            ``(losses, rewards_episode, replay)``.
        """
        if replay is None:
            replay = ReplayBuffer().populate(game, n_states=populate_states, n_frames_per_state=self.n_frames_per_state)
        game.create()

        # Set up the target network once. It is only re-synced every
        # target_update_freq steps below; rebuilding it per episode would
        # silently turn that into a per-episode sync.
        target_net = model()
        target_net.half()
        target_net.to(device)
        target_net.load_state_dict(net.state_dict())

        while (self.timestep < self.n_steps):
            time_episode = time()
            temp_time = time()

            reward_episode = 0
            time_act = 0
            time_replay = 0
            time_batch_create = 0
            time_batch_transfer = 0
            time_qs = 0
            time_train = 0
            time_target_update = 0

            _ = game.reset()
            # TODO: Should do a proper first step, not a random initialization
            action = game.sample()
            state, _, _, _ = game.step_state(action, n_frames=self.n_frames_per_state)

            time_init = time() - temp_time
            for step in range(self.max_steps_per_eps):
                temp_time = time()

                # Epsilon scheduling
                epsilon = self.epsilon_schedule(self.timestep)

                # Generate action (epsilon-greedy)
                if np.random.random() < epsilon:
                    action = game.sample()
                else:
                    # No gradients are needed for acting.
                    with torch.no_grad():
                        q_values = net(torch.tensor(state.frames, dtype=torch.half, device=device).unsqueeze(0))
                    # Move to CPU in single precision before argmax; the result
                    # is converted to a plain int so that it can be stored in
                    # the replay buffer and fed to OpenAI Gym.
                    action = int(torch.argmax(q_values.float().cpu()).item())

                # Run environment and create next state
                # To accelerate performance, we repeat the same action repeatedly
                # for a fixed number of steps
                next_state, reward, done, info = game.step_state(action, n_frames=self.n_frames_per_state)
                reward_episode += reward
                time_act += time() - temp_time

                temp_time = time()
                replay.add(state, action, reward, next_state, done)
                time_replay += time() - temp_time

                temp_time = time()
                mini_batch = replay.create_mini_batch(batch_size=batch_size)
                time_batch_create += time() - temp_time

                temp_time = time()
                state_batch = mini_batch["state"].to(device)
                action_batch = mini_batch["action"].to(device)
                reward_batch = mini_batch["reward"].to(device)
                next_state_batch = mini_batch["next_state"].to(device)
                done_batch = mini_batch["done"].to(device)
                time_batch_transfer += time() - temp_time

                temp_time = time()
                # Get model predicted q values
                cur_q = net(state_batch)
                # Get target-model predicted q values
                with torch.no_grad():
                    # Use single precision for the max over actions
                    next_q = target_net(next_state_batch).float()

                # Get the expected and target q-values. The explicit reshapes
                # keep the batch dimension intact even for a batch of one.
                action_index = action_batch.long().reshape(-1, 1)
                current_qval = cur_q.gather(dim=1, index=action_index).squeeze(dim=1)
                not_done = 1 - done_batch.reshape(-1)
                target_qval = reward_batch.reshape(-1) + self.gamma * (not_done * torch.max(next_q, dim=1)[0])
                # Convert back to half precision for optimizer
                target_qval = target_qval.half()
                time_qs += time() - temp_time

                # Train model
                temp_time = time()
                loss = criterion(current_qval, target_qval.detach())
                optimizer.zero_grad()
                loss.backward()
                # Perform gradient clipping. clamp_ (in place) is required:
                # the out-of-place clamp() returns a new tensor and would
                # leave the gradients untouched.
                for param in net.parameters():
                    if param.grad is not None:
                        param.grad.clamp_(-1, 1)
                optimizer.step()
                self.losses.append(loss.item())
                time_train += time() - temp_time

                temp_time = time()
                if (self.timestep % target_update_freq == 0):
                    target_net.load_state_dict(net.state_dict())
                time_target_update += time() - temp_time

                self.timestep += 1

                # Move to next iteration
                if done:
                    break
                state = next_state
            self.rewards_episode.append(reward_episode)
            time_episode = time() - time_episode
            print(f"Finished epoch {self.epoch}, total rewards {reward_episode}, total num steps {step}, epoch time {time_episode} s, final epsilon {epsilon}")
            print(f"- init time {time_init} s")
            print(f"- act time {time_act} s")
            print(f"- replay time {time_replay} s")
            print(f"- batch create time {time_batch_create} s")
            print(f"- batch transfer time {time_batch_transfer} s")
            print(f"- qs time {time_qs} s")
            print(f"- train time {time_train} s")
            print(f"- target update time {time_target_update} s")
            print(f"- replay buffer length {len(replay.replay)}")

            if (self.output_folder is not None):
                if self.epoch % save_freq == 0:
                    time_save = time()
                    self.save(game, net, criterion, optimizer, replay, suffix=f".latest")
                    time_save = time() - time_save
                    print(f"Finished saving epoch {self.epoch}, total time to save {time_save} s")

            self.epoch += 1

        # Final checkpoint, only when saving is enabled.
        if self.output_folder is not None:
            self.save(game, net, criterion, optimizer, replay, suffix=f".latest")

        return self.losses, self.rewards_episode, replay

In [None]:
def load_training(output_folder, device, load_replay=False, suffix=""):
    """Restore a training run from the files written by ``QLearning.save``.

    Args:
        output_folder: folder containing the saved files.
        device: torch device to load the network and optimizer state onto.
        load_replay: also unpickle the replay buffer (it must have been saved).
        suffix: file-name suffix used when saving (e.g. ``".latest"``).

    Returns:
        ``(training, game, net, criterion, optimizer, replay)``; ``replay`` is
        ``None`` unless ``load_replay`` is true.
    """
    net, criterion, optimizer = create_model(device)

    # SECURITY NOTE: pickle can execute arbitrary code while loading. Only load
    # files that this notebook wrote itself.
    with open(f"{output_folder}/training{suffix}.pkl", "rb") as training_file:
        training = pkl.load(training_file)
    with open(f"{output_folder}/game{suffix}.pkl", "rb") as game_file:
        game = pkl.load(game_file)

    # map_location lets a checkpoint written on a GPU be restored on a machine
    # without one (and vice versa).
    with open(f"{output_folder}/net{suffix}.pth", "rb") as net_file:
        net.load_state_dict(torch.load(net_file, map_location=device))
        net.to(device)
    with open(f"{output_folder}/criterion{suffix}.pth", "rb") as criterion_file:
        criterion.load_state_dict(torch.load(criterion_file, map_location=device))
    with open(f"{output_folder}/optimizer{suffix}.pth", "rb") as optimizer_file:
        optimizer.load_state_dict(torch.load(optimizer_file, map_location=device))
        # Make sure every tensor in the optimizer state lives on the target device.
        for state in optimizer.state.values():
            for k, v in state.items():
                if isinstance(v, torch.Tensor):
                    state[k] = v.to(device)

    replay = None
    if load_replay:
        with open(f"{output_folder}/replay{suffix}.pkl", "rb") as replay_file:
            replay = pkl.load(replay_file)

    return training, game, net, criterion, optimizer, replay

In [None]:
# Fail early on a mistyped MODE instead of hitting a NameError on `training`
# further down (and before an environment has been opened).
if MODE not in ("start", "load_from_disk"):
    raise ValueError(f"Unknown MODE {MODE!r}; expected 'start' or 'load_from_disk'")

game.create()
if MODE == "start":
    # Fresh run: new network, no replay buffer (train() will populate one).
    net, criterion, optimizer = create_model(device)
    replay = None
    training = QLearning(output_folder=output_folder)
elif MODE == "load_from_disk":
    # Resume from the most recent files written by QLearning.save().
    training, game, net, criterion, optimizer, replay = load_training(output_folder, device=device, suffix=".latest")
losses, rewards_episode, replay = training.train(game, net, criterion, optimizer, replay=replay, device=device)
game.close()

In [None]:
# Loss of every optimisation step, in the order the steps were taken.
plt.plot(losses)

# Evaluate Model

In [None]:
from time import sleep
from time import time
from gym import wrappers

def play_game(game, net, device, output_folder, n_steps=500, frame_rate=1./15.):
    """Play one game greedily with ``net`` while rendering it.

    Args:
        game: game wrapper providing ``create``, ``reset``, ``sample``,
            ``step_state``, ``env.render`` and ``close``.
        net: Q-network used to pick the action with the highest predicted value.
        device: torch device the input states are created on.
        output_folder: currently unused; kept for interface compatibility.
        n_steps: maximum number of agent steps, so that a game that never
            finishes cannot loop forever.
        frame_rate: seconds to sleep after each rendered step.

    Returns:
        The total reward collected during the game.
    """
    game.create()
    reward_game = 0
    try:
        _ = game.reset()

        # Pick a random action initially
        action = game.sample()
        state, _, _, _ = game.step_state(action)

        for _step in range(n_steps):
            # Acting needs no gradients; argmax is taken on CPU in single precision.
            with torch.no_grad():
                q_values = net(torch.tensor(state.frames, dtype=torch.half, device=device).unsqueeze(0))
            action = int(torch.argmax(q_values.float().cpu()).item())

            next_state, reward, done, info = game.step_state(action)
            game.env.render()
            sleep(frame_rate)

            reward_game += reward

            if done:
                break

            print(f"Reward: {reward_game}")
            print(f"Action: {action}")
            state = next_state
    finally:
        # Always release the environment, even if a step or render fails.
        game.close()
    return reward_game

In [None]:
# Reload only the network from the latest saved files and watch it play.
_, _, net, _, _, _ = load_training(output_folder, device, suffix=".latest")
game = Game()
# NOTE(review): the "videos/" argument is accepted by play_game but not used
# in its body, so no video is written by this call.
play_game(game, net, device, "videos/")

In [None]:
import gym
# Stand-alone rendering check, independent of the trained agent: run
# CartPole for 1000 steps with random actions.
env = gym.make('CartPole-v0')
env.reset()
for _ in range(1000):
    env.render()
    # NOTE(review): the `done` flag returned by step() is ignored, so the
    # environment keeps being stepped after an episode ends.
    env.step(env.action_space.sample()) # take a random action
env.close()

In [None]:
from IPython.display import Video

# NOTE(review): hard-coded path to one specific recording; nothing in the
# visible notebook writes this file, so it only displays if the file exists.
Video("videos/1588787664.302215/openaigym.video.1.7715.video000000.mp4")