In [None]:
# Name of the game to run; passed to Game(rom=ROM) and used to name the output folder.
ROM = "Breakout-v0"
# Run mode read by the training cell: "start" builds a fresh model and QLearning
# run, "load_from_disk" resumes from the ".latest" files in the output folder.
MODE = "start"

In [None]:
import pickle as pkl
from collections import deque

import gym
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

# Run on the first CUDA GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

In [None]:
import pathlib
from torch.utils.tensorboard import SummaryWriter

# Checkpoints live in a folder named after the ROM; create it if it is missing.
output_folder = f"./{ROM}"
pathlib.Path(output_folder).mkdir(parents=True, exist_ok=True)

# TensorBoard event writer for this session.
writer = SummaryWriter('runs/debug')

# Helper Functions for OpenAI Gym Retro

In [None]:
from game import State, Game

Example of the output:

In [None]:
# Get game info
# Build a Game for the configured ROM, then close it again straight away.
# NOTE(review): `game` is bound to the return value of create(); later cells call
# game.create() on it, so create() presumably returns the Game itself — confirm.
game = Game(rom=ROM).create()
game.close()

In [None]:
# Take a single sampled action and plot the state that comes back.
# NOTE(review): `buffer` is not referenced by any other visible cell.
buffer = deque()
game.create()
action = game.sample()
# step_state returns a 4-tuple; only its first element (the state) is used here.
state, _, _, _ = game.step_state(action)
state.plot()
game.close()

# Set Up Network

The following architecture is taken from the DeepMind Atari DQN paper (Mnih et al.), except that the input frames are downsampled to 168 x 168 instead of the paper's 84 x 84.

In [None]:
from qlearning import AtariModel

In [None]:
def create_model(device):
    """Build the Q-network together with its loss function and optimizer.

    Args:
        device: torch device the network is moved to.

    Returns:
        Tuple ``(net, criterion, opt)``: a half-precision ``AtariModel`` on
        ``device``, a ``SmoothL1Loss`` criterion, and an ``RMSprop`` optimizer
        over the network's parameters.
    """
    # Module.half() and Module.to() both return the module itself, so the
    # precision change and the device move can be chained.
    net = AtariModel().half().to(device)
    criterion = nn.SmoothL1Loss()
    opt = optim.RMSprop(net.parameters(), lr=2.5e-4, momentum=0.95, eps=1e-2)
    return net, criterion, opt

# Prepopulate Replay Buffer

In [None]:
from game import ReplayBuffer

In [None]:
# Build a ReplayBuffer and ask it to populate itself from `game` with n_states=1000.
# NOTE(review): `game` was closed by the previous code cell; presumably populate()
# (re)creates the environment itself — confirm.
# NOTE(review): `replay_buffer` is not passed to training.train(...) in the training
# cell, which receives replay=None in "start" mode — confirm this is intended.
replay_buffer = ReplayBuffer().populate(game, n_states=1000)

In [None]:
# Disabled debugging cell: everything below is wrapped in a string literal, so
# none of it is executed. If re-enabled, it would page through the frames
# stored in replay_buffer.replay one image at a time.
# NOTE(review): the loop bound of 12500 is larger than the n_states=1000 used
# when the buffer is populated above — confirm the buffer length before
# re-enabling.
'''
%matplotlib inline
import time
from IPython import display

frame_rate = 1./60.
for i in range(12500):
    state = replay_buffer.replay[i][0].float().numpy()
    for j in range(4):
        frame = state[j]
        plt.imshow(frame, cmap=plt.cm.binary)
        display.clear_output(wait=True)
        display.display(plt.gcf())
        print(f"i {i} j {j}")
        time.sleep(frame_rate)
'''

# Training Loop

In [None]:
from qlearning import QLearning

In [None]:
def load_training(output_folder, device, load_replay=False, suffix=""):
    """Restore a saved training run from ``output_folder``.

    Reads ``training{suffix}.pkl``, ``game{suffix}.pkl``, ``net{suffix}.pth``,
    ``criterion{suffix}.pth`` and ``optimizer{suffix}.pth`` and, when
    ``load_replay`` is true, ``replay{suffix}.pkl``.

    Args:
        output_folder: Directory containing the saved files.
        device: torch device the model and optimizer state are placed on.
        load_replay: Whether to also load the pickled replay object.
        suffix: String inserted before each file extension (e.g. ``".latest"``).

    Returns:
        Tuple ``(training, game, net, criterion, optimizer, replay)``;
        ``replay`` is ``None`` unless ``load_replay`` is true.

    Raises:
        OSError: If one of the expected files cannot be opened.
    """
    net, criterion, optimizer = create_model(device)

    def _load_pickle(name):
        # SECURITY: unpickling can execute arbitrary code. Only point this at
        # files written by this project, never at untrusted input.
        with open(f"{output_folder}/{name}{suffix}.pkl", "rb") as pickle_file:
            return pkl.load(pickle_file)

    def _load_state(name):
        # map_location lets a checkpoint saved on a GPU be restored on a
        # machine where only the CPU (or a different GPU) is available.
        with open(f"{output_folder}/{name}{suffix}.pth", "rb") as state_file:
            return torch.load(state_file, map_location=device)

    training = _load_pickle("training")
    game = _load_pickle("game")

    net.load_state_dict(_load_state("net"))
    net.to(device)
    criterion.load_state_dict(_load_state("criterion"))
    optimizer.load_state_dict(_load_state("optimizer"))
    # Make sure every tensor held in the optimizer state sits on `device`.
    for state in optimizer.state.values():
        for k, v in state.items():
            if isinstance(v, torch.Tensor):
                state[k] = v.to(device)

    replay = _load_pickle("replay") if load_replay else None

    return training, game, net, criterion, optimizer, replay

In [None]:
import torch.nn as nn

# Fail early on an unsupported MODE: otherwise neither branch below runs and
# the cell dies later with a NameError on `training`.
if MODE not in ("start", "load_from_disk"):
    raise ValueError(f"Unknown MODE {MODE!r}; expected 'start' or 'load_from_disk'.")

game.create()
try:
    if MODE == "start":
        # Fresh run: new model, no replay data, new QLearning trainer.
        net, criterion, optimizer = create_model(device)
        replay = None
        training = QLearning(output_folder=output_folder)
    else:
        # Resume from the ".latest" files in output_folder.
        # NOTE(review): this rebinds `game` to the unpickled instance, so the
        # game created above is no longer referenced or closed — confirm that
        # this is intended.
        training, game, net, criterion, optimizer, replay = load_training(output_folder, device=device, suffix=".latest")
    losses, rewards_episode, replay = training.train(game, net, criterion, optimizer, replay=replay, device=device)
finally:
    # Release the environment even when training raises or is interrupted.
    game.close()

In [None]:
# Plot the loss values returned by training.train(...), in the order recorded.
fig, ax = plt.subplots()
ax.plot(losses)
# The trailing semicolon keeps the list repr returned by set() out of the output.
ax.set(title="Training loss", xlabel="Loss sample index", ylabel="Loss");

# Evaluate Model

In [None]:
from time import sleep
from time import time
from gym import wrappers

def play_game(game, net, device, output_folder, n_steps=500, frame_rate=1./15.):
    """Play one rendered episode, always taking the highest-valued action of ``net``.

    The first step uses a sampled action to obtain an initial state. After
    that, each step feeds ``state.frames`` to ``net`` as a half-precision
    batch of one and takes the argmax of the returned Q-values. The loop ends
    when ``game.step_state`` reports ``done``.

    Args:
        game: Object providing ``create``, ``reset``, ``sample``,
            ``step_state``, ``env.render`` and ``close``.
        net: Callable mapping a ``(1, ...)`` half-precision tensor to Q-values.
        device: torch device the input tensor is created on.
        output_folder: Currently unused; kept for interface compatibility.
        n_steps: Currently unused; kept for interface compatibility.
        frame_rate: Seconds to sleep after rendering each step.

    Returns:
        The sum of rewards collected after the initial sampled step.
    """
    game.create()
    try:
        game.reset()

        # Pick a random action initially
        action = game.sample()
        state, _, _, _ = game.step_state(action)
        reward_game = 0

        while True:
            # Inference only: a single forward pass with no autograd graph.
            with torch.no_grad():
                frames = torch.tensor(state.frames, dtype=torch.half, device=device)
                q_values = net(frames.unsqueeze(0))
            # Needs to reside on CPU to be fed to OpenAI Gym, and argmax doesn't
            # accept half precision
            action = int(torch.argmax(q_values.float().cpu()).item())

            next_state, reward, done, _ = game.step_state(action)
            game.env.render()
            sleep(frame_rate)

            reward_game += reward

            if done:
                break

            print(f"Reward: {reward_game}")
            print(f"Action: {action}")
            state = next_state
    finally:
        # Always release the environment, even if inference or rendering fails.
        game.close()
    return reward_game

In [None]:
# Only the network is needed from the latest checkpoint for evaluation.
_, _, net, _, _, _ = load_training(output_folder, device, suffix=".latest")
# Evaluate on the same ROM the checkpoint folder is named after, matching how
# the Game is constructed earlier in the notebook.
game = Game(rom=ROM)
play_game(game, net, device, "videos/")

In [None]:
import gym
# Smoke test for the local Gym install: render CartPole driven by random actions.
env = gym.make('CartPole-v0')
try:
    env.reset()
    for _ in range(1000):
        env.render()
        step_result = env.step(env.action_space.sample()) # take a random action
        # Classic Gym returns (obs, reward, done, info); newer releases return
        # (obs, reward, terminated, truncated, info). The flags that end an
        # episode are everything between the reward and the trailing info dict.
        if any(bool(flag) for flag in step_result[2:-1]):
            # Stepping a finished episode is undefined; start a new one instead.
            env.reset()
finally:
    # Close the render window even if a step raises.
    env.close()

In [None]:
from IPython.display import Video

# Embed a recorded episode in the notebook output.
# NOTE(review): this path looks specific to one recording run (timestamp-like
# directory and file name) — update it to point at an existing video before running.
Video("videos/1588787664.302215/openaigym.video.1.7715.video000000.mp4")