# Deep Q-Learning 

Install the dependencies OpenAI Gym needs to run properly (this shouldn't take more than a minute). If you are running on Google Cloud or locally, you only need to run this once. Colab may require reinstalling every time the VM shuts down.

In [1]:
!pip3 install gym pyvirtualdisplay
!sudo apt-get install -y xvfb python-opengl ffmpeg



'sudo' is not recognized as an internal or external command,
operable program or batch file.


In [2]:
!pip3 install --upgrade setuptools
!pip3 install ez_setup 
!pip3 install gym[atari] 

Requirement already up-to-date: setuptools in c:\users\computer\anaconda3\lib\site-packages (50.3.2)


For this assignment we will implement the Deep Q-Learning algorithm with Experience Replay as described in the breakthrough paper __"Playing Atari with Deep Reinforcement Learning"__. We will train an agent to play the famous game of __Breakout__.

In [1]:
# NOTE(review): this repeats the apt-get command from the first install cell.
# It only works on Linux hosts with sudo/apt available; elsewhere the shell
# reports that 'sudo' is not a recognized command.
!sudo apt-get install -y xvfb python-opengl ffmpeg

'sudo' is not recognized as an internal or external command,
operable program or batch file.


In [1]:
%matplotlib inline

import os
import random
import sys
from collections import deque
from copy import deepcopy
from datetime import datetime

import gym
import matplotlib.pyplot as plt
import numpy as np
import pylab
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

from utils import find_max_lives, check_live, get_frame, get_init_state
from model import DQN
from config import *
# %load_ext autoreload
# %autoreload 2

## Understanding the environment

In the following cell, we initialize our game of __Breakout__ so you can see what the environment looks like. For further documentation of the environment, refer to https://gym.openai.com/envs.

In Breakout, we will use 3 actions: "fire", "left", and "right". "fire" is only used to restart the game after a life is lost, "left" moves the agent left, and "right" moves the agent right.

In [2]:
# Create the Breakout environment and reset it to obtain the first observation.
env = gym.make('BreakoutDeterministic-v4')
state = env.reset()

In [3]:
# Number of lives at the start of a game, as reported by the find_max_lives
# helper from utils.py (how it is determined is not visible here — see utils.py).
number_lives = find_max_lives(env)
# Shape of a raw observation as declared by the environment.
state_size = env.observation_space.shape
# The agent chooses among 3 actions. The loops further down call
# env.step(action + 1), so agent action 0 maps to env action 1 and env action 0
# is never issued.
action_size = 3 #fire, left, and right

## Creating a DQN Agent

Here we create a DQN Agent. This agent is defined in __agent.py__. The corresponding neural network is defined in __model.py__. Once you've created a working DQN agent, use the code in agent.py to create a double DQN agent in __agent_double.py__. Set the flag "double_dqn" to True to train the double DQN agent.

__Evaluation Reward__ : The average reward received in the past 100 episodes/games.

__Frame__ : Number of frames processed in total.

__Memory Size__ : The current size of the replay memory.

In [4]:
# Bookkeeping shared with the training loop.
frame = 0        # total number of frames processed so far
memory_size = 0
# Scores of the most recent episodes; the deque drops the oldest entry once
# evaluation_reward_length scores have been collected.
evaluation_reward = deque(maxlen=evaluation_reward_length)

# Switch to True to train the double DQN agent instead of the plain DQN agent.
double_dqn = False

# Both modules expose a class called Agent; pick the implementation to use.
if not double_dqn:
    from agent import Agent
else:
    from agent_double import Agent

agent = Agent(action_size)

### Main Training Loop

In this training loop, we do not render the screen because it slows down training significantly. To watch the agent play the game, run the code in the next section, "Visualize Agent Performance".

In [None]:
# The plot and the checkpoints are written to these directories; create them up
# front so the first finished episode does not fail on a missing directory.
os.makedirs("./save_graph", exist_ok=True)
os.makedirs("./save_model", exist_ok=True)

rewards, episodes = [], []
best_eval_reward = 0
for e in range(EPISODES):
    done = False
    score = 0

    # Rolling buffer of five 84x84 uint8 frames: slots 0-3 are what the agent
    # sees, slot 4 receives the newest frame before the buffer is shifted.
    history = np.zeros([5, 84, 84], dtype=np.uint8)
    step = 0
    state = env.reset()
    next_state = state
    life = number_lives

    get_init_state(history, state)

    while not done:
        step += 1
        frame += 1

        # Perform a fire action if ball is no longer on screen to continue onto next life.
        # "No longer on screen" is detected as two consecutive raw observations
        # being identical in their first 189 entries along axis 0.
        # np.array_equal is True only when every element matches; the previous
        # np.unique-based test was also True when every element differed and
        # sorted the whole frame on each step.
        if step > 1 and np.array_equal(next_state[:189], state[:189]):
            action = 0
        else:
            action = agent.get_action(np.float32(history[:4, :, :]) / 255.)
        state = next_state
        # Agent actions 0..2 are shifted by one, so env action 0 is never sent.
        next_state, reward, done, info = env.step(action + 1)

        frame_next_state = get_frame(next_state)
        history[4, :, :] = frame_next_state
        terminal_state = check_live(life, info['ale.lives'])

        life = info['ale.lives']
        # Reward stored for learning is clipped to [-1, 1]; the raw reward is
        # still used for the reported score below. (The clipped value used to be
        # overwritten by the raw reward on the next line, so no clipping took
        # place.)
        r = np.clip(reward, -1, 1)

        # Store the transition in memory
        agent.memory.push(deepcopy(frame_next_state), action, r, terminal_state)
        # Start training after random sample generation
        if frame >= train_frame:
            agent.train_policy_net(frame)
            # The target network is only synchronised for the double DQN agent.
            if double_dqn and (frame % update_target_network_frequency) == 0:
                agent.update_target_net()
        score += reward
        # Shift the buffer by one so the newest frame becomes part of the input.
        history[:4, :, :] = history[1:, :, :]

        if done:
            evaluation_reward.append(score)
            mean_eval_reward = np.mean(evaluation_reward)
            rewards.append(mean_eval_reward)
            episodes.append(e)

            # Clear the figure first: without this every episode adds another
            # line to the same axes and each savefig redraws all of them.
            pylab.clf()
            pylab.plot(episodes, rewards, 'b')
            pylab.xlabel('Episodes')
            pylab.ylabel('Rewards')
            pylab.title('Episodes vs Reward')
            pylab.savefig("./save_graph/breakout_dqn.png") # save graph for training visualization

            # Report progress once per finished episode.
            print("episode:", e, "  score:", score, "  memory length:",
                  len(agent.memory), "  epsilon:", agent.epsilon, "   steps:", step,
                  "   lr:", agent.optimizer.param_groups[0]['lr'], "    evaluation reward:", mean_eval_reward)

            # if the mean of scores of last 100 episode is bigger than 5 save model
            ### Change this save condition to whatever you prefer ###
            if mean_eval_reward > 5 and mean_eval_reward > best_eval_reward:
                torch.save(agent.policy_net, "./save_model/breakout_dqn.pth")
                best_eval_reward = mean_eval_reward


episode: 0   score: 3.0   memory length: 247   epsilon: 1.0    steps: 247    lr: 0.0001     evaluation reward: 3.0
episode: 1   score: 2.0   memory length: 444   epsilon: 1.0    steps: 197    lr: 0.0001     evaluation reward: 2.5
episode: 2   score: 1.0   memory length: 616   epsilon: 1.0    steps: 172    lr: 0.0001     evaluation reward: 2.0
episode: 3   score: 5.0   memory length: 980   epsilon: 1.0    steps: 364    lr: 0.0001     evaluation reward: 2.75
episode: 4   score: 0.0   memory length: 1103   epsilon: 1.0    steps: 123    lr: 0.0001     evaluation reward: 2.2
episode: 5   score: 0.0   memory length: 1226   epsilon: 1.0    steps: 123    lr: 0.0001     evaluation reward: 1.8333333333333333
episode: 6   score: 1.0   memory length: 1376   epsilon: 1.0    steps: 150    lr: 0.0001     evaluation reward: 1.7142857142857142
episode: 7   score: 0.0   memory length: 1498   epsilon: 1.0    steps: 122    lr: 0.0001     evaluation reward: 1.5
episode: 8   score: 2.0   memory length: 1716

episode: 64   score: 2.0   memory length: 12099   epsilon: 1.0    steps: 217    lr: 0.0001     evaluation reward: 1.6
episode: 65   score: 1.0   memory length: 12250   epsilon: 1.0    steps: 151    lr: 0.0001     evaluation reward: 1.5909090909090908
episode: 66   score: 0.0   memory length: 12373   epsilon: 1.0    steps: 123    lr: 0.0001     evaluation reward: 1.5671641791044777
episode: 67   score: 2.0   memory length: 12571   epsilon: 1.0    steps: 198    lr: 0.0001     evaluation reward: 1.5735294117647058
episode: 68   score: 2.0   memory length: 12788   epsilon: 1.0    steps: 217    lr: 0.0001     evaluation reward: 1.5797101449275361
episode: 69   score: 5.0   memory length: 13137   epsilon: 1.0    steps: 349    lr: 0.0001     evaluation reward: 1.6285714285714286
episode: 70   score: 0.0   memory length: 13260   epsilon: 1.0    steps: 123    lr: 0.0001     evaluation reward: 1.6056338028169015
episode: 71   score: 1.0   memory length: 13430   epsilon: 1.0    steps: 170    lr: 

episode: 129   score: 2.0   memory length: 23404   epsilon: 1.0    steps: 198    lr: 0.0001     evaluation reward: 1.39
episode: 130   score: 1.0   memory length: 23573   epsilon: 1.0    steps: 169    lr: 0.0001     evaluation reward: 1.37
episode: 131   score: 1.0   memory length: 23743   epsilon: 1.0    steps: 170    lr: 0.0001     evaluation reward: 1.37
episode: 132   score: 0.0   memory length: 23866   epsilon: 1.0    steps: 123    lr: 0.0001     evaluation reward: 1.34
episode: 133   score: 0.0   memory length: 23989   epsilon: 1.0    steps: 123    lr: 0.0001     evaluation reward: 1.32
episode: 134   score: 0.0   memory length: 24111   epsilon: 1.0    steps: 122    lr: 0.0001     evaluation reward: 1.31
episode: 135   score: 0.0   memory length: 24234   epsilon: 1.0    steps: 123    lr: 0.0001     evaluation reward: 1.28
episode: 136   score: 2.0   memory length: 24432   epsilon: 1.0    steps: 198    lr: 0.0001     evaluation reward: 1.27
episode: 137   score: 2.0   memory lengt

episode: 198   score: 2.0   memory length: 35515   epsilon: 1.0    steps: 199    lr: 0.0001     evaluation reward: 1.35
episode: 199   score: 0.0   memory length: 35638   epsilon: 1.0    steps: 123    lr: 0.0001     evaluation reward: 1.34
episode: 200   score: 0.0   memory length: 35760   epsilon: 1.0    steps: 122    lr: 0.0001     evaluation reward: 1.3
episode: 201   score: 2.0   memory length: 35978   epsilon: 1.0    steps: 218    lr: 0.0001     evaluation reward: 1.31
episode: 202   score: 1.0   memory length: 36149   epsilon: 1.0    steps: 171    lr: 0.0001     evaluation reward: 1.32
episode: 203   score: 1.0   memory length: 36300   epsilon: 1.0    steps: 151    lr: 0.0001     evaluation reward: 1.31
episode: 204   score: 4.0   memory length: 36596   epsilon: 1.0    steps: 296    lr: 0.0001     evaluation reward: 1.33
episode: 205   score: 2.0   memory length: 36813   epsilon: 1.0    steps: 217    lr: 0.0001     evaluation reward: 1.33
episode: 206   score: 2.0   memory length

episode: 267   score: 1.0   memory length: 47879   epsilon: 1.0    steps: 170    lr: 0.0001     evaluation reward: 1.41
episode: 268   score: 0.0   memory length: 48001   epsilon: 1.0    steps: 122    lr: 0.0001     evaluation reward: 1.4
episode: 269   score: 0.0   memory length: 48124   epsilon: 1.0    steps: 123    lr: 0.0001     evaluation reward: 1.4
episode: 270   score: 3.0   memory length: 48370   epsilon: 1.0    steps: 246    lr: 0.0001     evaluation reward: 1.41
episode: 271   score: 0.0   memory length: 48493   epsilon: 1.0    steps: 123    lr: 0.0001     evaluation reward: 1.39
episode: 272   score: 1.0   memory length: 48662   epsilon: 1.0    steps: 169    lr: 0.0001     evaluation reward: 1.39
episode: 273   score: 1.0   memory length: 48813   epsilon: 1.0    steps: 151    lr: 0.0001     evaluation reward: 1.39
episode: 274   score: 3.0   memory length: 49059   epsilon: 1.0    steps: 246    lr: 0.0001     evaluation reward: 1.42
episode: 275   score: 2.0   memory length:

episode: 336   score: 1.0   memory length: 60263   epsilon: 1.0    steps: 168    lr: 0.0001     evaluation reward: 1.41
episode: 337   score: 1.0   memory length: 60435   epsilon: 1.0    steps: 172    lr: 0.0001     evaluation reward: 1.41
episode: 338   score: 1.0   memory length: 60606   epsilon: 1.0    steps: 171    lr: 0.0001     evaluation reward: 1.41
episode: 339   score: 1.0   memory length: 60757   epsilon: 1.0    steps: 151    lr: 0.0001     evaluation reward: 1.4
episode: 340   score: 3.0   memory length: 60983   epsilon: 1.0    steps: 226    lr: 0.0001     evaluation reward: 1.41
episode: 341   score: 0.0   memory length: 61106   epsilon: 1.0    steps: 123    lr: 0.0001     evaluation reward: 1.4
episode: 342   score: 2.0   memory length: 61303   epsilon: 1.0    steps: 197    lr: 0.0001     evaluation reward: 1.4
episode: 343   score: 1.0   memory length: 61472   epsilon: 1.0    steps: 169    lr: 0.0001     evaluation reward: 1.39
episode: 344   score: 1.0   memory length: 

episode: 405   score: 0.0   memory length: 72718   epsilon: 1.0    steps: 122    lr: 0.0001     evaluation reward: 1.36
episode: 406   score: 0.0   memory length: 72841   epsilon: 1.0    steps: 123    lr: 0.0001     evaluation reward: 1.35
episode: 407   score: 2.0   memory length: 73062   epsilon: 1.0    steps: 221    lr: 0.0001     evaluation reward: 1.35
episode: 408   score: 0.0   memory length: 73184   epsilon: 1.0    steps: 122    lr: 0.0001     evaluation reward: 1.32
episode: 409   score: 0.0   memory length: 73306   epsilon: 1.0    steps: 122    lr: 0.0001     evaluation reward: 1.32
episode: 410   score: 0.0   memory length: 73428   epsilon: 1.0    steps: 122    lr: 0.0001     evaluation reward: 1.31
episode: 411   score: 3.0   memory length: 73675   epsilon: 1.0    steps: 247    lr: 0.0001     evaluation reward: 1.31
episode: 412   score: 1.0   memory length: 73826   epsilon: 1.0    steps: 151    lr: 0.0001     evaluation reward: 1.3
episode: 413   score: 3.0   memory length

episode: 474   score: 3.0   memory length: 85593   epsilon: 1.0    steps: 246    lr: 0.0001     evaluation reward: 1.52
episode: 475   score: 4.0   memory length: 85886   epsilon: 1.0    steps: 293    lr: 0.0001     evaluation reward: 1.56
episode: 476   score: 2.0   memory length: 86103   epsilon: 1.0    steps: 217    lr: 0.0001     evaluation reward: 1.56
episode: 477   score: 0.0   memory length: 86226   epsilon: 1.0    steps: 123    lr: 0.0001     evaluation reward: 1.54
episode: 478   score: 1.0   memory length: 86396   epsilon: 1.0    steps: 170    lr: 0.0001     evaluation reward: 1.53
episode: 479   score: 0.0   memory length: 86519   epsilon: 1.0    steps: 123    lr: 0.0001     evaluation reward: 1.51
episode: 480   score: 0.0   memory length: 86642   epsilon: 1.0    steps: 123    lr: 0.0001     evaluation reward: 1.48
episode: 481   score: 3.0   memory length: 86888   epsilon: 1.0    steps: 246    lr: 0.0001     evaluation reward: 1.48
episode: 482   score: 3.0   memory lengt

episode: 543   score: 1.0   memory length: 98409   epsilon: 1.0    steps: 169    lr: 0.0001     evaluation reward: 1.6
episode: 544   score: 4.0   memory length: 98704   epsilon: 1.0    steps: 295    lr: 0.0001     evaluation reward: 1.61
episode: 545   score: 2.0   memory length: 98924   epsilon: 1.0    steps: 220    lr: 0.0001     evaluation reward: 1.63
episode: 546   score: 0.0   memory length: 99047   epsilon: 1.0    steps: 123    lr: 0.0001     evaluation reward: 1.62
episode: 547   score: 0.0   memory length: 99169   epsilon: 1.0    steps: 122    lr: 0.0001     evaluation reward: 1.61
episode: 548   score: 1.0   memory length: 99319   epsilon: 1.0    steps: 150    lr: 0.0001     evaluation reward: 1.62
episode: 549   score: 2.0   memory length: 99538   epsilon: 1.0    steps: 219    lr: 0.0001     evaluation reward: 1.64
episode: 550   score: 2.0   memory length: 99735   epsilon: 1.0    steps: 197    lr: 0.0001     evaluation reward: 1.66
episode: 551   score: 1.0   memory length

  sample = np.array(sample)
  mini_batch = np.array(mini_batch).transpose()


episode: 552   score: 1.0   memory length: 100055   epsilon: 0.9998891200000024    steps: 169    lr: 0.0001     evaluation reward: 1.64
episode: 553   score: 0.0   memory length: 100178   epsilon: 0.9996455800000077    steps: 123    lr: 0.0001     evaluation reward: 1.62
episode: 554   score: 4.0   memory length: 100474   epsilon: 0.9990595000000204    steps: 296    lr: 0.0001     evaluation reward: 1.65
episode: 555   score: 2.0   memory length: 100672   epsilon: 0.9986674600000289    steps: 198    lr: 0.0001     evaluation reward: 1.67
episode: 556   score: 0.0   memory length: 100794   epsilon: 0.9984259000000342    steps: 122    lr: 0.0001     evaluation reward: 1.65
episode: 557   score: 0.0   memory length: 100917   epsilon: 0.9981823600000395    steps: 123    lr: 0.0001     evaluation reward: 1.63
episode: 558   score: 5.0   memory length: 101244   epsilon: 0.9975349000000535    steps: 327    lr: 0.0001     evaluation reward: 1.65
episode: 559   score: 1.0   memory length: 10141

episode: 613   score: 1.0   memory length: 111080   epsilon: 0.9780596200004763    steps: 170    lr: 0.0001     evaluation reward: 1.47
episode: 614   score: 0.0   memory length: 111202   epsilon: 0.9778180600004815    steps: 122    lr: 0.0001     evaluation reward: 1.46
episode: 615   score: 0.0   memory length: 111325   epsilon: 0.9775745200004868    steps: 123    lr: 0.0001     evaluation reward: 1.45
episode: 616   score: 2.0   memory length: 111542   epsilon: 0.9771448600004962    steps: 217    lr: 0.0001     evaluation reward: 1.46
episode: 617   score: 3.0   memory length: 111790   epsilon: 0.9766538200005068    steps: 248    lr: 0.0001     evaluation reward: 1.48
episode: 618   score: 2.0   memory length: 111993   epsilon: 0.9762518800005155    steps: 203    lr: 0.0001     evaluation reward: 1.48
episode: 619   score: 0.0   memory length: 112115   epsilon: 0.9760103200005208    steps: 122    lr: 0.0001     evaluation reward: 1.48
episode: 620   score: 2.0   memory length: 11233

episode: 674   score: 1.0   memory length: 121972   epsilon: 0.9564934600009445    steps: 151    lr: 0.0001     evaluation reward: 1.34
episode: 675   score: 0.0   memory length: 122095   epsilon: 0.9562499200009498    steps: 123    lr: 0.0001     evaluation reward: 1.32
episode: 676   score: 1.0   memory length: 122266   epsilon: 0.9559113400009571    steps: 171    lr: 0.0001     evaluation reward: 1.31
episode: 677   score: 0.0   memory length: 122389   epsilon: 0.9556678000009624    steps: 123    lr: 0.0001     evaluation reward: 1.28
episode: 678   score: 1.0   memory length: 122558   epsilon: 0.9553331800009697    steps: 169    lr: 0.0001     evaluation reward: 1.24
episode: 679   score: 0.0   memory length: 122680   epsilon: 0.9550916200009749    steps: 122    lr: 0.0001     evaluation reward: 1.24
episode: 680   score: 0.0   memory length: 122803   epsilon: 0.9548480800009802    steps: 123    lr: 0.0001     evaluation reward: 1.23
episode: 681   score: 1.0   memory length: 12295

episode: 735   score: 0.0   memory length: 132324   epsilon: 0.9359965000013895    steps: 123    lr: 0.0001     evaluation reward: 1.2
episode: 736   score: 3.0   memory length: 132550   epsilon: 0.9355490200013992    steps: 226    lr: 0.0001     evaluation reward: 1.22
episode: 737   score: 3.0   memory length: 132782   epsilon: 0.9350896600014091    steps: 232    lr: 0.0001     evaluation reward: 1.23
episode: 738   score: 2.0   memory length: 133000   epsilon: 0.9346580200014185    steps: 218    lr: 0.0001     evaluation reward: 1.24
episode: 739   score: 1.0   memory length: 133151   epsilon: 0.934359040001425    steps: 151    lr: 0.0001     evaluation reward: 1.23
episode: 740   score: 0.0   memory length: 133274   epsilon: 0.9341155000014303    steps: 123    lr: 0.0001     evaluation reward: 1.21
episode: 741   score: 1.0   memory length: 133443   epsilon: 0.9337808800014376    steps: 169    lr: 0.0001     evaluation reward: 1.2
episode: 742   score: 0.0   memory length: 133566  

episode: 796   score: 0.0   memory length: 143306   epsilon: 0.9142521400018615    steps: 123    lr: 0.0001     evaluation reward: 1.41
episode: 797   score: 2.0   memory length: 143507   epsilon: 0.9138541600018701    steps: 201    lr: 0.0001     evaluation reward: 1.38
episode: 798   score: 0.0   memory length: 143629   epsilon: 0.9136126000018754    steps: 122    lr: 0.0001     evaluation reward: 1.35
episode: 799   score: 1.0   memory length: 143798   epsilon: 0.9132779800018826    steps: 169    lr: 0.0001     evaluation reward: 1.35
episode: 800   score: 3.0   memory length: 144026   epsilon: 0.9128265400018925    steps: 228    lr: 0.0001     evaluation reward: 1.35
episode: 801   score: 0.0   memory length: 144148   epsilon: 0.9125849800018977    steps: 122    lr: 0.0001     evaluation reward: 1.35
episode: 802   score: 3.0   memory length: 144395   epsilon: 0.9120959200019083    steps: 247    lr: 0.0001     evaluation reward: 1.38
episode: 803   score: 3.0   memory length: 14462

episode: 857   score: 3.0   memory length: 154604   epsilon: 0.8918821000023471    steps: 268    lr: 0.0001     evaluation reward: 1.56
episode: 858   score: 3.0   memory length: 154871   epsilon: 0.8913534400023586    steps: 267    lr: 0.0001     evaluation reward: 1.59
episode: 859   score: 1.0   memory length: 155042   epsilon: 0.891014860002366    steps: 171    lr: 0.0001     evaluation reward: 1.59
episode: 860   score: 4.0   memory length: 155303   epsilon: 0.8904980800023772    steps: 261    lr: 0.0001     evaluation reward: 1.62
episode: 861   score: 1.0   memory length: 155453   epsilon: 0.8902010800023836    steps: 150    lr: 0.0001     evaluation reward: 1.61
episode: 862   score: 1.0   memory length: 155622   epsilon: 0.8898664600023909    steps: 169    lr: 0.0001     evaluation reward: 1.62
episode: 863   score: 0.0   memory length: 155745   epsilon: 0.8896229200023962    steps: 123    lr: 0.0001     evaluation reward: 1.59
episode: 864   score: 2.0   memory length: 155924

episode: 918   score: 2.0   memory length: 165920   epsilon: 0.8694764200028335    steps: 217    lr: 0.0001     evaluation reward: 1.63
episode: 919   score: 2.0   memory length: 166118   epsilon: 0.869084380002842    steps: 198    lr: 0.0001     evaluation reward: 1.64
episode: 920   score: 0.0   memory length: 166240   epsilon: 0.8688428200028473    steps: 122    lr: 0.0001     evaluation reward: 1.63
episode: 921   score: 0.0   memory length: 166363   epsilon: 0.8685992800028526    steps: 123    lr: 0.0001     evaluation reward: 1.57
episode: 922   score: 4.0   memory length: 166622   epsilon: 0.8680864600028637    steps: 259    lr: 0.0001     evaluation reward: 1.6
episode: 923   score: 0.0   memory length: 166745   epsilon: 0.867842920002869    steps: 123    lr: 0.0001     evaluation reward: 1.6
episode: 924   score: 2.0   memory length: 166943   epsilon: 0.8674508800028775    steps: 198    lr: 0.0001     evaluation reward: 1.6
episode: 925   score: 1.0   memory length: 167112   e

episode: 979   score: 2.0   memory length: 177762   epsilon: 0.8460292600033426    steps: 198    lr: 0.0001     evaluation reward: 1.72
episode: 980   score: 4.0   memory length: 178016   epsilon: 0.8455263400033535    steps: 254    lr: 0.0001     evaluation reward: 1.75
episode: 981   score: 2.0   memory length: 178216   epsilon: 0.8451303400033621    steps: 200    lr: 0.0001     evaluation reward: 1.75
episode: 982   score: 2.0   memory length: 178414   epsilon: 0.8447383000033706    steps: 198    lr: 0.0001     evaluation reward: 1.77
episode: 983   score: 2.0   memory length: 178630   epsilon: 0.8443106200033799    steps: 216    lr: 0.0001     evaluation reward: 1.77
episode: 984   score: 3.0   memory length: 178857   epsilon: 0.8438611600033896    steps: 227    lr: 0.0001     evaluation reward: 1.77
episode: 985   score: 3.0   memory length: 179125   epsilon: 0.8433305200034011    steps: 268    lr: 0.0001     evaluation reward: 1.8
episode: 986   score: 3.0   memory length: 179352

episode: 1039   score: 4.0   memory length: 189263   epsilon: 0.8232572800038369    steps: 258    lr: 0.0001     evaluation reward: 1.84
episode: 1040   score: 3.0   memory length: 189510   epsilon: 0.8227682200038475    steps: 247    lr: 0.0001     evaluation reward: 1.86
episode: 1041   score: 5.0   memory length: 189808   epsilon: 0.8221781800038603    steps: 298    lr: 0.0001     evaluation reward: 1.89
episode: 1042   score: 1.0   memory length: 189976   epsilon: 0.8218455400038676    steps: 168    lr: 0.0001     evaluation reward: 1.9
episode: 1043   score: 0.0   memory length: 190099   epsilon: 0.8216020000038728    steps: 123    lr: 0.0001     evaluation reward: 1.9
episode: 1044   score: 3.0   memory length: 190346   epsilon: 0.8211129400038835    steps: 247    lr: 0.0001     evaluation reward: 1.92
episode: 1045   score: 1.0   memory length: 190497   epsilon: 0.82081396000389    steps: 151    lr: 0.0001     evaluation reward: 1.91
episode: 1046   score: 3.0   memory length: 1

episode: 1099   score: 0.0   memory length: 201366   epsilon: 0.7992933400043571    steps: 122    lr: 0.0001     evaluation reward: 1.87
episode: 1100   score: 0.0   memory length: 201488   epsilon: 0.7990517800043624    steps: 122    lr: 0.0001     evaluation reward: 1.87
episode: 1101   score: 0.0   memory length: 201611   epsilon: 0.7988082400043677    steps: 123    lr: 0.0001     evaluation reward: 1.86
episode: 1102   score: 3.0   memory length: 201858   epsilon: 0.7983191800043783    steps: 247    lr: 0.0001     evaluation reward: 1.87
episode: 1103   score: 1.0   memory length: 202009   epsilon: 0.7980202000043848    steps: 151    lr: 0.0001     evaluation reward: 1.86
episode: 1104   score: 3.0   memory length: 202256   epsilon: 0.7975311400043954    steps: 247    lr: 0.0001     evaluation reward: 1.86
episode: 1105   score: 1.0   memory length: 202407   epsilon: 0.7972321600044019    steps: 151    lr: 0.0001     evaluation reward: 1.85
episode: 1106   score: 1.0   memory lengt

episode: 1159   score: 2.0   memory length: 213943   epsilon: 0.7743908800048978    steps: 219    lr: 0.0001     evaluation reward: 2.09
episode: 1160   score: 2.0   memory length: 214140   epsilon: 0.7740008200049062    steps: 197    lr: 0.0001     evaluation reward: 2.1
episode: 1161   score: 0.0   memory length: 214262   epsilon: 0.7737592600049115    steps: 122    lr: 0.0001     evaluation reward: 2.08
episode: 1162   score: 0.0   memory length: 214385   epsilon: 0.7735157200049168    steps: 123    lr: 0.0001     evaluation reward: 2.08
episode: 1163   score: 1.0   memory length: 214554   epsilon: 0.773181100004924    steps: 169    lr: 0.0001     evaluation reward: 2.07
episode: 1164   score: 2.0   memory length: 214772   epsilon: 0.7727494600049334    steps: 218    lr: 0.0001     evaluation reward: 2.07
episode: 1165   score: 4.0   memory length: 215013   epsilon: 0.7722722800049437    steps: 241    lr: 0.0001     evaluation reward: 2.11
episode: 1166   score: 1.0   memory length:

# Visualize Agent Performance

BE AWARE THIS CODE BELOW MAY CRASH THE KERNEL IF YOU RUN THE SAME CELL TWICE.

Please save your model before running this portion of the code.

In [None]:
# torch.save does not create missing directories, so make sure the target
# directory exists before writing the checkpoint.
os.makedirs("./save_model", exist_ok=True)
torch.save(agent.policy_net, "./save_model/breakout_dqn_latest.pth")

In [None]:
from gym.wrappers import Monitor
import glob
import io
import base64

from IPython.display import HTML
from IPython import display as ipythondisplay

from pyvirtualdisplay import Display

# Displaying the game live
def show_state(env, step=0, info=""):
    """Draw the environment's current frame inline, replacing the previous output.

    Args:
        env: environment whose ``render(mode='rgb_array')`` result is shown.
        step: step counter included in the figure title.
        info: extra text appended to the figure title.
    """
    # Always reuse figure number 3 and wipe it before drawing the new frame.
    plt.figure(3)
    plt.clf()

    rgb_frame = env.render(mode='rgb_array')
    plt.imshow(rgb_frame)

    caption = "%s | Step: %d %s" % ("Agent Playing", step, info)
    plt.title(caption)
    plt.axis('off')

    # wait=True postpones clearing until the new output is ready to be shown.
    ipythondisplay.clear_output(wait=True)
    current_figure = plt.gcf()
    ipythondisplay.display(current_figure)
    
# Recording the game and replaying the game afterwards
def show_video():
    """Embed the first recorded ``video/*.mp4`` file in the notebook output.

    The file is read, base64-encoded and displayed through an HTML ``<video>``
    tag. If no matching file exists, "Could not find video" is printed instead.
    """
    mp4list = glob.glob('video/*.mp4')
    if not mp4list:
        print("Could not find video")
        return

    # Open read-only (the previous 'r+b' mode needlessly required write access)
    # and let the context manager close the handle, which was leaked before.
    with open(mp4list[0], 'rb') as video_file:
        encoded = base64.b64encode(video_file.read())

    ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))
    

def wrap_env(env):
    """Return ``env`` wrapped in a gym ``Monitor`` that writes to ``./video``.

    ``force=True`` is passed through to ``Monitor`` (presumably so an existing
    ``./video`` directory from an earlier run is reused — confirm in gym docs).
    """
    return Monitor(env, './video', force=True)

In [None]:
display = Display(visible=0, size=(300, 200))
display.start()

# Load agent
# agent.load_policy_net("./save_model/breakout_dqn.pth")

# Evaluation must not disturb training state: the exploration rate is restored
# afterwards, the recording environment gets its own name so the training `env`
# stays usable, and neither the global frame counter nor the replay memory is
# touched while the agent plays.
training_epsilon = agent.epsilon
eval_env = None
try:
    agent.epsilon = 0.0 # Set agent to only exploit the best action

    eval_env = wrap_env(gym.make('BreakoutDeterministic-v4'))

    done = False
    score = 0
    step = 0
    state = eval_env.reset()
    next_state = state
    history = np.zeros([5, 84, 84], dtype=np.uint8)
    get_init_state(history, state)

    while not done:

        # Render breakout
        eval_env.render()
    #     show_state(eval_env, step) # uncommenting this provides another way to visualize the game

        step += 1

        # Perform a fire action if ball is no longer on screen, detected as two
        # consecutive raw observations being identical in their first 189
        # entries along axis 0.
        if step > 1 and np.array_equal(next_state[:189], state[:189]):
            action = 0
        else:
            action = agent.get_action(np.float32(history[:4, :, :]) / 255.)
        state = next_state

        # Agent actions 0..2 are shifted by one, so env action 0 is never sent.
        next_state, reward, done, info = eval_env.step(action + 1)

        history[4, :, :] = get_frame(next_state)
        score += reward

        # Shift the buffer by one so the newest frame becomes part of the input.
        history[:4, :, :] = history[1:, :, :]
finally:
    # Release everything even if the episode fails part-way. The env is closed
    # before show_video() runs, matching the original close-then-show order.
    if eval_env is not None:
        eval_env.close()
    agent.epsilon = training_epsilon
    display.stop()

print("evaluation score:", score, "  steps:", step)
show_video()