# Advanced Machine Learning Project: Atari Reinforcement Learning with DQN

# Libraries and Dependencies 

### Libraries

In [None]:
import gym
import numpy as np
import tensorflow as tf
from gym import logger as gymlogger
from gym.wrappers import Monitor

import base64
import glob
import gzip
import io
import json
import os
import random
import sys
import time
from datetime import datetime

import matplotlib.pyplot as plt

gymlogger.set_level(40)  # error only
%matplotlib inline

### External Dependencies: Google (Colab Drive), Tensorflow GPU and Video

In [None]:
# Set to True when the notebook runs on Google Colab; everything inside the
# `if IS_GOOGLE:` branch below is skipped otherwise.
IS_GOOGLE = False
# Folder prefix used for saved sessions, videos and plots (note the trailing slash:
# file names are appended to it by plain string concatenation).
SAVE_DIRECTORY_PATH = '/content/drive/My Drive/Colab Notebooks/aml_models/' if IS_GOOGLE else './aml_models/'
if IS_GOOGLE:
    from google.colab import drive

    # Mount Google Drive so SAVE_DIRECTORY_PATH points at persistent storage
    drive.mount('/content/drive')

    # Install rendering libraries (command output is discarded)
    !pip install gym pyvirtualdisplay > /dev/null 2>&1
    !apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1
    
    # Notebook display helpers; they are only imported in this Colab branch
    from IPython import display as ipythondisplay
    from IPython.display import HTML, clear_output
    from pyvirtualdisplay import Display

    # Install TF 2 and enable GPU
    # NOTE(review): tf was already imported above, so the versions printed here are
    # those of the module loaded before the reinstall - presumably a runtime restart
    # is needed for the new package to take effect; confirm.
    if "2." not in tf.__version__ or not tf.test.is_gpu_available():
        !pip uninstall tensorflow
        !pip install tensorflow-gpu
        print(f"Python version: {sys.version}")
        print(f"Tensorflow version: {tf.__version__}")

    # Report which GPU device (if any) TensorFlow can see
    device_name = tf.test.gpu_device_name()
    print('Found GPU at: {}'.format(device_name))
    print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

    # Create virtual display to send rendered frames to in Colab 
    display = Display(visible=0, size=(1400, 900))
    display.start()

# Utility Functions

### Timer Utility 
Responsible for timing model operations during training

In [None]:
class Timer:
    """Wall-clock stopwatch that prints how long a tagged operation took."""

    def __init__(self, tag):
        """Create the timer and start it right away under the label *tag*."""
        self.start(tag)

    def start(self, tag):
        """(Re)start the timer and replace the label used in the report."""
        self.tag = tag
        self.start_time = time.time()

    def stop(self):
        """Print the number of whole seconds elapsed since the last start."""
        elapsed_seconds = round(time.time() - self.start_time)
        print(f'{self.tag} - Execution time: {elapsed_seconds} seconds')

### Filehandler
Responsible for saving and restoring models during training

In [None]:
class FileHandler:
    """
    Saves and restores training sessions as gzip files below SAVE_DIRECTORY_PATH.

    A session is stored in two parts, each written twice (primary + backup copy):
      * ``<filename>.gz`` / ``<filename>_backup.gz``: JSON with training statistics,
        agent/environment parameters and model parameters.
      * ``<filename>_memory.gz`` / ``<filename>_memory_backup.gz``: the extracted
        replay memory written with ``np.save``.
    """

    def __init__(self, filename):
        """Remember *filename* and derive the session and memory path prefixes."""
        self.filename = filename
        self.timer = Timer(tag="")
        self.session_path = f'{SAVE_DIRECTORY_PATH}{self.filename}'
        self.memory_path = f'{SAVE_DIRECTORY_PATH}{self.filename}_memory'

    @staticmethod
    def __ensure_parent_directory(path):
        """Create the directory that will contain *path* if it does not exist yet."""
        directory = os.path.dirname(path)
        if directory:
            os.makedirs(directory, exist_ok=True)

    def __save(self, session, replay_memory):
        """Write the session dictionary and the replay memory to their files."""
        print(f'Saving session to {self.session_path}')
        self.__save_session_gzip(session, self.session_path)
        print(f'Saving memory to {self.memory_path}')
        self.__save_memory_gzip(replay_memory, self.memory_path)

    def __load(self):
        """Return ``(session, memory)`` read back from disk."""
        print(f'Fetching session from {self.session_path}')
        session = self.__load_session_gzip(self.session_path)
        print(f'Fetching memory from {self.memory_path}')
        memory = self.__load_memory_gzip(self.memory_path)
        return session, memory

    def __save_session_gzip(self, session, path):
        """Serialize *session* to compact JSON and write primary and backup archives."""
        data = json.dumps(session, separators=(',', ':'))
        # Without this the very first save fails when the save folder is missing.
        self.__ensure_parent_directory(path)
        for target in (f'{path}.gz', f'{path}_backup.gz'):
            # compresslevel ranges 0-9 where 9 is the highest compression
            with gzip.open(target, 'wt', encoding="ascii", compresslevel=9) as file:
                file.write(data)

    def __load_session_gzip(self, path):
        """Load the session JSON, falling back to the backup if the primary is unreadable."""
        try:
            with gzip.open(f'{path}.gz', 'rt', encoding='ascii') as json_file:
                return json.load(json_file)
        except Exception as error:  # not a bare except: Ctrl-C must still interrupt
            print(f'Could not read {path}.gz ({error!r}) - trying backup')
            with gzip.open(f'{path}_backup.gz', 'rt', encoding='ascii') as json_file:
                return json.load(json_file)

    def __save_memory_gzip(self, memory, path):
        """Write *memory* with ``np.save`` into primary and backup gzip archives."""
        self.__ensure_parent_directory(path)
        for target in (f'{path}.gz', f'{path}_backup.gz'):
            with gzip.open(target, 'wb') as memory_file:
                np.save(file=memory_file, arr=memory)

    def __load_memory_gzip(self, path):
        """
        Load the stored replay memory.

        Returns an empty list when no primary memory file exists; falls back to the
        backup copy when the primary exists but cannot be read.
        """
        if not os.path.exists(f'{path}.gz'):
            return []
        # SECURITY: allow_pickle=True executes pickled code on load - only restore
        # memory files that this notebook produced itself.
        try:
            with gzip.open(f'{path}.gz', 'rb') as memory_file:
                return np.load(memory_file, allow_pickle=True)
        except Exception as error:  # not a bare except: Ctrl-C must still interrupt
            print(f'Could not read {path}.gz ({error!r}) - trying backup')
            with gzip.open(f'{path}_backup.gz', 'rb') as memory_file:
                return np.load(memory_file, allow_pickle=True)

    def check_existing_session(self):
        """
        Look for a stored session and ask the user whether to restore it.

        Returns ``(None, None)`` when no session file exists, otherwise the loaded
        ``(session, memory)`` pair if the user answers ``y``.

        Raises:
            BaseException: when a session exists and the user declines to restore it.
        """
        if not os.path.exists(f'{self.session_path}.gz'):
            print(f"No session found with given filename {self.filename}. Continuing")
            return None, None

        response = input(f'{self.filename} session found. Do you want to restore the model? [y/n]')
        if response == 'y':
            return self.__load()
        # The exception type is kept as BaseException so existing callers behave the same.
        raise BaseException('Cancelled execution - Please use another filename.')

    def save_session(self, agent, environment, episode, average_losses, total_rewards, average_rewards):
        """
        Collect training statistics plus agent, environment and model state and store them.

        *agent* must provide ``extract_agent_parameters``, ``extract_replay_memory`` and
        ``extract_model``; *environment* must provide ``extract_environment_parameters``.
        """
        print('Saving session...')
        self.timer.start('Model parameters')
        session = {'date': datetime.now().strftime('%Y-%m-%d-%H:%M:%S'), 'episode': episode}
        print(session)  # Printed before the bulky entries are added
        session['average_losses'] = average_losses
        session['total_rewards'] = total_rewards
        session['average_rewards'] = average_rewards
        session['agent_params'] = agent.extract_agent_parameters()
        session['environment_params'] = environment.extract_environment_parameters()
        print('Extracting agent memory...')
        replay_memory = agent.extract_replay_memory()
        print('Extracting model parameters...')
        session['model_params'] = agent.extract_model()
        self.timer.stop()
        print('Extracted model parameters.')
        print('Compressing and storing...')
        self.timer.start('Model file storage')
        self.__save(session, replay_memory)
        self.timer.stop()
        print('Session saved.')

    def load_session(self, agent, environment):
        """
        Restore a stored session into *agent* and *environment* if one exists.

        Returns ``(episodes, average_losses, total_rewards, average_rewards)``; these are
        ``0, [], [], []`` when no session was found.
        """
        episodes, average_losses, total_rewards, average_rewards = 0, [], [], []
        self.timer.start('File fetching.')
        session, replay_memory = self.check_existing_session()
        if session is None:
            self.timer.stop()
            return episodes, average_losses, total_rewards, average_rewards
        print('Session Fetched and decompressed - Loading Parameters...')
        self.timer.start('Load Parameters')
        session_date = session['date']
        episodes = session['episode']
        average_losses = session['average_losses']
        total_rewards = session['total_rewards']
        average_rewards = session['average_rewards']
        print(f'Session Meta - Save Date: {session_date} - episodes: {episodes}')
        print('Loading Agent Parameters...')
        agent.load_agent_parameters(session['agent_params'])
        print('Loading agent memory...')
        agent.load_replay_memory(replay_memory)
        print('Loading Environment Parameters...')
        environment.load_environment_parameters(session['environment_params'])
        print('Loading Model Parameters...')
        agent.load_model(session['model_params'])
        self.timer.stop()
        print(f'Loaded session from {self.filename} - Date: {session_date}.')
        print(f'Continuing from episode {episodes}')
        return episodes, average_losses, total_rewards, average_rewards

### Video Utility
Generates and handles gameplay videos

In [None]:
class VideoUtility:
    """
    Utility functions used to generate video clips of Atari gameplay by the agent.

    All files live in the folder ``{SAVE_DIRECTORY_PATH}{model_name}_video``.
    """

    def __init__(self, model_name):
        """Store *model_name*, which determines the video folder name."""
        self.model_name = model_name

    def wrap_environment(self, env):
        """Wrap *env* in a gym ``Monitor`` that records into the model's video folder."""
        return Monitor(env, f"{SAVE_DIRECTORY_PATH}{self.model_name}_video", force=True)

    def clear_files(self):
        """
        Delete recorded json files and all but one mp4 file from the video folder.

        Nothing is deleted when the folder is missing or holds exactly one video.
        """
        path = f'{SAVE_DIRECTORY_PATH}{self.model_name}_video/'
        if not os.path.exists(path):
            return

        json_files = glob.glob(f'{path}*.json')
        # glob returns files in arbitrary order; sorting makes "the last one" that is
        # kept deterministic (the lexicographically greatest file name).
        video_files = sorted(glob.glob(f'{path}*.mp4'))
        if len(video_files) == 1:
            return

        for json_file in json_files:
            os.remove(json_file)
        for stale_video in video_files[:-1]:
            os.remove(stale_video)

    def show_video(self):
        """Embed the first mp4 found in the video folder into the notebook output."""
        # Imported here because the module-level imports of these names only run
        # inside the Colab branch, which left them undefined everywhere else.
        from IPython import display as ipythondisplay
        from IPython.display import HTML

        mp4list = glob.glob(f'{SAVE_DIRECTORY_PATH}{self.model_name}_video/*.mp4')
        if len(mp4list) > 0:
            mp4 = mp4list[0]
            # Read-only access is sufficient, and the context manager closes the handle.
            with open(mp4, 'rb') as video_file:
                video = video_file.read()
            encoded = base64.b64encode(video)
            ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                        loop controls style="height: 400px;">
                        <source src="data:video/mp4;base64,{0}" type="video/mp4" />
                    </video>'''.format(encoded.decode('ascii'))))
        else:
            print("Could not find video")

### Plot Method 
Plot method is used to graph rewards and loss over time during training episodes

In [None]:
def plot(episodes, y, title, ylabel, xlabel="Episode (one game until terminal)"):
    """
    Draw *y* against *episodes* as a line chart, save it and show it.

    The image is written to ``{SAVE_DIRECTORY_PATH}{title}.png``, so *title* doubles
    as the file name.
    """
    image_path = f'{SAVE_DIRECTORY_PATH}{title}.png'

    plt.plot(episodes, y)
    plt.title(title)
    plt.ylabel(ylabel)
    plt.xlabel(xlabel)
    plt.grid()

    # Save before show()
    plt.savefig(image_path)
    plt.show()

# Main Project
Implementation of Deep Q-learning Agent (DQN) using deep Convolutional Neural Network and model-free Q Reinforcement Learning  

### Hyper parameters

In [None]:
# --- Training hyper-parameters ---
MINIBATCH_SIZE = 32  # Number of experiences sampled from replay memory per training update
REPLAY_MEMORY_SIZE = 1000000  # Capacity of the replay memory (number of stored experiences)
AGENT_HISTORY_LENGTH = 4  # Number of frames stacked into one state (input channels of the first conv layer)
TARGET_NETWORK_UPDATE_FREQUENCY = 10000  # Agent steps between copies of the primary network into the target network
DISCOUNT_FACTOR = 0.99  # Gamma applied to the target network's best next Q-value
ACTION_REPEAT = 4  # Frame skip: Number of times each action is repeated (results in agent seeing only every 4th frame)
UPDATE_FREQUENCY = 4  # Number of actions selected by agent between successive SGD updates
LEARNING_RATE = 0.0000625  # DQN Paper uses 0.00025, Rainbow paper uses 0.0000625, people online use 0.00001
GRADIENT_MOMENTUM = 0.95  # Squared gradient momentum; only referenced by the commented-out RMSprop optimizer below
MIN_SQUARED_GRADIENT = 0.01  # Only referenced by the commented-out RMSprop optimizer below
INITIAL_EXPLORATION = 1  # Starting value of the exploration rate (epsilon)
FINAL_EXPLORATION = 0.1  # Value at which the exploration rate stops decaying
FINAL_EXPLORATION_FRAME = 1000000 / ACTION_REPEAT  # 1 million frames divided by the frame skip = 250000.0 agent steps of decay
REPLAY_START_SIZE = 50000  # The agent acts randomly until the replay memory holds more than this many experiences
NOOP_MAX = 30  # Max number of "do nothing" actions to be performed by agent at start of an episode
# Alternative optimizer, currently disabled:
# optimizer = tf.optimizers.RMSprop(learning_rate=LEARNING_RATE, rho=GRADIENT_MOMENTUM, epsilon=MIN_SQUARED_GRADIENT) 
optimizer = tf.optimizers.Adam(learning_rate=LEARNING_RATE)

# --- Environment and network architecture ---
NOOP_ACTION = 0  # Atari Breakout actions: 0 (noop), 1 (fire), 2 (left) and 3 (right)
PROBLEM = 'BreakoutDeterministic-v4'  # Deterministic: fixed frame skip of 4 (ACTION_REPEAT), v4: 0 repeat action probability
NUMBER_OF_EPISODES = 1000000
IMAGE_HEIGHT, IMAGE_WIDTH, IMAGE_CHANNELS = 84, 84, 1  # Size of one preprocessed greyscale frame
# Per convolution layer: number of filters, square filter size, stride
CONV1_NUM_FILTERS, CONV1_FILTER_SIZE, CONV1_FILTER_STRIDES = 32, 8, 4
CONV2_NUM_FILTERS, CONV2_FILTER_SIZE, CONV2_FILTER_STRIDES = 64, 4, 2
CONV3_NUM_FILTERS, CONV3_FILTER_SIZE, CONV3_FILTER_STRIDES = 64, 3, 1
# Spatial size of the third conv layer's output for each padding mode; PADDING selects the mode in use
CONV3_OUT_DIM_VALID_PADDING, CONV3_OUT_DIM_SAME_PADDING, PADDING = 7, 11, "VALID"
DENSE_NUM_UNITS, OUTPUT_NUM_UNITS = 512, 4  # Hidden dense units, and one output unit per action
WEIGHT_INITIALIZER = tf.initializers.VarianceScaling(scale=2.0)  # Scale 2.0 for RELU activation

### Frame Preprocessor
Responsible for preprocessing raw RGB pixel values in Atari video frames into downscaled, cropped and normalized greyscale images

In [None]:
class FramePreprocessor:
    """
    FramePreprocessor re-sizes RGB atari frames and converts them to gray scale frames.

    Pixel values stay in the uint8 range here; no normalization is done in this class.
    """

    def __init__(self, state_space):
        """Store *state_space*, the expected shape of a raw input frame."""
        self.state_space = state_space

    @staticmethod
    def convert_rgb_to_grayscale(tf_frame):
        """Return *tf_frame* converted with ``tf.image.rgb_to_grayscale``."""
        return tf.image.rgb_to_grayscale(tf_frame)

    @staticmethod
    def resize_frame(tf_frame, frame_height, frame_width):  # TODO: crop down to bounding box of playing area
        """Resize *tf_frame* to ``frame_height`` x ``frame_width`` using nearest-neighbour sampling."""
        return tf.image.resize(tf_frame, [frame_height, frame_width], method=tf.image.ResizeMethod.NEAREST_NEIGHBOR)

    @staticmethod
    def _grey_to_rgb(image):
        """Return a (height, width, 3) array that repeats channel 0 of *image* three times."""
        first_channel = np.asarray(image)[..., :1]
        return np.repeat(first_channel, 3, axis=-1)

    @staticmethod
    def plot_frame_from_greyscale_values(image):
        """
        Show a (height, width, channels) greyscale image with matplotlib.

        Only channel 0 is displayed. The conversion is one vectorized numpy call
        instead of three tensor reads per pixel in a Python loop.
        """
        plt.imshow(FramePreprocessor._grey_to_rgb(image))
        plt.show()

    def preprocess_frame(self, frame):
        """
        Turn a raw frame of shape ``self.state_space`` into an IMAGE_HEIGHT x IMAGE_WIDTH
        greyscale tensor.
        """
        # A plain tensor is enough: the frame is never mutated, so creating a new
        # tf.Variable for every single frame was wasted allocation.
        tf_frame = tf.convert_to_tensor(frame, dtype=tf.uint8)
        tf_frame = tf.ensure_shape(tf_frame, self.state_space)  # keeps the shape validation
        image = self.convert_rgb_to_grayscale(tf_frame)
        image = self.resize_frame(image, IMAGE_HEIGHT, IMAGE_WIDTH)
        image = tf.reshape(image, shape=(IMAGE_HEIGHT, IMAGE_WIDTH))  # drop the single channel axis
        return image

### Replay Memory

Memory data structure used to store experiences of agent

In [None]:
# Not currently used to encode experience: (state, action, reward, next_state, is_done)
from typing import NamedTuple, Tuple 
# One replay-memory entry (s, a, r, s', d), declared with the functional NamedTuple syntax.
Experience = NamedTuple(
    'Experience',
    [
        ('state', Tuple[int, int, int]),
        ('action', int),
        ('reward', float),
        ('next_state', Tuple[int, int, int]),
        ('is_done', bool),
    ],
)

In [None]:
class ReplayMemory:
    """
    Fixed-capacity ring buffer of game plays stored as experiences
    (s, a, r, s', d) = (state, action, reward, next_state, is_done).
    Credits: https://stackoverflow.com/questions/40181284/how-to-get-random-sample-from-deque-in-python-3
    """

    def __init__(self, capacity):
        """Initialize an empty memory with room for *capacity* experiences."""
        self.experiences = [None] * capacity
        self.capacity = capacity
        self.index = 0  # Slot that the next add() writes to
        self.size = 0  # Number of slots currently filled

    def add(self, experience):
        """Store *experience*, overwriting the earliest entry once capacity is reached."""
        self.experiences[self.index] = experience
        self.size = min(self.size + 1, self.capacity)
        self.index = (self.index + 1) % self.capacity  # Wrap around to the earliest entry

    def sample(self, size):
        """
        Return *size* distinct stored experiences chosen uniformly at random.

        Raises:
            ValueError: if *size* exceeds the number of stored experiences.
        """
        indices = random.sample(range(self.size), size)
        return [self.experiences[index] for index in indices]  # Efficient random access

    def extract_memory(self):
        """
        Return a random subset of at most REPLAY_START_SIZE + 1 experiences for saving.

        An empty numpy array is returned when nothing has been stored yet.
        """
        if self.size == 0:
            return np.array([])
        # Cap at the stored amount: sampling more than is available raises ValueError.
        return self.sample(min(self.size, REPLAY_START_SIZE + 1))

    def load_memory(self, replay_memory):
        """
        Add every non-None entry of *replay_memory* (a list or numpy array) to this memory.
        """
        # len()/identity checks are used because `== []` and `!= None` compare
        # element-wise on numpy arrays and cannot be used as a plain truth value.
        if replay_memory is None or len(replay_memory) == 0:
            return
        for experience in replay_memory:
            if experience is not None:
                self.add(experience)

### Model
Convolutional Neural Network used to approximate Q-values from raw Atari video pixels 


#### Parameters
Weights and biases of primary and target network

In [None]:
# Trainable parameters of the primary (online) network.
# Conv filters are 4D: filter height, filter width, input channels, output channels.
weights = {
    # Conv Layer 1: 8x8 conv, AGENT_HISTORY_LENGTH (4) input channels (the stacked frames), 32 output filters
    'conv1_weights': tf.Variable(WEIGHT_INITIALIZER([CONV1_FILTER_SIZE, CONV1_FILTER_SIZE, AGENT_HISTORY_LENGTH, CONV1_NUM_FILTERS])),
    # Conv Layer 2: 4x4 conv, 32 input filters, 64 output filters
    'conv2_weights': tf.Variable(WEIGHT_INITIALIZER([CONV2_FILTER_SIZE, CONV2_FILTER_SIZE, CONV1_NUM_FILTERS, CONV2_NUM_FILTERS])),
    # Conv Layer 3: 3x3 conv, 64 input filters, 64 output filters
    'conv3_weights': tf.Variable(WEIGHT_INITIALIZER([CONV3_FILTER_SIZE, CONV3_FILTER_SIZE, CONV2_NUM_FILTERS, CONV3_NUM_FILTERS])),
    # Fully Connected (Dense) Layer: 7*7*64 = 3136 inputs (flattened conv3 output with VALID padding), 512 output units
    'dense_weights': tf.Variable(WEIGHT_INITIALIZER([CONV3_OUT_DIM_VALID_PADDING * CONV3_OUT_DIM_VALID_PADDING * CONV3_NUM_FILTERS, DENSE_NUM_UNITS])),
    # Output layer: 512 input units, 4 output units (actions)
    'output_weights': tf.Variable(WEIGHT_INITIALIZER([DENSE_NUM_UNITS, OUTPUT_NUM_UNITS]))
}

# Biases of the primary network, one per output filter/unit, initialized to zero.
biases = {
    'conv1_biases': tf.Variable(tf.zeros([CONV1_NUM_FILTERS])),  # 32
    'conv2_biases': tf.Variable(tf.zeros([CONV2_NUM_FILTERS])),  # 64
    'conv3_biases': tf.Variable(tf.zeros([CONV3_NUM_FILTERS])),  # 64
    'dense_biases': tf.Variable(tf.zeros([DENSE_NUM_UNITS])),  # 512
    'output_biases': tf.Variable(tf.zeros([OUTPUT_NUM_UNITS]))  # 4
}

# Target network weights: same shapes as `weights`, but independently initialized.
# The key order must match `weights`, because the copy routine pairs entries by position.
target_weights = {
    'conv1_target_weights': tf.Variable(WEIGHT_INITIALIZER([CONV1_FILTER_SIZE, CONV1_FILTER_SIZE, AGENT_HISTORY_LENGTH, CONV1_NUM_FILTERS])),
    'conv2_target_weights': tf.Variable(WEIGHT_INITIALIZER([CONV2_FILTER_SIZE, CONV2_FILTER_SIZE, CONV1_NUM_FILTERS, CONV2_NUM_FILTERS])),
    'conv3_target_weights': tf.Variable(WEIGHT_INITIALIZER([CONV3_FILTER_SIZE, CONV3_FILTER_SIZE, CONV2_NUM_FILTERS, CONV3_NUM_FILTERS])),
    'dense_target_weights': tf.Variable(WEIGHT_INITIALIZER([CONV3_OUT_DIM_VALID_PADDING * CONV3_OUT_DIM_VALID_PADDING * CONV3_NUM_FILTERS, DENSE_NUM_UNITS])),
    'output_target_weights': tf.Variable(WEIGHT_INITIALIZER([DENSE_NUM_UNITS, OUTPUT_NUM_UNITS]))
}

# Target network biases: same shapes and key order as `biases`.
target_biases = {
    'conv1_target_biases': tf.Variable(tf.zeros([CONV1_NUM_FILTERS])),  # 32
    'conv2_target_biases': tf.Variable(tf.zeros([CONV2_NUM_FILTERS])),  # 64
    'conv3_target_biases': tf.Variable(tf.zeros([CONV3_NUM_FILTERS])),  # 64
    'dense_target_biases': tf.Variable(tf.zeros([DENSE_NUM_UNITS])),  # 512
    'output_target_biases': tf.Variable(tf.zeros([OUTPUT_NUM_UNITS]))  # 4
}

#### Convolutional Neural Network 
Responsible for predicting Q-values of states given actions 

In [None]:
class ConvolutionalNeuralNetwork:
    """
    CNN Architecture of DQN has 1 input layer, 4 hidden layers (3 convolutions and 1 dense after flatten) and 1 output layer:
    Input:  84 X 84 X 4 image (4 due to image stacking)
    1st Hidden layer: Convolves 32 filters of 8 X 8 with stride 4 (leaky relu)
    2nd hidden layer: Convolves 64 filters of 4 X 4 with stride 2 (leaky relu)
    3rd hidden layer: Convolves 64 filters of 3 X 3 with stride 1 (leaky relu)
    4th hidden layer: Fully connected, (512 leaky relu units)
    Output: Fully connected linear layer, Separate output unit for each action, outputs are predicted Q-values

    The instance does not own its parameters: it references the module-level
    ``weights``, ``biases``, ``target_weights`` and ``target_biases`` dictionaries.
    """

    def __init__(self, number_of_states, number_of_actions):
        """Bind the module-level parameter dictionaries and remember the space sizes."""
        self.weights = weights
        self.biases = biases
        self.target_weights = target_weights
        self.target_biases = target_biases
        self.number_of_states = number_of_states
        self.number_of_actions = number_of_actions

    @tf.function
    def normalize_images(self, images):
        """Scale pixel values by 1/255 and return them as float32."""
        return tf.cast(images / 255, dtype=tf.float32)

    @tf.function
    def convolutional_2d_layer(self, inputs, filter_weights, biases, strides=1):
        """Apply a 2D convolution (padding mode PADDING), add the bias and apply leaky ReLU."""
        output = tf.nn.conv2d(inputs, filter_weights, strides, padding=PADDING)
        output_with_bias = tf.nn.bias_add(output, biases)
        activation = tf.nn.leaky_relu(output_with_bias)  # non-linearity
        return activation

    @tf.function
    def flatten_layer(self, layer):
        """Reshape a [batch, height, width, filters] tensor to [batch, height * width * filters]."""
        # For the conv3 output this is [batch, 7 * 7 * 64] = [batch, 3136]
        memory_batch_size, image_height, image_width, num_filters = layer.get_shape()
        flattened_layer = tf.reshape(layer, (memory_batch_size, num_filters * image_height * image_width))
        return flattened_layer

    @tf.function
    def dense_layer(self, inputs, weights, biases):
        """Fully connected layer followed by leaky ReLU."""
        output = tf.nn.bias_add(tf.matmul(inputs, weights), biases)
        dense_activation = tf.nn.leaky_relu(output)  # non-linearity
        return dense_activation

    @tf.function
    def output_layer(self, input, weights, biases):
        """Fully connected layer without activation; its outputs are the Q-values."""
        linear_output = tf.nn.bias_add(tf.matmul(input, weights), biases)
        return linear_output

    @tf.function
    def huber_error_loss(self, y_true, y_predictions, delta=1.0):
        """
        Element-wise Huber loss: quadratic for |error| <= delta, linear beyond it.

        Both arguments are cast to float32, so callers may pass float64 targets. The
        loss is symmetric in its two arguments.
        """
        y_true = tf.cast(y_true, dtype=tf.float32)
        y_predictions = tf.cast(y_predictions, dtype=tf.float32)
        errors = y_true - y_predictions
        condition = tf.abs(errors) <= delta
        l2_squared_loss = 0.5 * tf.square(errors)
        l1_absolute_loss = delta * (tf.abs(errors) - 0.5 * delta)
        loss = tf.where(condition, l2_squared_loss, l1_absolute_loss)
        return loss

    @tf.function
    def train(self, inputs, targets):  # Optimization
        """
        Run one optimizer step of the primary network towards *targets*.

        Returns the mean of the element-wise Huber loss over the batch.
        """
        # Wrap computation inside a GradientTape for automatic differentiation
        with tf.GradientTape() as tape:
            predictions = self.predict(inputs)  # Q(s,a)
            # Arguments are now in (y_true, y_predictions) order; the previous swapped
            # call only worked because the loss is symmetric and casts its 2nd argument.
            current_loss = self.huber_error_loss(targets, predictions)

        # Trainable variables to update (primary network only, never the target network)
        trainable_variables = list(self.weights.values()) + list(self.biases.values())

        # current_loss is not reduced to a scalar here.
        # NOTE(review): per the TensorFlow docs the tape then differentiates the sum
        # of its elements - confirm that is the intended scaling.
        gradients = tape.gradient(current_loss, trainable_variables)

        # Update weights and biases following gradients
        optimizer.apply_gradients(zip(gradients, trainable_variables))

        return tf.reduce_mean(current_loss)

    @tf.function
    def predict(self, inputs, is_target=False):  # 4D input for CNN: (batch_size, height, width, depth)
        """
        Compute Q-values for a batch of stacked frames.

        With ``is_target=True`` the target network parameters are used instead of the
        primary ones.
        """
        # Input shape: [32 or 1, 84, 84, 4]. A batch of 84x84x4 stacked (gray scale) images.
        inputs = self.normalize_images(inputs)

        # Convolution Layer 1 with output shape [32 or 1, 20, 20, 32]
        conv1_weights = self.target_weights['conv1_target_weights'] if is_target else self.weights['conv1_weights']
        conv1_biases = self.target_biases['conv1_target_biases'] if is_target else self.biases['conv1_biases']
        conv1 = self.convolutional_2d_layer(inputs, conv1_weights, conv1_biases, strides=CONV1_FILTER_STRIDES)

        # Convolutional Layer 2 with output shape [32 or 1, 9, 9, 64]
        conv2_weights = self.target_weights['conv2_target_weights'] if is_target else self.weights['conv2_weights']
        conv2_biases = self.target_biases['conv2_target_biases'] if is_target else self.biases['conv2_biases']
        conv2 = self.convolutional_2d_layer(conv1, conv2_weights, conv2_biases, strides=CONV2_FILTER_STRIDES)

        # Convolutional Layer 3 with output shape [32 or 1, 7, 7, 64]
        conv3_weights = self.target_weights['conv3_target_weights'] if is_target else self.weights['conv3_weights']
        conv3_biases = self.target_biases['conv3_target_biases'] if is_target else self.biases['conv3_biases']
        conv3 = self.convolutional_2d_layer(conv2, conv3_weights, conv3_biases, strides=CONV3_FILTER_STRIDES)

        # Flatten output of 3rd conv. layer to fit dense layer input, output shape [32 or 1, 64*7*7]
        flattened_layer = self.flatten_layer(layer=conv3)

        # Dense fully connected layer with output shape [32 or 1, 512]
        dense_weights = self.target_weights['dense_target_weights'] if is_target else self.weights['dense_weights']
        dense_biases = self.target_biases['dense_target_biases'] if is_target else self.biases['dense_biases']
        dense_layer = self.dense_layer(flattened_layer, dense_weights, dense_biases)

        # Fully connected output of shape [32 or 1, 4]
        output_weights = self.target_weights['output_target_weights'] if is_target else self.weights['output_weights']
        output_biases = self.target_biases['output_target_biases'] if is_target else self.biases['output_biases']
        output_layer = self.output_layer(dense_layer, output_weights, output_biases)

        return output_layer

    @tf.function
    def overwrite_model_params(self):  # Assume same order and length
        """Copy the primary weights and biases into the target network, pairing entries by dict order."""
        for weight, target_weight_key in zip(self.weights.values(), self.target_weights.keys()):
            self.target_weights[target_weight_key].assign(tf.identity(weight))

        for bias, target_bias_key in zip(self.biases.values(), self.target_biases.keys()):
            self.target_biases[target_bias_key].assign(tf.identity(bias))

    def load_tf_variables(self, model_parameters, raw_parameters, title=None):
        """
        Assign every value of *raw_parameters* to the variable with the same key in
        *model_parameters*.

        Deliberately not a ``tf.function``: the arguments are nested Python lists and
        the method prints progress, neither of which suits graph tracing.
        """
        for key in raw_parameters:
            model_parameters[key].assign(tf.identity(raw_parameters[key]))
            print(f'{title} - Loaded: {key}')
        if title is not None:
            print(f'Done Loading: {title}')

    def load_model_parameters(self, raw_parameters):
        """Load primary biases and weights from *raw_parameters*, then sync the target network."""
        self.load_tf_variables(self.biases, raw_parameters['biases'], 'Biases')
        self.load_tf_variables(self.weights, raw_parameters['weights'], 'Weights')
        self.overwrite_model_params()  # Copy loaded parameters to target network
        print('Parameters loaded')

    @staticmethod
    def convert_tf_parameters_to_native(parameters, title=None):
        """
        Convert a dict of tf variables into a dict of nested Python lists.

        When *title* is given, the shape of the last converted parameter is printed.
        """
        extracted_params = {}
        last_key = None
        for key in parameters:
            extracted_params[key] = parameters[key].numpy().tolist()
            last_key = key
        # Guarded: an empty dict used to raise NameError on the loop variable here.
        if title is not None and last_key is not None:
            print(f'Extracted: {title} - {parameters[last_key].shape}')
        return extracted_params

    def extract_parameters(self):
        """Return the primary weights and biases as plain Python data (target network excluded)."""
        parameters = {
            'weights': self.convert_tf_parameters_to_native(self.weights, 'weights'),
            'biases': self.convert_tf_parameters_to_native(self.biases, 'biases')
        }
        return parameters

### Agent
Deep Q Network Agent uses Deep Convolutional Network to approximate Q-values of states given actions and Q Reinforcement Learning to learn an optimal policy in the Breakout Atari Open AI game (i.e. gym) environment. 

In [None]:
class Agent:
    """
    DQN agent: chooses actions, records experiences in a replay memory and trains
    its Q-network on minibatches sampled from that memory.
    """
    steps = 0
    exploration_rate = INITIAL_EXPLORATION

    def __init__(self, number_of_states, number_of_actions):
        """Create the replay memory and network for the given state and action space."""
        self.experiences = ReplayMemory(REPLAY_MEMORY_SIZE)
        self.model = ConvolutionalNeuralNetwork(number_of_states, number_of_actions)
        self.number_of_states = number_of_states
        self.number_of_actions = number_of_actions
        self.decay_rate = self.decay_exploration_rate()

    def decay_exploration_rate(self):
        """Return the amount subtracted from the exploration rate per observed step."""
        return (self.exploration_rate - FINAL_EXPLORATION) / FINAL_EXPLORATION_FRAME

    def random_policy(self):
        """Pick an action uniformly at random."""
        return random.randint(0, self.number_of_actions - 1)

    def e_greedy_policy(self, state):
        """
        Epsilon-greedy behaviour policy: act randomly with probability
        ``exploration_rate``, otherwise take the action with the highest predicted Q-value.
        """
        if random.uniform(0, 1) <= self.exploration_rate:
            return self.random_policy()
        q_values = self.model.predict(state)
        return np.argmax(q_values)

    def act(self, state):
        """Act randomly until the memory holds more than REPLAY_START_SIZE entries, then epsilon-greedily."""
        if self.experiences.size <= REPLAY_START_SIZE:
            return self.random_policy()
        return self.e_greedy_policy(state)

    def update_target_model(self):
        """Copy the primary network parameters into the target network."""
        self.model.overwrite_model_params()

    def observe(self, experience):
        """Store *experience*, count the step, decay exploration and periodically sync the target network."""
        self.experiences.add(experience)
        self.steps += 1

        # Linear annealing that is pinned to FINAL_EXPLORATION once it has been reached
        if self.exploration_rate <= FINAL_EXPLORATION:
            self.exploration_rate = FINAL_EXPLORATION
        else:
            self.exploration_rate = self.exploration_rate - self.decay_rate

        if self.steps % TARGET_NETWORK_UPDATE_FREQUENCY == 0:
            self.update_target_model()

    def replay(self):
        """
        Train the network on one minibatch of (state, action, reward, next_state, is_done)
        experiences and return the average loss reported by the model.
        """
        batch = self.experiences.sample(MINIBATCH_SIZE)
        batch_shape = (MINIBATCH_SIZE, IMAGE_HEIGHT, IMAGE_WIDTH, AGENT_HISTORY_LENGTH)
        states = tf.reshape([state for (state, *rest) in batch], shape=batch_shape)
        next_states = tf.reshape([next_state for (_, _, _, next_state, _) in batch], shape=batch_shape)

        q_current = self.model.predict(states)  # Q(s, a; theta)
        q_next_target = self.model.predict(next_states, is_target=True)  # Q(s', a'; theta_target)

        targets = np.zeros(shape=(MINIBATCH_SIZE, self.number_of_actions))
        for row, (_, action, reward, _, is_done) in enumerate(batch):
            # Start from the current predictions so only the taken action's entry changes
            row_targets = q_current[row].numpy()
            if is_done:
                row_targets[action] = reward
            else:
                best_next_q = tf.reduce_max(tf.reshape(q_next_target[row], shape=[-1]))  # max_a' Q(s', a')
                row_targets[action] = reward + DISCOUNT_FACTOR * best_next_q
            targets[row] = row_targets

        return self.model.train(states, tf.convert_to_tensor(targets))

    def extract_model(self):
        """Return the model parameters in serializable form."""
        return self.model.extract_parameters()

    def extract_replay_memory(self):
        """Return the replay memory contents selected for saving."""
        return self.experiences.extract_memory()

    def load_replay_memory(self, replay_memory):
        """Refill the replay memory from previously extracted contents."""
        self.experiences.load_memory(replay_memory)

    def extract_agent_parameters(self):
        """Return the exploration rate and step counter as a dict."""
        return dict(exploration_rate=self.exploration_rate, steps=self.steps)

    def load_agent_parameters(self, parameters):
        """Restore the exploration rate and step counter from *parameters*."""
        for name in ('exploration_rate', 'steps'):
            setattr(self, name, parameters[name])

    def load_model(self, raw_parameters):
        """Load serialized model parameters into the network."""
        self.model.load_model_parameters(raw_parameters)

### Environment
Environment used to wrap the Arcade Learning Environment (ALE) from the OpenAI Gym library with customized step and reset state functions.

In [None]:
class Environment:
    """
    Creates a game environment which an agent can play using certain actions.
    Run takes an agent as argument that plays the game, until the agent 'dies' (no more lives)

    The environment keeps two rolling lists of preprocessed frames, `state` and `next_state`,
    each holding at most AGENT_HISTORY_LENGTH frames. They are stacked along the last axis
    (see `reshape_state`) before being handed to the agent.
    """

    def __init__(self, problem, video_utility):
        """
        :param problem: Gym environment id passed to `gym.make`
        :param video_utility: video helper; only stored here, since recording is disabled
        """
        # NOTE: video recording is disabled. To enable it, create the environment with
        # video_utility.wrap_environment(gym.make(problem)) instead.
        self.gym = gym.make(problem)
        self.video_utility = video_utility
        self.state_space = (IMAGE_HEIGHT, IMAGE_WIDTH, AGENT_HISTORY_LENGTH)  # Shape of one stacked state
        self.frame_preprocessor = FramePreprocessor(self.gym.observation_space.shape)
        self.best_reward, self.acc_reward, self.life_count, self.steps = 0, 0, 0, 0
        self.state, self.next_state = [], []

    def extract_environment_parameters(self):
        """Return the reward statistics ('best_reward', 'acc_reward') to be stored with a session."""
        return {'best_reward': self.best_reward, 'acc_reward': self.acc_reward}

    def load_environment_parameters(self, parameters):
        """Restore the reward statistics from a dict produced by `extract_environment_parameters`."""
        self.best_reward = parameters['best_reward']
        self.acc_reward = parameters['acc_reward']

    def get_num_of_actions(self):
        """Return the size of the discrete action space of the wrapped Gym environment."""
        return self.gym.action_space.n

    def get_num_of_states(self):
        """Return the state shape (IMAGE_HEIGHT, IMAGE_WIDTH, AGENT_HISTORY_LENGTH)."""
        return self.state_space

    @tf.function  # Clip rewards to 1 if positive and -1 if negative
    def clip_reward(self, reward):
        """Return the sign of `reward` (-1, 0 or 1)."""
        return tf.sign(reward)

    @staticmethod
    def add_frame(frame, state):
        """
        Append `frame` to the frame list `state`, keeping at most AGENT_HISTORY_LENGTH frames.

        :param frame: preprocessed frame to add
        :param state: list of frames, ordered oldest to newest; modified in place
        """
        if len(state) >= AGENT_HISTORY_LENGTH:
            # Drop the OLDEST frame. A plain pop() would remove the newest one, which
            # leaves the first frames of the history frozen for the whole episode.
            state.pop(0)
        state.append(frame)

    def add_frame_to_state(self, frame, is_next_frame=False):
        """Add `frame` to `self.next_state` if `is_next_frame` is set, otherwise to `self.state`."""
        target = self.next_state if is_next_frame else self.state
        self.add_frame(frame, target)

    def generate_random_state_from_n_frames(self, agent, current_frame, num_of_frames=AGENT_HISTORY_LENGTH):
        """
        Fill `self.state` (and `self.next_state`) with frames obtained by playing random actions.

        Does nothing if `self.state` already holds `num_of_frames` frames.

        :param agent: agent providing `random_policy()`
        :param current_frame: preprocessed frame the game is currently showing
        :param num_of_frames: number of frames `self.state` should contain afterwards
        """
        while len(self.state) < num_of_frames:
            self.add_frame_to_state(current_frame)
            action = agent.random_policy()
            next_frame, _, _, _ = self.gym.step(action)
            next_frame = self.frame_preprocessor.preprocess_frame(next_frame)
            self.add_frame_to_state(next_frame, is_next_frame=True)
            current_frame = next_frame
            self.steps += 1

    @staticmethod
    def reshape_state(state, batch_size=1):
        """
        Stack the frames of `state` along the last axis and reshape the result to
        (batch_size, IMAGE_HEIGHT, IMAGE_WIDTH, AGENT_HISTORY_LENGTH).
        """
        state = tf.stack(state, axis=-1)  # stack 4 frames into 4 channels
        return tf.reshape(state, shape=(batch_size, IMAGE_HEIGHT, IMAGE_WIDTH, AGENT_HISTORY_LENGTH))

    def reset(self):
        """
        Reset the game and the per-episode bookkeeping.

        The frame histories are cleared as well. Otherwise
        `generate_random_state_from_n_frames` would find a full history left over from the
        previous episode and the new episode would start from stale frames.

        :return: the preprocessed first frame of the new game
        """
        self.life_count, self.steps = 0, 0
        self.state, self.next_state = [], []
        start_state = self.gym.reset()
        return self.frame_preprocessor.preprocess_frame(start_state)

    def get_terminal(self, info, is_done):
        """
        Decide whether the current transition is terminal.

        A transition is terminal when the game is over or when `info['ale.lives']` dropped
        below the previously seen life count. Updates `self.life_count` as a side effect.

        :param info: info dict returned by `gym.step`; must contain 'ale.lives'
        :param is_done: done flag returned by `gym.step`
        :return: True if the episode should be treated as finished
        """
        new_life_count = info['ale.lives']
        lost_life = new_life_count < self.life_count
        self.life_count = new_life_count
        return bool(is_done or lost_life)

    def step(self, agent):  # (Action, State) : New State
        """
        Let the agent pick an action for the current state and apply it to the game.

        :param agent: agent providing `act(state)`
        :return: tuple (state, action, reward, next_state, is_done). `state` is the stacked
                 tensor the action was chosen from. `next_state` is an all-zero tensor if
                 the transition is terminal, otherwise the `self.next_state` frame list.
        """
        self.state = self.reshape_state(self.state)
        action = agent.act(self.state)
        next_frame, reward, is_done, info = self.gym.step(action)
        next_frame = self.frame_preprocessor.preprocess_frame(next_frame)
        self.add_frame_to_state(next_frame, is_next_frame=True)
        # NOTE: reward clipping via self.clip_reward(reward) is disabled here; it would be
        # needed to generalize to other Atari games.
        is_done = self.get_terminal(info, is_done)
        if is_done:
            # A terminal transition has no successor state, so use an all-zero state instead
            next_state = tf.zeros(shape=(1, IMAGE_HEIGHT, IMAGE_WIDTH, AGENT_HISTORY_LENGTH), dtype=tf.uint8)
        else:
            next_state = self.next_state
        return self.state, action, reward, next_state, is_done

    def run(self, agent, should_print_save=False, is_train=False):
        """
        Play the game with `agent`, storing every transition in the agent's memory.

        With `is_train=True` a single episode is played and the agent is trained every
        UPDATE_FREQUENCY steps once its memory holds at least REPLAY_START_SIZE experiences.
        With `is_train=False` episodes are played repeatedly, without training, until the
        memory holds at least REPLAY_START_SIZE experiences.

        :param agent: agent to play (and train)
        :param should_print_save: render the game while playing if True
        :param is_train: whether the agent is trained during the run
        :return: (average episode loss, total reward) if `is_train`, otherwise None
        """
        total_reward = 0
        average_train_losses = []
        current_frame = self.reset()
        self.generate_random_state_from_n_frames(agent, current_frame)
        for _ in range(random.randint(0, NOOP_MAX)):  # No action random number of times
            self.gym.step(NOOP_ACTION)

        while True:
            if should_print_save:
                self.gym.render()

            self.state, action, reward, next_state, is_done = self.step(agent)
            experience = self.reshape_state(self.state), action, reward, self.reshape_state(next_state), is_done
            agent.observe(experience)

            if agent.experiences.size >= REPLAY_START_SIZE and is_train:
                if self.steps % UPDATE_FREQUENCY == 0:  # Only train every 4th step (action)
                    average_train_loss = agent.replay()
                    average_train_losses.append(average_train_loss.numpy())

            self.state = self.next_state
            self.steps += 1
            total_reward += reward

            if is_done:
                if is_train or agent.experiences.size >= REPLAY_START_SIZE:
                    break  # Break when memory filled or training is done
                # Initially play randomly: start a new game with a fresh frame history
                current_frame = self.reset()
                self.generate_random_state_from_n_frames(agent, current_frame)

        if not is_train:
            return None  # Pre-filling the memory produces no training statistics

        self.best_reward = max(self.best_reward, total_reward)
        self.acc_reward += total_reward
        average_episode_loss = float(np.mean(average_train_losses)) if average_train_losses else 0.0
        return average_episode_loss, total_reward

### Main Method
Set up the model name below. The model is stored in the "aml_models" folder inside the Colab Notebooks folder on Google Drive if IS_GOOGLE is set to True at the top of the file; otherwise, it is stored locally. Remember to set a model name in "MODEL_NAME" and to set the interval at which the periodic results of the DQN agent are printed and the models are saved. The memory of the agent is initially pre-populated with random-play experiences before the training episodes start; training then uses the Convolutional Neural Network and Q-Learning during Agent replay.

In [None]:
# Setup learning environment
# clear_output is otherwise only imported in the Colab (IS_GOOGLE) branch of the setup cell,
# so import it here to keep local runs from failing with a NameError.
from IPython.display import clear_output

MODEL_NAME = "stored_model_original_Adam_He_Leaky_ReLU"
PRINT_SAVE_INTERVAL = 100  # NB: do not set low save interval since gzip is not atomic
filehandler = FileHandler(filename=MODEL_NAME)
video_utility = VideoUtility(MODEL_NAME)
environment = Environment(PROBLEM, video_utility)
dqn_agent = Agent(environment.get_num_of_states(), environment.get_num_of_actions())


# Load session (restores agent, environment statistics and the curves recorded so far)
timer = Timer("Total Session")
initial_episode, average_losses, total_rewards, average_rewards = filehandler.load_session(dqn_agent, environment)
episodes = [] if initial_episode == 0 else list(range(1, initial_episode + 1))
labels = [(average_losses, f"Average loss over time ({MODEL_NAME})", "Average loss"),
          (total_rewards, f"Reward over time ({MODEL_NAME})", "Reward"),
          (average_rewards, f"Average reward over time ({MODEL_NAME})", "Average reward")]

# Pre-populate memory with random experiences
# (strict '<' so that it agrees with the '>= REPLAY_START_SIZE' checks used everywhere else)
if dqn_agent.experiences.size < REPLAY_START_SIZE:
    print('Filling memory')
    environment.run(dqn_agent, should_print_save=False, is_train=False)
    clear_output()

print('Start training')
# Start learning experiments
# Episodes are 1-indexed, so the upper bound is inclusive. Otherwise the final episode, and
# with it the last print/save checkpoint, would never be run.
for episode in range(initial_episode + 1, NUMBER_OF_EPISODES + 1):  # Start train
    SHOULD_PRINT_SAVE = episode % PRINT_SAVE_INTERVAL == 0 and dqn_agent.experiences.size >= REPLAY_START_SIZE
    average_loss, total_reward = environment.run(dqn_agent, SHOULD_PRINT_SAVE, is_train=True)
    average_reward = np.round(environment.acc_reward / episode, 4)
    episodes.append(episode)
    average_losses.append(average_loss)
    total_rewards.append(total_reward)
    average_rewards.append(average_reward)

    if SHOULD_PRINT_SAVE:
        video_utility.clear_files()
        clear_output()
        # The value reported here is the agent's exploration rate, not a learning rate
        print(f"Episode: {episode} - Overall best reward: {environment.best_reward} - "
              f"Avg reward: {average_reward} - Exploration rate: {dqn_agent.exploration_rate}")
        timer.stop()
        for y, title, ylabel in labels:
            plot(episodes, y, title, ylabel)
        # NOTE: showing the recorded video (video_utility.show_video()) is disabled
        filehandler.save_session(dqn_agent, environment, episode, average_losses, total_rewards, average_rewards)