This project aims to study and select the best experience replay setup for deep Q-learning when training an agent to play the game "Flappy Bird". It does so by comparing the reward received under different hyperparameters, ranging from the initial epsilon to the learning rate and the memory size used to update the 3-layer convolutional, fully connected neural network. The agent first receives environment information as one image per frame; an epsilon-greedy algorithm is then used to select the agent's action (exploration or exploitation) for the next frame. The outcome, the action taken, the current frame information, and the next frame information after taking the action are then stored in the replay memory, which is used to update the neural network, allowing the agent to learn how to "play the game".

The "assets" folder and its contents, along with "src/flappy_bird.py", were obtained from this repository: https://github.com/uvipen/Flappy-bird-deep-Q-learning-pytorch

"src/flappy_bird.py" takes in a number (0 means no action, 1 means jump) and returns 3 values: state as an image, reward as a float, and terminal as a boolean.

State is the image after the action has been applied.

Reward is 0.1 if the bird is alive, 1 if it passes through a pipe, and -1 if it is dead.

Terminal is TRUE if the bird is dead, otherwise FALSE.

In [None]:
import os
import shutil
import torch.optim as optim
import torch
import torch.nn as nn
import numpy as np
from src.memory import *
from src.convModel import *
from src.utils import *
from src.flappy_bird import *
import random
from matplotlib import pyplot as plt
import time as t

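The train_model function is used to train the agent to play the game. It first selects an action, either randomly or from the neural network's prediction, then performs that action, stores the resulting transition in memory, and updates the neural network.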

In [None]:
def train_model():
    """Train the agent for MAX_ITER frames with epsilon-greedy deep Q-learning.

    Each iteration picks an action (random with probability epsilon, otherwise
    the argmax of the network's Q-values), advances the game by one frame,
    pushes the transition into the replay memory and calls ``update_model()``.
    The model and the survival-time figure are saved every 100,000 iterations,
    and the model is saved once more when training finishes.

    Relies on the module-level globals ``game``, ``model``, ``memory``,
    ``MAX_ITER`` and ``INITIAL_EPSILON``, and on the helpers
    ``pre_processing``, ``update_model`` and ``plot_duration``.
    """
    checkpoint_every = 100000
    log_path = "tensorboard"
    save_path = "trained_models"
    graph_path = "graph"

    # action to be taken by agent
    # [1, 0] means no action
    # [0, 1] means jump
    action = torch.zeros(2, dtype=torch.float32)

    # getting initial state
    state_image, reward, terminal = game.next_frame(action[1])

    # pre-processing on image, aiming to reduce image size and convert image
    # to monochrome; the first frame is repeated four times to fill the stack
    state = pre_processing(state_image)
    state = torch.cat((state, state, state, state)).unsqueeze(0)

    # alive_stat holds the length (in frames) of every finished episode,
    # alive_time counts the frames of the episode currently being played
    alive_stat = []
    alive_time = 0

    # the log directory is recreated empty on every run
    if os.path.isdir(log_path):
        shutil.rmtree(log_path)
    os.makedirs(log_path)
    # torch.save / plt.savefig do not create missing directories themselves
    os.makedirs(save_path, exist_ok=True)
    os.makedirs(graph_path, exist_ok=True)

    # training the agent; step counts the frames played so far (1..MAX_ITER)
    for step in range(1, MAX_ITER + 1):

        # Q-values are only used to choose an action here, so no autograd
        # graph is needed for this forward pass
        with torch.no_grad():
            q_value = model(state)[0]

        action = torch.zeros(2, dtype=torch.float32)
        if torch.cuda.is_available():
            action = action.cuda()

        # Epsilon-Greedy implementation: epsilon decays linearly from
        # INITIAL_EPSILON towards 0 over the course of training
        epsilon = INITIAL_EPSILON * (1 - step / MAX_ITER)
        random_action = random.random() <= epsilon
        index = torch.argmax(q_value).item()

        if random_action:
            print("Performed random action!")
            # explicit module reference instead of a bare randint that is
            # only in scope if a wildcard import happens to export it
            index = random.randint(0, 1)
        action[index] = 1

        # Perform action and get next state information
        next_state_image, reward, terminal = game.next_frame(action[1])
        next_state_image = pre_processing(next_state_image)
        # drop the oldest frame of the stack and append the newest one
        next_state = torch.cat(
            (state.squeeze(0)[1:, :, :], next_state_image)).unsqueeze(0)

        reward = torch.from_numpy(
            np.array([reward], dtype=np.float32)).unsqueeze(0)
        action = action.unsqueeze(0)

        # save replay
        memory.push(state, action, next_state, reward, terminal)

        state = next_state

        # update neural network
        update_model()

        # print information of each frame
        print("Iteration: {}/{}, Action: {}, Reward: {}, Q-value: {}".format(
            step,
            MAX_ITER,
            action[0],
            reward, torch.max(q_value)))

        alive_time += 1

        # plot how long agent survived
        if terminal:
            alive_stat.append(alive_time)
            plot_duration(alive_stat)
            alive_time = 0

        # saving model and graph every 100000 iterations
        if step % checkpoint_every == 0:
            torch.save(model, "{}/flappy_bird_{}".format(save_path, step))
            plt.savefig('{}/train_{}.jpg'.format(graph_path, step))

    torch.save(model, "{}/flappy_bird".format(save_path))

This part updates the model by sampling a batch of BATCH_SIZE transitions from memory and using that batch, together with the loss function, to update the neural network.

In [None]:
def update_model():
    """Run one optimisation step on a minibatch sampled from the replay memory.

    Each stored transition is a ``(state, action, next_state, reward,
    terminal)`` tuple. The training target for a transition is ``reward`` when
    it ended the episode, otherwise ``reward + DISCOUNT_FACTOR * max Q(next_state)``.
    The loss compares that target with the Q-value the network assigns to the
    action that was actually taken.

    Relies on the module-level globals ``memory``, ``model``, ``optimizer``,
    ``criterion``, ``BATCH_SIZE`` and ``DISCOUNT_FACTOR``.
    """
    batch = memory.sample(BATCH_SIZE)
    if len(batch) == 0:
        # nothing to learn from yet
        return

    # Put the minibatch on whatever device the model lives on. Moving it to
    # CUDA unconditionally breaks when the model itself stayed on the CPU.
    device = next(model.parameters()).device

    # unpack minibatch
    state_batch = torch.cat(tuple(d[0] for d in batch)).to(device)
    action_batch = torch.cat(tuple(d[1] for d in batch)).to(device)
    next_state_batch = torch.cat(tuple(d[2] for d in batch)).to(device)
    reward_batch = torch.cat(tuple(d[3] for d in batch)).to(device)
    # terminal flags are plain Python values, so they are turned into a
    # boolean tensor instead of calling .cuda() on a list
    terminal_mask = torch.tensor(
        [bool(d[4]) for d in batch], dtype=torch.bool, device=device)

    # The target is a constant with respect to the loss, so it is computed
    # without tracking gradients.
    with torch.no_grad():
        next_q = model(next_state_batch)
        # best achievable Q-value in the next state, one value per sample
        next_q_max = torch.max(next_q.reshape(len(batch), -1), dim=1)[0]
        # assumes one scalar reward per transition, as pushed by train_model
        rewards = reward_batch.reshape(-1)
        # if dead, r_j, otherwise r_j + gamma * max(Q_t+1)
        y_batch = torch.where(
            terminal_mask, rewards, rewards + DISCOUNT_FACTOR * next_q_max)

    # action_batch is one-hot, so multiplying and summing over the action
    # dimension keeps only the Q-value of the action that was taken
    q_value = torch.sum(model(state_batch) * action_batch, dim=1)

    optimizer.zero_grad()

    # Calculate loss
    loss = criterion(q_value, y_batch)

    # Do backward pass
    loss.backward()
    optimizer.step()

This part plots how long the agent survived in each episode, together with the average over the last 100 episodes. It is used for reporting purposes (see the diagrams below).

In [None]:
def plot_duration(duration):
    """Redraw figure 1 with every episode's duration and a 100-episode running mean.

    ``duration`` is a sequence of per-episode durations. The running mean is
    only drawn once at least 100 episodes are available; its first 99 points
    are padded with zeros so that it lines up with the raw curve.
    """
    window = 100

    plt.figure(1)
    plt.clf()

    episode_lengths = torch.tensor(duration, dtype=torch.float)

    plt.title('Training...')
    plt.xlabel('Episode')
    plt.ylabel('Reward')
    plt.plot(episode_lengths.numpy())

    if len(episode_lengths) >= window:
        # mean of each consecutive window of 100 episodes
        windows = episode_lengths.unfold(0, window, 1)
        running_mean = windows.mean(1).view(-1)
        padded_mean = torch.cat((torch.zeros(window - 1), running_mean))
        plt.plot(padded_mean.numpy())

    # short pause so the interactive figure gets a chance to redraw
    plt.pause(0.001)

We decided to vary the hyperparameters (initial epsilon, learning rate, max experience, batch size) and compare the resulting outputs, in order to decide which is the best experience replay model:

Training:

![](train.PNG)

Testing:

![](test.PNG)

The results clearly show that if we increase the memory size, the time taken to run the model almost doubles; however, the agent gets the same reward as with the default settings (leftmost image).
If we increase the initial epsilon, i.e. increase the chance of taking a random action, the agent does not train at all.
If we decrease the learning rate, however, the agent trains very well and the reward is 4 times higher!


This is the main script that sets the hyperparameters and runs the training.

In [None]:
# Hyperparameters. These are read as globals by train_model and update_model.
# Step size handed to the Adam optimizer below
LEARNING_RATE = 1e-5
# Number of training iterations (game frames) run by train_model
MAX_ITER = 500000
# Value passed to the Memory constructor below
MAX_EXPERIENCE = 50
# Weight of the next state's best Q-value in update_model's target
DISCOUNT_FACTOR = 0.99
# Number of transitions requested from memory on every update
BATCH_SIZE = 30
# Starting probability of a random action; train_model decays it linearly
INITIAL_EPSILON = 0.2

# Wall-clock start, used to report the total training time at the end
start_time = t.time()
# NOTE(review): train_model keeps its own local counter, so this module-level
# value is not read by the code shown here; it also shadows the builtin iter()
iter = 0

# Replay memory that train_model pushes to and update_model samples from
memory = Memory(MAX_EXPERIENCE)

# Loss, environment, network and optimizer shared by the training functions
criterion = nn.MSELoss()
game = FlappyBird()
model = ThreeLayerConvModel()
optimizer = optim.Adam(model.parameters(), LEARNING_RATE)
# NOTE(review): train_model defines its own local graph_path with the same value
graph_path = "graph"
# Interactive mode so the figure can be redrawn while training is running
plt.ion()
train_model()
# Total training time in seconds
print(t.time() - start_time)
# Leave interactive mode and show the final figure
plt.ioff()
plt.show()