In [None]:
import tensorflow as tf
import tflearn # 0.3.2
import pygame # 1.9.3
import cv2
import sys
sys.path.append("game/")
import os
os.environ['TF_CPP_MIN_LOG_LEVEL']='2'
import wrapped_flappy_bird as game
import random
import numpy as np
from collections import deque
import matplotlib.pylab as plt
import time

In [None]:
# Hyperparameters and settings of the DQN agent.
ACTIONS = 2 # number of discrete actions (length of the one-hot action vector)
GAMMA = 0.99 # discount factor applied to the next state's max Q value
OBSERVE = 10000. # timesteps spent only filling the replay memory before training starts
EXPLORE = 3000000. # timesteps over which epsilon is annealed from INITIAL_EPSILON to FINAL_EPSILON
INITIAL_EPSILON = 0.1 # starting value of epsilon
FINAL_EPSILON = 0.0001 # final value of epsilon
REPLAY_MEMORY_SIZE = 50000 # number of previous transitions to remember
BATCH_SIZE = 32 # size of minibatch
FRAME_PER_ACTION = 1 # an action is chosen every FRAME_PER_ACTION timesteps; action 0 is used otherwise
IMG_SIZE = 80 # side length of the square preprocessed frame fed to the network

# folder that checkpoints are restored from and saved to
model_folder = 'saved_networks'

In [None]:
def create_networks():
    """Build one convolutional Q-network on the current default graph.

    Returns:
        (input_state, q_value): the float placeholder of shape
        [None, IMG_SIZE, IMG_SIZE, 4] feeding the network, and the output
        tensor with ACTIONS units (one Q value per action).
    """
    def _small_normal():
        # a new truncated-normal initializer for each hidden layer
        return tflearn.initializations.truncated_normal(stddev=0.01)

    # network input
    input_state = tf.placeholder("float", [None, IMG_SIZE, IMG_SIZE, 4])

    # first convolution, followed by the only pooling layer
    net = tflearn.conv_2d(input_state, nb_filter=32, filter_size=8, strides=4,
                          activation='relu', weights_init=_small_normal())
    net = tflearn.max_pool_2d(net, kernel_size=2, strides=2)

    # remaining convolutions as (nb_filter, filter_size, strides)
    for filters, kernel, stride in ((64, 4, 2), (64, 3, 1)):
        net = tflearn.conv_2d(net, nb_filter=filters, filter_size=kernel, strides=stride,
                              activation='relu', weights_init=_small_normal())

    # dense hidden layer on the flattened feature maps
    net = tflearn.fully_connected(tflearn.flatten(net), n_units=512, activation='relu',
                                  weights_init=_small_normal())

    # linear read-out: Q values
    q_value = tflearn.fully_connected(incoming=net, n_units=ACTIONS)

    return input_state, q_value

In [None]:
def load_networks(sess, folder_name='saved_networks'):
    """Initialize all variables and restore the latest checkpoint, if any.

    Args:
        sess: TensorFlow session used for initialization and restoring.
        folder_name: checkpoint directory; it is created when missing.

    Returns:
        The tf.train.Saver created for the current graph.
    """
    checkpoint_saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())

    folder_missing = not os.path.exists(folder_name)
    if folder_missing:
        os.makedirs(folder_name)

    state = tf.train.get_checkpoint_state(folder_name)
    latest_path = state.model_checkpoint_path if state else None
    if not latest_path:
        # nothing to restore: keep the freshly initialized weights
        print("Could not find old network weights")
        return checkpoint_saver

    checkpoint_saver.restore(sess, latest_path)
    print("Successfully loaded:", latest_path)
    return checkpoint_saver

In [None]:
def preprocess_img(img):
    """Turn a game frame into an IMG_SIZE x IMG_SIZE binary image.

    The frame is resized, converted with cv2.COLOR_BGR2GRAY and thresholded
    with cv2.THRESH_BINARY (threshold 1, max value 255).
    """
    resized = cv2.resize(img, (IMG_SIZE, IMG_SIZE))
    gray = cv2.cvtColor(resized, cv2.COLOR_BGR2GRAY)
    # cv2.threshold returns (retval, image); only the image is needed
    binary = cv2.threshold(gray, 1, 255, cv2.THRESH_BINARY)[1]
    return binary

In [None]:
# choose action epsilon greedy
def choose_action(t, readout_t, epsilon):
    """Pick an action epsilon-greedily.

    Args:
        t: current timestep; a decision is made only when it is a multiple
            of FRAME_PER_ACTION, otherwise action 0 ("do nothing") is used.
        readout_t: Q values of the current state, one per action.
        epsilon: probability threshold for taking a random action.

    Returns:
        (a_t, action_index): the one-hot action vector of length ACTIONS and
        the index of the chosen action.
    """
    one_hot = np.zeros([ACTIONS])

    # frames between decisions: do nothing
    if t % FRAME_PER_ACTION != 0:
        one_hot[0] = 1
        return one_hot, 0

    if random.random() <= epsilon:
        # explore: uniformly random action
        print("- Random Action -")
        chosen = random.randrange(ACTIONS)
    else:
        # exploit: action with the largest Q value
        chosen = np.argmax(readout_t)

    one_hot[chosen] = 1
    return one_hot, chosen

In [None]:
def _advance_state(s_t, frame):
    """Return the next stacked state: the preprocessed `frame` followed by the first three channels of `s_t`."""
    x_t1 = np.reshape(preprocess_img(frame), (IMG_SIZE, IMG_SIZE, 1))
    return np.append(x_t1, s_t[:, :, :3], axis=2)


def _td_targets(minibatch, next_q_values):
    """Return the training target y for every (s, a, r, s', terminal) transition of `minibatch`."""
    targets = []
    for (_, _, reward, _, is_terminal), next_q in zip(minibatch, next_q_values):
        if is_terminal:
            # y = reward
            targets.append(reward)
        else:
            # y = reward + GAMMA * maxQ'
            targets.append(reward + GAMMA * np.max(next_q))
    return targets


def train_networks(train):
    """Train the DQN agent, or play one game greedily with the saved weights.

    Builds an online Q-network and a target Q-network, restores the latest
    checkpoint found in the module-level `model_folder` (if any) and then runs
    either the training loop or a single evaluation episode.

    Args:
        train: truthy to run the training loop, which has no exit condition
            and therefore only stops when interrupted; falsy to play with the
            greedy policy until the first terminal frame and print the score.
            The same value is also assigned to `game_state.acceleration`.

    Returns:
        None. Training checkpoints are written to
        `model_folder + '/bird-dqn'` every 10000 timesteps.
    """
    # A dedicated graph per call keeps repeated calls in one kernel from piling
    # up duplicate variables in the global default graph. InteractiveSession
    # installs the graph and itself as defaults until close() is called.
    sess = tf.InteractiveSession(graph=tf.Graph())
    try:
        # init q_value networks
        s, q_values = create_networks()
        network_params = tf.trainable_variables()
        # init target_q_value networks (its variables come after the online ones)
        st, target_q_values = create_networks()
        target_network_params = tf.trainable_variables()[len(network_params):]
        # init replay memory for storing the previous observations
        replay_memory = deque(maxlen=REPLAY_MEMORY_SIZE)

        # update target networks operation: copy every online weight to its target twin
        update_target_network_params = [
            target_param.assign(param)
            for target_param, param in zip(target_network_params, network_params)
        ]
        # define training operation
        y = tf.placeholder("float", [None])
        input_action = tf.placeholder("float", [None, ACTIONS])
        # Q value of the input action (input_action is one-hot)
        q_eval = tf.reduce_sum(tf.multiply(q_values, input_action), axis=1)
        cost = tf.reduce_mean(tf.square(y - q_eval))
        train_step = tf.train.AdamOptimizer(1e-6).minimize(cost)

        # initialize a game
        acceleration = train
        game_state = game.GameState()
        game_state.acceleration = acceleration
        # recent costs, used to adjust the target-update period C
        cost_tmp = deque(maxlen=1000)
        cost_tmp.append(0)  # to avoid np.mean nan

        # get the first state by doing nothing and preprocess the image to (IMG_SIZE x IMG_SIZE x 4)
        action_do_nothing = np.zeros(ACTIONS)
        action_do_nothing[0] = 1
        x_t_colored, _, terminal = game_state.frame_step(action_do_nothing)  # image, reward, terminal
        x_t = preprocess_img(x_t_colored)
        s_t = np.stack((x_t, x_t, x_t, x_t), axis=2)

        saver = load_networks(sess, folder_name=model_folder)
        t = 0
        score = 0
        start_time = time.time()  # used to rate-limit the progress prints

        if train:
            sess.run(update_target_network_params)

            # start training
            epsilon = INITIAL_EPSILON
            game_count = 0
            C = 1  # update target Q network every C steps
            while True:
                # choose an action epsilon greedily
                readout_t = q_values.eval(feed_dict={s: [s_t]})[0]
                a_t, action_index = choose_action(t, readout_t, epsilon)

                # run the selected action and observe next state and reward
                x_t1_colored, r_t, terminal = game_state.frame_step(a_t)
                s_t1 = _advance_state(s_t, x_t1_colored)

                # store the transition in replay memory
                replay_memory.append((s_t, a_t, r_t, s_t1, terminal))

                # only train if done observing
                if t > OBSERVE:
                    # sample a minibatch for training
                    minibatch = random.sample(replay_memory, BATCH_SIZE)
                    s_j_batch = [d[0] for d in minibatch]
                    a_batch = [d[1] for d in minibatch]
                    s_j1_batch = [d[3] for d in minibatch]
                    # compute y from the target network's view of the next states
                    readout_j1_batch = target_q_values.eval(feed_dict={st: s_j1_batch})
                    y_batch = _td_targets(minibatch, readout_j1_batch)

                    # perform the gradient step and fetch the cost in the same run,
                    # instead of paying for a second forward pass just to read it
                    _, batch_cost = sess.run(
                        [train_step, cost],
                        feed_dict={y: y_batch, input_action: a_batch, s: s_j_batch},
                    )
                    cost_tmp.append(batch_cost)

                # update the old values
                s_t = s_t1
                t += 1
                score += r_t

                # scale down epsilon
                if epsilon > FINAL_EPSILON and t > OBSERVE:
                    epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE

                # update target network params every C steps
                # adjust C by cost_tmp
                if t % C == 0:
                    sess.run(update_target_network_params)
                    if np.mean(cost_tmp) > 0.5:
                        C *= 2
                    else:
                        C = (C + 1) // 2  # integer ceil(C / 2): C stays an int >= 1

                # save every 10000 iterations
                if t % 10000 == 0:
                    saver.save(sess, model_folder + '/bird-dqn', global_step=t)

                # print info
                state = "observe" if t <= OBSERVE else "train"
                print(f"TIMESTEP: {t} STATE: {state}, EPSILON: {epsilon}, ACTION: {action_index}, reward: {r_t}")

                if terminal:
                    game_state.acceleration = acceleration
                    game_count += 1
                    if time.time() - start_time > 60:  # print info every 60 secs
                        print(f'game: {game_count}, state: {state}, train_step: {t} score: {score}')
                        print(f'C: {C}, mean cost: {np.mean(cost_tmp)}')
                        start_time = time.time()
                    score = 0

        # test network
        else:
            terminal = False
            while not terminal:
                readout_t = q_values.eval(feed_dict={s: [s_t]})[0]
                a_t = np.zeros([ACTIONS])
                if t % FRAME_PER_ACTION == 0:
                    action_index = np.argmax(readout_t)
                    a_t[action_index] = 1
                else:
                    a_t[0] = 1  # do nothing

                # run the selected action and observe next state and reward
                x_t1_colored, r_t, terminal = game_state.frame_step(a_t)
                s_t = _advance_state(s_t, x_t1_colored)
                # advance the timestep so FRAME_PER_ACTION is honoured here as well
                t += 1

                # only rewards equal to 1 count towards the score
                score += r_t if r_t == 1 else 0
                if time.time() - start_time > 10:
                    print('current score:', score)
                    start_time = time.time()
                if terminal:
                    print('total score:', score)
    finally:
        # release the session (and its default graph/session registration)
        # even when the loop is interrupted
        sess.close()

In [None]:
# training (disabled by default: the training loop runs until the kernel is interrupted)
# train_networks(train=True)

In [None]:
# testing
# NOTE: train_networks reads the global `model_folder`, so rebinding it here
# also changes where a later train_networks(train=True) call in the same
# kernel would restore from and save to.
model_folder = 'pretrained'
train_networks(train=False)