In [1]:
#!/usr/bin/env python
from __future__ import print_function

import skimage as skimage
from skimage import transform, color, exposure
from skimage.viewer import ImageViewer
import random
import sys
import os
from random import choice
import numpy as np
from collections import deque
import time
import tensorflow as tf
import json
import math
from tensorflow.keras import models


from tensorflow.keras import layers
from tensorflow.keras import optimizers
from tensorflow.keras import Model
from keras import backend as K
import vizdoom as vzd
from vizdoom import DoomGame, ScreenResolution
from vizdoom import *
import itertools as it
from time import sleep
from time import time


from tensorflow.keras import backend as K
from tensorflow.keras import utils


  from .core import *


In [2]:
def preprocessImg(img, size):

    img = np.rollaxis(img, 0, 2)    # It becomes (640, 480, 3)
    img = skimage.transform.resize(img,size)
    img = skimage.color.rgb2gray(img)

    return img

In [3]:
teststart = time()

In [6]:
class C51Agent:

    def __init__(self, state_size, action_size, num_atoms):

        # get size of state and action
        self.state_size = state_size
        self.action_size = action_size

        # these is hyper parameters for the DQN
        self.gamma = 0.99
        self.learning_rate = 0.0001
        self.epsilon = 1.0
        self.initial_epsilon = 1.0
        self.final_epsilon = 0.0001
        self.batch_size = 4 #32
        self.observe = 32 #2000
        self.explore = 64 # 50000
        self.frame_per_action = 4
        self.update_target_freq = 40 #3000 
        self.timestep_per_train = 4 #100 # Number of timesteps between training interval

        # Initialize Atoms
        self.num_atoms = num_atoms # 51 for C51
        self.v_max = 300 # Max possible score for Defend the center is 26 - 0.1*26 = 23.4
        self.v_min = -600 # -0.1*26 - 1 = -3.6
        self.delta_z = (self.v_max - self.v_min) / float(self.num_atoms - 1)
        self.z = [self.v_min + i * self.delta_z for i in range(self.num_atoms)]

        # Create replay memory using deque
        self.memory = deque()
        self.max_memory = 50000 # number of previous transitions to remember

        # Models for value distribution
        self.model = None
        self.target_model = None

        # Performance Statistics
        self.stats_window_size= 50 # window size for computing rolling statistics
        self.mavg_score = [] # Moving Average of Survival Time
        self.var_score = [] # Variance of Survival Time
        self.mavg_ammo_left = [] # Moving Average of Ammo used
        self.mavg_kill_counts = [] # Moving Average of Kill Counts

    def update_target_model(self):
        """
        After some time interval update the target model to be same with model
        """
        self.target_model.set_weights(self.model.get_weights())

    def get_action(self, state):
        """
        Get action from model using epsilon-greedy policy
        """
        if np.random.rand() <= self.epsilon:
            action_idx = random.randrange(self.action_size)
        else:
            action_idx = self.get_optimal_action(state)

        return action_idx

    def get_optimal_action(self, state):
        """Get optimal action for a state
        """
        z = self.model.predict(state) # Return a list [1x51, 1x51, 1x51]

        z_concat = np.vstack(z)
        q = np.sum(np.multiply(z_concat, np.array(self.z)), axis=1) 

        # Pick action with the biggest Q value
        action_idx = np.argmax(q)
        
        return action_idx

    def shape_reward(self, r_t, misc, prev_misc, t):
        
        # Check any kill count
        if (misc[0] > prev_misc[0]):
            r_t = r_t + 300

        if (misc[1] < prev_misc[1]): # Use ammo
            r_t = r_t - 10

        if (misc[2] < prev_misc[2]): # Loss HEALTH
            r_t = r_t - 0.1

        return r_t

    # save sample <s,a,r,s'> to the replay memory
    def replay_memory(self, s_t, action_idx, r_t, s_t1, is_terminated, t):
        self.memory.append((s_t, action_idx, r_t, s_t1, is_terminated))
        if self.epsilon > self.final_epsilon and t > self.observe:
            self.epsilon -= (self.initial_epsilon - self.final_epsilon) / self.explore

        if len(self.memory) > self.max_memory:
            self.memory.popleft()

        # Update the target model to be same with model
        if t % self.update_target_freq == 0:
            self.update_target_model()

    # pick samples randomly from replay memory (with batch_size)
    def train_replay(self):

        num_samples = min(self.batch_size * self.timestep_per_train, len(self.memory))
        replay_samples = random.sample(self.memory, num_samples)

        state_inputs = np.zeros(((num_samples,) + self.state_size)) 
        next_states = np.zeros(((num_samples,) + self.state_size)) 
        m_prob = [np.zeros((num_samples, self.num_atoms)) for i in range(action_size)]
        action, reward, done = [], [], []

        for i in range(num_samples):
            state_inputs[i,:,:,:] = replay_samples[i][0]
            action.append(replay_samples[i][1])
            reward.append(replay_samples[i][2])
            next_states[i,:,:,:] = replay_samples[i][3]
            done.append(replay_samples[i][4])

        z = self.model.predict(next_states) # Return a list [32x51, 32x51, 32x51]
        z_ = self.model.predict(next_states) # Return a list [32x51, 32x51, 32x51]

        # Get Optimal Actions for the next states (from distribution z)
        optimal_action_idxs = []
        z_concat = np.vstack(z)
        q = np.sum(np.multiply(z_concat, np.array(self.z)), axis=1) # length (num_atoms x num_actions)
        q = q.reshape((num_samples, action_size), order='F')
        optimal_action_idxs = np.argmax(q, axis=1)

        # Project Next State Value Distribution (of optimal action) to Current State
        for i in range(num_samples):
            if done[i]: # Terminal State
                # Distribution collapses to a single point
                Tz = min(self.v_max, max(self.v_min, reward[i]))
                bj = (Tz - self.v_min) / self.delta_z 
                m_l, m_u = math.floor(bj), math.ceil(bj)
                m_prob[action[i]][i][int(m_l)] += (m_u - bj)
                m_prob[action[i]][i][int(m_u)] += (bj - m_l)
            else:
                for j in range(self.num_atoms):
                    Tz = min(self.v_max, max(self.v_min, reward[i] + self.gamma * self.z[j]))
                    bj = (Tz - self.v_min) / self.delta_z 
                    m_l, m_u = math.floor(bj), math.ceil(bj)
                    m_prob[action[i]][i][int(m_l)] += z_[optimal_action_idxs[i]][i][j] * (m_u - bj)
                    m_prob[action[i]][i][int(m_u)] += z_[optimal_action_idxs[i]][i][j] * (bj - m_l)

        loss = self.model.fit(state_inputs, m_prob, batch_size=self.batch_size, epochs=1, verbose=0)

        return loss.history['loss']

    # load the saved model
    def load_model(self, name):
        self.model.load_weights(name)

    # save the model which is under training
    def save_model(self, name):
        self.model.save("/home/spillingvoid/Downloads/programs/Doom/models/c51_ddqn.h5")

In [4]:
if len(tf.config.experimental.list_physical_devices('GPU')) > 1:
    print("GPU available")
    DEVICE = "/gpu:0"
else:
    print("No GPU available")
    DEVICE = "/cpu:0"

No GPU available


In [5]:
def value_distribution_network(input_shape, num_atoms, action_size, learning_rate):
        """Model Value Distribution

        With States as inputs and output Probability Distributions for all Actions
        """

        state_input = tf.keras.Input(shape=(input_shape)) 
        cnn_feature = tf.keras.layers.Conv2D(32, 8, 4, activation='relu')(state_input)
        cnn_feature = tf.keras.layers.Conv2D(64, 4, 2, activation='relu')(cnn_feature)
        cnn_feature = tf.keras.layers.Conv2D(64, 3, 3, activation='relu', padding="same")(cnn_feature)
        cnn_feature = tf.keras.layers.Flatten()(cnn_feature)
        cnn_feature = tf.keras.layers.Dense(512, activation='relu')(cnn_feature)

        distribution_list = []
        for i in range(action_size):
            distribution_list.append(tf.keras.layers.Dense(num_atoms, activation='softmax')(cnn_feature))

        model = Model(state_input, distribution_list)

        adam = tf.keras.optimizers.Adam(lr=learning_rate)
        model.compile(loss='categorical_crossentropy',optimizer=adam)
        model.summary()

        return model

In [None]:
if __name__ == "__main__":

    # Avoid Tensorflow eats up GPU memory
    #config = tf.ConfigProto()
    #config.gpu_options.allow_growth = True
    #sess = tf.Session(config=config)
    #K.set_session(sess)
    with tf.device(DEVICE):
        game = vzd.DoomGame()
        game.load_config("/home/spillingvoid/Downloads/programs/ViZDoom/scenarios/rocket_basic.cfg")
        game.add_available_game_variable(vzd.GameVariable.KILLCOUNT)
        game.add_available_game_variable(vzd.GameVariable.AMMO5)
        game.add_available_game_variable(vzd.GameVariable.HEALTH)
        game.set_window_visible(True)
        game.set_mode(vzd.Mode.PLAYER)
        game.set_screen_format(vzd.ScreenFormat.GRAY8)
        game.set_screen_resolution(vzd.ScreenResolution.RES_640X480)
        game.get_available_buttons_size()
        game.init()

        game.new_episode()
        game_state = game.get_state()
        misc = game_state.game_variables  # [KILLCOUNT, AMMO, HEALTH]
        prev_misc = misc

        action_size = game.get_available_buttons_size()

        img_rows , img_cols = 30, 45
    # Convert image into Black and white
        img_channels = 4 # We stack 4 frames

    # C51
        num_atoms = 51

        state_size = (img_rows, img_cols, img_channels)
        agent = C51Agent(state_size, action_size, num_atoms)

        agent.model = value_distribution_network(state_size, num_atoms, action_size, agent.learning_rate)
        agent.target_model = value_distribution_network(state_size, num_atoms, action_size, agent.learning_rate)

        x_t = game_state.screen_buffer # 480 x 640
        x_t = preprocessImg(x_t, size=(img_rows, img_cols))
        s_t = np.stack(([x_t]*4), axis=2)    # It becomes 64x64x4
        s_t = np.expand_dims(s_t, axis=0) # 1x64x64x4

        is_terminated = game.is_episode_finished()

    # Start training
        epsilon = agent.initial_epsilon
        GAME = 0
        t = 0
        max_life = 0 # Maximum episode life (Proxy for agent performance)
        life = 0

    # Buffer to compute rolling statistics 
        life_buffer, ammo_buffer, kills_buffer = [], [], [] 

        while not game.is_episode_finished():

            loss = 0
            r_t = 0
            a_t = np.zeros([action_size])

        # Epsilon Greedy
            action_idx  = agent.get_action(s_t)
            a_t[action_idx] = 1

            a_t = a_t.astype(int)
            game.set_action(a_t.tolist())
            skiprate = agent.frame_per_action
            game.advance_action(skiprate)

            game_state = game.get_state()  # Observe again after we take the action
            is_terminated = game.is_episode_finished()

            r_t = game.get_last_reward()  #each frame we get reward of 0.1, so 4 frames will be 0.4

            if (is_terminated):
                if (life > max_life):
                    max_life = life
                GAME += 1
                life_buffer.append(life)
                ammo_buffer.append(misc[1])
                kills_buffer.append(misc[0])
                print ("Episode Finish ", misc)
                game.new_episode()
                game_state = game.get_state()
                misc = game_state.game_variables
                x_t1 = game_state.screen_buffer

            x_t1 = game_state.screen_buffer
            misc = game_state.game_variables

            x_t1 = preprocessImg(x_t1, size=(img_rows, img_cols))
            x_t1 = np.reshape(x_t1, (1, img_rows, img_cols, 1))
            s_t1 = np.append(x_t1, s_t[:, :, :, :3], axis=3)

            r_t = agent.shape_reward(r_t, misc, prev_misc, t)

            if (is_terminated):
                life = 0
            else:
                life += 1

        #update the cache
            prev_misc = misc

        # save the sample <s, a, r, s'> to the replay memory and decrease epsilon
            agent.replay_memory(s_t, action_idx, r_t, s_t1, is_terminated, t)

        # Do the training
            if t > agent.observe and t % agent.timestep_per_train == 0:
                loss = agent.train_replay()

            s_t = s_t1
            t += 1

        # save progress every 10000 iterations
            if t % 10000 == 0:
                print("Now we save model")
                agent.model.save("/home/spillingvoid/Downloads/programs/Doom/models/c51_ddqn.h5")
                agent.model.save_weights("/home/spillingvoid/Downloads/programs/Doom/models/c51_ddqn.h5",
                                         overwrite=True)

        # print info
            state = ""
            if t <= agent.observe:
                state = "observe"
            elif t > agent.observe and t <= agent.observe + agent.explore:
                state = "explore"
            else:
                state = "train"

            if (is_terminated):
                print("TIME", t, "/ GAME", GAME, "/ STATE", state, \
                      "/ EPSILON", agent.epsilon, "/ ACTION", action_idx, "/ REWARD", r_t, \
                      "/ LIFE", max_life, "/ LOSS", loss)

            # Save Agent's Performance Statistics
                if GAME % agent.stats_window_size == 0 and t > agent.observe: 
                    print("Update Rolling Statistics")
                    agent.mavg_score.append(np.mean(np.array(life_buffer)))
                    agent.var_score.append(np.var(np.array(life_buffer)))
                    agent.mavg_ammo_left.append(np.mean(np.array(ammo_buffer)))
                    agent.mavg_kill_counts.append(np.mean(np.array(kills_buffer)))

                # Reset rolling stats buffer
                    life_buffer, ammo_buffer, kills_buffer = [], [], [] 

                # Write Rolling Statistics to file
                    with open("/home/spillingvoid/Downloads/programs/Doom/statistics/c51_ddqn_stats.txt", "a+") as stats_file:
                        stats_file.write('Game: ' + str(GAME) + '\n')
                        stats_file.write('Max Score: ' + str(max_life) + '\n')
                        stats_file.write('mavg_score: ' + str(agent.mavg_score) + '\n')
                        stats_file.write('var_score: ' + str(agent.var_score) + '\n')
                        stats_file.write('mavg_ammo_left: ' + str(agent.mavg_ammo_left) + '\n')
                        stats_file.write('mavg_kill_counts: ' + str(agent.mavg_kill_counts) + '\n')

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 30, 45, 4)]  0                                            
__________________________________________________________________________________________________
conv2d (Conv2D)                 (None, 6, 10, 32)    8224        input_1[0][0]                    
__________________________________________________________________________________________________
conv2d_1 (Conv2D)               (None, 2, 4, 64)     32832       conv2d[0][0]                     
__________________________________________________________________________________________________
conv2d_2 (Conv2D)               (None, 1, 2, 64)     36928       conv2d_1[0][0]                   
______________________________________________________________________________________________

  img = skimage.color.rgb2gray(img)


Episode Finish  [  0.  46. 100.]
TIME 32 / GAME 1 / STATE observe / EPSILON 1.0 / ACTION 0 / REWARD 103.0 / LIFE 31 / LOSS 0
Episode Finish  [  0.  38. 100.]
TIME 107 / GAME 2 / STATE train / EPSILON -0.01552343749999845 / ACTION 1 / REWARD -4.0 / LIFE 74 / LOSS 0
Episode Finish  [  0.  43. 100.]
TIME 182 / GAME 3 / STATE train / EPSILON -0.01552343749999845 / ACTION 2 / REWARD -4.0 / LIFE 74 / LOSS 0
Episode Finish  [  0.  46. 100.]
TIME 251 / GAME 4 / STATE train / EPSILON -0.01552343749999845 / ACTION 1 / REWARD 103.0 / LIFE 74 / LOSS 0
Episode Finish  [  0.  42. 100.]
TIME 326 / GAME 5 / STATE train / EPSILON -0.01552343749999845 / ACTION 2 / REWARD -9.0 / LIFE 74 / LOSS 0
Episode Finish  [  0.  46. 100.]
TIME 401 / GAME 6 / STATE train / EPSILON -0.01552343749999845 / ACTION 0 / REWARD -4.0 / LIFE 74 / LOSS [487.67236328125]
Episode Finish  [  0.  46. 100.]
TIME 476 / GAME 7 / STATE train / EPSILON -0.01552343749999845 / ACTION 1 / REWARD -4.0 / LIFE 74 / LOSS 0
Episode Finish  [ 

Episode Finish  [  0.  47. 100.]
TIME 3578 / GAME 58 / STATE train / EPSILON -0.01552343749999845 / ACTION 1 / REWARD -4.0 / LIFE 74 / LOSS 0
Episode Finish  [  0.  48. 100.]
TIME 3595 / GAME 59 / STATE train / EPSILON -0.01552343749999845 / ACTION 0 / REWARD 103.0 / LIFE 74 / LOSS 0
Episode Finish  [  0.  47. 100.]
TIME 3670 / GAME 60 / STATE train / EPSILON -0.01552343749999845 / ACTION 0 / REWARD -4.0 / LIFE 74 / LOSS 0
Episode Finish  [  0.  46. 100.]
TIME 3745 / GAME 61 / STATE train / EPSILON -0.01552343749999845 / ACTION 0 / REWARD -4.0 / LIFE 74 / LOSS [7228509.0]
Episode Finish  [  0.  46. 100.]
TIME 3791 / GAME 62 / STATE train / EPSILON -0.01552343749999845 / ACTION 0 / REWARD 103.0 / LIFE 74 / LOSS 0
Episode Finish  [  0.  48. 100.]
TIME 3811 / GAME 63 / STATE train / EPSILON -0.01552343749999845 / ACTION 0 / REWARD 103.0 / LIFE 74 / LOSS 0
Episode Finish  [  0.  47. 100.]
TIME 3886 / GAME 64 / STATE train / EPSILON -0.01552343749999845 / ACTION 2 / REWARD -4.0 / LIFE 74 / 

Episode Finish  [  0.  45. 100.]
TIME 6969 / GAME 115 / STATE train / EPSILON -0.01552343749999845 / ACTION 0 / REWARD -4.0 / LIFE 74 / LOSS [8589625.0]
Episode Finish  [  0.  49. 100.]
TIME 7011 / GAME 116 / STATE train / EPSILON -0.01552343749999845 / ACTION 0 / REWARD 103.0 / LIFE 74 / LOSS 0
Episode Finish  [  0.  46. 100.]
TIME 7086 / GAME 117 / STATE train / EPSILON -0.01552343749999845 / ACTION 2 / REWARD -4.0 / LIFE 74 / LOSS 0
Episode Finish  [  0.  48. 100.]
TIME 7123 / GAME 118 / STATE train / EPSILON -0.01552343749999845 / ACTION 1 / REWARD 103.0 / LIFE 74 / LOSS 0
Episode Finish  [  0.  47. 100.]
TIME 7198 / GAME 119 / STATE train / EPSILON -0.01552343749999845 / ACTION 2 / REWARD -4.0 / LIFE 74 / LOSS 0
Episode Finish  [  0.  45. 100.]
TIME 7252 / GAME 120 / STATE train / EPSILON -0.01552343749999845 / ACTION 0 / REWARD 103.0 / LIFE 74 / LOSS 0
Episode Finish  [  0.  49. 100.]
TIME 7295 / GAME 121 / STATE train / EPSILON -0.01552343749999845 / ACTION 0 / REWARD 103.0 / LI

Episode Finish  [  0.  48. 100.]
TIME 10290 / GAME 172 / STATE train / EPSILON -0.01552343749999845 / ACTION 0 / REWARD -4.0 / LIFE 74 / LOSS 0
Episode Finish  [  0.  45. 100.]
TIME 10365 / GAME 173 / STATE train / EPSILON -0.01552343749999845 / ACTION 0 / REWARD -4.0 / LIFE 74 / LOSS [242004960.0]
Episode Finish  [  0.  48. 100.]
TIME 10419 / GAME 174 / STATE train / EPSILON -0.01552343749999845 / ACTION 0 / REWARD 103.0 / LIFE 74 / LOSS 0
Episode Finish  [  0.  46. 100.]
TIME 10494 / GAME 175 / STATE train / EPSILON -0.01552343749999845 / ACTION 0 / REWARD -4.0 / LIFE 74 / LOSS 0
Episode Finish  [  0.  44. 100.]
TIME 10569 / GAME 176 / STATE train / EPSILON -0.01552343749999845 / ACTION 2 / REWARD -4.0 / LIFE 74 / LOSS [105417104.0]
Episode Finish  [  0.  43. 100.]
TIME 10635 / GAME 177 / STATE train / EPSILON -0.01552343749999845 / ACTION 2 / REWARD 103.0 / LIFE 74 / LOSS 0
Episode Finish  [  0.  49. 100.]
TIME 10647 / GAME 178 / STATE train / EPSILON -0.01552343749999845 / ACTION 0

Episode Finish  [  0.  41. 100.]
TIME 13879 / GAME 228 / STATE train / EPSILON -0.01552343749999845 / ACTION 0 / REWARD -4.0 / LIFE 74 / LOSS 0
Episode Finish  [  0.  48. 100.]
TIME 13954 / GAME 229 / STATE train / EPSILON -0.01552343749999845 / ACTION 0 / REWARD -4.0 / LIFE 74 / LOSS 0
Episode Finish  [  0.  49. 100.]
TIME 14029 / GAME 230 / STATE train / EPSILON -0.01552343749999845 / ACTION 1 / REWARD -4.0 / LIFE 74 / LOSS [959661440.0]
Episode Finish  [  0.  48. 100.]
TIME 14104 / GAME 231 / STATE train / EPSILON -0.01552343749999845 / ACTION 0 / REWARD -4.0 / LIFE 74 / LOSS 0
Episode Finish  [  0.  48. 100.]
TIME 14143 / GAME 232 / STATE train / EPSILON -0.01552343749999845 / ACTION 0 / REWARD 103.0 / LIFE 74 / LOSS 0
Episode Finish  [  0.  47. 100.]
TIME 14179 / GAME 233 / STATE train / EPSILON -0.01552343749999845 / ACTION 0 / REWARD 103.0 / LIFE 74 / LOSS 0
Episode Finish  [  0.  47. 100.]
TIME 14231 / GAME 234 / STATE train / EPSILON -0.01552343749999845 / ACTION 0 / REWARD 10

Episode Finish  [  0.  49. 100.]
TIME 17207 / GAME 284 / STATE train / EPSILON -0.01552343749999845 / ACTION 0 / REWARD 103.0 / LIFE 74 / LOSS 0
Episode Finish  [  0.  46. 100.]
TIME 17282 / GAME 285 / STATE train / EPSILON -0.01552343749999845 / ACTION 2 / REWARD -4.0 / LIFE 74 / LOSS 0
Episode Finish  [  0.  49. 100.]
TIME 17357 / GAME 286 / STATE train / EPSILON -0.01552343749999845 / ACTION 0 / REWARD -4.0 / LIFE 74 / LOSS [319143488.0]
Episode Finish  [  0.  46. 100.]
TIME 17432 / GAME 287 / STATE train / EPSILON -0.01552343749999845 / ACTION 1 / REWARD -4.0 / LIFE 74 / LOSS 0
Episode Finish  [  0.  43. 100.]
TIME 17507 / GAME 288 / STATE train / EPSILON -0.01552343749999845 / ACTION 0 / REWARD -4.0 / LIFE 74 / LOSS 0
Episode Finish  [  0.  45. 100.]
TIME 17582 / GAME 289 / STATE train / EPSILON -0.01552343749999845 / ACTION 0 / REWARD -4.0 / LIFE 74 / LOSS 0
Episode Finish  [  0.  46. 100.]
TIME 17657 / GAME 290 / STATE train / EPSILON -0.01552343749999845 / ACTION 0 / REWARD -4.

Episode Finish  [  0.  46. 100.]
TIME 20557 / GAME 340 / STATE train / EPSILON -0.01552343749999845 / ACTION 1 / REWARD -4.0 / LIFE 74 / LOSS [944055808.0]
Episode Finish  [  0.  48. 100.]
TIME 20632 / GAME 341 / STATE train / EPSILON -0.01552343749999845 / ACTION 2 / REWARD -9.0 / LIFE 74 / LOSS 0
Episode Finish  [  0.  47. 100.]
TIME 20707 / GAME 342 / STATE train / EPSILON -0.01552343749999845 / ACTION 1 / REWARD -4.0 / LIFE 74 / LOSS 0
Episode Finish  [  0.  49. 100.]
TIME 20782 / GAME 343 / STATE train / EPSILON -0.01552343749999845 / ACTION 1 / REWARD -4.0 / LIFE 74 / LOSS 0
Episode Finish  [  0.  49. 100.]
TIME 20799 / GAME 344 / STATE train / EPSILON -0.01552343749999845 / ACTION 0 / REWARD 103.0 / LIFE 74 / LOSS 0
Episode Finish  [  0.  47. 100.]
TIME 20874 / GAME 345 / STATE train / EPSILON -0.01552343749999845 / ACTION 0 / REWARD -4.0 / LIFE 74 / LOSS 0
Episode Finish  [  0.  44. 100.]
TIME 20939 / GAME 346 / STATE train / EPSILON -0.01552343749999845 / ACTION 0 / REWARD 103

Episode Finish  [  0.  46. 100.]
TIME 23776 / GAME 396 / STATE train / EPSILON -0.01552343749999845 / ACTION 1 / REWARD 103.0 / LIFE 74 / LOSS 0
Episode Finish  [  0.  46. 100.]
TIME 23835 / GAME 397 / STATE train / EPSILON -0.01552343749999845 / ACTION 1 / REWARD 103.0 / LIFE 74 / LOSS 0
Episode Finish  [  0.  46. 100.]
TIME 23896 / GAME 398 / STATE train / EPSILON -0.01552343749999845 / ACTION 1 / REWARD 103.0 / LIFE 74 / LOSS 0
Episode Finish  [  0.  48. 100.]
TIME 23939 / GAME 399 / STATE train / EPSILON -0.01552343749999845 / ACTION 0 / REWARD 103.0 / LIFE 74 / LOSS 0
Episode Finish  [  0.  45. 100.]
TIME 24007 / GAME 400 / STATE train / EPSILON -0.01552343749999845 / ACTION 0 / REWARD 103.0 / LIFE 74 / LOSS 0
Update Rolling Statistics
Episode Finish  [  0.  46. 100.]
TIME 24043 / GAME 401 / STATE train / EPSILON -0.01552343749999845 / ACTION 0 / REWARD 103.0 / LIFE 74 / LOSS 0
Episode Finish  [  0.  49. 100.]
TIME 24051 / GAME 402 / STATE train / EPSILON -0.01552343749999845 / AC

Episode Finish  [  0.  48. 100.]
TIME 27186 / GAME 452 / STATE train / EPSILON -0.01552343749999845 / ACTION 0 / REWARD -4.0 / LIFE 74 / LOSS 0
Episode Finish  [  0.  45. 100.]
TIME 27261 / GAME 453 / STATE train / EPSILON -0.01552343749999845 / ACTION 1 / REWARD -4.0 / LIFE 74 / LOSS [4118838272.0]


In [None]:
print("Training complete")
endtime = time()
print(" Test Time elapsed: %.2f minutes" % ((endtime - teststart) / 60.0))