In [1]:
#!/usr/bin/env python
from __future__ import print_function

import skimage as skimage
from skimage import transform, color, exposure
from skimage.viewer import ImageViewer
#from skimage.viewer.core import *
import random
import sys
import os
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt
from random import choice
import numpy as np
from collections import deque
import time
import tensorflow as tf
import json
from tensorflow.keras import models

from tensorflow.keras.models import Model
from tensorflow.keras import layers
from tensorflow.keras import optimizers
from keras import backend as K
import vizdoom as vzd
from vizdoom import DoomGame, ScreenResolution
from vizdoom import *
import itertools as it
from time import sleep
from time import time


  from .core import *


In [2]:
#%magic

In [3]:
#%debug

In [4]:
teststart = time()

In [5]:
if len(tf.config.experimental.list_physical_devices('GPU')) > 1:
    print("GPU available")
    DEVICE = "/gpu:0"
else:
    print("No GPU available")
    DEVICE = "/cpu:0"

No GPU available


In [6]:

def dueling_dqn(input_shape, action_size, learning_rate):

        state_input = tf.keras.Input(shape=(input_shape))
        x = tf.keras.layers.Conv2D(32, 8, 4, activation='relu')(state_input)
        x = tf.keras.layers.Conv2D(64, 4, 2, activation='relu')(x)
        x = tf.keras.layers.Conv2D(64, 3, 3, activation='relu', padding="same")(x)
        x = tf.keras.layers.Flatten()(x)

        # state value tower - V
        state_value = tf.keras.layers.Dense(256, activation='relu')(x)
        state_value = tf.keras.layers.Dense(1)(state_value)
        state_value = tf.keras.layers.Lambda(lambda s: K.expand_dims(s[:, 0], -1),
                                             output_shape=(action_size,))(state_value)

        # action advantage tower - A
        action_advantage = tf.keras.layers.Dense(256, activation='relu')(x)
        action_advantage = tf.keras.layers.Dense(action_size)(action_advantage)
        action_advantage = tf.keras.layers.Lambda(lambda a: a[:, :] - K.mean(a[:, :], 
                                    keepdims=True), output_shape=(action_size,))(action_advantage)

        # merge to state-action value function Q
        #state_action_value = ([state_value + action_advantage])
        state_action_value = (state_value + action_advantage)

        model = Model(state_input, state_action_value)
        #model.compile(rmsprop(lr=learning_rate), "mse")
        adam = tf.keras.optimizers.Adam(lr=learning_rate)
        model.compile(loss='mse',optimizer=adam)
        model.summary()
        return model



In [7]:
def preprocessImg(img, size):

    img = np.rollaxis(img, 0, 2)    # It becomes (640, 480, 3)
    img = skimage.transform.resize(img,size)
    img = skimage.color.rgb2gray(img)

    return img
    

In [8]:
class DoubleDQNAgent:

    def __init__(self, state_size, action_size):

        # get size of state and action
        self.state_size = state_size
        self.action_size = action_size

        # these is hyper parameters for the Double DQN
        self.gamma = 0.99
        self.learning_rate = 0.0001
        self.epsilon = 1.0
        self.initial_epsilon = 1.0
        self.final_epsilon = 0.0001
        #self.epochs = 2
        self.batch_size = 4 #32
        self.observe = 16 #5000
        self.explore = 12 #50000 
        self.frame_per_action = 4
        self.update_target_freq =8 #3000 
        self.timestep_per_train = 8 #100 # Number of timesteps between training interval
            #times
            #batch 4, observe 8 explore 8 frames 4 update freq 8 timestep 8 maxgame  4 time[]
        
        
        
        # create replay memory using deque
        self.memory = deque(maxlen=2000) #2000
        self.max_memory = 50000 # number of previous transitions to remember

        # create main model and target model
        self.model = None
        self.target_model = None

        # Performance Statistics
        self.stats_window_size= 10 # window size for computing rolling statistics #50
        self.mavg_score = [] # Moving Average of Survival Time
        self.var_score = [] # Variance of Survival Time
        self.mavg_ammo_left = [] # Moving Average of Ammo used
        self.mavg_kill_counts = [] # Moving Average of Kill Counts

    def update_target_model(self):
        """
        After some time interval update the target model to be same with model
        """
        self.target_model.set_weights(self.model.get_weights())

    def get_action(self, state):
        """
        Get action from model using epsilon-greedy policy
        """
        if np.random.rand() <= self.epsilon:
            action_idx = random.randrange(self.action_size)
        else:
            q = self.model.predict(state)
            action_idx = np.argmax(q)
            #print("action IDX", action_idx)
        return action_idx

    def shape_reward(self, r_t, misc, prev_misc, t):
        
        # Check any kill count
        if (misc[0] > prev_misc[0]):
            r_t = r_t + 1

        if (misc[1] < prev_misc[1]): # Use ammo
            r_t = r_t - 0.1

        if (misc[2] < prev_misc[2]): # Loss HEALTH
            r_t = r_t - 0.1

        return r_t

    # Save trajectory sample <s,a,r,s'> to the replay memory
    def replay_memory(self, s_t, action_idx, r_t, s_t1, is_terminated, t):
        self.memory.append((s_t, action_idx, r_t, s_t1, is_terminated))
        if self.epsilon > self.final_epsilon and t > self.observe:
            self.epsilon -= (self.initial_epsilon - self.final_epsilon) / self.explore

        if len(self.memory) > self.max_memory:
            self.memory.popleft()

        # Update the target model to be same with model
        if t % self.update_target_freq == 0:
            self.update_target_model()

    # Pick samples randomly from replay memory (with batch_size)
    def train_minibatch_replay(self):
        """
        Train on a single minibatch
        """
        batch_size = min(self.batch_size, len(self.memory))
        mini_batch = random.sample(self.memory, batch_size)

        update_input = np.zeros(((batch_size,) + self.state_size)) # Shape 64, img_rows, img_cols, 4
        update_target = np.zeros(((batch_size,) + self.state_size))
        action, reward, done = [], [], []

        for i in range(batch_size):
            update_input[i,:,:,:] = mini_batch[i][0]
            action.append(mini_batch[i][1])
            reward.append(mini_batch[i][2])
            update_target[i,:,:,:] = mini_batch[i][3]
            done.append(mini_batch[i][4])

        target = self.model.predict(update_input) # Shape 64, Num_Actions

        target_val = self.model.predict(update_target)
        target_val_ = self.target_model.predict(update_target)

        for i in range(self.batch_size):
            # like Q Learning, get maximum Q value at s'
            # But from target model
            if done[i]:
                target[i][action[i]] = reward[i]
            else:
                # the key point of Double DQN
                # selection of action is from model
                # update is from target model
                a = np.argmax(target_val[i])
                target[i][action[i]] = reward[i] + self.gamma * (target_val_[i][a])

        # make minibatch which includes target q value and predicted q value
        # and do the model fit!
        loss = self.model.train_on_batch(update_input, target)

        return np.max(target[-1]), loss

    # Pick samples randomly from replay memory (with batch_size)
    def train_replay(self):

        num_samples = min(self.batch_size * self.timestep_per_train, len(self.memory))
        replay_samples = random.sample(self.memory, num_samples)

        update_input = np.zeros(((num_samples,) + self.state_size)) 
        update_target = np.zeros(((num_samples,) + self.state_size))
        action, reward, done = [], [], []

        for i in range(num_samples):
            update_input[i,:,:,:] = replay_samples[i][0]
            action.append(replay_samples[i][1])
            reward.append(replay_samples[i][2])
            update_target[i,:,:,:] = replay_samples[i][3]
            done.append(replay_samples[i][4])

        target = self.model.predict(update_input) 
        target_val = self.model.predict(update_target)
        target_val_ = self.target_model.predict(update_target)

        for i in range(num_samples):
            # like Q Learning, get maximum Q value at s'
            # But from target model
            if done[i]:
                target[i][action[i]] = reward[i]
            else:
                # the key point of Double DQN
                # selection of action is from model
                # update is from target model
                a = np.argmax(target_val[i])
                target[i][action[i]] = reward[i] + self.gamma * (target_val_[i][a])
        callbacks = [tf.keras.callbacks.TensorBoard(
                            log_dir="/home/spillingvoid/Downloads/programs/Doom/statistics/", 
                            histogram_freq=1, write_graph=True, write_images=True, embeddings_freq=1),
                tf.keras.callbacks.ModelCheckpoint(filepath="/home/spillingvoid/Downloads/programs/Doom/models/Dueling_ddqn_weights",
                save_weights_only=True),
                
                ]
        loss = self.model.fit(update_input, target, batch_size=self.batch_size, callbacks=callbacks, 
                              epochs=1, verbose=2)
        Q_max = np.max(target[-1]) 
        print("Q_MAX", Q_max)
        return Q_max, loss.history['loss']#np.max(target[-1])

    # load the saved model
    def load_model(self, name):
        self.model.load_weights("/home/spillingvoid/Downloads/programs/Doom/models/dueling_ddqn.h5")

    # save the model which is under training
    def save_model(self, name):
        self.model.save("/home/spillingvoid/Downloads/programs/Doom/models/dueling_ddqn.h5")
        self.model.save_weights("/home/spillingvoid/Downloads/programs/Doom/models/")

In [9]:
def finishing():
    print("saving model")
    agent.model.save("/home/spillingvoid/Downloads/programs/Doom/models/dueling_ddqn.h5", overwrite=True)
    print("Save Complete")
    game.close()
    print("======================================")
    print("Training is finished.")
    print("Training complete")
    endtime = time()
    print(" Test Time elapsed: %.2f minutes" % ((endtime - teststart) / 60.0))
    quit
    sys.exit()
    %reset -f
    

In [10]:
if __name__ == "__main__":

    # Avoid Tensorflow eats up GPU memory
    #config = tf.ConfigProto()
    #config.gpu_options.allow_growth = True
    #sess = tf.Session(config=config)
    #K.set_session(sess)
    
     with tf.device(DEVICE):
        game = vzd.DoomGame()
        game.load_config("/home/spillingvoid/Downloads/programs/ViZDoom/scenarios/rocket_basic.cfg")
        game.add_available_game_variable(vzd.GameVariable.KILLCOUNT)
        game.add_available_game_variable(vzd.GameVariable.AMMO5)
        game.add_available_game_variable(vzd.GameVariable.HEALTH)
        game.set_window_visible(True)
        game.set_mode(vzd.Mode.PLAYER)
        game.set_screen_format(vzd.ScreenFormat.GRAY8)
        game.set_screen_resolution(vzd.ScreenResolution.RES_640X480)
        game.get_available_buttons_size()
        game.init()

        game.new_episode()
        game_state = game.get_state()
        misc = game_state.game_variables  # [KILLCOUNT, AMMO, HEALTH]
        prev_misc = misc
    
        action_size = game.get_available_buttons_size()

        img_rows , img_cols = 30, 45
    # Convert image into Black and white
        img_channels = 4 # We stack 4 frames

        state_size = (img_rows, img_cols, img_channels)
        agent = DoubleDQNAgent(state_size, action_size)

        agent.model = dueling_dqn(state_size, action_size, agent.learning_rate)
        agent.target_model = dueling_dqn(state_size, action_size, agent.learning_rate)

        x_t = game_state.screen_buffer # 480 x 640
        x_t = preprocessImg(x_t, size=(img_rows, img_cols))
        s_t = np.stack(([x_t]*4), axis=2) # It becomes 64x64x4
        s_t = np.expand_dims(s_t, axis=0) # 1x64x64x4

        is_terminated = game.is_episode_finished()

    # Start training
        maxGAME = 4
        epsilon = agent.initial_epsilon
        GAME = 0
        t = 0
        max_life = 0 # Maximum episode life (Proxy for agent performance)
        life = 0
        #maxGAME = 0
    # Buffer to compute rolling statistics 
        life_buffer, ammo_buffer, kills_buffer = [], [], []
        
        
        game.new_episode()
        game_state = game.get_state()
        misc = game_state.game_variables 
        prev_misc = misc
            
                
        while not game.is_episode_finished():
            if GAME > maxGAME:
                    finishing()
                    
            loss = 0
            Q_max = 0
            r_t = 0
            a_t = np.zeros([action_size])

        # Epsilon Greedy
            action_idx  = agent.get_action(s_t)
            a_t[action_idx] = 1

            a_t = a_t.astype(int)
            game.set_action(a_t.tolist())
            skiprate = agent.frame_per_action
            game.advance_action(skiprate)

            game_state = game.get_state()  # Observe again after we take the action
            is_terminated = game.is_episode_finished()

            r_t = game.get_last_reward()  #each frame we get reward of 0.1, so 4 frames will be 0.4

            if (is_terminated):
                if (life > max_life):
                    max_life = life
                GAME += 1
                life_buffer.append(life)
                ammo_buffer.append(misc[1])
                kills_buffer.append(misc[0])
                print ("Episode Finish ", misc)
                game.new_episode()
                game_state = game.get_state()
                misc = game_state.game_variables
                x_t1 = game_state.screen_buffer

            x_t1 = game_state.screen_buffer
            misc = game_state.game_variables

            x_t1 = preprocessImg(x_t1, size=(img_rows, img_cols))
            x_t1 = np.reshape(x_t1, (1, img_rows, img_cols, 1))
            s_t1 = np.append(x_t1, s_t[:, :, :, :3], axis=3)

            r_t = agent.shape_reward(r_t, misc, prev_misc, t)

            if (is_terminated):
                life = 0
            else:
                life += 1

        # Update the cache
            prev_misc = misc

        # save the sample <s, a, r, s'> to the replay memory and decrease epsilon
            agent.replay_memory(s_t, action_idx, r_t, s_t1, is_terminated, t)

        # Do the training
            if t > agent.observe and t % agent.timestep_per_train == 0:
                Q_max, loss = agent.train_replay()
            
            s_t = s_t1
            t += 1

        # save progress every 10000 iterations
            if t % 100 == 0:
                print("Now we save model")
                agent.model.save("/home/spillingvoid/Downloads/programs/Doom/models/dueling_ddqn.h5", overwrite=True)
                agent.model.save_weights("/home/spillingvoid/Downloads/programs/Doom/models/dueling_ddqn.h5", overwrite=True)

        # print info
            #state = ""
            if t <= agent.observe:
                state = "observe"
            elif t > agent.observe and t <= agent.observe + agent.explore:
                state = "explore"
            else:
                state = "train"

            if (is_terminated):
                print("TIME", t, "/ GAME", GAME, "/ STATE", state, \
                        "/ EPSILON", agent.epsilon, "/ ACTION", action_idx, "/ REWARD", r_t, \
                        "/ Q_MAX %e" % Q_max, "/ LIFE", max_life, "/ LOSS", loss)
                    
            # Save Agent's Performance Statistics
                if GAME % agent.stats_window_size == 0 and t > agent.observe: 
                    print("Update Rolling Statistics")
                    agent.mavg_score.append(np.mean(np.array(life_buffer)))
                    agent.var_score.append(np.var(np.array(life_buffer)))
                    agent.mavg_ammo_left.append(np.mean(np.array(ammo_buffer)))
                    agent.mavg_kill_counts.append(np.mean(np.array(kills_buffer)))

                # Reset rolling stats buffer
                    life_buffer, ammo_buffer, kills_buffer = [], [], [] 

                # Write Rolling Statistics to file
                    with open("/home/spillingvoid/Downloads/programs/Doom/statistics/dueling_ddqn_stats.txt", "a+") as stats_file:
                        stats_file.write('Game: ' + str(GAME) + '\n')
                        stats_file.write('Max Score: ' + str(max_life) + '\n')
                        stats_file.write('mavg_score: ' + str(agent.mavg_score) + '\n')
                        stats_file.write('var_score: ' + str(agent.var_score) + '\n')
                        stats_file.write('mavg_ammo_left: ' + str(agent.mavg_ammo_left) + '\n')
                        stats_file.write('mavg_kill_counts: ' + str(agent.mavg_kill_counts) + '\n')
                    
            
                
                
                    #agent.model.save("/home/spillingvoid/Downloads/programs/Doom/models/dueling_ddqn.h5", overwrite=True)

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 30, 45, 4)]  0                                            
__________________________________________________________________________________________________
conv2d (Conv2D)                 (None, 6, 10, 32)    8224        input_1[0][0]                    
__________________________________________________________________________________________________
conv2d_1 (Conv2D)               (None, 2, 4, 64)     32832       conv2d[0][0]                     
__________________________________________________________________________________________________
conv2d_2 (Conv2D)               (None, 1, 2, 64)     36928       conv2d_1[0][0]                   
______________________________________________________________________________________________

  img = skimage.color.rgb2gray(img)


7/7 - 2s - loss: 8.7335
Q_MAX 0.03090651
8/8 - 2s - loss: 8.5942
Q_MAX -0.050845064
8/8 - 0s - loss: 7.8059
Q_MAX -0.18831763
8/8 - 0s - loss: 7.5382
Q_MAX -0.405976
8/8 - 0s - loss: 5.0687
Q_MAX -0.72257495
8/8 - 0s - loss: 6.3341
Q_MAX -1.5948902
8/8 - 0s - loss: 5.6891
Q_MAX -3.5841951
Episode Finish  [  0.  43. 100.]
TIME 75 / GAME 1 / STATE train / EPSILON -0.08322499999999981 / ACTION 0 / REWARD -4.0 / Q_MAX 0.000000e+00 / LIFE 74 / LOSS 0
8/8 - 0s - loss: 5.1837
Q_MAX -5.078415
8/8 - 0s - loss: 3.4315
Q_MAX -6.115849
8/8 - 0s - loss: 3.4187
Q_MAX -7.7979655
Now we save model
8/8 - 0s - loss: 5.0237
Q_MAX -12.258146
8/8 - 0s - loss: 4.7442
Q_MAX -14.030409
8/8 - 0s - loss: 4.0089
Q_MAX -16.485273
8/8 - 0s - loss: 3.8104
Q_MAX -15.27371
8/8 - 0s - loss: 2.9870
Q_MAX -17.47933
8/8 - 0s - loss: 4.6967
Q_MAX -18.007727
Episode Finish  [  0.  44. 100.]
TIME 150 / GAME 2 / STATE train / EPSILON -0.08322499999999981 / ACTION 1 / REWARD -4.0 / Q_MAX 0.000000e+00 / LIFE 74 / LOSS 0
8/8 - 

SystemExit: 

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
