In [1]:
#!/usr/bin/env python
from __future__ import print_function

import skimage as skimage
from skimage import transform, color, exposure
from skimage.viewer import ImageViewer
#from skimage.viewer.core import *
import random
import sys
import os
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt
from random import choice
import numpy as np
from collections import deque
import time
import tensorflow as tf
import json
from tensorflow.keras import models
from tensorflow.keras import losses
from tensorflow.keras.models import Model
from tensorflow.keras import layers
from tensorflow.keras import optimizers
from keras import backend as K
import vizdoom as vzd
from vizdoom import DoomGame, ScreenResolution
from vizdoom import *
import itertools as it
from time import sleep
from time import time

  from .core import *


In [2]:
teststart = time()

In [3]:
if len(tf.config.experimental.list_physical_devices('GPU')) > 1:
    print("GPU available")
    DEVICE = "/gpu:0"
else:
    print("No GPU available")
    DEVICE = "/cpu:0"

No GPU available


In [4]:
def dqn(input_shape, action_size, learning_rate):

        model = tf.keras.Sequential()
        model.add(tf.keras.layers.Conv2D(32, 8, 4, activation='relu', input_shape=(input_shape)))
        model.add(tf.keras.layers.Conv2D(64, 4, 2, activation='relu'))
        model.add(tf.keras.layers.Conv2D(64, 3, 1, activation='relu', padding='same'))
        model.add(tf.keras.layers.Flatten())
        model.add(tf.keras.layers.Dense(512, activation='relu'))
        model.add(tf.keras.layers.Dense(action_size, activation='linear'))

        adam = tf.keras.optimizers.Adam(lr=learning_rate)
        model.compile(loss='MSE',optimizer=adam)
        model.summary()
        return model

In [5]:
def preprocessImg(img, size):

    img = np.rollaxis(img, 0, 2)    # It becomes (640, 480, 3)
    img = skimage.transform.resize(img,size)
    img = skimage.color.rgb2gray(img)

    return img
    

In [6]:
class DoubleDQNAgent:

    def __init__(self, state_size, action_size):

        # get size of state and action
        self.state_size = state_size
        self.action_size = action_size

        # these is hyper parameters for the Double DQN
        self.gamma = 0.99
        self.learning_rate = 0.0001
        self.epsilon = 1.0
        self.initial_epsilon = 1.0
        self.final_epsilon = 0.0001
        self.batch_size = 32
        self.observe = 16#5000
        self.explore = 32#50000 
        self.frame_per_action = 4
        self.update_target_freq = 8#3000 
        self.timestep_per_train = 4 #100 # Number of timesteps between training interval

        # create replay memory using deque
        self.memory = deque()
        self.max_memory = 50000 # number of previous transitions to remember

        # create main model and target model
        self.model = None
        self.target_model = None

        # Performance Statistics
        self.stats_window_size= 50 # window size for computing rolling statistics
        self.mavg_score = [] # Moving Average of Survival Time
        self.var_score = [] # Variance of Survival Time
        self.mavg_pistol_left = [] # Moving Average of Ammo used
        self.mavg_kill_counts = [] # Moving Average of Kill Counts
        self.mavg_shotgun_left = []
        self.mavg_minigun_left = []
        self.mavg_plasma_left = []
        self.mavg_rocket_left = []
        self.mavg_secret_left = []
        self.mavg_hit_count = []
        self.mavg_damage_given = []
        self.mavg_item_collected = []
        self.mavg_armor = []

    def update_target_model(self):
        """
        After some time interval update the target model to be same with model
        """
        self.target_model.set_weights(self.model.get_weights())

    def get_action(self, state):
        """
        Get action from model using epsilon-greedy policy
        """
        if np.random.rand() <= self.epsilon:
            action_idx = random.randrange(self.action_size)
        else:
            q = self.model.predict(state)
            action_idx = np.argmax(q)
        return action_idx

    def shape_reward(self, r_t, misc, prev_misc, t):
        
        # Check any kill count
        if (misc[0] > prev_misc[0]):
            r_t = r_t + 310

        if (misc[1] < prev_misc[1]): # Use pistol
            r_t = r_t - 1
        if (misc[1] > prev_misc[1]):
            r_t = r_t + 1
        
        if (misc[2] < prev_misc[2]): # Loss HEALTH
            r_t = r_t - 2
        if (misc[2] > prev_misc[2]):
            r_t = r_t + 2.25
        
        if (misc[3] < prev_misc[3]): # Loss shotgun
            r_t = r_t - 3
        if (misc[3] < prev_misc[3]):
            r_t = r_t +3
        
        if (misc[4] < prev_misc[4]): # Loss minigun
            r_t = r_t - 1
        if (misc[4] < prev_misc[4]):
            r_t = r_t + 1
        
        if (misc[5] < prev_misc[5]): # plasma
            r_t = r_t - 2
        if (misc[5] < prev_misc[5]):
            r_t = r_t + 2
            
        if (misc[6] < prev_misc[6]): # rocket
            r_t = r_t - 5
        if (misc[6] > prev_misc[6]):
            r_t = r_t + 5
            
        if (misc[7] > prev_misc[7]): # secrets
            r_t = r_t + 15 
                    
        if (misc[8] > prev_misc[8]): # hitcount
            r_t = r_t + 5
            
        if (misc[9] > prev_misc[9]): # hits taken
            r_t = r_t - 5
        
        if (misc[10] > prev_misc[10]): # items picked up
            r_t = r_t + 2
        
        if (misc[11] < prev_misc[11]): # armor
            r_t = r_t - 1
        if (misc[11] > prev_misc[11]):
            r_t = r_t + 1.5
        

        return r_t

    # Save trajectory sample <s,a,r,s'> to the replay memory
    def replay_memory(self, s_t, action_idx, r_t, s_t1, is_terminated, t):
        self.memory.append((s_t, action_idx, r_t, s_t1, is_terminated))
        if self.epsilon > self.final_epsilon and t > self.observe:
            self.epsilon -= (self.initial_epsilon - self.final_epsilon) / self.explore

        if len(self.memory) > self.max_memory:
            self.memory.popleft()

        # Update the target model to be same with model
        if t % self.update_target_freq == 0:
            self.update_target_model()

    # Pick samples randomly from replay memory (with batch_size)
    def train_minibatch_replay(self):
        """
        Train on a single minibatch
        """
        batch_size = min(self.batch_size, len(self.memory))
        mini_batch = random.sample(self.memory, batch_size)

        update_input = np.zeros(((batch_size,) + self.state_size)) # Shape 64, img_rows, img_cols, 4
        update_target = np.zeros(((batch_size,) + self.state_size))
        action, reward, done = [], [], []

        for i in range(batch_size):
            update_input[i,:,:,:] = mini_batch[i][0]
            action.append(mini_batch[i][1])
            reward.append(mini_batch[i][2])
            update_target[i,:,:,:] = mini_batch[i][3]
            done.append(mini_batch[i][4])

        target = self.model.predict(update_input) # Shape 64, Num_Actions

        target_val = self.model.predict(update_target)
        target_val_ = self.target_model.predict(update_target)

        for i in range(self.batch_size):
            # like Q Learning, get maximum Q value at s'
            # But from target model
            if done[i]:
                target[i][action[i]] = reward[i]
            else:
                # the key point of Double DQN
                # selection of action is from model
                # update is from target model
                a = np.argmax(target_val[i])
                target[i][action[i]] = reward[i] + self.gamma * (target_val_[i][a])

        # make minibatch which includes target q value and predicted q value
        # and do the model fit!
        loss = self.model.train_on_batch(update_input, target)

        return np.max(target[-1]), loss

    # Pick samples randomly from replay memory (with batch_size)
    def train_replay(self):

        num_samples = min(self.batch_size * self.timestep_per_train, len(self.memory))
        replay_samples = random.sample(self.memory, num_samples)

        update_input = np.zeros(((num_samples,) + self.state_size)) 
        update_target = np.zeros(((num_samples,) + self.state_size))
        action, reward, done = [], [], []

        for i in range(num_samples):
            update_input[i,:,:,:] = replay_samples[i][0]
            action.append(replay_samples[i][1])
            reward.append(replay_samples[i][2])
            update_target[i,:,:,:] = replay_samples[i][3]
            done.append(replay_samples[i][4])

        target = self.model.predict(update_input) 
        target_val = self.model.predict(update_target)
        target_val_ = self.target_model.predict(update_target)

        for i in range(num_samples):
            # like Q Learning, get maximum Q value at s'
            # But from target model
            if done[i]:
                target[i][action[i]] = reward[i]
            else:
                # the key point of Double DQN
                # selection of action is from model
                # update is from target model
                a = np.argmax(target_val[i])
                target[i][action[i]] = reward[i] + self.gamma * (target_val_[i][a])
        callbacks = [tf.keras.callbacks.TensorBoard(log_dir="/home/spillingvoid/Downloads/programs/Doom/statistics/", 
                                                    histogram_freq=1, write_graph=True),
        tf.keras.callbacks.ModelCheckpoint(filepath="/home/spillingvoid/Downloads/programs/Doom/models/DDQN_weights",
                save_weights_only=True),
                    ]
        loss = self.model.fit(update_input, target, batch_size=self.batch_size, epochs=1, verbose=2, callbacks=callbacks)

        return np.max(target[-1]), loss.history['loss']

    # load the saved model
    def load_model(self, name):
        self.model.load_weights("/home/spillingvoid/Downloads/programs/Doom/models/ddqn.h5")

    # save the model which is under training
    def save_model(self, name):
        self.model.save("/home/spillingvoid/Downloads/programs/Doom/models/ddqn.h5")
        self.model.save_weights("/home/spillingvoid/Downloads/programs/Doom/models/ddqn.h5")

In [7]:
def finishing():
    print("saving model")
    agent.model.save("/home/spillingvoid/Downloads/programs/Doom/models/dueling_ddqn.h5", overwrite=True)
    print("Save Complete")
    game.close()
    print("======================================")
    print("Training is finished.")
    print("Training complete")
    endtime = time()
    print(" Test Time elapsed: %.2f minutes" % ((endtime - teststart) / 60.0))
    quit
    sys.exit()
    %reset -f

In [8]:
if __name__ == "__main__":

    # Avoid Tensorflow eats up GPU memory
    #config = tf.ConfigProto()
    #config.gpu_options.allow_growth = True
    #sess = tf.Session(config=config)
    #K.set_session(sess)

    with tf.device(DEVICE):
        game = vzd.DoomGame()
        game.load_config("/home/spillingvoid/Downloads/programs/Doom/scenarios/Doom32.cfg")
        game.add_available_game_variable(vzd.GameVariable.KILLCOUNT)
        game.add_available_game_variable(vzd.GameVariable.AMMO2)
        game.add_available_game_variable(vzd.GameVariable.HEALTH)
        game.add_available_game_variable(vzd.GameVariable.AMMO3)
        game.add_available_game_variable(vzd.GameVariable.AMMO4)
        game.add_available_game_variable(vzd.GameVariable.AMMO5)
        game.add_available_game_variable(vzd.GameVariable.AMMO6)
        game.add_available_game_variable(vzd.GameVariable.SECRETCOUNT)
        game.add_available_game_variable(vzd.GameVariable.HITCOUNT)
        game.add_available_game_variable(vzd.GameVariable.HITS_TAKEN)
        game.add_available_game_variable(vzd.GameVariable.ITEMCOUNT)
        game.add_available_game_variable(vzd.GameVariable.ARMOR)
        game.new_episode()
        game_state = game.get_state()
        misc = game_state.game_variables  # [KILLCOUNT, AMMO, HEALTH]
        prev_misc = misc

        action_size = game.get_available_buttons_size()

        img_rows , img_cols = 30, 90
    # Convert image into Black and white
        img_channels = 4 # We stack 4 frames

        state_size = (img_rows, img_cols, img_channels)
        agent = DoubleDQNAgent(state_size, action_size)

        agent.model = dqn(state_size, action_size, agent.learning_rate)
        agent.target_model = dqn(state_size, action_size, agent.learning_rate)

        x_t = game_state.screen_buffer # 480 x 640
        x_t = preprocessImg(x_t, size=(img_rows, img_cols))
        s_t = np.stack(([x_t]*4), axis=2) # It becomes 64x64x4
        s_t = np.expand_dims(s_t, axis=0) # 1x64x64x4

        is_terminated = game.is_episode_finished()

    # Start training
        maxGAME = 4
        epsilon = agent.initial_epsilon
        GAME = 0
        t = 0
        max_life = 0 # Maximum episode life (Proxy for agent performance)
        life = 0

    # Buffer to compute rolling statistics 
        life_buffer, pistol_buffer, kills_buffer, shotgun_buffer, minigun_buffer, plasma_buffer, rocket_buffer, secret_buffer, hit_buffer, damtaken_buffer, item_buffer, armor_buffer = [], [], [], [], [], [], [], [], [], [], [], []

        while not game.is_episode_finished():
            if GAME > maxGAME:
                finishing()
            loss = 0
            Q_max = 0
            r_t = 0
            a_t = np.zeros([action_size])

        # Epsilon Greedy
            action_idx  = agent.get_action(s_t)
            a_t[action_idx] = 1

            a_t = a_t.astype(int)
            game.set_action(a_t.tolist())
            skiprate = agent.frame_per_action
            game.advance_action(skiprate)

            game_state = game.get_state()  # Observe again after we take the action
            is_terminated = game.is_episode_finished()

            r_t = game.get_last_reward()  #each frame we get reward of 0.1, so 4 frames will be 0.4

            if (is_terminated):
                if (life > max_life):
                    max_life = life
                GAME += 1
                life_buffer.append(life)
                pistol_buffer.append(misc[1])
                kills_buffer.append(misc[2])
                shotgun_buffer.append(misc[3])
                minigun_buffer.append(misc[4])
                plasma_buffer.append(misc[5])
                rocket_buffer.append(misc[6])
                secret_buffer.append(misc[7])
                hit_buffer.append(misc[8])
                damtaken_buffer.append(misc[9])
                item_buffer.append(misc[10]) 
                armor_buffer.append(misc[11])
                print ("Episode Finish ", misc)
                game.new_episode()
                game_state = game.get_state()
                misc = game_state.game_variables
                x_t1 = game_state.screen_buffer

            x_t1 = game_state.screen_buffer
            misc = game_state.game_variables

            x_t1 = preprocessImg(x_t1, size=(img_rows, img_cols))
            x_t1 = np.reshape(x_t1, (1, img_rows, img_cols, 1))
            s_t1 = np.append(x_t1, s_t[:, :, :, :3], axis=3)

            r_t = agent.shape_reward(r_t, misc, prev_misc, t)

            if (is_terminated):
                life = 0
            else:
                life += 1

        # Update the cache
            prev_misc = misc

        # save the sample <s, a, r, s'> to the replay memory and decrease epsilon
            agent.replay_memory(s_t, action_idx, r_t, s_t1, is_terminated, t)

        # Do the training
            if t > agent.observe and t % agent.timestep_per_train == 0:
                Q_max, loss = agent.train_replay()
            
            s_t = s_t1
            t += 1

        # save progress every 10000 iterations
            if t % 10000 == 0:
                print("Now we save model")
                agent.model.save_weights("/home/spillingvoid/Downloads/programs/Doom/models/ddqn.h5", overwrite=True)

        # print info
            state = ""
            if t <= agent.observe:
                state = "observe"
            elif t > agent.observe and t <= agent.observe + agent.explore:
                state = "explore"
            else:
                state = "train"

            if (is_terminated):
                print("TIME", t, "/ GAME", GAME, "/ STATE", state, \
                      "/ EPSILON", agent.epsilon, "/ ACTION", action_idx, "/ REWARD", r_t, \
                      "/ Q_MAX %e" % np.max(Q_max), "/ LIFE", max_life, "/ LOSS", loss)

            # Save Agent's Performance Statistics
                if GAME % agent.stats_window_size == 0 and t > agent.observe: 
                    print("Update Rolling Statistics")
                    agent.mavg_score.append(np.mean(np.array(life_buffer)))
                    agent.var_score.append(np.var(np.array(life_buffer)))
                    agent.mavg_pistol_left.append(np.mean(np.array(pistol_buffer)))
                    agent.mavg_kill_counts.append(np.mean(np.array(kills_buffer)))
                    agent.mavg_shotgun_left.append(np.mean(np.array(shotgun_buffer)))
                    agent.mavg_minigun_left.append(np.mean(np.array(minigun_buffer)))
                    agent.mavg_plasma_left.append(np.mean(np.array(plasma_buffer)))
                    agent.mavg_rocket_left.append(np.mean(np.array(rocket_buffer)))
                    agent.mavg_secret_left.append(np.mean(np.array(secret_buffer)))
                    agent.mavg_hit_count.append(np.mean(np.array(hit_buffer)))
                    agent.mavg_damage_given.append(np.mean(np.array(damtaken_buffer)))
                    agent.mavg_item_collected.append(np.mean(np.array(item_buffer)))
                    agent.mavg_armor.append(np.mean(np.array(armor_buffer)))
                # Reset rolling stats buffer
                    life_buffer, pistol_buffer, kills_buffer, shotgun_buffer, minigun_buffer, plasma_buffer, rocket_buffer, secret_buffer, hit_buffer, damtaken_buffer, item_buffer, armor_buffer = [], [], [], [], [], [], [], [], [], [], [], []

                # Write Rolling Statistics to file
                    with open("/home/spillingvoid/Downloads/programs/Doom/statistics/ddqn_stats.txt", "a+") as stats_file:
                        stats_file.write('Game: ' + str(GAME) + '\n')
                        stats_file.write('Max Score: ' + str(max_life) + '\n')
                        stats_file.write('mavg_score: ' + str(agent.mavg_score) + '\n')
                        stats_file.write('var_score: ' + str(agent.var_score) + '\n')
                        stats_file.write('mavg_pistol_left: ' + str(agent.mavg_pistol_left) + '\n')
                        stats_file.write('mavg_kill_counts: ' + str(agent.mavg_kill_counts) + '\n')
                        stats_file.write('mavg_shotgun_left: ' + str(agent.mavg_shotgun_left) + '\n')
                        stats_file.write('mavg_minigun_left: ' + str(agent.mavg_minigun_left) + '\n')
                        stats_file.write('mavg_plasma_left: ' + str(agent.mavg_plasma_left) + '\n')
                        stats_file.write('mavg_rocket_left: ' + str(agent.mavg_rocket_left) + '\n')
                        stats_file.write('mavg_secret_left: ' + str(agent.mavg_secret_left) + '\n')
                        stats_file.write('mavg_hit_count: ' + str(agent.mavg_hit_count) + '\n')
                        stats_file.write('mavg_damage_given: ' + str(agent.mavg_damage_given) + '\n')
                        stats_file.write('mavg_item_collected: ' + str(agent.mavg_item_collected) + '\n')
                        stats_file.write('mavg_armor: ' + str(agent.mavg_armor) + '\n')


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 6, 21, 32)         8224      
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 2, 9, 64)          32832     
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 2, 9, 64)          36928     
_________________________________________________________________
flatten (Flatten)            (None, 1152)              0         
_________________________________________________________________
dense (Dense)                (None, 512)               590336    
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 1539      
Total params: 669,859
Trainable params: 669,859
Non-trainable params: 0
__________________________________________________

  img = skimage.color.rgb2gray(img)


1/1 - 1s - loss: 8.3163
1/1 - 0s - loss: 7.7843
1/1 - 0s - loss: 7.3519
2/2 - 0s - loss: 7.1715
2/2 - 0s - loss: 6.8039
2/2 - 0s - loss: 7.3444
2/2 - 0s - loss: 6.9270
2/2 - 0s - loss: 7.4471
2/2 - 0s - loss: 6.8899
2/2 - 0s - loss: 7.2287
2/2 - 0s - loss: 6.8262
3/3 - 0s - loss: 7.3792
3/3 - 0s - loss: 6.0353
3/3 - 0s - loss: 6.6987
Episode Finish  [  0.  41. 100.]
TIME 75 / GAME 1 / STATE train / EPSILON 9.999999999978082e-05 / ACTION 1 / REWARD -4.0 / Q_MAX 0.000000e+00 / LIFE 74 / LOSS 0
3/3 - 0s - loss: 4.1408
3/3 - 0s - loss: 4.6758
3/3 - 0s - loss: 2.2621
3/3 - 0s - loss: 3.6387
3/3 - 0s - loss: 1.6164
4/4 - 0s - loss: 4.3492
4/4 - 0s - loss: 1.8893
4/4 - 0s - loss: 3.7398
4/4 - 0s - loss: 1.7455
4/4 - 0s - loss: 5.1637
4/4 - 0s - loss: 2.5094
4/4 - 0s - loss: 5.6507
4/4 - 0s - loss: 3.1531
4/4 - 1s - loss: 5.6941
4/4 - 0s - loss: 3.1259
4/4 - 0s - loss: 5.9389
Episode Finish  [  0.  49. 100.]
TIME 139 / GAME 2 / STATE train / EPSILON 9.999999999978082e-05 / ACTION 2 / REWARD 10

SystemExit: 

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
