In [1]:
#!/usr/bin/env python
from __future__ import print_function

import skimage as skimage
from skimage import transform, color, exposure
from skimage.viewer import ImageViewer
import random
import sys
import os
from random import choice
import numpy as np
from collections import deque
import time
import tensorflow as tf
import json
import math
from tensorflow.keras import models


from tensorflow.keras import layers
from tensorflow.keras import optimizers
from tensorflow.keras import Model
from keras import backend as K
import vizdoom as vzd
from vizdoom import DoomGame, ScreenResolution
from vizdoom import *
import itertools as it
from time import sleep
from time import time


from tensorflow.keras import backend as K
from tensorflow.keras import utils


  from .core import *


In [2]:
def preprocessImg(img, size):

    img = np.rollaxis(img, 0, 2)    # It becomes (640, 480, 3)
    img = skimage.transform.resize(img,size)
    img = skimage.color.rgb2gray(img)

    return img

In [3]:
teststart = time()

In [4]:
class C51Agent:

    def __init__(self, state_size, action_size, num_atoms):

        # get size of state and action
        self.state_size = state_size
        self.action_size = action_size

        # these is hyper parameters for the DQN
        self.gamma = 0.99
        self.learning_rate = 0.0001
        self.epsilon = 1.0
        self.initial_epsilon = 1.0
        self.final_epsilon = 0.0001
        self.batch_size = 4 #32
        self.observe = 32 #2000
        self.explore = 64 # 50000
        self.frame_per_action = 4
        self.update_target_freq = 40 #3000 
        self.timestep_per_train = 4 #100 # Number of timesteps between training interval
        self.maxGAME = 64
        # Initialize Atoms
        self.num_atoms = num_atoms # 51 for C51
        self.v_max = 300 # Max possible score for Defend the center is 26 - 0.1*26 = 23.4
        self.v_min = -600 # -0.1*26 - 1 = -3.6
        self.delta_z = (self.v_max - self.v_min) / float(self.num_atoms - 1)
        self.z = [self.v_min + i * self.delta_z for i in range(self.num_atoms)]

        # Create replay memory using deque
        self.memory = deque()
        self.max_memory = 50000 # number of previous transitions to remember

        # Models for value distribution
        self.model = None
        self.target_model = None

        # Performance Statistics
        self.stats_window_size= 50 # window size for computing rolling statistics
        self.mavg_score = [] # Moving Average of Survival Time
        self.var_score = [] # Variance of Survival Time
        self.mavg_pistol_left = [] # Moving Average of Ammo used
        self.mavg_kill_counts = [] # Moving Average of Kill Counts
        self.mavg_shotgun_left = []
        self.mavg_minigun_left = []
        self.mavg_plasma_left = []
        self.mavg_rocket_left = []
        self.mavg_secret_left = []
        self.mavg_hit_count = []
        self.mavg_damage_given = []
        self.mavg_item_collected = []
        self.mavg_armor = []

    def update_target_model(self):
        """
        After some time interval update the target model to be same with model
        """
        self.target_model.set_weights(self.model.get_weights())

    def get_action(self, state):
        """
        Get action from model using epsilon-greedy policy
        """
        if np.random.rand() <= self.epsilon:
            action_idx = random.randrange(self.action_size)
        else:
            action_idx = self.get_optimal_action(state)

        return action_idx

    def get_optimal_action(self, state):
        """Get optimal action for a state
        """
        z = self.model.predict(state) # Return a list [1x51, 1x51, 1x51]

        z_concat = np.vstack(z)
        q = np.sum(np.multiply(z_concat, np.array(self.z)), axis=1) 

        # Pick action with the biggest Q value
        action_idx = np.argmax(q)
        
        return action_idx

    def shape_reward(self, r_t, misc, prev_misc, t):
        
        # Check any kill count
        if (misc[0] > prev_misc[0]):
            r_t = r_t + 310

        if (misc[1] < prev_misc[1]): # Use pistol
            r_t = r_t - 1
        if (misc[1] > prev_misc[1]):
            r_t = r_t + 1
        
        if (misc[2] < prev_misc[2]): # Loss HEALTH
            r_t = r_t - 2
        if (misc[2] > prev_misc[2]):
            r_t = r_t + 2.25
        
        if (misc[3] < prev_misc[3]): # Loss shotgun
            r_t = r_t - 3
        if (misc[3] < prev_misc[3]):
            r_t = r_t +3
        
        if (misc[4] < prev_misc[4]): # Loss minigun
            r_t = r_t - 1
        if (misc[4] < prev_misc[4]):
            r_t = r_t + 1
        
        if (misc[5] < prev_misc[5]): # plasma
            r_t = r_t - 2
        if (misc[5] < prev_misc[5]):
            r_t = r_t + 2
            
        if (misc[6] < prev_misc[6]): # rocket
            r_t = r_t - 5
        if (misc[6] > prev_misc[6]):
            r_t = r_t + 5
            
        if (misc[7] > prev_misc[7]): # secrets
            r_t = r_t + 15 
                    
        if (misc[8] > prev_misc[8]): # hitcount
            r_t = r_t + 5
            
        if (misc[9] > prev_misc[9]): # hits taken
            r_t = r_t - 5
        
        if (misc[10] > prev_misc[10]): # items picked up
            r_t = r_t + 2
        
        if (misc[11] < prev_misc[11]): # armor
            r_t = r_t - 1
        if (misc[11] > prev_misc[11]):
            r_t = r_t + 1.5
        

        return r_t

    # save sample <s,a,r,s'> to the replay memory
    def replay_memory(self, s_t, action_idx, r_t, s_t1, is_terminated, t):
        self.memory.append((s_t, action_idx, r_t, s_t1, is_terminated))
        if self.epsilon > self.final_epsilon and t > self.observe:
            self.epsilon -= (self.initial_epsilon - self.final_epsilon) / self.explore

        if len(self.memory) > self.max_memory:
            self.memory.popleft()

        # Update the target model to be same with model
        if t % self.update_target_freq == 0:
            self.update_target_model()

    # pick samples randomly from replay memory (with batch_size)
    def train_replay(self):

        num_samples = min(self.batch_size * self.timestep_per_train, len(self.memory))
        replay_samples = random.sample(self.memory, num_samples)

        state_inputs = np.zeros(((num_samples,) + self.state_size)) 
        next_states = np.zeros(((num_samples,) + self.state_size)) 
        m_prob = [np.zeros((num_samples, self.num_atoms)) for i in range(action_size)]
        action, reward, done = [], [], []

        for i in range(num_samples):
            state_inputs[i,:,:,:] = replay_samples[i][0]
            action.append(replay_samples[i][1])
            reward.append(replay_samples[i][2])
            next_states[i,:,:,:] = replay_samples[i][3]
            done.append(replay_samples[i][4])

        z = self.model.predict(next_states) # Return a list [32x51, 32x51, 32x51]
        z_ = self.model.predict(next_states) # Return a list [32x51, 32x51, 32x51]

        # Get Optimal Actions for the next states (from distribution z)
        optimal_action_idxs = []
        z_concat = np.vstack(z)
        q = np.sum(np.multiply(z_concat, np.array(self.z)), axis=1) # length (num_atoms x num_actions)
        q = q.reshape((num_samples, action_size), order='F')
        optimal_action_idxs = np.argmax(q, axis=1)

        # Project Next State Value Distribution (of optimal action) to Current State
        for i in range(num_samples):
            if done[i]: # Terminal State
                # Distribution collapses to a single point
                Tz = min(self.v_max, max(self.v_min, reward[i]))
                bj = (Tz - self.v_min) / self.delta_z 
                m_l, m_u = math.floor(bj), math.ceil(bj)
                m_prob[action[i]][i][int(m_l)] += (m_u - bj)
                m_prob[action[i]][i][int(m_u)] += (bj - m_l)
            else:
                for j in range(self.num_atoms):
                    Tz = min(self.v_max, max(self.v_min, reward[i] + self.gamma * self.z[j]))
                    bj = (Tz - self.v_min) / self.delta_z 
                    m_l, m_u = math.floor(bj), math.ceil(bj)
                    m_prob[action[i]][i][int(m_l)] += z_[optimal_action_idxs[i]][i][j] * (m_u - bj)
                    m_prob[action[i]][i][int(m_u)] += z_[optimal_action_idxs[i]][i][j] * (bj - m_l)

        callbacks = [tf.keras.callbacks.TensorBoard(log_dir="/home/spillingvoid/Downloads/programs/Doom/statistics/", 
                                                    histogram_freq=1, write_graph=True),
                tf.keras.callbacks.ModelCheckpoint(filepath="/home/spillingvoid/Downloads/programs/Doom/models/C51_weights",
                save_weights_only=True),
                    ]
        
        loss = self.model.fit(state_inputs, m_prob, batch_size=self.batch_size, epochs=1, 
                              callbacks=callbacks, verbose=2)

        return loss.history['loss']

    # load the saved model
    def load_model(self, name):
        self.model.load_weights(name)

    # save the model which is under training
    def save_model(self, name):
        self.model.save("/home/spillingvoid/Downloads/programs/Doom/models/c51_ddqn.h5")

In [5]:
if len(tf.config.experimental.list_physical_devices('GPU')) > 1:
    print("GPU available")
    DEVICE = "/gpu:0"
else:
    print("No GPU available")
    DEVICE = "/cpu:0"

No GPU available


In [6]:
def value_distribution_network(input_shape, num_atoms, action_size, learning_rate):
        """Model Value Distribution

        With States as inputs and output Probability Distributions for all Actions
        """

        state_input = tf.keras.Input(shape=(input_shape)) 
        cnn_feature = tf.keras.layers.Conv2D(32, 8, 4, activation='relu')(state_input)
        cnn_feature = tf.keras.layers.Conv2D(64, 4, 2, activation='relu')(cnn_feature)
        cnn_feature = tf.keras.layers.Conv2D(64, 3, 3, activation='relu', padding="same")(cnn_feature)
        cnn_feature = tf.keras.layers.Flatten()(cnn_feature)
        cnn_feature = tf.keras.layers.Dense(512, activation='relu')(cnn_feature)

        distribution_list = []
        for i in range(action_size):
            distribution_list.append(tf.keras.layers.Dense(num_atoms, activation='softmax')(cnn_feature))

        model = Model(state_input, distribution_list)

        adam = tf.keras.optimizers.Adam(lr=learning_rate)
        model.compile(loss='categorical_crossentropy',optimizer=adam)
        model.summary()

        return model

In [7]:
def finishing():
    print("saving model")
    agent.model.save("/home/spillingvoid/Downloads/programs/Doom/models/dueling_ddqn.h5", overwrite=True)
    print("Save Complete")
    game.close()
    print("======================================")
    print("Training is finished.")
    print("Training complete")
    endtime = time()
    print(" Test Time elapsed: %.2f minutes" % ((endtime - teststart) / 60.0))
    quit
    sys.exit()
    %reset -f
    

In [8]:
if __name__ == "__main__":

    # Avoid Tensorflow eats up GPU memory
    #config = tf.ConfigProto()
    #config.gpu_options.allow_growth = True
    #sess = tf.Session(config=config)
    #K.set_session(sess)
    with tf.device(DEVICE):
        game = vzd.DoomGame()
        game.load_config("/home/spillingvoid/Downloads/programs/Doom/scenarios/Doom32.cfg")
        game.add_available_game_variable(vzd.GameVariable.KILLCOUNT)
        game.add_available_game_variable(vzd.GameVariable.AMMO2)
        game.add_available_game_variable(vzd.GameVariable.HEALTH)
        game.add_available_game_variable(vzd.GameVariable.AMMO3)
        game.add_available_game_variable(vzd.GameVariable.AMMO4)
        game.add_available_game_variable(vzd.GameVariable.AMMO5)
        game.add_available_game_variable(vzd.GameVariable.AMMO6)
        game.add_available_game_variable(vzd.GameVariable.SECRETCOUNT)
        game.add_available_game_variable(vzd.GameVariable.HITCOUNT)
        game.add_available_game_variable(vzd.GameVariable.HITS_TAKEN)
        game.add_available_game_variable(vzd.GameVariable.ITEMCOUNT)
        game.add_available_game_variable(vzd.GameVariable.ARMOR)

        game.new_episode()
        game_state = game.get_state()
        misc = game_state.game_variables  # [KILLCOUNT, AMMO, HEALTH]
        prev_misc = misc

        action_size = game.get_available_buttons_size()

        img_rows , img_cols = 30, 45
    # Convert image into Black and white
        img_channels = 4 # We stack 4 frames

    # C51
        num_atoms = 51

        state_size = (img_rows, img_cols, img_channels)
        agent = C51Agent(state_size, action_size, num_atoms)

        agent.model = value_distribution_network(state_size, num_atoms, action_size, agent.learning_rate)
        agent.target_model = value_distribution_network(state_size, num_atoms, action_size, agent.learning_rate)

        x_t = game_state.screen_buffer # 480 x 640
        x_t = preprocessImg(x_t, size=(img_rows, img_cols))
        s_t = np.stack(([x_t]*4), axis=2)    # It becomes 64x64x4
        s_t = np.expand_dims(s_t, axis=0) # 1x64x64x4

        is_terminated = game.is_episode_finished()

    # Start training
        maxGAME = 4
        epsilon = agent.initial_epsilon
        GAME = 0
        t = 0
        max_life = 0 # Maximum episode life (Proxy for agent performance)
        life = 0

    # Buffer to compute rolling statistics 
        life_buffer, pistol_buffer, kills_buffer, shotgun_buffer, minigun_buffer, plasma_buffer, rocket_buffer, secret_buffer, hit_buffer, damtaken_buffer, item_buffer, armor_buffer = [], [], [], [], [], [], [], [], [], [], [], []

        while not game.is_episode_finished():
            
            if GAME > maxGAME:
                finishing()
                
            loss = 0
            r_t = 0
            a_t = np.zeros([action_size])

        # Epsilon Greedy
            action_idx  = agent.get_action(s_t)
            a_t[action_idx] = 1

            a_t = a_t.astype(int)
            game.set_action(a_t.tolist())
            skiprate = agent.frame_per_action
            game.advance_action(skiprate)

            game_state = game.get_state()  # Observe again after we take the action
            is_terminated = game.is_episode_finished()

            r_t = game.get_last_reward()  #each frame we get reward of 0.1, so 4 frames will be 0.4

            if (is_terminated):
                if (life > max_life):
                    max_life = life
                GAME += 1
                life_buffer.append(life)
                pistol_buffer.append(misc[1])
                kills_buffer.append(misc[2])
                shotgun_buffer.append(misc[3])
                minigun_buffer.append(misc[4])
                plasma_buffer.append(misc[5])
                rocket_buffer.append(misc[6])
                secret_buffer.append(misc[7])
                hit_buffer.append(misc[8])
                damtaken_buffer.append(misc[9])
                item_buffer.append(misc[10]) 
                armor_buffer.append(misc[11])                
                print ("Episode Finish ", misc)
                game.new_episode()
                game_state = game.get_state()
                misc = game_state.game_variables
                x_t1 = game_state.screen_buffer

            x_t1 = game_state.screen_buffer
            misc = game_state.game_variables

            x_t1 = preprocessImg(x_t1, size=(img_rows, img_cols))
            x_t1 = np.reshape(x_t1, (1, img_rows, img_cols, 1))
            s_t1 = np.append(x_t1, s_t[:, :, :, :3], axis=3)

            r_t = agent.shape_reward(r_t, misc, prev_misc, t)

            if (is_terminated):
                life = 0
            else:
                life += 1

        #update the cache
            prev_misc = misc

        # save the sample <s, a, r, s'> to the replay memory and decrease epsilon
            agent.replay_memory(s_t, action_idx, r_t, s_t1, is_terminated, t)

        # Do the training
            if t > agent.observe and t % agent.timestep_per_train == 0:
                loss = agent.train_replay()

            s_t = s_t1
            t += 1

        # save progress every 10000 iterations
            if t % 10000 == 0:
                print("Now we save model")
                agent.model.save("/home/spillingvoid/Downloads/programs/Doom/models/c51_ddqn.h5")
                agent.model.save_weights("/home/spillingvoid/Downloads/programs/Doom/models/c51_ddqn.h5",
                                         overwrite=True)

        # print info
            state = ""
            if t <= agent.observe:
                state = "observe"
            elif t > agent.observe and t <= agent.observe + agent.explore:
                state = "explore"
            else:
                state = "train"

            if (is_terminated):
                print("TIME", t, "/ GAME", GAME, "/ STATE", state, \
                      "/ EPSILON", agent.epsilon, "/ ACTION", action_idx, "/ REWARD", r_t, \
                      "/ LIFE", max_life, "/ LOSS", loss)

            # Save Agent's Performance Statistics
                if GAME % agent.stats_window_size == 0 and t > agent.observe: 
                    print("Update Rolling Statistics")
                    agent.mavg_score.append(np.mean(np.array(life_buffer)))
                    agent.var_score.append(np.var(np.array(life_buffer)))
                    agent.mavg_pistol_left.append(np.mean(np.array(pistol_buffer)))
                    agent.mavg_kill_counts.append(np.mean(np.array(kills_buffer)))
                    agent.mavg_shotgun_left.append(np.mean(np.array(shotgun_buffer)))
                    agent.mavg_minigun_left.append(np.mean(np.array(minigun_buffer)))
                    agent.mavg_plasma_left.append(np.mean(np.array(plasma_buffer)))
                    agent.mavg_rocket_left.append(np.mean(np.array(rocket_buffer)))
                    agent.mavg_secret_left.append(np.mean(np.array(secret_buffer)))
                    agent.mavg_hit_count.append(np.mean(np.array(hit_buffer)))
                    agent.mavg_damage_given.append(np.mean(np.array(damtaken_buffer)))
                    agent.mavg_item_collected.append(np.mean(np.array(item_buffer)))
                    agent.mavg_armor.append(np.mean(np.array(armor_buffer)))
                # Reset rolling stats buffer
                    life_buffer, pistol_buffer, kills_buffer, shotgun_buffer, minigun_buffer, plasma_buffer, rocket_buffer, secret_buffer, hit_buffer, damtaken_buffer, item_buffer, armor_buffer = [], [], [], [], [], [], [], [], [], [], [], []

                # Write Rolling Statistics to file
                    with open("/home/spillingvoid/Downloads/programs/Doom/statistics/c51_ddqn_stats.txt", "a+") as stats_file:
                        stats_file.write('Game: ' + str(GAME) + '\n')
                        stats_file.write('Max Score: ' + str(max_life) + '\n')
                        stats_file.write('mavg_score: ' + str(agent.mavg_score) + '\n')
                        stats_file.write('var_score: ' + str(agent.var_score) + '\n')
                        stats_file.write('mavg_pistol_left: ' + str(agent.mavg_pistol_left) + '\n')
                        stats_file.write('mavg_kill_counts: ' + str(agent.mavg_kill_counts) + '\n')
                        stats_file.write('mavg_shotgun_left: ' + str(agent.mavg_shotgun_left) + '\n')
                        stats_file.write('mavg_minigun_left: ' + str(agent.mavg_minigun_left) + '\n')
                        stats_file.write('mavg_plasma_left: ' + str(agent.mavg_plasma_left) + '\n')
                        stats_file.write('mavg_rocket_left: ' + str(agent.mavg_rocket_left) + '\n')
                        stats_file.write('mavg_secret_left: ' + str(agent.mavg_secret_left) + '\n')
                        stats_file.write('mavg_hit_count: ' + str(agent.mavg_hit_count) + '\n')
                        stats_file.write('mavg_damage_given: ' + str(agent.mavg_damage_given) + '\n')
                        stats_file.write('mavg_item_collected: ' + str(agent.mavg_item_collected) + '\n')
                        stats_file.write('mavg_armor: ' + str(agent.mavg_armor) + '\n')
                
                if GAME >= agent.maxGAME:
                    game.close()
                    print("======================================")
                    print("Training is finished.")
                    agent.model.save("/home/spillingvoid/Downloads/programs/Doom/models/dueling_ddqn.h5", overwrite=True)

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 30, 45, 4)]  0                                            
__________________________________________________________________________________________________
conv2d (Conv2D)                 (None, 6, 10, 32)    8224        input_1[0][0]                    
__________________________________________________________________________________________________
conv2d_1 (Conv2D)               (None, 2, 4, 64)     32832       conv2d[0][0]                     
__________________________________________________________________________________________________
conv2d_2 (Conv2D)               (None, 1, 2, 64)     36928       conv2d_1[0][0]                   
______________________________________________________________________________________________

  img = skimage.color.rgb2gray(img)


4/4 - 2s - loss: 3.9223 - dense_1_loss: 0.9831 - dense_2_loss: 1.4744 - dense_3_loss: 1.4648
4/4 - 0s - loss: 3.9128 - dense_1_loss: 1.4698 - dense_2_loss: 1.2287 - dense_3_loss: 1.2143
4/4 - 0s - loss: 3.9225 - dense_1_loss: 1.4700 - dense_2_loss: 1.2287 - dense_3_loss: 1.2239
4/4 - 0s - loss: 3.9228 - dense_1_loss: 1.7208 - dense_2_loss: 1.2239 - dense_3_loss: 0.9782
4/4 - 0s - loss: 3.9282 - dense_1_loss: 1.7211 - dense_2_loss: 1.2239 - dense_3_loss: 0.9832
4/4 - 0s - loss: 3.9137 - dense_1_loss: 0.9838 - dense_2_loss: 1.4692 - dense_3_loss: 1.4607
4/4 - 0s - loss: 3.9338 - dense_1_loss: 1.4768 - dense_2_loss: 1.7191 - dense_3_loss: 0.7379
4/4 - 0s - loss: 3.9348 - dense_1_loss: 0.7390 - dense_2_loss: 1.9647 - dense_3_loss: 1.2311
4/4 - 0s - loss: 3.9361 - dense_1_loss: 0.7364 - dense_2_loss: 2.2131 - dense_3_loss: 0.9867
4/4 - 0s - loss: 3.9330 - dense_1_loss: 1.9610 - dense_2_loss: 1.2298 - dense_3_loss: 0.7422
Episode Finish  [  0.  42. 100.]
TIME 75 / GAME 1 / STATE explore / EP

SystemExit: 

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [None]:
print("Training complete")
endtime = time()
print(" Test Time elapsed: %.2f minutes" % ((endtime - teststart) / 60.0))