In [13]:
import numpy as np
from vizdoom import *
import tensorflow as tf
import cv2
from collections import deque
import matplotlib.pyplot as plt

In [14]:
def create_env():
    game = DoomGame()
    game.load_config('basic.cfg')
    game.set_doom_scenario_path('basic.wad')
    
    game.init()
    
    left = [1,0,0]
    right = [0,1,0]
    shoot = [0,0,1]
    possible_actions = [left, right, shoot]
    
    return game, possible_actions

In [15]:
game, possible_actions = create_env()
game.close()

In [16]:
def preprocess_frame(frame):
    
    cropped_frame = frame[220:-100, 100:-100]
    #plt.imshow(cropped_frame, cmap='gray')
    #plt.show()
    #exit()
    
    normalised_frame = cropped_frame/255.0
    
    preprocessed_frame = cv2.resize( normalised_frame, (84,84) )
    
    return preprocessed_frame

In [17]:
stack_size = 4

stacked_frames = deque( [ np.zeros( (84,84) , dtype = np.int ) for i in range(stack_size) ], maxlen = 4 )

def stack_frames( stacked_frames, state, is_new_episode ):
    
    frame = preprocess_frame(state)
    
    if is_new_episode:
        
        stacked_frames = deque( [np.zeros( (84,84) , dtype = np.int ) for i in range(stack_size) ], maxlen = 4 )
        
        for _ in range(4):
            stacked_frames.append(frame)
        
        stacked_state = np.stack( stacked_frames, axis=2 )
    
    else:
        stacked_frames.append(frame)
        stacked_state = np.stack( stacked_frames, axis = 2 )
        
    return stacked_state, stacked_frames

In [18]:
### MODEL HYPERPARAMETERS
state_size = [84,84,4]      # Our input is a stack of 4 frames hence 84x84x4 (Width, height, channels) 
action_size = game.get_available_buttons_size()              # 3 possible actions: left, right, shoot
learning_rate =  0.0002      # Alpha (aka learning rate)

### TRAINING HYPERPARAMETERS
total_episodes = 500        # Total episodes for training
max_steps = 100              # Max possible steps in an episode
batch_size = 64             

# Exploration parameters for epsilon greedy strategy
explore_start = 1.0            # exploration probability at start
explore_stop = 0.01            # minimum exploration probability 
decay_rate = 0.0001            # exponential decay rate for exploration prob

# Q learning hyperparameters
gamma = 0.95               # Discounting rate

### MEMORY HYPERPARAMETERS
pretrain_length = batch_size   # Number of experiences stored in the Memory when initialized for the first time
memory_size = 1000000          # Number of experiences the Memory can keep

### MODIFY THIS TO FALSE IF YOU JUST WANT TO SEE THE TRAINED AGENT
training = False

## TURN THIS TO TRUE IF YOU WANT TO RENDER THE ENVIRONMENT
episode_render = True

In [19]:
class DQNetwork:
    def __init__(self, state_size, action_size, learning_rate, name='DQNetwork'):
        self.state_size = state_size
        self.action_size = action_size
        self.learning_rate = learning_rate
        
        with tf.variable_scope(name):
            
            self.inputs = tf.placeholder( tf.float32, [None, 84,84,4], name='inputs' )
            self.actions_ = tf.placeholder( tf.float32, [None,3], name = 'actions_' )
            
            self.target_Q = tf.placeholder( tf.float32, [None], name = 'target' )
            
            ## First Conv Layer
            self.conv1 = tf.layers.conv2d( inputs=self.inputs, filters=32, kernel_size=[8,8], strides=[4,4], padding='VALID', kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(), name='conv1' )
            
            self.conv1_batchnorm = tf.layers.batch_normalization(self.conv1, epsilon=1e-5, training=True, name = 'batch_norm1' )
            
            self.conv1_out = tf.nn.elu( self.conv1_batchnorm, name = 'conv1_out' )
            
            ## Second Conv layer
            self.conv2 = tf.layers.conv2d( inputs=self.conv1_out, filters=64, kernel_size=[4,4], strides=[2,2], padding='VALID', kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(), name='conv2' )
            
            self.conv2_batchnorm = tf.layers.batch_normalization(self.conv2, epsilon=1e-5, training=True, name = 'batch_norm2' )
            
            self.conv2_out = tf.nn.elu( self.conv2_batchnorm, name = 'conv2_out' )
            
            ## Third Convolution
            self.conv3 = tf.layers.conv2d( inputs=self.conv2_out, filters=128, kernel_size=[4,4], strides=[2,2], padding='VALID', kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(), name='conv3' )
            
            self.conv3_batchnorm = tf.layers.batch_normalization(self.conv3, epsilon=1e-5, training=True, name = 'batch_norm3' )
            
            self.conv3_out = tf.nn.elu( self.conv3_batchnorm, name = 'conv3_out' )
            
            ## Flatten
            self.flatten = tf.layers.flatten( self.conv3_out )
            
            self.fc = tf.layers.dense( inputs= self.flatten, units=512, activation=tf.nn.relu, name = 'fc1', kernel_initializer=tf.contrib.layers.xavier_initializer() )
            
            self.output = tf.layers.dense( inputs= self.fc, units=3, kernel_initializer=tf.contrib.layers.xavier_initializer(), activation=None )
            
            self.Q = tf.reduce_sum( tf.multiply( self.output, self.actions_ ), axis=1 )
            
            self.loss = tf.reduce_mean( tf.square( self.target_Q - self.Q ) )
            self.optimiser = tf.train.RMSPropOptimizer( self.learning_rate ).minimize(self.loss)

In [20]:
tf.reset_default_graph()

DQNetwork = DQNetwork( state_size, action_size, learning_rate )

In [23]:
saver = tf.train.Saver()

with tf.Session() as sess:
    
    game, possible_actions = create_env()
    
    totalScore = 0
    
   
    # Load the model
    saver.restore(sess, "./model/model.ckpt")
    game.init()
    for i in range(5):
        
        game.new_episode()
        start = True
        
        while not game.is_episode_finished():
            frame = game.get_state().screen_buffer
            state, _ = stack_frames(stacked_frames, frame, start)
            #break
            # Take the biggest Q value (= the best action)
            Qs = sess.run(DQNetwork.output, feed_dict = {DQNetwork.inputs: state.reshape((1, 84,84,4))})
            action = np.argmax(Qs)
            action = possible_actions[int(action)]
            game.make_action(action)        
            score = game.get_total_reward()
            start = False
            
        print("Score: ", score)
        totalScore += score
    print("TOTAL_SCORE", totalScore/100.0)
    game.close()

INFO:tensorflow:Restoring parameters from ./model/model.ckpt
('Score: ', 51.0)
('Score: ', 95.0)
('Score: ', 32.0)
('Score: ', 33.0)
('Score: ', 17.0)
('TOTAL_SCORE', 2.28)


In [22]:
game.close()