In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# import operator

# M. Kempka, T.Sternal, M.Wydmuch, Z.Boztoprak
# January 2021
import sys
import os
import cv2
import numpy as np
import vizdoom as vzd
import itertools as it

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import Model
from tensorflow.keras import models
from tensorflow.keras.optimizers import SGD
from tensorflow.keras import layers

In [None]:
import skimage.color, skimage.transform

from tqdm import trange
from random import sample
from time import time, sleep
from collections import deque
#import tensorflow_datasets as tfds
#import resnet
#os.environ["TF_MIN_GPU_MULTIPROCESSOR_COUNT"]="2"
#os.environ["CUDA_VISIBLE_DEVICES"]="0,1"
# Explicitly request eager execution; the trailing comment keeps the
# graph-mode alternative at hand for experiments.
tf.compat.v1.enable_eager_execution() #tf.compat.v1.disable_eager_execution() 
# Last expression of the cell: the notebook displays whether eager mode is active.
tf.executing_eagerly()

In [None]:
# Ask TensorFlow to log which device each operation is placed on.
# NOTE(review): this is very verbose during training - presumably a debugging aid.
tf.debugging.set_log_device_placement(True) 

In [None]:
# Cell output: whether the installed TensorFlow build has CUDA support.
tf.test.is_built_with_cuda()

In [None]:
from tensorflow.python.client.device_lib import list_local_devices

# Configure on-demand GPU memory allocation FIRST. Memory growth has to be set
# before anything touches the GPUs, so it must precede the device enumeration
# below.
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
for device in gpu_devices:
    tf.config.experimental.set_memory_growth(device, True)

# Full device inventory (CPU and GPU), kept for interactive inspection.
devices = list_local_devices()

In [None]:
# Distribution strategy object; used below for the replica count and by
# DQNAgent.distributed_train_step.
mirrored_strategy = tf.distribute.MirroredStrategy()

In [None]:
# Number of replicas the strategy keeps in sync, printed for reference.
num_replicas = mirrored_strategy.num_replicas_in_sync
print("Num_replicas:", num_replicas)

In [None]:
# Q-learning settings
learning_rate = 0.00000025  # step size passed to the SGD optimizer in DQNAgent
discount_factor = 0.99  # weight of the next-state Q-value in the TD target
replay_memory_size = 1000000  # maxlen of the replay deque
num_train_epochs = 2 # outer loop count in run(); 256 is noted as the alternative value
learning_steps_per_epoch = 32 # environment steps per epoch in run(); alternative value: 2048
target_net_update_steps =  4 # target net is synced every this many steps; alternative value: 1024

# NN learning settings
#BUFFER = len(extractDigits(*argv))
batch_size = 4  # transitions drawn per training step (see get_samples)
# NOTE(review): the next three constants are not referenced elsewhere in the visible notebook.
BATCH_SIZE_PER_REPLICA = 2
GLOBALBATCH_SIZE = BATCH_SIZE_PER_REPLICA * mirrored_strategy.num_replicas_in_sync
EPOCHS = 8
# Read as `epoch % 2` by DQNAgent.distributed_train_step; never updated in the visible code.
epoch = 8
#x = [0]
K = keras.backend  # NOTE(review): unused in the visible notebook
# Training regime
test_episodes_per_epoch = 4  # episodes played by test() after every training epoch

# Other parameters
frames_per_action = 3  # game tics each chosen action is repeated for
resolution = (45, 30)  # passed to cv2.resize as its target size
shape = (1, 30, 45, 2)  # NOTE(review): unused in the visible notebook
episodes_to_watch = 2  # episodes replayed in the final "watch" section
num_action = 3
# Default size of the agent's action set; the main cell builds the real action
# list from every on/off combination of the game's available buttons.
num_actions = 2 ** num_action

# Run-mode switches read by DQNAgent and the main cell.
save_model = True
load = False
skip_learning = False
watch = True
#dist_training = False

In [None]:
# Configuration file path
# NOTE(review): all paths below are absolute and machine-specific; adjust them
# before running on another machine.
config_file_path = "/home/spillingvoid/Downloads/programs/ViZDoom/scenarios/rocket_basic.cfg"  # loaded by initialize_game()
model_savefolder = "/home/spillingvoid/Downloads/programs/ac/model"  # where the trained network is saved / loaded from
model_weight = "/home/spillingvoid/Downloads/programs/ac/weights"
# The remaining locations are not referenced elsewhere in the visible notebook.
model_callback = "/home/spillingvoid/Downloads/programs/ac/callback"
model_statistics = "/home/spillingvoid/Downloads/programs/ac/statistics"
videos = "/home/spillingvoid/Downloads/programs/ac/statistics"
plots = "/home/spillingvoid/Downloads/programs/ac/statistics"
checkpoint_dir = '/trainingcheckpoints'  # NOTE(review): this is a directory at the filesystem root - confirm intended
# The "{epoch}" placeholder is left unformatted here.
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

In [None]:
# Select the device the main cell pins its work to: the first GPU when at
# least one is visible, otherwise the CPU. The check is `> 0` so that a
# single-GPU machine is not reported as having no GPU.
if len(tf.config.experimental.list_physical_devices('GPU')) > 0:
    print("GPU available")
    DEVICE = "/gpu:0"
else:
    print("No GPU available")
    DEVICE = "/cpu:0"


# Wall-clock reference used for the elapsed-time report after training.
teststart = time()

In [None]:
def preprocess(img):
    """Turn a raw screen buffer into the tensor the network consumes.

    The frame is resized to ``resolution``, bit-inverted, shown in an OpenCV
    preview window, converted to float32 with a trailing axis of size one and
    divided by 255.

    Args:
        img: image array accepted by ``cv2.resize`` (presumably the uint8
            screen buffer of the game state - confirm against the caller).

    Returns:
        A TensorFlow tensor holding the processed frame.
    """
    small = cv2.bitwise_not(cv2.resize(img, resolution))

    # Debug preview: enlarged copy, shown for up to 250 ms per frame.
    cv2.imshow('img', cv2.resize(small, (320,220)))
    cv2.waitKey(250)

    frame = np.expand_dims(small.astype(np.float32), axis=-1) / 255
    print("Image dimensions:", frame.shape)

    return tf.stack(frame)


In [None]:
def initialize_game():
    """Create, configure and start the ViZDoom game instance.

    The scenario file at ``config_file_path`` is loaded first and a handful of
    settings are then applied on top of it before the game is initialised.

    Returns:
        The initialised ``vzd.DoomGame`` object.
    """
    print("Initializing doom...")
    doom = vzd.DoomGame()
    doom.load_config(config_file_path)

    # Settings applied after the scenario config, in this exact order.
    overrides = (
        (doom.set_window_visible, True),
        (doom.set_mode, vzd.Mode.PLAYER),
        (doom.set_screen_format, vzd.ScreenFormat.BGR24),
        (doom.set_screen_resolution, vzd.ScreenResolution.RES_640X480),
    )
    for apply_setting, value in overrides:
        apply_setting(value)

    doom.get_available_buttons_size()  # return value is discarded here
    doom.init()
    print("Doom initialized.")

    return doom



In [None]:

class DQNAgent:
    """Epsilon-greedy agent that trains an online network against a target network.

    Attributes:
        epsilon: current probability of picking a random action.
        epsilon_min: floor that ``epsilon`` is decayed towards.
        epsilon_decay: multiplicative decay applied after every training step.
        discount_factor: weight of the next-state Q-value in the TD target.
        num_actions: size of the discrete action set.
        optimizer: SGD optimizer that updates the online network.
        dqn: online Q-network (freshly built, or loaded when ``load`` is true).
        target_net: network that evaluates the next-state actions.
    """

    def __init__(self, num_actions=num_actions, epsilon=1, epsilon_min=0.1, epsilon_decay=0.9999, load=load):
        """Build the agent.

        Args:
            num_actions: number of discrete actions the networks output.
            epsilon: initial exploration probability.
            epsilon_min: lower bound for ``epsilon``.
            epsilon_decay: factor ``epsilon`` is multiplied by per training step.
            load: when true, the online network is read from
                ``model_savefolder`` instead of being created from scratch.
        """
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.discount_factor = discount_factor
        self.num_actions = num_actions
        self.optimizer = SGD(learning_rate)

        # Only touch the saved model when loading was actually requested;
        # otherwise a fresh run would fail if no saved model exists yet.
        if load:
            print("Loading model from: ", model_savefolder) 
            self.dqn = tf.keras.models.load_model(model_savefolder)
        else:
            self.dqn = DQN(self.num_actions)

        # The target network is required by update_target_net/train_dqn in
        # both cases, so it is always created.
        self.target_net = DQN(self.num_actions)

    def update_target_net(self):
        """Copy the online network's weights into the target network.

        The copy is skipped while the two networks do not yet hold the same
        number of weight tensors (a subclassed Keras model only creates its
        variables on its first forward pass); the next call after both have
        been run performs the sync.
        """
        if len(self.dqn.weights) != len(self.target_net.weights):
            return
        self.target_net.set_weights(self.dqn.get_weights())

    def choose_action(self, state):
        """Pick an action index for ``state`` using the epsilon-greedy rule.

        Args:
            state: a single preprocessed frame without a batch axis
                (as returned by ``preprocess``).

        Returns:
            Index of the chosen action: the online network's best action with
            probability ``1 - epsilon``, a uniformly random one otherwise.
        """
        if self.epsilon < np.random.uniform(0,1):
            # Add the batch axis instead of reshaping to a hard-coded shape,
            # so the input matches the batches stacked in train_dqn.
            q_values = self.dqn(tf.expand_dims(state, axis=0))
            action = int(tf.argmax(q_values, axis=1)[0])
        else:
            action = np.random.choice(range(self.num_actions), 1)[0]

        return action

    def train_dqn(self, samples):
        """Run one gradient step on a batch of replay transitions.

        Args:
            samples: sequence of ``(screen, action, reward, next_screen, done)``
                tuples.

        Returns:
            The mean-squared TD error of the batch.
        """
        screen_buf, actions, rewards, next_screen_buf, dones = split_tuple(samples)

        row_ids = list(range(screen_buf.shape[0]))
        ids = extractDigits(row_ids, actions)          # [row, taken action] pairs
        done_ids = extractDigits(np.where(dones)[0])   # rows of terminal transitions

        with tf.GradientTape(persistent=False) as tape:
            tape.watch(self.dqn.trainable_variables)

            Q_prev = tf.gather_nd(self.dqn(screen_buf), ids)

            # The online network selects the next action and the target
            # network evaluates it. `self.dqn` is used rather than a global
            # `agent` so the method works on any instance.
            best_next_action = tf.argmax(self.dqn(next_screen_buf), axis=1)
            Q_next = tf.gather_nd(self.target_net(next_screen_buf),
                                  extractDigits(row_ids, best_next_action))
            q_target = rewards + self.discount_factor * Q_next

            # Terminal transitions have no successor: their target is the
            # plain reward.
            if len(done_ids)>0:
                done_rewards = tf.gather_nd(rewards, done_ids)
                q_target = tf.tensor_scatter_nd_update(tensor=q_target,
                                indices=done_ids, updates=done_rewards)

            td_error = tf.keras.losses.MSE(q_target, Q_prev)

        gradients = tape.gradient(td_error, self.dqn.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.dqn.trainable_variables),
                                       experimental_aggregate_gradients=True)

        # Decay exploration after every update. This has to run before the
        # return, otherwise epsilon never changes.
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
        else:
            self.epsilon = self.epsilon_min

        return td_error

    #@tf.function
    def distributed_train_step(self, samples):
        """Run ``train_dqn`` under ``mirrored_strategy`` and sum the replica losses.

        Args:
            samples: batch of transitions forwarded to ``train_dqn``.

        Returns:
            The per-replica losses reduced with ``ReduceOp.SUM``. The value is
            always returned, so callers never receive an implicit ``None``.
        """
        per_replica_losses = mirrored_strategy.run(self.train_dqn, args=(samples,))
        return mirrored_strategy.reduce(tf.distribute.ReduceOp.SUM,
                                        per_replica_losses, axis=None)
        

In [None]:
def split_tuple(samples):
    """Split a batch of replay transitions into one container per field.

    Args:
        samples: sequence of 5-tuples
            ``(screen, action, reward, next_screen, done)``.

    Returns:
        ``(screen_buf, actions, rewards, next_screen_buf, dones)``. Every field
        except ``actions`` is stacked into a tensor; ``actions`` is returned
        as the raw object-array column.
    """
    table = np.array(samples, dtype=object)

    def column(k):
        # k-th field of every transition
        return table[:, k]

    screens = tf.stack(column(0))
    taken = column(1)
    gains = tf.stack(column(2))
    followups = tf.stack(column(3))
    finished = tf.stack(column(4))
    return screens, taken, gains, followups, finished

In [None]:

def extractDigits(*argv):
    """Wrap index sequences into nested index lists.

    With a single iterable every element ``x`` becomes ``[x]``. With two (or
    more) iterables the first two are paired element-wise into ``[x, y]``
    lists, stopping at the shorter one; further arguments are ignored.

    Returns:
        A list of one- or two-element lists.
    """
    if len(argv) == 1:
        return [[item] for item in argv[0]]

    return [[left, right] for left, right in zip(argv[0], argv[1])]

In [None]:

def get_samples(memory):
    """Draw a random batch of transitions from the replay memory.

    Args:
        memory: the replay buffer to sample from.

    Returns:
        A list of ``batch_size`` distinct entries, or of every entry when the
        memory holds fewer than ``batch_size`` items.
    """
    # Never ask for more items than the memory currently holds.
    return sample(memory, min(len(memory), batch_size))

In [None]:
def run(agent, game, replay_memory):
    """Train ``agent`` on ``game`` for ``num_train_epochs`` epochs.

    Every epoch plays ``learning_steps_per_epoch`` environment steps, stores
    each transition in ``replay_memory``, trains on a sampled batch once more
    than ``batch_size`` steps have elapsed in the epoch, and syncs the target
    network every ``target_net_update_steps`` steps. After each epoch the
    agent is evaluated with ``test``.

    Args:
        agent: object providing ``choose_action``, ``train_dqn`` and
            ``update_target_net``.
        game: initialised game instance.
        replay_memory: container with ``append`` that stores the transitions.
    """
    time_start = time()

    for episode in range(num_train_epochs):
        train_scores = []
        print("\nEpoch %d\n-------" % (episode + 1))

        game.new_episode()

        for i in trange(learning_steps_per_epoch, leave=False):
            state = game.get_state()
            screen_buf = preprocess(state.screen_buffer)
            action = agent.choose_action(screen_buf)
            reward = game.make_action(actions[action], frames_per_action)
            done = game.is_episode_finished()

            if not done:
                next_screen_buf = preprocess(game.get_state().screen_buffer)
            else:
                # No next state exists at the end of an episode; use a blank frame.
                next_screen_buf = tf.zeros(shape=screen_buf.shape)

            if done:
                train_scores.append(game.get_total_reward())

                game.new_episode()

            replay_memory.append((screen_buf, action, reward, next_screen_buf, done))

            if i >= batch_size:
                agent.train_dqn(get_samples(replay_memory)) #agent.distributed_train_step(get_samples(replay_memory)) 

            if ((i % target_net_update_steps) == 0):
                agent.update_target_net()

        # An epoch can end without a single finished episode; computing
        # mean/std of an empty array would only produce nan and warnings.
        if train_scores:
            train_scores = np.array(train_scores)
            print("Results: mean: %.1f±%.1f," % (train_scores.mean(), train_scores.std()))
        else:
            print("Results: no episode finished during this epoch.")

        test(test_episodes_per_epoch, game, agent)
        print("Total elapsed time: %.2f minutes" % ((time() - time_start) / 60.0))

In [None]:
def test(test_episodes_per_epoch, game, agent):
    """Play evaluation episodes and print the score statistics.

    Actions come from ``agent.choose_action``.
    NOTE(review): that method is epsilon-greedy, so the evaluation still
    contains random actions while ``agent.epsilon`` is high - confirm this is
    intended.

    Args:
        test_episodes_per_epoch: number of episodes to play.
        game: initialised game instance.
        agent: object providing ``choose_action``.
    """
    test_scores = []
    last_reward = None
    print("\nTesting...")
    for test_episode in trange(test_episodes_per_epoch, leave=False):
        game.new_episode()
        while not game.is_episode_finished():
            state = preprocess(game.get_state().screen_buffer)
            best_action_index = agent.choose_action(state)
            game.make_action(actions[best_action_index], frames_per_action)

        last_reward = game.get_total_reward()
        test_scores.append(last_reward)

    # With zero episodes there is nothing to summarise: avoid nan statistics
    # and the reference to a reward that was never assigned.
    if not test_scores:
        print("Results: no test episodes were run.")
        return

    test_scores = np.array(test_scores)
    print("Results: mean: %.1f±%.1f," % (
            test_scores.mean(), test_scores.std()))
    print("reward", last_reward)

In [None]:
class DQN(tf.keras.Model):
    """Dueling Q-network: a Conv3D feature stack feeding value and advantage heads.

    The forward pass produces one Q-value per action by combining a scalar
    state value with mean-centred per-action advantages.
    """

    def __init__(self, num_actions):
        """Create the layers.

        Args:
            num_actions: number of Q-values (one per discrete action) to output.
        """
        super(DQN, self).__init__()
        # NOTE(review): `input_shape` is kept as in the original; confirm it
        # still describes the frames actually fed to the network.
        self.conv1 = tf.keras.layers.Conv3D(1024, kernel_size=10, strides=4, input_shape=(1, 30, 45), activation ="elu", padding="same")
        self.norm1 = tf.keras.layers.BatchNormalization()
        self.conv3 = tf.keras.layers.Conv3D(264, kernel_size=8, strides=3, activation ="elu", padding="same")
        self.drop1 = tf.keras.layers.Dropout(0.4)
        self.conv5 = tf.keras.layers.Conv3D(2048, kernel_size=3, strides=2, activation ="elu", padding="same") #input_shape=(9, 14, 8)
        self.drop2 = tf.keras.layers.Dropout(0.5)
        self.conv2 = tf.keras.layers.Conv3D(512, kernel_size=2, strides=1, activation ="elu", padding="same")

        self.elu   = tf.keras.layers.ELU()

        self.flatten = tf.keras.layers.Flatten()
        self.hidden1 = tf.keras.layers.Dense(256, activation="elu")
        # Linear output: a softmax over a single unit always yields 1.0,
        # which would pin the state value to a constant.
        self.state_value = tf.keras.layers.Dense(1)
        self.advantage = tf.keras.layers.Dense(num_actions)


    @tf.function(experimental_relax_shapes=True)
    def call(self, x, training=True):
        """Compute the Q-values for a batch of frames.

        Args:
            x: batch of preprocessed frames.
            training: Keras training flag.
                NOTE(review): it defaults to True and is not forwarded
                explicitly to the dropout / batch-norm layers - confirm they
                behave as intended during action selection.

        Returns:
            Tensor of shape ``(batch, num_actions)`` holding the Q-values.
        """
        x = self.conv1(x)
        x = self.norm1(x)
        x = self.conv3(x)
        x = self.drop1(x)
        x = self.conv5(x)
        x = self.drop2(x)
        x = self.conv2(x)
        x = self.elu(x)
        x = self.flatten(x)
        x = self.hidden1(x)

        # Split the 256 hidden features: the first 96 feed the value head,
        # the remaining ones feed the advantage head.
        x1 = x[:, :96]
        x2 = x[:, 96:]
        x1 = self.state_value(x1)
        x2 = self.advantage(x2)

        # Dueling combination: Q = V + (A - mean(A)).
        x = x1 + (x2 - tf.reshape(tf.math.reduce_mean(x2, axis=1), shape=(-1,1))) #Q_values
        return x


In [None]:
# Entry point: train the agent, optionally save it, then optionally watch it play.
# The training and watching parts live in ONE cell because both sit inside the
# same `with tf.device(...)` block; split across cells, the second part starts
# with indented code and cannot run on its own.
if __name__ == '__main__':
    maxlen=replay_memory_size
    game = initialize_game()

    # One action per on/off combination of the buttons the scenario exposes.
    n = game.get_available_buttons_size()
    actions = [list(a) for a in it.product([0, 1], repeat=n)]

    # Size the network from the real action list so the agent can never
    # produce an index outside `actions`.
    agent = DQNAgent(num_actions=len(actions))
    replay_memory = deque(maxlen=maxlen)

    with tf.device(DEVICE):
        if not skip_learning:
            print("Starting the training!")

            run(agent, game, replay_memory)

            game.close()
            print("======================================")
            print("Training is finished.")

            if save_model:
                agent.dqn.save(model_savefolder)
                testend = time()
                print(" Test Time elapsed: %.2f minutes" % ((testend - teststart) / 60.0))

        if watch:
            # Close first so the settings below apply to a fresh init, also
            # when training was skipped and the game is still running. After
            # training this repeats the close above, as the original code did.
            game.close()
            game.set_window_visible(True)
            game.set_mode(vzd.Mode.ASYNC_PLAYER)
            game.init()

            for _ in range(episodes_to_watch):
                game.new_episode()
                while not game.is_episode_finished():
                    state = preprocess(game.get_state().screen_buffer)
                    best_action_index = agent.choose_action(state)

                    # Instead of make_action(a, frame_repeat) in order to make the animation smooth
                    game.set_action(actions[best_action_index])
                    for _ in range(frames_per_action):
                        game.advance_action()

                # Sleep between episodes
                sleep(1.0)
                score = game.get_total_reward()
                print("Total score: ", score)

            game.close()