In [16]:
import socket
from envVMWM import *
exe_location='C:\\Users\\YuHang\\Desktop\\Water_Maze\\v0.18\\VMWM.exe'
cfg_location = 'C:\\Users\\YuHang\\Desktop\\Water_Maze\\v0.18\\VMWM_data\\configuration_original.txt'


import threading
import multiprocessing
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow.contrib.slim as slim
import scipy.signal
%matplotlib inline
from helper import *

from random import choice
from time import sleep
from time import time

## Helper function

In [17]:
# Copies one set of variables to another.
# Used to set worker network parameters to those of global network.
def update_target_graph(from_scope,to_scope):
    from_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, from_scope)
    to_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, to_scope)

    op_holder = []
    for from_var,to_var in zip(from_vars,to_vars):
        op_holder.append(to_var.assign(from_var))
    return op_holder

# Processes Doom screen image to produce cropped and resized image. 
def process_frame(frame):
    s = frame[10:-10,30:-30]
    s = scipy.misc.imresize(s,[84,84])
    s = np.reshape(s,[np.prod(s.shape)]) / 255.0
    return s

# Discounting function used to calculate discounted returns.
def discount(x, gamma):
    return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1]

#Used to initialize weights for policy and value output layers
def normalized_columns_initializer(std=1.0):
    def _initializer(shape, dtype=None, partition_info=None):
        out = np.random.randn(*shape).astype(np.float32)
        out *= std / np.sqrt(np.square(out).sum(axis=0, keepdims=True))
        return tf.constant(out)
    return _initializer

## Actor-Critic Network

In [18]:
class AC_Network():
    def __init__(self,s_size,a_size,scope,trainer):
        with tf.variable_scope(scope):
            #Input and visual encoding layers
            self.inputs = tf.placeholder(shape=[None,s_size],dtype=tf.float32)
            self.imageIn = tf.reshape(self.inputs,shape=[-1,84,84,1])
            self.conv1 = slim.conv2d(activation_fn=tf.nn.elu,
                inputs=self.imageIn,num_outputs=16,
                kernel_size=[8,8],stride=[2,2],padding='VALID')
            self.conv2 = slim.conv2d(activation_fn=tf.nn.elu,
                inputs=self.conv1,num_outputs=32,
                kernel_size=[4,4],stride=[2,2],padding='VALID')
            hidden = slim.fully_connected(slim.flatten(self.conv2),256,activation_fn=tf.nn.elu)
            
            #Recurrent network for temporal dependencies
            lstm_cell = tf.contrib.rnn.LayerNormBasicLSTMCell(256,dropout_keep_prob=0.5)
            #lstm_cell = tf.contrib.rnn.DropoutWrapper(lstm_cell,output_keep_prob=0.5)
            c_init = np.zeros((1, lstm_cell.state_size.c), np.float32)
            h_init = np.zeros((1, lstm_cell.state_size.h), np.float32)
            self.state_init = [c_init, h_init]
            c_in = tf.placeholder(tf.float32, [1, lstm_cell.state_size.c])
            h_in = tf.placeholder(tf.float32, [1, lstm_cell.state_size.h])
            self.state_in = (c_in, h_in)
            rnn_in = tf.expand_dims(hidden, [0])
            step_size = tf.shape(self.imageIn)[:1]
            state_in = tf.contrib.rnn.LSTMStateTuple(c_in, h_in)
            lstm_outputs, lstm_state = tf.nn.dynamic_rnn(
                lstm_cell, rnn_in, initial_state=state_in, sequence_length=step_size,
                time_major=False)
            lstm_c, lstm_h = lstm_state
            self.state_out = (lstm_c[:1, :], lstm_h[:1, :])
            rnn_out = tf.reshape(lstm_outputs, [-1, 256])
            
            #Output layers for policy and value estimations
            self.policy = slim.fully_connected(rnn_out,a_size,
                activation_fn=tf.nn.softmax,
                weights_initializer=normalized_columns_initializer(0.01),
                biases_initializer=None)
            self.value = slim.fully_connected(rnn_out,1,
                activation_fn=None,
                weights_initializer=normalized_columns_initializer(1.0),
                biases_initializer=None)
            
            #Only the worker network need ops for loss functions and gradient updating.
            if scope != 'global':
                self.actions = tf.placeholder(shape=[None],dtype=tf.int32)
                self.actions_onehot = tf.one_hot(self.actions,a_size,dtype=tf.float32)
                self.target_v = tf.placeholder(shape=[None],dtype=tf.float32)
                self.advantages = tf.placeholder(shape=[None],dtype=tf.float32)

                self.responsible_outputs = tf.reduce_sum(self.policy * self.actions_onehot, [1])

                #Loss functions
                self.value_loss = 0.5 * tf.reduce_sum(tf.square(self.target_v - tf.reshape(self.value,[-1])))
                self.entropy = - tf.reduce_sum(self.policy * tf.log(self.policy))
                self.policy_loss = -tf.reduce_sum(tf.log(self.responsible_outputs)*self.advantages)
                self.loss = 0.5 * self.value_loss + self.policy_loss - self.entropy * 0.01

                #Get gradients from local network using local losses
                local_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope)
                self.gradients = tf.gradients(self.loss,local_vars)
                self.var_norms = tf.global_norm(local_vars)
                grads,self.grad_norms = tf.clip_by_global_norm(self.gradients,40.0)
                
                #Apply local gradients to global network
                global_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'global')
                self.apply_grads = trainer.apply_gradients(zip(grads,global_vars))

## My Worker

In [19]:
"""class Worker():

    def __init__(self,name):
        self.name = "worker_" + str(name)
        self.number = name
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        sock.bind(('127.0.0.1', 0))
        port = sock.getsockname()[1]
        print(sock.getsockname())
        #sock.shutdown(socket.SHUT_WR)
        self.env = VMWMGame(cfg_location,exe_location)
        self.env.reset_cfg()
        self.env.set_trial('Practice - Hills')
        self.env.set_local_host('127.0.0.1', port) # local host IP address & dynamic allocated port 
    
    def start(self):
        self.env.start(setting=1)
        
    def work(self):
        self.env.start_trial()
        sleep(0.5)
        #print('OK')
        for i in range(20):
            #print('In action')
            while(not self.env.is_episode_finished()):
                #cv2.imshow('frame', env.get_screenImage())
                #cv2.waitKey(1)
                self.env.make_action(2,50)
                #print(env.get_reward())
                #print(env.get_score())
            self.env.new_episode()
            
        self.env.end_trial()
        self.env.s.close()"""

'class Worker():\n\n    def __init__(self,name):\n        self.name = "worker_" + str(name)\n        self.number = name\n        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)\n        sock.bind((\'127.0.0.1\', 0))\n        port = sock.getsockname()[1]\n        print(sock.getsockname())\n        #sock.shutdown(socket.SHUT_WR)\n        self.env = VMWMGame(cfg_location,exe_location)\n        self.env.reset_cfg()\n        self.env.set_trial(\'Practice - Hills\')\n        self.env.set_local_host(\'127.0.0.1\', port) # local host IP address & dynamic allocated port \n    \n    def start(self):\n        self.env.start(setting=1)\n        \n    def work(self):\n        self.env.start_trial()\n        sleep(0.5)\n        #print(\'OK\')\n        for i in range(20):\n            #print(\'In action\')\n            while(not self.env.is_episode_finished()):\n                #cv2.imshow(\'frame\', env.get_screenImage())\n                #cv2.waitKey(1)\n                self.env.make_actio

## VizDoom Worker

In [20]:
class Worker():
    def __init__(self,name,s_size,a_size,trainer,model_path,global_episodes):
        self.name = "worker_" + str(name)
        self.number = name        
        self.model_path = model_path
        self.trainer = trainer
        self.global_episodes = global_episodes
        self.increment = self.global_episodes.assign_add(1)
        self.episode_rewards = []
        self.episode_lengths = []
        self.episode_mean_values = []
        self.summary_writer = tf.summary.FileWriter("train_"+str(self.number))

        #Create the local copy of the network and the tensorflow op to copy global paramters to local network
        self.local_AC = AC_Network(s_size,a_size,self.name,trainer)
        self.update_local_ops = update_target_graph('global',self.name)        
        
        #Set up actions
        self.actions = np.identity(a_size,dtype=bool).tolist()
        
        #Set up VMWM env
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        sock.bind(('127.0.0.1', 0))
        port = sock.getsockname()[1]
        print(sock.getsockname())
        #sock.shutdown(socket.SHUT_WR)
        self.env = VMWMGame(cfg_location,exe_location)
        self.env.reset_cfg()
        self.env.set_trial('Practice - Hills')
        self.env.set_local_host('127.0.0.1', port) # local host IP address & dynamic allocated port 
        
    def start(self,setting=0):
        self.env.start()
        
    def train(self,rollout,sess,gamma,bootstrap_value):
        rollout = np.array(rollout)
        observations = rollout[:,0]
        actions = rollout[:,1]
        rewards = rollout[:,2]
        next_observations = rollout[:,3]
        values = rollout[:,5]
        
        # Here we take the rewards and values from the rollout, and use them to 
        # generate the advantage and discounted returns. 
        # The advantage function uses "Generalized Advantage Estimation"
        self.rewards_plus = np.asarray(rewards.tolist() + [bootstrap_value])
        discounted_rewards = discount(self.rewards_plus,gamma)[:-1]
        self.value_plus = np.asarray(values.tolist() + [bootstrap_value])
        advantages = rewards + gamma * self.value_plus[1:] - self.value_plus[:-1]
        advantages = discount(advantages,gamma)

        # Update the global network using gradients from loss
        # Generate network statistics to periodically save
        rnn_state = self.local_AC.state_init
        feed_dict = {self.local_AC.target_v:discounted_rewards,
            self.local_AC.inputs:np.vstack(observations),
            self.local_AC.actions:actions,
            self.local_AC.advantages:advantages,
            self.local_AC.state_in[0]:rnn_state[0],
            self.local_AC.state_in[1]:rnn_state[1]}
        v_l,p_l,e_l,g_n,v_n,_ = sess.run([self.local_AC.value_loss,
            self.local_AC.policy_loss,
            self.local_AC.entropy,
            self.local_AC.grad_norms,
            self.local_AC.var_norms,
            self.local_AC.apply_grads],
            feed_dict=feed_dict)
        return v_l / len(rollout),p_l / len(rollout),e_l / len(rollout), g_n,v_n
        
    def work(self,max_episode_length,gamma,sess,coord,saver):
        episode_count = sess.run(self.global_episodes)
        total_steps = 0
        print ("Starting worker " + str(self.number))
        with sess.as_default(), sess.graph.as_default():                 
            while not coord.should_stop():
                
                sess.run(self.update_local_ops)
                episode_buffer = []
                episode_values = []
                episode_frames = []
                episode_reward = 0
                episode_step_count = 0
                d = False
                
                self.env.start_trial()
                sleep(0.5)
                
                s = self.env.get_screenImage()
                episode_frames.append(s)
                s = process_frame(s)
                rnn_state = self.local_AC.state_init
                
                while self.env.is_episode_finished() == False:
                    #Take an action using probabilities from policy network output.
                    a_dist,v,rnn_state = sess.run([self.local_AC.policy,self.local_AC.value,self.local_AC.state_out], 
                        feed_dict={self.local_AC.inputs:[s],
                        self.local_AC.state_in[0]:rnn_state[0],
                        self.local_AC.state_in[1]:rnn_state[1]})
                    a = np.random.choice(a_dist[0],p=a_dist[0])
                    a = np.argmax(a_dist == a)
                    
                    self.env.make_action(a,100)
                    r = self.env.get_reward()
                    d = self.env.is_episode_finished()
                    if d == False:
                        s1 = self.env.get_screenImage()
                        episode_frames.append(s1)
                        s1 = process_frame(s1)
                    else:
                        s1 = s
                        
                    episode_buffer.append([s,a,r,s1,d,v[0,0]])
                    episode_values.append(v[0,0])

                    episode_reward += r
                    s = s1                
                    total_steps += 1
                    episode_step_count += 1
                    
                    # If the episode hasn't ended, but the experience buffer is full, then we
                    # make an update step using that experience rollout.
                    if len(episode_buffer) == 30 and d != True and episode_step_count != max_episode_length - 1:
                        # Since we don't know what the true final return is, we "bootstrap" from our current
                        # value estimation.
                        v1 = sess.run(self.local_AC.value, 
                            feed_dict={self.local_AC.inputs:[s],
                            self.local_AC.state_in[0]:rnn_state[0],
                            self.local_AC.state_in[1]:rnn_state[1]})[0,0]
                        v_l,p_l,e_l,g_n,v_n = self.train(episode_buffer,sess,gamma,v1)
                        episode_buffer = []
                        sess.run(self.update_local_ops)
                    if d == True:
                        break
                                            
                self.episode_rewards.append(episode_reward)
                self.episode_lengths.append(episode_step_count)
                self.episode_mean_values.append(np.mean(episode_values))
                
                # Update the network using the experience buffer at the end of the episode.
                if len(episode_buffer) != 0:
                    v_l,p_l,e_l,g_n,v_n = self.train(episode_buffer,sess,gamma,0.0)
                                
                    
                # Periodically save gifs of episodes, model parameters, and summary statistics.
                if episode_count % 5 == 0 and episode_count != 0:
                    if self.name == 'worker_0' and episode_count % 25 == 0:
                        time_per_step = 0.05
                        images = np.array(episode_frames)
                        make_gif(images,'./frames/image'+str(episode_count)+'.gif',
                            duration=len(images)*time_per_step,true_image=True,salience=False)
                    if episode_count % 250 == 0 and self.name == 'worker_0':
                        saver.save(sess,self.model_path+'/model-'+str(episode_count)+'.cptk')
                        print ("Saved Model")

                    mean_reward = np.mean(self.episode_rewards[-5:])
                    mean_length = np.mean(self.episode_lengths[-5:])
                    mean_value = np.mean(self.episode_mean_values[-5:])
                    summary = tf.Summary()
                    summary.value.add(tag='Perf/Reward', simple_value=float(mean_reward))
                    summary.value.add(tag='Perf/Length', simple_value=float(mean_length))
                    summary.value.add(tag='Perf/Value', simple_value=float(mean_value))
                    summary.value.add(tag='Losses/Value Loss', simple_value=float(v_l))
                    summary.value.add(tag='Losses/Policy Loss', simple_value=float(p_l))
                    summary.value.add(tag='Losses/Entropy', simple_value=float(e_l))
                    summary.value.add(tag='Losses/Grad Norm', simple_value=float(g_n))
                    summary.value.add(tag='Losses/Var Norm', simple_value=float(v_n))
                    self.summary_writer.add_summary(summary, episode_count)

                    self.summary_writer.flush()
                if self.name == 'worker_0':
                    sess.run(self.increment)
                episode_count += 1
                
                # Start a new episode
                self.env.new_episode()
            
            # Stop trail
            self.env.end_trial()
            self.env.s.close()

In [21]:
max_episode_length = 200
gamma = .99 # discount rate for advantage estimation and reward discounting
s_size = 7056 # Observations are greyscale frames of 84 * 84 * 1
a_size = 3 # Agent can move Left, Right, or Fire
load_model = False
model_path = './model'

In [22]:
tf.reset_default_graph()

if not os.path.exists(model_path):
    os.makedirs(model_path)
    
#Create a directory to save episode playback gifs to
if not os.path.exists('./frames'):
    os.makedirs('./frames')

with tf.device("/cpu:0"): 
    global_episodes = tf.Variable(0,dtype=tf.int32,name='global_episodes',trainable=False)
    trainer = tf.train.AdamOptimizer(learning_rate=1e-4)
    master_network = AC_Network(s_size,a_size,'global',None) # Generate global network
    num_workers = multiprocessing.cpu_count() # Set workers ot number of available CPU threads
    workers = []
    # Create worker classes
    for i in range(5):
        worker = Worker(i,s_size,a_size,trainer,model_path,global_episodes)
        workers.append(worker)
        worker.start(setting=0)
    saver = tf.train.Saver(max_to_keep=5)

with tf.Session() as sess:
    coord = tf.train.Coordinator()
    if load_model == True:
        print ('Loading Model...')
        ckpt = tf.train.get_checkpoint_state(model_path)
        saver.restore(sess,ckpt.model_checkpoint_path)
    else:
        sess.run(tf.global_variables_initializer())
        
    # This is where the asynchronous magic happens.
    # Start the "work" process for each worker in a separate threat.
    worker_threads = []
    for worker in workers:
        worker_work = lambda: worker.work(max_episode_length,gamma,sess,coord,saver)
        t = threading.Thread(target=(worker_work))
        t.start()
        sleep(0.5)
        worker_threads.append(t)
    coord.join(worker_threads)

('127.0.0.1', 50821)
Port being used: 50821
local host--127.0.0.1:50821
('127.0.0.1', 50825)
Port being used: 50825
local host--127.0.0.1:50825
('127.0.0.1', 50831)
Port being used: 50831
local host--127.0.0.1:50831
('127.0.0.1', 50835)
Port being used: 50835
local host--127.0.0.1:50835
('127.0.0.1', 50840)
Port being used: 50840
local host--127.0.0.1:50840
Starting worker 0
Starting worker 1
Starting worker 2
Starting worker 3
Starting worker 4


100%|████████████████████████████████████████████████████████████████████████████████| 242/242 [00:07<00:00, 32.48it/s]
100%|███████████████████████████████████████████████████████████████████████████████▋| 241/242 [00:10<00:00, 20.57it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 242/242 [00:09<00:00, 26.46it/s]
100%|███████████████████████████████████████████████████████████████████████████████▋| 232/233 [00:09<00:00, 25.02it/s]
 99%|███████████████████████████████████████████████████████████████████████████████▌| 173/174 [00:09<00:00, 19.16it/s]
100%|███████████████████████████████████████████████████████████████████████████████▋| 230/231 [00:08<00:00, 25.88it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 229/229 [00:08<00:00, 26.47it/s]
100%|███████████████████████████████████████████████████████████████████████████████▋| 271/272 [00:09<00:00, 23.70it/s]


Doesn't read episode info.


100%|████████████████████████████████████████████████████████████████████████████████| 239/239 [00:09<00:00, 20.30it/s]
100%|███████████████████████████████████████████████████████████████████████████████▋| 235/236 [00:09<00:00, 26.36it/s]


Saved Model


 99%|███████████████████████████████████████████████████████████████████████████████▌| 173/174 [00:09<00:00, 18.89it/s]
100%|███████████████████████████████████████████████████████████████████████████████▋| 218/219 [00:08<00:00, 26.11it/s]
 99%|███████████████████████████████████████████████████████████████████████████████▌| 174/175 [00:09<00:00, 15.92it/s]
100%|███████████████████████████████████████████████████████████████████████████████▋| 218/219 [00:08<00:00, 25.92it/s]
 99%|███████████████████████████████████████████████████████████████████████████████▍| 155/156 [00:07<00:00, 22.14it/s]
100%|███████████████████████████████████████████████████████████████████████████████▋| 237/238 [00:09<00:00, 25.99it/s]
100%|███████████████████████████████████████████████████████████████████████████████▋| 240/241 [00:08<00:00, 26.96it/s]
100%|███████████████████████████████████████████████████████████████████████████████▋| 230/231 [00:09<00:00, 24.65it/s]
100%|███████████████████████████████████

Saved Model


 99%|███████████████████████████████████████████████████████████████████████████████▎| 118/119 [00:04<00:00, 27.29it/s]
 99%|███████████████████████████████████████████████████████████████████████████████▌| 164/165 [00:09<00:00, 16.66it/s]
Exception in thread Thread-22:
Traceback (most recent call last):
  File "C:\Users\YuHang\Anaconda2\envs\Maze\lib\threading.py", line 914, in _bootstrap_inner
    self.run()
  File "C:\Users\YuHang\Anaconda2\envs\Maze\lib\threading.py", line 862, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-22-7dd59ff4ec91>", line 36, in <lambda>
    worker_work = lambda: worker.work(max_episode_length,gamma,sess,coord,saver)
  File "<ipython-input-20-4569aa1646b2>", line 98, in work
    self.local_AC.state_in[1]:rnn_state[1]})
  File "C:\Users\YuHang\Anaconda2\envs\Maze\lib\site-packages\tensorflow\python\client\session.py", line 778, in run
    run_metadata_ptr)
  File "C:\Users\YuHang\Anaconda2\envs\Maze\lib\site-packages\tensorflow\

KeyboardInterrupt: 

Exception in thread Thread-23:
Traceback (most recent call last):
  File "C:\Users\YuHang\Anaconda2\envs\Maze\lib\threading.py", line 914, in _bootstrap_inner
    self.run()
  File "C:\Users\YuHang\Anaconda2\envs\Maze\lib\threading.py", line 862, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-22-7dd59ff4ec91>", line 36, in <lambda>
    worker_work = lambda: worker.work(max_episode_length,gamma,sess,coord,saver)
  File "<ipython-input-20-4569aa1646b2>", line 98, in work
    self.local_AC.state_in[1]:rnn_state[1]})
  File "C:\Users\YuHang\Anaconda2\envs\Maze\lib\site-packages\tensorflow\python\client\session.py", line 778, in run
    run_metadata_ptr)
  File "C:\Users\YuHang\Anaconda2\envs\Maze\lib\site-packages\tensorflow\python\client\session.py", line 914, in _run
    raise RuntimeError('Attempted to use a closed Session.')
RuntimeError: Attempted to use a closed Session.



In [None]:
"""
num_workers = multiprocessing.cpu_count() # Set workers ot number of available CPU threads
workers = []
# Create worker classes
for i in range(2):
    worker = Worker(i)
    workers.append(worker)
    worker.start()
    
worker_threads = []
for worker in workers:
    worker_work = lambda: worker.work()\
    
    
    t = threading.Thread(target=(worker_work))
    t.start()
    sleep(1)
    worker_threads.append(t)
"""