In [None]:
import numpy as np
from vizdoom import *
import skimage.color, skimage.transform
from random import sample, randint, random
import time,random,threading,datetime
from tqdm import tqdm
import transition
import tensorflow as tf
import replay_memory
import transition
import h5py
import math
import sys, os, glob
from PIL import Image
import matplotlib.pyplot as plt

In [None]:
# Scratch check: joining two 1-D arrays along axis 0 gives one length-6 array.
# Nothing later in this section reads this cell's result.
np.concatenate([np.array([1,2,3]), np.array([4,5,6])], axis=0) 

In [None]:
# --- File locations -----------------------------------------------------
DEMO_PATH = "./demonstration/demodata02.hdf5"    # HDF5 file opened by Environment.load_demonstration
CONFIG_FILE_PATH = "./config/custom_config.cfg"  # ViZDoom config loaded by every Environment
LOG_DIR = "./logs/logs_test"
MODEL_PATH = "./models/model_test"
GIF_PATH = "./gifs/gif_test.gif"                 # written by Environment.run_test(save_gif=True)

# --- Wall-clock budgets -------------------------------------------------
TIME_LEARN = datetime.timedelta(hours  = 3)
TIME_PRELEARN = datetime.timedelta(minutes  = 2)  # run_pre_learning stops once this much time has passed

# Weights and summaries are only written when this is True.
SAVE_FILE = False

USED_GPU = "0"

# Disabled safety check: refuse to overwrite existing logs/models.
# if SAVE_FILE:
#     if len(glob.glob(LOG_DIR+"/*"))!=0 or len(glob.glob(MODEL_PATH))!=0:
#         print("ERROR: Log or Model file exists already!")
#         sys.exit()
#
#     if not os.path.exists(MODEL_PATH):
#         os.mkdir(MODEL_PATH)

# --- Observation / sequence shape ---------------------------------------
RESOLUTION = (120,180,3)  # (height, width, channels) that Environment.preprocess resizes frames to

N_ADV = 5  # frames per input sequence and number of steps in each n-step return

FREQ_COPY = 10  # run_pre_learning calls copy_learn2target every FREQ_COPY steps
FREQ_TEST = 50  # not referenced in this section — presumably test interval; confirm at call site

N_WORKERS = 1
N_PRESTEPS = 20000
N_STEPS = 200
TOTAL_STEPS = N_STEPS
# total_seconds() counts whole days as well; timedelta.seconds silently drops
# them, which would shorten the epsilon schedule for budgets of 24h or more.
TOTAL_TIME = int(TIME_LEARN.total_seconds())

GAMMA = 0.99  # discount used in the n-step return recurrences

DISCOUNT = 0.9
LEARNING_RATE = 0.3
RMSProbDecaly = 0.9  # only used by a commented-out RMSProp optimizer in this section

BATCH_SIZE = 10
LAMBDA1 = 1.0
LAMBDA2 = 1.0
LAMBDA3 = 10e-5  # note: 10e-5 == 1e-4
L_MIN = 0.8
LSTM_SIZE = 128

N_ACTION = 6  # length of the one-hot action vector

BOTS_NUM = 20  # bots added at the start of every episode

FRAME_REPEAT = 4  # tics each training action is held for

N_FOLDER = 3

# Per-event reward weights consumed by RewardGenerater.calc_reward.
REWARDS = {'living':-0.01, 'health_loss':-1, 'medkit':50, 'ammo':0.0, 'frag':500, 'dist':3e-2, 'suicide':-500} 

CAPACITY = 10000  # capacity of the demonstration replay memory

# --- Exploration schedule (see Agent.calc_eps_step / calc_eps_time) ------
EPS_START = 0.5
EPS_END = 0.0
LINEAR_EPS_START = 0.1  # fraction of the budget before epsilon starts changing
LINEAR_EPS_END = 0.9    # fraction of the budget after which epsilon is EPS_END

In [None]:
# -- Class for a worker thread -------
class WorkerThread:
    """Wraps one Environment so it can be driven from a thread.

    Every worker owns its own Environment, which runs the game and learning.
    """

    def __init__(self, thread_name, parameter_server, isLearning=True):
        """Create the worker's Environment and remember whether it should learn."""
        env = Environment(thread_name, parameter_server)
        self.environment = env
        print(thread_name," Initialized")
        self.isLearning = isLearning

    def run(self):
        """Call the environment's run() repeatedly until it reports finished.

        Non-learning workers return immediately; the test path is not
        implemented here.
        """
        if not self.isLearning:
            # Run Test Environment
            return

        while not self.environment.finished:
            self.environment.run()

In [None]:
class Environment(object):
    def __init__(self,name, parameter_server, summary=False):
        """Start a hidden ViZDoom game and build this worker's network, agent and buffers.

        name is passed to NetworkLocal and printed when run() finishes;
        summary gates summary writing in run().
        """
        # Plain bookkeeping first.
        self.name = name
        self.summary = summary
        self.parameter_server = parameter_server
        self.local_step = 0
        self.finished = False
        self.pre_death = 0

        # Game setup: player mode, no window, CRCGCB frames at 640x480.
        self.game = game = DoomGame()
        game.load_config(CONFIG_FILE_PATH)
        game.set_window_visible(False)
        game.set_mode(Mode.PLAYER)
        game.set_screen_format(ScreenFormat.CRCGCB)
        game.set_screen_resolution(ScreenResolution.RES_640X480)
        game.init()

        # Seed the reward generator with the values the game reports right now.
        watched = (GameVariable.HEALTH,
                   GameVariable.SELECTED_WEAPON_AMMO,
                   GameVariable.FRAGCOUNT,
                   GameVariable.POSITION_X,
                   GameVariable.POSITION_Y)
        health, ammo, frag, pos_x, pos_y = [game.get_game_variable(var) for var in watched]
        self.reward_gen = RewardGenerater(health,ammo,frag,pos_x,pos_y)

        self.network = NetworkLocal(name, parameter_server)
        self.agent = Agent(self.network)

        self.demo_buff = replay_memory.ReplayMemory(CAPACITY)
    
    def start_episode(self):
        self.game.new_episode()
        self.agent.image_buff = []
        for i in range(BOTS_NUM):
            self.game.send_game_command("addbot")
        
    def preprocess(self,img):
        """Resize a screen buffer to RESOLUTION and return it as float32.

        A 3-D input is first transposed with axes (1, 2, 0); any other rank
        is passed to the resize unchanged.
        """
        frame = img.transpose(1,2,0) if img.ndim == 3 else img
        resized = skimage.transform.resize(frame, RESOLUTION,mode='constant')
        return resized.astype(np.float32)
    
    def get_reward(self):
        health = self.game.get_game_variable(GameVariable.HEALTH)
        ammo = self.game.get_game_variable(GameVariable.SELECTED_WEAPON_AMMO)
        frag = self.game.get_game_variable(GameVariable.FRAGCOUNT)
        pos_x = self.game.get_game_variable(GameVariable.POSITION_X)
        pos_y = self.game.get_game_variable(GameVariable.POSITION_Y)
        
        r,r_detail = self.reward_gen.get_reward(health,ammo,frag,pos_x,pos_y)
    
        return r,r_detail
    
    # Method for pre-learning from the demonstration buffer
    def run_pre_learning(self):        
        """Train on batches sampled from self.demo_buff until TIME_PRELEARN has elapsed.

        Each iteration pulls the latest parameters, samples BATCH_SIZE
        sequences of N_ADV transitions, computes one discounted return per
        sequence, and pushes the batch through update_parameter_server_batch
        with every sample flagged as a demonstration. The global `frames`
        counter is incremented once per iteration, and the loop ends when the
        wall clock passes `start_time_pre + TIME_PRELEARN`.
        """
        global frames, start_time_pre
        
        step = 0
        while True:
            
            self.network.pull_parameter_server()
            
            # NOTE(review): tree_idx and is_weight are not used below — presumably
            # prioritized-replay outputs; confirm against replay_memory.
            tree_idx, batch, is_weight = self.demo_buff.sample(BATCH_SIZE)

            s1 = np.zeros((BATCH_SIZE , N_ADV,)+RESOLUTION,dtype=np.float32)
            s2 = np.zeros((BATCH_SIZE , N_ADV,)+RESOLUTION,dtype=np.float32)
            actions = np.zeros((BATCH_SIZE ,),dtype=np.int8)
            rewards = np.zeros((BATCH_SIZE, ),dtype=np.float32)
            rewards_adv = np.zeros((BATCH_SIZE,),dtype=np.float32)
            isterminals = np.zeros((BATCH_SIZE, ),dtype=np.int8)
            isdemos = np.zeros((BATCH_SIZE,),dtype=np.int8)
            
            # Copy the stored frames into the batch arrays; slots whose state is
            # None (padding / terminal next-state) stay all-zero.
            for i in range(BATCH_SIZE):
                for j in range(N_ADV):
                    if not type(batch[i][j].s1) == type(None):
                        s1[i][j] = batch[i][j].s1
                    if not type(batch[i][j].s2) == type(None):
                        s2[i][j] = batch[i][j].s2
            
            # Walk each sequence backwards to build its discounted return R.
            for i in range(BATCH_SIZE):
                R = 0
                for j in range(N_ADV-1, -1, -1):
                    if not batch[i][j].isterminal :
                        if j == N_ADV-1:
                            # Last step of a non-terminal sequence: take its action/reward
                            # as the sample's label and bootstrap R from the max Q-value
                            # of the s1 sequence.
                            isterminals[i] = batch[i][j].isterminal
                            actions[i] = batch[i][j].action
                            rewards[i] = batch[i][j].reward
                            R = np.max(self.network.get_q_value( np.array([s1[i]]) )[0])
                        else:
                            R = batch[i][j].reward + GAMMA * R
                    else:
                        # Terminal entries: a real terminal step (s1 present) supplies the
                        # label; padding entries (s1 is None) are skipped. R is left as is.
                        if not(type(batch[i][j].s1) == type(None)):
                            isterminals[i] = batch[i][j].isterminal
                            actions[i] = batch[i][j].action
                            rewards[i] = batch[i][j].reward
                        else:
                            pass
                rewards_adv[i] = R
                isdemos[i] = True
            
            l_one, l_adv, loss_class = self.network.update_parameter_server_batch(s1,actions,rewards,rewards_adv,s2,isdemos)
            # Every 100 steps, persist weights and a summary (only when SAVE_FILE is on).
            if step%100 == 0 and SAVE_FILE == True:
                self.parameter_server.write_weights(frames)
                self.parameter_server.write_summary(frames,s1[0:1],l_one, l_adv, loss_class,[0.0])
            if step%FREQ_COPY==0:
                self.network.copy_learn2target()
            frames += 1
            step += 1
            
            if datetime.datetime.now() > TIME_PRELEARN + start_time_pre:
                # NOTE(review): `runout` is not in the `global` statement above, so this
                # binds a local that is never read — confirm whether the global flag
                # was meant to be set here.
                runout = True
                break
    
    # Method for multi task learning
    def run(self):
        """Play and learn in a loop until the global `runout` flag becomes True.

        Each iteration pulls the shared parameters, then either performs one
        act/observe/learn step or, when the episode has ended, starts a new
        one. The globals `frames` and `current_time` are updated every
        iteration; on exit the worker prints its name and sets `finished`.
        """
        global frames,runout,current_time

        game = self.game
        self.start_episode()

        train_episode = 0
        step = 0
        while True:
            # Copy params from global
            self.agent.q_network.pull_parameter_server()

            if game.is_episode_finished():
                # Episode over: restart and reset the reward bookkeeping.
                train_episode += 1
                self.start_episode()
                self.reward_gen.new_episode(health = game.get_game_variable(GameVariable.HEALTH),
                                            ammo = game.get_game_variable(GameVariable.SELECTED_WEAPON_AMMO),
                                            posx = game.get_game_variable(GameVariable.POSITION_X),
                                            posy = game.get_game_variable(GameVariable.POSITION_Y))
            else:
                # Re-anchor the distance reward every N_ADV steps (but not at step 0).
                if step != 0 and step % N_ADV == 0:
                    self.reward_gen.update_origin(game.get_game_variable(GameVariable.POSITION_X),
                                                  game.get_game_variable(GameVariable.POSITION_Y))

                s1 = self.preprocess(game.get_state().screen_buffer)
                action = self.agent.act_eps_greedy(s1)
                game.make_action(action,FRAME_REPEAT)
                reward, _detail = self.get_reward()
                isterminal = game.is_episode_finished()
                if isterminal:
                    # No next frame exists after the final step.
                    s2 = np.zeros(RESOLUTION)
                else:
                    s2 = self.preprocess(game.get_state().screen_buffer)

                self.agent.push_advantage(s1,action.index(1),reward,s2,isterminal,False)
                self.agent.learn_advantage(isterminal)

                if self.summary==True and SAVE_FILE == True and step % 100 == 0:
                    self.parameter_server.write_summary(frames,
                                                        self.agent.s1_record,
                                                        self.agent.loss_one_record,
                                                        self.agent.loss_adv_record,
                                                        self.agent.loss_class_record,
                                                        [0.0])

                if game.is_player_dead():
                    game.respawn_player()
                    self.reward_gen.respawn_pos(game.get_game_variable(GameVariable.HEALTH),
                                                game.get_game_variable(GameVariable.SELECTED_WEAPON_AMMO),
                                                game.get_game_variable(GameVariable.POSITION_X),
                                                game.get_game_variable(GameVariable.POSITION_Y))

            frames += 1
            step += 1
            current_time = datetime.datetime.now().timestamp() - start_time_async.timestamp()

            if runout == True:
                break

        print(self.name," finished")
        self.finished = True
        
    def run_test(self, save_gif=False):
        """Play one episode greedily and report frags, deaths and accumulated reward.

        Args:
            save_gif: when true, every 5th preprocessed frame is collected and
                written to GIF_PATH as an animated GIF after the episode.

        Returns:
            (total_reward, frag_count, deaths_since_last_test)
        """
        global frames

        # Fresh reward bookkeeping for this test episode; frag count starts at 0.
        health = self.game.get_game_variable(GameVariable.HEALTH)
        ammo = self.game.get_game_variable(GameVariable.SELECTED_WEAPON_AMMO)
        pos_x = self.game.get_game_variable(GameVariable.POSITION_X)
        pos_y = self.game.get_game_variable(GameVariable.POSITION_Y)
        self.reward_gen = RewardGenerater(health,ammo,0,pos_x,pos_y)

        self.start_episode()

        # Copy params from global
        self.agent.q_network.pull_parameter_server()

        step = 0
        gif_img = []
        while not self.game.is_episode_finished():

            # Re-anchor the distance reward every N_ADV steps (but not at step 0).
            if step%N_ADV==0 and not step==0:
                self.reward_gen.update_origin(self.game.get_game_variable(GameVariable.POSITION_X),
                                              self.game.get_game_variable(GameVariable.POSITION_Y))

            s1 = self.preprocess(self.game.get_state().screen_buffer)
            if save_gif and step%5==0:
                gif_img.append(s1)
            action = self.agent.act_greedy(s1)
            self.game.make_action(action,1)
            # Called for its side effect: it accumulates reward_gen.total_reward.
            self.get_reward()

            if self.game.is_player_dead():
                self.game.respawn_player()
                self.reward_gen.respawn_pos(self.game.get_game_variable(GameVariable.HEALTH),
                                            self.game.get_game_variable(GameVariable.SELECTED_WEAPON_AMMO),
                                            self.game.get_game_variable(GameVariable.POSITION_X),
                                            self.game.get_game_variable(GameVariable.POSITION_Y))

            step += 1

        if save_gif:
            if gif_img:
                save_img = [Image.fromarray(np.uint8(frame*255)) for frame in gif_img]
                save_img[0].save(GIF_PATH,save_all=True,append_images=save_img[1:])
            else:
                # The episode ended before any frame was captured; indexing the
                # empty list would raise IndexError and lose the test results.
                print("WARNING: no frames captured, GIF not written")

        print("----------TEST at %d step-------------"%(frames))
        ret_frag = self.game.get_game_variable(GameVariable.FRAGCOUNT)
        ret_death = self.game.get_game_variable(GameVariable.DEATHCOUNT)-self.pre_death
        ret_reward = self.reward_gen.total_reward
        print("FRAG:",ret_frag,"DEATH:",ret_death)
        print("REWARD",ret_reward)
        print("DETAIL:",self.reward_gen.total_reward_detail)
        # Remember the running death count so the next test reports a delta.
        self.pre_death = self.game.get_game_variable(GameVariable.DEATHCOUNT)
        return ret_reward,ret_frag,ret_death
                 
    def load_demonstration(self):
        """Read demonstration episodes from DEMO_PATH and cut them into N_ADV-step sequences.

        Only a slice of the episode groups in the file is used (the first key
        is skipped, then entries 1-2 of the remainder). For every step but the
        last, a reward is computed from the change in frag/death/health/ammo
        and the distance from an origin that is reset every N_ADV steps. The
        final step becomes a terminal transition, and the last sequence is
        padded with empty terminal transitions up to N_ADV entries.

        Returns:
            list of object arrays, each holding N_ADV Transition objects.
            Nothing is stored into self.demo_buff here.
        """
        test_data = []

        # When two buttons were pressed at once, alternate which one is recorded.
        select_forward = False

        # The context manager closes the HDF5 handle even if parsing fails.
        with h5py.File(DEMO_PATH,"r") as file:
            episode_list = list(file.keys())[1:]
            episode_list = episode_list[1:3]

            for e in episode_list:
                states = file[e+"/states"][:]
                actions = file[e+"/action"][:]
                frags = file[e+"/frag"][:]
                deaths = file[e+"/death"][:]
                health = file[e+"/health"][:]
                ammo = file[e+"/ammo"][:]
                posx = file[e+"/posx"][:]
                posy = file[e+"/posy"][:]

                n_steps = states.shape[0]
                if n_steps == 0:
                    # An empty episode has no first position to anchor on.
                    continue

                log_buffer = []
                originx = posx[0]
                originy = posy[0]
                for i in range(n_steps):

                    if i % N_ADV == 0:
                        originx = posx[i]
                        originy = posy[i]

                    if i == n_steps - 1:
                        # Final step: terminal transition, then pad the sequence.
                        log_buffer.append(transition.Transition(states[i],5, None, 0 ,True, True))
                        log_buffer = log_buffer + [transition.Transition(None,None,None,None,True,True) for _ in range(N_ADV - len(log_buffer))]
                    else:
                        m_frag = frags[i+1] - frags[i]
                        m_death = deaths[i+1] - deaths[i]
                        m_health = health[i+1] - health[i]
                        m_ammo = ammo[i+1] - ammo[i]
                        m_posx = abs(posx[i] - originx)
                        m_posy = abs(posy[i] - originy)
                        r_d = self.reward_gen.calc_reward(m_frag,m_death,m_health,m_ammo,m_posx,m_posy)
                        r = sum(r_d.values())

                        pressed = np.where(actions[i] == 1)[0]
                        if sum(actions[i]) == 1 and pressed.size == 1:
                            # Store a plain int, like every other branch, not a length-1 array.
                            a_idx = int(pressed[0])
                        elif actions[i][5] == 1:
                            a_idx = 5
                        # array_equal is safe when the number of pressed buttons differs
                        # from 2; an elementwise == on mismatched lengths is not.
                        elif np.array_equal(pressed, [2,4]):
                            a_idx = 4 if select_forward else 2
                            select_forward = not select_forward
                        elif np.array_equal(pressed, [3,4]):
                            a_idx = 4 if select_forward else 3
                            select_forward = not select_forward
                        else:
                            # Unsupported button combination.
                            a_idx = -1

                        log_buffer.append(transition.Transition(states[i],a_idx,states[i+1],r,False,True))

                    if len(log_buffer) == N_ADV:
                        test_data.append(np.copy(log_buffer))
                        log_buffer = []

        return test_data
            
        
#         hdf5file = h5py.File(DEMO_PATH,"r")
#         folder = "demodata_"+str(0)
#         state1 = hdf5file[folder+"/state1"].value
#         state2 = hdf5file[folder+"/state2"].value
#         actions = hdf5file[folder+"/actions"].value
#         isterminals = hdf5file[folder+"/isterminals"].value
#         health = hdf5file[folder+"/healths"].value
#         ammo = hdf5file[folder+"/ammos"].value
#         posx = hdf5file[folder+"/posxs"].value
#         posy = hdf5file[folder+"/posys"].value
#         death = hdf5file[folder+"/deaths"].value
#         frag = hdf5file[folder+"/frags"].value

#         for i in range(1,N_FOLDER):
#             folder = "demodata_" +str(i)
#             state1 = np.concatenate((state1,hdf5file[folder+"/state1"].value),axis=0)
#             state2 = np.concatenate((state2,hdf5file[folder+"/state2"].value),axis=0)
#             actions = np.concatenate((actions,hdf5file[folder+"/actions"].value),axis=0)
#             isterminals = np.concatenate((isterminals,hdf5file[folder+"/isterminals"].value),axis=0)
#             health = np.concatenate((health,hdf5file[folder+"/healths"].value),axis=0)
#             ammo = np.concatenate((ammo,hdf5file[folder+"/ammos"].value),axis=0)
#             posx = np.concatenate((posx,hdf5file[folder+"/posxs"].value),axis=0)
#             posy = np.concatenate((posy,hdf5file[folder+"/posys"].value),axis=0)
#             death = np.concatenate((death,hdf5file[folder+"/deaths"].value),axis=0)
#             frag = np.concatenate((frag,hdf5file[folder+"/frags"].value),axis=0)

#         n_transit, n_step, _ = actions.shape

#         print("SIZE of DEMO:",actions.shape)

#         transit = np.empty((n_step,),dtype=object)

#         is_dead = False
#         is_finished = False

#         pre_health = 100
#         pre_ammo = 15
#         pre_frag = 0
#         pre_death = 0
#         pre_posx = 0.0
#         pre_posy = 0.0


#         for i in range(n_transit):

#             if i % 2 == 0:
#                 pre_posx = posx[i][0]
#                 pre_posy = posy[i][0]

#             for j in range(n_step):
#                 if not is_finished:
#                     if is_dead :
#                         pre_posx = posx[i][j]
#                         pre_posy = posy[i][j]
#                         is_dead = False

#                     m_frag = frag[i][j] - pre_frag
#                     m_death = death[i][j] - pre_death
#                     m_health = health[i][j] - pre_health
#                     m_ammo = ammo[i][j] - pre_ammo
#                     m_posx = posx[i][j] - pre_posx
#                     m_posy = posy[i][j] - pre_posy

#                     if m_death >= 1:
#                         is_dead = True 

#                     if isterminals[i][j] == True:
#                         is_finished = True

#                     r_d = self.reward_gen.calc_reward(m_frag,m_death,m_health,m_ammo,m_posx,m_posy)
#                     r = sum(r_d.values())
#                     transit[j] = transition.Transition(state1[i][j],actions[i][j],state2[i][j],r,isterminals[i][j],True)

#                     pre_frag = frag[i][j]
#                     pre_death = death[i][j]
#                     pre_health = health[i][j]
#                     pre_ammo = ammo[i][j]
#                 else:
#                     transit[j] = transition.Transition(None,None,None,None,True,True)

#             is_finished = False

#             self.demo_buff.store(np.copy(transit))

In [None]:
class RewardGenerater(object):
    """Turns changes in game variables into a scalar reward plus a per-term breakdown.

    Keeps the previous health/ammo/frag values and a position origin, and
    accumulates running totals for the current episode.
    """

    def __init__(self,health,ammo,frag,pos_x,pos_y):
        """Store the initial game-variable values and zero the running totals."""
        # Reward
        self.rewards = REWARDS
        self.dist_unit = 6.0

        self.origin_x, self.origin_y = pos_x, pos_y
        self.pre_health, self.pre_ammo, self.pre_frag = health, ammo, frag

        self.total_reward = 0.0
        self.total_reward_detail = dict.fromkeys(
            ['living', 'health_loss', 'medkit', 'ammo', 'frag', 'dist', 'suicide'], 0.0)

    def get_reward(self,health,ammo,frag,pos_x,pos_y):
        """Score the step from the previous values to the given ones.

        Returns (reward, reward_detail) and updates the running totals and
        the stored previous health/ammo/frag.
        """
        # Out-of-range health readings are replaced by 100.0.
        if abs(health) > 10000:
            health = 100.0

        # An origin of exactly (0, 0) is treated as "not set yet".
        if self.origin_x == 0 and self.origin_y == 0:
            self.origin_x, self.origin_y = pos_x, pos_y

        detail = self.calc_reward(frag-self.pre_frag,
                                  0.0,
                                  health-self.pre_health,
                                  ammo-self.pre_ammo,
                                  pos_x-self.origin_x,
                                  pos_y-self.origin_y)
        self.reward_detail = detail
        self.reward = sum(detail.values())

        for term, value in detail.items():
            self.total_reward_detail[term] += value
        self.total_reward = sum(self.total_reward_detail.values())

        self.pre_frag, self.pre_health, self.pre_ammo = frag, health, ammo

        return (self.reward, self.reward_detail)

    def calc_reward(self,m_frag,m_death,m_health,m_ammo,m_posx,m_posy):
        """Map per-step deltas to a dict of reward terms (m_death is not used)."""
        weights = self.rewards
        ret_detail = {'living': weights['living']}

        # A negative frag delta is counted as a suicide.
        if m_frag >= 0:
            ret_detail['frag'] = (m_frag)*weights['frag']
            ret_detail['suicide'] = 0.0
        else:
            ret_detail['suicide'] = (m_frag*-1)*(weights['suicide'])
            ret_detail['frag'] = 0.0

        # Distance is rewarded in whole multiples of dist_unit.
        travelled = math.sqrt((m_posx)**2 + (m_posy)**2)
        whole_units = int(travelled/self.dist_unit)
        ret_detail['dist'] = whole_units * (weights['dist'] * self.dist_unit)

        # Any health gain earns the flat medkit reward; otherwise the loss is scaled.
        if m_health > 0:
            ret_detail['medkit'] = weights['medkit']
            ret_detail['health_loss'] = 0.0
        else:
            ret_detail['medkit'] = 0.0
            ret_detail['health_loss'] = (m_health)*weights['health_loss'] * (-1)

        if m_ammo>0:
            ret_detail['ammo'] = (m_ammo)*weights['ammo']
        else:
            ret_detail['ammo'] = 0.0

        return ret_detail

    def respawn_pos(self,health,ammo,posx, posy):
        """Reset the origin and the previous health/ammo after a respawn."""
        self.origin_x, self.origin_y = posx, posy
        self.pre_health, self.pre_ammo = health, ammo

    def new_episode(self,health,ammo,posx,posy):
        """Reset everything respawn_pos resets, plus the frag count and running totals."""
        self.respawn_pos(health,ammo,posx,posy)
        self.pre_frag = 0

        self.total_reward = 0
        self.total_reward_detail = dict.fromkeys(
            ['living', 'health_loss', 'medkit', 'ammo', 'frag', 'dist', 'suicide'], 0.0)

    def update_origin(self,pos_x, pos_y):
        """Move the origin used for the distance reward."""
        self.origin_x, self.origin_y = pos_x, pos_y

In [None]:
class Agent(object):
    def __init__(self,q_network):
        
        self.q_network = q_network
        
#         self.image_buff = np.zeros(shape=(N_ADV,)+RESOLUTION)
        self.image_buff = []
        self.memory = []
        self.batch = {'s1':[], 'action':[], 's2':[] ,'reward':[], 'reward_adv':[], 'isdemo':[]}
        self.R = 0
        
        self.s1_record = np.zeros((1,N_ADV,)+RESOLUTION)
        self.loss_one_record = 0
        self.loss_adv_record = 0
        self.loss_class_record = 0
        
    def calc_eps_step(self):
        """Return the exploration rate for the current global `frames` count.

        EPS_START before TOTAL_STEPS*LINEAR_EPS_START, EPS_END from
        TOTAL_STEPS*LINEAR_EPS_END onwards, and in between
        EPS_START + frames*(EPS_END-EPS_START)/TOTAL_STEPS.
        """
        # NOTE(review): the middle segment scales by frames/TOTAL_STEPS over the
        # whole run, so epsilon jumps at both window edges unless the window is
        # [0, 1] — confirm this is intended.
        global frames

        hold_until = TOTAL_STEPS*LINEAR_EPS_START
        decay_until = TOTAL_STEPS*LINEAR_EPS_END

        if frames < hold_until:
            return EPS_START
        if hold_until <= frames < decay_until:
            return EPS_START + frames*(EPS_END-EPS_START)/(TOTAL_STEPS)
        return EPS_END
        
    def calc_eps_time(self):
        """Return the exploration rate for the elapsed time in the global `current_time`.

        EPS_START before TOTAL_TIME*LINEAR_EPS_START, EPS_END from
        TOTAL_TIME*LINEAR_EPS_END onwards, and in between
        EPS_START + current_time*(EPS_END-EPS_START)/TOTAL_TIME.
        """
        # NOTE(review): as in calc_eps_step, the middle segment is not continuous
        # with the two plateaus unless the window is [0, 1] — confirm intent.
        hold_until = TOTAL_TIME * LINEAR_EPS_START
        decay_until = TOTAL_TIME*LINEAR_EPS_END

        if current_time < hold_until:
            return EPS_START
        if hold_until <= current_time < decay_until:
            return EPS_START + current_time*(EPS_END-EPS_START)/(TOTAL_TIME)
        return EPS_END

    def act_eps_greedy(self,s1):

        self.image_buff.append(s1)
        ret_action = np.zeros((N_ACTION,))
        if len(self.image_buff) == N_ADV + 1:
            eps = self.calc_eps_time()

            self.image_buff.pop(0)
            
            if random.random() > eps:
                a_idx = self.q_network.predict_best_action(self.image_buff)
            else:
                a_idx = randint(0,N_ACTION-1)
        else:
            a_idx = randint(0,N_ACTION-1)
                
        ret_action[a_idx] = 1
        return ret_action.tolist()
    
    def act_greedy(self,s1):

        self.image_buff.append(s1)
        ret_action = np.zeros((N_ACTION,))
        if len(self.image_buff) == N_ADV + 1:
            eps = self.calc_eps_time()

            self.image_buff.pop(0)
            
            a_idx = self.q_network.predict_best_action(self.image_buff)
        else:
            a_idx = randint(0,N_ACTION-1)

        ret_action[a_idx] = 1
        return ret_action.tolist()
    
    def push_advantage(self,s1_,a_,r_,s2_,isterminal,isdemo):
        self.memory.append((s1_,a_,r_,s2_,isdemo))
    
    def clear_memory(self):
        self.memory = []
    
    def push_to_batch(self, s1, action, s2, reward, reward_adv, isdemo):
        self.batch['s1'].append(s1)
        self.batch['action'].append(action)
        self.batch['s2'].append(s2)
        self.batch['reward'].append(reward)
        self.batch['reward_adv'].append(reward_adv)
        self.batch['isdemo'].append(isdemo)
        return 0
    
    def clear_batch(self):
        self.batch = {'s1':[], 'action':[], 's2':[] ,'reward':[], 'reward_adv':[], 'isdemo':[]}
        return 0
    
    def make_batch_learn(self):
        n = len(self.batch['action'])
        s1 = np.zeros((n, N_ADV,)+RESOLUTION)
        s2 = np.zeros((n, N_ADV,)+RESOLUTION)
        for i in range(n):
            s1[i, :i+1] = self.batch['s1'][:i+1]
            s2[i, :i+1] = self.batch['s2'][:i+1]
        
        self.s1_record = s1[0:1]
        self.loss_one_record, self.loss_adv_record, self.loss_class_record = \
        self.q_network.update_parameter_server_batch(s1, self.batch['action'], self.batch['reward'], \
                                                         self.batch['reward_adv'], s2, self.batch['isdemo']) 
        return 0
    
    def learn_advantage(self, isterminal):
        """Run one update once N_ADV transitions are queued or the episode has ended.

        Walks the queued transitions from newest to oldest, building the
        discounted return self.R, and pushes every transition with its return
        into the batch. Then runs make_batch_learn, copies the learner network
        to the target network, and clears memory, batch and self.R. Does
        nothing when neither trigger condition holds.
        """
        
        if len(self.memory)==N_ADV or isterminal:
            tail_idx = len(self.memory)-1
            
            # Stack the queued s1 frames; unused slots stay zero when the episode
            # ended before N_ADV transitions were collected.
            s1_buff = np.zeros((N_ADV, )+RESOLUTION)
            for i in range(tail_idx+1):
                s1_buff[i] = self.memory[i][0]
            
            for i in range(tail_idx,-1,-1):
                s1,a,r,s2,d = self.memory[i]
                if i==tail_idx:
                    if not isterminal:
                        # Bootstrap from the max Q-value of the frame sequence.
                        # NOTE(review): s1_buff has no leading batch axis here, whereas
                        # calc_loss and run_pre_learning pass a (1, N_ADV, ...) array to
                        # get_q_value — confirm what get_q_value expects.
                        self.R = np.max(self.q_network.get_q_value(s1_buff)[0])
                        
                    else:
                        self.R = 0
                else:
                    self.R =  r + GAMMA*self.R
            
                # Transitions are pushed newest-first, so batch index 0 is the tail.
                self.push_to_batch(s1,a,s2,r,self.R,d)
            
            self.make_batch_learn()
            self.q_network.copy_learn2target()
            self.R = 0
            self.clear_memory()
            self.clear_batch()
            
    
    def calc_loss(self):
        
        if len(self.memory) == N_ADV :
            tail_idx = len(self.memory) - 1
            s1_buff = np.ones((1, tail_idx+1, )+RESOLUTION) * np.nan
            s2_buff = np.ones((1, tail_idx+1, )+RESOLUTION) * np.nan
            for i in range(tail_idx+1):
                s1_buff[0, i] = self.memory[i][0]
                s2_buff[0, i] = self.memory[i][3]
            
            for i in range(tail_idx, -1, -1):
                s1 , a, r, s2, d = self.memory[i]
                if i == tail_idx :
                    R = np.max(self.q_network.get_q_value(s1_buff)[0])
                else:
                    R = r * GAMMA * R
                
                _, last_action, last_r, _, last_d = self.memory[tail_idx]
                
                return [s1_buff] + self.q_network.calc_loss(s1_buff, [last_action], [last_r], [R] ,s2_buff ,[last_d])
        
        return -1

In [None]:
class NetworkSetting:
    """Namespace of layer builders shared by the networks in this notebook.

    None of the builders use instance state, so they are static methods;
    existing calls through the class (``NetworkSetting.conv1(x)``) work
    unchanged.
    """

    @staticmethod
    def encode(pre_layer):
        """Reshape the input so every row has shape RESOLUTION."""
        return tf.reshape(pre_layer, shape=(-1,)+RESOLUTION)

    @staticmethod
    def conv1(pre_layer):
        """First convolution: 32 outputs, kernel [1,6,6], stride [1,3,3], ReLU."""
        num_outputs = 32
        # The leading 1 in kernel and stride leaves the first non-batch axis untouched.
        kernel_size = [1,6,6]
        stride = [1,3,3]
        padding = 'SAME'
        activation = tf.nn.relu
        weights_init = tf.contrib.layers.xavier_initializer_conv2d()
        bias_init = tf.constant_initializer(0.1)

        return tf.contrib.layers.conv2d(pre_layer,kernel_size=kernel_size,
                                        num_outputs=num_outputs,
                                        stride=stride,padding=padding,activation_fn=activation,
                                        weights_initializer=weights_init,
                                        biases_initializer=bias_init)

    @staticmethod
    def maxpool1(pre_layer):
        """3x3 max pooling with stride 2 over the two spatial axes."""
        return tf.nn.max_pool3d(pre_layer,[1,1,3,3,1],[1,1,2,2,1],'SAME')

    @staticmethod
    def conv2(pre_layer):
        """Second convolution: 64 outputs, kernel [1,3,3], stride [1,2,2], ReLU."""
        num_outputs = 64
        kernel_size = [1,3,3]
        stride = [1,2,2]
        padding = 'SAME'
        activation = tf.nn.relu
        weights_init = tf.contrib.layers.xavier_initializer_conv2d()
        bias_init = tf.constant_initializer(0.1)
        return tf.contrib.layers.conv2d(pre_layer,kernel_size=kernel_size,num_outputs=num_outputs,
                                        stride=stride,padding=padding,activation_fn=activation,
                                        weights_initializer=weights_init,biases_initializer=bias_init)

    @staticmethod
    def maxpool2(pre_layer):
        """3x3 max pooling with stride 2 over the two spatial axes."""
        return tf.nn.max_pool3d(pre_layer,[1,1,3,3,1],[1,1,2,2,1],'SAME')

    @staticmethod
    def reshape(pre_layer):
        """Flatten each sample to a vector of N_ADV * 2560 features."""
        print(pre_layer)
        # 2560 is hard-coded. NOTE(review): presumably 5 * 8 * 64, the per-frame
        # feature map after conv1/pool1/conv2/pool2 on a 120x180 input — it must
        # be updated by hand if RESOLUTION or the conv stack changes.
        return tf.reshape(pre_layer, shape=(-1, N_ADV * 2560))

    @staticmethod
    def fc1(pre_layer):
        """Fully connected layer with 512 ReLU units."""
        print((pre_layer))
        num_outputs = 512
        activation_fn = tf.nn.relu
        weights_init = tf.contrib.layers.xavier_initializer()
        bias_init = tf.constant_initializer(0.1)
        return tf.contrib.layers.fully_connected(pre_layer,num_outputs=num_outputs,activation_fn=activation_fn,
                                                 weights_initializer=weights_init, biases_initializer=bias_init)

    @staticmethod
    def decode(pre_layer):
        """Reshape to (-1, N_ADV, 512)."""
        return tf.reshape(pre_layer, shape=(-1, N_ADV,512))

    @staticmethod
    def lstm(pre_layer, state):
        """Run an LSTM over each sequence and return the output at its last valid step.

        The valid length of each sequence is derived from `state`: a step
        counts when the max over its last three axes has positive sign.

        Returns:
            (output at the last valid step, sequence lengths, all LSTM outputs)
        """
        batch_size = tf.shape(pre_layer)[0]
        print(pre_layer)
        # Collapse axes 4, 3, 2 to get one value per (sample, step).
        temp = tf.reduce_max(state, axis=4)
        temp = tf.reduce_max(temp, axis=3)
        temp = tf.reduce_max(temp, axis=2)
        lengh = tf.cast(tf.reduce_sum(tf.sign(temp) , axis=1),dtype=tf.int32)
        cell = tf.contrib.rnn.BasicLSTMCell(LSTM_SIZE)
        rnn_state = cell.zero_state(batch_size, dtype=tf.float32)
        rnn_out, state_out = tf.nn.dynamic_rnn(cell, pre_layer, initial_state=rnn_state, sequence_length=lengh,dtype=tf.float32)
        # Index into the flattened (batch * N_ADV, LSTM_SIZE) outputs.
        out_idx = tf.range(0, batch_size) * N_ADV + (lengh  -1)
        output = tf.gather(tf.reshape(rnn_out, [-1, LSTM_SIZE]), out_idx)
        return output, lengh, rnn_out

    @staticmethod
    def q_value(pre_layer):
        """Linear output layer with one unit per action (N_ACTION)."""
        num_outputs = N_ACTION
        activation_fn = None
        weights_init = tf.contrib.layers.xavier_initializer()
        bias_init = tf.constant_initializer(0.1)
        return tf.contrib.layers.fully_connected(pre_layer,num_outputs=num_outputs,activation_fn=activation_fn,
                                                 weights_initializer=weights_init, biases_initializer=bias_init)


In [None]:
# Network whose parameters are shared globally
class ParameterServer:
    """Globally shared Q-network with its optimizer, summaries and checkpointing.

    The constructor builds the network under the ``parameter_server`` variable
    scope, collects its trainable variables in ``weights_params`` and creates the
    optimizer. The module-level ``SESS`` session must already exist, because the
    summary ``FileWriter`` is created from ``SESS.graph``.
    """

    # Loss summaries are written every SUMMARY_FREQ steps and, among those,
    # image summaries every IMAGE_FREQ steps (1 == every step).
    SUMMARY_FREQ = 1
    IMAGE_FREQ = 1

    def __init__(self):

        self.state1_ = tf.placeholder(tf.float32,shape=(None,N_ADV)+RESOLUTION, name="state1")
        self.a_ = tf.placeholder(tf.int32, shape=(None,), name="action")
        self.r_ = tf.placeholder(tf.float32, shape=(None,), name="reward")
        self.r_adv = tf.placeholder(tf.float32, shape=(None,), name="reward_adv")
        self.mergin_value = tf.placeholder(tf.float32,shape=(None,N_ACTION), name="mergin_value")
#         self.s1idx_ = tf.placeholder(tf.int32, shape=(None,), name="lengh_of_state")

        # The variable scope gives the weight variables an identifying name space.
        with tf.variable_scope("parameter_server",reuse=tf.AUTO_REUSE):
            with tf.device("/gpu:0"):
                self.model = self._build_model()            # defines the shape of the neural network

        self.weights_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="parameter_server")
#         self.optimizer = tf.train.RMSPropOptimizer(LEARNING_RATE, RMSProbDecaly)    # optimizer that minimises the loss function
        self.optimizer = tf.train.AdamOptimizer()
        with tf.variable_scope("summary"):
            self._build_summary()

        self.saver = tf.train.Saver()

        print("-------GLOBAL-------")
        for w in self.weights_params:
            print(w)

    @staticmethod
    def _scalar_feed(value):
        """Return ``value`` as a shape-(1,) array, the shape of the summary placeholders.

        Accepts a bare scalar as well as a one-element list/array, so callers do
        not need to know which form a given loss arrives in.
        """
        return np.reshape(value, (1,))

    def _build_model(self):
        """Build conv1 -> pool -> conv2 -> pool -> reshape -> fc1 -> q_value and return the Q tensor.

        ``self.conv1`` and ``self.conv2`` are kept because the image summaries use them.
        """

#         self.enco = NetworkSetting.encode(self.state1_)
        self.conv1 = NetworkSetting.conv1(self.state1_)
        maxpool1 = NetworkSetting.maxpool1(self.conv1)
        self.conv2 = NetworkSetting.conv2(maxpool1)
        maxpool2 = NetworkSetting.maxpool2(self.conv2)
        reshape = NetworkSetting.reshape(maxpool2)
        fc1 = NetworkSetting.fc1(reshape)
#         self.deco = NetworkSetting.decode(fc1)
#         rnn, l, _ = NetworkSetting.lstm(self.deco, self.state1_)

        q_value = NetworkSetting.q_value(fc1)

        print("---------MODEL SHAPE-------------")
        print(self.state1_.get_shape())
        print(self.conv1.get_shape())
        print(self.conv2.get_shape())
        print(reshape.get_shape())
        print(fc1.get_shape())
        print(q_value.get_shape())

        return q_value

    def _build_summary(self):
        """Create the summary placeholders, the merged summary ops and the FileWriter."""

        # Every scalar summary is fed through a shape-(1,) placeholder.
        self.loss_one = tf.placeholder(tf.float32,shape=(1,))
        self.loss_n = tf.placeholder(tf.float32,shape=(1,))
        self.loss_c = tf.placeholder(tf.float32,shape=(1,))
        self.loss_l = tf.placeholder(tf.float32,shape=(1,))

        self.reward = tf.placeholder(tf.float32,shape=(1,))
        self.frag = tf.placeholder(tf.int64,shape=(1,))
        self.death = tf.placeholder(tf.int64,shape=(1,))

        summary_lo = tf.summary.scalar('loss_one',self.loss_one[0])
        summary_ln = tf.summary.scalar('loss_nstep', self.loss_n[0])
        summary_lc = tf.summary.scalar('loss_class', self.loss_c[0])
        summary_ll = tf.summary.scalar('loss_l2',self.loss_l[0])

        self.merged_loss = tf.summary.merge([summary_lo,summary_ln,summary_lc,summary_ll])

        # Move the channel axis in front of the two spatial axes and append a
        # trailing axis of size 1, so each feature map becomes a 1-channel image.
        conv1_display = tf.expand_dims(tf.transpose(self.conv1, perm=[0,1,4,2,3]), axis=5)
        conv2_display = tf.expand_dims(tf.transpose(self.conv2, perm=[0,1,4,2,3]), axis=5)

        state_shape = self.state1_.get_shape()
        conv1_shape = conv1_display.get_shape()
        conv2_shape = conv2_display.get_shape()
        print("conv1_shape:", conv1_shape)
        print("conv2_shape:",conv2_shape)
        summary_state  = tf.summary.image('state',tf.reshape(self.state1_,[-1,state_shape[2], state_shape[3], state_shape[4]]),max_outputs = 1)
        summary_conv1 = tf.summary.image('conv1',tf.reshape(conv1_display,[-1, conv1_shape[3], conv1_shape[4], conv1_shape[5]]),max_outputs = 1)
        summary_conv2 = tf.summary.image('conv2',tf.reshape(conv2_display,[-1, conv2_shape[3], conv2_shape[4], conv2_shape[5]]),max_outputs = 1)

        self.merged_image = tf.summary.merge([summary_state,summary_conv1,summary_conv2])

        summary_reward = tf.summary.scalar('reward',self.reward[0])
        summary_frag = tf.summary.scalar('frag',self.frag[0])
        summary_death = tf.summary.scalar('death',self.death[0])

        self.merged_testscore = tf.summary.merge([summary_reward,summary_frag,summary_death])

        # One scalar per trainable variable: the mean of its values.
        self.merged_weights = tf.summary.merge([tf.summary.scalar('weights'+str(i),tf.reduce_mean(self.weights_params[i])) for i in range(len(self.weights_params))])

        self.writer = tf.summary.FileWriter(LOG_DIR,SESS.graph)

    # write summary about LOSS and IMAGE
    def write_summary(self,step,s1,loss_one,loss_n,loss_class,loss_l2):
        """Write the loss scalars and the state/conv image summaries for ``step``.

        Args:
            step: global step used as the summary x-axis value.
            s1: batch of states fed to ``state1_`` for the image summaries.
            loss_one, loss_n, loss_class, loss_l2: loss values; each may be a
                scalar or a one-element sequence.
        """
#         print(s1.shape)
#         print("step:",step,"loss_one:",loss_one, "loss_n:",loss_n, "loss_class",loss_class, "loss_l2",loss_l2)
        if step % self.SUMMARY_FREQ != 0:
            return

        # All four losses go through the same normalisation; previously loss_l2
        # alone was fed unwrapped, which does not match the (1,) placeholder
        # when a scalar is passed.
        feed_dict = {
            self.state1_: s1,
            self.loss_one: self._scalar_feed(loss_one),
            self.loss_n: self._scalar_feed(loss_n),
            self.loss_c: self._scalar_feed(loss_class),
            self.loss_l: self._scalar_feed(loss_l2),
        }
        m_s,m_i = SESS.run([self.merged_loss,self.merged_image],feed_dict=feed_dict)
        self.writer.add_summary(m_s,step)
        if step % self.IMAGE_FREQ == 0:
            self.writer.add_summary(m_i,step)

    def write_records(self,step,r,f,d):
        """Write the test-score summaries (reward ``r``, frags ``f``, deaths ``d``) for ``step``."""
#         r = np.array([[r]])
#         f = np.array([[f]])
#         d = np.array([[d]])
        m = SESS.run(self.merged_testscore,feed_dict={self.reward:[r],self.frag:[f],self.death:[d]})
        self.writer.add_summary(m,step)

    def write_weights(self, step):
        """Write the per-variable mean-weight summaries for ``step``; always returns 0."""
        m = SESS.run(self.merged_weights)
        self.writer.add_summary(m, step)
        return 0

    def save_model(self):
        """Save a checkpoint to MODEL_PATH + "/model.ckpt"."""
        self.saver.save(SESS, MODEL_PATH+"/model.ckpt")

    def load_model(self):
        """Restore the checkpoint at MODEL_PATH + "/model.ckpt" into SESS."""
        self.saver.restore(SESS, MODEL_PATH+"/model.ckpt")

In [None]:
class NetworkLocal(object):
    """Per-worker train/target Q-networks and the ops that sync with the parameter server.

    Each instance owns two copies of the network (scopes ``<name>_train`` and
    ``<name>_target``), a fixed-size queue of single-frame transitions filled by
    ``train_push`` and the ops that apply local gradients to the global weights.
    All session calls use the module-level ``SESS``.
    """

    # Capacity of the transition queue filled by train_push().
    QUEUE_SIZE = 120
    # Margin added to the Q-value of every action except the one actually taken.
    MARGIN = 0.8

    def __init__(self,name,parameter_server):
        """Build both networks and the training graph.

        Args:
            name: prefix for this worker's variable scopes.
            parameter_server: object exposing ``optimizer`` and ``weights_params``
                of the global network.
        """
        self.name = name

        queue_frames = (self.QUEUE_SIZE,RESOLUTION[0],RESOLUTION[1],RESOLUTION[2])
        self.s1 = np.zeros(shape=queue_frames,dtype=np.float32)
        self.s2 = np.zeros(shape=queue_frames,dtype=np.float32)
        self.reward = np.empty(shape=(self.QUEUE_SIZE,),dtype=np.float32)
        self.reward_adv = np.empty(shape=(self.QUEUE_SIZE,),dtype=np.float32)
        self.action = np.empty(shape=(self.QUEUE_SIZE,),dtype=np.float32)
        self.isdemo = np.empty(shape=(self.QUEUE_SIZE,),dtype=np.float32)
        self.queue_pointer = 0

        self.state1_ = tf.placeholder(tf.float32,shape=(None,N_ADV,)+RESOLUTION, name="A")
        self.state2_ = tf.placeholder(tf.float32,shape=(None,N_ADV,)+RESOLUTION, name="B")
        self.a_ = tf.placeholder(tf.int32, shape=(None,))
        self.r_ = tf.placeholder(tf.float32, shape=(None,))
        self.r_adv = tf.placeholder(tf.float32, shape=(None,))
        self.isdemo_ = tf.placeholder(tf.float32,shape=(None,))
        self.mergin_value = tf.placeholder(tf.float32,shape=(None,N_ACTION))
#         self.s1idx_ = tf.placeholder(tf.int32, shape = (None,))

        with tf.variable_scope(self.name+"_target", reuse=tf.AUTO_REUSE):
            self.model_t, self.len_s2 = self._model(self.state2_)
        with tf.variable_scope(self.name+"_train"):
            self.model_l, self.len_s1 = self._model(self.state1_)

        self._build_graph(parameter_server)

#         print("-----LOCAL weights---")
#         for w in self.weights_params:
#             print(w)

#         print("-----LOCAL grads---")
#         for w in self.grads:
#             print(w)

    @staticmethod
    def _margin_matrix(actions, n_action, margin):
        """Return a (len(actions), n_action) array: 0.0 at the taken action, ``margin`` elsewhere."""
        taken = np.asarray(actions).reshape(-1, 1) == np.arange(n_action).reshape(1, -1)
        return np.where(taken, 0.0, margin)

    @staticmethod
    def _history_windows(frames, count, n_adv):
        """Stack, for each of the first ``count`` queue slots, the frames that precede it.

        Row ``i`` holds the (at most ``n_adv``) frames stored immediately before
        slot ``i``, left-aligned, with unused trailing positions left as zeros.

        The window is capped at ``n_adv`` frames so the assignment stays valid
        once more than ``n_adv`` transitions are queued. Zero padding is used
        because NaN padding would propagate into the loss and gradients; the
        sequence-length logic in ``NetworkSetting.lstm`` also counts all-zero
        frames as padding.
        NOTE(review): row ``i`` excludes frame ``i`` itself, as before, so row 0
        is entirely padding - confirm that this is intended.
        """
        frames = np.asarray(frames)
        windows = np.zeros((count, n_adv) + frames.shape[1:], dtype=frames.dtype)
        for i in range(count):
            n_valid = min(i, n_adv)
            windows[i, :n_valid] = frames[i - n_valid:i]
        return windows

    def _model(self,state):
        """Build the conv/fc Q-network on ``state``; returns (q_value, 0).

        The second element stands in for the sequence length that the disabled
        LSTM path would return.
        """

#         enco = NetworkSetting.encode(state)
        conv1 = NetworkSetting.conv1(state)
        maxpool1 = NetworkSetting.maxpool1(conv1)
        conv2 = NetworkSetting.conv2(maxpool1)
        maxpool2 = NetworkSetting.maxpool2(conv2)
        reshape = NetworkSetting.reshape(maxpool2)
        fc1 = NetworkSetting.fc1(reshape)
#         deco = NetworkSetting.decode(fc1)
#         rnn, lengh, _ = NetworkSetting.lstm(deco, state)
#         self.deco = NetworkSetting.decode(fc1)
#         self.rnn, lengh, _ = NetworkSetting.lstm(self.deco, state)

        q_value = NetworkSetting.q_value(fc1)

        return q_value, 0

    def _build_graph(self,parameter_server):
        """Define the action distribution, the losses, the gradients and the sync ops."""

#         self.best_action = tf.argmax(self.model_l, axis=0)
        self.prob_action = tf.nn.softmax(self.model_l, axis=1)

        # Target Q-values are used only where the two length values agree.
        q_model_t = tf.where(tf.equal(self.len_s2, self.len_s1) , self.model_t,tf.zeros_like(self.model_t))
        self.test1 = q_model_t

#         self.loss_one = tf.square(tf.stop_gradient(self.r_ + tf.reduce_max(q_model_t,axis=1)) - tf.reduce_max(self.model_l,axis=1))
#         self.loss_adv = tf.square(tf.stop_gradient(self.r_adv + tf.reduce_max(q_model_t,axis=1)) - tf.reduce_max(self.model_l,axis=1))
        # NOTE(review): both TD terms compare against max_a Q(s1, a) rather than
        # the Q-value of the action taken, and apply no discount - confirm intended.
        self.loss_one = tf.reduce_mean(tf.abs(tf.stop_gradient(self.r_ + tf.reduce_max(q_model_t,axis=1)) - tf.reduce_max(self.model_l,axis=1)))
        self.loss_adv = tf.reduce_mean(tf.abs(tf.stop_gradient(self.r_adv + tf.reduce_max(q_model_t,axis=1)) - tf.reduce_max(self.model_l,axis=1)))

        # Margin term: max over ACTIONS of (Q + margin), computed per sample.
        # Without `axis=1` the max was taken over the whole batch, so every sample
        # was compared against the single largest value in the batch.
        target = tf.stop_gradient(tf.reduce_max(self.model_l + self.mergin_value, axis=1))
        idx = tf.transpose([tf.range(tf.shape(self.model_l)[0]), self.a_])
        self.loss_class =  tf.reduce_mean((target- tf.gather_nd(self.model_l,indices=idx)) * self.isdemo_)

        self.loss_total = self.loss_one + LAMBDA1 * self.loss_adv + LAMBDA2 * self.loss_class

        self.weights_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.name+"_train")
        self.weights_params_target = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.name+"_target")
        self.grads = tf.gradients(self.loss_total,self.weights_params)
        # train -> target copy
        self.copy_params = [t.assign(l) for l,t in zip(self.weights_params, self.weights_params_target)]

        # Gradients of the local train network are applied to the GLOBAL weights.
        self.update_global_weight_params = \
            parameter_server.optimizer.apply_gradients(zip(self.grads, parameter_server.weights_params))

        # global -> local train network
        self.pull_global_weight_params = [l_p.assign(g_p) for l_p,g_p in zip(self.weights_params,parameter_server.weights_params)]

        # local train network -> global
        self.push_local_weight_params = [g_p.assign(l_p) for g_p,l_p in zip(parameter_server.weights_params,self.weights_params)]

    def pull_parameter_server(self):
        """Overwrite the local train weights with the global weights."""
        SESS.run(self.pull_global_weight_params)

    def push_parameter_server(self):
        """Overwrite the global weights with the local train weights."""
        SESS.run(self.push_local_weight_params)

    def show_weights(self):
        """Print the current value of every local train variable."""
        hoge = SESS.run(self.weights_params)
        for i in range(len(hoge)):
            print(hoge[i])

    def update_parameter_server_batch(self, s1, a, r, r_adv, s2, isdemo):
        """Apply one gradient step to the global weights from an explicit batch.

        A single 4-D state is promoted to a batch of one.

        Returns:
            (loss_one, loss_adv, loss_class) evaluated on the batch.
        """
        if np.ndim(s1) == 4:
            s1 = np.array([s1])
        if np.ndim(s2) == 4:
            s2 = np.array([s2])
        mergin = self._margin_matrix(a, N_ACTION, self.MARGIN)

        feed_dict = {self.state1_: s1,self.a_:a, self.r_:r,self.r_adv:r_adv, self.state2_:s2, self.mergin_value:mergin,self.isdemo_:isdemo}
        val = SESS.run([self.update_global_weight_params, self.loss_one, self.loss_adv, self.loss_class],feed_dict)
#         val = SESS.run([self.update_global_weight_params, self.loss_one, self.loss_adv, self.loss_class,self.rnn, self.grads, self.deco],feed_dict)
#         print("--------------------------")
#         print("deco:",[np.mean(v) for v in val[6]])
#         print("rnn:",[np.mean(v) for v in val[4]])
#         print("grad", [np.mean(v) for v in val[5]])
        return val[1], val[2], val[3]


    def update_parameter_server(self):
        """Apply one gradient step from the queued transitions, then empty the queue.

        Does nothing when the queue is empty.
        """
        count = self.queue_pointer
        if count <= 0:
            return

        s1 = self._history_windows(self.s1, count, N_ADV)
        s2 = self._history_windows(self.s2, count, N_ADV)
        r = self.reward[0:count]
        a = self.action[0:count]
        r_adv = self.reward_adv[0:count]
        mergin = self._margin_matrix(a, N_ACTION, self.MARGIN)
        isdemo = self.isdemo[0:count]

        feed_dict = {self.state1_: s1, self.a_:a, self.r_:r, self.r_adv:r_adv, self.state2_:s2, self.mergin_value:mergin, self.isdemo_:isdemo}
#         _, l, m_l, m_t = SESS.run([self.update_global_weight_params, self.loss_total, self.model_l, self.model_t],feed_dict)
        SESS.run(self.update_global_weight_params,feed_dict)
        self.queue_pointer = 0

    def predict_best_action(self, s1):
        """Sample one action per state from the softmax over the train network's Q-values.

        A single 4-D state is promoted to a batch of one. Returns a list with
        one action index per batch row.
        """
        if np.ndim(s1)==4:
            s1 = np.array([s1])

#         print(SESS.run(self.model_l, {self.state1_:s1}))
#         return SESS.run(self.best_action,{self.state1_:s1})

        probs = SESS.run(self.prob_action, {self.state1_:s1})
#         print(probs)

        return [np.random.choice(N_ACTION, p=p) for p in probs]

    def get_q_value(self,s1):
        """Return the train network's Q-values for ``s1`` (a single 4-D state or a batch)."""
        if np.ndim(s1)==4:
            s1 = np.array([s1])

        return SESS.run(self.model_l,{self.state1_:s1})

    def calc_loss(self, s1, a, r, r_adv, s2, isdemo):
        """Evaluate [loss_one, loss_adv, loss_class] on a batch without updating any weights."""
        mergin = self._margin_matrix(a, N_ACTION, self.MARGIN)

        feed_dict = {self.state1_: s1,self.a_:a, self.r_:r,self.r_adv:r_adv, self.state2_:s2, self.mergin_value:mergin,self.isdemo_:isdemo}
        return SESS.run([self.loss_one, self.loss_adv, self.loss_class],feed_dict)

    def copy_learn2target(self):
        """Copy the train network's weights into the target network."""
        SESS.run(self.copy_params)

    def train_push(self,s1,a,r,r_adv,s2,isdemo):
        """Append one transition to the queue at ``queue_pointer`` and advance the pointer."""
        # Push obs to make batch
        slot = self.queue_pointer
        self.s1[slot] = s1
        self.s2[slot] = s2
        self.action[slot] = a
        self.reward[slot] = r
        self.reward_adv[slot] = r_adv
        self.isdemo[slot] = isdemo
        self.queue_pointer += 1

In [None]:
# Pre-learning entry point. The guard compares __name__ with "learning", so this
# cell only runs when that name is set on purpose.
if __name__ == "learning":
    
    # Module-level state initialised here for the run.
    # NOTE(review): presumably read/updated by the worker/environment code
    # defined earlier in the notebook - confirm.
    frames = 0
    runout = False
    current_time = 0
    start_time_async = 0

    # Session restricted to the GPU(s) listed in USED_GPU; soft placement lets
    # ops fall back to another device when the requested one is unavailable.
    config = tf.ConfigProto(gpu_options = tf.GPUOptions(visible_device_list=USED_GPU))
    config.log_device_placement = True
    config.allow_soft_placement = True
    SESS = tf.Session(config=config)
    
    # The global network is created under the CPU device scope, the workers and
    # environments under the GPU one.
    threads = []
    with tf.device("/cpu:0"):
        parameter_server = ParameterServer()

    with tf.device("/gpu:0"):
        for i in range(N_WORKERS):            
            threads.append(WorkerThread("learning_"+str(i),parameter_server))

        pre_env = Environment("pre_env",parameter_server)
        test_env = Environment("test_env", parameter_server)

    # Initialise variables only after every network has been added to the graph.
    SESS.run(tf.global_variables_initializer())

    # Only the first worker is flagged to write summaries.
    threads[0].environment.summary=True

    time.sleep(5.0)

    print("---LOADING DEMO---")
    pre_env.load_demonstration()
    print("---PRE LEARNING---")
    start_time_pre = datetime.datetime.now()
    pre_env.run_pre_learning()
    
#     if SAVE_FILE == True:
#         print("---SAVING GIF---")
#         test_env.run_test(True)

    # The asynchronous multi-thread learning phase below is currently disabled.
#     print("---MULTI THREAD LEARNING---")
#     start_time_async = datetime.datetime.now()
#     for worker in threads:
#         job = lambda: worker.run()      # this is just the usual idiom for running multiple threads
#         t = threading.Thread(target=job)
#         t.start()

#     test_frame = 0
#     while True:
#         if frames >= test_frame and frames<test_frame+1000:
#             r,f,d = test_env.run_test()
#             if SAVE_FILE == True:
#                 parameter_server.write_weights(frames)
#                 parameter_server.write_records(frames,r,f,d)
#             test_frame += 1000
#         elif frames >= test_frame+1000:
#             print("TEST at %d~%d step cant be finished"%(test_frame, test_frame+1000-1))
#             test_frame += 1000
#         else:
#             pass

#         if datetime.datetime.now() > TIME_LEARN + start_time_async:
#             runout = True
#             break
#     print("*****************************\nTIME to PRE LEARNING:%.3f [sec]\n*****************************"%(datetime.datetime.now()-start_time_pre).seconds)
#     print("*****************************\nTIME to ASYNC LEARNING:%.3f [sec]\n*****************************"%(datetime.datetime.now()-start_time_async).seconds)

#     print("---LEARNING PHASE IS FINISHED---")
#     test_env.run_test()

    # Persist the global network and record a test run only when saving is enabled.
    if SAVE_FILE == True:
        print("---SAVING_MODEL---")
        parameter_server.save_model()
        print("---SAVING GIF---")
        test_env.run_test(True)

In [None]:
# Evaluation entry point: rebuild the graph, restore the saved checkpoint and
# run one test episode. Runs only when __name__ is set to "test".
if __name__ == "test":

    # Module-level state reset for this run.
    frames, current_time, start_time_async = 0, 0, 0
    runout = False

    # Same session settings as the training cell, passed as constructor arguments.
    config = tf.ConfigProto(
        gpu_options=tf.GPUOptions(visible_device_list=USED_GPU),
        log_device_placement=True,
        allow_soft_placement=True,
    )
    SESS = tf.Session(config=config)

    with tf.device("/cpu:0"):
        parameter_server = ParameterServer()

    with tf.device("/gpu:0"):
        test_env = Environment("test_env", parameter_server)

    # Initialise every variable first, then overwrite with the checkpoint values.
    SESS.run(tf.global_variables_initializer())
    parameter_server.load_model()

    test_env.run_test(True)

In [None]:
# Default cell: create a session and an environment, then load the demonstration
# data into `h` for inspection in the following cell.
if __name__ == "__main__":

    # Module-level state reset for this run.
    frames, current_time, start_time_async = 0, 0, 0
    runout = False

    config = tf.ConfigProto(
        gpu_options=tf.GPUOptions(visible_device_list=USED_GPU),
        log_device_placement=True,
        allow_soft_placement=True,
    )
    SESS = tf.Session(config=config)

    test_env = Environment("test", ParameterServer())
    h = test_env.load_demonstration()

In [None]:
# Show how many entries were loaded, then the reward of the first element of each.
print(len(h))
for t in h:
    first = t[0]
    print(first.reward)