In [1]:
import os.path, gym
import numpy as np
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import tensorflow as tf
import roboschool
from pathlib import Path

# Loading expert agent

In [2]:
from expert1 import *

# Design imitation agent

In [3]:
class learning_agent(object):
    def __init__(self, env):
        self.env = env
        # agent memory
        self.trajectory = [] # seq of state, union of state
        self.temporary_traj = [] # store new comming unlabel seq
        self.expert_action_history = [] # seq of expert action, union of action
        self.save_freq = 0
        # define network structure (state input and action output)
        self.state, self.action =  self.build_net(self.env)
        # trajectory and regression 
        self.expert_action = tf.placeholder(tf.float32, [None, self.env.action_space.shape[0]])
        self.loss = tf.losses.mean_squared_error(self.expert_action, self.action)
        self.optimizer = tf.train.AdamOptimizer(0.001).minimize(self.loss)
        # start session
        self.sess = tf.Session()
        self.saver=tf.train.Saver()
        self.sess.run(tf.global_variables_initializer())
        # check memory existance
        my_file = Path("memory/model.ckpt.index")
        if my_file.is_file():
            self.saver.restore(self.sess,"memory/model.ckpt")
            print("model restore")
        else:
            print("start new file")
    
    def build_net(self, env):
        act_shape = env.action_space.shape[0]
        obs_shape = env.observation_space.shape[0]
        state = tf.placeholder(tf.float32, [None, obs_shape])
        # network structure
        l1 = tf.layers.dense(inputs = state, units = 128, activation = tf.nn.relu, 
                             kernel_initializer = tf.random_normal_initializer(0.,.1), 
                             bias_initializer = tf.constant_initializer(0.1), name = "policy_l1")
        
        l2 = tf.layers.dense(inputs = l1, units = 64, activation = tf.nn.relu, 
                             kernel_initializer = tf.random_normal_initializer(0.,.1), 
                             bias_initializer = tf.constant_initializer(0.1), name = "policy_l2")
        
        action = tf.layers.dense(inputs = l1, units = act_shape, activation = tf.nn.tanh, 
                             kernel_initializer = tf.random_normal_initializer(0.,.1), 
                             bias_initializer = tf.constant_initializer(0.1), name = "action")

        scale_action = tf.multiply(action, np.array([env.action_space.high.tolist()]), name = "scale_action")
        return state, scale_action
    
    def imitation_learn(self):
        feed_dict = {self.state: self.trajectory, self.expert_action: self.expert_action_history}
        # optimize loss
        for i in range(800):
            self.sess.run(self.optimizer, feed_dict)
            if i % 499 == 0:
                if i == 0:
                    print("initial cost: ",self.sess.run(self.loss, feed_dict))
                else:
                    print("final cost:   ",self.sess.run(self.loss, feed_dict))
        
        self.save_freq = self.save_freq + 1 
        if self.save_freq % 300 == 1:
            self.saver.save(self.sess, "memory/model.ckpt")

    def expert_label(self, expert):
        for i in range(len(self.temporary_traj)):
            if len(self.expert_action_history) == 0:
                self.expert_action_history = np.array([expert.act(self.temporary_traj[i,:], self.env).tolist()])
            else:
                self.expert_action_history = np.append(self.expert_action_history,
                                                       np.array([expert.act(self.temporary_traj[i,:], 
                                                                            self.env).tolist()]),0)
        # data aggregate
        if len(self.trajectory) == 0:
            self.trajectory = self.temporary_traj
        else:
            self.trajectory = np.append(self.trajectory, self.temporary_traj,0)
        
        self.temporary_traj = [] # temporary memeory clear to zero
    
    def pick_action(self, obs):
        # store temporal obs
        if len(self.temporary_traj) == 0:
            self.temporary_traj = np.array([obs.tolist()])
        else:
            self.temporary_traj = np.append(self.temporary_traj, np.array([obs.tolist()]), 0)
            
        return self.sess.run(self.action, feed_dict={self.state : np.array([obs.tolist()])})[0]  

# imitate expert action

In [None]:
def imitation(itr, epi_s):
    # environment and expert set up
    config = tf.ConfigProto(
        inter_op_parallelism_threads=1,
        intra_op_parallelism_threads=1,
        device_count = { "GPU": 0 } )
    sess = tf.InteractiveSession(config=config)
    env = gym.make("RoboschoolAnt-v1")
    pi = ZooPolicyTensorflow("mymodel1", env.observation_space, env.action_space)
    agent = learning_agent(env) 
    for _ in range(itr):
        sc = 0
        observation = env.reset()
        done = False
        count_epi = 0 

        while done == False:
            action = agent.pick_action(observation) # collect traj and agent current policy
            observation_, reward, done, info = env.step(action)
            sc = sc + reward
            observation = observation_

            count_epi = count_epi + 1
            if count_epi > epi_s:
                done = True

        
        print("cumulative socre: ", sc)
        agent.expert_label(pi) # ask expert to label data
        agent.imitation_learn() # imitation learning
        print(" ")

In [None]:
imitation(80,500)

[2017-09-21 06:15:14,508] Making new env: RoboschoolAnt-v1


INFO:tensorflow:Restoring parameters from memory/model.ckpt


[2017-09-21 06:15:14,852] Restoring parameters from memory/model.ckpt


model restore
cumulative socre:  -128.24372312006315
initial cost:  8.36126
final cost:    4.29493
 
cumulative socre:  -26.82303733246082
initial cost:  3.73577
final cost:    2.48827
 
cumulative socre:  144.07803698030042
initial cost:  3.16901
final cost:    2.31962
 
cumulative socre:  392.10155005651325
initial cost:  1.78622
final cost:    1.74355
 
cumulative socre:  442.8480729926618
initial cost:  1.54794
final cost:    1.4181
 
cumulative socre:  467.2443330571217
initial cost:  1.25657
final cost:    1.19498
 
cumulative socre:  408.1150816164153
initial cost:  1.06481
final cost:    1.02851
 
cumulative socre:  538.897613373655
initial cost:  0.915832
final cost:    0.900813
 
cumulative socre:  453.18675589153736
initial cost:  0.845488
final cost:    0.806524
 
cumulative socre:  520.5569512682487
initial cost:  0.735488
final cost:    0.725822
 
cumulative socre:  529.390553710573
initial cost:  0.668718
final cost:    0.661394
 
cumulative socre:  638.9192590024409
ini