In [1]:
import gym 
import numpy as np
import tensorflow as tf
from pathlib import Path

In [2]:
class learning_agent(object):
    def __init__(self, env):
        self.env = env
        # agent memory
        self.trajectory = [] # seq of state, union of state
        self.temporary_traj = [] # store new comming unlabel seq
        self.expert_action_history = [] # seq of expert action, union of action
        self.save_freq = 0
        # define network structure (state input and action output)
        self.state, self.action =  self.build_net(self.env)
        # trajectory and regression 
        self.expert_action = tf.placeholder(tf.float32, [None, self.env.action_space.shape[0]])
        self.loss = tf.losses.mean_squared_error(self.expert_action, self.action)
        self.optimizer = tf.train.AdamOptimizer(0.01).minimize(self.loss)
        # start session
        self.sess = tf.Session()
        self.saver=tf.train.Saver()
        self.sess.run(tf.global_variables_initializer())
        # check memory existance
        my_file = Path("tmp/model.ckpt.index")
        if my_file.is_file():
            self.saver.restore(self.sess,"tmp/model.ckpt")
            print("model restore")
        else:
            print("start new file")
    
    def build_net(self, env):
        act_shape = env.action_space.shape[0]
        obs_shape = env.observation_space.shape[0]
        state = tf.placeholder(tf.float32, [None, obs_shape])
        # network structure
        l1 = tf.layers.dense(inputs = state, units = 50, activation = tf.nn.relu, 
                             kernel_initializer = tf.random_normal_initializer(0.,.1), 
                             bias_initializer = tf.constant_initializer(0.1), name = "policy_l1")
        
        l2 = tf.layers.dense(inputs = l1, units = 20, activation = tf.nn.relu, 
                             kernel_initializer = tf.random_normal_initializer(0.,.1), 
                             bias_initializer = tf.constant_initializer(0.1), name = "policy_l2")
        
        action = tf.layers.dense(inputs = l1, units = act_shape, activation = tf.nn.tanh, 
                             kernel_initializer = tf.random_normal_initializer(0.,.1), 
                             bias_initializer = tf.constant_initializer(0.1), name = "action")
        
        scale_action = tf.multiply(action, np.array([env.action_space.high.tolist()]), name = "scale_action")
        return state, scale_action
    
    def imitation_learn(self):
        feed_dict = {self.state: self.trajectory, self.expert_action: self.expert_action_history}
        # optimize loss
        for i in range(500):
            self.sess.run(self.optimizer, feed_dict)
            if i % 499 == 0:
                print("difference in policy: ", self.sess.run(self.loss, feed_dict))
        
        self.save_freq = self.save_freq + 1 
        if self.save_freq % 50 == 1:
            self.saver.save(self.sess, "tmp/model.ckpt")

    def expert_label(self, expert):
        for i in range(len(self.temporary_traj)):
            if len(self.expert_action_history) == 0:
                self.expert_action_history = np.array([expert.act(self.temporary_traj[i,:]).tolist()])
            else:
                self.expert_action_history = np.append(self.expert_action_history,
                                                       np.array([expert.act(self.temporary_traj[i,:]).tolist()]),0)
        # data aggregate
        if len(self.trajectory) == 0:
            self.trajectory = self.temporary_traj
        else:
            self.trajectory = np.append(self.trajectory, self.temporary_traj,0)
        
        self.temporary_traj = [] # temporary memeory clear to zero
    
    def pick_action(self, obs):
        # store temporal obs
        if len(self.temporary_traj) == 0:
            self.temporary_traj = np.array([obs.tolist()])
        else:
            self.temporary_traj = np.append(self.temporary_traj, np.array([obs.tolist()]), 0)
            
        #print(self.sess.run(self.action, feed_dict={self.state : np.array([obs.tolist()])}))
        return self.sess.run(self.action, feed_dict={self.state : np.array([obs.tolist()])})[0]       

In [3]:
class expert(object):
    def __init__(self, env):
        self.env = env
        
    def act(self, obs):
        return self.env.action_space.sample()

In [4]:
env = gym.make('Pendulum-v0')
env.seed(1)
pi = expert(env)
agent = learning_agent(env)
for _ in range(4):
    score = 0
    observation = env.reset()
    done = False
    while done == False:
        env.render()
        action = agent.pick_action(observation) # collect traj and agent current policy
        #print(action)
        observation_, reward, done, info = env.step(action)
        score = score + reward
        observation = observation_
    
    print("sum of score: ", score)
    agent.expert_label(pi) # ask expert to label data
    agent.imitation_learn() # imitation learning

[2017-09-19 09:50:28,836] Making new env: Pendulum-v0


start new file
sum of score:  -1232.57727621
difference in policy:  1.28678
difference in policy:  0.953069
sum of score:  -1276.15545997
difference in policy:  1.51011
difference in policy:  1.11669
sum of score:  -1285.64930045
difference in policy:  1.27066
difference in policy:  1.1955
sum of score:  -1359.33047099
difference in policy:  1.23966
difference in policy:  1.18626
