In [1]:
import os.path, gym
import numpy as np
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import tensorflow as tf
import roboschool
from pathlib import Path

# Policy Gradient with sum of total reward update

In [2]:
class learning_agent(object):
    def __init__(self, env):
        self.env = env
        # agent memory
        self.trajectory = [] # seq of state, union of state
        self.reward_traj = []
        self.action_traj = []
        self.save_freq = 0
        self.gamma = 1 # finite horizon so deacy can be 1
        # agent data flow checking variable
        self.mean_check = []
        self.sig_check = []
        # define network structure (state input and action output)
        self.state, self.action =  self.build_net(self.env)
        
        self.action_tr = tf.placeholder(tf.float32, [None, 2])
        self.total_reward = tf.placeholder(tf.float32, [None, 1])
        
        # discrete policy gradient opject defined
        self.loss = tf.reduce_mean(-tf.log(self.action) * self.action_tr * self.total_reward)
        self.optimizer = tf.train.AdamOptimizer(0.001).minimize(self.loss)
        
        # start session
        self.sess = tf.Session()
        self.saver=tf.train.Saver()
        self.sess.run(tf.global_variables_initializer())
        
        # check nn memory existance
        my_file = Path("discrete_mem/model.ckpt.index")
        if my_file.is_file():
            self.saver.restore(self.sess,"discrete_mem/model.ckpt")
            print("model restore")
        else:
            print("start new file")
    
    def build_net(self, env):
        act_shape = 2
        obs_shape = env.observation_space.shape[0]
        state = tf.placeholder(tf.float32, [None, obs_shape])
        # network structure
        l1 = tf.layers.dense(inputs = state, units = 64, activation = tf.nn.relu, 
                             kernel_initializer = tf.random_normal_initializer(0.,.1), 
                             bias_initializer = tf.constant_initializer(0.1), name = "policy_l1")
        
        l2 = tf.layers.dense(inputs = l1, units = 32, activation = tf.nn.relu, 
                             kernel_initializer = tf.random_normal_initializer(0.,.1), 
                             bias_initializer = tf.constant_initializer(0.1), name = "policy_l2")

        action = tf.layers.dense(inputs = l2, units = act_shape, activation = tf.nn.softmax, 
                             kernel_initializer = tf.random_normal_initializer(0.,.1), 
                             bias_initializer = tf.constant_initializer(0.1), name = "action")
        
        return state, action
    
    def pg_learn(self):
        # suming reward
        total_reward= -300
        for i in range(len(self.reward_traj)):
            total_reward = total_reward + self.reward_traj[i]
        
        total_reward = np.array([[total_reward]])
        total_reward.astype(np.float32)
        self.trajectory.astype(np.float32)
        self.action_traj.astype(np.float32)
        feed_dict = {self.state: self.trajectory, self.action_tr: self.action_traj, self.total_reward: total_reward}
        # optimize loss
        #print(self.sess.run(self.log_r_sum, feed_dict))
        
        self.sess.run(self.optimizer, feed_dict)
        # store memory
        self.save_freq = self.save_freq + 1 
        if self.save_freq % 10000 == 1:
            self.saver.save(self.sess, "discrete_mem/model.ckpt")
            
        self.trajectory = [] 
        self.reward_traj = []
        self.action_traj = []
    
    def pick_action(self, obs):
        # store sample trajectory
        if len(self.trajectory) == 0:
            self.trajectory = np.array([obs.tolist()])
        else:
            self.trajectory = np.append(self.trajectory, np.array([obs.tolist()]), 0)
        
        action = self.sess.run(self.action, feed_dict={self.state : np.array([obs.tolist()])})[0]
        action = np.random.choice([0,1],p=action/np.sum(action))
        
        a_oneHot = np.zeros([1,2])[0]
        a_oneHot[action] = 1
        
        # store action trajectory
        if len(self.action_traj) == 0:
            self.action_traj = np.array([a_oneHot.tolist()])
        else:
            self.action_traj = np.append(self.action_traj, np.array([a_oneHot.tolist()]), 0)
        
        return action    
    
    def get_reward(self, reward):
        
        self.reward_traj.append(reward)
        
    def get_parameter(self, obs):
        print("mean: ", self.sess.run(self.a_mean, feed_dict={self.state : np.array([obs.tolist()])})[0], "\n")
        print("sigm: ", self.sess.run(self.a_sigma, feed_dict={self.state : np.array([obs.tolist()])})[0], "\n")
        

# Sample trajectory by random policy

In [3]:
env = gym.make("CartPole-v0")
env.seed(1) 
observation = env.reset()
for _ in range(1):
    done = False
    count_epi = 0 
    while done == False:
        print ("state: \n", observation,'\n')
        observation_, reward, done, info = env.step(env.action_space.sample())
        print("action: \n", env.action_space.sample(), "\n")
        print("reward: \n", reward,"\n")
        observation = observation_
        count_epi = count_epi + 1
        if count_epi > 1:
            done = True

[2017-09-24 22:31:26,351] Making new env: CartPole-v0


state: 
 [ 0.03073904  0.00145001 -0.03088818 -0.03131252] 

action: 
 1 

reward: 
 1.0 

state: 
 [ 0.03076804 -0.19321569 -0.03151444  0.25146705] 

action: 
 0 

reward: 
 1.0 



# Monte Carlo and Policy Gradient with Discrete Policy

In [4]:
def pg(itr, epi_s):
    # environment and expert set up
    config = tf.ConfigProto(
        inter_op_parallelism_threads=8,
        intra_op_parallelism_threads=8,
        device_count = { "GPU": 0 } )
    sess = tf.InteractiveSession(config=config)
    env = gym.make("CartPole-v0")
    env = env.unwrapped
    render = False
    env.seed(1)
    agent = learning_agent(env) 
    q = 0
    run_avg = 0
    for _ in range(itr):
        sc = 0
        observation = env.reset()
        done = False
        count_epi = 0 
        # start new episodes
        while done == False:
            if render:
                env.render()
                
            action = agent.pick_action(observation) # collect traj and agent current policy
            observation_, reward, done, info = env.step(action)
            sc = sc + reward
            agent.get_reward(reward)
            observation = observation_
            # start new epi if perform step larger than epi_s
            count_epi = count_epi + 1
            if count_epi > epi_s:
                done = True
            #print(action)
        q = q + 1
        if q%100==1:
            print("metirc: ", run_avg, "\n")
        # perform learning based on current data
        agent.pg_learn()
        if sc > 100:
            render = False
            
        run_avg = run_avg* 0.95+0.05*sc
    return agent

In [5]:
agent = pg(1000,600)

[2017-09-24 22:31:26,503] Making new env: CartPole-v0


INFO:tensorflow:Restoring parameters from discrete_mem/model.ckpt


[2017-09-24 22:31:26,817] Restoring parameters from discrete_mem/model.ckpt


model restore
metirc:  0 

metirc:  21.31636858838623 

metirc:  30.19934138226741 

metirc:  33.78688403250608 

metirc:  64.09221703812656 

metirc:  107.46098126439628 

metirc:  135.43292237472554 

metirc:  336.869276919681 

metirc:  506.2863129031171 

metirc:  486.2004574532918 



agent.mean_check.astype(np.float64)
agent.sig_check.astype(np.float64)
a = tf.contrib.distributions.Normal(loc = agent.mean_check, scale = agent.sig_check)

agent.action_traj.astype(np.float64)
action_tr = tf.placeholder(tf.float32, [None, env.action_space.shape[0]])
tf.Session().run(a.prob(action_tr), {action_tr: agent.action_traj})

import pandas as pd
pd.DataFrame(agent.mean_check)

pd.DataFrame(agent.sig_check)