In [1]:
import numpy as np
import tensorflow as tf

In [3]:
# Lower / upper bound of each of the seven action components. Actor.build_network
# uses them to rescale its tanh output from [-1, 1] into [low, high].
# NOTE(review): values look like joint limits in radians -- confirm with the robot spec.
action1_low, action1_high = -2.97, 2.97
action2_low, action2_high = -1.57, 1.57
action3_low, action3_high = -1.57, 1.57
action4_low, action4_high = -2.81, 0.04
action5_low, action5_high = -2.77, 1.51
action6_low, action6_high = -1.57, 1.57
action7_low, action7_high = -2.96, 2.97

# Training hyperparameters read by the Actor / Critic classes.
minibatch_size = 128            # divisor applied to the summed actor gradient
actor_learning_rate = 1e-4      # Adam step size for the actor
critic_learning_rate = 1e-3     # Adam step size for the critic
gamma = 0.98                    # discount factor in the critic's TD target
tau = 0.001                     # mixing rate of the target-network soft update

In [5]:
class Actor:
    """Deterministic policy network with an evaluation copy and a target copy.

    Everything lives in a private ``tf.Graph`` / ``tf.Session`` pair
    (TensorFlow 1.x graph mode). After construction the following handles are
    available for ``self.sess.run``:

    * ``state``, ``goal``  -- input placeholders, shape ``[None, state_size]``
      and ``[None, goal_size]``.
    * ``action``           -- evaluation-network output, rescaled to the
      per-dimension action bounds defined at module level.
    * ``target_action``    -- same forward pass using the target parameters.
    * ``dq_da``            -- placeholder for the critic's gradient of Q with
      respect to the action, shape ``[None, action_size]``.
    * ``optimizer``        -- Adam train op for the evaluation parameters
      (also exposed under the legacy spelling ``optimaizer``).

    Args:
        state_size: width of the state vector.
        action_size: width of the action vector; must match the number of
            module-level ``actionN_low`` / ``actionN_high`` pairs (seven).
        goal_size: width of the goal vector.
        kind: stored on the instance as-is; not used inside this class.
    """

    def __init__(self, state_size, action_size, goal_size, kind):
        self.state_size = state_size
        self.action_size = action_size
        self.goal_size = goal_size
        self.kind = kind
        self.eparams = {}   # evaluation network parameters
        self.tparams = {}   # target network parameters

        self.graph = tf.Graph()
        with self.graph.as_default():
            self.sess = tf.Session(graph=self.graph)
            self.set_parameter(self.eparams)
            self.set_parameter(self.tparams)

            self.action = self.build_network(self.eparams)
            # The target network shares the input placeholders so both copies
            # can be evaluated from the same feed.
            self.target_action = self._forward(self.tparams, self.state, self.goal)

            self.optimizer = self.train_evaluation_network()
            self.optimaizer = self.optimizer    # legacy (misspelled) attribute name

            # Build the target-sync ops exactly once; creating them per call
            # would keep growing the graph and they were never executed.
            names = sorted(self.eparams)
            self._hard_update = tf.group(*[
                tf.assign(self.tparams[p], self.eparams[p]) for p in names])
            self._soft_update = tf.group(*[
                tf.assign(self.tparams[p],
                          tau * self.eparams[p] + (1 - tau) * self.tparams[p])
                for p in names])

            # Initialise only after the optimizer exists so that Adam's slot
            # variables are covered, then copy evaluation -> target weights.
            self.sess.run(tf.global_variables_initializer())
            self.sess.run(self._hard_update)

    def set_parameter(self, params):
        """Fill ``params`` (a dict) with fresh weight/bias variables for the three layers."""
        params['w1'] = tf.Variable(tf.truncated_normal(shape = [self.state_size + self.goal_size, 256], mean = 0.0, stddev = 1.0))
        params['b1'] = tf.Variable(tf.zeros(256))
        params['w2'] = tf.Variable(tf.truncated_normal(shape = [256, 64], mean = 0.0, stddev = 1.0))
        params['b2'] = tf.Variable(tf.zeros(64))
        params['w3'] = tf.Variable(tf.truncated_normal(shape = [64, self.action_size], mean = 0.0, stddev = 1.0))
        params['b3'] = tf.Variable(tf.zeros(self.action_size))

    @staticmethod
    def _action_affine(bounds):
        """Return ``(half_ranges, midpoints)`` for a list of ``(low, high)`` pairs."""
        half_ranges = [(high - low) / 2.0 for low, high in bounds]
        midpoints = [(high + low) / 2.0 for low, high in bounds]
        return half_ranges, midpoints

    def _forward(self, params, state, goal):
        """Apply the policy defined by ``params`` to the given input tensors.

        Raises:
            ValueError: if ``action_size`` differs from the number of
                module-level action bound pairs.
        """
        bounds = [(action1_low, action1_high),
                  (action2_low, action2_high),
                  (action3_low, action3_high),
                  (action4_low, action4_high),
                  (action5_low, action5_high),
                  (action6_low, action6_high),
                  (action7_low, action7_high)]
        if self.action_size != len(bounds):
            raise ValueError('action_size is %r but %d action bounds are defined'
                             % (self.action_size, len(bounds)))

        inputs = tf.concat([state, goal], axis = 1)
        hidden = tf.nn.relu(tf.matmul(inputs, params['w1']) + params['b1'])
        hidden = tf.nn.relu(tf.matmul(hidden, params['w2']) + params['b2'])
        squashed = tf.nn.tanh(tf.matmul(hidden, params['w3']) + params['b3'])

        # Map tanh's [-1, 1] onto [low, high] for every action dimension.
        half_ranges, midpoints = self._action_affine(bounds)
        return squashed * half_ranges + midpoints

    def build_network(self, params):
        """Create the input placeholders and return the scaled action tensor.

        The placeholders are kept as ``self.state`` and ``self.goal`` so that
        callers can feed them.
        """
        self.state = tf.placeholder(tf.float32, [None, self.state_size])
        self.goal = tf.placeholder(tf.float32, [None, self.goal_size])
        return self._forward(params, self.state, self.goal)

    def train_evaluation_network(self):
        """Build and return the Adam train op for the evaluation parameters.

        ``tf.gradients(action, params, dq_da)`` yields, per parameter, the
        batch-summed gradient of ``action`` weighted by ``dq_da``. The result
        is negated (the optimizer minimises, the policy should ascend Q) and
        divided by ``minibatch_size``. Requires ``self.action`` to exist.
        """
        self.dq_da = tf.placeholder(tf.float32, [None, self.action_size])
        variables = [self.eparams[p] for p in sorted(self.eparams)]
        summed = tf.gradients(self.action, variables, self.dq_da)
        # Keep one symbolic gradient per variable; the tensors have different
        # shapes and cannot be stacked or summed across parameters.
        grads = [-g / minibatch_size for g in summed]
        return tf.train.AdamOptimizer(actor_learning_rate).apply_gradients(
            list(zip(grads, variables)))

    def update_target_network(self):
        """Run one soft update: target <- tau * evaluation + (1 - tau) * target."""
        self.sess.run(self._soft_update)

In [6]:
class Critic:
    """Q-network with an evaluation copy and a target copy.

    Everything lives in a private ``tf.Graph`` / ``tf.Session`` pair
    (TensorFlow 1.x graph mode). After construction the following handles are
    available for ``self.sess.run``:

    * ``state``, ``action``, ``goal`` -- input placeholders.
    * ``q``         -- evaluation-network output, shape ``[None, 1]``.
    * ``target_q``  -- same forward pass using the target parameters.
    * ``q_next``, ``reward``, ``done`` -- placeholders of the TD target,
      each of shape ``[None, 1]``.
    * ``loss``      -- mean squared error between the TD target and ``q``.
    * ``optimizer`` -- Adam train op for the evaluation parameters (also
      exposed under the legacy spelling ``optimaizer``).
    * ``qgrad``     -- list holding the gradient of ``q`` w.r.t. ``action``.

    Args:
        state_size: width of the state vector.
        action_size: width of the action vector.
        goal_size: width of the goal vector.
        kind: stored on the instance as-is; not used inside this class.
    """

    def __init__(self, state_size, action_size, goal_size, kind):
        self.state_size = state_size
        self.action_size = action_size
        self.goal_size = goal_size
        self.kind = kind
        self.eparams = {}   # evaluation network parameters
        self.tparams = {}   # target network parameters

        self.graph = tf.Graph()
        with self.graph.as_default():
            self.sess = tf.Session(graph=self.graph)
            self.set_parameter(self.eparams)
            self.set_parameter(self.tparams)

            # The network is built from the parameter dict, not from `kind`.
            self.q = self.build_network(self.eparams)
            # The target network shares the input placeholders.
            self.target_q = self._forward(self.tparams, self.state, self.action, self.goal)

            self.optimizer = self.train_evaluation_network()
            self.optimaizer = self.optimizer    # legacy (misspelled) attribute name
            self.qgrad = self.dq_da()

            # Build the target-sync ops exactly once; creating them per call
            # would keep growing the graph and they were never executed.
            names = sorted(self.eparams)
            self._hard_update = tf.group(*[
                tf.assign(self.tparams[p], self.eparams[p]) for p in names])
            self._soft_update = tf.group(*[
                tf.assign(self.tparams[p],
                          tau * self.eparams[p] + (1 - tau) * self.tparams[p])
                for p in names])

            # Initialise only after the optimizer exists so that Adam's slot
            # variables are covered, then copy evaluation -> target weights.
            self.sess.run(tf.global_variables_initializer())
            self.sess.run(self._hard_update)

    def set_parameter(self, params):
        """Fill ``params`` (a dict) with fresh weight/bias variables for the three layers.

        The first layer takes the concatenated state, action and goal; the
        last layer emits a single Q-value per sample so that it lines up with
        the ``[None, 1]`` TD target.
        """
        input_size = self.state_size + self.action_size + self.goal_size
        params['w1'] = tf.Variable(tf.truncated_normal(shape = [input_size, 256], mean = 0.0, stddev = 1.0))
        params['b1'] = tf.Variable(tf.zeros(256))
        params['w2'] = tf.Variable(tf.truncated_normal(shape = [256, 64], mean = 0.0, stddev = 1.0))
        params['b2'] = tf.Variable(tf.zeros(64))
        params['w3'] = tf.Variable(tf.truncated_normal(shape = [64, 1], mean = 0.0, stddev = 1.0))
        params['b3'] = tf.Variable(tf.zeros(1))

    def _forward(self, params, state, action, goal):
        """Apply the Q-network defined by ``params`` to the given input tensors."""
        inputs = tf.concat([state, action, goal], axis = 1)
        hidden = tf.nn.relu(tf.matmul(inputs, params['w1']) + params['b1'])
        hidden = tf.nn.relu(tf.matmul(hidden, params['w2']) + params['b2'])
        return tf.matmul(hidden, params['w3']) + params['b3']

    def build_network(self, params):
        """Create the input placeholders and return the Q-value tensor.

        The placeholders are kept as ``self.state``, ``self.action`` and
        ``self.goal`` so that callers can feed them and so that ``dq_da`` can
        differentiate with respect to the action actually wired into ``q``.
        """
        self.state = tf.placeholder(tf.float32, [None, self.state_size])
        self.action = tf.placeholder(tf.float32, [None, self.action_size])
        self.goal = tf.placeholder(tf.float32, [None, self.goal_size])
        return self._forward(params, self.state, self.action, self.goal)

    def train_evaluation_network(self):
        """Build and return the Adam train op minimising the TD error.

        The target is ``reward + (1 - done) * gamma * q_next``. Requires
        ``self.q`` to exist.
        """
        self.q_next = tf.placeholder(tf.float32, [None, 1])  # Q-value of the next state
        self.reward = tf.placeholder(tf.float32, [None, 1])
        self.done = tf.placeholder(tf.float32, [None, 1])    # 1 if the episode ended, otherwise 0
        q_target = self.reward + (1 - self.done) * gamma * self.q_next
        variables = [self.eparams[p] for p in sorted(self.eparams)]
        self.loss = tf.losses.mean_squared_error(q_target, self.q)
        return tf.train.AdamOptimizer(critic_learning_rate).minimize(
            self.loss, var_list = variables)

    def dq_da(self):
        """Return ``tf.gradients`` of ``q`` w.r.t. the action placeholder (a one-element list)."""
        # Differentiate w.r.t. the placeholder that feeds `q`; a fresh,
        # unconnected placeholder would yield a `None` gradient.
        return tf.gradients(self.q, self.action)

    def update_target_network(self):
        """Run one soft update: target <- tau * evaluation + (1 - tau) * target."""
        self.sess.run(self._soft_update)

In [27]:
# NOTE(review): `actor` is not created in any visible cell, and the Actor class
# defined in this notebook has no `grad` method -- this cell presumably ran
# against kernel state from an earlier revision. Confirm before relying on it.
eparamslist = actor.grad()
# Parenthesised single-argument form prints the same under Python 2 and also
# parses under Python 3.
print(actor.sess.run(eparamslist))

[array([[ 0.40066016, -0.6626734 ],
       [-0.28813574,  1.6087884 ],
       [-1.658034  ,  0.09471382],
       [ 0.11548227, -1.1697099 ]], dtype=float32), array([[-0.14110884, -1.0847067 ],
       [-0.3824489 ,  0.7458162 ]], dtype=float32), array([0., 0.], dtype=float32), array([0., 0.], dtype=float32), array([0., 0.], dtype=float32), array([[-1.0118456 , -0.16460407],
       [-0.17729041,  0.45414558]], dtype=float32)]
