In [2]:
%config InlineBackend.figure_format = 'retina'

In [3]:
import random
import time

In [4]:
import tensorflow as tf
import numpy as np

import zmq

In [5]:
from learner import *

In [1]:
# Endpoint the zmq.REP socket (`sock_exp`) is bound to; the training loop
# receives its messages there.
exp_socket_addr = "tcp://127.0.0.1:39884"
# Size of the discrete action set (assumed from the name -- TODO confirm
# against the agent side).
max_actions = 7
# Endpoint the zmq.PUB socket (`sock_vars`) is bound to; the training loop
# publishes the network variables there.
vars_socket_addr = "tcp://127.0.0.1:39885"

In [57]:
# Single ZeroMQ context for this notebook; the PUB socket created in a later
# cell is made from it as well.
context = zmq.Context()
# REP socket: the training loop receives one pickled message per iteration on
# it and answers each with 'Ok'.
sock_exp = context.socket(zmq.REP)
sock_exp.bind(exp_socket_addr)

In [58]:
# PUB socket on which the training loop publishes the current network
# variables (see `ac.GetVars`).
sock_vars = context.socket(zmq.PUB)
sock_vars.bind(vars_socket_addr)

In [8]:
# Number of units in each of the two hidden layers built by CreateNetworks.
NUM_HIDDEN = 128

In [122]:
def CreateNetworks(state, num_actions, is_training=True, reuse=False):
    """Build the shared trunk, the value head and (optionally) the policy head.

    Args:
        state: input tensor fed to the first hidden layer.
        num_actions: number of outputs of the policy head.
        is_training: currently unused -- it only fed the batch-norm settings
            that are disabled in the hidden layers.
        reuse: forwarded as `reuse=` to every layer. When true, the policy
            head is not built at all.

    Returns:
        A `(logits, value)` pair. `logits` is None when `reuse` is true;
        `value` is the 1-unit linear output passed through `tf.squeeze`.
    """
    def small_normal():
        # A fresh initializer object per layer.
        return tf.truncated_normal_initializer(stddev=0.01)

    def hidden_layer(inputs, name):
        # Fully connected ReLU layer of NUM_HIDDEN units.
        return tf.contrib.layers.relu(
            inputs, NUM_HIDDEN,
            weights_initializer=small_normal(),
            biases_initializer=tf.constant_initializer(0.01),
            # Batch norm is switched off; to restore it pass
            #   normalizer_fn=tf.contrib.layers.batch_norm,
            #   normalizer_params={'is_training': is_training},
            scope=name,
            reuse=reuse)

    with tf.variable_scope('common'):
        trunk = hidden_layer(hidden_layer(state, 'hidden1'), 'hidden2')

    with tf.variable_scope('value'):
        value = tf.squeeze(
            tf.contrib.layers.linear(trunk, 1,
                                     weights_initializer=small_normal(),
                                     scope='value',
                                     reuse=reuse))

    if reuse:
        # The re-used copy only serves as a value estimator.
        return None, value

    with tf.variable_scope('policy'):
        logits = tf.contrib.layers.linear(trunk, num_actions,
                                          weights_initializer=small_normal(),
                                          scope='policy',
                                          reuse=reuse)
    return logits, value

In [123]:
# Default hyper-parameters consumed by ActorCritic.__init__.
DEFAULT_OPTIONS = {
    # Norm threshold passed to tf.clip_by_norm for each gradient; clipping is
    # skipped entirely when this key is absent.
    'clip_grad': 5.,
    # Step size handed to tf.train.AdamOptimizer.
    'learning_rate': 0.001,
}

In [124]:
def HuberLoss(tensor, boundary):
    """Element-wise Huber loss of `tensor`.

    Quadratic (0.5 * x**2) while |x| <= boundary, linear with slope
    `boundary` beyond it.

    Args:
        tensor: tensor of residuals.
        boundary: |x| value at which the loss switches from quadratic to
            linear.

    Returns:
        A tensor of per-element losses.
    """
    magnitude = tf.abs(tensor)
    # Portion of |x| that lies inside the quadratic zone.
    inside = tf.minimum(magnitude, boundary)
    # Whatever exceeds the boundary is penalised linearly.
    outside = (magnitude - inside)
    return 0.5 * inside**2 + boundary * outside

In [125]:
class ActorCritic(object):
    """One-step advantage actor-critic learner: graph construction plus
    training, logging and checkpointing helpers.

    Typical use: construct, call `Init(sess, run_id)` once, then call
    `Step(sess, batch)` repeatedly.
    """

    # Root directory for TensorBoard event files; Init() appends the run id.
    # Override per call through Init(..., log_dir=...).
    DEFAULT_LOG_DIR = '/Users/vertix/tf/tensorflow_logs/aicup'
    # Merged summaries are written when the previous step count is a multiple
    # of this value (and on the very first step).
    SUMMARY_EVERY = 100
    # A checkpoint and a throughput figure are written whenever the step count
    # is a multiple of this value.
    CHECKPOINT_EVERY = 20000

    def __init__(self, build_networks, state_size, options=DEFAULT_OPTIONS):
        """Build the training graph in the current default graph.

        Args:
            build_networks: callable invoked as
                `build_networks(state, is_training=..., reuse=...)` and
                returning a `(logits, value)` pair.
            state_size: length of one state vector (second dimension of the
                `state` / `state1` placeholders).
            options: dict with 'learning_rate' (required) and 'clip_grad'
                (optional; gradients are not clipped when it is absent).
                The dict is only read, never modified.
        """
        self._options = options
        with tf.device('/cpu:0'):
            self.state = tf.placeholder(tf.float32, shape=[None, state_size], name='state')
            self.action = tf.placeholder(tf.int32, shape=[None], name='action')
            self.reward = tf.placeholder(tf.float32, shape=[None], name='reward')
            self.state1 = tf.placeholder(tf.float32, shape=[None, state_size], name='state1')
            self.gamma = tf.placeholder(tf.float32, shape=[None], name='gamma')
            self.is_training = tf.placeholder(tf.bool, shape=None, name='is_training')

            self.logits, self.baseline = build_networks(self.state,
                                                        is_training=self.is_training, reuse=False)
            # Second copy shares the variables and only provides V(state1).
            _,  self.baseline1 = build_networks(self.state1, is_training=False, reuse=True)
            # Reshaping to a scalar means this op only works for a single
            # state row at a time.
            self.tf_policy = tf.reshape(tf.multinomial(self.logits, 1), [])

            # Snapshot taken before the optimizer and global_step exist, so it
            # holds only the variables created by build_networks (assuming a
            # fresh graph -- TODO confirm if several models share a graph).
            self.vars = tf.get_collection(tf.GraphKeys.VARIABLES)

            # Experimental
            # One-step bootstrapped target; no gradient flows through it.
            self.rolled_baseline = tf.stop_gradient(self.reward + self.gamma * self.baseline1)
            self.advantage = self.rolled_baseline - self.baseline

            self.cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(self.logits, self.action)
            # Cross entropy equals -log pi(a|s); the advantage is treated as a
            # constant weight here.
            self.policy_loss = tf.reduce_mean(
                tf.mul(self.cross_entropy, tf.stop_gradient(self.advantage)))

            self.value_loss = 0.5 * tf.reduce_mean(HuberLoss(self.advantage, 5))

            # NOTE(review): reduce_mean runs over every element, so assuming
            # logits is [batch, num_actions] this is the mean per-sample
            # entropy divided by num_actions. Left unchanged because fixing it
            # would rescale the entropy bonus -- confirm intent.
            self.policy_entropy = tf.reduce_mean(-tf.nn.softmax(self.logits) *
                                                 tf.nn.log_softmax(self.logits))

#             loss = self.value_loss
            loss = self.policy_loss + 0.25 * self.value_loss - 0.01 * self.policy_entropy

            self.optimizer = tf.train.AdamOptimizer(options['learning_rate'])
            # self.vars is the same variable list the collection holds at this
            # point; no variable has been created since the snapshot.
            grads = self.optimizer.compute_gradients(loss, self.vars)
            if 'clip_grad' in options:
                grads = [(tf.clip_by_norm(g, options['clip_grad']) if g is not None else None, v)
                         for g, v in grads]

            for grad, var in grads:
                tf.histogram_summary(var.name, var)
                if grad is not None:
                    tf.histogram_summary('{}/grad'.format(var.name), grad)

            self.global_step = tf.Variable(0, name='global_step', trainable=False)
            self.train_op = self.optimizer.apply_gradients(grads, self.global_step)

            tf.histogram_summary("Predicted baseline", self.baseline)
            tf.histogram_summary("TD error", self.advantage)
            tf.scalar_summary("Loss/Actor", self.policy_loss)
            tf.scalar_summary("Loss/Critic", self.value_loss)
            tf.scalar_summary("Loss/Entropy", self.policy_entropy)
            tf.scalar_summary("Loss/Total", loss)

            self.summary_op = tf.merge_all_summaries()

    def Init(self, sess, run_id, log_dir=None):
        """Initialise variables and open the summary writer and saver.

        Args:
            sess: session in which all variables are initialised.
            run_id: name of the sub-directory receiving this run's events.
            log_dir: root directory for event files; defaults to
                `DEFAULT_LOG_DIR`.
        """
        sess.run(tf.initialize_all_variables())
        if log_dir is None:
            log_dir = self.DEFAULT_LOG_DIR
        self.writer = tf.train.SummaryWriter('%s/%s' % (log_dir, run_id))
        self.saver = tf.train.Saver(tf.get_collection(tf.GraphKeys.VARIABLES))
        self.last_start = time.time()
        self.cur_step = 0
        # Step count at the start of the current throughput window. It is set
        # on the first Step() so that a checkpoint restored after Init() is
        # accounted for.
        self._rate_anchor_step = None
        self.writer.add_graph(tf.get_default_graph())

    def Step(self, sess, batch):
        """Run one optimisation step and do the periodic bookkeeping.

        Args:
            sess: session to run the training op in.
            batch: 5-tuple `(states, actions, rewards, next_states, gammas)`
                fed to the corresponding placeholders.
        """
        ss, aa, rr, ss1, gg = batch
        feed_dict = {self.state: ss, self.action: aa, self.reward: rr, self.state1:ss1,
                     self.gamma: gg, self.is_training: True}

        # NOTE(review): global_step is fetched in the same run() call that
        # increments it, so the value may be pre- or post-increment -- confirm.
        if self.cur_step and self.cur_step % self.SUMMARY_EVERY != 0:
            self.cur_step, _ = sess.run(
                [self.global_step, self.train_op], feed_dict)
        else:
            self.cur_step, _, smr = sess.run(
                [self.global_step, self.train_op, self.summary_op], feed_dict)
            self.writer.add_summary(smr, self.cur_step)

        if self._rate_anchor_step is None:
            self._rate_anchor_step = self.cur_step

        if self.cur_step % self.CHECKPOINT_EVERY == 0:
            self.saver.save(sess, 'actor_critic', global_step=self.global_step)
            now = time.time()
            if self.last_start is not None:
                # Report the steps actually completed in this window rather
                # than assuming a full CHECKPOINT_EVERY. The first boundary
                # (step 0, or right after a restore) covers fewer steps.
                steps_done = self.cur_step - self._rate_anchor_step
                elapsed = now - self.last_start
                if steps_done > 0 and elapsed > 0:
                    self.writer.add_summary(
                        tf.Summary(
                            value=[tf.Summary.Value(
                                    tag='Steps per sec',
                                    simple_value=float(steps_done) / elapsed)]),
                        self.cur_step)
            self.last_start = now
            self._rate_anchor_step = self.cur_step

    def Stat(self, data):
        """Write each `name -> value` pair of `data` as a scalar summary at
        the current step."""
        self.writer.add_summary(
            tf.Summary(
                value=[tf.Summary.Value(tag=name, simple_value=value)
                       for name, value in data.items()]), self.cur_step)

    def GetVars(self, sess):
        """Return a dict mapping variable name to its current value for the
        variables captured in `self.vars`."""
        vals = sess.run(self.vars)
        return dict(zip([v.name for v in self.vars], vals))

-------------------

In [107]:
# Close the session left over from a previous run of this cell, if there is
# one. On a fresh kernel `sess` does not exist yet, which must not abort the
# cell.
try:
    sess.close()
except NameError:
    pass
tf.reset_default_graph()
sess = tf.InteractiveSession()

In [108]:
# Length of one state vector fed to the network.
STATE_SIZE = 814
# The policy head gets one output per action; reuse the `max_actions` config
# constant instead of repeating its literal value.
ac = ActorCritic(lambda x, **kwargs: CreateNetworks(x, max_actions, **kwargs), STATE_SIZE)

In [109]:
# Initialise all variables and open the summary writer and saver; 'pg4' is the
# run id used as the log sub-directory name.
ac.Init(sess, 'pg4')

In [86]:
# ac.saver.restore(sess, 'ac-6300000')

In [114]:
# %%prun

for i in range(15 * 10 ** 6):  # hard cap on the number of loop iterations
    # Publish the current network variables before waiting for the next message.
    sock_vars.send_pyobj(ac.GetVars(sess))

    # REP socket: every received message has to be answered before the next
    # recv, hence the immediate 'Ok'.
    msg = sock_exp.recv_pyobj()
    sock_exp.send('Ok')
    if msg['type'] == 'exp':
        # `msg` is rebound to the payload; later inspection cells read it.
        msg = msg['data']

        # Batch order expected by ActorCritic.Step:
        # (states, actions, rewards, next_states, gammas).
        ac.Step(sess, (msg['s'], msg['a'].transpose(),
                       msg['r'].transpose(), msg['s1'],
                       msg['g'].transpose()))
    elif msg['type'] == 'stat':
        ac.Stat(msg['data'])
    # Messages of any other type are acknowledged and otherwise ignored.

KeyboardInterrupt: 

In [115]:
msg

{'a': array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], dtype=int32),
 'g': array([ 0.995,  0.995,  0.995,  0.995,  0.995,  0.995,  0.995,  0.995,
         0.995,  0.995,  0.995,  0.995,  0.995,  0.995,  0.995,  0.995,
         0.995,  0.995,  0.995,  0.995]),
 'r': array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0, 139,   0,   0,
          0,   0,   0,   0,   0,   0,   0]),
 's': array([[   110.,    110.,    107., ...,      0.,      0.,  13200.],
        [   110.,    110.,    107., ...,      0.,      0.,  13199.],
        [   110.,    110.,    107., ...,      0.,      0.,  13198.],
        ..., 
        [   110.,    120.,    111., ...,      0.,      0.,  13183.],
        [   110.,    120.,    111., ...,      0.,      0.,  13182.],
        [   110.,    120.,     99., ...,      0.,      0.,  13181.]]),
 's1': array([[   110.,    110.,    107., ...,      0.,      0.,  13199.],
        [   110.,    110.,    107., ...,      0.,      0.,  13198.],
        [   110.,  

In [116]:
# Rebuild the feed for the last experience payload left in `msg` by the
# training loop, for ad-hoc evaluation of individual tensors.
# Unlike the loop, 'a', 'r' and 'g' are fed without .transpose() -- presumably
# a no-op for the 1-D arrays shown; confirm.
feed_dict = {ac.state: msg['s'], ac.action: msg['a'], ac.reward: msg['r'], ac.state1:msg['s1'],
             ac.gamma: msg['g'], ac.is_training: True}

In [117]:
ac.cross_entropy.eval(feed_dict)

array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.], dtype=float32)

In [119]:
ac.policy_entropy.eval(feed_dict)

1.0010546e-14

In [148]:
# List every variable currently registered in the default graph, one name per
# line.
for v in tf.get_collection(tf.GraphKeys.VARIABLES):
    print(v.name)

common/hidden1/weights:0
common/hidden1/biases:0
common/hidden2/weights:0
common/hidden2/biases:0
value/value/weights:0
value/value/biases:0
policy/policy/weights:0
policy/policy/biases:0
global_step:0
beta1_power:0
beta2_power:0
common/hidden1/weights/Adam:0
common/hidden1/weights/Adam_1:0
common/hidden1/biases/Adam:0
common/hidden1/biases/Adam_1:0
common/hidden2/weights/Adam:0
common/hidden2/weights/Adam_1:0
common/hidden2/biases/Adam:0
common/hidden2/biases/Adam_1:0
value/value/weights/Adam:0
value/value/weights/Adam_1:0
value/value/biases/Adam:0
value/value/biases/Adam_1:0
policy/policy/weights/Adam:0
policy/policy/weights/Adam_1:0
policy/policy/biases/Adam:0
policy/policy/biases/Adam_1:0


In [224]:
import cPickle

def DumpVariables(filename):
    """Pickle the current values of the network variables to `filename`.

    Only variables whose name contains '/' and does not contain 'Adam' are
    written, which leaves out unscoped variables such as `global_step:0` and
    the optimizer's slot variables. Each value is obtained with `v.eval()`,
    so a default session must be active.

    Args:
        filename: path of the file to (over)write.
    """
    network_vars = dict(
        (v.name, v.eval())
        for v in tf.get_collection(tf.GraphKeys.VARIABLES)
        if '/' in v.name and 'Adam' not in v.name)
    # Pickle streams must be written in binary mode; text mode can corrupt
    # them on platforms that translate line endings.
    with open(filename, 'wb') as f:
        cPickle.dump(network_vars, f)

In [225]:
DumpVariables('network')

----------------

In [162]:
def ReLu(x):
    """Rectified linear unit: element-wise max(x, 0)."""
    rectified = np.maximum(x, 0)
    return rectified

In [163]:
def BatchNorm(state, network_vars, key):
    """Apply inference-time batch normalisation (no scale term) to `state`.

    Reads `<key>/moving_mean:0`, `<key>/moving_variance:0` and `<key>/beta:0`
    from `network_vars` and returns
    `(state - mean) / sqrt(variance + 0.001) + beta`, evaluated as a single
    multiply-add.
    """
    eps = 0.001
    moving_mean = network_vars[key + '/moving_mean:0']
    beta = network_vars[key + '/beta:0']
    scale = 1.0 / np.sqrt(network_vars[key + '/moving_variance:0'] + eps)

    # Mean subtraction folded into the offset.
    offset = beta - moving_mean * scale
    return state * scale + offset

In [164]:
def Softmax(state, axis=None):
    """Numerically stable softmax that leaves its input untouched.

    Args:
        state: array of logits.
        axis: axis along which to normalise. The default (None) normalises
            over all elements together; pass -1 to normalise each row of a
            batch of logits independently.

    Returns:
        Array of the same shape as `state` whose entries sum to 1 along
        `axis` (or overall when `axis` is None).
    """
    # Subtract out of place: an in-place `-=` would silently modify the
    # caller's array.
    shifted = state - np.max(state, axis=axis, keepdims=True)
    e = np.exp(shifted)
    return e / np.sum(e, axis=axis, keepdims=True)

In [205]:
class NNPolicy(object):
    """NumPy-only forward pass of the policy network, driven by a dict that
    maps variable names to weight arrays."""

    # (weights key, biases key) of each ReLU layer, in evaluation order.
    _HIDDEN = (
        ('common/hidden1/weights:0', 'common/hidden1/biases:0'),
        ('common/hidden2/weights:0', 'common/hidden2/biases:0'),
    )
    # Keys of the final linear layer producing the logits.
    _HEAD = ('policy/policy/weights:0', 'policy/policy/biases:0')

    def __init__(self, network_vars):
        """Keep a reference to the weight dict; the action list is created
        lazily by Sample()."""
        self.vars = network_vars
        self.actions = None

    def _Affine(self, inputs, keys):
        """Return `inputs @ W + b` for the given (weights, biases) key pair."""
        weights_key, biases_key = keys
        out = np.matmul(inputs, self.vars[weights_key])
        out += self.vars[biases_key]
        return out

    def Logits(self, state):
        """Run `state` through both hidden ReLU layers and the policy head;
        returns the unnormalised action scores."""
        activations = state
        for keys in self._HIDDEN:
            activations = ReLu(self._Affine(activations, keys))
        return self._Affine(activations, self._HEAD)

    def Softmax(self, state):
        """Action probabilities for `state`."""
        return Softmax(self.Logits(state))

    def Sample(self, state):
        """Draw one action index according to the policy's probabilities."""
        probabilities = self.Softmax(state)
        if self.actions is None:
            # Built once, from the length of the first distribution seen.
            self.actions = range(len(probabilities))
        return np.random.choice(self.actions, p=probabilities)

In [206]:
# NOTE(review): `network_vars` is not assigned at top level in any cell visible
# here (it only exists as a local inside DumpVariables) -- presumably it was
# loaded from the pickled 'network' file in a cell that is no longer present;
# confirm, otherwise this cell fails on a fresh kernel.
nnp = NNPolicy(network_vars)

In [219]:
nnp.Softmax(s[1, :])

array([  5.60286788e-05,   1.27642462e-75,   1.33276611e-08,
         5.64379590e-10,   8.89499400e-01,   1.10444557e-01,
         1.42909448e-18])

In [218]:
nnp.Sample(s[1, :])

4

In [178]:
np.max(nnp.Logits(s) -  ac.logits.eval({ac.state: s}))

0.00021755436517878479

In [173]:
ac.logits.eval({ac.state: s})

(15, 7)

In [280]:
# NOTE(review): `ql` and `ss` are not defined in any cell visible here --
# presumably left over from a Q-learning experiment (possibly provided through
# `from learner import *`); confirm before relying on this cell.
ql.pred_vars[6].eval({ql.state: ss, ql.is_training: False})

array([[ 0.4263148 ,  0.27988333,  0.52581638,  0.42325473,  0.46426272,
         0.47350475,  0.352005  ]], dtype=float32)

In [284]:
QFunction(network_vars).Q(ss)

array([[ 0.42631482,  0.27988335,  0.52581639,  0.4232547 ,  0.46426274,
         0.47350477,  0.352005  ]])

In [170]:
ql.q.eval(feed_dict)

array([ 1746.92163086,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,  3156.87939453], dtype=float32)

In [174]:
ql.q_s1.eval(feed_dict)

array([ 10.50465488,  10.50465488,  10.50465488,  10.50465488,
        10.50465488,  10.50465488,  10.50465488,  10.50465488,
        10.50465488,  10.50465488], dtype=float32)

In [173]:
sess.run(ql.copy_op)

In [138]:
Select(ql.pred_vars[2], ql.action).eval(feed_dict)

array([ 2186.74389648,  6496.70410156,  2797.15771484,  4662.953125  ,
           0.        ,  2761.18579102,  3284.98608398,  6556.95996094,
        2748.39379883,     0.        ], dtype=float32)

In [148]:
tf.reduce_sum(ql.pred_vars[2] * tf.one_hot(ql.action, ql.num_actions), reduction_indices=1).eval(feed_dict)

array([ 2186.74389648,  6496.70410156,  2797.15771484,  4662.953125  ,
           0.        ,  2761.18579102,  3284.98608398,  6556.95996094,
        2748.39379883,     0.        ], dtype=float32)

In [93]:
tf.shape(ql.pred_vars[2]).eval(feed_dict)

array([10,  7], dtype=int32)

In [51]:
ql.delta.eval()

array([ 25.,  25.,  25.,  25.,  25.,  25.,  25.,  25.,  25.,  25.], dtype=float32)

In [40]:
tf.reduce_mean(ql.delta).eval({ql.state: ss, ql.action: aa, ql.reward: rr, ql.state1:ss1, ql.gamma: gg})

25.0

In [283]:
ql.q.eval({ql.state: ss, ql.action: aa})

array([    0.        ,     0.        ,  2561.84985352,  2696.66064453,
        3319.86474609,  2376.26171875,  2375.80859375,  2602.42016602,
        2548.83056641,     0.        ], dtype=float32)

In [284]:
ql.q_s1.eval({ql.state1: ss1})

array([ 9.87346268,  9.87346268,  9.87346268,  9.87346268,  9.87346268,
        9.87346268,  9.87346268,  9.87346268,  9.87346268,  9.87346268], dtype=float32)