In [1]:
%config InlineBackend.figure_format = 'retina'

In [2]:
import random
import time

In [3]:
import tensorflow as tf
import numpy as np

import zmq

In [4]:
from learner import *

In [5]:
# Notebook-wide configuration: loopback ZeroMQ endpoints and the action count.
# Address the REP socket `sock_exp` is bound to (experience messages arrive here).
exp_socket_addr = "tcp://127.0.0.1:29884"
# NOTE(review): presumably the number of discrete actions; the network is
# built with a literal 7 further down -- confirm they are meant to agree.
max_actions = 7
# Address the PUB socket `sock_strat` is bound to.
strat_socket_addr = "tcp://127.0.0.1:29885"

In [6]:
# Single ZeroMQ context shared by both sockets in this notebook.
context = zmq.Context()
# REP (reply) socket: every message received on it must be answered before
# the next one can be received, which is why the loops below always send 'Ok'.
sock_exp = context.socket(zmq.REP)
sock_exp.bind(exp_socket_addr)

In [7]:
# PUB (publish) socket bound to the strategy address.
# NOTE(review): nothing in the visible cells sends on this socket yet.
sock_strat = context.socket(zmq.PUB)
sock_strat.bind(strat_socket_addr)

In [8]:
# Number of units in each of the two hidden layers built by CreateNetworks.
NUM_HIDDEN = 128

In [9]:
def CreateNetworks(state, num_actions, is_training=True, reuse=False):
    """Build the shared trunk, the value head and (optionally) the policy head.

    Args:
        state: input tensor fed to the first hidden layer.
        num_actions: number of output units of the policy head.
        is_training: forwarded to the batch normalisation of the first
            hidden layer.
        reuse: forwarded as `reuse` to every layer. When true, the policy
            head is not built at all.

    Returns:
        A `(logits, value)` pair. `logits` is None when `reuse` is true.
    """
    layers = tf.contrib.layers

    def normal_init(stddev):
        # Fresh truncated-normal initializer for each layer.
        return tf.truncated_normal_initializer(stddev=stddev)

    with tf.variable_scope('common'):
        # First layer is batch-normalised; no explicit bias initializer is
        # passed (it is disabled in the original as well).
        hidden1 = layers.relu(
            state, NUM_HIDDEN,
            weights_initializer=normal_init(0.01),
            normalizer_fn=layers.batch_norm,
            normalizer_params=dict(is_training=is_training),
            scope='hidden1',
            reuse=reuse)
        # Second layer uses plain biases (initialised to 0.1) and no
        # batch normalisation.
        hidden2 = layers.relu(
            hidden1, NUM_HIDDEN,
            weights_initializer=normal_init(0.01),
            biases_initializer=tf.constant_initializer(0.1),
            scope='hidden2',
            reuse=reuse)

    with tf.variable_scope('value'):
        value = layers.linear(
            hidden2, 1,
            weights_initializer=normal_init(0.1),
            scope='value',
            reuse=reuse)

    # A reused network only exposes the value head.
    if reuse:
        return None, value

    with tf.variable_scope('policy'):
        logits = layers.linear(
            hidden2, num_actions,
            weights_initializer=normal_init(0.01),
            scope='policy',
            reuse=reuse)
    return logits, value

In [10]:
# Default hyper-parameters consumed by ActorCritic.
DEFAULT_OPTIONS = {
    # Norm each gradient tensor is clipped to (via tf.clip_by_norm).
    'clip_grad': 5.,
    # Learning rate handed to the Adam optimizer.
    'learning_rate': 0.0001,
}

In [11]:
class ActorCritic(object):
    """One-step actor-critic learner trained from an experience buffer.

    The graph is built once in the constructor; `Init` prepares variables,
    the summary writer and the saver; `Step` runs one optimisation step on a
    batch sampled from `exp_buffer`.

    Args:
        build_networks: callable `(state, is_training=..., reuse=...)`
            returning `(logits, value)`; called twice, the second time with
            `reuse=True` for the next-state value.
        buf: experience buffer exposing `state_size` and `sample(batch_size)`.
        options: mapping with `'learning_rate'` (required), and optionally
            `'clip_grad'` (per-tensor clip norm) and `'log_dir'` (root
            directory for TensorBoard logs).
    """

    # Write summaries every SUMMARY_EVERY global steps.
    SUMMARY_EVERY = 100
    # Save a checkpoint and report throughput every CHECKPOINT_EVERY steps.
    CHECKPOINT_EVERY = 20000
    # Log root used when `options` carries no 'log_dir' entry.
    # (An alternative location used previously was
    # /media/vertix/UHDD/tmp/tensorflow_logs/aicup -- pass it via 'log_dir'.)
    DEFAULT_LOG_DIR = '/Users/vertix/tf/tensorflow_logs/aicup'

    def __init__(self, build_networks, buf, options=DEFAULT_OPTIONS):
        self._options = options
        self.exp_buffer = buf
        with tf.device('/cpu:0'):
            state_shape = [None, self.exp_buffer.state_size]
            self.state = tf.placeholder(tf.float32, shape=state_shape, name='state')
            self.action = tf.placeholder(tf.int32, shape=[None], name='action')
            self.reward = tf.placeholder(tf.float32, shape=[None], name='reward')
            self.state1 = tf.placeholder(tf.float32, shape=state_shape, name='state1')
            self.gamma = tf.placeholder(tf.float32, shape=[None], name='gamma')
            # NOTE(review): fed by Step but not used by any loss term below.
            self.is_weights = tf.placeholder(tf.float32, shape=[None], name='is_weights')
            self.is_training = tf.placeholder(tf.bool, shape=None, name='is_training')

            self.logits, self.baseline = build_networks(
                self.state, is_training=self.is_training, reuse=False)
            _, self.baseline1 = build_networks(
                self.state1, is_training=False, reuse=True)
            # Reshaping to a scalar only works when a single state is fed.
            self.tf_policy = tf.reshape(tf.multinomial(self.logits, 1), [])

            # The value heads emit one column per state while reward and
            # gamma are rank-1. Flatten the values first, otherwise the
            # arithmetic below silently broadcasts to a [batch, batch] matrix.
            baseline = tf.reshape(self.baseline, [-1])
            baseline1 = tf.reshape(self.baseline1, [-1])

            target = tf.stop_gradient(self.reward + self.gamma * baseline1)
            self.td_error = baseline - target
            self.value_loss = 0.5 * tf.reduce_mean(tf.square(self.td_error))

            # Policy gradient: -log pi(a|s) weighted by the advantage
            # (target - baseline). The advantage is a constant for the actor,
            # so no gradient flows into the critic through this term.
            advantage = tf.stop_gradient(-self.td_error)
            neg_log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits(
                self.logits, self.action)
            self.policy_loss = tf.reduce_mean(neg_log_prob * advantage)

            # NOTE(review): the mean runs over every (state, action) element,
            # i.e. this is the per-state entropy divided by the action count.
            self.policy_entropy = tf.reduce_mean(-tf.nn.softmax(self.logits) *
                                                 tf.nn.log_softmax(self.logits))

            loss = self.policy_loss + 0.25 * self.value_loss - 0.01 * self.policy_entropy

            self.optimizer = tf.train.AdamOptimizer(options['learning_rate'])
            # Only trainable variables are optimised; batch-norm moving
            # statistics are maintained by their own update ops instead.
            grads = self.optimizer.compute_gradients(loss, tf.trainable_variables())
            if 'clip_grad' in options:
                clip_norm = options['clip_grad']
                # Variables the loss does not depend on have a None gradient.
                grads = [(g if g is None else tf.clip_by_norm(g, clip_norm), v)
                         for g, v in grads]

            for grad, var in grads:
                tf.histogram_summary(var.name, var)
                if grad is not None:
                    tf.histogram_summary('{}/grad'.format(var.name), grad)

            self.global_step = tf.Variable(0, name='global_step', trainable=False)
            # Batch-norm moving averages are only refreshed when their update
            # ops run, so tie them to every training step.
            update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
            with tf.control_dependencies(update_ops):
                self.train_op = self.optimizer.apply_gradients(grads, self.global_step)

            tf.histogram_summary("Predicted baseline", self.baseline)
            tf.histogram_summary("TD error", self.td_error)
            tf.scalar_summary("Loss/Actor", self.policy_loss)
            tf.scalar_summary("Loss/Critic", self.value_loss)
            tf.scalar_summary("Loss/Entropy", self.policy_entropy)
            tf.scalar_summary("Loss/Total", loss)

            self.summary_op = tf.merge_all_summaries()

    def Init(self, sess, run_id):
        """Initialise all variables and create the summary writer and saver.

        Args:
            sess: TensorFlow session used to run the initialiser.
            run_id: sub-directory name under the log root for this run.
        """
        sess.run(tf.initialize_all_variables())
        if 'log_dir' in self._options:
            log_root = self._options['log_dir']
        else:
            log_root = self.DEFAULT_LOG_DIR
        self.writer = tf.train.SummaryWriter('%s/%s' % (log_root, run_id))
        self.saver = tf.train.Saver(tf.get_collection(tf.GraphKeys.VARIABLES))
        self.last_start = time.time()
        self.cur_step = 0
        # Global step at which throughput was last reported.
        self._last_report_step = 0

    def Step(self, sess, batch_size=32):
        """Run one training step on a batch sampled from the buffer.

        Does nothing when the buffer returns no states. Summaries are written
        every SUMMARY_EVERY steps; a checkpoint and a 'Steps per sec' summary
        are written every CHECKPOINT_EVERY steps.

        Args:
            sess: TensorFlow session to run the step in.
            batch_size: number of transitions requested from the buffer.
        """
        idx, ss, aa, rr, ss1, gg, ww = self.exp_buffer.sample(batch_size)
        if ss is None:
            return

        feed_dict = {self.state: ss, self.action: aa, self.reward: rr, self.state1: ss1,
                     self.gamma: gg, self.is_weights: ww,
                     self.is_training: True}

        # `cur_step` still holds the value fetched by the previous call here.
        if self.cur_step % self.SUMMARY_EVERY == 0:
            self.cur_step, _, smr = sess.run(
                [self.global_step, self.train_op, self.summary_op], feed_dict)
            self.writer.add_summary(smr, self.cur_step)
        else:
            self.cur_step, _ = sess.run(
                [self.global_step, self.train_op], feed_dict)

        if self.cur_step % self.CHECKPOINT_EVERY == 0:
            self.saver.save(sess, 'ac', global_step=self.global_step)
            now = time.time()
            # Report the rate over the steps actually taken since the last
            # report rather than assuming a full checkpoint interval.
            steps_done = self.cur_step - self._last_report_step
            if self.last_start is not None and steps_done > 0:
                elapsed = now - self.last_start
                if elapsed > 0:
                    self.writer.add_summary(
                        tf.Summary(
                            value=[tf.Summary.Value(
                                tag='Steps per sec',
                                simple_value=steps_done / elapsed)]),
                        self.cur_step)
            self.last_start = now
            self._last_report_step = self.cur_step

In [12]:
# Experience buffer with capacity 1<<16 (65536); WeightedExperienceBuffer is
# presumably provided by `from learner import *`.
# NOTE(review): the meaning of the positional arguments (0.0, 0.0, 100) is not
# visible in this notebook -- confirm against the class definition.
buf = WeightedExperienceBuffer(0.0, 0.0, 100, buffer_size=1<<16)

In [15]:
# Warm-up: receive 120 messages and store the experience ones in the buffer
# before any training starts.
for _ in range(120):
    msg = sock_exp.recv_pyobj()
    # A REP socket must reply before it can receive the next request.
    sock_exp.send('Ok')
    if msg['type'] == 'exp':
        msg = msg['data']
        # NOTE(review): the trailing 100 is passed as the last argument of
        # `add`; presumably an initial priority/weight -- confirm in learner.
        buf.add(msg['s'], msg['a'], msg['r'], msg['s1'], msg['g'], 100)

In [14]:
# Sanity check after the warm-up loop.
# NOTE(review): `inserted` is presumably the number of items added so far.
buf.inserted

120

-------------------

In [118]:
# Start from a clean graph and a fresh session. On a freshly started kernel
# there is no previous `sess`, so only close one if it exists.
if 'sess' in globals():
    sess.close()
tf.reset_default_graph()
sess = tf.InteractiveSession()

In [119]:
# Build the learner; the policy head size comes from the `max_actions`
# constant defined in the configuration cell instead of a repeated literal.
ac = ActorCritic(lambda x, **kwargs: CreateNetworks(x, max_actions, **kwargs), buf)

In [127]:
# ac.exp_buffer = buf

In [120]:
# Initialise variables and open the summary writer; 'ac2' is the run id that
# names this run's log sub-directory.
ac.Init(sess, 'ac2')

In [None]:
# Main training loop: one optimisation step per iteration, and one message
# received from the experience socket every 5th iteration.
for i in range(15 * 10 ** 6):
    ac.Step(sess)
    if i % 5 == 0:
        # recv_pyobj blocks until a message arrives, so training pauses here
        # whenever no sender is connected.
        msg = sock_exp.recv_pyobj()
        # REP socket: the reply is mandatory before the next receive.
        sock_exp.send('Ok')
    
        if msg['type'] == 'exp':
            msg = msg['data']
            ac.exp_buffer.add(msg['s'], msg['a'], msg['r'], msg['s1'], msg['g'], 100)
        elif msg['type'] == 'stat':
            # 'stat' messages are acknowledged but otherwise ignored.
            pass

In [126]:
# Inspect the buffer attached to the learner.
# NOTE(review): `inserted` is presumably the number of items added so far.
ac.exp_buffer.inserted

2000

In [111]:
# Draw 5 samples for inspection; the 2nd and 3rd returned fields are the ones
# ActorCritic.Step feeds to the `state` and `action` placeholders.
_, s, a, _, _, _, _ = ac.exp_buffer.sample(5)

In [112]:
a

array([6, 6, 0, 0, 0], dtype=int16)

In [113]:
# Policy logits for the sampled states. The network built by CreateNetworks
# batch-normalises on the `is_training` placeholder, so it must be fed;
# False selects inference behaviour.
sess.run(ac.logits, {ac.state: s, ac.is_training: False})

array([[-71.1931076 , -90.23137665, -74.0412674 , -73.76772308,
        -73.48735046, -72.86225128, -72.86316681],
       [-71.1931076 , -90.23137665, -74.0412674 , -73.76772308,
        -73.48735046, -72.86225128, -72.86316681],
       [-71.1931076 , -90.23137665, -74.0412674 , -73.76772308,
        -73.48735046, -72.86225128, -72.86316681],
       [-71.1931076 , -90.23137665, -74.0412674 , -73.76772308,
        -73.48735046, -72.86225128, -72.86316681],
       [-71.1931076 , -90.23137665, -74.0412674 , -73.76772308,
        -73.48735046, -72.86225128, -72.86316681]], dtype=float32)

In [106]:
# Value estimates for the sampled states. The network built by CreateNetworks
# batch-normalises on the `is_training` placeholder, so it must be fed;
# False selects inference behaviour.
sess.run(ac.baseline, {ac.state: s, ac.is_training: False})

array([[ 24.70120621],
       [ 24.70120621],
       [ 24.70120621],
       [ 24.70120621],
       [ 24.70120621]], dtype=float32)

In [78]:
# List the name of every variable in the graph. The call form of print
# produces the same output on Python 2 and also parses on Python 3.
for v in tf.get_collection(tf.GraphKeys.VARIABLES):
    print(v.name)

common/hidden1/weights:0
common/hidden1/biases:0
common/hidden2/weights:0
common/hidden2/biases:0
value/value/weights:0
value/value/biases:0
policy/policy/weights:0
policy/policy/biases:0
global_step:0
beta1_power:0
beta2_power:0
common/hidden1/weights/Adam:0
common/hidden1/weights/Adam_1:0
common/hidden1/biases/Adam:0
common/hidden1/biases/Adam_1:0
common/hidden2/weights/Adam:0
common/hidden2/weights/Adam_1:0
common/hidden2/biases/Adam:0
common/hidden2/biases/Adam_1:0
value/value/weights/Adam:0
value/value/weights/Adam_1:0
value/value/biases/Adam:0
value/value/biases/Adam_1:0
policy/policy/weights/Adam:0
policy/policy/weights/Adam_1:0
policy/policy/biases/Adam:0
policy/policy/biases/Adam_1:0


----------------

In [23]:
def ReLu(x):
    """Rectified linear unit: element-wise maximum of `x` and zero."""
    floor = 0
    rectified = np.maximum(x, floor)
    return rectified

In [220]:
def BatchNorm(state, network_vars, key, eps=0.001):
    """Apply inference-mode batch normalisation from stored statistics.

    Evaluates `(state - moving_mean) / sqrt(moving_variance + eps) + beta`,
    folded into one scale and one shift. No `gamma` scale is applied.

    Args:
        state: array of pre-activation values to normalise.
        network_vars: mapping from variable name to array; must contain
            `key + '/moving_variance:0'`, `key + '/moving_mean:0'` and
            `key + '/beta:0'`.
        key: name prefix of the batch-norm variables.
        eps: small constant added to the variance for numerical stability;
            it has to match the value the network was built with.

    Returns:
        The normalised array.
    """
    inv = 1.0 / np.sqrt(network_vars[key + '/moving_variance:0'] + eps)
    shift = network_vars[key + '/beta:0'] - network_vars[key + '/moving_mean:0'] * inv
    return state * inv + shift

In [279]:
class QFunction(object):
    """NumPy forward pass of a dueling Q-network from exported variables.

    Args:
        network_vars: mapping from variable name (e.g.
            'model/hidden1/weights:0') to its array value.
    """

    def __init__(self, network_vars):
        self.vars = network_vars

    def _bn_relu_layer(self, inputs, layer):
        """Apply one `matmul -> BatchNorm -> ReLu` layer named `layer`."""
        scope = 'model/' + layer
        # These layers carry no bias term; the batch-norm beta plays that role.
        out = np.matmul(inputs, self.vars[scope + '/weights:0'])
        out = BatchNorm(out, self.vars, scope + '/BatchNorm')
        return ReLu(out)

    def Q(self, state):
        """Return the Q-values for `state`.

        Args:
            state: a single state vector or a batch of states, one per row.

        Returns:
            Array of Q-values with one entry per action (per state row).
        """
        hidden = self._bn_relu_layer(state, 'hidden1')
        hidden = self._bn_relu_layer(hidden, 'hidden2')

        # State-value stream.
        value = np.matmul(self._bn_relu_layer(hidden, 'val_hid'),
                          self.vars['model/value/weights:0'])
        value += self.vars['model/value/biases:0']

        # Advantage stream.
        adv = np.matmul(self._bn_relu_layer(hidden, 'adv_hid'),
                        self.vars['model/advantage/weights:0'])
        adv += self.vars['model/advantage/biases:0']

        # Centre the advantages per state (over the action axis only), so a
        # batch of several states does not share one global mean.
        return value + (adv - adv.mean(axis=-1, keepdims=True))

In [280]:
ql.pred_vars[6].eval({ql.state: ss, ql.is_training: False})

array([[ 0.4263148 ,  0.27988333,  0.52581638,  0.42325473,  0.46426272,
         0.47350475,  0.352005  ]], dtype=float32)

In [284]:
QFunction(network_vars).Q(ss)

array([[ 0.42631482,  0.27988335,  0.52581639,  0.4232547 ,  0.46426274,
         0.47350477,  0.352005  ]])

In [170]:
ql.q.eval(feed_dict)

array([ 1746.92163086,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,  3156.87939453], dtype=float32)

In [174]:
ql.q_s1.eval(feed_dict)

array([ 10.50465488,  10.50465488,  10.50465488,  10.50465488,
        10.50465488,  10.50465488,  10.50465488,  10.50465488,
        10.50465488,  10.50465488], dtype=float32)

In [173]:
sess.run(ql.copy_op)

In [138]:
Select(ql.pred_vars[2], ql.action).eval(feed_dict)

array([ 2186.74389648,  6496.70410156,  2797.15771484,  4662.953125  ,
           0.        ,  2761.18579102,  3284.98608398,  6556.95996094,
        2748.39379883,     0.        ], dtype=float32)

In [148]:
tf.reduce_sum(ql.pred_vars[2] * tf.one_hot(ql.action, ql.num_actions), reduction_indices=1).eval(feed_dict)

array([ 2186.74389648,  6496.70410156,  2797.15771484,  4662.953125  ,
           0.        ,  2761.18579102,  3284.98608398,  6556.95996094,
        2748.39379883,     0.        ], dtype=float32)

In [93]:
tf.shape(ql.pred_vars[2]).eval(feed_dict)

array([10,  7], dtype=int32)

In [51]:
ql.delta.eval()

array([ 25.,  25.,  25.,  25.,  25.,  25.,  25.,  25.,  25.,  25.], dtype=float32)

In [40]:
tf.reduce_mean(ql.delta).eval({ql.state: ss, ql.action: aa, ql.reward: rr, ql.state1:ss1, ql.gamma: gg})

25.0

In [283]:
ql.q.eval({ql.state: ss, ql.action: aa})

array([    0.        ,     0.        ,  2561.84985352,  2696.66064453,
        3319.86474609,  2376.26171875,  2375.80859375,  2602.42016602,
        2548.83056641,     0.        ], dtype=float32)

In [284]:
ql.q_s1.eval({ql.state1: ss1})

array([ 9.87346268,  9.87346268,  9.87346268,  9.87346268,  9.87346268,
        9.87346268,  9.87346268,  9.87346268,  9.87346268,  9.87346268], dtype=float32)