In [1]:
import random
import time

In [2]:
import tensorflow as tf
import numpy as np

import zmq

In [3]:
# Address the experience socket (REP) is bound to; experience/stat messages arrive here.
exp_socket_addr = "tcp://127.0.0.1:19884" 
# NOTE(review): not referenced by any other visible cell (the learner is built with `2 + 5`).
max_actions = 7
# Address the strategy socket (PUB) is bound to; network variables are published here.
strat_socket_addr = "tcp://127.0.0.1:19885"

In [5]:
# Single ZeroMQ context shared by both sockets of this notebook.
context = zmq.Context()
# REP socket for incoming messages; the receive loops below answer every message with 'Ok'.
sock_exp = context.socket(zmq.REP)
sock_exp.bind(exp_socket_addr)

In [6]:
# PUB socket used by the training loop to broadcast the current model variables.
sock_strat = context.socket(zmq.PUB)
sock_strat.bind(strat_socket_addr)

In [7]:
# Discount factor stored by ExperienceBuffer for every non-terminal transition.
GAMMA = 0.995
# Step size handed to the Adam optimizer in QLearner.
LEARNING_RATE = 0.002

In [8]:
# Number of units in each hidden (ReLU) layer built by CreateNetwork.
NUM_HIDDEN = 64

In [9]:
def CreateNetwork(state, num_actions, scope, is_training, reuse=False):
    """Build the Q-network graph on top of `state`.

    One shared batch-normalised ReLU layer feeds two heads: a value head
    (hidden layer + linear layer with 1 output) and an advantage head (hidden
    layer + linear layer with `num_actions` outputs). The advantage is centred
    by subtracting its per-row mean and then added to the value.

    Args:
        state: input tensor of the network.
        num_actions: number of outputs of the advantage head.
        scope: prefix of every layer scope ('<scope>/hidden1', ...).
        is_training: forwarded to batch_norm as its `is_training` parameter.
        reuse: forwarded to every layer as its `reuse` argument.

    Returns:
        Tuple (hidden1, value_hid, adv_hid, value, adv, output).
    """
    def hidden_layer(inputs, name):
        # Batch-normalised ReLU layer of NUM_HIDDEN units living in `<scope>/<name>`.
        return tf.contrib.layers.relu(
            inputs, NUM_HIDDEN,
            weights_initializer=tf.truncated_normal_initializer(stddev=0.1),
            normalizer_fn=tf.contrib.layers.batch_norm,
            normalizer_params={'is_training': is_training},
            scope=scope + '/' + name, reuse=reuse)

    hidden1 = hidden_layer(state, 'hidden1')
    # A second shared layer ('<scope>/hidden2', same configuration as hidden1) is
    # currently disabled; both heads read hidden1 directly.
    hidden2 = hidden1

    value_hid = hidden_layer(hidden2, 'val_hid')
    adv_hid = hidden_layer(hidden2, 'adv_hid')

    value = tf.contrib.layers.linear(value_hid, 1,
                                     scope=scope + '/value', reuse=reuse)
    raw_adv = tf.contrib.layers.linear(adv_hid, num_actions,
                                       scope=scope + '/advantage', reuse=reuse)
    mean_adv = tf.reduce_mean(raw_adv, reduction_indices=1, keep_dims=True)
    adv = tf.sub(raw_adv, mean_adv, 'advantage')

    output = tf.add(value, adv, 'output')
    # TODO: add hidden2 to the returned tuple once that layer is re-enabled.
    return hidden1, value_hid, adv_hid, value, adv, output

In [10]:
def Select(value, index):
    """Pick one element per row of `value` using gather_nd.

    Args:
        value: float tensor of size (batch, actions).
        index: int32 tensor of size (batch).

    Returns:
        Float tensor of size (batch) holding value[i, index[i]] for every row i.
    """
    row_ids = tf.range(0, tf.shape(value)[0])
    # Pair every row number with its selected column: (batch, 2) coordinates.
    coords = tf.concat(1, [tf.expand_dims(row_ids, 1),
                           tf.expand_dims(index, 1)])
    return tf.gather_nd(value, coords)

In [11]:
def Select4(value, index):
    """Pick one element per row of `value` using a one-hot mask.

    Args:
        value: float tensor of size (batch, actions).
        index: int32 tensor of size (batch).

    Returns:
        Float tensor of size (batch) holding value[i, index[i]] for every row i.
    """
    num_columns = tf.shape(value)[1]
    mask = tf.one_hot(index, num_columns)
    # Everything but the selected column is zeroed, so the row sum is the selection.
    return tf.reduce_sum(value * mask, reduction_indices=1)

In [12]:
class QLearner(object):
    """Builds the TF training graph for Q-learning and runs training steps.

    Three networks are created with CreateNetwork: the 'model' network on
    `state`, the same 'model' network (reused variables) on `state1`, and a
    separate 'target' network on `state1`. The training target is
    reward + gamma * Q_target(state1, argmax_a Q_model(state1, a)).

    Args:
        exp_buffer: experience buffer; must provide `state_size` and
            `sample(batch_size)` returning (ss, aa, rr, ss1, gg).
        num_actions: number of discrete actions (network outputs).
        clip_error: the TD error is clipped to [-clip_error, clip_error].
        run_index: integer appended to `log_dir` to name this run's summaries.
        log_dir: parent directory of the TensorBoard summaries.
    """

    # Training steps between two dumps of the merged summaries.
    SUMMARY_EVERY = 100
    # Training steps between target-network refreshes (a checkpoint is saved too).
    TARGET_UPDATE_EVERY = 20000

    def __init__(self, exp_buffer, num_actions, clip_error=5., run_index=1,
                 log_dir='/media/vertix/UHDD/tmp/tensorflow_logs/aicup'):
        self.exp_buffer = exp_buffer
        self.num_actions = num_actions
        self.run_index = run_index
        self.log_dir = log_dir

        state_size = self.exp_buffer.state_size
        self.state = tf.placeholder(tf.float32, shape=[None, state_size], name='state')
        self.action = tf.placeholder(tf.int32, shape=[None], name='action')
        self.reward = tf.placeholder(tf.float32, shape=[None], name='reward')
        self.state1 = tf.placeholder(tf.float32, shape=[None, state_size], name='state1')
        self.gamma = tf.placeholder(tf.float32, shape=[None], name='gamma')

        self.is_training = tf.placeholder(tf.bool, shape=None, name='is_training')

        self.pred_vars = CreateNetwork(self.state, num_actions, 'model', self.is_training)
        self.pred_vars_s1 = CreateNetwork(self.state1, num_actions, 'model', self.is_training, True)
        self.target_vars = CreateNetwork(self.state1, num_actions, 'target', False)

        # Collected before the optimizer is created, so only network variables are included.
        self.vars_pred = tf.get_collection(tf.GraphKeys.VARIABLES, 'model')
        self.vars_target = tf.get_collection(tf.GraphKeys.VARIABLES, 'target')

        # Copies every 'model' variable into the 'target' variable at the same position.
        self.copy_op = tf.group(
            *[tf.assign(y, x) for x, y in zip(self.vars_pred, self.vars_target)]
        )

        # The network output is the last element of the tuple CreateNetwork returns.
        idx = len(self.pred_vars) - 1

        # The action is chosen by the model network and evaluated by the target network.
        self.act_s1 = tf.cast(tf.argmax(self.pred_vars_s1[idx], dimension=1), tf.int32)
        self.q_s1 = Select(self.target_vars[idx], self.act_s1)
        self.target_q = tf.stop_gradient(self.reward + self.gamma * self.q_s1)
        self.q = Select4(self.pred_vars[idx], self.action)

        # NOTE(review): clipping the error by value before squaring gives zero gradient
        # for samples outside the clip range; a Huber-style loss may be what is
        # wanted here - confirm before changing, it alters training.
        self.delta = tf.clip_by_value(self.target_q - self.q, -clip_error, clip_error)
        self.loss = tf.reduce_mean(tf.square(self.delta))

        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

        self.global_step = tf.Variable(0, name='global_step', trainable=False)
        tf.histogram_summary('TD Error', self.delta)
        tf.scalar_summary("Loss", tf.clip_by_value(tf.sqrt(self.loss), -10., 100.))
        tf.scalar_summary("Q Func", tf.reduce_mean(self.q))

        optimizer = tf.train.AdamOptimizer(LEARNING_RATE)
        # Variables without a gradient are dropped; the rest get their gradient norm-clipped.
        grads = [(tf.clip_by_norm(g, 5.), v)
                 for g, v in optimizer.compute_gradients(self.loss, self.vars_pred)
                 if g is not None]

        for grad, v in grads:
            tf.histogram_summary(v.name, v)
            tf.histogram_summary('{}/grad'.format(v.name), grad)

        for ten in self.pred_vars:
            tf.histogram_summary('vars/' + ten.name, ten)

        self.train_op = optimizer.apply_gradients(grads, self.global_step)
        if update_ops:
            # Run the collected UPDATE_OPS together with the optimizer step.
            self.train_op = tf.group(self.train_op, *update_ops)

        self.summary_op = tf.merge_all_summaries()
        self.writer = None
        self.saver = None
        self.cur_step = None
        self.last_start = None

    def _ensure_writer(self):
        # Create the summary writer and the checkpoint saver on first use.
        if self.writer is None:
            self.writer = tf.train.SummaryWriter('%s/%d' % (self.log_dir, self.run_index))
            self.saver = tf.train.Saver(self.vars_pred)

    def step(self, sess, batch_size=32):
        """Run one training step on a batch sampled from the experience buffer.

        Does nothing when the buffer cannot provide `batch_size` samples yet.
        Merged summaries are written on the first step and whenever the step
        counter is a multiple of SUMMARY_EVERY. Whenever it is a multiple of
        TARGET_UPDATE_EVERY the target network is refreshed, a checkpoint is
        saved and a 'Steps per sec' summary is written.

        Args:
            sess: TF session the graph ops are run in.
            batch_size: number of transitions to sample.
        """
        # Sample from the buffer this learner was built with, not from a notebook global.
        ss, aa, rr, ss1, gg = self.exp_buffer.sample(batch_size)
        if ss is None:
            return

        self._ensure_writer()
        if self.last_start is None:
            self.last_start = time.time()

        feed_dict = {self.state: ss, self.action: aa, self.reward: rr, self.state1: ss1,
                     self.gamma: gg, self.is_training: True}

        if self.cur_step and self.cur_step % self.SUMMARY_EVERY != 0:
            self.cur_step, _ = sess.run([self.global_step, self.train_op], feed_dict)
        else:
            self.cur_step, _, smr = sess.run(
                [self.global_step, self.train_op, self.summary_op], feed_dict)
            self.writer.add_summary(smr, self.cur_step)

        if self.cur_step % self.TARGET_UPDATE_EVERY == 0:
            print('Updated target network')
            sess.run(self.copy_op)
            self.saver.save(sess, 'q_func', global_step=self.global_step)
            elapsed = time.time() - self.last_start
            self.writer.add_summary(
                tf.Summary(
                    value=[tf.Summary.Value(
                            tag='Steps per sec',
                            simple_value=self.TARGET_UPDATE_EVERY / elapsed)]),
                self.cur_step)
            self.last_start = time.time()

    def stat(self, data):
        """Write externally supplied scalar statistics to the summary log.

        Args:
            data: dict mapping a summary tag to a scalar value.
        """
        # Statistics may arrive before the first training step created the writer.
        self._ensure_writer()
        self.writer.add_summary(
            tf.Summary(
                value=[tf.Summary.Value(tag=name, simple_value=value)
                       for name, value in data.items()]), self.cur_step)

In [13]:
class ExperienceBuffer(object):
    """Fixed-capacity ring buffer of (s, a, r, s1, gamma) transitions.

    States are stored column-wise in arrays of shape (state_size, buffer_size),
    allocated on the first `add`. Once the buffer is full the oldest slots are
    overwritten.

    Args:
        buffer_size: maximum number of transitions kept.
        gamma: discount stored for non-terminal transitions; when None the
            module-level GAMMA is used.
    """

    def __init__(self, buffer_size = 50000, gamma=None):
        self.ss, self.aa, self.rr, self.ss1, self.gg = None, None, None, None, None
        self.buffer_size = buffer_size
        self.gamma = gamma
        # Total number of add() calls so far (keeps growing past buffer_size).
        self.inserted = 0
        # Slot numbers that currently hold data; at most buffer_size entries.
        self.index = []

    def add(self, s, a, r, s1):
        """Store one transition; `s1` is None for a terminal transition.

        A terminal transition is stored with s1 = s and a discount of 0.
        """
        if self.ss is None:
            # Lazily allocate the storage once the state size is known.
            state_size = len(s)
            self.ss = np.zeros((state_size, self.buffer_size))
            self.aa = np.zeros(self.buffer_size, dtype=np.int16)
            self.ss1 = np.zeros((state_size, self.buffer_size))
            self.rr = np.zeros(self.buffer_size)
            self.gg = np.zeros(self.buffer_size)

        cur_index = self.inserted % self.buffer_size
        self.ss[:, cur_index] = s
        self.aa[cur_index] = a
        self.rr[cur_index] = r
        if s1 is not None:
            self.ss1[:, cur_index] = s1
            self.gg[cur_index] = GAMMA if self.gamma is None else self.gamma
        else:
            self.ss1[:, cur_index] = s
            self.gg[cur_index] = 0.

        if len(self.index) < self.buffer_size:
            self.index.append(self.inserted)
        self.inserted += 1

    @property
    def state_size(self):
        """Length of a state vector, or None before the first `add`."""
        return None if self.ss is None else self.ss.shape[0]

    def sample(self, size):
        """Draw `size` distinct stored transitions uniformly at random.

        Returns:
            (ss, aa, rr, ss1, gg) with one row/entry per sampled transition, or
            a tuple of five Nones when fewer than `size` transitions are stored.
        """
        # Compare with what is actually stored: `inserted` keeps growing past the
        # capacity, so it cannot be used to guard random.sample.
        if size > len(self.index):
            return None, None, None, None, None

        indexes = random.sample(self.index, size)

        return (np.transpose(self.ss[:, indexes]), self.aa[indexes], self.rr[indexes],
                np.transpose(self.ss1[:, indexes]), self.gg[indexes])

In [14]:
buf = ExperienceBuffer(50 * 10 ** 5)  # capacity: 5,000,000 transitions (50 * 10**5)
# Read five messages so the buffer is allocated (and its state size known) before
# the learner is built; every message is acknowledged with 'Ok'.
for _ in range(5):
    msg = sock_exp.recv_pyobj()
    sock_exp.send('Ok')
    # Only 'exp' messages carry a transition; any other type is acknowledged and ignored here.
    if msg['type'] == 'exp':
        msg = msg['data']
        buf.add(msg['s'], msg['a'], msg['r'], msg['s1'])

In [27]:
# Start from an empty default graph. On a fresh kernel `sess` does not exist yet,
# so only close it when an earlier run of this cell left one behind.
if 'sess' in globals():
    sess.close()
tf.reset_default_graph()
sess = tf.InteractiveSession()

In [28]:
# Build the learner on the pre-filled buffer (7 actions, TD error clipped at 100),
# then initialise every variable of the freshly built graph.
ql = QLearner(buf, num_actions=2 + 5, clip_error=100, run_index=1)
# feed_dict = {ql.state: ss, ql.action: aa, ql.reward: rr, ql.state1:ss1, ql.gamma: gg}
sess.run(tf.initialize_all_variables())

1. N-step Q learning
2. Prioritized replay
3. Output stats about games

In [None]:
# Main loop: receive a message, acknowledge it, then either train on the new
# experience or log the reported statistics.
# A counter is used instead of range(): under Python 2 (which this notebook
# targets) range(15 * 10 ** 6) would materialise a 15M-element list.
num_messages = 15 * 10 ** 6
i = 0
while i < num_messages:
    msg = sock_exp.recv_pyobj()
    sock_exp.send('Ok')

    if msg['type'] == 'exp':
        exp = msg['data']
        buf.add(exp['s'], exp['a'], exp['r'], exp['s1'])
        ql.step(sess)

        # Every 500th message: publish the current model variables, fetched in one run call.
        if i > 0 and i % 500 == 0:
            var_values = sess.run(ql.vars_pred)
            sock_strat.send_pyobj(dict(
                (v.name, val) for v, val in zip(ql.vars_pred, var_values)))
    elif msg['type'] == 'stat':
        ql.stat(msg['data'])
    i += 1

In [173]:
# Draw a single transition; `ss` is the input of the TF-vs-NumPy comparison cells below.
ss, aa, rr, ss1, gg = buf.sample(1)

In [282]:
# Snapshot of every 'model' variable as a NumPy array, keyed by variable name.
network_vars = dict(
    (v.name, sess.run(v, {ql.is_training: False}))
    for v in ql.vars_pred)

In [283]:
# NOTE(review): the recorded output lists model/hidden2/* variables, but CreateNetwork
# currently has its hidden2 layer commented out - the output appears to predate that change.
network_vars.keys()

[u'model/adv_hid/BatchNorm/moving_mean:0',
 u'model/hidden1/BatchNorm/moving_mean:0',
 u'model/advantage/biases:0',
 u'model/advantage/weights:0',
 u'model/val_hid/BatchNorm/moving_mean:0',
 u'model/val_hid/weights:0',
 u'model/val_hid/BatchNorm/beta:0',
 u'model/val_hid/BatchNorm/moving_variance:0',
 u'model/hidden2/BatchNorm/moving_mean:0',
 u'model/hidden2/BatchNorm/beta:0',
 u'model/hidden2/weights:0',
 u'model/hidden1/weights:0',
 u'model/adv_hid/weights:0',
 u'model/value/weights:0',
 u'model/value/biases:0',
 u'model/hidden1/BatchNorm/moving_variance:0',
 u'model/hidden2/BatchNorm/moving_variance:0',
 u'model/hidden1/BatchNorm/beta:0',
 u'model/adv_hid/BatchNorm/moving_variance:0',
 u'model/adv_hid/BatchNorm/beta:0']

In [23]:
def ReLu(x):
    """Element-wise rectifier: negative entries of `x` are replaced by 0."""
    rectified = np.maximum(0, x)
    return rectified

In [220]:
def BatchNorm(state, network_vars, key):
    """Apply inference-mode batch normalisation using stored moving statistics.

    Computes (state - moving_mean) / sqrt(moving_variance + eps) + beta, with
    the statistics read from `network_vars` under the `key` prefix. No scale
    factor is applied.

    Args:
        state: array to normalise.
        network_vars: dict of variable name -> array.
        key: name prefix of the batch-norm variables, e.g. 'model/hidden1/BatchNorm'.
    """
    # presumably matches the epsilon the TF batch_norm layer was built with - confirm
    eps = 0.001
    moving_variance = network_vars[key + '/moving_variance:0']
    moving_mean = network_vars[key + '/moving_mean:0']
    beta = network_vars[key + '/beta:0']

    inv = 1.0 / np.sqrt(moving_variance + eps)
    # Fold mean and beta into a single offset so the input is touched only once.
    offset = beta - moving_mean * inv
    return state * inv + offset

In [279]:
class QFunction(object):
    """NumPy forward pass of the 'model' network from exported variables.

    Args:
        network_vars: dict of variable name -> array, as exported from the TF
            graph (names like 'model/hidden1/weights:0').
    """

    def __init__(self, network_vars):
        self.vars = network_vars

    def _hidden(self, x, scope):
        # Hidden layer: weights -> batch norm (moving statistics) -> ReLU.
        # These layers have no bias variable in the exported dictionary.
        x = np.matmul(x, self.vars[scope + '/weights:0'])
        x = BatchNorm(x, self.vars, scope + '/BatchNorm')
        return ReLu(x)

    def _linear(self, x, scope):
        # Plain affine layer: weights plus biases, no activation.
        return np.matmul(x, self.vars[scope + '/weights:0']) + self.vars[scope + '/biases:0']

    def Q(self, state):
        """Return the Q-values of `state`.

        Args:
            state: a single state vector or a (batch, state_size) array.

        Returns:
            Array of Q-values, one per action (one row per state for a batch).
        """
        state = self._hidden(state, 'model/hidden1')
        # The second shared layer is optional: the graph builder may have it disabled,
        # in which case its variables are absent from the export.
        if 'model/hidden2/weights:0' in self.vars:
            state = self._hidden(state, 'model/hidden2')

        value = self._linear(self._hidden(state, 'model/val_hid'), 'model/value')
        adv = self._linear(self._hidden(state, 'model/adv_hid'), 'model/advantage')

        # Centre the advantage per state (over the action axis), as the TF graph does;
        # a mean over the whole array would mix the states of a batch.
        return value + (adv - adv.mean(axis=-1, keepdims=True))

In [280]:
# Q-values from the TF graph in inference mode. The network output is the last
# element of pred_vars (index 6 is out of range for the current 6-element tuple).
ql.pred_vars[-1].eval({ql.state: ss, ql.is_training: False})

array([[ 0.4263148 ,  0.27988333,  0.52581638,  0.42325473,  0.46426272,
         0.47350475,  0.352005  ]], dtype=float32)

In [284]:
# Same state through the NumPy re-implementation; the recorded values agree with the TF output above.
QFunction(network_vars).Q(ss)

array([[ 0.42631482,  0.27988335,  0.52581639,  0.4232547 ,  0.46426274,
         0.47350477,  0.352005  ]])

In [170]:
# Debug: Q-value of the taken action for each sample.
# NOTE(review): `feed_dict` is not assigned at top level in any visible cell - relies on kernel state.
ql.q.eval(feed_dict)

array([ 1746.92163086,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,  3156.87939453], dtype=float32)

In [174]:
# Debug: target-network value of the action the model network picks in the next state.
# NOTE(review): `feed_dict` is not assigned at top level in any visible cell - relies on kernel state.
ql.q_s1.eval(feed_dict)

array([ 10.50465488,  10.50465488,  10.50465488,  10.50465488,
        10.50465488,  10.50465488,  10.50465488,  10.50465488,
        10.50465488,  10.50465488], dtype=float32)

In [173]:
# Manually copy the 'model' variables into the 'target' network.
sess.run(ql.copy_op)

In [138]:
# Debug: gather_nd-based selection; the recorded output equals that of the one-hot cell below.
# NOTE(review): with the current CreateNetwork, pred_vars[2] is adv_hid - the index
# presumably dates from an earlier return tuple; confirm.
Select(ql.pred_vars[2], ql.action).eval(feed_dict)

array([ 2186.74389648,  6496.70410156,  2797.15771484,  4662.953125  ,
           0.        ,  2761.18579102,  3284.98608398,  6556.95996094,
        2748.39379883,     0.        ], dtype=float32)

In [148]:
# Debug: the same selection written with a one-hot mask (the formulation Select4 uses).
tf.reduce_sum(ql.pred_vars[2] * tf.one_hot(ql.action, ql.num_actions), reduction_indices=1).eval(feed_dict)

array([ 2186.74389648,  6496.70410156,  2797.15771484,  4662.953125  ,
           0.        ,  2761.18579102,  3284.98608398,  6556.95996094,
        2748.39379883,     0.        ], dtype=float32)

In [93]:
# Debug: runtime shape of the tensor used in the selection cells above.
tf.shape(ql.pred_vars[2]).eval(feed_dict)

array([10,  7], dtype=int32)

In [51]:
# Debug: clipped TD error.
# NOTE(review): no feed is passed although delta is computed from placeholders; the
# recorded output presumably comes from an earlier version of the graph - confirm.
ql.delta.eval()

array([ 25.,  25.,  25.,  25.,  25.,  25.,  25.,  25.,  25.,  25.], dtype=float32)

In [40]:
# Debug: mean clipped TD error over the sampled batch.
# NOTE(review): is_training is not fed although the model network takes it as a placeholder.
tf.reduce_mean(ql.delta).eval({ql.state: ss, ql.action: aa, ql.reward: rr, ql.state1:ss1, ql.gamma: gg})

25.0

In [283]:
# Debug: Q-value of the taken action, feeding only the placeholders it is built from.
# NOTE(review): is_training is not fed although the model network takes it as a placeholder.
ql.q.eval({ql.state: ss, ql.action: aa})

array([    0.        ,     0.        ,  2561.84985352,  2696.66064453,
        3319.86474609,  2376.26171875,  2375.80859375,  2602.42016602,
        2548.83056641,     0.        ], dtype=float32)

In [284]:
# Debug: next-state value from the target network, feeding only state1.
# NOTE(review): is_training is not fed although the model network on state1 takes it as a placeholder.
ql.q_s1.eval({ql.state1: ss1})

array([ 9.87346268,  9.87346268,  9.87346268,  9.87346268,  9.87346268,
        9.87346268,  9.87346268,  9.87346268,  9.87346268,  9.87346268], dtype=float32)