In [1]:
import random

In [2]:
import tensorflow as tf
import numpy as np

import zmq

In [3]:
exp_socket_addr = "tcp://127.0.0.1:19884" 
max_actions = 7
strat_socket_addr = "tcp://127.0.0.1:19885"

In [4]:
context = zmq.Context()
sock_exp = context.socket(zmq.REP)
sock_exp.bind(exp_socket_addr)

In [5]:
sock_strat = context.socket(zmq.PUB)
sock_strat.bind(strat_socket_addr)

In [6]:
GAMMA = 0.999
LEARNING_RATE = 0.001

In [7]:
class ExperienceBuffer(object):
    def __init__(self, buffer_size = 50000):
        self.ss, self.aa, self.rr, self.ss1, self.gg = None, None, None, None, None
        self.buffer_size = buffer_size
        self.inserted = 0
    
    def add(self, s, a, r, s1):
        if self.ss is None:
            # Initialize
            state_size = len(s)
            self.ss = np.zeros((state_size, self.buffer_size))
            self.aa = np.zeros(self.buffer_size, dtype=np.int16)
            self.ss1 = np.zeros((state_size, self.buffer_size))
            self.rr = np.zeros(self.buffer_size)
            self.gg = np.zeros(self.buffer_size)

        cur_index = self.inserted % self.buffer_size
        self.ss[:, cur_index] = s
        self.aa[cur_index] = a
        self.rr[cur_index] = r
        if s1 is not None:
            self.ss[:, cur_index] = s1
            self.gg[cur_index] = GAMMA
        else:
            self.ss[:, cur_index] = s
            self.gg[cur_index] = 0.
        
        self.inserted += 1

    @property
    def state_size(self):
        return None if self.ss is None else self.ss.shape[0]
            
    def sample(self, size):
        if size > self.inserted:
            return None, None, None, None, None

        indexes = random.sample(range(min(self.inserted, self.buffer_size)), size)

        return (np.transpose(self.ss[:,indexes]), self.aa[indexes], self.rr[indexes],
                np.transpose(self.ss1[:, indexes]), self.gg[indexes])

In [8]:
buf = ExperienceBuffer()

In [9]:
for _ in range(50):
    msg = sock_exp.recv_pyobj()
    sock_exp.send('Ok')

    buf.add(msg['s'], msg['a'], msg['r'], msg['s1'])

In [10]:
def CreateNetwork(state, num_actions, scope, reuse=False):
    hidden1 = tf.contrib.layers.relu(
        state, 20,
        weights_initializer=tf.truncated_normal_initializer(stddev=1.),
        biases_initializer=tf.constant_initializer(0.1),
        scope=scope + '/hidden1', reuse=reuse)
    hidden2 = tf.contrib.layers.relu(
        hidden1, 20,
        weights_initializer=tf.truncated_normal_initializer(stddev=1.),
        biases_initializer=tf.constant_initializer(0.1),
        scope=scope + '/hidden2', reuse=reuse)
    output = tf.contrib.layers.fully_connected(hidden2, num_actions,
                                               biases_initializer=tf.constant_initializer(10.),
                                               scope=scope + '/output',
                                               reuse=reuse)
    return hidden1, hidden2, output

In [107]:
def Select(value, index):
    # Value - float tensor of (batch, actions) size
    # index - int32 tensor of (batch) size
    # returns float tensor of batch size where in every batch the element from index is selected
    batch_size = tf.shape(value)[0]
    _range = tf.range(0, batch_size)
    ind = tf.concat(1, [tf.expand_dims(_range, 1), 
                        tf.expand_dims(index, 1)])
    return tf.gather_nd(value, ind)

In [149]:
def Select4(value, index):
    # Value - float tensor of (batch, actions) size
    # index - int32 tensor of (batch) size
    # returns float tensor of batch size where in every batch the element from index is selected
    shp = tf.shape(value)
    return tf.reduce_sum(value * tf.one_hot(index, shp[1]), reduction_indices=1)

In [162]:
class QLearner(object):
    def __init__(self, exp_buffer, num_actions, clip_error=5.):
        self.exp_buffer = exp_buffer
        self.num_actions = num_actions
        
        self.state = tf.placeholder(tf.float32, shape=[None, self.exp_buffer.state_size], name='state')
        self.action = tf.placeholder(tf.int32, shape=[None], name='action')
        self.reward = tf.placeholder(tf.float32, shape=[None], name='reward')
        self.state1 = tf.placeholder(tf.float32, shape=[None, self.exp_buffer.state_size], name='state1')
        self.gamma = tf.placeholder(tf.float32, shape=[None], name='gamma')
        
        self.pred_vars = CreateNetwork(self.state, num_actions, 'model')
        self.pred_vars_s1 = CreateNetwork(self.state1, num_actions, 'model', True)                
        self.target_vars = CreateNetwork(self.state1, num_actions, 'target')
        
        vars_pred = tf.get_collection(tf.GraphKeys.VARIABLES, 'model')
        vars_target = tf.get_collection(tf.GraphKeys.VARIABLES, 'target')

        self.copy_op = tf.group(
            *[tf.assign(y, x) for x, y in zip(vars_pred, vars_target)]
        )

        self.act_s1 = tf.cast(tf.argmax(self.pred_vars_s1[-1], dimension=1), tf.int32)
        # This complicated code just extracts from target Q-values the values
        # selected by self.act_s1
        self.q_s1 = Select4(self.target_vars[2], self.act_s1)
        self.target_q = tf.stop_gradient(self.reward + self.gamma * self.q_s1)
        self.q = Select4(self.pred_vars[2], self.action)

        self.delta = self.target_q - self.q 
        self.delta = tf.clip_by_value(self.delta,
                                      -clip_error,
                                      clip_error)
        self.loss = tf.reduce_mean(tf.square(self.delta))

        self.global_step = tf.Variable(0, name='global_step', trainable=False)
        optimizer = tf.train.AdamOptimizer(LEARNING_RATE)
        # TODO: Clip gradients
        self.train_op = optimizer.minimize(tf.reduce_mean(self.delta), self.global_step)
        
        tf.histogram_summary('TD Error', self.delta)
        tf.scalar_summary("Loss", self.loss)
        tf.scalar_summary("Q Func", tf.reduce_mean(self.q))
        for v in vars_pred:
            tf.histogram_summary(v.name, v)
        
        self.summary_op = tf.merge_all_summaries()
        self.writer = None
        self.cur_step = None

    def step(self, sess, batch_size=32):
        ss, aa, rr, ss1, gg = buf.sample(batch_size)
        if ss is None:
            return
        
        if self.writer is None:
            self.writer = tf.train.SummaryWriter('/media/vertix/UHDD/tmp/tensorflow_logs/aicup')

        feed_dict = {self.state: ss, self.action: aa, self.reward: rr, self.state1:ss1,
                     self.gamma: gg}
            
        if self.cur_step and self.cur_step % 100 != 0:
            self.cur_step, _ = sess.run([self.global_step, self.train_op], feed_dict)
        else:
            self.cur_step, _, smr = sess.run([self.global_step, self.train_op, self.summary_op], feed_dict)
            self.writer.add_summary(smr, self.cur_step)
        
        if self.cur_step % 10000 == 0:
            sess.run(self.copy_op)

In [163]:
sess.close()
tf.reset_default_graph()
sess = tf.InteractiveSession()

In [164]:
ql = QLearner(buf, 7)
feed_dict = {ql.state: ss, ql.action: aa, ql.reward: rr, ql.state1:ss1, ql.gamma: gg}
sess.run(tf.initialize_all_variables())

In [169]:
for _ in range(5000):
    msg = sock_exp.recv_pyobj()
    sock_exp.send('Ok')

    buf.add(msg['s'], msg['a'], msg['r'], msg['s1'])
    
    ql.step(sess)

In [12]:
ss, aa, rr, ss1, gg = buf.sample(10)

In [138]:
Select(ql.pred_vars[2], ql.action).eval(feed_dict)

array([ 2186.74389648,  6496.70410156,  2797.15771484,  4662.953125  ,
           0.        ,  2761.18579102,  3284.98608398,  6556.95996094,
        2748.39379883,     0.        ], dtype=float32)

In [148]:
tf.reduce_sum(ql.pred_vars[2] * tf.one_hot(ql.action, ql.num_actions), reduction_indices=1).eval(feed_dict)

array([ 2186.74389648,  6496.70410156,  2797.15771484,  4662.953125  ,
           0.        ,  2761.18579102,  3284.98608398,  6556.95996094,
        2748.39379883,     0.        ], dtype=float32)

In [93]:
tf.shape(ql.pred_vars[2]).eval(feed_dict)

array([10,  7], dtype=int32)

In [51]:
ql.delta.eval()

array([ 25.,  25.,  25.,  25.,  25.,  25.,  25.,  25.,  25.,  25.], dtype=float32)

In [40]:
tf.reduce_mean(ql.delta).eval({ql.state: ss, ql.action: aa, ql.reward: rr, ql.state1:ss1, ql.gamma: gg})

25.0

In [283]:
ql.q.eval({ql.state: ss, ql.action: aa})

array([    0.        ,     0.        ,  2561.84985352,  2696.66064453,
        3319.86474609,  2376.26171875,  2375.80859375,  2602.42016602,
        2548.83056641,     0.        ], dtype=float32)

In [284]:
ql.q_s1.eval({ql.state1: ss1})

array([ 9.87346268,  9.87346268,  9.87346268,  9.87346268,  9.87346268,
        9.87346268,  9.87346268,  9.87346268,  9.87346268,  9.87346268], dtype=float32)