In [1]:
import os
import random
import time

import numpy as np
import tensorflow as tf

import tensorflow.contrib.layers
import gym

In [2]:
class WeightedExperienceBuffer(object):
    def __init__(self, alpha, beta, max_weight, buffer_size=1<<16):
        self.ss, self.aa, self.rr, self.ss1, self.gg = None, None, None, None, None
        self.buffer_size = buffer_size
        self.inserted = 0
        self.tree_size = buffer_size << 1
        # root is 1
        self.weight_sums = np.zeros(self.tree_size)
        self.weight_min = np.ones(self.tree_size) * (max_weight ** alpha)
        self.max_weight = max_weight
        self.alpha = alpha
        self.beta = beta

    def update_up(self, index):
        self.weight_sums[index] = self.weight_sums[index << 1] + self.weight_sums[(index << 1) + 1]
        self.weight_min[index] = min(self.weight_min[index << 1], self.weight_min[(index << 1) + 1])
        if index > 1:
            self.update_up(index >> 1)

    def index_in_tree(self, buffer_index):
        return buffer_index + self.buffer_size

    def index_in_buffer(self, tree_index):
        return tree_index - self.buffer_size

    def tree_update(self, buffer_index, new_weight):
        index = self.index_in_tree(buffer_index)
        new_weight = min(new_weight + 0.01, self.max_weight) ** self.alpha

        self.weight_sums[index] = new_weight
        self.weight_min[index] = new_weight
        self.update_up(index >> 1)

    def add(self, s, a, r, s1, gamma, weight):
        if self.ss is None:
            # Initialize
            state_size = s.shape[1]
            self.ss = np.zeros((state_size, self.buffer_size), dtype=np.float32)
            self.aa = np.zeros(self.buffer_size, dtype=np.int16)
            self.ss1 = np.zeros((state_size, self.buffer_size), dtype=np.float32)
            self.rr = np.zeros(self.buffer_size, dtype=np.float32)
            self.gg = np.zeros(self.buffer_size, dtype=np.float32)

        indexes = []
        for _ in a:
            cur_index = self.inserted % self.buffer_size
            self.inserted += 1
            indexes.append(cur_index)

        self.ss[:, indexes] = s.transpose()
        self.aa[indexes] = a
        self.rr[indexes] = r
        self.ss1[:, indexes] = s1.transpose()
        self.gg[indexes] = gamma

        for idx in indexes:
            self.tree_update(idx, weight)

    @property
    def state_size(self):
        return None if self.ss is None else self.ss.shape[0]

    def find_sum(self, node, sum):
        if node >= self.buffer_size:
            return self.index_in_buffer(node)
        left = node << 1
        left_sum = self.weight_sums[left]
        if sum < left_sum:
            return self.find_sum(left, sum)
        else:
            return self.find_sum(left + 1, sum - left_sum)

    def sample_indexes(self, size):
        total_weight = self.weight_sums[1]
        indexes = np.zeros(size, dtype=np.int32)
        for i in xrange(size):
            search = np.random.random() * total_weight
            indexes[i] = self.find_sum(1, search)
        return indexes

    def sample(self, size):
        if size > self.inserted:
            return None, None, None, None, None, None, None

        indexes = self.sample_indexes(size)
        max_w = (self.weight_min[1] / self.weight_sums[1]) ** -self.beta
        w = (self.weight_sums[self.index_in_tree(indexes)] / self.weight_sums[1]) ** -self.beta

        return (indexes,
                np.transpose(self.ss[:, indexes]), self.aa[indexes], self.rr[indexes],
                np.transpose(self.ss1[:, indexes]), self.gg[indexes],
                w / max_w)

In [3]:
def HuberLoss(tensor, boundary):
    abs_x = tf.abs(tensor)
    delta = boundary
    quad = tf.minimum(abs_x, delta)
    lin = (abs_x - quad)
    return 0.5 * quad ** 2 + delta * lin

In [4]:
class BaseLearner(object):
    def __init__(self, options):
        self.options = options

    def Vars(self):
        return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)

    def Init(self, sess, run_index):
        self.run_index = run_index
        sess.run(tf.global_variables_initializer())
        self.writer = tf.summary.FileWriter(
            '/Users/vertix/Documents/tensorflow_logs/%s'  % self.run_index
        )
        self.saver = tf.train.Saver(self.Vars())
        self.cur_step = 0
        self.writer.add_graph(tf.get_default_graph())
        self.last_start = time.time()

    def Optimize(self, loss):
        """Returns optimization operation"""
        self.global_step = tf.Variable(0, name='global_step', trainable=False)
        self.optimizer = tf.train.AdamOptimizer(self.options['learning_rate'])
        variables = self.Vars()
        grads = self.optimizer.compute_gradients(loss, variables)
        if 'clip_grad' in self.options:
            gg = [g for g, _ in grads]
            vv = [v for _, v in grads]
            global_norm = tf.global_norm(gg)
            tf.summary.scalar('Scalars/Grad_norm', global_norm)
            grads = zip(tf.clip_by_global_norm(gg, self.options['clip_grad'], global_norm)[0], vv)

        for grad, v in grads:
            if grad is not None:
                tf.summary.histogram('{}/grad'.format(v.name), grad)
            tf.summary.histogram(v.name, v)

        tf.summary.scalar("Scalars/Total_Loss", loss)
        return self.optimizer.apply_gradients(grads, self.global_step)

    def Stat(self, data):
        self.writer.add_summary(
            tf.Summary(
                value=[tf.Summary.Value(tag=name, simple_value=value)
                       for name, value in data.items()]), self.cur_step)

    def Save(self, sess):
        self.saver.save(sess, os.path.basename(self.run_index),
                        global_step=self.global_step)
        if self.last_start is not None:
            self.writer.add_summary(
                tf.Summary(
                    value=[tf.Summary.Value(
                        tag='Steps per sec',
                        simple_value=self.options['update_steps'] / (time.time() - self.last_start))]),
                self.cur_step)
        self.last_start = time.time()

In [5]:
DEFAULT_OPTIONS = {
    'clip_grad': 3.,
    'learning_rate': 0.001,
    'update_steps': 10000,
}

In [6]:
class QLearner(BaseLearner):
    def __init__(self, exp_buffer, state2q, options=DEFAULT_OPTIONS):
        super(QLearner, self).__init__(options)

        self.exp_buffer = exp_buffer

        self.state = tf.placeholder(tf.float32, shape=[None, self.exp_buffer.state_size],
                                    name='state')
        self.action = tf.placeholder(tf.int32, shape=[None], name='action')
        self.reward = tf.placeholder(tf.float32, shape=[None], name='reward')
        self.state1 = tf.placeholder(tf.float32, shape=[None, self.exp_buffer.state_size],
                                     name='state1')
        self.gamma = tf.placeholder(tf.float32, shape=[None], name='gamma')
        self.is_weights = tf.placeholder(tf.float32, shape=[None], name='is_weights')
        self.is_training = tf.placeholder(tf.bool, shape=None, name='is_training')

        with tf.variable_scope('model', reuse=False):
            self.qvalues = state2q(self.state, self.is_training)
        with tf.variable_scope('model', reuse=True):
            self.qvalues1 = state2q(self.state1, self.is_training)
        with tf.variable_scope('target', reuse=False):
            self.qvalues_target = state2q(self.state1, self.is_training)

        self.vars_pred = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, 'model')
        self.vars_target = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, 'target')

        self.copy_op = tf.group(
            *[tf.assign(y, x) for x, y in zip(self.vars_pred, self.vars_target)]
        )

        self.act_s1 = tf.cast(tf.argmax(self.qvalues1, dimension=1), tf.int32)
        self.q_s1 = Select(self.qvalues_target, self.act_s1)
        self.target_q = tf.stop_gradient(self.reward + self.gamma * self.q_s1)
        self.q = Select(self.qvalues, self.action)
#         self.q = Select4(self.qvalues, self.action)

        self.delta = self.target_q - self.q
        self.td_err_weight = tf.abs(self.delta)
        self.loss = tf.reduce_mean(HuberLoss(self.delta, 5) * self.is_weights)

        self.train_op = self.Optimize(self.loss)
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        self.train_op = tf.group(self.train_op, *update_ops)

        tf.summary.histogram('Monitor/TD_Error', self.delta)
        tf.summary.histogram('Monitor/Q', self.q)
        tf.summary.histogram('Monitor/Weights', self.is_weights)
        tf.summary.scalar("Scalars/Q", tf.reduce_mean(self.q))
        tf.summary.scalar('Scalars/Weights', tf.reduce_mean(self.is_weights))

        self.summary_op = tf.summary.merge_all()
        self.saver = None

    def Vars(self):
        return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'model')

    def Step(self, sess, batch_size=32):
        idx, ss, aa, rr, ss1, gg, ww = self.exp_buffer.sample(batch_size)
        if ss is None:
            return

        feed_dict = {self.state: ss, self.action: aa, self.reward: rr, self.state1:ss1,
                     self.gamma: gg, self.is_weights: ww,
                     self.is_training: True}

        if self.cur_step and self.cur_step % 100 != 0:
            self.cur_step, weights, _ = sess.run(
                [self.global_step, self.td_err_weight, self.train_op], feed_dict)
        else:
            self.cur_step, weights, _, smr = sess.run(
                [self.global_step, self.td_err_weight, self.train_op, self.summary_op], feed_dict)
            self.writer.add_summary(smr, self.cur_step)

        for ii, td_w in zip(idx, weights):
            self.exp_buffer.tree_update(ii, td_w)

        if self.cur_step % self.options['update_steps'] == 0:
            print 'Updated target network'
            sess.run(self.copy_op)
            self.Save(sess)

In [2]:
env = gym.make('Pong-v0')

[2017-02-25 23:05:30,487] Making new env: Pong-v0


In [7]:
import atari_wrappers

In [11]:
def EnvFactory(env_name):
    parts = env_name.split(':')
    if len(parts) > 2:
        raise ValueError('Incorrect environment name %s' % env_name)

    env = gym.make(parts[0])
    if len(parts) == 2:
        for letter in parts[1]:
            if letter == 'L':
                env = atari_wrappers.EpisodicLifeEnv(env)
            elif letter == 'N':
                env = atari_wrappers.NoopResetEnv(env, noop_max=30)
            elif letter == 'S':
                env = atari_wrappers.MaxAndSkipEnv(env, skip=4)
            elif letter == 'F':
                env = atari_wrappers.FireResetEnv(env)
            elif letter == 'C':
                env = atari_wrappers.ClippedRewardsWrapper(env)
            elif letter == 'P':
                env = atari_wrappers.ProcessFrame84(env)
            else:
                raise ValueError('Unexpected code of wrapper %s' % letter)
    return env

In [12]:
env = EnvFactory('Pong-v0:LNSFCP')

[2017-02-25 23:32:04,734] Making new env: Pong-v0


In [15]:
env.reset().shape

(84, 84, 1)

In [13]:
env.observation_space

Box(84, 84, 1)

In [15]:
res = env.render(mode='rgb_array')
env.render(close=True)

In [10]:
tf.image.encode_png(res)

<tf.Tensor 'EncodePng:0' shape=() dtype=string>

In [8]:
def CartPoleQNetwork(state, unused_is_training):
    hidden = tf.contrib.layers.fully_connected(
        state, 32,
        activation_fn=tf.nn.elu,
        weights_initializer=tf.truncated_normal_initializer(stddev=0.1),
        scope='hidden1')
    hidden = tf.contrib.layers.fully_connected(
        hidden, 32,
        activation_fn=tf.nn.elu,
        weights_initializer=tf.truncated_normal_initializer(stddev=0.1),
        scope='hidden2')

    value = tf.contrib.layers.linear(hidden, 1,
                                     weights_initializer=tf.truncated_normal_initializer(stddev=0.1),
                                     biases_initializer=tf.constant_initializer(0.),
                                     scope='value')
    adv = tf.contrib.layers.linear(hidden, env.action_space.n,  # num_actions
                                   weights_initializer=tf.truncated_normal_initializer(stddev=0.01),
                                   scope='advantage')
    adv = tf.subtract(adv, tf.reduce_mean(adv, reduction_indices=1, keep_dims=True), 'advantage')

    output = tf.add(value, adv, 'output')
    return output

In [9]:
ROLLOUT_LEN = 20
GAMMA = 0.99

In [10]:
buf = WeightedExperienceBuffer(0.6, 0.4, 100, 1 << 15)
old_s = env.reset()
for _ in range(50):
    ss, aa, rr, ss1, gg = [], [], [], [], []
    done = False
    while not done and len(ss) < ROLLOUT_LEN:
        a = env.action_space.sample()
    
        s, r, done, _ = env.step(a)
        ss.append(old_s)
        aa.append(a)
        rr.append(r)
        ss1.append(s)
        gg.append(GAMMA if not done else 0.)

        old_s = s
    
    rew = 0.
    g = 1.
    for i in reversed(range(len(ss))):
        rew = rr[i] + gg[i] * rew
        g *= gg[i]
        ss1[i] = old_s
        rr[i] = r
        gg[i] = g
    
    if done:
        old_s = env.reset()
    
    buf.add(np.array(ss), np.array(aa), np.array(rr), np.array(ss1), np.array(gg), 100)

In [11]:
def Select(value, index):
    # Value - float tensor of (batch, actions) size
    # index - int32 tensor of (batch) size
    # returns float tensor of batch size where in every batch the element from index is selected
    batch_size = tf.shape(value)[0]
    _range = tf.range(0, batch_size)
    ind = tf.concat([tf.expand_dims(_range, 1), 
                     tf.expand_dims(index, 1)], 1)
    return tf.gather_nd(value, ind)


def Select4(value, index):
    # Value - float tensor of (batch, actions) size
    # index - int32 tensor of (batch) size
    # returns float tensor of batch size where in every batch the element from index is selected
    shp = tf.shape(value)
    return tf.reduce_sum(value * tf.one_hot(index, shp[1]), reduction_indices=1)

In [13]:
# sess.close()
tf.reset_default_graph()
sess = tf.InteractiveSession()

In [14]:
ql = QLearner(buf, CartPoleQNetwork, options={
    'clip_grad': 3.,
    'learning_rate': 0.0001,
    'update_steps': 15000,
    })

INFO:tensorflow:Summary name model/hidden1/weights:0/grad is illegal; using model/hidden1/weights_0/grad instead.


[2017-02-19 23:41:14,555] Summary name model/hidden1/weights:0/grad is illegal; using model/hidden1/weights_0/grad instead.


INFO:tensorflow:Summary name model/hidden1/weights:0 is illegal; using model/hidden1/weights_0 instead.


[2017-02-19 23:41:14,559] Summary name model/hidden1/weights:0 is illegal; using model/hidden1/weights_0 instead.


INFO:tensorflow:Summary name model/hidden1/biases:0/grad is illegal; using model/hidden1/biases_0/grad instead.


[2017-02-19 23:41:14,562] Summary name model/hidden1/biases:0/grad is illegal; using model/hidden1/biases_0/grad instead.


INFO:tensorflow:Summary name model/hidden1/biases:0 is illegal; using model/hidden1/biases_0 instead.


[2017-02-19 23:41:14,565] Summary name model/hidden1/biases:0 is illegal; using model/hidden1/biases_0 instead.


INFO:tensorflow:Summary name model/hidden2/weights:0/grad is illegal; using model/hidden2/weights_0/grad instead.


[2017-02-19 23:41:14,568] Summary name model/hidden2/weights:0/grad is illegal; using model/hidden2/weights_0/grad instead.


INFO:tensorflow:Summary name model/hidden2/weights:0 is illegal; using model/hidden2/weights_0 instead.


[2017-02-19 23:41:14,572] Summary name model/hidden2/weights:0 is illegal; using model/hidden2/weights_0 instead.


INFO:tensorflow:Summary name model/hidden2/biases:0/grad is illegal; using model/hidden2/biases_0/grad instead.


[2017-02-19 23:41:14,576] Summary name model/hidden2/biases:0/grad is illegal; using model/hidden2/biases_0/grad instead.


INFO:tensorflow:Summary name model/hidden2/biases:0 is illegal; using model/hidden2/biases_0 instead.


[2017-02-19 23:41:14,581] Summary name model/hidden2/biases:0 is illegal; using model/hidden2/biases_0 instead.


INFO:tensorflow:Summary name model/value/weights:0/grad is illegal; using model/value/weights_0/grad instead.


[2017-02-19 23:41:14,586] Summary name model/value/weights:0/grad is illegal; using model/value/weights_0/grad instead.


INFO:tensorflow:Summary name model/value/weights:0 is illegal; using model/value/weights_0 instead.


[2017-02-19 23:41:14,590] Summary name model/value/weights:0 is illegal; using model/value/weights_0 instead.


INFO:tensorflow:Summary name model/value/biases:0/grad is illegal; using model/value/biases_0/grad instead.


[2017-02-19 23:41:14,595] Summary name model/value/biases:0/grad is illegal; using model/value/biases_0/grad instead.


INFO:tensorflow:Summary name model/value/biases:0 is illegal; using model/value/biases_0 instead.


[2017-02-19 23:41:14,598] Summary name model/value/biases:0 is illegal; using model/value/biases_0 instead.


INFO:tensorflow:Summary name model/advantage/weights:0/grad is illegal; using model/advantage/weights_0/grad instead.


[2017-02-19 23:41:14,602] Summary name model/advantage/weights:0/grad is illegal; using model/advantage/weights_0/grad instead.


INFO:tensorflow:Summary name model/advantage/weights:0 is illegal; using model/advantage/weights_0 instead.


[2017-02-19 23:41:14,607] Summary name model/advantage/weights:0 is illegal; using model/advantage/weights_0 instead.


INFO:tensorflow:Summary name model/advantage/biases:0/grad is illegal; using model/advantage/biases_0/grad instead.


[2017-02-19 23:41:14,610] Summary name model/advantage/biases:0/grad is illegal; using model/advantage/biases_0/grad instead.


INFO:tensorflow:Summary name model/advantage/biases:0 is illegal; using model/advantage/biases_0 instead.


[2017-02-19 23:41:14,614] Summary name model/advantage/biases:0 is illegal; using model/advantage/biases_0 instead.


In [15]:
ql.Init(sess, 'practice/lander-ql-1')

In [19]:
episode_rew = 0.
episode_len = 0.
old_s = env.reset()
for i in range(100500100):
    ss, aa, rr, ss1, gg = [], [], [], [], []
    done = False
    if i % 5 == 0:
        while not done and len(ss) < ROLLOUT_LEN:
            epsilon = 0.3 / (1 + (ql.cur_step or 0.) / 500000.)
            if np.random.sample() < epsilon:
                a = env.action_space.sample()
            else:
                a = sess.run(ql.act_s1, {ql.state1: np.reshape(old_s, (1, -1)),
                                         ql.is_training: False})
                a = a[0]

            s, r, done, _ = env.step(a)

            ss.append(old_s)
            aa.append(a)
            rr.append(r)
            ss1.append(s)
            gg.append(GAMMA if not done else 0.0)

            episode_rew += r
            episode_len += 1

            old_s = s

        rew = 0.
        g = 1.
        for i in reversed(range(len(ss))):
            rew = rr[i] + gg[i] * rew
            g *= gg[i]
            ss1[i] = old_s
            rr[i] = r
            gg[i] = g

        if done:
            ql.Stat({'Env/Reward': episode_rew, 'Env/Length': episode_len})
            episode_rew, episode_len = 0., 0.
            old_s = env.reset()

        buf.add(np.array(ss), np.array(aa), np.array(rr), np.array(ss1), np.array(gg), 100)

    ql.Step(sess)

Updated target network
Updated target network
Updated target network
Updated target network
Updated target network
Updated target network
Updated target network
Updated target network
Updated target network
Updated target network
Updated target network
Updated target network
Updated target network
Updated target network
Updated target network


KeyboardInterrupt: 