# Introduction

## Imports

In [None]:
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

import time

%matplotlib inline

## Modeling MDPs in TensorFlow

All classes defining MDPs must inherit from the abstract class `MDP`.

In [None]:
import abc

class MDP(metaclass=abc.ABCMeta):
    """Abstract interface shared by the MDP models in this notebook.

    Concrete subclasses must override both `transition` and `reward`;
    the class cannot be instantiated until they do.
    """

    @abc.abstractmethod
    def transition(self, state, action):
        """Hook for the dynamics of the model; this base version returns None."""

    @abc.abstractmethod
    def reward(self, state, action):
        """Hook for the reward of the model; this base version returns None."""


### Navigation in 2D grid with deceleration zone at the center

In [None]:
class Navigation(MDP):
    """Navigation MDP whose moves are damped close to the grid center.

    The constructor expects (at least) the keyword arguments `ndim`,
    `size`, `center`, `goal` and `decay`; any other keyword argument is
    stored on the instance untouched.
    """

    def __init__(self, **kwargs):
        # every keyword argument becomes an instance attribute
        vars(self).update(kwargs)

        # states and actions live in the same `ndim`-dimensional space
        self.state_size = self.ndim
        self.action_size = self.ndim

        def as_float32(value):
            return tf.constant(value, dtype=tf.float32)

        # grid constants
        self.__size = as_float32(self.size)
        self.__center = as_float32(self.center)
        self.__goal = as_float32(self.goal)

        # numerical constants
        self.__0_00 = as_float32(0.00)
        self.__0_99 = as_float32(0.99)
        self.__1_00 = as_float32(1.00)
        self.__2_00 = as_float32(2.00)
        self.__decay = as_float32(self.decay)

    def transition(self, state, action):
        """Return the next position after applying a damped `action`.

        The action is scaled by ``2 / (1 + exp(-decay * d)) - 1`` where
        ``d`` is the Euclidean distance (reduced over axis 1) between
        `state` and the grid center: the factor is 0 at the center and,
        for a positive `decay`, approaches 1 far from it. The result is
        clipped to the box ``[0, size]``.
        """
        # distance to the center of the grid
        offset = state - self.__center
        distance = tf.sqrt(tf.reduce_sum(tf.square(offset), 1, keep_dims=True))

        # deceleration factor
        denominator = self.__1_00 + tf.exp(-self.__decay * distance)
        damping = self.__2_00 / denominator - self.__1_00

        # next position, kept inside the grid
        moved = state + damping * action
        return tf.clip_by_value(moved, self.__0_00, self.__size)

    def reward(self, state, action):
        """Return minus the L1 distance (over axis 1) from `state` to the goal.

        `action` is accepted for interface compatibility and is not used.
        """
        l1_distance = tf.reduce_sum(tf.abs(state - self.__goal), 1, keep_dims=True)
        return -l1_distance


## Encoding an MDP as a Recurrent Neural Net

In [None]:
def policy(state, limits=1.0, n_h=1000):
    """Build a one-hidden-layer policy network and return its action tensor.

    The network is ``limits * tanh(W2 @ relu(W1 @ state^T + b1) + b2)``,
    transposed back so that the first axis matches the first axis of
    `state`. The variables are created with `tf.get_variable` under the
    names "W1", "b1", "W2" and "b2"; the output layer has 2 units.

    Args:
        state: 2-D tensor; its second dimension sets the input width.
        limits: scalar multiplying the tanh output.
        n_h: number of hidden units.
    """
    def weight(name, shape):
        return tf.get_variable(name, shape=shape, dtype=tf.float32,
                               initializer=tf.glorot_normal_initializer())

    def bias(name, shape):
        return tf.get_variable(name, shape=shape, dtype=tf.float32,
                               initializer=tf.constant_initializer(0.0))

    n_in = state.shape[1]

    # parameters (column-vector convention: weights are [out, in])
    W1 = weight("W1", [n_h, n_in])
    b1 = bias("b1", [n_h, 1])
    W2 = weight("W2", [2, n_h])
    b2 = bias("b2", [2, 1])

    # forward pass
    hidden = tf.nn.relu(tf.matmul(W1, tf.transpose(state)) + b1)
    scaled = tf.constant(limits) * tf.nn.tanh(tf.matmul(W2, hidden) + b2)

    return tf.transpose(scaled)


In [None]:
class MDP_RNNCell(tf.nn.rnn_cell.RNNCell):
    """RNN cell that advances an MDP by one time step under a policy.

    The recurrent state is the MDP state. At every step the cell asks
    `policy` for an action, rotates it by a noise-driven angle, applies
    the MDP transition and evaluates the reward at the resulting state.

    Args:
        mdp: object exposing `state_size`, `action_size`,
            `transition(state, action)` and `reward(state, action)`.
        policy: callable mapping a state tensor to an action tensor.
    """

    def __init__(self, mdp, policy):
        self.mdp = mdp
        self.policy = policy

    @property
    def state_size(self):
        # read the wrapped MDP, not whatever module-level `mdp` happens to exist
        return self.mdp.state_size

    @property
    def output_size(self):
        # reward (1) + next state + action + noisy action + (cos, sin) pair (2)
        return self.mdp.state_size + 2 * self.mdp.action_size + 1 + 2

    def __call__(self, inputs, state, scope=None):
        """Run one MDP step.

        Args:
            inputs: noise tensor for this step; it scales the rotation
                angle (assumed shape [batch, 1] - TODO confirm with caller).
            state: current MDP state.
            scope: unused.

        Returns:
            A pair ``(output, next_state)`` where `output` concatenates,
            along axis 1, the reward, the next state, the clean action,
            the noisy action and the (cos, sin) of the rotation angle.
        """
        # choose action from policy
        action = self.policy(state)

        # apply rotation noise: the angle is `theta` degrees times the input
        theta = 15  # degrees
        angle = theta * np.pi / 180 * inputs
        cos, sin = tf.cos(angle), tf.sin(angle)

        # 2x2 rotation matrix per batch entry (the rotation is 2-D only)
        noise = tf.stack([cos, -sin, sin, cos], axis=1)
        noise = tf.reshape(noise, [-1, 2, 2])
        noisy_action = tf.matmul(noise, tf.reshape(action, [-1, 2, 1]))
        noisy_action = tf.reshape(noisy_action, [-1, 2])

        cos_sin = tf.reshape(tf.stack([cos, sin], axis=1), [-1, 2])

        # add MDP components to the RNN cell output;
        # the reward is evaluated at the state reached after the move
        next_state = self.mdp.transition(state, noisy_action)
        reward = self.mdp.reward(next_state, noisy_action)

        return tf.concat([reward, next_state, action, noisy_action, cos_sin], 1), next_state


In [None]:
class MDP_RNN(object):
    """Unrolls an `MDP_RNNCell` over time with `tf.nn.dynamic_rnn`."""

    def __init__(self, mdp, policy, batch_size=1):
        self.cell = MDP_RNNCell(mdp, policy)
        # only used to build a zero initial state when none is supplied
        self.batch_size = batch_size

    def unroll(self, inputs, initial_state=None):
        """Unroll the cell over the time axis of `inputs`.

        Args:
            inputs: noise tensor whose second dimension is the (static)
                number of time steps.
            initial_state: starting state; defaults to the cell's zero
                state for `self.batch_size` entries.

        Returns:
            Tuple ``(reward_series, state_series, action_series,
            noisy_action_series, final_state, cos_sin)``. The series are
            sliced from the cell output assuming one reward column
            followed by four 2-column groups.
        """
        # set initial state
        if initial_state is None:
            initial_state = self.cell.zero_state(self.batch_size, dtype=tf.float32)

        # dynamic time unrolling
        outputs, final_state = tf.nn.dynamic_rnn(
            self.cell, inputs, initial_state=initial_state, dtype=tf.float32)

        # split the output into one tensor per feature column
        columns = tf.unstack(outputs, axis=2)
        max_time = int(inputs.shape[1])

        def pair(first):
            # re-assemble two consecutive columns into a [..., 2] tensor
            return tf.stack(columns[first:first + 2], axis=2)

        reward_series = tf.reshape(columns[0], [-1, max_time, 1])
        state_series = pair(1)
        action_series = pair(3)
        noisy_action_series = pair(5)
        cos_sin = pair(7)

        return (reward_series, state_series, action_series,
                noisy_action_series, final_state, cos_sin)


## Defining the policy optimizer

In [None]:
class PolicyOptimizer(object):
    """Minimizes a loss tensor with RMSProp, feeding fresh noise each epoch.

    Args:
        loss: scalar tensor to minimize.
        learning_rate: RMSProp learning rate.
        size: shape of the noise array sampled at every epoch.
    """

    def __init__(self, loss, learning_rate, size):
        self.loss = loss

        # shape of the sampled input noise
        self.size = size

        # optimization hyperparameters
        self.learning_rate = learning_rate

        # backprop via RMSProp
        self.train_step = tf.train.RMSPropOptimizer(learning_rate).minimize(loss)

    def run(self, sess, epoch=100, show_progress=True):
        """Initialize all variables and run `epoch` training steps.

        NOTE: the noise is fed to the module-level `inputs` placeholder,
        which is looked up by name when this method executes.

        Returns:
            ``(losses, variables)``: the loss recorded at every epoch and
            a dict mapping each trainable variable name to its final value.
        """
        # initialize variables
        sess.run(tf.global_variables_initializer())

        fetches = [self.train_step, self.loss]
        losses = []
        for epoch_idx in range(epoch):
            # sample noise data
            noise = np.random.normal(size=self.size).astype(np.float32)

            # backprop and update weights
            _, step_loss = sess.run(fetches, feed_dict={inputs: noise})

            # store and show loss information
            losses.append(step_loss)
            if show_progress:
                print('Epoch {0:5}: loss = {1}\r'.format(epoch_idx, step_loss), end='')
        print()

        # snapshot of the trained parameters, keyed by variable name
        trained = {var.name: var for var in tf.trainable_variables()}
        variables = sess.run(trained)

        return losses, variables


## Putting it all together

### Instantiate the MDP model

In [None]:
# configuration of the navigation problem (all coordinates are (x, y) pairs)
params = dict(
    ndim=2,
    size=(10.0, 10.0),
    initial=(1.0, 5.0),
    goal=(9.0, 5.0),
    center=(5.0, 5.0),
    decay=2.0,
    limits=(-1.0, 1.0),
)

# MDP model: every entry of `params` is forwarded to Navigation as a keyword
# argument and stored on the instance
mdp = Navigation(**params)


### Train the Policy Network

In [None]:
def train(batch_size=1000, max_time=10, epoch=200, learning_rate=0.01):
    """Optimize the module-level `loss` in a fresh session and time the run.

    Noise of shape ``(batch_size, max_time, 1)`` is sampled at each epoch.

    Returns:
        ``(losses, variables, uptime)``: per-epoch losses, the trained
        variable values keyed by name, and the elapsed wall time in seconds.
    """
    noise_shape = (batch_size, max_time, 1)

    start = time.time()
    with tf.Session() as sess:
        optimizer = PolicyOptimizer(loss, learning_rate, noise_shape)
        losses, variables = optimizer.run(sess, epoch)
    uptime = time.time() - start

    print("Done in {0:.6f} sec".format(uptime))
    return losses, variables, uptime

In [None]:
# resolution of the grid of initial states
x_grid_size = y_grid_size = 100

# training hyperparameters: one trajectory per grid point
max_time = 9
batch_size = x_grid_size * y_grid_size
epoch = 100
learning_rate = 0.0005

# noise inputs, one value per trajectory and time step
inputs = tf.placeholder(tf.float32, shape=[None, max_time, 1], name="inputs")

# initial states: every (x, y) combination on a regular grid covering the map
x_grid = np.linspace(0.0, params['size'][0], x_grid_size)
y_grid = np.linspace(0.0, params['size'][1], y_grid_size)
initial_states_grid = [[x, y] for x in x_grid for y in y_grid]
initial_state = tf.constant(initial_states_grid)

# unrolled MDP model
rnn = MDP_RNN(mdp, policy, batch_size)
rewards, states, actions, _, final_state, _ = rnn.unroll(inputs, initial_state)

# loss: mean over trajectories of the squared total reward
total = tf.reduce_sum(rewards, 1)
loss = tf.reduce_mean(tf.square(total))

# train the policy network
losses, variables, uptime = train(batch_size, max_time, epoch, learning_rate)
# for name, var in variables.items():
#     print(name)
#     print(var)
#     print()


In [None]:
def simulate(series, batch_size=1, max_time=10):
    """Evaluate `series` once in a fresh session with newly sampled noise.

    All variables are (re)initialized before the run, and the noise of
    shape ``(batch_size, max_time, 1)`` is fed to the module-level
    `inputs` placeholder.

    Returns:
        Whatever `sess.run(series, ...)` returns for the given fetches.
    """
    noise_shape = (batch_size, max_time, 1)

    with tf.Session() as sess:
        # initialize variables
        sess.run(tf.global_variables_initializer())

        # sample noise data
        inputs_data = np.random.normal(size=noise_shape).astype(np.float32)

        # run MDP series
        return sess.run(series, feed_dict={inputs: inputs_data})

In [None]:
max_time = 9
batch_size = 3  # trajectories simulated per starting point

# Starting points share the x coordinate of params['initial'] and differ only
# by a vertical offset; the undisplaced start (offset 0) comes first, followed
# by the offsets in `delta`, each repeated `batch_size` times.
x_start, y_start = params['initial']
delta = [-3.0, -1.5, 1.5, 3.0]

# the x column is identical for every starting point, so build it once
x_initial = tf.fill([batch_size], tf.constant(x_start, tf.float32))
start_blocks = []
for delta_y in [0.0] + delta:
    y_initial = tf.fill([batch_size], tf.constant(y_start + delta_y, tf.float32))
    start_blocks.append(tf.stack([x_initial, y_initial], axis=1))
initial_state = tf.concat(start_blocks, axis=0)

# total number of trajectories, as a plain int so that NumPy and TensorFlow
# both receive an ordinary integer size
batch_size = int(initial_state.shape[0])

# inputs
inputs = tf.placeholder(tf.float32, shape=[batch_size, max_time, 1], name="inputs")

def trained_policy(state, init, limits=1.0, n_h=1000):
    """Rebuild the policy network with variables initialized from `init`.

    Same architecture as the training policy, but the variables are named
    "W1_new", "b1_new", "W2_new" and "b2_new" and start from the values
    stored in `init` under "rnn/W1:0", "rnn/b1:0", "rnn/W2:0" and
    "rnn/b2:0".

    Args:
        state: 2-D tensor; its second dimension sets the input width.
        init: mapping from trained variable names to their values.
        limits: scalar multiplying the tanh output.
        n_h: number of hidden units.
    """
    def restored(name, shape, source):
        # variable whose initial value is the trained value stored in `init`
        return tf.get_variable(name, shape=shape, dtype=tf.float32,
                               initializer=tf.constant_initializer(init[source]))

    n_in = state.shape[1]

    W1_new = restored("W1_new", [n_h, n_in], "rnn/W1:0")
    b1_new = restored("b1_new", [n_h, 1], "rnn/b1:0")
    W2_new = restored("W2_new", [2, n_h], "rnn/W2:0")
    b2_new = restored("b2_new", [2, 1], "rnn/b2:0")

    hidden = tf.nn.relu(tf.matmul(W1_new, tf.transpose(state)) + b1_new)
    scaled = tf.constant(limits) * tf.nn.tanh(tf.matmul(W2_new, hidden) + b2_new)

    return tf.transpose(scaled)

# unrolled MDP model
def policy(s):
    """Policy network restored from the trained `variables`."""
    return trained_policy(s, variables)


rnn = MDP_RNN(mdp, policy, batch_size)
rewards, states, actions, noisy_action, final_state, cos_sin = rnn.unroll(inputs, initial_state)

# simulate! (`cos_sin` is rebound from the graph tensor to its evaluated value)
r_series, s_series, a_series, n_series, cos_sin = simulate(
    [rewards, states, actions, noisy_action, cos_sin],
    batch_size,
    max_time,
)
# print(s_series)
# print()
# print(a_series)
# r_series, s_series, a_series, n_series, cos_sin = np.squeeze(r_series), np.squeeze(s_series), np.squeeze(a_series), np.squeeze(n_series), np.squeeze(cos_sin)
# print("Action, Noise, Noisy Action, State, Reward")
# i = 1
# for a, n, s, r, x in zip(a_series, n_series, s_series, r_series, cos_sin):
#     print(i, end=', ')
#     print("[{0:-10.6f}, {1:-10.6f}]".format(a[0], a[1]), end=', ')
#     print("[{0:-10.6f}, {1:-10.6f}]".format(x[0], x[1]), end=', ')
#     print("[{0:-10.6f}, {1:-10.6f}]".format(n[0], n[1]), end=', ')
#     print("[{0:-10.6f}, {1:-10.6f}]".format(s[0], s[1]), end=', ')
#     print("{0:-10.6f}".format(r))
#     i += 1

## Visualizing results

In [None]:
plt.figure(figsize=(15, 5))

# training loss per epoch, drawn on the left half of the figure
loss_ax = plt.subplot(121)
loss_ax.plot(losses, 'b-')
loss_ax.set_xlim(0, epoch)
loss_ax.set_title('Total Loss')
loss_ax.set_xlabel("# iterations")
loss_ax.set_ylabel("total loss")
loss_ax.grid()


In [None]:
def plot_navigation(ax, start, end, s_series, a_series, deceleration=True, initial=False):
    """Draw one trajectory on `ax`, optionally over the deceleration map.

    Grid limits, center and decay are read from the module-level `params`.

    Args:
        ax: matplotlib axes to draw on.
        start: (x, y) position the trajectory starts from.
        end: (x, y) goal position.
        s_series: visited states, one (x, y) row per time step.
        a_series: actions, one (dx, dy) row per time step.
        deceleration: draw the contour map of the deceleration factor.
        initial: mark every point of the module-level `initial_states_grid`.
    """
    # params
    xlim, ylim = params['size']
    xcenter, ycenter = params['center']

    # plot configuration
    ax.axis([0.0, xlim, 0.0, ylim])
    ax.set_aspect('equal')
    ax.grid()
    ax.set_xlabel("x coordinate")
    ax.set_ylabel("y coordinate")

    if deceleration:
        # evaluate the deceleration factor on a dense mesh over the grid
        npoints = 1000
        x_axis = np.linspace(0.0, xlim, npoints)
        y_axis = np.linspace(0.0, ylim, npoints)
        X, Y = np.meshgrid(x_axis, y_axis)
        D = np.sqrt((X - xcenter) ** 2 + (Y - ycenter) ** 2)
        Lambda = 2 / (1 + np.exp(-params['decay'] * D)) - 1.00

        # filled levels with dashed outlines on top
        levels = np.arange(0.0, 1.01, 0.10)
        ax.contourf(X, Y, Lambda, levels, cmap=plt.cm.bone)
        ax.contour(X, Y, Lambda, levels, colors='black', linestyles='dashed')

    if initial:
        grid_x = [point[0] for point in initial_states_grid]
        grid_y = [point[1] for point in initial_states_grid]
        ax.plot(grid_x, grid_y, 'gx')

    # full path: the starting point followed by every visited state
    positions = np.concatenate([[start], s_series])

    # actions, drawn as arrows leaving the position they were taken from
    ax.quiver(positions[:-1, 0], positions[:-1, 1], a_series[:, 0], a_series[:, 1],
              angles='xy', scale_units='xy', scale=1, color='dodgerblue', width=0.005,
              label='actions')

    # states
    ax.plot(positions[:, 0], positions[:, 1], '-', marker='o',
            color='darkblue', markersize=8, label='states')

    # start and end
    ax.plot([start[0]], [start[1]], marker='X', markersize=15,
            color='limegreen', label='initial')
    ax.plot([end[0]], [end[1]], marker='X', markersize=15,
            color='crimson', label='goal')


In [None]:
# One subplot per simulated trajectory: each row of the figure corresponds to
# one vertical offset of the starting point, each column to one repetition.
start, end = params['initial'], params['goal']
fig = plt.figure(figsize=(15, 25))
num_plots = int(initial_state.shape[0])
deltas = [0] + delta
rows = len(deltas)
cols = num_plots // rows
for i in range(num_plots):
    # add_subplot needs integer grid dimensions, so use the floor-divided
    # `cols` rather than a float quotient
    ax = fig.add_subplot(rows, cols, i + 1)
    # trajectories are ordered offset by offset, `cols` of them per offset
    idx = i // cols
    start = (params['initial'][0], params['initial'][1] + deltas[idx])
    plot_navigation(ax, start, end, s_series[i], a_series[i])