# Adding TensorBoard visualization to MDPs in TensorFlow - Navigation with Noisy Transitions

## Imports

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

import abc
import functools
import time
import os
import shutil

%matplotlib inline

## Helper functions for cleaning and creating the log and save folders

In [None]:
def clean_folder(folder_name):
    """
    Delete the folder 'folder_name' together with everything inside it.

    The name is resolved against the current working directory.
    Nothing happens when the resulting path does not exist.

    :type folder_name: str
    """
    target = os.path.join(os.getcwd(), folder_name)
    if not os.path.exists(target):
        # nothing to clean
        return
    shutil.rmtree(target)


def get_time_stamp():
    """
    Build a time stamp from the current local time, laid out as
    day-month-year_hour-minute-second, e.g. 12-11-2016_18-20-45

    :rtype: str
    """
    stamp_format = '%d-%m-%Y_%H-%M-%S'
    return time.strftime(stamp_format, time.localtime())

## Modeling MDPs in TensorFlow (as before)

All classes defining MDPs must inherit from the abstract class `MDP`.

In [None]:
class MDP(metaclass=abc.ABCMeta):
    """
    Abstract interface every MDP model must implement.

    A concrete subclass has to provide the two size properties and the
    two step functions below; until all four are overridden the class
    cannot be instantiated.
    """

    # ``abc.abstractproperty`` is deprecated since Python 3.3; stacking
    # ``property`` on top of ``abc.abstractmethod`` is the supported spelling.
    @property
    @abc.abstractmethod
    def action_size(self):
        """
        Dimensionality of an action.
        """
        return

    @property
    @abc.abstractmethod
    def state_size(self):
        """
        Dimensionality of a state.
        """
        return

    @abc.abstractmethod
    def transition(self, state, action):
        """
        Compute the state reached by applying 'action' in 'state'.

        Subclasses may accept further arguments (the Navigation class in
        this notebook also takes a 'noise' argument).
        """
        return

    @abc.abstractmethod
    def reward(self, state, action):
        """
        Compute the reward associated with 'state' and 'action'.
        """
        return


## New version of the navigation class

In [None]:
class Navigation(MDP):
    """
    Class that encodes an MDP navigation scenario: an agent moves on a
    bounded grid towards a goal, its actions are rotated by noise and
    scaled by a deceleration factor that depends on the distance to the
    grid center.

    :type graph: tf.Graph
    :type kwargs: dict
    """

    def __init__(self, graph, **kwargs):
        """
        Store every keyword argument as an instance attribute and create
        the float32 constants used by the model inside 'graph'.

        The constructor reads the attributes 'size', 'center', 'goal' and
        'decay'; 'ndim' is read by the size properties below, so all of
        them must be present in 'kwargs'.

        :type graph: tf.Graph
        :type kwargs: dict
        """
        # every key of kwargs becomes an attribute (self.size, self.goal, ...)
        self.__dict__.update(kwargs)
        with graph.as_default():
            with tf.name_scope("grid_constants"):
                self._size = tf.constant(self.size, dtype=tf.float32, name="size")
                self._center = tf.constant(self.center, dtype=tf.float32)
                self._goal = tf.constant(self.goal, dtype=tf.float32)
            with tf.name_scope("numerical_constants"):
                # literal values used by the formulas in transition()
                self._0_00 = tf.constant(0.00, dtype=tf.float32)
                self._1_00 = tf.constant(1.00, dtype=tf.float32)
                self._2_00 = tf.constant(2.00, dtype=tf.float32)
                self._8_00 = tf.constant(8.00, dtype=tf.float32)
                self._decay = tf.constant(self.decay, dtype=tf.float32)

    @property
    def action_size(self):
        """
        Dimensionality of an action (same as the grid dimensionality).

        :rtype: int
        """
        return self.ndim

    @property
    def state_size(self):
        """
        Dimensionality of a state (same as the grid dimensionality).

        :rtype: int
        """
        return self.ndim

    def transition(self, state, action, noise):
        """
        Takes one transition step on the MDP.

        The action is first rotated by a noisy angle whose maximum grows
        with the action's norm, then scaled by a deceleration factor that
        is 0 at the grid center and tends to 1 far away from it, and
        finally added to the state. The result is clipped to [0, size].

        NOTE: the rotation is built as a 2x2 matrix, so this method only
        works for 2-dimensional states and actions.

        :type state: tf.Tensor
                     shape=(batch_size,
                            self.ndim)
                     dtype=float32

        :type action: tf.Tensor
                     shape=(batch_size,
                            self.ndim)
                     dtype=float32

        :type noise: tf.Tensor
                     shape=(batch_size, 1)
                     dtype=float32
                     multiplier applied to the rotation angle

        :rtype: tf.Tensor
                shape=(batch_size,
                       self.ndim)
                dtype=float32
        """

        # velocity = euclidean norm of each action, shape (batch_size, 1)
        velocity = tf.norm(action, axis=1, keep_dims=True)
        
        # logistic attenuation centered at velocity 1:
        # f(x) = 1 / (1 + exp(-8(x-1)))
        atenuation = self._1_00 / (self._1_00 + tf.exp(-self._8_00 * (velocity - self._1_00)))
        max_theta = 20  # degrees
        # rotation angle (in degrees) before the noise multiplier is applied
        theta =  max_theta * atenuation

        # apply rotation noise (angle converted from degrees to radians)
        cos, sin = tf.cos(theta * np.pi / 180 * noise), tf.sin(theta * np.pi / 180 * noise)
        
        # one rotation matrix [[cos, -sin], [sin, cos]] per batch entry
        noise_matrix = tf.stack([cos, -sin, sin, cos], axis=1)
        noise_matrix = tf.reshape(noise_matrix, [-1, 2, 2])
        # rotate each action, treated as a column vector, then flatten it back
        noisy_action = tf.matmul(noise_matrix, tf.reshape(action, [-1, 2, 1]))
        noisy_action = tf.reshape(noisy_action, [-1, 2])
        
        # distance to center of grid
        d = tf.sqrt(tf.reduce_sum(tf.square(state - self._center), 1, keep_dims=True))

        # deceleration_factor: 2 / (1 + exp(-decay * d)) - 1
        deceleration = self._2_00 / (self._1_00 + tf.exp(-self._decay * d)) - self._1_00
        # disabled alternative (constant factor, no slowdown):
        # deceleration = self.__1_00
        
        # next position, kept inside the grid
        next_state = state + deceleration * noisy_action
        next_state = tf.clip_by_value(next_state, self._0_00, self._size)

        return next_state

    def reward(self, state, action=None):
        """
        Calculates the negative L-2 norm (euclidean distance) between
        the state and the goal; actions are not used.

        :type state: tf.Tensor
                     shape=(batch_size,
                            self.ndim)
                     dtype=float32

        :type action: None

        :rtype: tf.Tensor
                shape=(batch_size, 1)
                dtype=float32
        """
        return -tf.sqrt(tf.reduce_sum(tf.square(state - self._goal), 1, keep_dims=True))

## Encoding an MDP as a Recurrent Neural Net (as before, with docstrings added)

In [None]:
class MDP_RNNCell(tf.nn.rnn_cell.RNNCell):
    """
    RNN cell whose recurrent state is the MDP state: every call asks the
    policy for an action, lets the MDP take one step and emits the
    reward, the new state and the action as the cell output.

    :type mdp: MDP

    :type policy: function
                     -input: tf.Tensor
                            shape=(batch_size,
                                   ndim)
                            dtype=float32
                     -output: tf.Tensor
                            shape=(batch_size,
                                   ndim)
                            dtype=float32
    """

    def __init__(self, mdp, policy):
        self.mdp, self.policy = mdp, policy

    @property
    def state_size(self):
        """
        Size of the recurrent state, i.e. of an MDP state.

        :rtype: int
        """
        return self.mdp.state_size

    @property
    def output_size(self):
        """
        Size of one output row: state, action and one reward value.

        :rtype: int
        """
        reward_size = 1
        return self.mdp.state_size + self.mdp.action_size + reward_size

    def __call__(self, inputs, state, scope=None):
        """
        Perform one step of the rnn

        :type inputs: tf.Tensor
                      shape=(batch_size,
                             1)
                      dtype=float32
                      noise handed over to the MDP transition

        :type state: tf.Tensor
                     shape=(batch_size,
                            mdp.state_size)
                     dtype=float32

        :rtype output: tf.Tensor
                       shape=(batch_size,
                              mdp.state_size
                              + mdp.action_size
                              + 1)
                       dtype=float32
                       columns ordered as reward, next state, action

        :rtype next_state: tf.Tensor
                           shape=(batch_size,
                                  mdp.state_size)
                           dtype=float32
        """
        mdp = self.mdp

        # the policy network picks the action for the current state
        chosen_action = self.policy(state)

        # one noisy MDP step; the reward is evaluated on the state reached
        successor = mdp.transition(state, chosen_action, inputs)
        step_reward = mdp.reward(successor, chosen_action)

        return tf.concat([step_reward, successor, chosen_action], 1), successor


## Policy as a Neural Net (as before, with docstrings added)

In [None]:
def policy_network(layers, state, limits=1.0):
    """
    Feed-forward neural network used to approximate the optimal policy.

    layers[0] is the input width and must match the second dimension of
    'state'; every following entry adds one dense layer of that width.
    All dense layers use relu except the last one, which uses tanh and
    is then multiplied by 'limits'.

    :type layers: [int]

    :type state: tf.Tensor
                 shape=(batch_size,
                        ndim)
                 dtype=float32

    :type limits: float

    :rtype: tf.Tensor
            shape=(batch_size,
                   ndim)
            dtype=float32
    """
    assert(layers[0] == state.shape[1])

    # 1-based position of the final (tanh) dense layer
    output_layer = len(layers) - 1

    with tf.variable_scope('policy'):
        hidden = state
        for depth, width in enumerate(layers[1:], start=1):
            nonlinearity = tf.nn.tanh if depth == output_layer else tf.nn.relu
            hidden = tf.layers.dense(hidden,
                                     units=width,
                                     activation=nonlinearity,
                                     kernel_initializer=tf.glorot_normal_initializer(),
                                     name="layer"+str(depth))

        # the tanh output lies in [-1, 1]; scale it by the action limits
        action = tf.constant(limits) * hidden

    return action

## A class to hold all the hyperparams

In [None]:
class Config(object):
    """
    Holds model hyperparams.

    'noise_ratio' is the fraction of the batch that receives random
    noise during training (the remaining rows get zero noise), so it
    has to lie in the interval [0, 1].

    :type batch_size: int
    :type max_time: int
    :type epoch: int
    :type noise_ratio: float
    :type learning_rate: float

    :raises ValueError: if noise_ratio is not within [0, 1]
    """
    def __init__(self,
                 batch_size=10000,
                 max_time=9,
                 epoch=100,
                 noise_ratio = 1.0,
                 learning_rate=0.1):
        # fail early: a ratio outside [0, 1] would make the training loop
        # ask numpy for an array with a negative number of rows
        # (the chained comparison is also False for NaN)
        if not 0.0 <= noise_ratio <= 1.0:
            raise ValueError(
                "noise_ratio must be within [0, 1], got {!r}".format(noise_ratio))
        self.batch_size = batch_size
        self.max_time = max_time
        self.epoch = epoch
        self.noise_ratio = noise_ratio
        self.learning_rate = learning_rate


## A wrapper for the navigation task

In [None]:
class NAV_Wrapper(object):
    """
    Wrapper for the Navigation MDP: builds the TensorFlow graph that
    unrolls the MDP with a policy network and trains that policy.

    TensorBoard logs are written to 'graphs/<name>' and the checkpoint
    to 'models/<name>/model.ckpt'. When no name is given a time stamp
    is used instead.

    :type params: dict
    :type config: Config
    :type policy: function
    :type name: None or str
    """
    def __init__(self,
                 params,
                 config,
                 policy,
                 name=None):
        run_name = name if name is not None else get_time_stamp()
        self.log_path = os.path.join('graphs', run_name)
        self.save_path = os.path.join('models', run_name, 'model.ckpt')
        # create the checkpoint's own folder, not just 'models': the saver
        # can refuse to write when the parent directory is missing
        os.makedirs(os.path.dirname(self.save_path), exist_ok=True)
        self.config = config
        self.graph = tf.Graph()
        self.mdp = Navigation(self.graph, **params)
        self.policy = policy
        self.build_graph()

    def build_graph(self):
        """
        Build the tensorflow graph

        Creates the noise placeholder, unrolls the MDP for
        config.max_time steps, and defines the total reward, the loss
        (mean squared total reward), the summaries, the optimizer, the
        variable initializer and the saver.

        NOTE: the initial state and the slicing of the rnn outputs are
        written for a 2-dimensional MDP (x and y coordinates).
        """
        max_time = self.config.max_time
        batch_size = self.config.batch_size
        learning_rate = self.config.learning_rate
        ndim = self.mdp.ndim
        with self.graph.as_default():
            with tf.name_scope("grid_constants"):
                # every trajectory of the batch starts at the same position
                x_initial, y_initial = self.mdp.initial
                x_initial = tf.fill([batch_size], tf.constant(x_initial, tf.float32))
                y_initial = tf.fill([batch_size], tf.constant(y_initial, tf.float32))
                initial_state = tf.stack([x_initial, y_initial], axis=1)

            with tf.name_scope("Noise"):
                # one noise value per trajectory and time step
                self.inputs = tf.placeholder(tf.float32, shape=[None, max_time, 1], name="inputs")

            with tf.name_scope("RNNCell"):
                cell = MDP_RNNCell(self.mdp, self.policy)
                outputs, self.final_state = tf.nn.dynamic_rnn(cell,
                                                         self.inputs,
                                                         initial_state=initial_state,
                                                         dtype=tf.float32)

                # split the cell output columns: reward, state (2), action (2)
                outputs = tf.unstack(outputs, axis=2)

                self.rewards = tf.reshape(outputs[0], [-1, max_time, 1])
                self.states  = tf.stack(outputs[1:3], axis=2)
                self.actions = tf.stack(outputs[3:5], axis=2)

            with tf.name_scope("rewards"):
                # total reward of each trajectory, shape (batch, 1)
                self.total = tf.reduce_sum(self.rewards, 1)
                
            with tf.name_scope("last_state_summary"):
                # final position of the trajectory with the highest total reward
                last_states = tf.slice(self.states, [0, max_time -1, 0], [-1, -1, -1])
                last_states = tf.reshape(last_states, (batch_size, ndim))
                best_batch = tf.argmax(self.total,0)
                best_last_state = tf.nn.embedding_lookup(last_states, best_batch)
                best_last_state = tf.reshape(best_last_state, (ndim, 1))
                tf.summary.histogram('last_states_summ', best_last_state)

            with tf.name_scope("loss"):
                self.loss  = tf.reduce_mean(tf.square(self.total))
                tf.summary.scalar("loss", self.loss)

            with tf.name_scope("optimizer"):
                self.train_step = tf.train.RMSPropOptimizer(learning_rate).minimize(self.loss)

            with tf.name_scope("global_initializer"):
                self.init_op = tf.global_variables_initializer()

            with tf.name_scope("saver"):
                self.saver = tf.train.Saver()
                
    def train(self, show_progress=True):
        """
        Train the model, log all the progress in tensorboard
        and save the model parameters in 'self.save_path'

        After the call the instance holds:
          - losses: list with the loss of every epoch
          - variables: dict mapping variable name to its trained value
          - total_cost_per_batch: total reward of every trajectory in the
            last epoch (None when config.epoch is 0)
          - uptime: elapsed wall-clock time in seconds

        :type show_progress: boolean
        """
        start = time.time()
        epoch = self.config.epoch
        batch_size = self.config.batch_size
        max_time = self.config.max_time

        # the first rows of the batch receive random noise, the rest none
        noisy_rows = int(self.config.noise_ratio * batch_size)
        noise_size = (noisy_rows, max_time, 1)
        # the zero block never changes, so build it once outside the loop
        zero_noise = np.zeros(shape=(batch_size - noisy_rows, max_time, 1)).astype(np.float32)

        # stays None when no epoch is run
        total = None

        with tf.Session(graph=self.graph) as sess:

            # writing in tensorboard
            summary_writer = tf.summary.FileWriter(self.log_path, sess.graph)
            try:
                all_summaries = tf.summary.merge_all()

                # initialize variables
                sess.run(self.init_op)
                self.losses = []

                for epoch_idx in range(epoch):
                    # sample noise data from uniform distribution in [-1, 1)
                    random_noise = np.random.uniform(low=-1.0, high=1.0, size=noise_size).astype(np.float32)

                    # noise
                    inputs_data = np.concatenate([random_noise, zero_noise], axis=0)

                    # backprop and update weights
                    _, loss, summary, total = sess.run([self.train_step,
                                                        self.loss,
                                                        all_summaries,
                                                        self.total],
                                                       feed_dict={self.inputs: inputs_data})
                    # writing the log
                    summary_writer.add_summary(summary, epoch_idx)
                    summary_writer.flush()

                    # store and show loss information
                    self.losses.append(loss)
                    if show_progress:
                        print('Epoch {0:5}: loss = {1}\r'.format(epoch_idx, loss), end='')
            finally:
                # release the event file even if an epoch fails
                summary_writer.close()

            self.variables = sess.run({ var.name: var for var in tf.trainable_variables() })
            self.total_cost_per_batch = total
            
            # save model (the folder may have been removed since construction)
            os.makedirs(os.path.dirname(self.save_path), exist_ok=True)
            save_path = self.saver.save(sess, self.save_path)
            print("\nModel saved in file: %s" % save_path)
            print("\n&&&&&&&&& For TensorBoard visualization type &&&&&&&&&&&")
            print("\ntensorboard  --logdir {}\n".format(self.log_path))

        end = time.time()
        self.uptime = end - start
        print("Done in {0:.6f} sec".format(self.uptime))
        


## Run a training with the wrapper class using the default hyperparams 

In [None]:
# start from a clean slate: drop the logs and checkpoints of earlier runs
clean_folder("graphs")
clean_folder("models")

# keyword arguments forwarded to Navigation, which stores each key
# as an instance attribute
mdp_params = {
    'ndim': 2,                # dimensionality of states and actions
    'size': (10.0, 10.0),     # upper bound used when clipping the next state
    'initial': (1.0, 5.0),    # starting position of every trajectory
    'goal': (8.0, 5.0),       # the reward is minus the distance to this point
    'center': (5.0, 5.0),     # deceleration depends on the distance to this point
    'decay': 2.0,             # steepness of the deceleration curve
    # NOTE(review): 'limits' is not read by Navigation or NAV_Wrapper in
    # the code above -- confirm whether it is still needed
    'limits': (-1.0, 1.0)
}

# input width 2 (must match the state dimension), two relu hidden layers
# of 20 and 5 units, and a tanh output layer of 2 units
layers = [2, 20, 5, 2]
# bind the layer sizes so the policy can be called as policy(state)
policy = functools.partial(policy_network, layers)

# default hyperparams; no run name is given, so the log and model
# folders are named after the current time stamp
my_config = Config()
nav = NAV_Wrapper(mdp_params, my_config, policy)
nav.train()

## You can run TensorBoard by uncommenting and running the cell below

In [None]:
# ! tensorboard  --logdir <folder_name>

## I tried to add 3 visualizations in TensorBoard


### A plot of the loss over the different epochs


<img src="files/img/loss.png" width="600px">

### The graph of the model

<img src="files/img/graph.png" width="600px">

### A summary of the final position reached by the best planner across the different epochs

<img src="files/img/summary.png" width="600px">








