# MDPs in TensorFlow - Reservoir with Noisy Transitions

In this IPython notebook, we'll explore **Continuous State-Action MDPs** with stochastic transitions in TensorFlow. All stochastic transitions will be defined by a deterministic function combined with external noise that is considered an input to the MDP cell.

## Imports

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import os
import sys
import inspect

# Locate the file the current frame was compiled from, then put the parent of
# its directory at the front of sys.path (presumably so the `tf_mdp` package
# imported below can be resolved -- confirm against the repository layout).
_this_file = os.path.abspath(inspect.getfile(inspect.currentframe()))
currentdir = os.path.dirname(_this_file)
parentdir = os.path.dirname(currentdir)
sys.path.insert(0, parentdir)


from tf_mdp import utils
from tf_mdp.models import mdp 
import functools
import time

# import utils

%matplotlib inline

## Modeling MDPs in TensorFlow

### Multi-reservoir control with non-linear, noisy water loss

In [2]:
# Problem parameters for the reservoir MDP. All numeric bounds and noise
# parameters are written as floats for consistency (the bounds are cast to
# float32 by Reservoir_non_linear.__init__).
reserv_dict = {
    'n_reservoirs': 3,
    # Per-reservoir level bounds; levels outside them are penalized in
    # the reward.
    'upper_bounds': [200.0, 200.0, 200.0],
    'lower_bounds': [10.0, 10.0, 10.0],
    'initial_states': [105.0, 105.0, 105.0],
    # Mean / standard deviation of the Normal rain noise.
    'rain_mean': 5.0,
    'rain_std': 1.0,
    # Standard deviation of the Normal water-loss noise.
    'evaporation_std': 5.0,
}

In [3]:
class Reservoir_non_linear(mdp.TF_MDP):
    """
    Class that encodes an MDP reservoir scenario with noisy, non-linear
    dynamics.

    Each reservoir level is increased by sampled rain and decreased by the
    action and by a sampled water loss whose mean depends non-linearly on
    the current level (see :meth:`transition`).

    :param graph: computation graph in which all ops are created
    :type graph: tf.Graph
    :param reserv_dict: specific parameters of the problem; the keys read
                        here are ``n_reservoirs``, ``rain_mean``,
                        ``rain_std``, ``evaporation_std``,
                        ``lower_bounds`` and ``upper_bounds``
    :type reserv_dict: dict
    """
    def __init__(self,
                 graph,
                 reserv_dict):
        self.graph = graph
        self.n_reservoirs = reserv_dict["n_reservoirs"]
        self.rain_mean = reserv_dict["rain_mean"]
        self.rain_std = reserv_dict["rain_std"]
        self.e_t_std = reserv_dict["evaporation_std"]
        self.lower = np.array(reserv_dict['lower_bounds'], dtype="float32")
        self.upper = np.array(reserv_dict['upper_bounds'], dtype="float32")
        # Midpoint of each reservoir's [lower, upper] range; the reward
        # penalizes the distance of the state from it.
        self.halfs = (self.upper + self.lower) / 2.0
        # Largest (upper - lower) range over all reservoirs; used to scale
        # the state inside the sine term of the water-loss mean.
        self.max_capacity_largest = np.max(self.upper - self.lower)
        # Upper clipping value for the next state: large enough that, in
        # practice, only the lower clip at 0.0 is active.
        self.max_capacity = float(sys.maxsize)

    @property
    def action_size(self):
        """Number of action components (one per reservoir)."""
        return self.n_reservoirs

    @property
    def state_size(self):
        """Number of state components (one per reservoir)."""
        return self.n_reservoirs

    def transition(self, state, action):
        """
        Takes one step on the MDP.

        The next state is ``state + r_t - e_t - action`` clipped to
        ``[0.0, self.max_capacity]``, where ``r_t`` is sampled from
        ``Normal(rain_mean, rain_std)`` and ``e_t`` is sampled from
        ``Normal(0.5 * state * sin(state / max_capacity_largest), e_t_std)``.

        :param state: MDP state
        :type state: tf.Tensor
                     shape=(batch_size,
                            self.state_size)
                     dtype=float32

        :param action: MDP action
        :type action: tf.Tensor
                      shape=(batch_size,
                             self.action_size)
                      dtype=float32

        :rtype: tf.Tensor
                shape=(batch_size, self.state_size)
                dtype=float32
        """
        with self.graph.as_default():
            with tf.name_scope("random_variables"):
                sin_e_t = tf.sin(state / self.max_capacity_largest)
                loc_e_t = tf.multiply(0.5 * state, sin_e_t)
                rain_noise = tf.distributions.Normal(loc=self.rain_mean,
                                                     scale=self.rain_std)
                water_loss_noise = tf.distributions.Normal(loc=loc_e_t,
                                                           scale=self.e_t_std)
                # Use the dynamic shape so sampling also works when the
                # batch dimension is not statically known.
                # NOTE(review): both samples use the same op-level seed;
                # under TF's seed semantics this may make the rain and
                # water-loss noise share the same underlying random draws.
                # Confirm whether independent noise was intended.
                r_t = rain_noise.sample(tf.shape(state), seed=1)
                e_t = water_loss_noise.sample(seed=1)

            with tf.name_scope("transition"):
                new_state = state + r_t - e_t - action
                new_state = tf.clip_by_value(new_state,
                                             clip_value_min=0.0,
                                             clip_value_max=self.max_capacity)

        return new_state

    def reward(self, state, action):
        """
        Calculates the reward.

        Per reservoir the reward is zero while ``lower <= state <= upper``,
        ``-5 * (lower - state)`` below the lower bound and
        ``-100 * (state - upper)`` above the upper bound, plus
        ``-0.1 * |halfs - state|`` in every case. The per-reservoir values
        are summed over axis 1.

        :param state: MDP state
        :type state: tf.Tensor
                     shape=(batch_size,
                            self.state_size)
                     dtype=float32

        :param action: MDP action; not used by this reward
        :type action: tf.Tensor or None

        :rtype: tf.Tensor
                shape=(batch_size, 1)
                dtype=float32
        """
        with self.graph.as_default():
            with tf.name_scope("reward"):
                lower_comparison = tf.greater_equal(state,
                                                    self.lower)
                upper_comparison = tf.greater_equal(self.upper,
                                                    state)
                in_bounds = tf.logical_and(lower_comparison,
                                           upper_comparison)
                below_bounds = tf.less(state, self.lower)
                out_of_bounds_penalty = tf.where(
                    below_bounds,
                    - 5 * (self.lower - state),
                    - 100 * (state - self.upper))
                # zeros_like follows the runtime shape of `state`, so an
                # unknown batch dimension is supported.
                rewards = tf.where(in_bounds,
                                   tf.zeros_like(state),
                                   out_of_bounds_penalty)
                rewards += tf.abs(self.halfs - state) * (-0.1)
                # `keep_dims` is the TF 1.x argument name; it keeps the
                # summed axis so the result has shape (batch_size, 1).
                rewards = tf.reduce_sum(rewards, 1, keep_dims=True)

        return rewards


In [4]:
# Sample batch of two state vectors (one row per batch entry, one column
# per reservoir).
state_batch = np.array([
    [105.0, 105.0, 105.0],
    [100.0, 20.0, 50.0],
])

# Matching batch of two action vectors, same layout as `state_batch`.
action_batch = np.array([
    [100.0, 45.0, 25.0],
    [50.0, 20.0, 10.0],
])

In [5]:
# Build the one-step transition op inside a dedicated graph.
my_graph = tf.Graph()
reser = Reservoir_non_linear(my_graph, reserv_dict)
with my_graph.as_default():
    states = tf.constant(state_batch, dtype=tf.float32)
    actions = tf.constant(action_batch, dtype=tf.float32)
    new_state = reser.transition(states, actions)

# Evaluate the op; `new_state` is rebound from the tensor to its value.
# NOTE(review): the session is not closed in the visible code.
sess = tf.Session(graph=my_graph)
new_state = sess.run(new_state)
new_state

array([[  0.        ,  31.50283813,  57.17991638],
       [ 39.65325928,   3.55231857,  36.13182831]], dtype=float32)