# In [None]:
import gym
import itertools
import matplotlib
import numpy as np
import sys
import tensorflow as tf
import collections

from gym.envs.toy_text import discrete

# Action indices. CliffWalkingEnv.__init__ keys its transition table by these
# values and pairs each one with a (row, column) delta on the grid.
UP = 0     # delta [-1, 0]
RIGHT = 1  # delta [0, 1]
DOWN = 2   # delta [1, 0]
LEFT = 3   # delta [0, -1]

class CliffWalkingEnv(discrete.DiscreteEnv):
    """Deterministic cliff-walking grid world on a 4x12 grid.

    Every episode starts in cell (3, 0). Cells (3, 1) through (3, 10) form
    the cliff: entering one yields a reward of -100 and ends the episode.
    Every other move yields a reward of -1, and entering cell (3, 11) ends
    the episode. Moves that would leave the grid are clamped to the border.
    States are the flattened (row-major) indices of the grid cells.
    """

    metadata = {'render.modes': ['human', 'ansi']}

    def _limit_coordinates(self, coord):
        """Clamp ``coord`` (row, column) to the grid in place and return it."""
        coord[0] = min(coord[0], self.shape[0] - 1)
        coord[0] = max(coord[0], 0)
        coord[1] = min(coord[1], self.shape[1] - 1)
        coord[1] = max(coord[1], 0)
        return coord

    def _calculate_transition_prob(self, current, delta):
        """Return the transition list for moving from ``current`` by ``delta``.

        Args:
            current: (row, column) of the cell the move starts from.
            delta: (row, column) offset of the move.

        Returns:
            A one-element list ``[(probability, new_state, reward, is_done)]``
            with probability 1.0, i.e. the move is deterministic.
        """
        new_position = np.array(current) + np.array(delta)
        new_position = self._limit_coordinates(new_position).astype(int)
        position = tuple(new_position)
        new_state = np.ravel_multi_index(position, self.shape)
        # Cast so that is_done is always a plain Python bool rather than a
        # mix of numpy.bool_ and bool depending on which operand decided it.
        on_cliff = bool(self._cliff[position])
        reward = -100.0 if on_cliff else -1.0
        is_done = on_cliff or position == (3, 11)
        return [(1.0, new_state, reward, is_done)]

    def __init__(self):
        """Build the transition table and initial state distribution."""
        self.shape = (4, 12)

        nS = np.prod(self.shape)
        nA = 4

        # Cliff Location
        # Builtin bool: the np.bool alias was removed from recent NumPy.
        self._cliff = np.zeros(self.shape, dtype=bool)
        self._cliff[3, 1:-1] = True

        # Calculate transition probabilities
        P = {}
        for s in range(nS):
            position = np.unravel_index(s, self.shape)
            P[s] = {
                UP: self._calculate_transition_prob(position, [-1, 0]),
                RIGHT: self._calculate_transition_prob(position, [0, 1]),
                DOWN: self._calculate_transition_prob(position, [1, 0]),
                LEFT: self._calculate_transition_prob(position, [0, -1]),
            }

        # We always start in state (3, 0)
        isd = np.zeros(nS)
        isd[np.ravel_multi_index((3, 0), self.shape)] = 1.0

        super(CliffWalkingEnv, self).__init__(nS, nA, P, isd)

    def render(self, mode='human', close=False):
        """Render the grid; see ``_render`` for the return value."""
        return self._render(mode, close)

    def _render(self, mode='human', close=False):
        """Write a text picture of the grid.

        ``x`` marks the current state, ``T`` the terminal cell (3, 11),
        ``C`` a cliff cell and ``o`` any other cell.

        Args:
            mode: 'ansi' renders into a string; any other value writes to
                ``sys.stdout``.
            close: when true, nothing is rendered.

        Returns:
            The rendered text when ``mode == 'ansi'``, otherwise ``None``.
        """
        if close:
            return None

        # Imported here because StringIO is not imported at file level; the
        # bare name used to raise NameError whenever mode was 'ansi'.
        from io import StringIO

        outfile = StringIO() if mode == 'ansi' else sys.stdout

        for s in range(self.nS):
            position = np.unravel_index(s, self.shape)
            if self.s == s:
                output = " x "
            elif position == (3, 11):
                output = " T "
            elif self._cliff[position]:
                output = " C "
            else:
                output = " o "

            # Trim the padding at the row edges and end each row with a newline.
            if position[1] == 0:
                output = output.lstrip()
            if position[1] == self.shape[1] - 1:
                output = output.rstrip()
                output += "\n"

            outfile.write(output)
        outfile.write("\n")

        if mode == 'ansi':
            return outfile.getvalue()
        return None
        
class PolicyEstimator():
    """Softmax policy over discrete actions, built as a TensorFlow graph.

    The state index is one-hot encoded and passed through a single fully
    connected layer (no activation, zero-initialised weights); a softmax
    over the layer output gives the action probabilities. The loss is
    ``-log(prob of the taken action) * target`` and is minimised with Adam.

    Note: the sizes of the state and action spaces are read from the
    module-level ``env`` object, which must exist before construction.
    """

    def __init__(self, learning_rate=0.01, scope="policy_estimator"):
        with tf.variable_scope(scope):
            # Graph inputs: scalar state index, taken action, and the
            # target value that scales the log-probability loss.
            self.state = tf.placeholder(dtype=tf.int32, shape=[], name="state")
            self.action = tf.placeholder(dtype=tf.int32, name="action")
            self.target = tf.placeholder(dtype=tf.float32, name="target")

            # One-hot state with a leading batch dimension of 1.
            num_states = int(env.observation_space.n)
            one_hot_batch = tf.expand_dims(tf.one_hot(self.state, num_states), 0)
            self.output_layer = tf.contrib.layers.fully_connected(
                inputs=one_hot_batch,
                num_outputs=env.action_space.n,
                activation_fn=None,
                weights_initializer=tf.zeros_initializer)

            # Drop the batch dimension again after the softmax.
            softmax_out = tf.nn.softmax(self.output_layer)
            self.action_probs = tf.squeeze(softmax_out)
            self.picked_action_prob = tf.gather(self.action_probs, self.action)

            neg_log_prob = -tf.log(self.picked_action_prob)
            self.loss = neg_log_prob * self.target
            self.optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
            step_counter = tf.contrib.framework.get_global_step()
            self.train_op = self.optimizer.minimize(self.loss, global_step=step_counter)

    def predict(self, state, sess=None):
        """Return the action probabilities for ``state``.

        Uses ``sess`` when given, otherwise the default TensorFlow session.
        """
        session = sess if sess else tf.get_default_session()
        return session.run(self.action_probs, feed_dict={self.state: state})

    def update(self, state, target, action, sess=None):
        """Run one optimisation step and return the loss value.

        Uses ``sess`` when given, otherwise the default TensorFlow session.
        """
        session = sess if sess else tf.get_default_session()
        inputs = {
            self.state: state,
            self.target: target,
            self.action: action,
        }
        fetched = session.run([self.train_op, self.loss], feed_dict=inputs)
        return fetched[1]
    

# class ValueEstimator():
#     def __init__(self, learning_rate=0.1, scope="value_estimator"):
#         with tf.variable_scope(scope):
#             self.state = tf.placeholder(tf.int32, [], name="state")
#             self.target = tf.placeholder(dtype=tf.float32, name="target")
            
#             state_one_hot = tf.one_hot(self.state, int(env.observation_space.n))
#             self.output_layer = tf.contrib.layers.fully_connected(
#                 inputs=tf.expand_dims(state_one_hot, 0),
#                 num_outputs=1,
#                 activation_fn=None,
#                 weights_initializer=tf.zeros_initializer)
            
#             self.value_estimate = tf.squeeze(self.output_layer)
#             self.loss = tf.squared_difference(self.value_estimate, self.target)
#             self.optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
#             self.train_op = self.optimizer.minimize(
#                 self.loss, global_step=tf.contrib.framework.get_global_step())
            
#     def predict(self, state, sess=None):
#         sess = sess or tf.get_default_session()
#         return sess.run(self.value_estimate, {self.state: state})
    
#     def update(self, state, target, sess=None):
#         sess = sess or tf.get_default_session()
#         feed_dict = {self.state: state, self.target: target}
#         _, loss = sess.run([self.train_op, self.loss], feed_dict)
#         return loss
    
def reinforce(env, estimator_policy, num_episodes, discount_factor=1.0):
    """Train a policy with Monte Carlo policy gradient (REINFORCE, no baseline).

    For each episode, actions are sampled from ``estimator_policy`` until the
    environment reports ``done``. The policy is then updated once per visited
    step, in visiting order, with the discounted return from that step onward.

    Args:
        env: environment exposing ``reset()`` and ``step(action)``, where
            ``step`` returns ``(next_state, reward, done, info)``.
        estimator_policy: object exposing ``predict(state)`` (action
            probabilities) and ``update(state, target, action)``.
        num_episodes: number of episodes to run.
        discount_factor: multiplicative discount applied per time step.

    Returns:
        A list with the undiscounted total reward of each episode.
    """
    Transition = collections.namedtuple(
        "Transition", ["state", "action", "reward", "next_state", "done"])

    total_reward = []
    for i_episode in range(num_episodes):
        state = env.reset()
        episode = []

        # Roll out one full episode with the current policy.
        while True:
            action_probs = estimator_policy.predict(state)
            action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
            next_state, reward, done, _ = env.step(action)

            episode.append(Transition(state=state, action=action,
                                      reward=reward, next_state=next_state, done=done))
            if done:
                break
            state = next_state

        episode_reward = sum(step.reward for step in episode)
        total_reward.append(episode_reward)
        print("episode num=" + str(i_episode) + " : " + str(episode_reward))

        # Discounted return of every step in a single backward pass
        # (G_t = r_t + gamma * G_{t+1}) instead of re-summing the remainder
        # of the episode for each step, which is quadratic in episode length.
        returns = []
        running_return = 0.0
        for step in reversed(episode):
            running_return = step.reward + discount_factor * running_return
            returns.append(running_return)
        returns.reverse()

        # Updates are still applied in forward (visiting) order.
        for transition, total_return in zip(episode, returns):
            estimator_policy.update(transition.state, total_return, transition.action)

    return total_reward
            
# Build the environment and a fresh graph. The global_step variable is created
# before the estimator because PolicyEstimator looks up the global step when
# it builds its train op.
env = CliffWalkingEnv()
tf.reset_default_graph()
global_step = tf.Variable(0, name="global_step", trainable=False)
policy_estimator = PolicyEstimator()
# value_estimator = ValueEstimator()

with tf.Session() as sess:
    # tf.initialize_all_variables() is deprecated in favour of
    # tf.global_variables_initializer().
    sess.run(tf.global_variables_initializer())
#     reinforce(env, policy_estimator, value_estimator, 20000, discount_factor=1.0)
    reinforce(env, policy_estimator, 20000, discount_factor=1.0)