# <center> Introduction to Reinforcement Learning</center>

# Getting Used to the Grid World

#### Import dependencies

In [None]:
import matplotlib.pyplot as plt
import matplotlib.animation as animation
import numpy as np
import pickle

import sys
import os

from IntroRL_Support.helper import *
from ece4078.gym_simple_gridworlds.envs.grid_env import GridEnv
from ece4078.gym_simple_gridworlds.envs.grid_2dplot import *


from IPython.display import display, HTML

# Named codes for the moves used throughout the notebook.
# NOTE(review): presumably these match the action indices GridEnv expects — confirm.
UP = 0
DOWN = 1
LEFT = 2
RIGHT = 3
# STAY is NaN, so it cannot be converted with int() like the other codes.
STAY = np.nan

This is the grid world we are dealing with:

![GridWorldExample.png](https://i.postimg.cc/5tMM5vqf/Grid-World-Example.png)

# Examining a few existing methods in GridEnv

`__get_reachable_states__(s)`: given a state `s`, returns the states the agent might end up in for each possible action. Remember that the grid has borders and that the agent cannot move into an obstacle tile.

Try to predict the output of the following cell

In [None]:
# Instantiate the environment; gamma, noise and living_reward are GridEnv keyword arguments.
grid_world = GridEnv(gamma=0.9, noise=0.2, living_reward=-0.04)
# Query the reachable states for state 1; as the cell's last expression, the result is displayed.
grid_world.__get_reachable_states__(1) 

The `rewards` property of class `GridEnv` is an $n\times m$ matrix, where $n$ is the size of the state space and $m$ is the size of the action space. Each entry is the expected reward for the corresponding state-action pair. Let's try to manually calculate a few of these entries.

In [None]:
# Display the environment's state-action reward matrix (one row per state, one column per action).
grid_world.rewards

# Define a new transition function

Let's make the environment such that there is a probability $p$ that the agent will move as we expected, and a probability $1 - p$ that it will stay exactly where it is, where $0 \leq p \leq 1$.

Note that an obstacle is a state with value `np.nan`.

In [None]:
def generate_transition_LUT(env, p):
    """Build the state-transition look-up table for the modified grid world.

    Exercise scaffold: as written, the function returns an all-zero table.
    The transition probabilities are meant to be filled in between the TODO
    markers.

    Args:
        env: Environment exposing ``observation_space.n`` and
            ``action_space.n`` (a ``GridEnv`` instance in this notebook).
        p: Probability that the agent moves as intended; per the exercise
            text it stays where it is with probability ``1 - p``. The
            scaffold itself does not use it yet.

    Returns:
        Array of shape ``(state_size, action_size, state_size)``.
        NOTE(review): presumably indexed as [state, action, next_state] —
        confirm against GridEnv.
    """
    state_size = env.observation_space.n
    action_size = env.action_space.n
    state_transitions = np.zeros((state_size, action_size, state_size))

    #TODO 1: Specify the new transition function -------------------------------------
    
    #ENDTODO -------------------------------------------------------------------------
    return state_transitions

# Build the table for p = 0.7 using the environment created earlier in the notebook.
state_transitions = generate_transition_LUT(grid_world, 0.7)

# Define a new reward function

Let's make the reward such that moving UP or DOWN is considered a really bad thing: whenever the action is UP or DOWN, `up_down_penalty` is subtracted from the reward. Otherwise, the state-action reward is $R(s,a) = \sum_{s^{\prime}\in\mathcal{S}}\mathbb{P}(s^{\prime}| s, a)\mathcal{R}(s^{\prime})$

In [None]:
def generate_state_action_reward_LUT(env, up_down_penalty):
    """Build the state-action reward look-up table for the modified grid world.

    Exercise scaffold: as written, the function returns an all-zero table.
    The reward values are meant to be filled in between the TODO markers.

    Args:
        env: Environment exposing ``observation_space.n`` and
            ``action_space.n`` (a ``GridEnv`` instance in this notebook).
        up_down_penalty: Amount to subtract from the reward when the action
            is UP or DOWN, per the exercise text. The scaffold itself does
            not use it yet.

    Returns:
        Array of shape ``(state_size, action_size)``: one row per state and
        one column per action.
    """
    state_size = env.observation_space.n
    action_size = env.action_space.n
    rewards = np.zeros((state_size, action_size))

    #TODO 2: Specify new reward function --------------------------------------------
    
    #ENDTODO -------------------------------------------------------------------------
    return rewards

# Build the table with a penalty of 0.1 using the environment created earlier in the notebook.
rewards = generate_state_action_reward_LUT(grid_world, 0.1)

# Define a stochastic policy

Define a policy that is state-independent, i.e. its action probabilities are the same regardless of the state it is in. The function expects 4 inputs:
- `up_chance`: the probability that the robot will go up
- `down_chance`: the probability that the robot will go down
- `left_chance`: the probability that the robot will go left
- `right_chance`: the probability that the robot will go right

In [None]:
def policy(up_chance, down_chance, left_chance, right_chance):
    """State-independent stochastic policy (exercise scaffold).

    As written, the body is only a placeholder, so the function returns
    ``None`` until the TODO is completed. The simulation cell later calls
    ``int(policy(...))`` and passes the result to ``grid_world.step``, so
    the completed version must return a value that ``int()`` accepts.

    Args:
        up_chance: Probability that the robot will go up.
        down_chance: Probability that the robot will go down.
        left_chance: Probability that the robot will go left.
        right_chance: Probability that the robot will go right.
    """
    #TODO 3: Define stochastic policy ------------------------------------------------
    pass
    #ENDTODO -------------------------------------------------------------------------


# Evaluate the policy once with the example probabilities.
my_policy = policy(0.5, 0.3, 0.15, 0.05)

# Let's visualize what happens when we use this reward, policy and transition function

In [None]:
# Create a fresh Grid World instance and the figure the animation will draw on.
grid_world = GridEnv(gamma=0.9, noise=0.2, living_reward=-0.04)
s_x, s_y = get_state_to_plot(grid_world)  # starting position, kept for the animation's first frame
fig, ax = grid_world.render()
agent, = ax.plot([], [], 'o', color='b', linewidth=6)
reward_text = ax.text(0.02, 0.95, '', transform=ax.transAxes)

done = False
cumulative_reward = 0
cur_state = grid_world.cur_state
path_to_plot = []  # one [cumulative_reward, x, y] entry per simulated step

# Replace the environment's built-in dynamics with the tables defined earlier
# in the notebook, and wrap the policy so every call draws a new action.
grid_world.state_transitions = generate_transition_LUT(grid_world, 0.7)
grid_world.rewards = generate_state_action_reward_LUT(grid_world, 0.1)
my_policy = lambda: policy(0.5, 0.3, 0.15, 0.05)
i = 0
max_steps = 1000

# Roll the episode forward for at most `max_steps` steps. Checking the cap in
# the loop condition ensures exactly `max_steps` steps are allowed, not
# `max_steps + 1`.
while not done and i < max_steps:
    _, cur_reward, done, _ = grid_world.step(int(my_policy()))
    i += 1
    cur_state = grid_world.cur_state
    n_x, n_y = get_state_to_plot(grid_world)
    cumulative_reward += cur_reward
    path_to_plot.append([cumulative_reward, n_x, n_y])

# Report a timeout only when the environment never signalled termination, so
# an episode that finishes on the final allowed step is not misreported.
if not done:
    done = True
    print(f"Could not reach terminal state after {max_steps} steps.")

def init():
    """Draw the animation's first frame.

    Places the agent marker at the centre of the starting cell (the stored
    start coordinates shifted by 0.5) and blanks the reward label.

    Returns:
        The ``(agent, reward_text)`` artists.
    """
    start_x = s_x + 0.5
    start_y = s_y + 0.5
    agent.set_data([start_x], [start_y])
    reward_text.set_text('')
    return agent, reward_text

def animate(i):
    """Draw frame ``i`` of the recorded episode.

    Args:
        i: Frame index into ``path_to_plot``. Indices at or past the end of
            the recording leave the artists unchanged.

    Returns:
        The ``(agent, reward_text)`` artists.
    """
    # Nothing recorded for this frame: hand back the artists untouched.
    if i >= len(path_to_plot):
        return agent, reward_text

    total, x, y = path_to_plot[i]
    # Shift by half a cell so the marker sits in the middle of the tile.
    agent.set_data([x + 0.5], [y + 0.5])
    reward_text.set_text('Cumulative reward: %.2f' % total)
    return agent, reward_text

# One frame per recorded step, shown every 500 ms, played once.
ani = animation.FuncAnimation(
    fig,
    animate,
    init_func=init,
    frames=len(path_to_plot),
    interval=500,
    repeat=False,
    blit=False,
)

# Close the figures before rendering the animation to HTML, then embed the
# JavaScript player centred in the cell output.
plt.close('all')
animation_html = ani.to_jshtml()
display(HTML(f"<div align=\"center\">{animation_html}</div>"))

# Test cases

In [None]:
import otter
from ece4078.Utility import pretty_print_otter
# Point the otter grader at the test files shipped with this notebook.
tests_directory = "IntroRL_Support/tests"
grader = otter.Notebook(tests_dir=tests_directory)

# Run every test; as the last expression in the cell, the results are displayed.
grader.check_all()