# Model-Free Prediction and Control

#### Import libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
plt.rcParams['font.family'] = "Courier New"
import seaborn as sns
import copy

from maze_class import *
from monte_carlo import *
from td_learning import *

  import pandas.util.testing as tm


## Monte Carlo

#### Initialise parameters

In [2]:
# Episode count, per-step cost and episode-length cap used by the cells below.
MAX_EP = 3000
STEP_COST = -0.1
MAX_EP_LEN = 30

# "[row, col]" string -> flat state index, row-major over the 4x4 grid.
maze_coords = {
    str([row, col]): 4 * row + col
    for row in range(4)
    for col in range(4)
}

# Inverse lookup: flat state index (as a string) -> [row, col].
reversed_maze = {
    str(index): [index // 4, index % 4]
    for index in range(16)
}

In [None]:
# Build the maze environment from the two coordinate lookups, the per-step cost
# and the episode-length cap configured above.
env = Maze(maze_coords,reversed_maze, step_cost=STEP_COST, max_ep_length=MAX_EP_LEN)

#### Visualize state values with the function below

In [3]:
def plot_state_value_grid(state_value_grid):
    """Draw a 2-D grid of state values as an annotated seaborn heatmap.

    Args:
        state_value_grid: 2-D array-like with one value per grid cell.

    A new 10x5 figure is opened for the plot; every cell is annotated with its
    value formatted to one decimal place. Nothing is returned.
    """
    heatmap_options = dict(
        cmap='coolwarm',
        annot=True,
        fmt=".1f",
        annot_kws={'size': 16},
        square=True,
    )

    plt.figure(figsize=(10, 5))
    heatmap_axes = sns.heatmap(state_value_grid, **heatmap_options)

    # Explicit, slightly padded y-limits with row 0 at the top.
    # NOTE(review): presumably a workaround for the first/last heatmap rows being
    # clipped by some matplotlib versions -- confirm before removing.
    row_count = len(state_value_grid)
    heatmap_axes.set_ylim(row_count + 0.01, -0.01)

In [None]:
# Monte Carlo policy evaluation with discount factor 1
policy_eval1 = mc_evaluation_policy(env, discount_factor=1)

# Monte Carlo policy evaluation with discount factor 0.9
policy_eval2 = mc_evaluation_policy(env, discount_factor=0.9)

#### Factor 1

In [None]:
# Lay the discount-factor-1 estimates out as a 4x4 grid and draw the heatmap.
plot_state_value_grid(policy_eval1.reshape((4,4)))


#### Factor 0.9

In [None]:
# Lay the discount-factor-0.9 estimates out as a 4x4 grid and draw the heatmap.
plot_state_value_grid(policy_eval2.reshape((4,4)))

## TD Learning

In [None]:
# NOTE: a later cell in this notebook defines its own function named `td_learning`,
# which replaces the one used here; run the cells top to bottom.
# NOTE: `policy_eval1` / `policy_eval2` overwrite the Monte Carlo results above.

# discount factor 1
policy_eval1 = td_learning(env, discount_factor=1)

# discount factor 0.9
policy_eval2 = td_learning(env, discount_factor=0.9)

#### Factor 1

In [None]:
# Lay the discount-factor-1 estimates out as a 4x4 grid and draw the heatmap.
plot_state_value_grid(policy_eval1.reshape((4,4)))

#### Factor 0.9

In [None]:
# Lay the discount-factor-0.9 estimates out as a 4x4 grid and draw the heatmap.
plot_state_value_grid(policy_eval2.reshape((4,4)))

## On-policy first-visit Monte-Carlo Control

#### Initialize rewards lists 

In [None]:
def rewards_lists(cells, actions):
    """Create an empty list of returns for every (state, action) pair.

    Args:
        cells: number of states; states are labelled "0" .. str(cells - 1).
        actions: number of actions per state, labelled the same way.

    Returns:
        dict keyed by "<state>, <action>" (state-major order), each key mapped
        to its own new empty list.
    """
    state_labels = [str(index) for index in range(cells)]
    action_labels = [str(index) for index in range(actions)]

    return {
        state_label + ", " + action_label: []
        for state_label in state_labels
        for action_label in action_labels
    }

In [None]:
# Create a new environment instance (same settings as before) for the control experiment.
env = Maze(maze_coords,reversed_maze, step_cost=STEP_COST, max_ep_length=MAX_EP_LEN)

In [None]:
def td_learning(env, discount_factor):
    """Estimate action values for the 16-state, 4-action maze from sampled episodes.

    NOTE: despite its name, this routine averages full-episode returns
    (first-visit Monte Carlo control) rather than doing a temporal-difference
    update, and defining it replaces any `td_learning` imported earlier in the
    notebook. The name is kept so existing calls keep working.

    Args:
        env: environment exposing `reset() -> state` and
            `step(action) -> (next_state, reward, done)`. States are used as
            row indices into a (16, 4) table and actions as column indices.
        discount_factor: factor applied to the accumulated return at every
            backward step through an episode.

    Returns:
        numpy.ndarray of shape (16, 4) with the action-value estimates after
        MAX_EP episodes. (Previously nothing was returned, so callers received
        None.)
    """
    n_states, n_actions = 16, 4
    maze_rewards = np.zeros((n_states, n_actions))

    # Initial estimates for selected states; each assignment fills all four
    # actions of that state.
    maze_rewards[3] = 40
    maze_rewards[12] = 10
    maze_rewards[13] = -2
    maze_rewards[6] = -10
    maze_rewards[7] = -10

    # One list of observed returns per "<state>, <action>" key.
    rewards = rewards_lists(n_states, n_actions)

    for _ in range(MAX_EP):
        # --- generate one episode with the current action-selection probabilities ---
        state = env.reset()
        trajectory = []
        while True:
            # `probability` is defined outside this cell; its result is used as the
            # sampling distribution over the four actions.
            probs = probability(maze_rewards[state])
            action = np.random.choice(np.arange(n_actions), p=probs)

            next_state, reward, done = env.step(action)
            trajectory.append((state, action, reward))

            state = next_state
            if done:
                break

        # Time step at which each state first occurs in this episode.
        first_visit_step = {}
        for t, (visited_state, _action, _reward) in enumerate(trajectory):
            first_visit_step.setdefault(visited_state, t)

        # --- walk the episode backwards, accumulating the discounted return ---
        G = 0
        for t in range(len(trajectory) - 1, -1, -1):
            visited_state, taken_action, reward = trajectory[t]
            G = discount_factor * G + reward
            # First-visit check, unchanged from the original: it is per *state*.
            # NOTE(review): textbook first-visit MC control checks the
            # (state, action) pair -- confirm which one is intended.
            if first_visit_step[visited_state] == t:
                key = str(visited_state) + ", " + str(taken_action)
                rewards[key].append(G)
                maze_rewards[visited_state][taken_action] = np.mean(rewards[key])

    return maze_rewards

In [None]:
# Run the control routine defined in the previous cell with discount factor 0.9
# and print whatever it returns.
grid_values = td_learning(env,discount_factor=0.9)
print(grid_values)

#### Factor 1

In [None]:
def quatromatrix(left, bottom, right, top, ax=None, triplotkw=None, tripcolorkw=None):
    """Draw a grid of unit cells, each split into four coloured triangles.

    Cell (i, j) covers the square x in [j, j + 1], y in [i, i + 1] and is cut
    along both diagonals. In data coordinates the four triangles touch the
    x = j edge (`left`), the y = i edge (`bottom`), the x = j + 1 edge
    (`right`) and the y = i + 1 edge (`top`).

    Args:
        left, bottom, right, top: 2-D arrays of identical shape (n, m) holding
            the colour value of the corresponding triangle of every cell.
        ax: axes to draw on; the current axes are used when None.
        triplotkw: optional dict of keyword arguments for `ax.triplot`
            (the triangle outlines).
        tripcolorkw: optional dict of keyword arguments for `ax.tripcolor`
            (the filled triangles).

    Returns:
        Whatever `ax.tripcolor` returns, so the caller can attach a colorbar.
    """
    # `is None` instead of truthiness, and no shared mutable default dicts.
    if ax is None:
        ax = plt.gca()
    triplotkw = {} if triplotkw is None else triplotkw
    tripcolorkw = {} if tripcolorkw is None else tripcolorkw

    n, m = left.shape[0], left.shape[1]

    # Template for one cell: four corners plus the centre, and the four
    # triangles (left, bottom, right, top) as indices into those five points.
    a = np.array([[0, 0], [0, 1], [.5, .5], [1, 0], [1, 1]])
    tr = np.array([[0, 1, 2], [0, 2, 3], [2, 3, 4], [1, 2, 4]])

    A = np.zeros((n * m * 5, 2))
    Tr = np.zeros((n * m * 4, 3), dtype=int)  # vertex indices are integers

    for i in range(n):
        for j in range(m):
            k = i * m + j
            # Shift the template to cell (i, j) and offset its vertex indices.
            A[k * 5:(k + 1) * 5, :] = np.c_[a[:, 0] + j, a[:, 1] + i]
            Tr[k * 4:(k + 1) * 4, :] = tr + k * 5

    # One colour per triangle, in the same per-cell order as `tr`.
    C = np.c_[left.flatten(), bottom.flatten(),
              right.flatten(), top.flatten()].flatten()

    ax.triplot(A[:, 0], A[:, 1], Tr, **triplotkw)
    return ax.tripcolor(A[:, 0], A[:, 1], Tr, facecolors=C, **tripcolorkw)

In [None]:
def plot_action_value(action_value_grid, grid_shape=(4, 4)):
    """Plot per-action values of a grid world, four coloured triangles per cell.

    Args:
        action_value_grid: array of shape (n_states, 4). Column 0 holds the
            value drawn in the top triangle of each cell, column 1 the right,
            column 2 the bottom and column 3 the left triangle.
        grid_shape: (rows, cols) layout of the states; rows * cols must equal
            n_states. Defaults to the 4x4 maze.

    Every triangle is coloured by its value and labelled with the value rounded
    to two decimals. Label positions and the y-limits are derived from
    `grid_shape`; the previous hard-coded lists placed the fourth row's labels
    on top of the third row and the y-limits cut the fourth row off.
    """
    n_rows, n_cols = grid_shape

    # name -> (column in action_value_grid, (dx, dy) label offset inside a cell)
    directions = {
        "top": (0, (0.38, 0.25)),
        "right": (1, (0.65, 0.5)),
        "bottom": (2, (0.38, 0.8)),
        "left": (3, (0.05, 0.5)),
    }
    values = {
        name: action_value_grid[:, column].reshape(grid_shape)
        for name, (column, _offset) in directions.items()
    }

    fig, ax = plt.subplots(figsize=(12, 5))
    # Inverted y-axis so that row 0 is drawn at the top, covering all rows.
    ax.set_ylim(n_rows, 0)
    # Because y is inverted, the "top" values go into quatromatrix's `bottom`
    # slot (the triangle on the y = row edge) and the "bottom" values into `top`.
    tripcolor = quatromatrix(values["left"], values["top"],
                             values["right"], values["bottom"], ax=ax,
                             triplotkw={"color": "k", "lw": 1},
                             tripcolorkw={"cmap": "coolwarm"})

    ax.margins(0)
    ax.set_aspect("equal")
    fig.colorbar(tripcolor)

    # Write each value inside its triangle.
    for name, (_column, (dx, dy)) in directions.items():
        for row in range(n_rows):
            for col in range(n_cols):
                ax.text(col + dx, row + dy, round(values[name][row, col], 2),
                        size=11, color="w")

    plt.show()

In [None]:
# Draw the per-action values computed above (one four-triangle cell per state).
plot_action_value(grid_values)

#### Factor 0.99