<b> What is this notebook? </b><br>
This notebook implements tabular Q-learning and applies it to the Taxi problem from OpenAI Gym.

In [3]:
%matplotlib inline

import gym
from gym.wrappers import Monitor
import matplotlib.pyplot as plt
import numpy as np

from collections import deque, namedtuple

## Functions

In [205]:
def take_epsilon_greedy_action(epsilon, policy, state, n_actions=6):
    '''
    Returns an action for `state` chosen with an epsilon-greedy rule.

    With probability `epsilon` a uniformly random action from
    range(n_actions) is returned (exploration); otherwise the action stored
    in policy[state] is returned (exploitation).

    epsilon   -- exploration probability; values <= 0 always exploit and
                 values >= 1 always explore.
    policy    -- indexable collection mapping a state to its current action.
    state     -- index into `policy`.
    n_actions -- number of available actions; defaults to 6, the value that
                 used to be hard-coded here.
    '''
    # Draw from the continuous interval [0, 1) and compare with a strict "<".
    # Sampling from a 0.01-step grid and testing "> epsilon" explored roughly
    # 1% of the time even when epsilon was 0, and biased every other rate.
    if np.random.random() < epsilon:
        return np.random.choice(n_actions)
    return policy[state]

In [189]:
def epsilon_decay(epsilon, iter_count, decay_rate=.004):
    '''
    Returns epsilon after exponential decay.

    Computes epsilon * exp(-decay_rate * iter_count).

    epsilon    -- starting exploration rate.
    iter_count -- number of iterations elapsed; 0 returns epsilon unchanged.
    decay_rate -- exponential decay constant; defaults to .004, the value
                  that used to be hard-coded here.
    '''
    return epsilon * np.exp(-decay_rate * iter_count)

In [216]:
def perform_q_learning(taxi_env, epsilon=1, alpha=.01, gamma=0.9, n_episodes=100000):
    '''
    Runs tabular Q-learning on `taxi_env` and returns the learned Q table.

    taxi_env   -- environment whose reset() returns the initial state index
                  and whose step(action) returns a tuple starting with
                  (next_state, reward, done).
    epsilon    -- initial exploration rate; decayed once per episode via
                  epsilon_decay(epsilon, episode_index).
    alpha      -- learning rate applied to each temporal-difference update.
    gamma      -- discount factor applied to the next state's value.
    n_episodes -- number of episodes to run (default 100000).

    Returns a (5000, 6) numpy array Q indexed as Q[state, action].
    '''
    # Matrix of values for state-action pairs.
    # NOTE(review): Taxi is documented as having 500 states; the 5000 rows are
    # kept so the shape of the returned array does not change -- confirm
    # before shrinking.
    n_states, n_actions = 5000, 6
    Q = np.zeros((n_states, n_actions))
    choices = range(n_actions)
    policy = [np.random.choice(choices) for _ in Q]

    # Q-learning loop, one pass per episode
    for iter_count in range(n_episodes):
        # Start from the state the environment actually reset to. Assuming
        # state 0 credited every episode's first transition to the wrong row.
        state = taxi_env.reset()
        # Epsilon depends only on the episode index, so compute it once here.
        decayed_eps = epsilon_decay(epsilon, iter_count)
        done = False

        while not done:
            action = take_epsilon_greedy_action(decayed_eps, policy, state)

            # status is (state, reward, Done, prob.)
            status = taxi_env.step(action)
            next_state, reward, done = status[0], status[1], status[2]

            # policy[s] always holds a maximising action for Q[s] (or Q[s] is
            # still all zeros), so this is the max-over-actions bootstrap.
            next_action = policy[next_state]
            Q[state, action] = (
                Q[state, action] +
                alpha * (reward + (gamma * Q[next_state, next_action]) - Q[state, action])
            )

            # policy update: greedy action for the state just visited
            policy[state] = int(np.argmax(Q[state]))

            state = next_state

        # No explicit reset of terminal Q-values is needed: an episode-ending
        # state is never acted from inside the episode that ended there, so
        # this loop does not write to its row on that path. Zeroing
        # the last state acted from, as was done here before, erased the
        # values just learned for it.

    return Q

In [167]:
def get_mae(list_1, list_2):
    '''
    Returns the mean absolute error between two iterables.

    Elements are paired positionally with zip, so any surplus elements in
    the longer iterable are ignored.
    '''
    abs_errors = []
    for left, right in zip(list_1, list_2):
        abs_errors.append(np.abs(left - right))
    return np.average(abs_errors)

## Evaluating Results

In [None]:
# Build the Taxi environment and train a Q table on it with the default
# hyper-parameters of perform_q_learning.
env = gym.make('Taxi-v2')
Q = perform_q_learning(env)

In [None]:
# (state, action) pairs at which the learned Q table is checked.
test_indices = [
    (462, 4),
    (398, 3),
    (253, 0),
    (377, 1),
    (83, 5),
]
# Reference Q-values, one per pair above and in the same order.
test_values = [
    -11.374402515,
    4.348907,
    -0.5856821173,
    9.683,
    -12.8232660372,
]

In [None]:
# Look up the learned Q-value for each (state, action) test pair and compare
# the estimates with the reference values.
Q_estimates = [Q[index] for index in test_indices]
mae = get_mae(Q_estimates, test_values)
# A single pre-formatted argument prints the same text under Python 2 and
# Python 3; the bare print statement is a syntax error on Python 3.
print("Mean Absolute Error of Q-value estimates: {}".format(mae))

In [212]:
# Draw the environment's current state (text grid shown below).
env.render()

+---------+
|R: | : :[35mG[0m|
| : : : : |
| :[43m [0m: : : |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+
  (South)
