<b> What is this notebook? </b><br>
This notebook includes an implementation of Q-Learning and the Taxi problem.

In [1]:
%matplotlib inline

from collections import defaultdict
import gym
from gym.wrappers import Monitor
import itertools
import matplotlib.pyplot as plt
import numpy as np
import sys

from collections import deque, namedtuple

## Functions

In [2]:
def take_epsilon_greedy_action(epsilon, policy, state):
    '''
    returns action based on epsilon greedy policy
    '''
    choices = range(6)
    eps_greedy = np.random.choice(np.arange(0,1.01,.001))
    if eps_greedy > epsilon:
        action = policy[state]
    else:
        action = np.random.choice(choices)
    return action

In [3]:
def epsilon_decay(epsilon, iter_count):
    '''
    returns decayed epsilon
    '''
    return epsilon * np.exp(-.001 * iter_count)

In [57]:
def perform_q_learning(
        taxi_env, 
        epsilon=1, 
        alpha=.5, 
        gamma=0.9, 
        decay=False, 
        episode_count=1000
):
    '''
    Performs Q-Learning on taxi environment by exploring aggressively until
    it learns the optimal policy. This applies a decaying epsilon greedy method.
    '''
    # Matrix of values for state action pairs
    Q = np.zeros((5000, 6))
    choices = range(6)
    policy = np.array([np.random.choice(choices) for _ in range(Q.shape[0])])
    
    # Q-learning loop, we run through 10000 episodes
    for iter_count in range(episode_count):
        if iter_count % 1000 == 0 and iter_count > 0:
            print "Iteration {}".format(iter_count)
        
        step_count = 0
        taxi_env.reset()
        # status is (state, reward, Done, prob.)
        status = (0,0,False,{'prob':1})
        
        while True:
            state = status[0]
            # 
            if decay:
                decayed_eps = epsilon_decay(epsilon, iter_count)
            else:
                decayed_eps = epsilon
                
            action = take_epsilon_greedy_action(decayed_eps, policy, state)

            status = taxi_env.step(action)
            prob = status[-1]['prob']
            step_count += 1
            reward = status[1]
            next_state = status[0]
            if next_state == state:
                continue
            next_action = policy[next_state]
            
            # terminal state case
            if status[2]:
                Q[state, action] += alpha * (reward - Q[state, action])
                break
                
            # Normal Q-learning update
            Q[state, action] += (
                alpha * (reward + 
                (gamma * Q[next_state, next_action]) - Q[state, action])
            )
            
            # policy updates
            Q_state = list(Q[state])
            policy[state] = Q_state.index(max(Q_state))
            
            if step_count >= 200:
                break
        
    return Q

In [5]:
def get_mae(list_1, list_2):
    '''
    Produces Mean Absolute Error given two iterables.
    '''
    mae = np.average([np.abs(x - y) for x,y in zip(list_1, list_2)])
    return mae

## Evaluating Results

In [58]:
env = gym.make('Taxi-v2')
test_indices = [(462, 4), (398, 3), (253, 0), (377, 1), (83, 5)]
test_values = [-11.374402515, 4.348907, -0.5856821173, 9.683, -12.8232660372]

Q = perform_q_learning(
    env, 
    epsilon=1, 
    alpha=.65, 
    decay=True, 
    episode_count=10000
)
Q_estimates = [Q[index] for index in test_indices]
mae = get_mae(Q_estimates, test_values)
print "MAE :", mae

Iteration 1000
Iteration 2000
Iteration 3000
Iteration 4000
Iteration 5000
Iteration 6000
Iteration 7000
Iteration 8000
Iteration 9000
MAE : 8.12841079731


In [None]:
# plt.rcParams['figure.figsize'] = (12, 5)
# plt.plot(range(len(updates)), updates)
# plt.title("Average Learning by Episode")
# plt.xlabel("Episode")
# plt.ylabel("Average Learning")
# plt.xticks([])
# plt.show()