#### Import dependencies

In [12]:
import gym
import random
import numpy as np
import time
from IPython import display
import matplotlib.pyplot as plt
from collections import defaultdict
import pylab as pl

%matplotlib inline

### Load Environment

In [2]:
env = gym.make("Taxi-v3")

### Inspect Environment

In [3]:
# There are 6 discrete deterministic actions:
# - 0: move south
# - 1: move north
# - 2: move east
# - 3: move west
# - 4: pickup passenger
# - 5: drop off passenger

action_size = env.action_space.n
print("Action size ", action_size)

# There are 500 discrete states since there are 25 taxi positions
# 5 possible locations of the passenger (including the case when the passenger is in the taxi)
# and 4 destination locations.
# Start-Position is random
state_size = env.observation_space.n
print("State size ", state_size)

Action size  6
State size  500


In [4]:
env.reset()
env.render()

+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | :[43m [0m| : |
|[34;1mY[0m| : |[35mB[0m: |
+---------+



### Agent

In [17]:
class Agent():
    def __init__(self, n_actions, n_states, gamma=0.9):
        self.n_actions = n_actions
        
        self.gamma = gamma
        self.Q = np.zeros((n_states, n_actions))
        
    def decay_schedule(self, init_value, min_value, decay_ratio, max_steps, log_start=-2, log_base=10):
        decay_steps = int(max_steps * decay_ratio)
        rem_steps = max_steps - decay_steps
        values = np.logspace(log_start, 0, decay_steps, base=log_base, endpoint=True)[::-1]
        values = (values - values.min()) / (values.max() - values.min())
        values = (init_value - min_value) * values + min_value
        values = np.pad(values, (0, rem_steps), 'edge')
        return values        
            
    def act(self, state, eps=0):
        if random.uniform(0, 1) < eps:
            return random.choice(np.arange(self.n_actions))        
        else:
            return np.argmax(self.Q[state])
    
    def learn(self, state, action, reward, next_state, done, alpha, algo='qlearn'):             
        if algo == 'qlearn':            
            #  Q-Learning
            td_target = reward + self.gamma * np.max(self.Q[next_state, :]) * (not done)
        
        else:        
            # SARSA
            td_target = reward + self.gamma * self.Q[next_state, self.act(next_state)] * (not done)
               
        td_error = td_target - self.Q[state, action]  
        
        self.Q[state, action] = self.Q[state, action] + alpha * td_error

### Q - Learning

In [18]:
def learning(n_actions, n_states, episodes=50000, max_steps=500, print_every=5000):
    agent = Agent(n_actions, n_states)
    
    alphas = agent.decay_schedule(0.9, 0.01, 0.2, episodes)
    epsilons = agent.decay_schedule(1.0, 0.01, 0.5, episodes)
    
    for n_episode in range(episodes):
        state = env.reset()        
                
        for n_step in range(max_steps):
            action = agent.act(state, epsilons[n_episode])
            next_state, reward, done, info = env.step(action)            
            
            agent.learn(state, action, reward, next_state, done, alphas[n_episode])
            
            state = next_state
            
            if done:      
                break
        
        if n_episode % print_every == 1:
            print('Episode: {0} done after {1} Steps.'.format(n_episode+1, n_step))
    
    print('Done.')
    env.close()
    
    return agent

#### Training

In [19]:
agent = learning(action_size, state_size)

Episode: 2 done after 199 Steps.
Episode: 5002 done after 30 Steps.
Episode: 10002 done after 12 Steps.
Episode: 15002 done after 14 Steps.
Episode: 20002 done after 5 Steps.
Episode: 25002 done after 12 Steps.
Episode: 30002 done after 10 Steps.
Episode: 35002 done after 10 Steps.
Episode: 40002 done after 11 Steps.
Episode: 45002 done after 10 Steps.
Done.


### Replay trained Agent

In [20]:
def replay(agent, max_steps=20):    
    n_steps = 0

    state, done = env.reset(), False
    rewards = 0

    while not done and n_steps < max_steps:
        action = agent.act(state)
        next_state, reward, done, info = env.step(action) 
        
        state = next_state
        rewards += reward
        
        display.clear_output(wait=True)
        env.render()
        time.sleep(.5)

        n_steps+=1

    print('Solved after {0} Steps.'.format(n_steps))

In [21]:
for _ in range(5):
    replay(agent)
    time.sleep(1)

+---------+
|[35m[34;1m[43mR[0m[0m[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Dropoff)
Solved after 14 Steps.
