In [1]:
import numpy as np
import gym
import random
import sys
import os
import time
import datetime
import pandas as pd
from IPython.display import clear_output

# Human-readable description of the Taxi task (text only; it has no effect on training).
SOLVE_TAXI_MESSAGE = """Task : \n
1) The cab(YELLOW) should find the shortest path to BLUE(passenger) 
2) Perform a "pickup" action to board the passenger which turns the cab(GREEN)
3) Take the passenger to the PINK(drop location) using the shortest path
4) Perform a "dropoff" action
"""

In [2]:
# Create the Taxi environment shared by every cell below and draw its initial board.
# NOTE(review): newer gym releases are believed to register this task as "Taxi-v3" —
# confirm the id against the installed gym version.
env = gym.make("Taxi-v2")
env.render()

+---------+
|[34;1mR[0m: | : :G|
| : : : : |
| : : : : |
| | : |[43m [0m: |
|[35mY[0m| : |B: |
+---------+



In [3]:
# Read the sizes of the discrete action and observation spaces in one go;
# together they fix the shape of the Q-table created further down.
action_size, state_size = env.action_space.n, env.observation_space.n

print("Action size ", action_size)
print("State size ", state_size)

Action size  6
State size  500


# SARSA

In [4]:
# Q-table: one row per state, one column per action, every estimate starting at zero.
qtable = np.zeros(shape=(state_size, action_size))
print(qtable)

[[0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 ...
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]]


In [5]:
# Do not change the following four hyperparameters!

total_episodes = 50000        # Number of training episodes
total_test_episodes = 100     # Number of greedy evaluation episodes
total_demo_episodes = 10      # Number of rendered demo episodes
max_steps = 99                # Step cap per episode



#***********Part1: Set the following hyperparameters by yourself************
# Alternative values (presumably from an earlier experiment; kept for reference):
#   learning_rate = 0.3
#   gamma         = 0.99
#   epsilon       = 1.0
#   max_epsilon   = 1.0
#   min_epsilon   = 0.1
#   decay_rate    = 0.01
learning_rate = 0.05             # Step size (a.k.a. alpha) of the Q-value update
gamma = 0.99                     # Discount factor applied to the bootstrapped next Q-value

# Exploration parameters
epsilon = 0.9                     # Current exploration probability; overwritten after every training episode
max_epsilon = 0.9                # Exploration probability at the start of the decay schedule
min_epsilon = 0.1                # Floor that the decay schedule approaches
decay_rate = 0.01                 # Exponential decay rate (per episode) of the exploration probability

In [6]:
#n_actions = env.action_space.n
#print(n_actions)
# 2 For life or until learning is stopped
def getNextAction(state):
    """Pick an action for `state` with the epsilon-greedy policy.

    With probability `epsilon` (module-level, decayed by the training loop)
    a random action is sampled from the environment's action space;
    otherwise the action with the highest Q-value in `qtable[state]` is
    returned.
    """
    explore = np.random.uniform(0, 1) < epsilon
    if explore:
        return env.action_space.sample()
    return np.argmax(qtable[state, :])

def getNextState(action):
    """Apply `action` to the shared environment and hand back `env.step`'s result unchanged."""
    transition = env.step(action)
    return transition

for episode in range(total_episodes):
    # Reset the environment
    state = env.reset()

    # SARSA is on-policy: choose the first action A from the start state S once,
    # then always execute the A' that was used in the previous update.
    action = getNextAction(state)

    for step in range(max_steps):

        #******************Part2: Implement SARSA by yourself*******************
        # Hint1: You can refer to the Q-Learning example to finish this part.
        # Hint2: Because SARSA is an on-policy learning strategy. You have to choose a policy, for example, random, epsilon-greedy and so on.

        # Take A, observe R and S'.
        new_state, reward, done, info = env.step(action)

        # Choose A' from S' with the same epsilon-greedy policy.
        new_action = getNextAction(new_state)

        # SARSA update: Q(S,A) <- Q(S,A) + alpha * (R + gamma * Q(S',A') - Q(S,A)).
        # A terminal transition has no successor value, so its target is just R.
        predict = qtable[state, action]
        if done:
            target = reward
        else:
            target = reward + gamma * qtable[new_state, new_action]
        qtable[state, action] = predict + learning_rate * (target - predict)
        #***********************************************************************

        # Move on: S <- S', A <- A'. Carrying A' forward (instead of re-sampling
        # at the top of the next step) is what keeps the update on-policy.
        state, action = new_state, new_action

        # If done : finish episode
        if done:
            break

    #**************Part3: Try different epsilon reduction methods by yourself****************
    # Reduce epsilon (because we need less and less exploration)
    epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-decay_rate*episode)

In [25]:
# Evaluate the learned Q-table with a purely greedy policy (no exploration).
rewards = []

for episode in range(total_test_episodes):
    state = env.reset()
    total_rewards = 0

    for step in range(max_steps):
        # Take the action (index) that has the maximum expected future reward given that state
        action = np.argmax(qtable[state, :])

        new_state, reward, done, info = env.step(action)

        total_rewards += reward

        if done:
            break
        state = new_state

    # Record every episode, including those that hit max_steps without finishing.
    # Appending only on `done` would drop failed episodes from the sum while the
    # divisor below still counts them, which inflates the reported score.
    rewards.append(total_rewards)

env.close()
print ("SARSA | Score over time: " +  str(sum(rewards)/total_test_episodes))

SARSA | Score over time: 8.26


### note
#If the score over time is below 8.0, restart the kernel and run the notebook again: training is stochastic (no random seed is set), so the score sometimes lands around 7.9.

In [8]:
#DUMP
#print(str(np.random.randint(0, action_size)))
#print(str(random.uniform(0,1)))
#print(str(np.random.rand()))
#random.uniform(0,1)

# Demo (SARSA)

In [None]:
# Rendering helpers for the demo. They are defined in this cell so that it runs
# on a fresh kernel; no other cell in the notebook provides them.
def clear_screen(delay=1):
    """Wait `delay` seconds, then clear this cell's output area."""
    time.sleep(delay)
    clear_output(wait=True)


def perf_message(episode, perf):
    """Return the task description followed by the episode number and running average score."""
    return SOLVE_TAXI_MESSAGE + "\nEpisode: {} | Average score so far: {}".format(episode + 1, perf)


def log_progress(env, reward=0, total_reward=0, delay=0.1, message=SOLVE_TAXI_MESSAGE):
    """Redraw the board together with the latest and cumulative reward."""
    clear_screen(delay)
    print(message)
    env.render()
    print("Reward:", reward)
    print("Cumulative reward:", total_reward)


env.reset()
rewards = []
perf=0
score=0

for episode in range(total_demo_episodes):
    clear_screen(0)
    state = env.reset()
    total_rewards = 0

    for step in range(max_steps):
        # Take the action (index) that has the maximum expected future reward given that state
        action = np.argmax(qtable[state,:])

        new_state, reward, done, info = env.step(action)

        total_rewards += reward

        # Show the board after every move, including the final one.
        log_progress(env, reward=reward, total_reward=total_rewards, delay=0.5, message=perf_message(episode, perf))

        if done:
            break
        state = new_state

    # Keep the score of every demo episode, finished or not.
    rewards.append(total_rewards)

    clear_screen(0)
    score += total_rewards
    # Running average over the demo episodes played so far.
    perf = score/(episode + 1)

env.close()
# Reports the return of the last demo episode only.
print ("Score", total_rewards)

# Questions:

Q1: Which policy algorithm do you use?

Q2: Briefly introduce SARSA algorithm.

Q3: Discuss the difference between Q-Learning and SARSA algorithm.

#### References
#https://www.quora.com/What-is-the-difference-between-Q-learning-and-SARSA-learning
#https://medium.com/swlh/introduction-to-reinforcement-learning-coding-sarsa-part-4-2d64d6e37617
#https://towardsdatascience.com/reinforcement-learning-temporal-difference-sarsa-q-learning-expected-sarsa-on-python-9fecfda7467e