In [6]:
import gym
import sys
import numpy as np
import random

# Create the Taxi-v3 environment and keep the object exposed by `.env`
# (presumably the inner environment without gym's wrapper — TODO confirm
# against the installed gym version). Later cells use `taxi_env` as a global.
taxi_env = gym.make("Taxi-v3").env

# Show the environment's current state as a text grid.
taxi_env.render()

+---------+
|R: | : :[34;1mG[0m|
| : | : : |
| :[43m [0m: : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+



In [11]:
def q_learning(env, gamma=0.9, alpha=0.9, epsilon=0.1, episodes=5, max_steps_per_episode=None):
    '''
    Q-Learning Algorithm (tabular, epsilon-greedy action selection)

    Inputs:
        env: game environment, i.e. taxi_env. It must provide `nS` and `nA` (number of states / actions),
            `reset()` returning the starting state index, `step(action)` returning
            (next_state, reward, done, info), and `action_space.sample()` returning a random action.
        gamma: Discount rate for future rewards.
        alpha: Learning rate. "How much you accept the new value vs the old value," i.e. the weight given to the
            newly observed target; (1 - alpha) is the weight kept on the old value of Q.
        epsilon: Controls the balance of exploration vs exploitation. Each step a value is drawn uniformly from
            [0, 1]; if it is < epsilon a random action is taken, otherwise the action that maximizes Q for the
            current state (based on the current estimate of Q).
        episodes: Number of training episodes to run.
        max_steps_per_episode: Optional cap on the number of steps taken within one episode. With the default
            (None) an episode only ends when the environment reports `done`, exactly as before this parameter
            existed; pass an integer to guarantee termination on environments that may never report `done`.

    Helper Methods:
        extract_policy: Returns the greedy policy for a given Q table. It is run once, after all episodes
                        have finished.

    Outputs:
        A tuple (policy, Q, steps): `policy` is an (nS, nA) array with a single 1 per row marking the greedy
        action of that state, `Q` is the estimated action-value table, and `steps` is the total number of
        environment steps taken over all episodes (it is a plain counter, not a convergence measure).
    '''

    def extract_policy(Q):
        # One-hot encode the greedy action of every state. argmax returns the first
        # maximal index, so ties resolve to the lowest-numbered action.
        policy = np.zeros([env.nS, env.nA])
        policy[np.arange(env.nS), np.argmax(Q, axis=1)] = 1
        return policy

    # initialize Q(s,a) matrix to all zeros
    Q = np.zeros([env.nS, env.nA])
    steps = 0

    for _ in range(episodes):
        # start a new episode from whatever state the environment resets to
        state = env.reset()
        done = False
        episode_steps = 0

        # run inner loop for each episode until a terminal state has been reached
        # (or the optional per-episode step cap is hit)
        while not done:
            if max_steps_per_episode is not None and episode_steps >= max_steps_per_episode:
                break

            # select action
            if random.uniform(0, 1) < epsilon:
                action = env.action_space.sample() # exploration
            else:
                action = np.argmax(Q[state, :]) # exploitation

            next_state, reward, done, _ = env.step(action)

            # value of the best action available from the next state
            max_next_action_value = np.max(Q[next_state, :])

            # blend the old estimate with the one-step target r + gamma * max_a' Q(s', a')
            Q[state][action] = ((1-alpha)*Q[state][action]) + (alpha*(reward + (gamma * max_next_action_value)))

            # set next state as current state & repeat
            state = next_state
            steps += 1
            episode_steps += 1

    # extract the greedy policy from the learned Q table
    policy = extract_policy(Q)

    return policy, Q, steps

In [52]:
# Train twice on the same environment with different episode budgets and
# print the resulting one-hot policy plus step statistics for each run.
# NOTE: `eps` is the number of episodes here, not the exploration rate epsilon.
eps = 500
print('Num episodes: ', eps)
# `policy` / `Q` / `steps` hold the 500-episode result and are used by later cells.
policy, Q, steps = q_learning(env=taxi_env, episodes=eps)
print(policy)
print('Steps: ', steps)
print('Average steps per episode: ', steps/eps)

print('------------------------------------------')
eps = 5000
print('Num episodes: ', eps)
# `policy2` / `Q2` / `steps2` hold the 5000-episode result.
policy2, Q2, steps2 = q_learning(env=taxi_env, episodes=eps)
print(policy2)
print('Steps: ', steps2)
print('Average steps per episode: ', steps2/eps)

Num episodes:  500
[[1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1. 0.]
 ...
 [0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0. 0.]]
Steps:  24782
Average steps per episode:  49.564
------------------------------------------
Num episodes:  5000
[[1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1. 0.]
 ...
 [0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0. 0.]]
Steps:  91162
Average steps per episode:  18.2324


In [53]:
from IPython.display import clear_output
from time import sleep

# run the game once under the current policy
# and animate game output
def run_episode(policy, max_steps=200):
    '''
    Play a single episode on the global `taxi_env` by following `policy`, then replay the
    captured text frames as an animation in the cell output.

    Inputs:
        policy: array indexed by state; each row must contain a 1 at the index of the action
            to take in that state (the one-hot format returned by q_learning).
        max_steps: maximum number of steps to play before giving up on the episode. This guards
            against a policy that cycles between states and never reaches a terminal state,
            which would otherwise loop forever. Pass None to remove the limit.
            NOTE(review): 200 is meant to mirror the step limit Gym normally applies to
            Taxi-v3 — confirm for the installed gym version.

    Outputs:
        A tuple (total_reward, penalties): the reward summed over the episode with each step's
        reward discounted by 0.9 ** step_index, and the number of steps whose reward was -10.
    '''
    gamma = 0.9
    state = taxi_env.reset() # start the episode from the state the environment resets to
    total_reward = 0
    penalties = 0
    step = 0
    frames = [] # for rendering animation of output
    done = False

    while not done and (max_steps is None or step < max_steps):
        # the policy row is one-hot, so the position of the 1 is the chosen action
        action = policy[state].tolist().index(1)
        next_state, reward, done, _ = taxi_env.step(action) # move to the next state

        # a reward of -10 is counted as a penalty
        # (presumably an illegal pickup/drop-off in Taxi — verify against the env docs)
        if reward == -10:
            penalties += 1

        total_reward += (gamma ** step * reward)
        step += 1

        # capture the text rendering now; it is replayed after the episode ends
        frames.append(taxi_env.render(mode='ansi'))
        state = next_state

    # replay: overwrite the cell output with each frame in turn
    for frame in frames:
        clear_output(wait=True)
        print(frame)
        sleep(.1)

    return total_reward, penalties

In [50]:
# Testing
# Manual sanity check of the environment: reset it, show the grid, take
# action 1 once, and show the grid again (the second render labels the
# move "(North)" in the captured output).
taxi_env.reset()
taxi_env.render()
taxi_env.step(1)
taxi_env.render()

+---------+
|R: | : :[34;1mG[0m|
| : |[43m [0m: : |
| : : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+

+---------+
|R: |[43m [0m: :[34;1mG[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (North)


In [54]:
# Animate one episode under `policy2` (the policy trained for 5000 episodes);
# the cell's value is the returned (total_reward, penalties) tuple.
run_episode(policy2)

+---------+
|[35m[34;1m[43mR[0m[0m[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Dropoff)



(-3.1369622635116983, 0)

In [55]:
# run 5 simulations with `policy` (trained for 500 episodes) and calculate average metrics
# NOTE: run_episode clears the cell output while animating, so the
# 'Simulation # ' lines printed below do not survive in the final output.

all_rewards = []
all_penalties = []

for t in range(5):
    print('Simulation # ', t)
    # each call plays and animates one full episode
    total_reward, penalties = run_episode(policy)
    all_rewards.append(total_reward)
    all_penalties.append(penalties)

# averages over the 5 episodes: discounted reward and count of -10 rewards
print('Avg total reward: ', np.mean(all_rewards))
print('Avg # penalties: ', np.mean(all_penalties))

+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[35m[34;1m[43mY[0m[0m[0m| : |B: |
+---------+
  (Dropoff)

Avg total reward:  -1.6419685837389388
Avg # penalties:  0.0


In [56]:
# run 5 simulations with `policy2` (trained for 5000 episodes) and calculate average metrics
# NOTE: this cell reuses (and overwrites) the `all_rewards` / `all_penalties`
# globals from the previous evaluation cell.

all_rewards = []
all_penalties = []

for t in range(5):
    print('Simulation # ', t)
    # each call plays and animates one full episode
    total_reward, penalties = run_episode(policy2)
    all_rewards.append(total_reward)
    all_penalties.append(penalties)

# averages over the 5 episodes: discounted reward and count of -10 rewards
print('Avg total reward: ', np.mean(all_rewards))
print('Avg # penalties: ', np.mean(all_penalties))

+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[35m[34;1m[43mB[0m[0m[0m: |
+---------+
  (Dropoff)

Avg total reward:  -1.9997126632815387
Avg # penalties:  0.0
