In [7]:
import gym
import random
import numpy as np

# Build the Taxi-v3 environment. `.env` takes the inner environment of the
# object returned by gym.make (presumably bypassing gym's wrappers, such as an
# episode step limit — TODO confirm for the installed gym version).
env = gym.make("Taxi-v3").env
# Draw the current grid so the starting layout is visible below the cell.
env.render()

+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : : : |
| |[43m [0m: | : |
|Y| : |[34;1mB[0m: |
+---------+



### six actions:
- move south
- move east
- move north
- move west
- pickup a passenger
- drop off a passenger


### rewards and penalties:
- successful drop-off (+20)
- every step taken (-1)
- picking or dropping off at an illegal location (-10)

In [9]:
# Inspect the environment's current state attribute (shown as the cell output).
env.s

332

Initial state: taxi at (0, 1), passenger at pickup location 2, and destination at location 0.

In [14]:
# Encode the hand-picked scenario into a single state value. Per the note above
# this cell, the arguments are: taxi position (0, 1), passenger at pickup
# location 2, destination at location 0.
init_state = env.encode(0, 1, 2, 0)
# Overwrite the environment's current state with it, then redraw the grid.
env.s = init_state
env.render()

+---------+
|[35mR[0m:[43m [0m| : :G|
| : | : : |
| : : : : |
| | : | : |
|[34;1mY[0m| : |B: |
+---------+



In [25]:
# Show the encoded value of the hand-picked state.
init_state

28

In [15]:
# Look up the environment's transition table for the hand-picked state; the
# cell output shows one entry per action index.
env.P[init_state]

{0: [(1.0, 128, -1, False)],
 1: [(1.0, 28, -1, False)],
 2: [(1.0, 28, -1, False)],
 3: [(1.0, 8, -1, False)],
 4: [(1.0, 28, -10, False)],
 5: [(1.0, 28, -10, False)]}

each row corresponds to a potential action
row 0 - move south
row 1 - move north
row 2 - move east
row 3 - move west
row 4 - pickup
row 5 - drop off

values in a row:
[(probability, next state that results from the action, reward, done — True only after a successful drop-off)]

In [18]:
# Q-table: one row per state and one column per action, initialised to zero.
q_table = np.zeros((env.observation_space.n, env.action_space.n))
q_table.shape

(500, 6)

In [73]:
# --- Q-learning hyperparameters ---
learning_rate = 0.1    # step size applied to every Q-value update
discount_factor = 0.9  # weight given to the estimated future return
exploration = 0.1      # probability of taking a random action (epsilon)
epochs = 10000         # number of training episodes

# Train the tabular Q-function in place: each episode starts from env.reset()
# and runs until the environment reports `done`.
for epoch in range(epochs):
    state = env.reset()
    done = False

    while not done:
        # Epsilon-greedy action selection: explore with probability
        # `exploration`, otherwise exploit the current Q estimates.
        if random.random() < exploration:
            action = env.action_space.sample()  # random action
        else:
            action = np.argmax(q_table[state])  # action with the highest Q-value

        next_state, reward, done, info = env.step(action)

        # TD target. A terminal transition has no future return, so do not
        # bootstrap from next_state once the episode has ended (previously this
        # was only correct because terminal rows happened to stay at zero).
        if done:
            td_target = reward
        else:
            td_target = reward + discount_factor * np.max(q_table[next_state])

        # Blend the old estimate with the target.
        prev_q = q_table[state, action]
        q_table[state, action] = (1 - learning_rate) * prev_q + learning_rate * td_target

        state = next_state

In [74]:
# Learned Q-values (one per action) for the hand-picked state.
q_table[init_state]

array([ 0.4603532 , -0.58615316, -0.58584163,  0.45968475, -9.58629969,
       -9.58601872])

In [75]:
from IPython.display import clear_output
import time

# Safety cap on steps per trip. The policy below is purely greedy, so a Q-table
# with a cycle would otherwise keep the `while` loop running forever.
# (200 is assumed to be a generous bound for this grid — adjust if needed.)
max_steps_per_trip = 200

all_steps = 0          # total steps taken across all trips
unfinished_trips = []  # trips that hit the step cap without finishing

# Replay the learned policy greedily, animating each step in the cell output.
for trip in range(1, 200):  # trips are numbered 1..199
    state = env.reset()

    done = False
    trip_len = 0

    while not done and trip_len < max_steps_per_trip:
        action = np.argmax(q_table[state])  # always exploit; no exploration here
        next_state, reward, done, info = env.step(action)
        state = next_state

        # Count the step before printing so the display includes the step
        # that has just been rendered.
        trip_len += 1
        all_steps += 1

        clear_output(wait=True)
        print(f'Trip number: {trip}. Step: {trip_len}. All steps: {all_steps}')
        print(env.render(mode="ansi"))
        time.sleep(0.001)

    if not done:
        unfinished_trips.append(trip)

# Report capped trips once at the end; a message printed inside the loop would
# be wiped by the next clear_output call.
if unfinished_trips:
    print(f'Trips stopped after {max_steps_per_trip} steps without finishing: {unfinished_trips}')

Trip number: 199. Step: 9. All steps: 2700
+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[35m[34;1m[43mB[0m[0m[0m: |
+---------+
  (Dropoff)

