# Solving the Markov Decision Process

In [1]:
import sys  # logging purpose

import gym
import numpy as np

In [2]:
env_name = 'Taxi-v2'
env = gym.make(env_name)

In [3]:
n_states = env.env.nS
n_actions = env.env.nA
print(f'{env_name} has {n_states} states and {n_actions} possible actions.')

Taxi-v2 has 500 states and 6 possible actions.


### Q-Table

In [4]:
Q = np.zeros(shape=[n_states, n_actions])
gamma = 0.5  # Discount Factor

In [5]:
print('=== BEFORE LEARNING ===')
env.render()

episodes = 2000
total_reward = 0

# Game Loop
for episode in range(1, episodes+1):
    state = env.reset()
    done = False
    rewards = 0
    while not done:
        # Choose action from table
        action = np.argmax(Q[state])
        # Transition to a new state based on the action
        new_state, reward, done, _ = env.step(action)
        # Update Q table
        Q[state, action] += reward + gamma * np.max(Q[new_state])
        rewards += reward
        state = new_state
    # Update total rewards & log results
    total_reward += rewards
    # Every 500 steps
    if episode % 500 == 0 or episode == episodes:
        print(f'Episode: {episode:,}\tRewards: {rewards:,}'
              f'\tTotal rewards: {total_reward:,}')

print('\n=== AFTER LEARNING ===')        
env.render()

=== BEFORE LEARNING ===
+---------+
|[35mR[0m: | : :[34;1mG[0m|
| : : : : |
| : : : :[43m [0m|
| | : | : |
|Y| : |B: |
+---------+

Episode: 500	Rewards: 11	Total rewards: -98,985
Episode: 1,000	Rewards: 11	Total rewards: -96,872
Episode: 1,500	Rewards: 8	Total rewards: -92,894
Episode: 2,000	Rewards: 10	Total rewards: -88,826
Episode: 2,500	Rewards: 7	Total rewards: -84,611
Episode: 3,000	Rewards: 13	Total rewards: -80,373
Episode: 3,500	Rewards: 7	Total rewards: -76,179
Episode: 4,000	Rewards: 11	Total rewards: -71,867
Episode: 4,500	Rewards: 6	Total rewards: -67,765
Episode: 5,000	Rewards: 11	Total rewards: -63,583
=== AFTER LEARNING ===
+---------+
|R: | : :G|
| : : : : |
| : : : : |
| | : | : |
|Y| : |[35m[42mB[0m[0m: |
+---------+
  (Dropoff)


In [6]:
env.close()