# Solving the Markov Decision Process

In [1]:
import gym
import numpy as np

In [2]:
env_name = 'Taxi-v2'
env = gym.make(env_name)

In [3]:
n_states = env.env.nS
n_actions = env.env.nA
print(f'{env_name} has {n_states} states and {n_actions} possible actions.')

Taxi-v2 has 500 states and 6 possible actions.


### Q-Table

In [4]:
Q = np.zeros(shape=[n_states, n_actions])
gamma = 0.5  # Discount Factor

In [5]:
print('=== BEFORE LEARNING ===')
env.render()

episodes = 2000
total_reward = 0

# Game Loop
for episode in range(1, episodes+1):
    state = env.reset()
    done = False
    rewards = 0
    while not done:
        # Choose action from table
        action = np.argmax(Q[state])
        # Transition to a new state based on the action
        new_state, reward, done, _ = env.step(action)
        # Update Q table
        Q[state, action] += reward + gamma * np.max(Q[new_state])
        rewards += reward
        state = new_state
    # Update total rewards & log results
    total_reward += rewards
    # Every 500 steps
    if episode % 500 == 0 or episode == episodes:
        print(f'Episode: {episode:,}\tRewards: {rewards:,}'
              f'\tTotal rewards: {total_reward:,}')

print('\n=== AFTER LEARNING ===')        
env.render()

=== BEFORE LEARNING ===
+---------+
|R: | : :G|
| : : : : |
| : : : : |
| | : |[43m [0m: |
|[34;1mY[0m| : |[35mB[0m: |
+---------+

Episode: 500	Rewards: 9	Total rewards: -101,047
Episode: 1,000	Rewards: -12	Total rewards: -98,826
Episode: 1,500	Rewards: 8	Total rewards: -94,828
Episode: 2,000	Rewards: 7	Total rewards: -90,642

=== AFTER LEARNING ===
+---------+
|R: | : :G|
| : : : : |
| : : : : |
| | : | : |
|[35m[42mY[0m[0m| : |B: |
+---------+
  (Dropoff)


### Release resourses

In [6]:
env.close()