# Solving the Markov Decision Process

In [1]:
import gym
import numpy as np

In [2]:
# Id of the Gym environment this notebook solves
env_name = 'Taxi-v2'
# Build the environment instance used by every later cell
env = gym.make(env_name)

In [3]:
# Size of the state and action sets, read from the public Gym space API
# instead of reaching through the wrapper for the inner env's nS/nA attributes.
n_states = env.observation_space.n
n_actions = env.action_space.n
print(f'{env_name} has {n_states} states and {n_actions} possible actions.')

Taxi-v2 has 500 states and 6 possible actions.


### Hyperparameters

In [4]:
# Q-table: one row per state, one column per action, every estimate starts at 0
Q = np.zeros((n_states, n_actions))

In [5]:
def run(episodes=1000, training=False, render=False, **kwargs):
    """Play `episodes` episodes with tabular Q-learning or greedy evaluation.

    When `training` is True the agent follows an epsilon-greedy policy and
    updates the Q-table after every step. When `training` is False the agent
    always takes the greedy action and the Q-table is left untouched.

    Args:
        episodes: Number of episodes to play.
        training: Explore and learn (True) or only evaluate (False).
        render: Call the environment's `render()` after every step (slow).
        **kwargs: Optional settings:
            gamma (float): Discount factor. Default 0.9.
            alpha (float): Learning rate. Default 0.01.
            epsilon (float): Probability of a random action while training.
                Default 0.1.
            logging (bool): Print progress lines. Default True.
            log_step (int): Print every `log_step` episodes. Default 500.
            env: Environment to use; defaults to the module-level `env`.
            q_table: Array indexed as [state, action]; defaults to the
                module-level `Q`. It is updated in place while training.

    Returns:
        The sum of the rewards collected over all episodes.
    """
    # Keyword arguments
    gamma = kwargs.get('gamma', 0.9)       # Discount Factor
    alpha = kwargs.get('alpha', 0.01)      # Learning Rate
    epsilon = kwargs.get('epsilon', 0.1)   # Exploration-Exploitation Dilemma
    logging = kwargs.get('logging', True)  # Show results
    log_step = kwargs.get('log_step', 500) # Log step
    # Only touch the notebook globals when the caller gave no override
    environment = kwargs['env'] if 'env' in kwargs else env
    q_table = kwargs['q_table'] if 'q_table' in kwargs else Q

    total_reward = 0
    # Game Loop
    for episode in range(1, episodes+1):
        state = environment.reset()
        done = False
        rewards = 0
        while not done:
            # While training, take a random action with probability epsilon.
            # rand() is uniform on [0, 1), so epsilon is a true probability.
            if training and np.random.rand() < epsilon:
                # Exploration: Take random actions
                action = environment.action_space.sample()
            else:
                # Exploitation: Choose action from table
                action = np.argmax(q_table[state])
            # Transition to a new state based on the action
            new_state, reward, done, _ = environment.step(action)
            if render:
                environment.render()
            if training:
                # Update Q table (evaluation runs must not change what was learned)
                # current_Qvalue += learning_rate * (reward + (discount * max(next_Qvalues)) - current_Qvalue)
                q_table[state, action] += alpha * (
                    reward + (gamma * np.max(q_table[new_state])) - q_table[state, action]
                )
            rewards += reward
            state = new_state
        # Update total rewards & log results
        total_reward += rewards
        if logging and (episode % log_step == 0 or episode == episodes):
            print(f'Episode: {episode:,}\tRewards: {rewards:,}'
                  f'\tTotal rewards: {total_reward:,}')
    print('\n')
    return total_reward

In [None]:
if __name__ == '__main__':
    # ---- Untrained agent: banner, then the board as it currently stands ----
    print('{0}\n\t\t\t{1}\n{0}'.format('='*80, 'BEFORE LEARNING'))
    env.render()

    # ---- Learning phase ----
    episodes = 5000
    total_reward = run(episodes=episodes, training=True)
    print(f'Total reward = {total_reward:,} for {episodes:,} episodes')

    # ---- Evaluation phase: shorter run with per-episode logging turned off ----
    episodes = 100
    total_reward = run(episodes=episodes, training=False, logging=False)
    print('{0}\n\t\t\t{1}\n{0}'.format('='*80, 'AFTER LEARNING'))
    print(f'Total reward = {total_reward:,} for {episodes:,} episodes')
    env.render()

			BEFORE LEARNING
+---------+
|[34;1mR[0m: | : :[35mG[0m|
| :[43m [0m: : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+



### Release resources

In [None]:
    # Close the environment now that the notebook is finished with it
    env.close()