# A Simple Q Learner

In [1]:
import sys

import gym
import numpy as np

In [2]:
env = gym.make('Taxi-v2')

[2017-11-20 18:17:08,802] Making new env: Taxi-v2


In [3]:
n_states = env.observation_space.n
n_actions = env.action_space.n
print(f'States: {n_states:,}\tActions: {n_actions:,}')

States: 500	Actions: 6


In [4]:
# Initializing the Q-matrix
Q = np.zeros(shape=[n_states, n_actions])
# Learning rate
lr = 0.1
# Discount factor
y = 0.5

In [5]:
# how many episodes
episodes = 10000
# number of times an agent performs a transition
transitions = []
# all total rewards
rewards = []

for episode in range(episodes):
    done = False
    state = env.reset()
    n_transition = 0
    total_reward = 0
    while not done:
        n_transition += 1
        action = np.argmax(Q[state])
        state1, reward, done, _ = env.step(action)
        # Update Q value
        Q[state, action] += lr * (reward + y * np.max(Q[state1]) - Q[state, action])
        total_reward += reward
    # Record all transitions and rewards collected
    transitions.append(n_transition)
    rewards.append(total_reward)
    sys.stdout.write(f'\rEpisode: {episode+1:,}\tTotal reward: {total_reward:,}\t'
                     f'Transitions: {n_transition:,}')
    sys.stdout.flush()

Episode: 10,000	Total reward: -200Transitions: 200

In [6]:
# maximum transitions & reward
max_transition = max(transitions)
max_reward = max(rewards)
# minimum transitions & reward
min_transition = min(transitions)
min_reward = min(rewards)
print(f'Maximum »»» Transitions: {max_transition:,}\tReward: {max_reward:,}')
print(f'Minimum »»» Transitions: {min_transition:,}\tReward: {min_reward:,}')

Maximum »»» Transitions: 200	Reward: -200
Minimum »»» Transitions: 200	Reward: -236


In [7]:
# Reward @ maximum transition
reward_max_trans = rewards[transitions.index(max_transition)]
# Reward @ minimum transition
reward_min_trans = rewards[transitions.index(min_transition)]
print(f'Reward at max transition: {reward_max_trans:,}')
print(f'Reward at min transition: {reward_min_trans:,}')

Reward at max transition: -218
Reward at min transition: -218


In [8]:
# Transition @ maximum reward
trans_max_reward = transitions[rewards.index(max_reward)]
# Transition @ minimum reward
trans_min_reward = transitions[rewards.index(min_reward)]
print(f'Transition at max reward: {trans_max_reward:,}')
print(f'Transition at min reward: {trans_min_reward:,}')

Transition at max reward: 200
Transition at min reward: 200
