# A Simple Q Learner

In [1]:
import sys

import gym
import numpy as np

In [2]:
env = gym.make('FrozenLake-v0')

[2017-11-20 19:01:25,746] Making new env: FrozenLake-v0


In [3]:
n_states = env.observation_space.n
n_actions = env.action_space.n
print(f'States: {n_states:,}\tActions: {n_actions:,}')

States: 16	Actions: 4


In [4]:
# Initializing the Q-matrix
Q = np.zeros(shape=[n_states, n_actions])
# Learning rate
lr = 0.1
# Discount factor
y = 0.5

In [5]:
# how many episodes
episodes = 10000
# number of times an agent performs a transition
transitions = []
# all total rewards
rewards = []

for episode in range(episodes):
    done = False
    state = env.reset()
    n_transition = 0
    total_reward = 0
    while not done:
        n_transition += 1
        action = np.argmax(Q[state])
        state1, reward, done, _ = env.step(action)
        # Update Q value
        Q[state, action] += lr * (reward + y * np.max(Q[state1]) - Q[state, action])
        # Update states
        state = state1
        total_reward += reward
    # Record all transitions and rewards collected
    transitions.append(n_transition)
    rewards.append(total_reward)
    sys.stdout.write(f'\rEpisode: {episode+1:,}\tTotal reward: {total_reward:,}\t'
                     f'Transitions: {n_transition:,}')
    sys.stdout.flush()

Episode: 10,000	Total reward: 0.0	Transitions: 34

In [6]:
env.render()

  (Left)
SFFF
FHFH
FFFH
[41mH[0mFFG


In [7]:
# maximum transitions & reward
max_transition = max(transitions)
max_reward = max(rewards)
# minimum transitions & reward
min_transition = min(transitions)
min_reward = min(rewards)
print(f'Maximum »»» Transitions: {max_transition:,}\tReward: {max_reward:,}')
print(f'Minimum »»» Transitions: {min_transition:,}\tReward: {min_reward:,}')

Maximum »»» Transitions: 100	Reward: 0.0
Minimum »»» Transitions: 3	Reward: 0.0


In [8]:
# Episode with max & min rewards
reward_max_episode = rewards.index(max_reward)
reward_min_episode = rewards.index(min_reward)
# Episode with max & min transitions
trans_max_episode = transitions.index(max_transition)
trans_min_episode = transitions.index(min_transition)
print(f'Episode with maximum »»» transition: {trans_max_episode:,}\t\t reward: {reward_max_episode:,}')
print(f'Episode with minimum »»» transition: {trans_min_episode:,}\t reward: {reward_min_episode:,}')

Episode with maximum »»» transition: 2,846		 reward: 0
Episode with minimum »»» transition: 0	 reward: 0


In [9]:
# Reward @ maximum transition
reward_max_trans = rewards[trans_max_episode]
# Reward @ minimum transition
reward_min_trans = rewards[trans_min_episode]
print(f'Reward at max transition: {reward_max_trans:,}')
print(f'Reward at min transition: {reward_min_trans:,}')

Reward at max transition: 0.0
Reward at min transition: 0.0


In [10]:
# Transition @ maximum reward
trans_max_reward = transitions[reward_max_episode]
# Transition @ minimum reward
trans_min_reward = transitions[reward_min_episode]
print(f'Transition at max reward: {trans_max_reward:,}')
print(f'Transition at min reward: {trans_min_reward:,}')

Transition at max reward: 3
Transition at min reward: 3
