In [82]:
import numpy as np
import gym
import random
import time
from tqdm import tqdm_notebook
from IPython.display import clear_output

In [83]:
env = gym.make("FrozenLake-v0")

In [84]:
dir(env) #.observation_space

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__enter__',
 '__eq__',
 '__exit__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_elapsed_steps',
 '_max_episode_steps',
 'action_space',
 'class_name',
 'close',
 'compute_reward',
 'env',
 'metadata',
 'observation_space',
 'render',
 'reset',
 'reward_range',
 'seed',
 'spec',
 'step',
 'unwrapped']

In [85]:
dir(env.action_space)

['__class__',
 '__contains__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_np_random',
 'contains',
 'dtype',
 'from_jsonable',
 'n',
 'np_random',
 'sample',
 'seed',
 'shape',
 'to_jsonable']

In [86]:
action_space_size = env.action_space.n
state_space_size = env.observation_space.n

q_table = np.zeros((state_space_size, action_space_size))

In [87]:
num_episodes = 10000
max_steps_per_episode = 100

learning_rate = 0.1
discount_rate = 0.99

exploration_rate = 1
max_exploration_rate = 1
min_exploration_rate = 0.01
exploration_decay_rate = 0.001

In [88]:
rewards_all_episodes = []
for episode in tqdm_notebook(range(num_episodes)):
    # print("####################################")
    state = env.reset()

    rewards_current_episode = 0
    for step in range(max_steps_per_episode):
        # Exploration-exploitation trade-off
        exploration_rate_threshold = random.uniform(0,1)
        if exploration_rate_threshold > exploration_rate:
            action = np.argmax(q_table[state, :])
        else:
            action = env.action_space.sample()

        # Take new action
        observation, reward, done, info = env.step(action)
        new_state = observation

        # Update Q-table
        q_table[state, action] = (1 - learning_rate) * q_table[state, action] + \
        learning_rate * (reward + discount_rate * np.max(q_table[new_state, :]))

        # Set new state
        state = new_state
        # Add new reward        
        rewards_current_episode+=reward

        # print(step)
        # print(q_table)

        if done:
            break

    # Add current episode reward to total rewards list
    rewards_all_episodes.append(rewards_current_episode)

    # Exploration rate decay   
    exploration_rate = min_exploration_rate + \
    (max_exploration_rate - min_exploration_rate) * np.exp(-exploration_decay_rate*episode)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


  0%|          | 0/10000 [00:00<?, ?it/s]

In [89]:
# Calculate and print the average reward per thousand episodes
rewards_per_thousand_episodes = np.split(np.array(rewards_all_episodes),num_episodes/1000)
count = 1000

print("********Average reward per thousand episodes********\n")
for r in rewards_per_thousand_episodes:
    print(count, ": ", str(sum(r/1000)))
    count += 1000

********Average reward per thousand episodes********

1000 :  0.04300000000000003
2000 :  0.22000000000000017
3000 :  0.4080000000000003
4000 :  0.5590000000000004
5000 :  0.6390000000000005
6000 :  0.7030000000000005
7000 :  0.6740000000000005
8000 :  0.6730000000000005
9000 :  0.6820000000000005
10000 :  0.6990000000000005


In [98]:
for episode in range(3):
    state = env.reset()
    done = False
    print("EPISODE:", episode+1)
    time.sleep(1)

    for step in range(max_steps_per_episode):
        clear_output(wait=True)
        env.render()
        time.sleep(0.3)

        action = np.argmax(q_table[state, :])
        observation, reward, done, info = env.step(action)
        state = observation

        if done:
            clear_output(wait=True)
            env.render()

            if reward:
                print("Reached the goal")
                time.sleep(1)
            else:
                print("Fell through a hole")
                time.sleep(1)
            break


  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
Reached the goal


In [99]:
env.close()