In [1]:
import numpy as np
import gym

# --- Configuration ----------------------------------------------------------
SEED = 0                # seeds NumPy's global RNG and the environment's RNG
N_EPISODES = 1000000    # upper bound on the number of training episodes
EVAL_EVERY = 10000      # run a greedy evaluation every this many episodes
EVAL_EPISODES = 100     # number of greedy rollouts averaged per evaluation
TARGET_REWARD = 0.83    # training stops once the average greedy reward reaches this
EPSILON_MIN = .1        # exploration probability never drops below this floor
EPSILON_STEP = 2e-7     # amount subtracted from epsilon on each exploratory action

epsilon = 1     # initial probability of taking a random action
alpha = .005    # learning rate of the Q-table update
gamma = .94     # discount factor applied to the next state's best Q-value


def evaluate_greedy(env, Q, n_episodes):
    """Return the mean total reward of `n_episodes` greedy rollouts.

    Each rollout resets `env` and repeatedly takes `argmax(Q[obs])` until the
    environment reports `done`. No exploration is used and `Q` is not modified.

    Parameters
    ----------
    env : environment exposing `reset()` and `step(action)`; `step` must
        return a 4-tuple `(obs, reward, done, info)`.
    Q : array indexed as `Q[state]`, giving one value per action.
    n_episodes : number of rollouts to average over.

    Returns
    -------
    The summed reward of all rollouts divided by `n_episodes`.
    """
    total = 0
    for _ in range(n_episodes):
        obs = env.reset()
        done = False
        while not done:
            obs, reward, done, _info = env.step(np.argmax(Q[obs]))
            total += reward
    return total / n_episodes


env = gym.make("FrozenLake8x8-v0")

# Seed every source of randomness used below so that a fresh
# "Restart & Run All" reproduces the same training run.
np.random.seed(SEED)
env.seed(SEED)

Q = np.zeros([env.observation_space.n, env.action_space.n])  # Q-table starts at zero
for episode in range(1, N_EPISODES + 1):
    done = False
    obs_old = env.reset()
    # Original author's note: `done` becomes True when the agent dies or
    # reaches the frisbee.
    while not done:
        if np.random.rand() < epsilon:
            # Explore: uniform random action, drawn from the seeded NumPy RNG.
            action = np.random.randint(env.action_space.n)
            # Decay epsilon down to its floor. The decay is applied only on
            # exploratory steps, so it slows down as epsilon gets smaller.
            epsilon = max(EPSILON_MIN, epsilon - EPSILON_STEP)
        else:
            # Exploit: best action according to the current Q-table.
            action = np.argmax(Q[obs_old])
        obs_new, reward, done, _ = env.step(action)  # apply the action
        # Q-learning (Bellman) update towards reward + discounted best next value.
        td_target = reward + gamma * np.max(Q[obs_new])
        Q[obs_old, action] += alpha * (td_target - Q[obs_old, action])
        obs_old = obs_new

    if episode % EVAL_EVERY == 0:  # periodic progress report
        rew = evaluate_greedy(env, Q, EVAL_EPISODES)
        print("Episode {} epsilon: {}".format(episode, epsilon))
        print('Episode {} reward: {}'.format(episode,rew))
        print()

        if rew >= TARGET_REWARD:
            print("FIM!!")
            break

Episode 10000 epsilon: 0.9381113999982204
Episode 10000 reward: 0.08

Episode 20000 epsilon: 0.8752153999964117
Episode 20000 reward: 0.35

Episode 30000 epsilon: 0.8141025999946544
Episode 30000 reward: 0.4

Episode 40000 epsilon: 0.7532805999929054
Episode 40000 reward: 0.31

Episode 50000 epsilon: 0.6946587999912197
Episode 50000 reward: 0.44

Episode 60000 epsilon: 0.6380151999895909
Episode 60000 reward: 0.58

Episode 70000 epsilon: 0.5829909999880086
Episode 70000 reward: 0.58

Episode 80000 epsilon: 0.5313337999865232
Episode 80000 reward: 0.73

Episode 90000 epsilon: 0.48334879998514335
Episode 90000 reward: 0.61

Episode 100000 epsilon: 0.43824439998384634
Episode 100000 reward: 0.82

Episode 110000 epsilon: 0.39627059998263936
Episode 110000 reward: 0.8

Episode 120000 epsilon: 0.3565647999814976
Episode 120000 reward: 0.74

Episode 130000 epsilon: 0.3197797999804398
Episode 130000 reward: 0.68

Episode 140000 epsilon: 0.28672019997948917
Episode 140000 reward: 0.83

FIM!!


In [6]:
# Replay one episode with the greedy policy read from the Q-table,
# rendering the environment after every step.
rew_total, done = 0., False
obs = env.reset()
while not done:
    action = np.argmax(Q[obs])  # highest-valued action for the current state
    obs, rew, done, info = env.step(action)
    env.render()
    rew_total += rew  # accumulate the episode's total reward

print("Reward:", rew_total)

  (Up)
[41mS[0mFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Up)
[41mS[0mFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Up)
[41mS[0mFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Up)
[41mS[0mFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Up)
S[41mF[0mFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Right)
SFFFFFFF
F[41mF[0mFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Right)
SFFFFFFF
FF[41mF[0mFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Up)
SF[41mF[0mFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Right)
SFF[41mF[0mFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Right)
SFFFFFFF
FFF[41mF[0mFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Up)
SFFFFFFF
FFFF[41mF[0mFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Right)
SFFFFFFF
F