In [None]:
# initialize the game environment

import numpy as np
import gym
import random
# Create the FrozenLake environment; is_slippery=False turns off the random slipping.
env = gym.make('FrozenLake-v0',is_slippery=False)

# Seed every source of randomness this notebook draws from so that a
# Restart & Run All reproduces the same Q-tables: `random` drives the
# epsilon-greedy draw, the action space drives `env.action_space.sample()`,
# and `env.seed` covers the environment's own generator.
SEED = 42
random.seed(SEED)
env.seed(SEED)
env.action_space.seed(SEED)

In [None]:
# Q-table: one row per discrete state, one column per discrete action,
# with every entry starting at zero.

state_size, action_size = env.observation_space.n, env.action_space.n
qtable = np.zeros(shape=(state_size, action_size))

In [None]:
# Q-learning driven by a purely random behaviour policy (no exploitation yet).
# Every episode starts from a fresh reset and ends as soon as the environment
# reports `done`, or after `max_steps` transitions, whichever comes first.

learning_rate = 0.1   # step size of the Q-update
gamma = 0.9           # discount applied to the next state's best value
max_steps = 50        # cap on transitions per episode

episode_rewards = []
for i in range(10000):
    state = env.reset()
    total_rewards = 0
    for step in range(max_steps):
        action = env.action_space.sample()
        new_state,reward,done,info = env.step(action)
        # Move Q(state, action) towards reward + discounted best value of new_state.
        td_target = reward+gamma*np.max(qtable[new_state,:])
        qtable[state,action] += learning_rate*(td_target-qtable[state,action])
        state = new_state
        total_rewards += reward
        # Stop once the episode is over; without this the remaining steps
        # keep acting (and updating) inside an already finished episode.
        if done:
            break
    episode_rewards.append(total_rewards)
print(qtable)

[[0.53144093 0.59048992 0.59048992 0.53144093]
 [0.53144093 0.         0.65609992 0.59048992]
 [0.59048992 0.72899991 0.59048991 0.65609991]
 [0.6560999  0.         0.5904899  0.5904899 ]
 [0.59048992 0.65609991 0.         0.53144093]
 [0.         0.         0.         0.        ]
 [0.         0.80999991 0.         0.65609986]
 [0.         0.         0.         0.        ]
 [0.6560999  0.         0.72899991 0.5904899 ]
 [0.65609986 0.80999914 0.80999991 0.        ]
 [0.72899979 0.89999992 0.         0.72899978]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.         0.809997   0.89999933 0.72899743]
 [0.80994972 0.89999856 0.99999996 0.8099952 ]
 [0.         0.         0.         0.        ]]


In [None]:
# Exploration and exploitation (epsilon-greedy Q-learning).
# NOTE: this cell keeps updating the existing `qtable`; it is not re-initialised here.

learning_rate = 0.9   # step size of the Q-update
gamma = 0.9           # discount applied to the next state's best value
max_steps = 50        # cap on transitions per episode

episode_rewards = []
epsilon = 1
max_epsilon =1
min_epsilon = 0.01
decay_rate = 0.005
for episode in range(1000):
    state = env.reset()
    total_rewards = 0
    for step in range(max_steps):
        exp_exp_tradeoff = random.uniform(0,1)
        #Exploitation: follow the best known action for this state
        if exp_exp_tradeoff > epsilon:
            action = np.argmax(qtable[state,:])
        else:
            #Exploration: pick a random action
            action = env.action_space.sample()
        new_state,reward,done,info=env.step(action)
        # Move Q(state, action) towards reward + discounted best value of new_state.
        td_target = reward+gamma*np.max(qtable[new_state,:])
        qtable[state,action] += learning_rate*(td_target-qtable[state,action])
        state = new_state
        total_rewards += reward
        # Stop once the episode is over instead of stepping a finished episode.
        if done:
            break
    episode_rewards.append(total_rewards)
    # Exponential decay from max_epsilon towards min_epsilon. The exponent must be
    # negative: with a positive exponent epsilon grows above 1 and the
    # exploitation branch above is never taken.
    epsilon = min_epsilon+(max_epsilon-min_epsilon)*np.exp(-decay_rate*episode)
print(qtable)

[[0.531441 0.59049  0.59049  0.531441]
 [0.531441 0.       0.6561   0.59049 ]
 [0.59049  0.729    0.59049  0.6561  ]
 [0.6561   0.       0.59049  0.59049 ]
 [0.59049  0.6561   0.       0.531441]
 [0.       0.       0.       0.      ]
 [0.       0.81     0.       0.6561  ]
 [0.       0.       0.       0.      ]
 [0.6561   0.       0.729    0.59049 ]
 [0.6561   0.81     0.81     0.      ]
 [0.729    0.9      0.       0.729   ]
 [0.       0.       0.       0.      ]
 [0.       0.       0.       0.      ]
 [0.       0.81     0.9      0.729   ]
 [0.81     0.9      1.       0.81    ]
 [0.       0.       0.       0.      ]]


In [None]:
# Once we've obtained the Q-table, follow it greedily (argmax per state) to read
# off the steps the agent takes from the start state until the episode ends.
max_steps = 50   # cap on transitions so a non-terminating greedy policy cannot loop forever
for episode in range(1):
    # A single reset per episode is enough; it returns the start state.
    state=env.reset()
    print("-----------------------")
    print("Episode",episode)
    for step in range(max_steps):
        env.render()
        action=np.argmax(qtable[state,:])
        print(action)
        new_state,reward,done,info=env.step(action)
        if done:
            print("Number of Steps",step+1)
            break
        state=new_state
    else:
        # Reached only when the loop ran out of steps without `done` being reported.
        print("Episode did not finish within",max_steps,"steps")
env.close()

-----------------------
Episode 0

[41mS[0mFFF
FHFH
FFFH
HFFG
1
  (Down)
SFFF
[41mF[0mHFH
FFFH
HFFG
1
  (Down)
SFFF
FHFH
[41mF[0mFFH
HFFG
2
  (Right)
SFFF
FHFH
F[41mF[0mFH
HFFG
1
  (Down)
SFFF
FHFH
FFFH
H[41mF[0mFG
2
  (Right)
SFFF
FHFH
FFFH
HF[41mF[0mG
2
Number of Steps 6
