In [41]:
import gym
import random
import numpy as np
import time
from gym.envs.registration import register
from IPython.display import clear_output

In [42]:
# Register a deterministic (non-slippery) 4x4 FrozenLake variant.
# Re-running this cell may attempt to register the same id a second time;
# gym signals registry problems with gym.error.Error, so only that is
# tolerated here. Anything else (typos, bad kwargs, Ctrl-C) still surfaces
# instead of being silently swallowed by a bare `except`.
try:
  register(
      id='FrozenLakeNoSlip-v0',
      entry_point='gym.envs.toy_text:FrozenLakeEnv',
      kwargs={'map_name' : '4x4','is_slippery':False},
      max_episode_steps=100,
      reward_threshold=0.78, # optimum = .8196
  )
except gym.error.Error:
  # Environment id already registered in this kernel session.
  pass

env_name = "FrozenLakeNoSlip-v0"
env = gym.make(env_name)
print("observation space:",env.observation_space)
print("Action space", env.action_space)
# Last expression of the cell: displays the action-space class.
type(env.action_space)

observation space: Discrete(16)
Action space Discrete(4)


gym.spaces.discrete.Discrete

In [43]:
class Agent():
  """Baseline agent that picks actions uniformly at random.

  Discrete action spaces are sampled as an integer in ``[0, n)``. Any other
  action space is treated as a bounded continuous space exposing ``low``,
  ``high`` and ``shape`` attributes, and is sampled uniformly inside those
  bounds.
  """

  def __init__(self, env):
    """Inspect ``env.action_space`` and cache what is needed for sampling.

    Args:
      env: environment object exposing an ``action_space`` attribute.
    """
    # isinstance (rather than an exact type comparison) also accepts
    # subclasses of Discrete.
    self.is_discrete = isinstance(env.action_space,
                                  gym.spaces.discrete.Discrete)

    if self.is_discrete:
      self.action_size = env.action_space.n
      print("Action size:", self.action_size)
    else:
      self.action_low = env.action_space.low
      self.action_high = env.action_space.high
      self.action_shape = env.action_space.shape
      # Was `pring(...)`, which raised NameError for every non-discrete env.
      print("Action range:", self.action_low, self.action_high)

  def get_action(self, state):
    """Return a uniformly random action; ``state`` is ignored.

    Returns:
      An int in ``[0, action_size)`` for discrete spaces, otherwise a numpy
      array of shape ``action_shape`` drawn between ``action_low`` and
      ``action_high``.
    """
    if self.is_discrete:
      return random.randrange(self.action_size)
    return np.random.uniform(self.action_low,
                             self.action_high,
                             self.action_shape)

In [44]:
class QAgent(Agent):
  """Tabular Q-learning agent with epsilon-greedy exploration.

  Keeps a ``[state_size, action_size]`` table of action values, starts fully
  exploratory (``eps = 1.0``) and multiplies ``eps`` by 0.99 each time a
  terminal transition is trained on.
  """

  def __init__(self, env, discount_rate=0.97, learning_rate=0.01):
    """Set up hyperparameters and allocate the Q-table.

    Args:
      env: environment whose ``observation_space.n`` gives the state count.
      discount_rate: weight applied to the best next-state value.
      learning_rate: step size of each Q-table update.
    """
    super().__init__(env)
    self.state_size = env.observation_space.n
    print("State size:", self.state_size)

    self.learning_rate = learning_rate
    self.discount_rate = discount_rate
    self.eps = 1.0
    self.build_model()

  def build_model(self):
    """Create the Q-table filled with small random values in [0, 1e-4)."""
    table_shape = [self.state_size, self.action_size]
    self.q_table = np.random.random(table_shape) * 1e-4

  def get_action(self,state):
    """Return a random action with probability ``eps``, else the greedy one."""
    greedy_choice = np.argmax(self.q_table[state])
    # The random candidate is always drawn, before the exploration coin flip,
    # so the order of random-number consumption is fixed.
    random_choice = super().get_action(state)
    if random.random() < self.eps:
      return random_choice
    return greedy_choice

  def train(self, experience):
    """Apply one Q-learning update.

    Args:
      experience: tuple ``(state, action, next_state, reward, done)``.
    """
    state, action, next_state, reward, done = experience

    # Terminal transitions contribute no future value.
    if done:
      next_values = np.zeros([self.action_size])
    else:
      next_values = self.q_table[next_state]

    td_target = reward + self.discount_rate * np.max(next_values)
    td_error = td_target - self.q_table[state,action]
    self.q_table[state,action] += self.learning_rate * td_error

    # Decay exploration when a transition flagged done is seen.
    if done:
      self.eps *= 0.99

agent = QAgent(env)

Action size: 4
State size: 16


In [46]:
# Run 100 episodes, training the agent on every transition and redrawing the
# environment plus the Q-table after each step as a simple animation.
total_reward = 0
for ep in range(100):
  state, done = env.reset(), False
  while not done:
    action = agent.get_action(state)
    next_state, reward, done, info = env.step(action)
    agent.train((state, action, next_state, reward, done))
    total_reward += reward  # running sum across all episodes
    state = next_state

    # Per-step display; cleared on the next output so the cell animates.
    print("s:", state, "a:", action)
    print("Episode: {}, Total reward: {}, eps: {}".format(ep,total_reward,agent.eps))
    env.render()
    print(agent.q_table)
    time.sleep(0.01)
    clear_output(wait=True)

s: 15 a: 2
Episode: 99, Total reward: 23.0, eps: 0.13397967485796175
  (Right)
SFFF
FHFH
FFFH
HFF[41mG[0m
[[5.34807558e-05 6.23788597e-05 7.59241701e-05 6.83093877e-05]
 [4.44309764e-05 1.63302909e-06 8.25222045e-05 8.24286461e-05]
 [8.26220735e-05 9.83830263e-05 5.96644452e-06 2.75342553e-05]
 [1.44793362e-05 7.30713313e-06 1.51810832e-05 1.60634876e-05]
 [3.32765924e-05 7.43314561e-05 5.62036268e-05 3.45581327e-05]
 [7.17559051e-05 1.89918878e-05 6.68216476e-05 7.37204945e-05]
 [5.61371072e-05 9.76108108e-04 7.18073871e-05 1.61409355e-05]
 [9.89744784e-05 6.82512579e-05 7.84283607e-05 6.80111574e-06]
 [5.86680179e-05 5.55721286e-05 1.16126970e-04 2.37377455e-05]
 [7.87474042e-05 1.02721375e-04 1.04677870e-03 3.35784308e-05]
 [2.83187874e-05 2.14621364e-02 4.74720700e-05 6.28310081e-05]
 [8.38986139e-05 3.57242815e-05 8.17481368e-05 4.72744605e-05]
 [7.48207740e-06 4.37286260e-05 5.32596194e-05 6.09696504e-05]
 [4.39785602e-05 8.90330057e-05 3.60635648e-03 3.23661746e-05]
 [6.790689