In [2]:
import gym
import random
import numpy as np
import time
from gym.envs.registration import register
from IPython.display import clear_output

import matplotlib

In [3]:
# https://github.com/openai/gym/blob/master/gym/envs/toy_text/frozen_lake.py
# https://github.com/openai/gym/blob/master/gym/envs/__init__.py
# Register a deterministic variant of the 4x4 FrozenLake map. The id says
# "NoSlip", so `is_slippery` must be False; with True the registered
# environment would be configured exactly like the slippery default map.
try:
    register(
        id='FrozenLakeNoSlip-v0',
        entry_point='gym.envs.toy_text:FrozenLakeEnv',
        kwargs={'map_name': '4x4', 'is_slippery': False},
        max_episode_steps=100,
        reward_threshold=0.78, # optimum = .8196
    )
except gym.error.Error:
    # Re-running this cell tries to register the same id a second time.
    # Only that gym-level error is ignored; anything else (typo in the
    # entry point, bad kwargs, ...) is allowed to surface.
    pass

# Pick exactly one environment; the alternatives are kept for quick switching.
# env_name = "CartPole-v1"
# env_name = "MountainCar-v0"
# env_name = "MountainCarContinuous-v0"
# env_name = "Acrobot-v1"
# env_name = "Pendulum-v0"
# env_name = "FrozenLake-v0"
env_name = "FrozenLakeNoSlip-v0"
env = gym.make(env_name)
print("Observation space:", env.observation_space)
print("Action space:", env.action_space)
# Last expression of the cell: shown as the cell's output.
type(env.action_space)

Observation space: Discrete(16)
Action space: Discrete(4)


gym.spaces.discrete.Discrete

In [4]:
class Agent():
    """Baseline agent that picks actions uniformly at random.

    Works with both discrete action spaces (exposing ``n``) and continuous
    ones (exposing ``low``, ``high`` and ``shape``).
    """

    def __init__(self, env):
        """Inspect ``env.action_space`` and cache what is needed to sample.

        Args:
            env: environment object exposing an ``action_space`` attribute.
        """
        # isinstance instead of an exact type comparison, so subclasses of
        # Discrete are treated as discrete too rather than falling through
        # to the continuous branch and failing on `.low` / `.high`.
        self.is_discrete = isinstance(env.action_space,
                                      gym.spaces.discrete.Discrete)

        if self.is_discrete:
            self.action_size = env.action_space.n
            print("Action size:", self.action_size)
        else:
            self.action_low = env.action_space.low
            self.action_high = env.action_space.high
            self.action_shape = env.action_space.shape
            print("Action range:", self.action_low, self.action_high)

    def get_action(self, state):
        """Return a random action; ``state`` is accepted but not used.

        Returns:
            An integer in ``[0, action_size)`` for discrete spaces, otherwise
            an array of shape ``action_shape`` drawn uniformly between
            ``action_low`` and ``action_high``.
        """
        if self.is_discrete:
            action = random.choice(range(self.action_size))
        else:
            action = np.random.uniform(self.action_low,
                                       self.action_high,
                                       self.action_shape)
        return action

In [5]:
class QAgent(Agent):
    """Tabular Q-learning agent with an epsilon-greedy policy.

    Keeps one Q-value per (state, action) pair in ``q_table`` and shrinks the
    exploration rate ``eps`` each time a finished transition is trained on.
    """

    def __init__(self, env, discount_rate=0.99, learning_rate=0.01):
        """Set up hyper-parameters and the Q-table for ``env``.

        Args:
            env: environment whose ``observation_space`` exposes ``n``.
            discount_rate: multiplier applied to the best next-state value.
            learning_rate: step size applied to each Q-value correction.
        """
        super().__init__(env)
        self.state_size = env.observation_space.n
        print("State size:", self.state_size)

        self.learning_rate = learning_rate
        self.discount_rate = discount_rate
        self.eps = 1.0  # start fully exploratory
        self.build_model()

    def build_model(self):
        """Create the Q-table filled with small random values."""
        table_shape = [self.state_size, self.action_size]
        self.q_table = np.random.random(table_shape) * 1e-4

    def get_action(self, state):
        """Return a random action with probability ``eps``, else the greedy one."""
        greedy_choice = np.argmax(self.q_table[state])
        # The exploratory action is always sampled, even if it ends up unused.
        exploratory_choice = super().get_action(state)
        if random.random() < self.eps:
            return exploratory_choice
        return greedy_choice

    def train(self, experience):
        """Apply one Q-learning update.

        Args:
            experience: tuple ``(state, action, next_state, reward, done)``.
        """
        state, action, next_state, reward, done = experience

        next_values = self.q_table[next_state]
        if done:
            # No future value is bootstrapped from a finished transition.
            next_values = np.zeros([self.action_size])
        td_target = reward + self.discount_rate * np.max(next_values)

        td_error = td_target - self.q_table[state, action]
        self.q_table[state, action] += self.learning_rate * td_error

        if done:
            # Decay exploration once per finished episode.
            self.eps *= 0.99

agent = QAgent(env)

Action size: 4
State size: 16


In [6]:
# Train the agent for 1000 episodes, redrawing a live status view each step.
total_reward = 0
# Rolling window holding the return of each of the 10 most recent episodes
# (zero-filled until that many episodes have finished).
success_hist = [0] * 10
for ep in range(1000):
    state = env.reset()
    done = False
    episode_reward = 0
    while not done:
        action = agent.get_action(state)
        next_state, reward, done, info = env.step(action)
        agent.train((state,action,next_state,reward,done))
        state = next_state
        total_reward += reward
        episode_reward += reward

        if done:
            # Slide the window once per *episode*. Shifting it on every step
            # made the "success rate" describe the last 10 steps instead of
            # the last 10 episodes.
            # NOTE(review): reading the window mean as a success rate assumes
            # an episode return of 1 on success and 0 otherwise - confirm for
            # environments other than FrozenLake.
            success_hist.pop(0)
            success_hist.append(episode_reward)

        avg_reward = sum(success_hist)/len(success_hist)
        avg_success = avg_reward*100

        print("s:", state, "a:", action)
        print("Episode: {}, Total reward: {}, Average reward {}, Success rate: {}% eps: {}".format(ep,total_reward,avg_reward,avg_success,agent.eps))
        env.render()
        print(agent.q_table)
        # wait=True defers clearing until the next output arrives.
        clear_output(wait=True)

s: 5 a: 0
Episode: 999, Total reward: 0.0, Average reward 0.0, Success rate: 0.0% eps: 4.317124741065784e-05
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
[[4.45660481e-05 4.45819217e-05 4.45901104e-05 4.45898087e-05]
 [4.14636162e-05 4.37006715e-05 3.35727208e-05 4.41284702e-05]
 [4.39641711e-05 4.39683968e-05 4.39835631e-05 4.39823972e-05]
 [4.35732446e-05 4.33240392e-05 3.62409490e-05 4.36960933e-05]
 [4.18967870e-05 4.19476997e-05 4.20629128e-05 4.16837021e-05]
 [5.03941186e-05 2.82150298e-05 8.30284701e-05 6.30925352e-05]
 [2.72823309e-05 2.73174448e-05 2.12755223e-05 2.76287242e-05]
 [1.51251751e-05 1.57665203e-05 9.75403760e-05 1.65432518e-05]
 [3.80917707e-05 2.50619618e-05 3.81828281e-05 3.83008413e-05]
 [3.22117366e-05 3.03297400e-05 3.17457963e-05 3.17978008e-05]
 [4.62203607e-05 4.18944416e-05 4.47738361e-05 4.63368506e-05]
 [2.10674727e-05 2.54096795e-05 9.38328816e-06 6.09841137e-06]
 [2.42946905e-05 1.29559389e-05 9.43969721e-05 5.86559591e-05]
 [5.12758349e-05 2.07014286e-05 3.