In [1]:
import gym
import random
import time
import numpy as np
from IPython.display import clear_output

from gym.envs.registration import register


# Register a deterministic (non-slippery) 4x4 FrozenLake variant.
# Re-running this cell registers the same id a second time, which older gym
# releases reject with gym.error.Error. That is the only failure we want to
# suppress, so catch it specifically: a bare `except:` would also hide typos,
# bad kwargs and even KeyboardInterrupt.
try:
    register(
        id='FrozenLakeNoSlip-v0',
        entry_point='gym.envs.toy_text:FrozenLakeEnv',
        kwargs={'map_name' : '4x4', 'is_slippery':False},
        max_episode_steps=100,
        reward_threshold=0.78, # optimum = .8196
    )
except gym.error.Error:
    # The id is already registered (cell was executed before); nothing to do.
    pass

## Intro to Q learning

#### Overview:

Previously, we created a random agent that can act in both continuous and discrete action spaces. Now we'll look at Frozen Lake. Slippery ice means actions don't always have their intended effect (the environment is stochastic). We'll set `is_slippery` to `False` to make this a deterministic environment.

Change iteration loop to run while `done=False`.

Print updates at 0.5 second intervals.

Flush notebook output with `clear_output` command.

https://www.youtube.com/watch?v=wN3rxIKmMgE&list=PLIfPjWrv526bMF8_vx9BqWjec-F-g-lQO&index=3

In [2]:
# Other environment ids that can be swapped in here:
#   "CartPole-v1", "MountainCar-v0", "MountainCarContinuous-v0",
#   "Acrobot-v1", "Pendulum-v0", "FrozenLake-v0"
# The id used below is the one registered above with is_slippery=False.
env_name = "FrozenLakeNoSlip-v0"

env = gym.make(env_name)

# Report both spaces using the same "<label> <space>" layout.
for label, space in (
    ("Observation space:", env.observation_space),
    ("Action space:", env.action_space),
):
    print(label, space)

# Last expression of the cell: displays the concrete class of the action space.
type(env.action_space)

Observation space: Discrete(16)
Action space: Discrete(4)


gym.spaces.discrete.Discrete

In [3]:
class Agent():
    """Baseline agent that picks actions uniformly at random.

    Works with both discrete and continuous action spaces: the constructor
    inspects ``env.action_space`` once and stores whatever ``get_action``
    needs to sample from it.
    """

    def __init__(self, env):
        """Record the sampling parameters of ``env.action_space``.

        Args:
            env: environment object exposing an ``action_space`` attribute.
                A discrete space must provide ``n``; any other space must
                provide ``low``, ``high`` and ``shape``.
        """
        # isinstance (instead of an exact type comparison) also accepts
        # subclasses of Discrete.
        self.is_discrete = isinstance(
            env.action_space, gym.spaces.discrete.Discrete)

        if self.is_discrete:
            self.action_size = env.action_space.n
            print("Action size:", self.action_size)
        else:
            self.action_low = env.action_space.low
            self.action_high = env.action_space.high
            self.action_shape = env.action_space.shape
            print("Action range:", self.action_low, self.action_high)

    def get_action(self, state):
        """Return a uniformly random action; ``state`` is ignored.

        Returns:
            An int in ``[0, action_size)`` for discrete spaces, otherwise an
            array of shape ``action_shape`` drawn uniformly between
            ``action_low`` and ``action_high``.
        """
        if self.is_discrete:
            return random.randrange(self.action_size)
        return np.random.uniform(self.action_low,
                                 self.action_high,
                                 self.action_shape)

To implement Q-learning, create a Q-agent, which is a subclass of the parent agent.

Q table:
- States as rows
- Actions as columns

#### Redefine the `get_action` method to select actions corresponding to the state. `argmax` is used to select the action with the highest q-value.

#### Define a `train` method for updating the q-table at each step.

- It receives the `experience` tuple with `(state, action, next_state, reward, done)`.
- Looks up `q_next` (the row of q-values for `next_state` in the q-table) and replaces it with zeros if the episode is `done`, since a terminal state has no future reward.
- Calculates `q_target` from the Q-learning equation (reward plus _discounted_ future reward)
- Calculates `q_update`, which is the distance from the current q-value to the target.
- Pushes update to the q-table using _learning rate_.
- Constructor defines both the `discount_rate` and the `learning_rate` used above.

#### Balance exploration vs exploitation (aka policy or greedy action): epsilon (eps)
Start with a high epsilon (e.g. 1.0, to explore 100% of the time), then exponentially decay eps each time an episode reaches a terminal step.

In [4]:
# QAgent is a subclass of the Agent class above

class QAgent(Agent):
    """Tabular Q-learning agent with epsilon-greedy exploration.

    Keeps a ``(state_size, action_size)`` table of q-values, acts randomly
    with probability ``eps`` and greedily otherwise, and multiplies ``eps`` by
    ``eps_decay`` every time a terminal transition is trained on.
    """

    def __init__(self, env, discount_rate=0.97, learning_rate=0.01,
                 eps=1.0, eps_decay=0.99):
        """Set up hyper-parameters and build the q-table.

        Args:
            env: environment with discrete action and observation spaces.
            discount_rate: weight applied to the best next-state q-value.
            learning_rate: step size used when moving a q-value to its target.
            eps: initial probability of taking a random action.
            eps_decay: factor applied to ``eps`` after each terminal step.

        Raises:
            ValueError: if either space is not discrete, because the q-table
                needs a finite number of rows and columns.
        """
        super().__init__(env)

        # The parent class only defines action_size for discrete spaces.
        if not self.is_discrete:
            raise ValueError("QAgent requires a discrete action space")
        state_size = getattr(env.observation_space, "n", None)
        if state_size is None:
            raise ValueError("QAgent requires a discrete observation space")

        self.state_size = state_size
        print("State size", self.state_size)

        self.eps = eps
        self.eps_decay = eps_decay
        self.discount_rate = discount_rate
        self.learning_rate = learning_rate
        # build a Q table
        self.build_model()

    def build_model(self):
        """Create the q-table filled with small random values in [0, 1e-4)."""
        self.q_table = 1e-4*np.random.random([self.state_size, self.action_size])

    def get_action(self, state):
        """Epsilon-greedy choice: random with probability ``eps``, else argmax."""
        # Only sample a random action when we are actually going to explore.
        if random.random() < self.eps:
            return super().get_action(state)
        return np.argmax(self.q_table[state])

    def train(self, experience):
        """Apply one Q-learning update.

        Args:
            experience: tuple ``(state, action, next_state, reward, done)``.
        """
        state, action, next_state, reward, done = experience

        # A terminal transition contributes no future value.
        max_q_next = 0.0 if done else np.max(self.q_table[next_state])
        q_target = reward + self.discount_rate * max_q_next

        # Move the stored q-value a fraction of the way towards the target.
        q_update = q_target - self.q_table[state, action]
        self.q_table[state, action] += self.learning_rate * q_update

        # Exponential decay of epsilon when reaching a terminal step
        if done:
            self.eps *= self.eps_decay

agent = QAgent(env)

Action size: 4
State size 16


### Adjusting the simulation loop for training

- `env.step` returns the next state, so store it in a variable called `next_state`
- call train method
- update the state to the next state
- train over a number of episodes
- and track the reward

In [None]:
# Reward accumulated over every episode of this training run.
total_reward = 0

for ep in range(100):
    # Start a fresh episode.
    state, done = env.reset(), False
    while not done:
        action = agent.get_action(state)
        next_state, reward, done, info = env.step(action)

        # Learn from the transition, then advance to the resulting state.
        experience = (state, action, next_state, reward, done)
        agent.train(experience)
        total_reward += reward
        state = next_state

        # Redraw the status display; `state` is already the post-step state.
        print("s:", state, "a:", action)
        print("Episode: {}, Total Reward: {}, eps: {}".format(ep,total_reward,agent.eps))
        env.render()
        print(agent.q_table)
        # Pause so each frame is readable, then wipe it once the next is ready.
        time.sleep(0.5)
        clear_output(wait=True)

s: 14 a: 2
Episode: 13, Total Reward: 0.0, eps: 0.8775210229989678
  (Right)
SFFF
FHFH
FFFH
HF[41mF[0mG
[[1.79339509e-05 1.93369883e-05 2.48723078e-05 1.23596784e-05]
 [6.43760791e-05 9.62612621e-05 2.32803474e-05 4.72823997e-05]
 [3.89138360e-05 9.86737200e-05 4.45590402e-05 2.32369004e-06]
 [7.28790820e-06 4.75590887e-06 2.73619485e-05 5.72925435e-06]
 [4.08016500e-05 5.15348501e-05 8.44746829e-05 6.80068953e-06]
 [8.72718705e-05 2.57313814e-05 9.22717367e-05 3.48182728e-05]
 [6.20448744e-06 1.90639625e-05 3.00377809e-05 8.04442497e-05]
 [1.50490536e-05 2.97014190e-05 8.15887946e-05 5.66548231e-05]
 [4.81074274e-05 5.30682536e-05 2.56897516e-05 7.37199965e-06]
 [5.28499395e-05 8.41539922e-05 3.36712601e-06 2.17479063e-05]
 [8.19415311e-05 3.43426599e-05 1.52617998e-05 4.54044986e-05]
 [4.85457554e-05 4.48512267e-05 9.14212042e-05 2.69982463e-06]
 [7.29367704e-05 9.53455069e-05 5.91185098e-06 6.19458592e-05]
 [2.80261333e-05 1.90403366e-05 2.61266453e-05 3.97752412e-05]
 [2.18621352