In [1]:
import gym
import random
import time
import numpy as np
import pandas as pd
#import tensorflow as tf
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
from IPython.display import clear_output
from collections import deque
from gym.envs.registration import register


# Register a deterministic (non-slippery) 4x4 FrozenLake variant under its own id.
# Running this cell a second time attempts to register the same id again; gym
# releases that reject this raise gym.error.Error, which is the only failure we
# want to suppress. Anything else (bad entry point, bad kwargs) should surface.
try:
    register(
        id='FrozenLakeNoSlip-v0',
        entry_point='gym.envs.toy_text:FrozenLakeEnv',
        kwargs={'map_name' : '4x4', 'is_slippery':False},
        max_episode_steps=100,
        reward_threshold=0.78, # optimum = .8196
    )
except gym.error.Error:
    # The id is already registered (e.g. the cell was re-executed) - safe to ignore.
    pass

Instructions for updating:
non-resource variables are not supported in the long term


In [2]:
# Choose the environment to train on. The alternatives are kept commented out
# for quick switching.
# NOTE(review): the Q agents below read env.observation_space.n, so presumably
# only environments with a discrete observation space work with them - confirm
# before enabling one of the other ids.
#env_name = "CartPole-v1"
#env_name = "MountainCar-v0"
#env_name = "MountainCarContinuous-v0"
#env_name = "Acrobot-v1"
#env_name = "Pendulum-v0"
#env_name = "FrozenLake-v0"
env_name = "FrozenLakeNoSlip-v0"   # we set is_slippery=False in the register up above

# Module-level environment; the agents are constructed from it and the run
# loops below step it directly.
env = gym.make(env_name)

# Display the action-space type; Agent branches on whether it is Discrete.
type(env.action_space)

gym.spaces.discrete.Discrete

In [3]:
class Agent():
    """Baseline agent that ignores the state and samples actions uniformly.

    For a discrete action space the agent stores ``action_size`` (the number of
    actions); otherwise it stores ``action_low``, ``action_high`` and
    ``action_shape`` taken from the action space.
    """

    def __init__(self, env):
        """Inspect ``env.action_space`` and record how to sample from it.

        Args:
            env: environment exposing an ``action_space`` attribute.
        """
        # isinstance (instead of an exact type comparison) also recognises
        # subclasses of Discrete as discrete action spaces.
        self.is_discrete = isinstance(env.action_space,
                                      gym.spaces.discrete.Discrete)

        if self.is_discrete:
            self.action_size = env.action_space.n
            print("Action size:", self.action_size)
        else:
            self.action_low = env.action_space.low
            self.action_high = env.action_space.high
            self.action_shape = env.action_space.shape
            print("Action range:", self.action_low, self.action_high)

    def get_action(self, state):
        """Return a uniformly random action.

        Args:
            state: current observation; unused by this baseline agent.

        Returns:
            An int in ``[0, action_size)`` for discrete spaces, otherwise an
            array of shape ``action_shape`` drawn uniformly between
            ``action_low`` and ``action_high``.
        """
        if self.is_discrete:
            return random.randrange(self.action_size)
        return np.random.uniform(self.action_low,
                                 self.action_high,
                                 self.action_shape)

In [4]:
# QAgent is a subclass of the Agent class above

class QAgent(Agent):
    """Tabular Q-learning agent with epsilon-greedy action selection.

    Holds a ``state_size x action_size`` table of action values. ``eps`` starts
    at 1.0 and is multiplied by 0.99 after every terminal transition seen by
    ``train``.
    """

    def __init__(self, env, discount_rate, learning_rate):
        """Record the hyperparameters and create the Q-table.

        Args:
            env: environment with discrete action and observation spaces.
            discount_rate: factor applied to the best next-state value.
            learning_rate: step size of the table update.
        """
        super().__init__(env)

        # The parent constructor already recorded the action size; the table
        # additionally needs the number of discrete states.
        self.state_size = env.observation_space.n
        print("State size", self.state_size)

        self.eps = 1.0
        self.discount_rate = discount_rate
        self.learning_rate = learning_rate
        self.build_model()

    def build_model(self):
        """Create the Q-table with small seeded random values."""
        # Seeding the global numpy generator makes the initial table repeatable.
        np.random.seed(17)
        table_shape = [self.state_size, self.action_size]
        self.q_table = 1e-4*np.random.random(table_shape)

    def get_action(self, state):
        """Pick a random action with probability ``eps``, else the greedy one."""
        greedy_choice = np.argmax(self.q_table[state])
        # The random candidate is always drawn, even when it ends up unused.
        random_choice = super().get_action(state)
        if random.random() < self.eps:
            return random_choice
        return greedy_choice

    def train(self, experience):
        """Apply one Q-learning update.

        Args:
            experience: tuple ``(state, action, next_state, reward, done)``.
        """
        state, action, next_state, reward, done = experience

        # A terminal transition has no future value to bootstrap from.
        next_values = self.q_table[next_state]
        if done:
            next_values = np.zeros([self.action_size])
        td_target = reward + self.discount_rate * np.max(next_values)

        td_error = td_target - self.q_table[state, action]
        self.q_table[state, action] += self.learning_rate * td_error

        # Exploration decays exponentially, once per finished episode.
        if done:
            self.eps *= 0.99

In [5]:
# QNAgent is a subclass of the Agent class above

class QNAgent(Agent):
    """Q-learning agent whose Q-function is a single dense layer (TF1 graph mode).

    The state index is one-hot encoded and fed through one dense layer named
    ``"q_table"`` with ``action_size`` outputs. Each call to ``train`` performs
    one Adam step on the squared error between the target and the Q-value of
    the action taken. ``eps`` starts at 1.0 and is multiplied by 0.99 after
    every terminal transition.
    """

    def __init__(self, env, discount_rate, learning_rate):
        """Build the network and open a session on it.

        Args:
            env: environment with discrete action and observation spaces.
            discount_rate: factor applied to the best next-state value.
            learning_rate: learning rate passed to the Adam optimizer.
        """
        super().__init__(env)

        # Action size already defined above in parent class

        # Define and printout state size here:
        self.state_size = env.observation_space.n
        print("State size", self.state_size)

        self.eps = 1.0
        self.discount_rate = discount_rate
        self.learning_rate = learning_rate
        # Build the Q-network graph before opening a session on it.
        self.build_model()

        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())

    def build_model(self):
        """Define placeholders, the dense Q layer, the loss and the train op."""
        tf.reset_default_graph()   # do this anytime you name things, like the weights down below
        self.state_in = tf.placeholder(tf.int32, shape=[1])
        self.action_in = tf.placeholder(tf.int32, shape=[1])
        self.target_in = tf.placeholder(tf.float32, shape=[1])

        self.state = tf.one_hot(self.state_in, depth=self.state_size)
        self.action = tf.one_hot(self.action_in, depth=self.action_size)

        # attempt to start with same seeded random weight table as the q-table above
        # np.random.seed(17)
        # rand_table = 1e-4*np.random.random([self.state_size, self.action_size])
        # init = tf.constant_initializer(rand_table)
        # self.q_state = tf.layers.dense(self.state, units=self.action_size, name="q_table", kernel_initializer=init, bias_initializer='zeros')
        # self.q_action = tf.reduce_sum(tf.multiply(self.q_state, self.action), axis=1)

        self.q_state = tf.layers.dense(self.state, units=self.action_size, name="q_table")
        # Multiplying by the one-hot action and summing selects Q(state, action).
        self.q_action = tf.reduce_sum(tf.multiply(self.q_state, self.action), axis=1)

        self.loss = tf.reduce_sum(tf.square(self.target_in - self.q_action))
        self.optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)

    # modify get_action method
    def get_action(self, state):
        """Pick a random action with probability ``eps``, else the greedy one."""
        q_state = self.sess.run(self.q_state, feed_dict={self.state_in: [state]})

        # exploration vs exploitation using probability of epsilon
        action_greedy = np.argmax(q_state)
        action_random = super().get_action(state)
        return action_random if random.random() < self.eps else action_greedy

    def train(self, experience):
        """Run one optimizer step on a single transition.

        Args:
            experience: tuple ``(state, action, next_state, reward, done)``.
        """
        # Wrap every field in a list so it matches the shape-[1] placeholders.
        state, action, next_state, reward, done = ([exp] for exp in experience)

        q_next = self.sess.run(self.q_state, feed_dict={self.state_in: next_state})
        # Zero the next-state values of terminal transitions. The explicit bool
        # cast guarantees mask indexing even if `done` arrives as 0/1 integers,
        # which numpy would otherwise treat as row indices.
        q_next[np.asarray(done, dtype=bool)] = 0.0
        q_target = reward + self.discount_rate * np.max(q_next)

        feed = {self.state_in: state, self.action_in: action, self.target_in: q_target}
        self.sess.run(self.optimizer, feed_dict=feed)

        # Exponential decay of epsilon when reaching a terminal step
        if experience[4]:
            self.eps = self.eps * 0.99

    def __del__(self):
        """Close the session, if one was ever opened."""
        # __init__ can fail before self.sess is assigned; avoid raising a
        # secondary AttributeError from the finalizer in that case.
        sess = getattr(self, "sess", None)
        if sess is not None:
            sess.close()
        


In [6]:
# QNRAgent (the Q-network agent with experience replay) is a subclass of the Agent class above

class QNRAgent(Agent):
    """Dense-layer Q-learning agent that trains on replayed experience.

    Uses a one-hot state fed through one dense layer named ``"q_table"``, with
    batch-sized placeholders. Every ``train`` call stores the new transition in
    a bounded replay buffer and fits on that transition plus a random sample
    drawn (with replacement) from the buffer. ``eps`` starts at 1.0 and is
    multiplied by 0.99 after every terminal transition passed to ``train``.
    """

    def __init__(self, env, discount_rate, learning_rate):
        """Build the network, open a session and create the replay buffer.

        Args:
            env: environment with discrete action and observation spaces.
            discount_rate: factor applied to the best next-state value.
            learning_rate: learning rate passed to the Adam optimizer.
        """
        super().__init__(env)

        # Action size already defined above in parent class

        # Define and printout state size here:
        self.state_size = env.observation_space.n
        print("State size", self.state_size)

        self.eps = 1.0
        self.discount_rate = discount_rate
        self.learning_rate = learning_rate
        # Build the Q-network graph before opening a session on it.
        self.build_model()

        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())
        # Oldest transitions are dropped automatically once 1000 are stored.
        self.replay_buffer = deque(maxlen=1000)

    def build_model(self):
        """Define batch placeholders, the dense Q layer, the loss and the train op."""
        tf.reset_default_graph()   # do this anytime you name things, like the weights down below
        self.state_in = tf.placeholder(tf.int32, shape=[None])
        self.action_in = tf.placeholder(tf.int32, shape=[None])
        self.target_in = tf.placeholder(tf.float32, shape=[None])

        self.state = tf.one_hot(self.state_in, depth=self.state_size)
        self.action = tf.one_hot(self.action_in, depth=self.action_size)

        self.q_state = tf.layers.dense(self.state, units=self.action_size, name="q_table")
        # Multiplying by the one-hot action and summing selects Q(state, action).
        self.q_action = tf.reduce_sum(tf.multiply(self.q_state, self.action), axis=1)

        self.loss = tf.reduce_sum(tf.square(self.target_in - self.q_action))
        self.optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)

    # modify get_action method
    def get_action(self, state):
        """Pick a random action with probability ``eps``, else the greedy one."""
        q_state = self.sess.run(self.q_state, feed_dict={self.state_in: [state]})

        # exploration vs exploitation using probability of epsilon
        action_greedy = np.argmax(q_state)
        action_random = super().get_action(state)
        return action_random if random.random() < self.eps else action_greedy

    def train(self, experience, batch_size=50):
        """Store a transition and run one optimizer step on a replay batch.

        Args:
            experience: tuple ``(state, action, next_state, reward, done)``.
            batch_size: number of buffer samples added to the new transition.
        """
        self.replay_buffer.append(experience)
        samples = random.choices(self.replay_buffer, k=batch_size)
        # Transpose into per-field columns; the newest transition is always the
        # first row, followed by the sampled ones.
        state, action, next_state, reward, done = (list(col) for col in zip(experience, *samples))

        q_next = self.sess.run(self.q_state, feed_dict={self.state_in: next_state})
        # Zero the next-state values of terminal transitions. The explicit bool
        # cast guarantees mask indexing even if `done` arrives as 0/1 integers,
        # which numpy would otherwise treat as row indices.
        q_next[np.asarray(done, dtype=bool)] = 0.0
        q_target = reward + self.discount_rate * np.max(q_next, axis=1)

        feed = {self.state_in: state, self.action_in: action, self.target_in: q_target}
        self.sess.run(self.optimizer, feed_dict=feed)

        # Exponential decay of epsilon when reaching a terminal step
        if experience[4]:
            self.eps = self.eps * 0.99

    def __del__(self):
        """Close the session, if one was ever opened."""
        # __init__ can fail before self.sess is assigned; avoid raising a
        # secondary AttributeError from the finalizer in that case.
        sess = getattr(self, "sess", None)
        if sess is not None:
            sess.close()

In [7]:
def run(agent, num_episodes=200, delay=0.5):
    """Train a table-based agent on the module-level ``env`` and show progress.

    After every step the function prints the state/action, the running reward
    total and epsilon, renders the environment, prints ``agent.q_table`` and
    then clears the cell output.

    Args:
        agent: object providing ``get_action(state)``, ``train(experience)``,
            ``eps`` and ``q_table``.
        num_episodes: number of episodes to play (default 200, the value that
            used to be hard-coded).
        delay: seconds to pause after each step so the rendering is readable;
            pass 0 to train without pausing.

    Returns:
        A list with one entry per episode. The reward total is never reset
        between episodes, so each entry is the cumulative reward collected up
        to the end of that episode.
    """
    total_reward = 0

    total_rewards = []
    for ep in range(num_episodes):
        state = env.reset()
        done = False
        while not done:
            action = agent.get_action(state)
            next_state, reward, done, _ = env.step(action)
            agent.train((state,action,next_state,reward,done))
            state = next_state
            total_reward += reward

            print("s:", state, "a:", action)
            print("Episode: {}, Total Reward: {}, eps: {}".format(ep,total_reward,agent.eps))
            env.render()
            print(agent.q_table)
            if delay > 0:
                time.sleep(delay)
            clear_output(wait=True)

        total_rewards.append(total_reward)
    return total_rewards

In [8]:
def runNN(agent, num_episodes=200, delay=0.5):
    """Train a network-based agent on the module-level ``env`` and show progress.

    After every step the function prints the state/action, the running reward
    total and epsilon, renders the environment, prints the current value of
    the ``q_table/kernel`` variable and then clears the cell output.

    Args:
        agent: object providing ``get_action(state)``, ``train(experience)``,
            ``eps`` and a TensorFlow session ``sess`` whose graph contains the
            ``q_table`` variable scope.
        num_episodes: number of episodes to play (default 200, the value that
            used to be hard-coded).
        delay: seconds to pause after each step so the rendering is readable;
            pass 0 to train without pausing.

    Returns:
        A list with one entry per episode. The reward total is never reset
        between episodes, so each entry is the cumulative reward collected up
        to the end of that episode.
    """
    # Resolve the kernel variable once, inside the agent's own graph. Each
    # agent's build_model() resets the default graph, so looking the variable
    # up in the default graph could return a variable of a different agent.
    with agent.sess.graph.as_default():
        with tf.variable_scope("q_table", reuse=True):
            kernel = tf.get_variable("kernel")

    total_reward = 0

    total_rewards = []
    for ep in range(num_episodes):
        state = env.reset()
        done = False
        while not done:
            action = agent.get_action(state)
            next_state, reward, done, _ = env.step(action)
            agent.train((state,action,next_state,reward,done))
            state = next_state
            total_reward += reward

            print("s:", state, "a:", action)
            print("Episode: {}, Total Reward: {}, eps: {}".format(ep,total_reward,agent.eps))
            env.render()
            weights = agent.sess.run(kernel)
            print(weights)
            if delay > 0:
                time.sleep(delay)
            clear_output(wait=True)

        total_rewards.append(total_reward)
    return total_rewards

In [None]:
# Hyperparameters shared by all three agents constructed in this notebook.
discount_rate = 0.97
learning_rate = 0.001

# Drop any previous agent bound to this name, then build a fresh tabular
# Q-learning agent on the module-level environment.
table_agent = None
table_agent = QAgent(env, discount_rate, learning_rate)

# Train with run(); the returned list is plotted later under the 'Q-table' label.
run1a = run(table_agent)

s: 8 a: 0
Episode: 136, Total Reward: 2.0, eps: 0.2549097606963092
  (Left)
SFFF
FHFH
[41mF[0mFFH
HFFG
[[3.04884802e-05 5.36922371e-05 2.13308488e-05 9.23500313e-06]
 [7.81369136e-05 6.51101194e-05 6.42504938e-05 5.77085970e-05]
 [4.48285369e-06 3.57908977e-05 9.44332800e-05 6.26121542e-06]
 [8.64357972e-05 8.70296717e-05 5.59710144e-06 6.53403451e-05]
 [5.52631202e-05 5.91180068e-05 4.71112777e-05 2.89465983e-05]
 [2.97725718e-05 5.61508905e-05 3.96047436e-05 7.88700710e-05]
 [4.17647835e-05 1.45477390e-05 1.50755788e-05 5.52413502e-06]
 [7.18037194e-05 2.92317355e-05 1.98773872e-05 8.31363922e-05]
 [5.62311313e-05 7.83220329e-06 5.58620155e-05 1.77735150e-05]
 [6.76249247e-05 1.20807440e-05 4.47898495e-05 8.58283917e-05]
 [7.97326650e-05 6.79787283e-06 9.55993416e-05 6.58445514e-05]
 [7.18776083e-05 7.43576765e-05 8.87736352e-05 1.33945668e-05]
 [7.76974581e-05 8.37991512e-05 8.29304764e-05 2.91569965e-06]
 [3.90374491e-05 4.37251992e-05 5.30745228e-05 4.93958109e-05]
 [1.30429282e

In [None]:
# run1b = run(table_agent)

In [None]:
# Drop the reference to any earlier agent first; QNAgent.__del__ closes its
# TensorFlow session (presumably this runs once the old object is collected).
NN_agent = None
NN_agent = QNAgent(env, discount_rate, learning_rate)

# Train with runNN(), which also prints the network's kernel weights each step.
# The returned list is plotted later under the 'Q NN' label.
run2a = runNN(NN_agent)

In [None]:
# run2b = runNN(NN_agent)

In [None]:
# Drop the reference to any earlier agent first; QNRAgent.__del__ closes its
# TensorFlow session (presumably this runs once the old object is collected).
Replay_agent = None
Replay_agent = QNRAgent(env, discount_rate, learning_rate)

# Train with runNN(), which also prints the network's kernel weights each step.
# The returned list is plotted later under the 'Q NN Replay' label.
run3a = runNN(Replay_agent)

In [None]:
# run3b = runNN(Replay_agent)

In [None]:
from matplotlib import pyplot as plt

In [None]:
# One x value per recorded episode, so the axis follows the actual run length
# instead of a hard-coded 200.
episodes = list(range(len(run1a)))

plt.plot(episodes, run1a, 'r', label='Q-table')   # red solid line
#plt.plot(episodes, run1b, 'r.', label='Q-table') # red point markers
plt.plot(episodes, run2a, 'b', label='Q NN') # blue solid line
#plt.plot(episodes, run2b, 'b:', label='Q NN') # blue dotted line
plt.plot(episodes, run3a, 'm', label='Q NN Replay') # magenta solid line
#plt.plot(episodes, run3b, 'm:', label='Q NN Replay') # magenta dotted line


plt.legend(loc=9) # loc=9 means "upper center"
plt.xlabel("Episode")
# run()/runNN() never reset their reward total, so the curves are cumulative.
plt.ylabel("Cumulative reward")
# Take the title from the configured value so it cannot drift from the setting
# that was actually used for training.
plt.title("Learning rate = {}".format(learning_rate))
plt.show()