# Solution for OpenAI Gym - discreete environments

Test cases: 'taxi-v2' and 'CliffWalking-v0'

*Author: Nick Ben*

## Imports

In [0]:
import numpy as np
from collections import defaultdict, deque
import gym
import numpy as np
import sys
import math
import matplotlib.pyplot as plt
import random

# Set plotting options
%matplotlib inline
plt.style.use('ggplot')
np.set_printoptions(precision=3, linewidth=120)

## Helpers

In [0]:
def action_egreedy(Q_state, eps, nA):
  if random.random() < eps:
    action = random.choice(np.arange(nA))
  else:
    action = np.random.choice([action_ for action_, value_ in enumerate(Q_state) if value_ == np.max(Q_state)])
  return action

def run(agent, env, num_episodes=20000, mode='train'):
    """Run agent in given reinforcement learning environment and return scores."""
    window = 100 
    avg_rewards = deque(maxlen=num_episodes)
    best_avg_reward = -math.inf
    samp_rewards = deque(maxlen=window)
  
    for i_episode in range(1, num_episodes+1):
        # Initialize episode
        initial_state = env.reset()
        action = agent.reset_episode(initial_state, i_episode)
        samp_reward = 0
        done = False

        while not done:
            next_state, reward, done, _ = env.step(action)
            action = agent.step(next_state, reward, done, mode)
            samp_reward += reward
        samp_rewards.append(samp_reward)
        
        # Print episode stats
        if i_episode > 100:
          avg_reward = np.mean(samp_rewards)
          avg_rewards.append(avg_reward)
          if avg_reward > best_avg_reward:
            best_avg_reward = avg_reward
            print("\rEpisode {}/{} || Best average reward {} ".format(i_episode, num_episodes, best_avg_reward), end="")
            sys.stdout.flush()
        if i_episode == num_episodes: print('\n')

## Agent

In [0]:
class Agent:
    """Agent that can act on an environment"""

    def __init__(self, env, learning = "sarsa", double = False, train = True, alpha=0.1, gamma=1.0,
                 epsilon_start=1.0, seed=505):

        self.env = env
        self.state_size = env.observation_space.n
        self.action_size = env.action_space.n
        self.seed = np.random.seed(seed)
        self.learning = learning
        self.double = double
        self.alpha = alpha  
        self.gamma = gamma  
        self.epsilon_start = epsilon_start  
        
        # Create Q-table
        if self.double:
          self.q_table_1 = np.zeros((self.state_size, self.action_size))
          self.q_table_2 = np.zeros((self.state_size, self.action_size))
        else:
          self.q_table = np.zeros((self.state_size, self.action_size))

    def reset_episode(self, initial_state, step):

        # Gradually decrease exploration rate
        self.epsilon = self.epsilon_start / (0.5*step + 1)

        # Decide initial action
        self.last_state = initial_state
        
        if self.double:
          self.last_action = np.argmax(np.mean([self.q_table_1[self.last_state], self.q_table_2[self.last_state]], axis=0))
        else:
          self.last_action = np.argmax(self.q_table[self.last_state])
        
        return self.last_action
    
    def step(self, new_state, reward=None, done=None, mode='train'):
        """Pick next action and update internal Q table (when mode != 'test')."""

        if mode == 'test':
            # Test mode: take greedy action
            action = np.argmax(self.q_table[state])
        
        else:
            # Train mode: take a step and return action
            
            # QL step update 
            if self.learning == "q_learning":
              self.q_table[self.last_state, self.last_action] += self.alpha * \
                (reward + self.gamma * max(self.q_table[new_state]) - self.q_table[self.last_state, self.last_action])
              new_action = action_egreedy(self.q_table[self.last_state], self.epsilon, self.action_size)
                          
            # SARSA step update 
            elif self.learning == "sarsa":
              new_action = action_egreedy(self.q_table[new_state], self.epsilon, self.action_size)
              self.q_table[self.last_state, self.last_action] += self.alpha * \
                (reward + self.gamma * self.q_table[new_state, new_action] - self.q_table[self.last_state, self.last_action])
            
            # Expected SARSA step update 
            elif self.learning == "expected_sarsa":
              self.q_table[self.last_state, self.last_action] += self.alpha * \
                (reward + self.gamma * np.mean(self.q_table[new_state]) - self.q_table[self.last_state, self.last_action])
              new_action = action_egreedy(self.q_table[new_state], self.epsilon, self.action_size)
            
            # Double Sarsa step update 
            elif self.learning == "double_sarsa":
              new_action = action_egreedy(np.mean([self.q_table_1[new_state],self.q_table_2[new_state]], axis=0), self.epsilon, self.action_size)
              if random.random() < 0.5:
                self.q_table_1[self.last_state, self.last_action] += self.alpha * (reward + self.gamma * self.q_table_1[new_state, new_action] - self.q_table_1[self.last_state, self.last_action])
              else:
                self.q_table_2[self.last_state, self.last_action] += self.alpha * (reward + self.gamma * self.q_table_2[new_state, new_action] - self.q_table_2[self.last_state, self.last_action])
            
            # Double Expected Sarsa step update 
            elif self.learning == "double_expected_sarsa":
              if random.random() < 0.5:
                self.q_table_1[self.last_state, self.last_action] += self.alpha * (reward + self.gamma * np.mean(self.q_table_2[new_state]) - self.q_table_1[self.last_state, self.last_action])
              else:
                self.q_table_2[self.last_state, self.last_action] += self.alpha * (reward + self.gamma * np.mean(self.q_table_1[new_state]) - self.q_table_2[self.last_state, self.last_action])
              new_action = action_egreedy(np.mean([self.q_table_1[new_state],self.q_table_2[new_state]], axis=0), self.epsilon, self.action_size)
            
            # Double QL step update 
            elif self.learning == "double_q_learning":
              if random.random() < 0.5:
                self.q_table_1[self.last_state, self.last_action] += self.alpha * (reward + self.gamma * self.q_table_2[new_state, np.argmax(self.q_table_1[new_state])] - self.q_table_1[self.last_state, self.last_action])
              else:
                self.q_table_2[self.last_state, self.last_action] += self.alpha * (reward + self.gamma * self.q_table_1[new_state, np.argmax(self.q_table_2[new_state])] - self.q_table_2[self.last_state, self.last_action])
              new_action = action_egreedy(np.mean([self.q_table_1[self.last_state],self.q_table_2[self.last_state]], axis=0), self.epsilon, self.action_size)
            
            else:
              raise ValueError('Learning algorithm not supported')
            
            #rollout state and action
            self.last_state = new_state
            self.last_action = new_action
            return new_action
            

## Runs

### Env: 'taxi-v2'

#### Double Q-Learning

In [4]:
ENV_NAME = "Taxi-v2"
EPISODES_TRAIN = 200000
ALPHA = 0.1
GAMMA = 1.0
EPS_START = 1.0
SEED = 505
LEARN_ALGO = "double_q_learning"

env = gym.make(ENV_NAME)
agent = Agent(env, 
              learning = LEARN_ALGO,
              double = True,
              train = True,
              alpha = ALPHA,
              gamma = GAMMA,
              epsilon_start = EPS_START,
              seed = SEED)

run(agent, env, num_episodes = EPISODES_TRAIN)

Episode 106987/200000 || Best average reward -10.04 



#### Double Sarsa

In [5]:
ENV_NAME = "Taxi-v2"
EPISODES_TRAIN = 200000
ALPHA = 0.1
GAMMA = 1.0
EPS_START = 1.0
SEED = 505
LEARN_ALGO = "double_sarsa"

env = gym.make(ENV_NAME)
agent = Agent(env, 
              learning = LEARN_ALGO,
              double = True,
              train = True,
              alpha = ALPHA,
              gamma = GAMMA,
              epsilon_start = EPS_START,
              seed = SEED)

run(agent, env, num_episodes = EPISODES_TRAIN)

Episode 83595/200000 || Best average reward 9.36 



#### Sarsa

In [6]:
ENV_NAME = "Taxi-v2"
EPISODES_TRAIN = 200000
ALPHA = 0.1
GAMMA = 1.0
EPS_START = 1.0
SEED = 505
LEARN_ALGO = "sarsa"

env = gym.make(ENV_NAME)
agent = Agent(env, 
              learning = LEARN_ALGO,
              double = False,
              train = True,
              alpha = ALPHA,
              gamma = GAMMA,
              epsilon_start = EPS_START,
              seed = SEED)

run(agent, env, num_episodes = EPISODES_TRAIN)

Episode 106833/200000 || Best average reward 9.32 



#### Double Expected Sarsa

In [7]:
ENV_NAME = "Taxi-v2"
EPISODES_TRAIN = 200000
ALPHA = 0.1
GAMMA = 1.0
EPS_START = 1.0
SEED = 505
LEARN_ALGO = "double_expected_sarsa"

env = gym.make(ENV_NAME)
agent = Agent(env, 
              learning = LEARN_ALGO,
              double = True,
              train = True,
              alpha = ALPHA,
              gamma = GAMMA,
              epsilon_start = EPS_START,
              seed = SEED)

run(agent, env, num_episodes = EPISODES_TRAIN)

Episode 178868/200000 || Best average reward 9.37 



# Expected Sarsa : *WINNER*

In [8]:
ENV_NAME = "Taxi-v2"
EPISODES_TRAIN = 200000
ALPHA = 0.1
GAMMA = 1.0
EPS_START = 1.0
SEED = 505
LEARN_ALGO = "expected_sarsa"

env = gym.make(ENV_NAME)
agent = Agent(env, 
              learning = LEARN_ALGO,
              double = False,
              train = True,
              alpha = ALPHA,
              gamma = GAMMA,
              epsilon_start = EPS_START,
              seed = SEED)

run(agent, env, num_episodes = EPISODES_TRAIN)

Episode 153537/200000 || Best average reward 9.73 



#### Q-learning

In [9]:
ENV_NAME = "Taxi-v2"
EPISODES_TRAIN = 200000
ALPHA = 0.5
GAMMA = 1.0
EPS_START = 1.0
SEED = 505
LEARN_ALGO = "q_learning"

env = gym.make(ENV_NAME)
agent = Agent(env, 
              learning = LEARN_ALGO,
              double = False,
              train = True,
              alpha = ALPHA,
              gamma = GAMMA,
              epsilon_start = EPS_START,
              seed = SEED)

run(agent, env, num_episodes = EPISODES_TRAIN)

Episode 188756/200000 || Best average reward -7.96 



### Env: 'CliffWalking-v0'

In [0]:
ENV_NAME = 'CliffWalking-v0'
EPISODES_TRAIN = 2000
ALPHA = 0.1
GAMMA = 1.0
EPS_START = 1.0
SEED = 505

env = gym.make(ENV_NAME)
agent_ql = Agent(env, 
              learning = "q_learning",
              double = False,
              train = True,
              alpha = ALPHA,
              gamma = GAMMA,
              epsilon_start = EPS_START,
              seed = SEED)

agent_sarsa = Agent(env, 
              learning = "sarsa",
              double = False,
              train = True,
              alpha = ALPHA,
              gamma = GAMMA,
              epsilon_start = EPS_START,
              seed = SEED)

agent_expsarsa = Agent(env, 
              learning = "expected_sarsa",
              double = False,
              train = True,
              alpha = ALPHA,
              gamma = GAMMA,
              epsilon_start = EPS_START,
              seed = SEED)


In [11]:
run(agent_sarsa, env, num_episodes = EPISODES_TRAIN)

Episode 1131/2000 || Best average reward -15.0 



In [12]:
run(agent_expsarsa, env, num_episodes = EPISODES_TRAIN)

Episode 1297/2000 || Best average reward -15.0 



In [13]:
run(agent_ql, env, num_episodes = EPISODES_TRAIN)

Episode 181/2000 || Best average reward -826.72 



## FrozenLake-v0

In [14]:
ENV_NAME = "FrozenLake-v0"
EPISODES_TRAIN = 200000
ALPHA = 0.1
GAMMA = 1.0
EPS_START = 1.0
SEED = 505
LEARN_ALGO = "sarsa"

env = gym.make(ENV_NAME)
agent = Agent(env, 
              learning = LEARN_ALGO,
              double = False,
              train = True,
              alpha = ALPHA,
              gamma = GAMMA,
              epsilon_start = EPS_START,
              seed = SEED)

run(agent, env, num_episodes = EPISODES_TRAIN)

Episode 107665/200000 || Best average reward 0.38 

