In [2]:
#https://hackernoon.com/reinforcement-learning-part-2-the-q-learning-algorithm
import random

def train(n_episodes: int):
    """
    Pseudo-code of a Reinforcement Learning agent training loop
    """

    # python object that wraps all environment logic. Typically you will
    # be using OpenAI gym here.
    env = load_env()

    # python object that wraps all agent policy (or value function)
    # parameters, and action generation methods.
    agent = get_rl_agent()

    for episode in range(0, n_episodes):

        # random start of the environmnet
        state = env.reset()

        # epsilon is parameter that controls the exploitation-exploration trade-off.
        # it is good practice to set a decaying value for epsilon
        epsilon = get_epsilon(episode)

        done = False
        while not done:

            if random.uniform(0, 1) < epsilon:
                # Explore action space
                action = env.action_space.sample()
            else:
                # Exploit learned values (or policy)
                action = agent.get_best_action(state)

            # environment transitions to next state and maybe rewards the agent.
            next_state, reward, done, info = env.step(action)

            # adjust agent parameters. We will see how later in the course.
            agent.update_parameters(state, action, reward, next_state)

            state = next_state

In [3]:
# env.P is double dictionary.
# - The 1st key represents the state, from 0 to 499
# - The 2nd key represens the action taken by the agent,
#   from 0 to 5

# example
state = 123
action = 0  # move south

# env.P[state][action][0] is a list with 4 elements
# (probability, next_state, reward, done)
# 
#  - probability
#    It is always 1 in this environment, which means
#    there are no external/random factors that determine the
#    next_state
#    apart from the agent's action a.
#
#  - next_state: 223 in this case
# 
#  - reward: -1 in this case
#
#  - done: boolean (True/False) indicates wheter the
#    episode has ended (i.e. the driver has dropped the
#    passenger at the correct destination)
print('env.P[state][action][0]: ', env.P[state][action][0])

NameError: name 'env' is not defined

In [4]:
# Need to call reset() at least once before render() will work
env.reset()

env.s = 123
env.render(mode='human')

NameError: name 'env' is not defined

In [5]:
env.s = 223
env.render(mode='human')

NameError: name 'env' is not defined

3. Random agent baseline 🤖🍷
👉🏽 notebooks/01_random_agent_baseline.ipynb

Before you start implementing any complex algorithm, you should always build a baseline model.

This advice applies not only to Reinforcement Learning problems but Machine Learning problems in general.

It is very tempting to jump straight into the complex/fancy algorithms, but unless you are really experienced, you will fail terribly.

Let’s use a random agent 🤖🍷 as a baseline model.

In [None]:
class RandomAgent:
    """
    This taxi driver selects actions randomly.
    You better not get into this taxi!
    """
    def __init__(self, env):
        self.env = env

    def get_action(self, state) -> int:
        """
        We have `state` as an input to keep
        a consistent API for all our agents, but it
        is not used.
        
        i.e. The agent does not consider the state of
        the environment when deciding what to do next.
        This is why we call it "random".
        """
        return self.env.action_space.sample()

agent = RandomAgent(env)

In [5]:
import torch

def get_open_hours_mask(DAY_SIZE):
    
    open_hour_mask = torch.tensor([ True, True, True, True, True, True, True, True, False, False,
        False, False, True, True, False, False, False, True, True, True,
        True, True, True, True])
    
    return open_hour_mask
    
    
    
    
DAY_SIZE = 24        ### per hour
CALENDAR_SIZE = 4    ### four days
NB_AMO = 2           ### one AMO

# DAYS (D) x OPEN HOURS (OH) x NB AMO (NA)
calendar_model = torch.ones(CALENDAR_SIZE,int(DAY_SIZE), NB_AMO).int()

# calendar = torch.ones(DAY_SIZE).int() # D

calendar = get_open_hours_mask(DAY_SIZE).int()

calendar_mask = torch.zeros(DAY_SIZE).bool() # D 
calendar_mask[0] = True

calendar_mask_model = calendar.unsqueeze(1).expand(calendar_model.size()) # D x OH x NA

print(calendar.size(), calendar_mask.size())
# print(calendar[0,:,1], calendar_mask[2,:,1])

print(calendar)
print(calendar_mask)
print(calendar_mask_model)


# masked_cal = calendar & calendar_mask # D 

# print(masked_cal)



torch.Size([24]) torch.Size([24])
tensor([1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1],
       dtype=torch.int32)
tensor([ True, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False])
tensor([[[1, 1],
         [1, 1],
         [1, 1],
         [1, 1],
         [1, 1],
         [1, 1],
         [1, 1],
         [1, 1],
         [0, 0],
         [0, 0],
         [0, 0],
         [0, 0],
         [1, 1],
         [1, 1],
         [0, 0],
         [0, 0],
         [0, 0],
         [1, 1],
         [1, 1],
         [1, 1],
         [1, 1],
         [1, 1],
         [1, 1],
         [1, 1]],

        [[1, 1],
         [1, 1],
         [1, 1],
         [1, 1],
         [1, 1],
         [1, 1],
         [1, 1],
         [1, 1],
         [0, 0],
         [0, 0],
         [0, 0],
         [0, 0],
         [1, 1],
         [1, 1],
         [0, 0],
 

In [None]:
class RandomAgent:
    """
    Blocks of rendez-vous are put on the agenda randomly
    dimension: DAY, HOUR, AMO
    """
    def __init__(self, env):
        self.env = env
    
    def get_action(self, state) -> int:
        """
        
        """

In [7]:
# set initial state of the environment
env.reset()
state = 123
env.s = state

epochs = 0
penalties = 0  # wrong pick up or dropp off
reward = 0

# store frames to latter plot them
frames = []

done = False

while not done:
    
    action = agent.get_action(state)
    
    state, reward, done, info = env.step(action)

    if reward == -10:
        penalties += 1
    
    frames.append({
        'frame': env.render(mode='ansi'),
        'state': state,
        'action': action,
        'reward': reward
        }
    )

    epochs += 1
    
    
print("Timesteps taken: {}".format(epochs))
print("Penalties incurred: {}".format(penalties))

NameError: name 'env' is not defined

In [8]:
from tqdm import tqdm

n_episodes = 100

# For plotting metrics
timesteps_per_episode = []
penalties_per_episode = []

for i in tqdm(range(0, n_episodes)):
    
    # reset environment to a random state
    state = env.reset()

    epochs, penalties, reward, = 0, 0, 0
    done = False
    
    while not done:
        
        action = agent.get_action(state)       
        next_state, reward, done, info = env.step(action) 
               
        if reward == -10:
            penalties += 1

        state = next_state
        epochs += 1
    
    timesteps_per_episode.append(epochs)
    penalties_per_episode.append(penalties)

  0%|                                                   | 0/100 [00:00<?, ?it/s]


NameError: name 'env' is not defined

In [9]:
print(f'Avg steps to complete ride: {np.array(timesteps_per_episode).mean()}')
print(f'Avg penalties to complete ride: {np.array(penalties_per_episode).mean()}')

NameError: name 'np' is not defined

# Learning Agent

In [10]:
import numpy as np

class QAgent:

    def __init__(self, env, alpha, gamma):
        self.env = env

        # table with q-values: n_states * n_actions
        self.q_table = np.zeros([env.observation_space.n,
                                 env.action_space.n])

        # hyper-parameters
        self.alpha = alpha  # learning rate
        self.gamma = gamma  # discount factor

    def get_action(self, state):
        """"""
        return np.argmax(self.q_table[state])

    def update_parameters(self, state, action, reward, next_state):
        """"""
        # Q-learning formula
        old_value = self.q_table[state, action]
        next_max = np.max(self.q_table[next_state])
        new_value = \
            old_value + \
            self.alpha * (reward + self.gamma * next_max - old_value)

        # update the q_table
        self.q_table[state, action] = new_value

In [11]:
import random
from tqdm import tqdm

# exploration vs exploitation prob
epsilon = 0.1

n_episodes = 10000

# For plotting metrics
timesteps_per_episode = []
penalties_per_episode = []


for i in tqdm(range(0, n_episodes)):
    
    state = env.reset()

    epochs, penalties, reward, = 0, 0, 0
    done = False
    
    while not done:
        
        if random.uniform(0, 1) < epsilon:
            # Explore action space
            action = env.action_space.sample()
        else:
            # Exploit learned values
            action = agent.get_action(state)
        
        next_state, reward, done, info = env.step(action) 
        
        agent.update_parameters(state, action, reward, next_state)
        
        if reward == -10:
            penalties += 1

        state = next_state
        epochs += 1
        
    timesteps_per_episode.append(epochs)
    penalties_per_episode.append(penalties)

  0%|                                                 | 0/10000 [00:00<?, ?it/s]


NameError: name 'env' is not defined

In [12]:
import pandas as pd
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize = (12, 4))
ax.set_title("Timesteps to complete ride")    
pd.Series(timesteps_per_episode).plot(kind='line')
plt.show()

fig, ax = plt.subplots(figsize = (12, 4))
ax.set_title("Penalties per ride")    
pd.Series(penalties_per_episode).plot(kind='line')
plt.show()

ModuleNotFoundError: No module named 'pandas'