# 데이터 불러오기

In [43]:
import pandas as pd
import numpy as np

orders = pd.read_csv('./data/orders.csv')
couriers = pd.read_csv('./data/couriers.csv')

In [44]:
orders.head()

Unnamed: 0,order,ox,oy,placement_time,restaurant,ready_time,rx,ry
0,o1,9131,7497,743,r1,753,8708,5633
1,o8,7940,6877,612,r1,622,8708,5633
2,o21,7377,5287,336,r1,346,8708,5633
3,o26,6992,7441,623,r1,633,8708,5633
4,o38,10981,6805,594,r1,614,8708,5633


In [45]:
couriers.head()

Unnamed: 0,courier,x,y,on_time,off_time,delivery,trustiness
0,c1,11491,2806,0,90,0.21,0.36
1,c2,7950,223,0,90,0.68,0.28
2,c3,2818,10568,30,210,1.0,0.55
3,c4,12413,7222,90,270,0.5,0.14
4,c5,3572,3264,90,270,0.36,0.81


# 주문 하나 -- 근무 라이더

In [46]:
# 주문 건 무작위 추출
np.random.seed(1)
order = orders.sample(n=1)
order

Unnamed: 0,order,ox,oy,placement_time,restaurant,ready_time,rx,ry
307,o448,11408,6115,268,r40,276,10268,7110


In [47]:
# 근무중인 라이더
# rider = couriers[(couriers['on_time'] <= order[6]) & (couriers['off_time'] >= order[7])]
rider = couriers[(couriers['on_time'] <= 268) & (couriers['off_time'] >= 276)]
rider

Unnamed: 0,courier,x,y,on_time,off_time,delivery,trustiness
21,c22,9556,7777,90,293,0.71,0.31
25,c26,4779,143,210,330,0.18,0.9
26,c27,11457,2757,210,330,0.04,0.48


# Q-learning
- goal : 배달 처리량 증가 + 고객 만족

- action = 배달원 지정
- state = 배달원들과 음식점의 위치, 근무 시간
- reward = 배달 건 수 std 작게, trustiness 크게

https://www.learndatasci.com/tutorials/reinforcement-q-learning-scratch-python-openai-gym/

https://blog.floydhub.com/an-introduction-to-q-learning-reinforcement-learning/

Breaking it down into steps, we get

- Initialize the Q-table by all zeros.
- Start exploring actions: For each state, select any one among all possible actions for the current state (S).
- Travel to the next state (S') as a result of that action (a).
- For all possible actions from the state (S') select the one with the highest Q-value.
- Update Q-table values using the equation.
- Set the next state as the current state.
- If goal state is reached, then end and repeat the process.

In [1]:
# 나의 state는 배달원(x,y), 음식점(x,y), 근무시간(on,off) --> 6가지
# 나의 action은 배달원 지정. 동,서,남,북,음식픽업 --> 5가지

# Create Env

In [40]:
import gym
from gym import spaces

In [56]:
riders = len(rider)
on_time = rider['on_time']
off_time = rider['off_time']
delivery_std = rider['delivery']
trustiness = rider['trustiness']
riders_x = rider['x']
riders_y = rider['y']

restaurant_x = order['rx']
restaurant_y = order['ry']
placement_time = order['placement_time']
ready_time = order['ready_time']

In [48]:
class Environment(Env):
    
    def __init__(self):
        self.height = 6*len(rider)
        self.width = 5
        self.action_space = spaces.Discrete(5)
        self.observation_space = spaces.Tuple((
                spaces.Discrete(self.height),
                spaces.Discrete(self.width)
                ))
        self.moves = {
                0: (-1, 0),   # up
                1: (0, 1),   # right
                2: (1, 0),  # down
                3: (0, -1), # left
                4: (1, 1),  # pick up 
                }

        # begin in start state
        self.reset()

    def step(self, action):
        x, y = self.moves[action]
        self.S = self.S[0] + x, self.S[1] + y

        self.S = max(0, self.S[0]), max(0, self.S[1])
        self.S = (min(self.S[0], self.height - 1),
                  min(self.S[1], self.width - 1))

        if self.S == (self.height - 1, self.width - 1):
            return self.S, -1, True, {}
        elif self.S[1] != 0 and self.S[0] == self.height - 1:
            # the cliff
            return self.reset(), -100, False, {}
        return self.S, -1, False, {}

    def reset(self):
        self.S = (3, 0)
        return self.S

In [49]:
env = Environment()

In [50]:
import warnings ; warnings.filterwarnings('ignore')

import itertools
import gym, gym_walk, gym_aima
import numpy as np
from tabulate import tabulate
from pprint import pprint
from tqdm import tqdm_notebook as tqdm

from itertools import cycle, count

import random
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
SEEDS = (12, 34, 56, 78, 90)

%matplotlib inline

In [51]:
plt.style.use('fivethirtyeight')
params = {
    'figure.figsize': (15, 8),
    'font.size': 24,
    'legend.fontsize': 20,
    'axes.titlesize': 28,
    'axes.labelsize': 24,
    'xtick.labelsize': 20,
    'ytick.labelsize': 20
}
pylab.rcParams.update(params)
np.set_printoptions(suppress=True)

# Q-learning

In [52]:
def q_learning(env, 
               gamma=1.0,
               init_alpha=0.5,
               min_alpha=0.01,
               alpha_decay_ratio=0.5,
               init_epsilon=1.0,
               min_epsilon=0.1,
               epsilon_decay_ratio=0.9,
               n_episodes=3000):
    nS, nA = env.observation_space.n, env.action_space.n
    pi_track = []
    Q = np.zeros((nS, nA), dtype=np.float64)
    Q_track = np.zeros((n_episodes, nS, nA), dtype=np.float64)
    select_action = lambda state, Q, epsilon: np.argmax(Q[state]) \
        if np.random.random() > epsilon \
        else np.random.randint(len(Q[state]))
    alphas = decay_schedule(init_alpha, 
                           min_alpha, 
                           alpha_decay_ratio, 
                           n_episodes)
    epsilons = decay_schedule(init_epsilon, 
                              min_epsilon, 
                              epsilon_decay_ratio, 
                              n_episodes)
    for e in tqdm(range(n_episodes), leave=False):
        state, done = env.reset(), False
        while not done:
            action = select_action(state, Q, epsilons[e])
            next_state, reward, done, _ = env.step(action)
            td_target = reward + gamma * Q[next_state].max() * (not done)
            td_error = td_target - Q[state][action]
            Q[state][action] = Q[state][action] + alphas[e] * td_error
            state = next_state

        Q_track[e] = Q
        pi_track.append(np.argmax(Q, axis=1))

    V = np.max(Q, axis=1)        
    pi = lambda s: {s:a for s, a in enumerate(np.argmax(Q, axis=1))}[s]
    return Q, V, pi, Q_track, pi_track

In [54]:
Q_qls, V_qls, Q_track_qls = [], [], []
for seed in tqdm(SEEDS, desc='All seeds', leave=True):
    random.seed(seed); np.random.seed(seed) ; env.seed(seed)
    Q_ql, V_ql, pi_ql, Q_track_ql, pi_track_ql = q_learning(env, gamma=gamma, n_episodes=n_episodes)
    Q_qls.append(Q_ql) ; V_qls.append(V_ql) ; Q_track_qls.append(Q_track_ql)
Q_ql = np.mean(Q_qls, axis=0)
V_ql = np.mean(V_qls, axis=0)
Q_track_ql = np.mean(Q_track_qls, axis=0)
del Q_qls ; del V_qls ; del Q_track_qls

All seeds:   0%|          | 0/5 [00:00<?, ?it/s]

AttributeError: 'Tuple' object has no attribute 'n'

In [55]:
print_state_value_function(V_ql, P, n_cols=n_cols, 
                           prec=svf_prec, title='State-value function found by Q-learning:')
print_state_value_function(optimal_V, P, n_cols=n_cols, 
                           prec=svf_prec, title='Optimal state-value function:')
print_state_value_function(V_ql - optimal_V, P, n_cols=n_cols, 
                           prec=err_prec, title='State-value function errors:')
print('State-value function RMSE: {}'.format(rmse(V_ql, optimal_V)))
print()
print_action_value_function(Q_ql, 
                            optimal_Q, 
                            action_symbols=action_symbols, 
                            prec=avf_prec, 
                            title='Q-learning action-value function:')
print('Action-value function RMSE: {}'.format(rmse(Q_ql, optimal_Q)))
print()
print_policy(pi_ql, P, action_symbols=action_symbols, n_cols=n_cols)
success_rate_ql, mean_return_ql, mean_regret_ql = get_policy_metrics(
    env, gamma=gamma, pi=pi_ql, goal_state=goal_state, optimal_Q=optimal_Q)
print('Reaches goal {:.2f}%. Obtains an average return of {:.4f}. Regret of {:.4f}'.format(
    success_rate_ql, mean_return_ql, mean_regret_ql))

State-value function found by Q-learning:
| 00 0.5013 | 01 0.4159 | 02 0.3548 | 03 0.2535 | 04 0.5166 |           | 06 0.2857 |           | 08 0.5494 |
| 09 0.5991 | 10 0.5577 |           |           | 13 0.7016 | 14 0.8329 |           Optimal state-value function:
| 00  0.542 | 01 0.4988 | 02 0.4707 | 03 0.4569 | 04 0.5585 |           | 06 0.3583 |           | 08 0.5918 |
| 09 0.6431 | 10 0.6152 |           |           | 13 0.7417 | 14 0.8628 |           State-value function errors:
| 00  -0.04 | 01  -0.08 | 02  -0.12 | 03   -0.2 | 04  -0.04 |           | 06  -0.07 |           | 08  -0.04 |
| 09  -0.04 | 10  -0.06 |           |           | 13  -0.04 | 14  -0.03 |           State-value function RMSE: 0.0707

Q-learning action-value function:
╒════╤═══════╤═══════╤═══════╤═══════╤═══════╤═══════╤═══════╤═══════╤═══════╤═══════╤════════╤════════╕
│    │       │       │       │       │       │     s │     < │     > │   * < │   * > │   er < │   er > │
╞════╪═══════╪═══════╪═══════╪═══════╪

KeyError: 3