In [None]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from geopy import great_circle
from sklearn import *
from sklearn import utils

In [None]:
# Project root; every CSV is read from its `input/` sub-folder. The RESTAURANT_DATA_PATH
# environment variable overrides the original local folder, which remains the default.
path = os.environ.get(
    'RESTAURANT_DATA_PATH',
    'C:/Users/yixin/Desktop/Machine_Learning_Projects/restaurant-visitor-forecasting')
input_dir = path + '/input/'
np.random.seed(2018)  # seed NumPy's global random generator

# Both reservation tables carry the same two timestamp columns
reserve_dates = ['visit_datetime', 'reserve_datetime']

data = {
    'air_reserve': pd.read_csv(input_dir + 'air_reserve.csv', parse_dates=reserve_dates),
    'hpg_reserve': pd.read_csv(input_dir + 'hpg_reserve.csv', parse_dates=reserve_dates),
    'air_visit': pd.read_csv(input_dir + 'air_visit_data.csv', parse_dates=['visit_date']),  # main training set
    'holidays': pd.read_csv(input_dir + 'date_info.csv', parse_dates=['calendar_date']).rename(
        columns={'calendar_date': 'visit_date'}),
    'air_store': pd.read_csv(input_dir + 'air_store_info.csv'),
    'hpg_store': pd.read_csv(input_dir + 'hpg_store_info.csv'),
    'id': pd.read_csv(input_dir + 'store_id_relation.csv'),
    'submission': pd.read_csv(input_dir + 'sample_submission.csv'),  # test set
}

# Inner merge: keep only the hpg reservations whose store appears in the id-relation table
data['hpg_reserve'] = pd.merge(data['hpg_reserve'], data['id'], how='inner', on='hpg_store_id')
# day_of_week is derived from visit_date in the feature-engineering step, so drop the calendar's copy
data['holidays'] = data['holidays'].drop(['day_of_week'], axis=1)

In [None]:
#######################################################################################################
###                                      Feature Engineering                                        ###
#######################################################################################################
# Split the submission id into its two parts: the last 10 characters are the visit date,
# and everything before the separator preceding the date is the air store id.
submission = data['submission']
id_strings = submission['id']
submission['visit_date'] = id_strings.map(lambda s: s[-10:])
submission['visit_date'] = pd.to_datetime(submission['visit_date'])
submission['air_store_id'] = id_strings.map(lambda s: s[:-11])
submission.drop(['id'], axis=1, inplace=True)

# Calendar features (day of week, month, year) for both the training and the test set
for name in ('air_visit', 'submission'):
    frame = data[name]
    dates = frame['visit_date'].dt
    frame['day_of_week'] = dates.dayofweek
    frame['month'] = dates.month
    frame['year'] = dates.year

In [None]:
# Add feature: great-circle distance (km) from each air store to the reference point made of
# the median latitude and median longitude of all air stores    (possible feature)
store_info = data['air_store']
ref = store_info[['latitude', 'longitude']].median().values
store_info['diff_dist'] = [
    great_circle((lat, lon), ref).km
    for lat, lon in zip(store_info['latitude'], store_info['longitude'])
]

In [None]:
# Add feature: absolute offset of each store's latitude / longitude from the
# median latitude / longitude of all air stores    (possible feature)
store_info = data['air_store']
lat, lon = store_info['latitude'], store_info['longitude']
store_info['diff_lat_median'] = (lat.median() - lat).abs()
store_info['diff_long_median'] = (lon.median() - lon).abs()

In [None]:
# Add feature: offset of each store's latitude / longitude from the
# maximum latitude / longitude of all air stores    (possible feature)
store_info = data['air_store']
for target, source in (('diff_lat_max', 'latitude'), ('diff_long_max', 'longitude')):
    coordinate = store_info[source]
    store_info[target] = coordinate.max() - coordinate

In [None]:
# Add feature: sum of latitude and longitude for each air store    (possible feature)
store_info = data['air_store']
store_info['lat_plus_long'] = store_info['latitude'].add(store_info['longitude'])

In [None]:
# Add feature: prefecture, taken as the text before the first space
# of air_area_name    (possible feature)
data['air_store']['prefecture'] = [
    str(area).partition(' ')[0] for area in data['air_store']['air_area_name']
]

In [None]:
# Add feature: number of air restaurants sharing the same air_area_name    (possible feature)
tmp = (
    data['air_store']
    .groupby('air_area_name', as_index=False)['air_store_id']
    .count()
    .rename(columns={'air_store_id': 'rest_per_area'})
)
# Left merge so every store keeps its row and gains the count of its area
data['air_store'] = data['air_store'].merge(tmp, how='left', on='air_area_name')

In [None]:
# Aggregate min, max, mean, median and count of visitors grouped by each store and day of week
unique_stores = data['submission']['air_store_id'].unique()
# One row per (store, day of week) for every store in the submission set; pairs without
# any visit record keep NaN statistics after the left merge below.
stores = pd.concat(
    [pd.DataFrame({'air_store_id': unique_stores, 'day_of_week': [i] * len(unique_stores)})
     for i in range(7)],
    ignore_index=True)

# pandas aggregation name -> name of the resulting feature column
funcs = {
    'min': 'visitors_min',
    'max': 'visitors_max',
    'mean': 'visitors_mean',
    'median': 'visitors_median',
    'count': 'observation_count'
}
# A single groupby pass computes every statistic at once (instead of one groupby and one
# merge per statistic); the columns come out in the order of `funcs`.
tmp = (
    data['air_visit']
    .groupby(['air_store_id', 'day_of_week'])['visitors']
    .agg(list(funcs))
    .rename(columns=funcs)
    .reset_index()
)
stores = stores.merge(tmp, how='left', on=['air_store_id', 'day_of_week'])
# Attach the static store information (genre, area, location features, ...)
stores = stores.merge(data['air_store'], how='left', on='air_store_id')

In [None]:
# Attach the holiday calendar to the training and test sets (left join on visit_date,
# so no visit row is dropped)
holidays = data['holidays']
train = data['air_visit'].merge(holidays, how='left', on='visit_date')
test = data['submission'].merge(holidays, how='left', on='visit_date')

In [None]:
# Attach the per-store / per-day-of-week statistics and store information.
# The inner join keeps only rows whose (store, day of week) pair exists in `stores`.
store_keys = ['air_store_id', 'day_of_week']
train = train.merge(stores, how='inner', on=store_keys)
test = test.merge(stores, how='inner', on=store_keys)

In [None]:
# Shuffle the training rows, then renumber them 0..n-1 so positional and label
# indexing agree in the cells below
train = utils.shuffle(train)
train = train.reset_index(drop=True)

In [None]:
# Define evaluation metric
def RMSE(y_true, y_pred):
    """Return the root mean squared error between `y_true` and `y_pred`.

    The targets in this notebook are log1p-transformed visitor counts, so the
    value is reported as RMSLE by the cross-validation cells.
    """
    return metrics.mean_squared_error(y_true, y_pred) ** 0.5


# Scorer for scikit-learn model selection. It wraps RMSE (not the plain MSE) so the score
# matches its name; greater_is_better=False makes scikit-learn negate the error.
rmse = metrics.make_scorer(RMSE, greater_is_better=False)

In [None]:
#######################################################################################################
###                                         Pre-training                                            ###
#######################################################################################################
# Every column except the identifiers and the target is used as a feature
excluded = {'air_store_id', 'visit_date', 'visitors'}
X_cols = [col for col in train.columns if col not in excluded]
y_col = ['visitors']

X_train = train[X_cols]
X_test = test[X_cols]
# The target is modelled on the log1p scale
y_train = np.log1p(train[y_col])

In [None]:
# Label-encode the year, then one-hot encode the categorical columns.
# `assign` builds a new frame instead of writing into a frame sliced from `train`,
# which avoids pandas' SettingWithCopyWarning.
# NOTE(review): X_test is not encoded in this cell; it needs the same treatment before
# it can be passed to a model fitted on X_train.
le = preprocessing.LabelEncoder()
X_train = X_train.assign(year=le.fit_transform(X_train['year']))

categorical_columns = ['day_of_week', 'month', 'air_genre_name', 'air_area_name', 'prefecture']
X_train = pd.get_dummies(X_train, columns=categorical_columns)

In [None]:
# Draw the pre-training sample: as many rows as X_train, picked with replacement.
# NOTE(review): because rows are drawn with replacement, copies of the same row can land
# in both the training and the validation part of a fold, which may make the validation
# error look better than it is - confirm this is intended.
np.random.seed(2018)
n_rows = X_train.shape[0]
idx = np.random.randint(n_rows, size=n_rows)
X_train_pre, y_train_pre = X_train.loc[idx], y_train.loc[idx]

# 5-fold cross-validation over the sampled rows, on plain NumPy arrays
kf = model_selection.KFold(n_splits=5, shuffle=True)
X = X_train_pre.values
y = y_train_pre.values

In [None]:
# Regressor evaluated by the cross-validation cell; swap in any other estimator to compare
reg = ensemble.RandomForestRegressor(
    n_estimators=200,
    max_features=0.8,
    n_jobs=-1,
)
reg

In [None]:
# Cross-validate `reg` on the pre-training sample and report the per-fold and average error.
# The fold count comes from `kf`, so the averages stay right if n_splits is changed.
n_folds = kf.get_n_splits()
rmsle_train, rmsle_val = 0, 0
for train_idx, val_idx in kf.split(X):
    X_train_cv, y_train_cv = X[train_idx], np.ravel(y[train_idx])
    X_val_cv, y_val_cv = X[val_idx], np.ravel(y[val_idx])

    # Alternative kept for reference: XGBoost's native API (needs `xgb` and `params`)
    ##dtrain = xgb.DMatrix(data=X_train_cv, label=y_train_cv)
    ##dval = xgb.DMatrix(data=X_val_cv, label=y_val_cv)
    ##model = xgb.train(params=params, dtrain=dtrain, num_boost_round=200)
    ##yhat_train = model.predict(dtrain, ntree_limit=200)
    ##yhat_val = model.predict(dval, ntree_limit=200)
    reg.fit(X_train_cv, y_train_cv)
    yhat_train = reg.predict(X_train_cv)
    yhat_val = reg.predict(X_val_cv)

    # Compute each fold error once and reuse it for printing and accumulation
    fold_train = RMSE(y_train_cv, yhat_train)
    fold_val = RMSE(y_val_cv, yhat_val)
    print('*************************************')
    print('RMSLE on training set:', fold_train)
    print('RMSLE on validation set:', fold_val)
    rmsle_train += fold_train
    rmsle_val += fold_val
print('Average RMSLE on training set:', rmsle_train / n_folds)
print('Average RMSLE on validation set:', rmsle_val / n_folds)

In [None]:
# Feature importances of the most recently fitted `reg`, largest first
sorted(zip(X_train.columns, reg.feature_importances_), key=lambda pair: pair[1], reverse=True)

In [None]:
#######################################################################################################
###                                         Pre-training                                            ###
###                          Train a model for each restaurant separately                           ###
#######################################################################################################
# Columns that are constant within a store (or are raw dates) carry no signal for a
# per-store model, so they are excluded. Each name needs its own comma: without one,
# adjacent string literals are silently concatenated into a single, non-matching name.
excluded = [
    'air_genre_name', 'air_area_name', 'visit_date',
    'latitude', 'longitude', 'diff_lat_median', 'diff_lat_max',
    'diff_long_median', 'diff_long_max', 'lat_plus_long',
    'diff_dist', 'rest_per_area', 'prefecture',
]
cols = [col for col in train.columns if col not in excluded]
train = train[cols]

# Separate data according to air_store_id
X_train, X_test, y_train = {}, {}, {}
le = preprocessing.LabelEncoder()
drop_columns = ['air_store_id', 'visitors']
categorical_columns = ['month', 'day_of_week']

for store_id in train['air_store_id'].unique():
    tmp1 = train[train['air_store_id'] == store_id]
    # Restrict the test rows to the same columns; otherwise the concat below would bring
    # the excluded columns back (filled with NaN for the training rows).
    tmp2 = test.loc[test['air_store_id'] == store_id, cols]
    y_train[store_id] = np.log1p(tmp1['visitors'])

    # Encode train and test rows together so both end up with identical dummy columns
    tmp = pd.concat([tmp1, tmp2], ignore_index=True)
    tmp = pd.get_dummies(tmp, columns=categorical_columns)
    tmp['year'] = le.fit_transform(tmp['year'])
    tmp = tmp.drop(drop_columns, axis=1)

    # The first len(tmp1) rows are the training rows, the rest are the test rows
    n_train = tmp1.shape[0]
    X_train[store_id] = tmp.iloc[:n_train]
    X_test[store_id] = tmp.iloc[n_train:]

In [None]:
# Cross-validation splitter and regressor used for the per-restaurant models;
# swap in any other estimator to compare
kf = model_selection.KFold(shuffle=True, n_splits=5)
reg = neighbors.KNeighborsRegressor(weights='distance', n_jobs=-1)
reg

In [None]:
# Cross-validate `reg` separately on every restaurant, print the per-store average error
# and finally the average over all stores and folds (training first, then validation).
# The fold count comes from `kf`, so the averages stay right if n_splits is changed.
n_folds = kf.get_n_splits()
store_ids = train['air_store_id'].unique()
res_train, res_val = 0, 0
for store_id in store_ids:
    X, y = X_train[store_id].values, y_train[store_id].values
    rmsle_train, rmsle_val = 0, 0
    for train_idx, val_idx in kf.split(X):
        X_train_cv, y_train_cv = X[train_idx], np.ravel(y[train_idx])
        X_val_cv, y_val_cv = X[val_idx], np.ravel(y[val_idx])

        reg.fit(X_train_cv, y_train_cv)
        rmsle_train += RMSE(y_train_cv, reg.predict(X_train_cv))
        rmsle_val += RMSE(y_val_cv, reg.predict(X_val_cv))
    print('*************************************')
    print('RMSLE on {} training set: {}'.format(store_id, rmsle_train / n_folds))
    print('RMSLE on {} validation set: {}'.format(store_id, rmsle_val / n_folds))
    res_train += rmsle_train
    res_val += rmsle_val
# Divide by the number of (store, fold) pairs that were actually evaluated
print(res_train / (n_folds * len(store_ids)))
print(res_val / (n_folds * len(store_ids)))

In [None]:
#######################################################################################################
###                                Approximate pre-training outcomes                                ###
###                                            Conclusion                                           ###
#######################################################################################################
# Pre-train on entire pre-training set
RandomForestRegressor: 0.4857
AdaBoostRegressor: 0.5774
XGBRegressor: 0.5084
LinearRegression: 0.5542
KNeighborsRegressor: 0.5064

# Pre-train a model for each restaurant
RandomForestRegressor: 0.5078
XGBRegressor: 0.4972
RidgeRegression: 0.4949
KNeighborsRegressor: 0.5768
Univariate forecasting with Prophet: 0.5420
    
# Conclusion from pre-training
Most time-series forecasting techniques do not work well here,
because each restaurant id has many missing days
(e.g. Fridays/Saturdays jumping straight to Sundays), and some
months, such as May, are missing as well. This problem is
therefore better solved with regression methods, which amounts
to assuming that every data sample is independent.

How to obtain the final result?
From the pre-training outcomes, two schemes can be followed.
1. Train a model for each restaurant, using
RidgeRegression and RandomForestRegressor.
2. Train three models on the entire training set, using
RandomForestRegressor, XGBRegressor and KNeighborsRegressor.

Suppose procedure 1 gives result res1 and procedure 2 gives res2, 
res3, res4. The final result is then an ensemble or stacking of res1~res4.
Now, let's move on with our plan!