# Test LightGBM with Bayesian Optimisation

# Feature Engineering

In [1]:
from sklearn.model_selection import KFold, GridSearchCV, train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error,r2_score
import time

import lightgbm as lgb
from lightgbm import LGBMRegressor
from bayes_opt import BayesianOptimization

import warnings
warnings.filterwarnings("ignore")

In [2]:
### Thanks to Shinji Suzuki

# OS
import glob, re
from datetime import datetime
import pickle

# data science tool
import numpy as np
import pandas as pd
import datetime as dt


# machine learning
from sklearn import *
from xgboost import XGBRegressor
import xgboost as xgb
import lightgbm as lgb

# Load the raw competition tables into one dict keyed by a short table name.
# The calendar table's `calendar_date` column is renamed to `visit_date` up front;
# the original author notes it is the same date key the air / hpg tables use
# under a different name.
data = {}
data['tra'] = pd.read_csv('../../../mltestdata/05_recruit/air_visit_data.csv')
data['as'] = pd.read_csv('../../../mltestdata/05_recruit/air_store_info.csv')
data['hs'] = pd.read_csv('../../../mltestdata/05_recruit/hpg_store_info.csv')
data['ar'] = pd.read_csv('../../../mltestdata/05_recruit/air_reserve.csv')
data['hr'] = pd.read_csv('../../../mltestdata/05_recruit/hpg_reserve.csv')
data['id'] = pd.read_csv('../../../mltestdata/05_recruit/store_id_relation.csv')
data['tes'] = pd.read_csv('../../../mltestdata/05_recruit/sample_submission.csv')
data['hol'] = pd.read_csv('../../../mltestdata/05_recruit/date_info.csv')
data['hol'] = data['hol'].rename(columns={'calendar_date':'visit_date'})

# Before the tables can be merged, attach the air store id to every hpg
# reservation through the id-relation table.
data['hr'] = pd.merge(data['hr'], data['id'], how = 'inner', on = ['hpg_store_id'])

# Collapse each reservation table to one row per (store, visit date):
#   rs1 / rv1 = sum  of lead time in days / reserved visitors
#   rs2 / rv2 = mean of lead time in days / reserved visitors
for df in ['ar', 'hr']:
    visit_ts = pd.to_datetime(data[df]['visit_datetime'])
    reserve_ts = pd.to_datetime(data[df]['reserve_datetime'])
    # Whole-day lead time between reservation and visit. Both timestamps are
    # truncated to midnight first, so this equals the date - date difference,
    # but it is computed column-wise instead of with a per-row Python lambda.
    data[df]['reserve_datetime_diff'] = (visit_ts.dt.normalize() - reserve_ts.dt.normalize()).dt.days
    data[df]['visit_datetime'] = visit_ts.dt.date
    data[df]['reserve_datetime'] = reserve_ts.dt.date
    # One grouping, two aggregations (sum and mean) over the same columns.
    grouped = data[df].groupby(['air_store_id', 'visit_datetime'], as_index =False)[['reserve_datetime_diff', 'reserve_visitors']]
    tmp1 = grouped.sum().rename(columns = {'visit_datetime':'visit_date', 'reserve_datetime_diff':'rs1', 'reserve_visitors':'rv1'})
    tmp2 = grouped.mean().rename(columns = {'visit_datetime':'visit_date', 'reserve_datetime_diff':'rs2', 'reserve_visitors':'rv2'})
    data[df] = pd.merge(tmp1, tmp2, how = 'inner', on = ['air_store_id', 'visit_date'])

# Calendar features for the training visits; `visit_date` ends up as datetime.date.
train_stamps = pd.to_datetime(data['tra']['visit_date'])
data['tra']['dow'] = train_stamps.dt.dayofweek
data['tra']['year'] = train_stamps.dt.year
data['tra']['month'] = train_stamps.dt.month
data['tra']['visit_date'] = train_stamps.dt.date

# Submission ids are split on '_': the first two pieces form the store id and
# the third piece is the visit date.
submission_ids = data['tes']['id']
data['tes']['visit_date'] = submission_ids.map(lambda x: str(x).split('_')[2])
data['tes']['air_store_id'] = submission_ids.map(lambda x: '_'.join(x.split('_')[:2]))
test_stamps = pd.to_datetime(data['tes']['visit_date'])
data['tes']['dow'] = test_stamps.dt.dayofweek
data['tes']['year'] = test_stamps.dt.year
data['tes']['month'] = test_stamps.dt.month
data['tes']['visit_date'] = test_stamps.dt.date

unique_stores = data['tes']['air_store_id'].unique()

# One row per (test store, day of week 0-6).
dow_frames = [pd.DataFrame({'air_store_id':unique_stores, 'dow':[dow]*len(unique_stores)}) for dow in range(7)]
stores = pd.concat(dow_frames, axis =0, ignore_index = True).reset_index(drop = True)

# Add the month as well as the day of week: one row per (store, month 1-12),
# then cross it with the day-of-week rows of the same store.
month_frames = [pd.DataFrame({'air_store_id':unique_stores, 'month':[month]*len(unique_stores)}) for month in range(1,13)]
stores_m = pd.concat(month_frames, axis =0, ignore_index = True).reset_index(drop = True)
stores = pd.merge(stores_m, stores, on='air_store_id', how='left')

# Visitor statistics of the training data per store x day-of-week, followed by
# the same statistics per store x month. Each entry is
# (grouping column, aggregation, output column); the order fixes the column order.
visitor_stat_specs = (
    ('dow', 'min', 'min_visitors'),
    ('dow', 'mean', 'mean_visitors'),
    ('dow', 'median', 'median_visitors'),
    ('dow', 'max', 'max_visitors'),
    ('dow', 'count', 'count_visitors'),
    ('month', 'min', 'm_min_visitors'),
    ('month', 'mean', 'm_mean_visitors'),
    ('month', 'median', 'm_median_visitors'),
    ('month', 'max', 'm_max_visitors'),
    ('month', 'count', 'm_count_visitors'),
)
for group_key, stat_name, out_col in visitor_stat_specs:
    tmp = data['tra'].groupby(['air_store_id', group_key], as_index = False)['visitors'].agg(stat_name)
    tmp = tmp.rename(columns = {'visitors': out_col})
    stores = pd.merge(stores, tmp, how ='left', on = ['air_store_id', group_key])

# Attach the static store information (genre, area, coordinates).
stores = pd.merge(stores, data['as'], how= "left", on = ['air_store_id'])

# Replace the separators with spaces so genre / area names split on single spaces.
stores['air_genre_name'] = stores['air_genre_name'].map(lambda name: str(name).replace('/', ' '))
stores['air_area_name'] = stores['air_area_name'].map(lambda name: str(name).replace('-', ' '))

lbl = preprocessing.LabelEncoder()
# Label-encode the i-th space-separated word of each name (a single space when
# the name has fewer words), for the first ten word positions.
for i in range(10):
    for name_col in ('air_genre_name', 'air_area_name'):
        words = stores[name_col].map(lambda name: str(name).split(' '))
        ith_word = words.map(lambda parts: str(parts[i]) if len(parts)>i else ' ')
        stores[name_col + str(i)] = lbl.fit_transform(ith_word)
# Finally replace the full names themselves with integer codes.
stores['air_genre_name'] = lbl.fit_transform(stores['air_genre_name'])
stores['air_area_name'] = lbl.fit_transform(stores['air_area_name'])

# Add a weekend flag (day_of_week_1) and a "next day is a day off" flag (holi_2).
weekend_lookup = {'Saturday': 1, 'Sunday': 1, 'Monday': 0, 'Tuesday': 0,
                  'Wednesday': 0, 'Thursday': 0, 'Friday': 0}
data['hol']['day_of_week_1'] = data['hol']['day_of_week'].map(weekend_lookup).astype('int')

# 1 when the day is a holiday and/or a weekend day, else 0 ...
day_off = data['hol'][['holiday_flg', 'day_of_week_1']].sum(axis = 1)
day_off = day_off.apply(lambda x: 0 if x < 1 else 1)
# ... then shifted up one row so each row carries the following row's value;
# the last row has no successor and is filled with 1.
data['hol']['holi_2'] = day_off.shift(-1).fillna(1).astype('int')

# `lbl` is the LabelEncoder instance created earlier in this cell.
data['hol']['day_of_week'] = lbl.fit_transform(data['hol']['day_of_week'])
data['hol']['visit_date'] = pd.to_datetime(data['hol']['visit_date']).dt.date

# Join the calendar flags by date, then the store statistics by
# store x day-of-week x month.
train = pd.merge(data['tra'], data['hol'], how ='left', on = ['visit_date'])
train = pd.merge(train, stores, how ='left', on = ['air_store_id', 'dow','month'])
test = pd.merge(data['tes'], data['hol'], how ='left', on = ['visit_date'])
test = pd.merge(test, stores, how ='left', on = ['air_store_id', 'dow','month'])

# Add min / mean / median / max / count of visitors per store x holi_2 flag.
# The statistics are computed on train and joined onto both train and test.
holiday_stat_specs = (
    ('min', 'h_min_visitors'),
    ('mean', 'h_mean_visitors'),
    ('median', 'h_median_visitors'),
    ('max', 'h_max_visitors'),
    ('count', 'h_count_visitors'),
)
for stat_name, out_col in holiday_stat_specs:
    tmp = train.groupby(['air_store_id','holi_2'], as_index = False)['visitors'].agg(stat_name)
    tmp = tmp.rename(columns = {'visitors': out_col})
    train = pd.merge(train, tmp, how ='left', on = ['air_store_id','holi_2'])
    test = pd.merge(test, tmp, how ='left', on = ['air_store_id','holi_2'])

# Attach the per-day reservation aggregates from air ('ar') and then hpg ('hr').
# Both tables carry rs1/rv1/rs2/rv2, so the second merge suffixes the clashing
# columns with _x (air) and _y (hpg).
for df in ['ar','hr']:
    train = pd.merge(train, data[df], how='left', on=['air_store_id','visit_date'])
    test = pd.merge(test, data[df], how='left', on=['air_store_id','visit_date'])

# "<air_store_id>_<visit_date>" id for the training rows, built column-wise
# rather than with a per-row lambda.
train['id'] = train['air_store_id'].astype(str) + '_' + train['visit_date'].astype(str)

# Reference maxima shared by train and test, so that a given store gets the same
# var_max_lat / var_max_long value in both frames.
max_latitude = pd.concat([train['latitude'], test['latitude']]).max()
max_longitude = pd.concat([train['longitude'], test['longitude']]).max()

# One encoder fitted on the store ids of both frames, so that a store maps to
# the same integer code in train and in test.
lbl = preprocessing.LabelEncoder()
lbl.fit(pd.concat([train['air_store_id'], test['air_store_id']]))

for frame in (train, test):
    # Combined air + hpg reservation features (NaN when either source is missing).
    frame['total_reserve_sum'] = frame['rv1_x'] + frame['rv1_y']
    frame['total_reserve_mean'] = (frame['rv2_x'] + frame['rv2_y'])/2
    frame['total_reserve_dt_diff_mean'] = (frame['rs2_x'] + frame['rs2_y'])/2

    # Visit date as an integer of the form YYYYMMDD.
    frame['date_int'] = frame['visit_date'].apply(lambda x: x.strftime('%Y%m%d')).astype(int)

    # Simple coordinate-derived features.
    frame['var_max_lat'] = max_latitude - frame['latitude']
    frame['var_max_long'] = max_longitude - frame['longitude']
    frame['lon_plus_lat'] = frame['longitude'] + frame['latitude']

    frame['air_store_id2'] = lbl.transform(frame['air_store_id'])

# Feature columns = everything except identifiers and the target.
col = [c for c in train if c not in ['id', 'air_store_id', 'visit_date', 'visitors']]
train = train.fillna(-1)
test = test.fillna(-1)

ntrain = train.shape[0]
ntest = test.shape[0]

# Stack train on top of test; the first `ntrain` rows are the training rows.
all_data = pd.concat([train, test]) 

#指数移動平均の追加。これはなくても良いかも
#https://www.kaggle.com/c/recruit-restaurant-visitor-forecasting/discussion/46179#266344
#def calc_shifted_ewm(series, alpha, adjust=True):
#    return series.shift().ewm(alpha=alpha, adjust=adjust).mean()

#train['ewm'] = train.groupby(['air_store_id', 'dow']).apply(lambda g: calc_shifted_ewm(g['visitors'], 0.1)).sort_index(level=['air_store_id']).values

# From here on: add the weather data.
df_air_store_weather_station = pd.read_csv('../../../mltestdata/05_recruit/air_store_info_with_nearest_active_station.csv')

# Attach each store's weather station and its distance columns.
cols = ['air_store_id', 'station_id', 'station_latitude', 'station_longitude', 'station_vincenty', 'station_great_circle']
all_data = pd.merge(all_data, df_air_store_weather_station[cols], on='air_store_id', how='left')

combine = all_data
filenames = []
# Read one CSV per distinct station and concatenate them once at the end;
# concatenating inside the loop would re-copy the accumulated frame each time.
weather_frames = []
for station_id in combine['station_id'].unique():
    fn = f"../../../mltestdata/05_recruit/1-1-16_5-31-17_Weather/{station_id}.csv"
    if fn in filenames:
        continue
    station_df = pd.read_csv(fn)
    station_df['station_id'] = station_id
    weather_frames.append(station_df)
    filenames.append(fn)

df_weather = pd.concat(weather_frames) if weather_frames else None

# Fill missing readings with the column mean (the original author also tried
# median and forward-fill and saw no real difference).
# numeric_only=True restricts the means to numeric columns; the frame also
# holds string columns (date, station id), over which a mean cannot be taken.
df_weather = df_weather.fillna(df_weather.mean(numeric_only=True))

df_weather = df_weather.rename(columns={'calendar_date': 'visit_date'})

# Same key type as the visit tables: datetime.date.
df_weather['visit_date'] = pd.to_datetime(df_weather['visit_date']).dt.date

# Log-transform the precipitation (the original author gives no specific reason).
df_weather['precipitation'] = np.log1p(df_weather['precipitation'])

# Join only the columns likely to be used (the other weather columns were not tried).
cols = ['station_id', 
    'visit_date', 
    'precipitation', 
    'hours_sunlight',
    'avg_temperature',
    'high_temperature',
    'low_temperature']

combine = pd.merge(combine, df_weather[cols], on=['station_id', 'visit_date'], how='left')

# Convert precipitation into categories.
def simplify_pre(df):
    """Replace the `precipitation` column of *df* with a 3-level category.

    Missing values are treated as 0. The buckets are:
        '1' : (-1, 0.01]
        '2' : (0.01, 2]
        '3' : above 2 (open-ended, so very large values are no longer turned
              into NaN by falling outside the last bin)

    The frame is modified in place and also returned.
    """
    filled = df['precipitation'].fillna(0)
    bins = (-1, 0.01, 2, float('inf'))
    group_names = ['1', '2', '3']
    df['precipitation'] = pd.cut(filled, bins, labels=group_names)
    return df

# Bucket the precipitation column (see simplify_pre) and continue with that frame.
combine = simplify_pre(combine) 
all_data = combine 

################################## modified on 1-Feb
# Drop the columns that look unnecessary.
#drop_col =['station_id', 'station_latitude','station_longitude','station_vincenty', 'station_great_circle','hours_sunlight','high_temperature','low_temperature']
drop_col =['hours_sunlight','high_temperature','low_temperature']
all_data = all_data.drop(drop_col, axis = 1)

# Split back into train / test by position. The slices are copied so the column
# assignments below write to independent frames rather than to slices of all_data.
train = all_data[:ntrain].copy()
test = all_data[ntrain:].copy()

################################## modified on 1-Feb
#Encode station_id
# The encoder is fitted once on every station id, so a station receives the
# same integer code in train and in test.
lbl = preprocessing.LabelEncoder()
lbl.fit(all_data['station_id'])
train['station_id'] = lbl.transform(train['station_id'])
test['station_id'] = lbl.transform(test['station_id'])

# Add min / mean / median / max / count of visitors per store x precipitation
# category. The statistics are computed on train and joined onto train and test.
rain_stat_specs = (
    ('min', 'p_min_visitors'),
    ('mean', 'p_mean_visitors'),
    ('median', 'p_median_visitors'),
    ('max', 'p_max_visitors'),
    ('count', 'p_count_visitors'),
)
for stat_name, out_col in rain_stat_specs:
    tmp = train.groupby(['air_store_id','precipitation'], as_index = False)['visitors'].agg(stat_name)
    tmp = tmp.rename(columns = {'visitors': out_col})
    train = pd.merge(train, tmp, how ='left', on = ['air_store_id','precipitation'])
    test = pd.merge(test, tmp, how ='left', on = ['air_store_id','precipitation'])

# Label-encode the count features. The original author tried this on a whim and
# reports that the result improved.
# NOTE(review): each column is fitted separately on train and on test, so the
# same count can receive different codes in the two frames - confirm that this
# is intended.
lbl = preprocessing.LabelEncoder()
for count_col in ('count_visitors', 'm_count_visitors', 'h_count_visitors', 'p_count_visitors'):
    train[count_col] = lbl.fit_transform(train[count_col])
    test[count_col] = lbl.fit_transform(test[count_col])

# GW flag
# gw_flg marks the listed Golden Week dates; post_gw_flg marks the listed
# follow-up date. Both default to 0.
combine = [train, test]
gw_list = ['2016-04-29','2016-04-30','2016-05-01','2016-05-02','2016-05-03','2016-05-04','2016-05-05','2017-04-29','2017-04-30','2017-05-01','2017-05-02','2017-05-03','2017-05-04','2017-05-05']
post_gw_list=['2016-05-06']

from datetime import date

# [year, month, day] integer triples parsed from the date strings above.
update_gw_list = [[int(part) for part in gw_date.split("-")] for gw_date in gw_list]
update_post_gw_list = [[int(part) for part in gw_date.split("-")] for gw_date in post_gw_list]

for dataset in combine:
    dataset['gw_flg'] = 0
    dataset['post_gw_flg'] = 0

# Rows are matched by comparing visit_date with a datetime.date built from each triple.
for dataset in combine:
    for gw_y, gw_m, gw_d in update_gw_list:
        dataset.loc[dataset.visit_date == date(gw_y, gw_m, gw_d), 'gw_flg'] = 1

for dataset in combine:
    for gw_y, gw_m, gw_d in update_post_gw_list:
        dataset.loc[dataset.visit_date == date(gw_y, gw_m, gw_d), 'post_gw_flg'] = 1

In [3]:
# Work on copies so the engineered train / test frames stay untouched.
tra = train.copy()
tes = test.copy()

import datetime as dt
from datetime import datetime

#tra = pd.read_csv('/Users/suzukishinji/kaggle/recluit/air_visit_data.csv')
#tes = pd.read_csv('/Users/suzukishinji/kaggle/recluit/sample_submission-3.csv')

# Author's note: splitting by row count was tried first, but that mixes "stores
# with data from the start of 2016 but few business days" with "stores whose data
# starts later but have many business days". The split used instead is: stores
# that were operating during 2016/1-2016/6 versus stores that were not.

# Recompute the calendar features from visit_date; it ends up as datetime.date.
tra_stamps = pd.to_datetime(tra['visit_date'])
tra['dow'] = tra_stamps.dt.dayofweek
tra['year'] = tra_stamps.dt.year
tra['month'] = tra_stamps.dt.month
tra['visit_date'] = tra_stamps.dt.date

# Author's note: 316 stores were operating during 2016/1-2016/6.
year_2016 = tra[tra['year'] == 2016]
month_1, month_2, month_3, month_4, month_5, month_6 = (
    year_2016[year_2016['month'] == m] for m in range(1, 7)
)
# Rows of January-June 2016, stacked month by month.
tra_store_316_6 = pd.concat([month_1, month_2, month_3, month_4, month_5, month_6])

# Store ids seen in that period (one entry per row, duplicates included).
# NOTE: this name shadows the builtin `id`; it is kept because later code reads it.
id = tra_store_316_6['air_store_id'].tolist()

# Full history of the stores that were operating during 2016/1-2016/6 ...
# ... and everything else (513 stores according to the author); `~` is the NOT.
seen_early = tra['air_store_id'].isin(id)
tra_store_316_all = tra[seen_early]
tra_store_513 = tra[~seen_early]

# Rebuild the store id and calendar features of the test rows from the submission id.
tes_ids = tes['id']
tes['visit_date'] = tes_ids.map(lambda x: str(x).split('_')[2])
tes['air_store_id'] = tes_ids.map(lambda x: '_'.join(x.split('_')[:2]))
tes_stamps = pd.to_datetime(tes['visit_date'])
tes['dow'] = tes_stamps.dt.dayofweek
tes['year'] = tes_stamps.dt.year
tes['month'] = tes_stamps.dt.month
tes['visit_date'] = tes_stamps.dt.date

# Test rows of the stores that were operating during 2016/1-2016/6, and the
# remaining test rows (513 stores according to the author).
tes_seen_early = tes['air_store_id'].isin(id)
tes_store_316 = tes[tes_seen_early]
tes_store_513 = tes[~tes_seen_early]

# Three training subsets are used: (1) the 2016/1-2016/6 rows, (2) the full
# history of the stores operating in 2016/1-2016/6, (3) the 513 stores that
# start later. The test data is not split by period, only into the 316 early
# stores and the 513 later ones.
# Every subset is written to CSV with all of its columns (earlier variants that
# wrote only a few columns were commented out by the author).
for subset, csv_name in (
    (tra_store_316_6, 'tra_store_316_6.csv'),
    (tra_store_316_all, 'tra_store_316_all.csv'),
    (tra_store_513, 'tra_store_513.csv'),
    (tes_store_316, 'tes_store_316.csv'),
    (tes_store_513, 'tes_store_513.csv'),
):
    subset.to_csv(csv_name, index = False)

# Read the subsets back from disk.
tra_316_6 = pd.read_csv('./tra_store_316_6.csv') # 316 for period 1 (first 6 months)
tra_316_all = pd.read_csv('./tra_store_316_all.csv') # 316 for period 1 & 2
tra_513 = pd.read_csv('./tra_store_513.csv') # 513 for period 2 (expect first 6 months)
tes_316 = pd.read_csv('./tes_store_316.csv') # 316 only
tes_513 = pd.read_csv('./tes_store_513.csv') # 513 only

# Check what was produced. Subsets (1) and (3) are concatenated into one training set.
tra_513_316 = pd.concat([tra_316_6, tra_513], ignore_index=True)

shape_report = [
    "Total train: "+str(tra.shape),
    "316 for period 1: "+str(tra_316_6.shape),
    "316 for period 1 & 2: "+str(tra_316_all.shape)+" <=== To be used",
    "513 for period 2 "+str(tra_513.shape),
    "316 for period 1 + 513 for period 2 "+str(tra_513_316.shape)+" <=== To be used"+"\n",
    "Total test: "+str(tes.shape),
    "test with 316"+str(tes_316.shape),
    "test with 513"+str(tes_513.shape),
]
for report_line in shape_report:
    print(report_line)

# Identifier / target columns that are not model features.
drop_cols=['visitors','air_store_id','visit_date','id']

# Model 1: the 316 early stores, trained on their full history.
train_1 = tra_316_all.copy()
y_1 = train_1['visitors']
train_1 = train_1.drop(drop_cols, axis=1)
test_1 = tes_316.copy().drop(drop_cols, axis=1)

# Model 2: period-1 rows of the early stores plus the 513 later stores.
train_2 = tra_513_316.copy()
y_2 = train_2['visitors']
train_2 = train_2.drop(drop_cols, axis=1)
test_2 = tes_513.copy().drop(drop_cols, axis=1)

Total train: (252108, 80)
316 for period 1: (47699, 80)
316 for period 1 & 2: (126700, 80) <=== To be used
513 for period 2 (125408, 80)
316 for period 1 + 513 for period 2 (173107, 80) <=== To be used

Total test: (32019, 80)
test with 316(12207, 80)
test with 513(19812, 80)


In [40]:
# Test subsets without the `visitors` column.
_tes_316 = tes_316.drop("visitors", axis=1)
_tes_513 = tes_513.drop("visitors", axis=1)

# LightGBM

__Define utility function__

In [11]:
# Define an evaluation function

def rmsle(preds, true):
    """Return the root mean squared logarithmic error as a Python float.

    Both inputs are passed through log1p; the result is the square root of the
    mean squared error between the two transformed arrays.
    """
    log_true = np.log1p(true)
    log_preds = np.log1p(preds)
    squared_log_error = mean_squared_error(log_true, log_preds)
    return float(np.sqrt(squared_log_error))

In [6]:
# Define an evaluation metric: a scikit-learn scorer wrapping rmsle.
# RMSLE is an error, so lower is better. greater_is_better=False makes the
# scorer report the negated value, so model-selection tools that maximise the
# score select the lowest error.
# make_scorer is imported from the public sklearn.metrics namespace.
from sklearn.metrics import make_scorer
RMSLE = make_scorer(rmsle, greater_is_better=False)

In [7]:
# Build a side-by-side table of predictions and true values.
def compare_result(preds, true):
    """Return a frame comparing *preds* with *true*.

    *true* must expose an ``index`` (its values become ``test_id``). The result
    has a fresh 0..n-1 index and the columns ``test_id``, ``real_cost``,
    ``pred_cost`` and ``error_percent_(%)`` - the absolute error as a percentage
    of the real value.
    """
    table = pd.DataFrame({"test_id": true.index,
                          "real_cost": true,
                          "pred_cost": preds})
    column_order = ["test_id", "real_cost", "pred_cost"]
    table = table[column_order].reset_index(drop=True)

    abs_error = (table["real_cost"] - table["pred_cost"]).abs()
    table["error_percent_(%)"] = abs_error / table["real_cost"] * 100

    return table

__Cross validation with LightGBM__

In [8]:
def cross_validate_lgb(params, x_train, y_train, x_test, 
                        kf, 
                        cat_features=None,
                        verbose=True, verbose_eval=100, nseeds=1, df_input=True,
                        early_stopping=100, num_boost_round=8000, scoreonly=False):
    """Run K-fold cross validation of a LightGBM model trained on log1p(target).

    Parameters
    ----------
    params : dict
        LightGBM parameters. The dict is copied; the caller's object is not modified.
    x_train, y_train : training features and target on the original scale.
    x_test : features to predict with every fold model.
    kf : splitter exposing ``split(X, y)`` and ``n_splits``.
    cat_features : optional list of categorical feature names / indices.
    verbose : print per-fold and overall scores plus the elapsed time.
    verbose_eval, early_stopping, num_boost_round : forwarded to ``lgb.train``.
    nseeds : number of seeds (0..nseeds-1) whose predictions are averaged per fold.
    df_input : True when ``x_train`` is a DataFrame (rows selected by position).
    scoreonly : return only the CV score instead of the full tuple.

    Returns
    -------
    ``cv_score`` when *scoreonly* is true, otherwise
    ``(cv_score, train_pred, test_pred)`` where ``train_pred`` holds the
    out-of-fold predictions and ``test_pred`` the test predictions averaged over
    all folds and seeds, both on the original scale.
    """
    start_time = time.time()

    # Copy so the per-seed keys set below never leak into the caller's dict.
    params = dict(params)
    cat_features = [] if cat_features is None else list(cat_features)
    # Always defined (the flag used to be set only when the list was empty).
    use_cat = len(cat_features) > 0

    # Positional view of the target: the fold indices from kf.split are
    # positions, not index labels.
    y_values = np.asarray(y_train)

    train_pred = np.zeros((x_train.shape[0]))
    test_pred = np.zeros((x_test.shape[0]))

    # self-defined eval metric
    # f(preds: array, train_data: Dataset) -> name: string, value: array, is_higher_better: bool
    def feval_rmsle(preds, train_data):
        # Labels and raw predictions are in log1p space; convert back first.
        preds = np.expm1(preds)
        true = np.expm1(train_data.get_label())

        return 'rmsle', rmsle(preds, true), False

    # use the k-fold object to enumerate indexes for each training and validation fold
    for i, (train_index, val_index) in enumerate(kf.split(x_train, y_train)): # folds 1, 2 ,3 ,4, 5
        # example: training from 1,2,3,4; validation from 5
        if df_input:
            x_train_kf, x_val_kf = x_train.iloc[train_index, :], x_train.iloc[val_index, :]
        else:
            x_train_kf, x_val_kf = x_train[train_index], x_train[val_index]

        y_train_kf, y_val_kf = np.log1p(y_values[train_index]), np.log1p(y_values[val_index])

        for seed in range(nseeds):
            params['feature_fraction_seed'] = seed
            params['bagging_seed'] = seed

            if use_cat:
                lgb_train = lgb.Dataset(x_train_kf, y_train_kf, categorical_feature=cat_features)
                lgb_val = lgb.Dataset(x_val_kf, y_val_kf, reference=lgb_train, categorical_feature=cat_features)
            else:
                lgb_train = lgb.Dataset(x_train_kf, y_train_kf)
                lgb_val = lgb.Dataset(x_val_kf, y_val_kf, reference=lgb_train)

            # NOTE(review): `early_stopping_rounds` / `verbose_eval` are keyword
            # arguments of the lgb.train version this notebook was run with;
            # confirm against the installed LightGBM release.
            gbm = lgb.train(params,
                            lgb_train,
                            num_boost_round=num_boost_round,
                            valid_sets=[lgb_val],
                            early_stopping_rounds=early_stopping,
                            feval=feval_rmsle,
                            verbose_eval=verbose_eval)

            # Accumulate the per-seed predictions (back on the original scale).
            train_pred[val_index] += np.expm1(gbm.predict(x_val_kf, num_iteration=gbm.best_iteration))
            test_pred += np.expm1((gbm.predict(x_test, num_iteration=gbm.best_iteration)))

        # Average the accumulated out-of-fold predictions over the seeds.
        train_pred[val_index] /= nseeds

        fold_rmsle = rmsle(train_pred[val_index], y_values[val_index])
        if verbose:
            print('fold cv {} RMSLE score is {:.6f}'.format(i, fold_rmsle))

    test_pred = test_pred / (nseeds * kf.n_splits)
    cv_score = rmsle(train_pred, y_train)
    
    if verbose:
        print('cv RMSLE score is {:.6f}'.format(cv_score))
        end_time = time.time()
        print("it takes %.3f seconds to perform cross validation" % (end_time - start_time))
    
    if scoreonly:
        return cv_score
    else:
        return cv_score, train_pred, test_pred

# 1. Modeling based on train_1

In [12]:
# Baseline 3-fold CV on the first training set with hand-picked parameters.
kf = StratifiedKFold(n_splits=3, shuffle=True, random_state=2018)

# No 'metric' entry is set ('RMSE' was left commented out by the author).
lgb_params = dict(
    boosting_type='dart',
    max_depth=5,
    max_bin=500,
    learning_rate=0.1,  # 0.618580
    num_leaves=22,
)

print('Start training...')

cv_score = cross_validate_lgb(
    lgb_params, train_1, y_1, test_1, kf,
    verbose=True, verbose_eval=50, df_input=True, scoreonly=True,
)

print('cv RMSLE score is {:.6f}'.format(cv_score))

Start training...
Training until validation scores don't improve for 100 rounds.
[50]	valid_0's rmsle: 0.570788
[100]	valid_0's rmsle: 0.574997
Early stopping, best iteration is:
[45]	valid_0's rmsle: 0.503997
fold cv 0 RMSLE score is 1.783158
Training until validation scores don't improve for 100 rounds.
[50]	valid_0's rmsle: 0.577542
[100]	valid_0's rmsle: 0.582236
Early stopping, best iteration is:
[45]	valid_0's rmsle: 0.506213
fold cv 1 RMSLE score is 1.789501
Training until validation scores don't improve for 100 rounds.
[50]	valid_0's rmsle: 0.573235
[100]	valid_0's rmsle: 0.577793
Early stopping, best iteration is:
[45]	valid_0's rmsle: 0.503836
fold cv 2 RMSLE score is 1.784790
cv RMSLE score is 1.785819
it takes 30.249 seconds to perform cross validation
cv RMSLE score is 1.785819


__Bayesian Optimisation - Setup__

In [15]:
# Search bounds (lower, upper) for each argument of lgbcv_func.
params = dict(
    num_leaves=(7, 4095),
    max_depth=(2, 63),
    learning_rate=(0.05, 0.3),
    scale_pos_weight=(1, 10000),
    min_sum_hessian_in_leaf=(2, 30),
    subsample=(0.4, 1.0),
    colsample_bytree=(0.4, 1.0),
    feature_fraction=(0.1, 0.9),
    bagging_fraction=(0.1, 0.9),
    bagging_freq=(0, 2),
    lambda_l1=(0.0, 1.0),
    lambda_l2=(0.0, 1.0),
    n_estimators=(2, 30),
    reg_lambda=(0.0, 2.0),
    min_gain_to_split=(0.0, 1.0),
)

In [26]:
# reload(lgb_wrapper)
#def lgbcv_func(max_depth, learning_rate, subsample, colsample_bytree, nthread=4, seed=0):
def lgbcv_func(num_leaves, max_depth, learning_rate,
               scale_pos_weight, 
               min_sum_hessian_in_leaf, 
               subsample, 
               colsample_bytree,
               feature_fraction, bagging_fraction, 
               bagging_freq, lambda_l1, lambda_l2,
               n_estimators,reg_lambda,min_gain_to_split,
               nthread=4):
    """Objective for the Bayesian optimiser: ``1 - CV RMSLE`` on train_1 / y_1.

    Every argument except *nthread* is a sampled hyper-parameter value;
    ``num_leaves``, ``max_depth`` and ``bagging_freq`` are truncated to int
    before use. *nthread* is accepted but not used in the body.

    NOTE(review): several keys look like LightGBM aliases of each other
    (e.g. subsample / bagging_fraction, colsample_bytree / feature_fraction,
    lambda_l2 / reg_lambda) - confirm which one takes effect.
    """
    fixed_settings = {
        'objective' : "regression",
        'task': 'train',
        'boosting_type': 'dart',
    }
    sampled_settings = {
        'num_leaves': int(num_leaves),
        'max_depth': int(max_depth), 
        'learning_rate': float(learning_rate),
        'scale_pos_weight':scale_pos_weight,
        'min_sum_hessian_in_leaf':float(min_sum_hessian_in_leaf), 
        'subsample':subsample,
        'colsample_bytree':colsample_bytree,
        'feature_fraction':feature_fraction, 
        'bagging_fraction':bagging_fraction,
        'bagging_freq':int(bagging_freq), 
        'lambda_l1':lambda_l1, 
        'lambda_l2':lambda_l2,
        'n_estimators':n_estimators,
        'reg_lambda':reg_lambda,
        'min_gain_to_split':min_gain_to_split,
    }
    candidate = {**fixed_settings, **sampled_settings}

    # 10-fold CV for a steadier out-of-fold estimate on this dataset.
    folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=2018)

    # All verbose output is switched off so the optimiser's own log stays readable.
    cv_rmsle = cross_validate_lgb(candidate, train_1, y_1, test_1, folds,
                                  verbose=False, verbose_eval=False, scoreonly=True)
    # The optimiser maximises, so a lower error has to give a higher value.
    return 1 - cv_rmsle

In [27]:
# Build the optimiser from the CV objective `lgbcv_func` and the `params` bounds dict.
lgb_bo=BayesianOptimization(lgbcv_func, params)

In [28]:
# Run the search with init_points=5 and n_iter=15; each evaluation runs a full
# 10-fold CV inside lgbcv_func.
lgb_bo.maximize(init_points=5, n_iter=15)

[31mInitialization[0m
[94m-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   bagging_fraction |   bagging_freq |   colsample_bytree |   feature_fraction |   lambda_l1 |   lambda_l2 |   learning_rate |   max_depth |   min_gain_to_split |   min_sum_hessian_in_leaf |   n_estimators |   num_leaves |   reg_lambda |   scale_pos_weight |   subsample | 
    1 | 08m19s | [35m  -0.76066[0m | [32m            0.5136[0m | [32m        0.6819[0m | [32m            0.8882[0m | [32m            0.6091[0m | [32m     0.4021[0m | [32m     0.1456[0m | [32m         0.0954[0m | [32m    55.9998[0m | [32m             0.1872[0m | [32m                   8.3007[0m | [32m       20.5923[0m | [32m   1871.9580

In [29]:
# Report the best objective value found and the parameters that produced it.
best_result = lgb_bo.res['max']
print('-'*30)
print('Maximum value: %f' % best_result['max_val'])
print('Best parameters: ', best_result['max_params'])

------------------------------
Maximum value: 0.484076
Best parameters:  {'num_leaves': 1812.0372756775496, 'max_depth': 2.0, 'learning_rate': 0.29999999999999999, 'scale_pos_weight': 7742.3535594133255, 'min_sum_hessian_in_leaf': 30.0, 'subsample': 1.0, 'colsample_bytree': 0.40000000000000002, 'feature_fraction': 0.90000000000000002, 'bagging_fraction': 0.10000000000000001, 'bagging_freq': 0.0, 'lambda_l1': 1.0, 'lambda_l2': 0.0, 'n_estimators': 2.0, 'reg_lambda': 0.0, 'min_gain_to_split': 1.0}


__Verification__

In [30]:
# Re-run 10-fold CV with the best parameters reported by the optimiser and keep
# the out-of-fold / test predictions.
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=2018)

lgb_params = dict(
    num_leaves=int(1812.0372756775496),
    max_depth=int(2.0),
    learning_rate=0.29999999999999999,
    scale_pos_weight=7742.3535594133255,
    min_sum_hessian_in_leaf=30.0,
    subsample=1.0,
    colsample_bytree=0.40000000000000002,
    feature_fraction=0.90000000000000002,
    bagging_fraction=0.10000000000000001,
    bagging_freq=int(0.0),
    lambda_l1=1.0,
    lambda_l2=0.0,
    n_estimators=2.0,
    reg_lambda=0.0,
    min_gain_to_split=1.0,
)

outcomes = cross_validate_lgb(lgb_params, train_1, y_1, test_1, kf, verbose_eval=False)

# (cv score, out-of-fold train predictions, averaged test predictions)
lgb_cv, lgb_train_1_pred, lgb_test_1_pred = outcomes[0], outcomes[1], outcomes[2]

lgb_train_1_pred_df = pd.DataFrame(lgb_train_1_pred, columns=['visitors'])
lgb_test_1_pred_df = pd.DataFrame(lgb_test_1_pred, columns=['visitors'])

fold cv 0 RMSLE score is 0.499625
fold cv 1 RMSLE score is 0.493109
fold cv 2 RMSLE score is 0.494765
fold cv 3 RMSLE score is 0.491680
fold cv 4 RMSLE score is 0.494757
fold cv 5 RMSLE score is 0.494452
fold cv 6 RMSLE score is 0.495229
fold cv 7 RMSLE score is 0.493174
fold cv 8 RMSLE score is 0.489943
fold cv 9 RMSLE score is 0.490895
cv RMSLE score is 0.493775
it takes 65.343 seconds to perform cross validation


# 2. Modeling based on train_2

In [31]:
# Baseline 3-fold CV on the second training set with hand-picked parameters.
kf = StratifiedKFold(n_splits=3, shuffle=True, random_state=2018)

# No 'metric' entry is set ('RMSE' was left commented out by the author).
lgb_params = dict(
    boosting_type='dart',
    max_depth=5,
    max_bin=500,
    learning_rate=0.1,  # 0.618580
    num_leaves=22,
)

print('Start training...')

cv_score = cross_validate_lgb(
    lgb_params, train_2, y_2, test_2, kf,
    verbose=True, verbose_eval=50, df_input=True, scoreonly=True,
)

print('cv RMSLE score is {:.6f}'.format(cv_score))

Start training...
Training until validation scores don't improve for 100 rounds.
[50]	valid_0's rmsle: 0.567371
[100]	valid_0's rmsle: 0.571482
Early stopping, best iteration is:
[45]	valid_0's rmsle: 0.498357
fold cv 0 RMSLE score is 1.781551
Training until validation scores don't improve for 100 rounds.
[50]	valid_0's rmsle: 0.569884
[100]	valid_0's rmsle: 0.574181
Early stopping, best iteration is:
[45]	valid_0's rmsle: 0.499114
fold cv 1 RMSLE score is 1.784154
Training until validation scores don't improve for 100 rounds.
[50]	valid_0's rmsle: 0.564257
[100]	valid_0's rmsle: 0.568736
Early stopping, best iteration is:
[45]	valid_0's rmsle: 0.49446
fold cv 2 RMSLE score is 1.779858
cv RMSLE score is 1.781856
it takes 38.891 seconds to perform cross validation
cv RMSLE score is 1.781856


__Bayesian Optimisation - Setup__

In [32]:
# Search space for the Bayesian optimiser: one (lower, upper) bound per
# keyword argument of lgbcv_func. Key order is kept as originally written.
params = {
    # tree shape
    'num_leaves': (7, 4095),
    'max_depth': (2, 63),

    # learning rate and weighting
    'learning_rate': (0.05, 0.3),
    'scale_pos_weight': (1, 10000),
    'min_sum_hessian_in_leaf': (2, 30),

    # row / column sampling
    'subsample': (0.4, 1.0),
    'colsample_bytree': (0.4, 1.0),
    'feature_fraction': (0.1, 0.9),
    'bagging_fraction': (0.1, 0.9),
    'bagging_freq': (0, 2),

    # regularisation
    'lambda_l1': (0.0, 1.0),
    'lambda_l2': (0.0, 1.0),

    # boosting rounds
    'n_estimators': (2, 30),

    # more regularisation / split control
    'reg_lambda': (0.0, 2.0),
    'min_gain_to_split': (0.0, 1.0),
}

In [33]:
# reload(lgb_wrapper)
#def lgbcv_func(max_depth, learning_rate, subsample, colsample_bytree, nthread=4, seed=0):
def lgbcv_func(num_leaves, max_depth, learning_rate,
               scale_pos_weight, 
               min_sum_hessian_in_leaf, 
               subsample, 
               colsample_bytree,
               feature_fraction, bagging_fraction, 
               bagging_freq, lambda_l1, lambda_l2,
               n_estimators,reg_lambda,min_gain_to_split,
               nthread=4):
    """Objective function for the Bayesian optimiser.

    Builds a LightGBM parameter dict (regression objective, DART boosting)
    from one proposed set of hyper-parameter values and scores it with
    10-fold cross-validation on ``train_2`` / ``y_2``.

    The optimiser proposes every value as a float, so the integer-valued
    parameters (``num_leaves``, ``max_depth``, ``bagging_freq``,
    ``n_estimators``) are truncated with ``int()`` before use.

    Args:
        num_leaves, max_depth, learning_rate, scale_pos_weight,
        min_sum_hessian_in_leaf, subsample, colsample_bytree,
        feature_fraction, bagging_fraction, bagging_freq, lambda_l1,
        lambda_l2, n_estimators, reg_lambda, min_gain_to_split:
            Hyper-parameter values forwarded under the same names.
        nthread: Accepted but not used by this function.

    Returns:
        ``1 - score`` where ``score`` is the value returned by
        ``cross_validate_lgb(..., scoreonly=True)``, so that maximising the
        return value minimises the CV score.
    """
    # NOTE(review): LightGBM documents subsample/bagging_fraction,
    # colsample_bytree/feature_fraction and reg_lambda/lambda_l2 as aliases;
    # both spellings are forwarded here -- confirm which one takes effect.
    params = {
        'objective' : "regression",
        'task': 'train',
        'boosting_type': 'dart',
                
        'num_leaves': int(num_leaves),
        'max_depth': int(max_depth), 
        'learning_rate': float(learning_rate),
        'scale_pos_weight':scale_pos_weight,
        'min_sum_hessian_in_leaf':float(min_sum_hessian_in_leaf), 
        'subsample':subsample,
        'colsample_bytree':colsample_bytree,
        'feature_fraction':feature_fraction, 
        'bagging_fraction':bagging_fraction,
        'bagging_freq':int(bagging_freq), 
        'lambda_l1':lambda_l1, 
        'lambda_l2':lambda_l2,
        # A number of boosting rounds must be an integer; the optimiser
        # proposes fractional values such as 12.1459.
        'n_estimators':int(n_estimators),
        'reg_lambda':reg_lambda,
        'min_gain_to_split':min_gain_to_split       
        #'metric': 'RMSE'
    }
    
    # for a more ideal out-of-fold model prediction for this dataset, we use 10-fold CV
    kf=StratifiedKFold(n_splits=10, shuffle=True, random_state=2018)
    
    # All verbose output is disabled in this call so the optimisation log
    # stays readable.
    return 1-cross_validate_lgb(params, train_2, y_2, test_2, kf, verbose=False, verbose_eval=False, scoreonly=True)

In [34]:
# Optimiser over lgbcv_func, searching within the bounds given in params.
lgb_bo = BayesianOptimization(f=lgbcv_func, pbounds=params)

In [35]:
# Run the search: 5 initial points, then 15 optimisation iterations.
lgb_bo.maximize(
    init_points=5,
    n_iter=15,
)

[31mInitialization[0m
[94m-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   bagging_fraction |   bagging_freq |   colsample_bytree |   feature_fraction |   lambda_l1 |   lambda_l2 |   learning_rate |   max_depth |   min_gain_to_split |   min_sum_hessian_in_leaf |   n_estimators |   num_leaves |   reg_lambda |   scale_pos_weight |   subsample | 
    1 | 02m04s | [35m  -0.78045[0m | [32m            0.5702[0m | [32m        1.7636[0m | [32m            0.7240[0m | [32m            0.3566[0m | [32m     0.6395[0m | [32m     0.7273[0m | [32m         0.1244[0m | [32m     5.0359[0m | [32m             0.7410[0m | [32m                   6.1572[0m | [32m       12.1459[0m | [32m   1279.2608

In [37]:
# Report the best objective value found and the parameters that produced it.
print('-' * 30)
print('Maximum value: %f' % (lgb_bo.res['max']['max_val'],))
print('Best parameters: ', lgb_bo.res['max']['max_params'])

------------------------------
Maximum value: 0.502675
Best parameters:  {'num_leaves': 7.0, 'max_depth': 63.0, 'learning_rate': 0.29999999999999999, 'scale_pos_weight': 4653.6549687603901, 'min_sum_hessian_in_leaf': 30.0, 'subsample': 1.0, 'colsample_bytree': 1.0, 'feature_fraction': 0.90000000000000002, 'bagging_fraction': 0.90000000000000002, 'bagging_freq': 2.0, 'lambda_l1': 1.0, 'lambda_l2': 1.0, 'n_estimators': 30.0, 'reg_lambda': 0.0, 'min_gain_to_split': 1.0}


__Verification__

In [38]:
# Re-run 10-fold cross-validation on train_2 with the best parameters printed
# by the optimiser, this time keeping the train and test predictions.
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=2018)

# Values are copied from the "Best parameters" output. The optimiser reports
# every value as a float, so the parameters LightGBM defines as integers are
# cast with int().
# NOTE(review): LightGBM documents subsample/bagging_fraction,
# colsample_bytree/feature_fraction and reg_lambda/lambda_l2 as aliases of one
# another; both spellings are passed here -- confirm which one takes effect.
lgb_params = {
    'num_leaves': int(7.0),
    'max_depth': int(63.0),
    'learning_rate': 0.29999999999999999,
    'scale_pos_weight': 4653.6549687603901,
    'min_sum_hessian_in_leaf': 30.0,
    'subsample': 1.0,
    'colsample_bytree': 1.0,
    'feature_fraction': 0.90000000000000002,
    'bagging_fraction': 0.90000000000000002,
    'bagging_freq': int(2.0),
    'lambda_l1': 1.0,
    'lambda_l2': 1.0,
    # A number of boosting rounds: pass an int, like the other integer
    # parameters, instead of the raw float reported by the optimiser.
    'n_estimators': int(30.0),
    'reg_lambda': 0.0,
    'min_gain_to_split': 1.0,
}

outcomes = cross_validate_lgb(lgb_params, train_2, y_2, test_2, kf, verbose_eval=False)

# The result is indexed as: [0] CV score, [1] train predictions, [2] test predictions.
lgb_cv = outcomes[0]
lgb_train_2_pred = outcomes[1]
lgb_test_2_pred = outcomes[2]

# Wrap the predictions in single-column frames named like the target column.
lgb_train_2_pred_df = pd.DataFrame(columns=['visitors'], data=lgb_train_2_pred)
lgb_test_2_pred_df = pd.DataFrame(columns=['visitors'], data=lgb_test_2_pred)

fold cv 0 RMSLE score is 0.488026
fold cv 1 RMSLE score is 0.485111
fold cv 2 RMSLE score is 0.480672
fold cv 3 RMSLE score is 0.480342
fold cv 4 RMSLE score is 0.472245
fold cv 5 RMSLE score is 0.476952
fold cv 6 RMSLE score is 0.479841
fold cv 7 RMSLE score is 0.481549
fold cv 8 RMSLE score is 0.473780
fold cv 9 RMSLE score is 0.476972
cv RMSLE score is 0.479580
it takes 78.843 seconds to perform cross validation


# Saving data

In [41]:
def _attach_predictions(frame, predictions):
    """Return ``frame`` with ``predictions`` appended column-wise by row position.

    ``pd.concat(axis=1)`` aligns on index labels, so both sides get a fresh
    0..n-1 index first; otherwise a non-default index on ``frame`` would
    misalign the predictions and introduce NaN rows. Columns of ``frame``
    that share a name with a prediction column are dropped, so re-running
    the cell replaces the predictions instead of adding duplicate columns.
    """
    base = frame.drop(list(predictions.columns), axis=1, errors='ignore')
    return pd.concat(
        [base.reset_index(drop=True), predictions.reset_index(drop=True)],
        axis=1,
    )


# Level-1 predictions of the train_1 model: save to CSV and attach the test
# predictions to the matching part of the test set.
lv1_lgb_train_1_pred = lgb_train_1_pred_df.copy()
lv1_lgb_train_1_pred.to_csv('lv1_lgb_train_1_pred.csv', index=False)

lv1_lgb_test_1_pred = lgb_test_1_pred_df.copy()
lv1_lgb_test_1_pred.to_csv('lv1_lgb_test_1_pred.csv', index=False)

# Assumes the rows of _tes_316 are in the same order as the test_1
# predictions -- TODO confirm where _tes_316 is built.
_tes_316 = _attach_predictions(_tes_316, lv1_lgb_test_1_pred)

# Same for the train_2 model.
lv1_lgb_train_2_pred = lgb_train_2_pred_df.copy()
lv1_lgb_train_2_pred.to_csv('lv1_lgb_train_2_pred.csv', index=False)

lv1_lgb_test_2_pred = lgb_test_2_pred_df.copy()
lv1_lgb_test_2_pred.to_csv('lv1_lgb_test_2_pred.csv', index=False)

_tes_513 = _attach_predictions(_tes_513, lv1_lgb_test_2_pred)

# Stack both parts and keep only the submission columns, ordered by id.
lgb_test_pred_df = pd.concat([_tes_316, _tes_513])

sub_ = lgb_test_pred_df[['id', 'visitors']].copy()

sub_ = sub_.sort_values(by=['id'])

# Submission

In [42]:
# Write the LightGBM-only submission file.
# Run notes: lgbm / fe / bopt / train data separation
# LB: ----
sub_.to_csv(
    'submission_rs_recruit_v15_lgbm_fe_suzukiry_01.csv',
    index=False,
)
print('Finished.')

Finished.


__Final step: blending with weighted-mean predictions__

In [43]:
# Independent copy of the LightGBM submission, used for the blend below.
sub1 = sub_.copy(deep=True)

In [44]:
# Weighted-mean baseline, adapted from hklee:
# https://www.kaggle.com/zeemeen/weighted-mean-comparisons-lb-0-497-1st/code

# Load every CSV in the data directory, keyed by file name without extension.
# The pattern is a raw string so that `\.` is not an invalid string escape.
dfs = {re.search(r'/([^/\.]*)\.csv', fn).group(1): pd.read_csv(fn)
       for fn in glob.glob('../../../mltestdata/05_recruit/*.csv')}

# Expose each table as a module-level variable named after its file
# (date_info, air_visit_data and sample_submission are used below).
globals().update(dfs)

# A holiday that falls on a Saturday or Sunday is treated as a plain weekend day.
wkend_holidays = (date_info.day_of_week.isin(['Sunday', 'Saturday'])
                  & (date_info.holiday_flg == 1))
date_info.loc[wkend_holidays, 'holiday_flg'] = 0
# Weight grows with the row position raised to the 5th power, so later rows
# of date_info (presumably later dates) count far more than earlier ones.
date_info['weight'] = ((date_info.index + 1) / len(date_info)) ** 5

visit_data = air_visit_data.merge(date_info, left_on='visit_date', right_on='calendar_date', how='left')
visit_data.drop('calendar_date', axis=1, inplace=True)
# Average in log1p space; np is used directly because pandas no longer
# provides the `pd.np` alias.
visit_data['visitors'] = np.log1p(visit_data.visitors)


def wmean(x):
    """Weighted mean of ``visitors`` within one group, using ``weight``."""
    return (x.weight * x.visitors).sum() / x.weight.sum()


# One weighted mean per (store, day of week, holiday flag).
visitors = (visit_data
            .groupby(['air_store_id', 'day_of_week', 'holiday_flg'])
            .apply(wmean)
            .reset_index(name='visitors'))

# Split the submission id into its store id (everything before the last '_')
# and its date (third '_'-separated field).
sample_submission['air_store_id'] = sample_submission.id.map(lambda x: '_'.join(x.split('_')[:-1]))
sample_submission['calendar_date'] = sample_submission.id.map(lambda x: x.split('_')[2])
sample_submission.drop('visitors', axis=1, inplace=True)
sample_submission = sample_submission.merge(date_info, on='calendar_date', how='left')
sample_submission = sample_submission.merge(
    visitors, on=['air_store_id', 'day_of_week', 'holiday_flg'], how='left')

# Fallback 1: no match for (store, day of week, holiday flag) -> use the same
# store and day of week with holiday_flg == 0.
missings = sample_submission.visitors.isnull()
sample_submission.loc[missings, 'visitors'] = sample_submission[missings].merge(
    visitors[visitors.holiday_flg == 0], on=['air_store_id', 'day_of_week'],
    how='left')['visitors_y'].values

# Fallback 2: still missing -> use the store's mean over all of its groups.
missings = sample_submission.visitors.isnull()
sample_submission.loc[missings, 'visitors'] = sample_submission[missings].merge(
    visitors[['air_store_id', 'visitors']].groupby('air_store_id').mean().reset_index(),
    on='air_store_id', how='left')['visitors_y'].values

# Back from log1p space to visitor counts.
sample_submission['visitors'] = np.expm1(sample_submission.visitors)
sub2 = sample_submission[['id', 'visitors']].copy()
# visitors_x comes from sub1, visitors_y from the weighted-mean baseline.
sub_merge = pd.merge(sub1, sub2, on='id', how='inner')

In [45]:
# Blend: average visitors_x (from sub1) with visitors_y (from sub2) scaled
# by 1.1, then write the final submission.
# Run notes: lgbm / fe / bopt / train data separation / weight
# LB: ---
sub_merge['visitors'] = (sub_merge['visitors_x'] + 1.1 * sub_merge['visitors_y']) / 2
sub_merge[['id', 'visitors']].to_csv(
    'submission_rs_recruit_v15_lgbm_fe_suzukiry_02.csv',
    index=False,
)