# Test xgboost with Bayesian Optimsation

# Feature Engineering

In [1]:
from sklearn.model_selection import KFold, GridSearchCV, train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error,r2_score
import time
%matplotlib inline

import xgboost
from bayes_opt import BayesianOptimization

import warnings
warnings.filterwarnings("ignore")

In [2]:
### Thanks to Shinji Suzuki

# OS
import glob, re
from datetime import datetime
import pickle

# data science tool
import numpy as np
import pandas as pd
import datetime as dt


# machine learning
from sklearn import *
from xgboost import XGBRegressor
import xgboost as xgb
import lightgbm as lgb

# データの読み込み
# 事前にcalendar_dateをvisit_dataに変更しています。airとhpgで同じことですが、別名で使用されているようです。
data = {
    'tra': pd.read_csv('../../../mltestdata/05_recruit/air_visit_data.csv'),
    'as': pd.read_csv('../../../mltestdata/05_recruit/air_store_info.csv'),
    'hs': pd.read_csv('../../../mltestdata/05_recruit/hpg_store_info.csv'),
    'ar': pd.read_csv('../../../mltestdata/05_recruit/air_reserve.csv'),
    'hr': pd.read_csv('../../../mltestdata/05_recruit/hpg_reserve.csv'),
    'id': pd.read_csv('../../../mltestdata/05_recruit/store_id_relation.csv'),
    'tes': pd.read_csv('../../../mltestdata/05_recruit/sample_submission.csv'),
    'hol': pd.read_csv('../../../mltestdata/05_recruit/date_info.csv').rename(columns={'calendar_date':'visit_date'})
}

# それぞれのデータをマージするために、まずは、relation用のものをマージします
data['hr'] = pd.merge(data['hr'], data['id'], how = 'inner', on = ['hpg_store_id'])

for df in ['ar', 'hr']:
    data[df]['visit_datetime'] = pd.to_datetime(data[df]['visit_datetime'])
    data[df]['visit_datetime'] = data[df]['visit_datetime'].dt.date
    data[df]['reserve_datetime'] = pd.to_datetime(data[df]['reserve_datetime'])
    data[df]['reserve_datetime'] = data[df]['reserve_datetime'].dt.date
    data[df]['reserve_datetime_diff'] = data[df].apply(lambda r:(r['visit_datetime']- r['reserve_datetime']).days, axis = 1)
    tmp1 = data[df].groupby(['air_store_id', 'visit_datetime'], as_index =False)[['reserve_datetime_diff', 'reserve_visitors']].sum().rename(columns = {'visit_datetime':'visit_date', 'reserve_datetime_diff':'rs1', 'reserve_visitors':'rv1'})
    tmp2 = data[df].groupby(['air_store_id', 'visit_datetime'], as_index =False)[['reserve_datetime_diff', 'reserve_visitors']].mean().rename(columns = {'visit_datetime':'visit_date', 'reserve_datetime_diff':'rs2', 'reserve_visitors':'rv2'})
    data[df] = pd.merge(tmp1, tmp2, how = 'inner', on = ['air_store_id', 'visit_date'])

data['tra']['visit_date'] = pd.to_datetime(data['tra']['visit_date'])
data['tra']['dow'] = data['tra']['visit_date'].dt.dayofweek
data['tra']['year'] = data['tra']['visit_date'].dt.year
data['tra']['month'] = data['tra']['visit_date'].dt.month
data['tra']['visit_date'] = data['tra']['visit_date'].dt.date

data['tes']['visit_date'] = data['tes']['id'].map(lambda x: str(x).split('_')[2])
data['tes']['air_store_id'] = data['tes']['id'].map(lambda x: '_'.join(x.split('_')[:2]))
data['tes']['visit_date'] = pd.to_datetime(data['tes']['visit_date'])
data['tes']['dow'] = data['tes']['visit_date'].dt.dayofweek
data['tes']['year'] = data['tes']['visit_date'].dt.year
data['tes']['month'] = data['tes']['visit_date'].dt.month
data['tes']['visit_date'] = data['tes']['visit_date'].dt.date

unique_stores = data['tes']['air_store_id'].unique()

stores = pd.concat([pd.DataFrame({'air_store_id':unique_stores, 'dow':[i]*len(unique_stores)}) for i in range(7)], axis =0, ignore_index = True).reset_index(drop = True) 

#曜日だけでなく、月も追加
stores_m = pd.concat([pd.DataFrame({'air_store_id':unique_stores, 'month':[i]*len(unique_stores)}) for i in range(1,13)], axis =0, ignore_index = True).reset_index(drop = True)
stores = pd.merge(stores_m, stores,on=('air_store_id'), how='left')

tmp = data['tra'].groupby(['air_store_id', 'dow'], as_index = False)['visitors'].min().rename(columns = {'visitors':'min_visitors'})
stores = pd.merge(stores, tmp, how ='left', on = ['air_store_id', 'dow'])
tmp = data['tra'].groupby(['air_store_id', 'dow'], as_index = False)['visitors'].mean().rename(columns = {'visitors':'mean_visitors'})
stores = pd.merge(stores, tmp, how ='left', on = ['air_store_id', 'dow'])
tmp = data['tra'].groupby(['air_store_id', 'dow'], as_index = False)['visitors'].median().rename(columns = {'visitors':'median_visitors'})
stores = pd.merge(stores, tmp, how ='left', on = ['air_store_id', 'dow'])
tmp = data['tra'].groupby(['air_store_id', 'dow'], as_index = False)['visitors'].max().rename(columns = {'visitors':'max_visitors'})
stores = pd.merge(stores, tmp, how ='left', on = ['air_store_id', 'dow'])
tmp = data['tra'].groupby(['air_store_id', 'dow'], as_index = False)['visitors'].count().rename(columns = {'visitors':'count_visitors'})
stores = pd.merge(stores, tmp, how ='left', on = ['air_store_id', 'dow'])

#曜日だけでなく、ID×月も追加
tmp = data['tra'].groupby(['air_store_id', 'month'], as_index = False)['visitors'].min().rename(columns = {'visitors':'m_min_visitors'})
stores = pd.merge(stores, tmp, how ='left', on = ['air_store_id', 'month'])
tmp = data['tra'].groupby(['air_store_id', 'month'], as_index = False)['visitors'].mean().rename(columns = {'visitors':'m_mean_visitors'})
stores = pd.merge(stores, tmp, how ='left', on = ['air_store_id', 'month'])
tmp = data['tra'].groupby(['air_store_id', 'month'], as_index = False)['visitors'].median().rename(columns = {'visitors':'m_median_visitors'})
stores = pd.merge(stores, tmp, how ='left', on = ['air_store_id', 'month'])
tmp = data['tra'].groupby(['air_store_id', 'month'], as_index = False)['visitors'].max().rename(columns = {'visitors':'m_max_visitors'})
stores = pd.merge(stores, tmp, how ='left', on = ['air_store_id', 'month'])
tmp = data['tra'].groupby(['air_store_id', 'month'], as_index = False)['visitors'].count().rename(columns = {'visitors':'m_count_visitors'})
stores = pd.merge(stores, tmp, how ='left', on = ['air_store_id', 'month'])

stores = pd.merge(stores, data['as'], how= "left", on = ['air_store_id'])

stores['air_genre_name'] = stores['air_genre_name'].map(lambda x: str(str(x).replace('/', ' ')))
stores['air_area_name'] = stores['air_area_name'].map(lambda x: str(str(x).replace('-', ' ')))

lbl = preprocessing.LabelEncoder()
for i in range(10):
    stores['air_genre_name' + str(i)] = lbl.fit_transform(stores['air_genre_name'].map(lambda x: str(str(x).split(' ')[i]) if len(str(x).split(' '))>i else ' '))
    stores['air_area_name' + str(i)] = lbl.fit_transform(stores['air_area_name'].map(lambda x: str(str(x).split(' ')[i]) if len(str(x).split(' '))>i else ' '))
stores['air_genre_name'] = lbl.fit_transform(stores['air_genre_name'])
stores['air_area_name'] = lbl.fit_transform(stores['air_area_name'])

#土日フラグ(day_of_week_1)と、休日前(holi_2)フラグを追加
data['hol']['day_of_week_1']= data['hol']['day_of_week'].replace(['Saturday', 'Sunday','Monday','Tuesday','Wednesday','Thursday','Friday'],['1', '1','0','0','0','0','0']).astype('int')
data['hol']['holi_2'] = data['hol'][['holiday_flg', 'day_of_week_1']].sum(axis = 1)
data['hol']['holi_2'] = data['hol']['holi_2'].apply( lambda x: 0 if x < 1 else 1 )
data['hol']['holi_2'] = data['hol']['holi_2'].shift(-1)
data['hol']['holi_2'] = data['hol']['holi_2'].fillna(1)
data['hol']['holi_2'] = data['hol']['holi_2'].astype('int')

data['hol']['visit_date'] = pd.to_datetime(data['hol']['visit_date'])
data['hol']['day_of_week'] = lbl.fit_transform(data['hol']['day_of_week'])
data['hol']['visit_date'] = data['hol']['visit_date'].dt.date
train = pd.merge(data['tra'], data['hol'], how ='left', on = ['visit_date'])
test = pd.merge(data['tes'], data['hol'], how ='left', on = ['visit_date'])

#曜日と月でmerge
train = pd.merge(train, stores, how ='left', on = ['air_store_id', 'dow','month'])
test = pd.merge(test, stores, how ='left', on = ['air_store_id', 'dow','month'])

#ID×休日前でのvisitorsの平均、中央値等を追加
tmp = train.groupby(['air_store_id','holi_2'], as_index = False)['visitors'].min().rename(columns = {'visitors':'h_min_visitors'})
train = pd.merge(train, tmp, how ='left', on = ['air_store_id','holi_2'])
test = pd.merge(test, tmp, how ='left', on = ['air_store_id','holi_2'])
tmp = train.groupby(['air_store_id','holi_2'], as_index = False)['visitors'].mean().rename(columns = {'visitors':'h_mean_visitors'})
train = pd.merge(train, tmp, how ='left', on = ['air_store_id','holi_2'])
test = pd.merge(test, tmp, how ='left', on = ['air_store_id','holi_2'])
tmp = train.groupby(['air_store_id','holi_2'], as_index = False)['visitors'].median().rename(columns = {'visitors':'h_median_visitors'})
train = pd.merge(train, tmp, how ='left', on = ['air_store_id','holi_2'])
test = pd.merge(test, tmp, how ='left', on = ['air_store_id','holi_2'])
tmp = train.groupby(['air_store_id','holi_2'], as_index = False)['visitors'].max().rename(columns = {'visitors':'h_max_visitors'})
train = pd.merge(train, tmp, how ='left', on = ['air_store_id','holi_2'])
test = pd.merge(test, tmp, how ='left', on = ['air_store_id','holi_2'])
tmp = train.groupby(['air_store_id','holi_2'], as_index = False)['visitors'].count().rename(columns = {'visitors':'h_count_visitors'})
train = pd.merge(train, tmp, how ='left', on = ['air_store_id','holi_2'])
test = pd.merge(test, tmp, how ='left', on = ['air_store_id','holi_2'])

for df in ['ar','hr']:
    train = pd.merge(train, data[df], how='left', on=['air_store_id','visit_date']) 
    test = pd.merge(test, data[df], how='left', on=['air_store_id','visit_date'])

train['id'] = train.apply(lambda r: '_'.join([str(r['air_store_id']), str(r['visit_date'])]), axis=1) 

train['total_reserve_sum'] = train['rv1_x'] + train['rv1_y']
train['total_reserve_mean'] = (train['rv2_x'] + train['rv2_y'])/2
train['total_reserve_dt_diff_mean'] = (train['rs2_x'] + train['rs2_y'])/2

test['total_reserve_sum'] = test['rv1_x'] + test['rv1_y']
test['total_reserve_mean'] = (test['rv2_x'] + test['rv2_y'])/2
test['total_reserve_dt_diff_mean'] = (test['rs2_x'] + test['rs2_y'])/2

train['date_int'] = train['visit_date'].apply(lambda x: x.strftime('%Y%m%d')).astype(int)
test['date_int'] = test['visit_date'].apply(lambda x: x.strftime('%Y%m%d')).astype(int)
train['var_max_lat'] = train['latitude'].max() - train['latitude']
train['var_max_long'] = train['longitude'].max() - train['longitude']
test['var_max_lat'] = test['latitude'].max() - test['latitude']
test['var_max_long'] = test['longitude'].max() - test['longitude']
train['lon_plus_lat'] = train['longitude'] + train['latitude'] 
test['lon_plus_lat'] = test['longitude'] + test['latitude']

lbl = preprocessing.LabelEncoder()
train['air_store_id2'] = lbl.fit_transform(train['air_store_id'])
test['air_store_id2']= lbl.fit_transform(test['air_store_id'])

train['lon_plus_lat'] = train['longitude'] + train['latitude'] 
test['lon_plus_lat'] = test['longitude'] + test['latitude']

col = [c for c in train if c not in ['id', 'air_store_id', 'visit_date', 'visitors']]
train = train.fillna(-1)
test = test.fillna(-1)

ntrain = train.shape[0]
ntest = test.shape[0]

all_data = pd.concat([train, test]) 

#指数移動平均の追加。これはなくても良いかも
#https://www.kaggle.com/c/recruit-restaurant-visitor-forecasting/discussion/46179#266344
#def calc_shifted_ewm(series, alpha, adjust=True):
#    return series.shift().ewm(alpha=alpha, adjust=adjust).mean()

#train['ewm'] = train.groupby(['air_store_id', 'dow']).apply(lambda g: calc_shifted_ewm(g['visitors'], 0.1)).sort_index(level=['air_store_id']).values

#以下、気象データの追加
df_air_store_weather_station = pd.read_csv('../../../mltestdata/05_recruit/air_store_info_with_nearest_active_station.csv')

cols = ['air_store_id', 'station_id', 'station_latitude', 'station_longitude', 'station_vincenty', 'station_great_circle']
all_data = pd.merge(all_data, df_air_store_weather_station[cols], on='air_store_id', how='left')

combine = all_data
filenames = []
df_weather = None
for station_id in combine['station_id'].unique():
    fn = f"../../../mltestdata/05_recruit/1-1-16_5-31-17_Weather/{station_id}.csv"
    if not fn in filenames:
        df = pd.read_csv(fn)
        df['station_id'] = station_id
        if df_weather is None:
            df_weather = df
        else:
            df_weather = pd.concat([df_weather, df])
        del df

        filenames.append(fn)
    else:
        continue

#欠損値を平均で穴埋め（median, ffillで試すも特に差は出なかった）
df_weather = df_weather.fillna(df_weather.mean())

df_weather = df_weather.rename(columns={'calendar_date': 'visit_date'})

df_weather['visit_date'] = pd.to_datetime(df_weather['visit_date'])
df_weather['visit_date'] = df_weather['visit_date'].dt.date

#なんとなく対数化
df_weather['precipitation'] = np.log1p(df_weather['precipitation'])

#使いそうなデータだけ結合（その他の気象データは試していません。特に意味はなし）
cols = ['station_id', 
    'visit_date', 
    'precipitation', 
    'hours_sunlight',
    'avg_temperature',
    'high_temperature',
    'low_temperature']

combine = pd.merge(combine, df_weather[cols], on=['station_id', 'visit_date'], how='left')

#降水量をカテゴリ化
def simplify_pre(df):
    df.precipitation = df.precipitation.fillna(0)
    bins = ( -1, 0.01, 2,  5)
    group_names = ['1', '2', '3']
    categories = pd.cut(df.precipitation, bins, labels=group_names)
    df.precipitation = categories
    return df

combine = simplify_pre(combine) 
all_data = combine 

################################## modified on 1-Feb
#不要そうなデータを削除
#drop_col =['station_id', 'station_latitude','station_longitude','station_vincenty', 'station_great_circle','hours_sunlight','high_temperature','low_temperature']
drop_col =['hours_sunlight','high_temperature','low_temperature']
all_data = all_data.drop(drop_col, axis = 1)

train = all_data[:ntrain]
test = all_data[ntrain:]

################################## modified on 1-Feb
#Encode station_id
lbl = preprocessing.LabelEncoder()
train['station_id'] = lbl.fit_transform(train['station_id']) 
test['station_id'] = lbl.fit_transform(test['station_id']) 



#ID×降水量で平均、中央値等を追加
tmp = train.groupby(['air_store_id','precipitation'], as_index = False)['visitors'].min().rename(columns = {'visitors':'p_min_visitors'})
train = pd.merge(train, tmp, how ='left', on = ['air_store_id','precipitation'])
test = pd.merge(test, tmp, how ='left', on = ['air_store_id','precipitation'])

tmp = train.groupby(['air_store_id','precipitation'], as_index = False)['visitors'].mean().rename(columns = {'visitors':'p_mean_visitors'})
train = pd.merge(train, tmp, how ='left', on = ['air_store_id','precipitation'])
test = pd.merge(test, tmp, how ='left', on = ['air_store_id','precipitation'])

tmp = train.groupby(['air_store_id','precipitation'], as_index = False)['visitors'].median().rename(columns = {'visitors':'p_median_visitors'})
train = pd.merge(train, tmp, how ='left', on = ['air_store_id','precipitation',])
test = pd.merge(test, tmp, how ='left', on = ['air_store_id','precipitation'])

tmp = train.groupby(['air_store_id','precipitation'], as_index = False)['visitors'].max().rename(columns = {'visitors':'p_max_visitors'})
train = pd.merge(train, tmp, how ='left', on = ['air_store_id','precipitation'])
test = pd.merge(test, tmp, how ='left', on = ['air_store_id','precipitation'])

tmp = train.groupby(['air_store_id','precipitation'], as_index = False)['visitors'].count().rename(columns = {'visitors':'p_count_visitors'})
train = pd.merge(train, tmp, how ='left', on = ['air_store_id','precipitation'])
test = pd.merge(test, tmp, how ='left', on = ['air_store_id','precipitation'])

#countをLabel Encoder化。なんとなく試してみたら結果が良かった。
lbl = preprocessing.LabelEncoder()
train['count_visitors'] = lbl.fit_transform(train['count_visitors']) 
test['count_visitors']= lbl.fit_transform(test['count_visitors'])
train['m_count_visitors'] = lbl.fit_transform(train['m_count_visitors'])
test['m_count_visitors']= lbl.fit_transform(test['m_count_visitors'])
train['h_count_reserves'] = lbl.fit_transform(train['h_count_visitors'])
test['h_count_reserves']= lbl.fit_transform(test['h_count_visitors'])
train['p_count_visitors'] = lbl.fit_transform(train['p_count_visitors'])
test['p_count_visitors']= lbl.fit_transform(test['p_count_visitors'])

# GW flag
combine = [train, test]
gw_list = ['2016-04-29','2016-04-30','2016-05-01','2016-05-02','2016-05-03','2016-05-04','2016-05-05','2017-04-29','2017-04-30','2017-05-01','2017-05-02','2017-05-03','2017-05-04','2017-05-05']
post_gw_list=['2016-05-06']
train['gw_flg'] = 0
train['post_gw_flg'] = 0
test['gw_flg'] = 0
test['post_gw_flg'] = 0
update_gw_list = [["0" for i in range(3)] for j in range(len(gw_list))]
update_post_gw_list = [["0" for i in range(3)] for j in range(len(post_gw_list))]

from datetime import date
for index, gw_date in enumerate(gw_list):
    temp_list = gw_date.split("-")
    for col_i, temp_figure in enumerate(temp_list):
        update_gw_list[index][col_i]=int(temp_figure)
        
    #print("{}  {}  {}".format(update_list[index][0],update_list[index][1],update_list[index][2]))
    
for index, gw_date in enumerate(post_gw_list):
    temp_list = gw_date.split("-")
    for col_i, temp_figure in enumerate(temp_list):
        update_post_gw_list[index][col_i]=int(temp_figure)

for dataset in combine:
    for index in range(len(update_gw_list)):
        dataset.loc[dataset.visit_date == date(update_gw_list[index][0],update_gw_list[index][1],update_gw_list[index][2]), 'gw_flg'] = 1
        
for dataset in combine:
    for index in range(len(update_post_gw_list)):
        dataset.loc[dataset.visit_date == date(update_post_gw_list[index][0],update_post_gw_list[index][1],update_post_gw_list[index][2]), 'post_gw_flg'] = 1     

In [3]:
y = train.visitors
train_input = train.copy()
test_input = test.copy()

drop_cols=['visitors','air_store_id','visit_date','id']
train_input=train_input.drop(drop_cols, axis=1)
test_input=test_input.drop(drop_cols, axis=1)

In [4]:
train['precipitation'] = train['precipitation'].values.astype(float)
test['precipitation'] = test['precipitation'].values.astype(float)

train_input['precipitation'] = train_input['precipitation'].values.astype(float)
test_input['precipitation'] = test_input['precipitation'].values.astype(float)

In [5]:
tra = train.copy()
tes = test.copy()

import datetime as dt
from datetime import datetime

#tra = pd.read_csv('/Users/suzukishinji/kaggle/recluit/air_visit_data.csv')
#tes = pd.read_csv('/Users/suzukishinji/kaggle/recluit/sample_submission-3.csv')

#countで試してみましたが、「2016年初めからデータはあるものの営業日の少ないお店」と「データは途中からだけど営業日の多い店」が混じるため、2016/1~2016/6に営業していた店と営業していないお店という分け方をしてみました。

tra['visit_date'] = pd.to_datetime(tra['visit_date'])
tra['visit_date'] = pd.to_datetime(tra['visit_date'])
tra['dow'] = tra['visit_date'].dt.dayofweek
tra['year'] = tra['visit_date'].dt.year
tra['month'] = tra['visit_date'].dt.month
tra['visit_date'] = tra['visit_date'].dt.date

#2016/1~2016/6に営業しているお店は316店舗でした。
year_2016 = tra[tra['year'] == 2016]
month_1 = year_2016[year_2016['month'] == 1]
month_2 = year_2016[year_2016['month'] == 2]
month_3 = year_2016[year_2016['month'] == 3]
month_4 = year_2016[year_2016['month'] == 4]
month_5 = year_2016[year_2016['month'] == 5]
month_6 = year_2016[year_2016['month'] == 6]
tra_store_316_6 = pd.concat([month_1,month_2,month_3,month_4,month_5,month_6])

id = list(tra_store_316_6['air_store_id'].values.flatten())

#trainデータから2016/1~2016/6に営業しているお店の全期間を抜き出したもの。
#trainデータから上記を除いたものが513店舗。
tra_store_316_all = tra[tra['air_store_id'].isin(id)]
tra_store_513 = tra[~tra['air_store_id'].isin(id)] # これはNOTなんだ。

tes['visit_date'] = tes['id'].map(lambda x: str(x).split('_')[2])
tes['air_store_id'] = tes['id'].map(lambda x: '_'.join(x.split('_')[:2]))
tes['visit_date'] = pd.to_datetime(tes['visit_date'])
tes['dow'] = tes['visit_date'].dt.dayofweek
tes['year'] = tes['visit_date'].dt.year
tes['month'] = tes['visit_date'].dt.month
tes['visit_date'] = tes['visit_date'].dt.date

#testデータから2016/1~2016/6に営業しているお店の全期間を抜き出したもの。
#testデータから上記を除いたものが513店舗。
tes_store_316 = tes[tes['air_store_id'].isin(id)]
tes_store_513 = tes[~tes['air_store_id'].isin(id)]

#使うのは①2016/1~2016/6の期間のデータ、②2016/1~2016/6に営業していたお店の全期間のデータ、③途中からの513店舗のデータ。
#tra_store_316_6[['air_store_id', 'visit_date','visitors']].to_csv('tra_store_316_6.csv', index = False)
#tra_store_316_all[['air_store_id', 'visit_date','visitors']].to_csv('tra_store_316_all.csv', index = False)
#tra_store_513[['air_store_id', 'visit_date','visitors']].to_csv('tra_store_513.csv', index = False)

#使うのは①2016/1~2016/6の期間のデータ、②2016/1~2016/6に営業していたお店の全期間のデータ、③途中からの513店舗のデータ。
tra_store_316_6.to_csv('tra_store_316_6.csv', index = False) #index部分がいらない場合はオプションを追加
tra_store_316_all.to_csv('tra_store_316_all.csv', index = False)
tra_store_513.to_csv('tra_store_513.csv', index = False)

#テストデータは期間で分けずに最初からの316店舗と途中からの513店舗
#tes_store_316[['id','visitors']].to_csv('tes_store_316.csv', index = False)
#tes_store_513[['id','visitors']].to_csv('tes_store_513.csv', index = False)

tes_store_316.to_csv('tes_store_316.csv', index = False)
tes_store_513.to_csv('tes_store_513.csv', index = False)

tra_316_6 = pd.read_csv('./tra_store_316_6.csv') # 316 for period 1 (first 6 months)
tra_316_all = pd.read_csv('./tra_store_316_all.csv') # 316 for period 1 & 2
tra_513 = pd.read_csv('./tra_store_513.csv') # 513 for period 2 (expect first 6 months)
tes_316 = pd.read_csv('./tes_store_316.csv') # 316 only
tes_513 = pd.read_csv('./tes_store_513.csv') # 513 only

#出力されているか確認。（①と③はくっつけて使う）
print("Total train: "+str(tra.shape))
print("316 for period 1: "+str(tra_316_6.shape))
print("316 for period 1 & 2: "+str(tra_316_all.shape)+" <=== To be used")
print("513 for period 2 "+str(tra_513.shape))
tra_513_316 = pd.concat([tra_316_6, tra_513], ignore_index=True)
tra_513_316_index = pd.concat([tra_316_6, tra_513])
print("316 for period 1 + 513 for period 2 "+str(tra_513_316.shape)+" <=== To be used"+"\n")
print("Total test: "+str(tes.shape))
print("test with 316"+str(tes_316.shape))
print("test with 513"+str(tes_513.shape))

train_1 = tra_316_all.copy()
test_1 = tes_316.copy()

train_2 = tra_513_316.copy()
test_2 = tes_513.copy()

y_1 = train_1.visitors

drop_cols=['visitors','air_store_id','visit_date','id']
train_1=train_1.drop(drop_cols, axis=1)
test_1=test_1.drop(drop_cols, axis=1)

y_2 = train_2.visitors

train_2=train_2.drop(drop_cols, axis=1)
test_2=test_2.drop(drop_cols, axis=1)

Total train: (252108, 81)
316 for period 1: (47699, 81)
316 for period 1 & 2: (126700, 81) <=== To be used
513 for period 2 (125408, 81)
316 for period 1 + 513 for period 2 (173107, 81) <=== To be used

Total test: (32019, 81)
test with 316(12207, 81)
test with 513(19812, 81)


In [32]:
tra_store_316_all.shape

(126700, 81)

In [33]:
tra_store_513.shape

(125408, 81)

In [35]:
tes_store_316.shape

(12207, 81)

In [36]:
tes_store_513.shape

(19812, 81)

# Modeling

__Define utility function__

In [6]:
#Define a evaluation function

def rmsle(preds, true):
    rmsle = np.sqrt(mean_squared_error(np.log1p(true), np.log1p(preds)))
    return float(rmsle)

In [7]:
#Define a evaluation matrix 
from sklearn.metrics.scorer import make_scorer
RMSLE = make_scorer(rmsle)

In [8]:
# Define a function for comparing predictions and true data.
def compare_result(preds, true):
    compare = pd.DataFrame({"test_id": true.index,
                           "real_cost": true,
                           "pred_cost": preds})
    compare = compare[["test_id", "real_cost", "pred_cost"]].reset_index(drop=True)
    
    compare["error_percent_(%)"] = np.abs(compare.real_cost - compare.pred_cost) / compare.real_cost * 100
    
    return compare

In [9]:
def cross_validate_xgb(params, x_train, y_train, x_test, kf,  verbose=True, verbose_eval=50, scoreonly=False):
    start_time=time.time()
    nround=[]
    # the prediction matrix need to contains 3 columns, one for the probability of each class
    #train_pred = np.zeros((x_train.shape[0],3))
    train_pred = np.zeros((x_train.shape[0]))
    test_pred = np.zeros((x_test.shape[0]))
    
    # self-defined eval metric
    # f(preds: array, train_data: Dataset) -> name: string, value: array, is_higher_better: bool
    # binary error
    def feval_rmsle(preds, train_data):
        preds = np.expm1(preds)
        true = np.expm1(train_data.get_label())
        #return 'rmsle', rmsle(true, preds), False

        return 'rmsle', rmsle(preds, true), False

    # use the k-fold object to enumerate indexes for each training and validation fold
    for i, (train_index, val_index) in enumerate(kf.split(x_train, y_train)):
        x_train_kf, x_val_kf = x_train.loc[train_index, :], x_train.loc[val_index, :]

        #y_train_kf, y_val_kf = y_train[train_index], y_train[val_index]
        y_train_kf, y_val_kf = np.log1p(y_train[train_index]), np.log1p(y_train[val_index])
        x_test_kf=x_test.copy()
        
        d_train = xgboost.DMatrix(x_train_kf, y_train_kf)
        d_val=xgboost.DMatrix(x_val_kf, y_val_kf)
        d_test = xgboost.DMatrix(x_test_kf)
        
        watchlist= [(d_train, "train"), (d_val, 'val')]
        bst = xgboost.train(params=params, 
                            dtrain=d_train, 
                            num_boost_round=8000, 
                            early_stopping_rounds=100,
                            evals=watchlist, 
                            verbose_eval=verbose_eval)        
        
#        y_val_kf_preds=bst.predict(d_val, ntree_limit=bst.best_ntree_limit)
        y_val_kf_preds=np.expm1(bst.predict(d_val, ntree_limit=bst.best_ntree_limit))
        nround.append(bst.best_ntree_limit)
        
        train_pred[val_index] += y_val_kf_preds
#        test_pred += np.expm1((bst.predict(x_test, ntree_limit=bst.best_ntree_limit)))
        test_pred += np.expm1(bst.predict(d_test))
        
        
        #fold_cv = log_loss(y_val_kf.values, y_val_kf_preds)
        #fold_rmsle = rmsle(np.expm1(train_pred[val_index]),np.expm1(y_val_kf.values))
        fold_rmsle = rmsle(train_pred[val_index],np.expm1(y_val_kf.values))
        fold_cv = fold_rmsle
        
        if verbose:
            print('fold cv {} rmsle score is {:.6f}'.format(i, fold_cv))

    test_pred = test_pred / kf.n_splits
    #cv_score = log_loss(y_train, train_pred)
    #cv_score = rmsle(np.expm1(train_pred), y_train)
    cv_score = rmsle(train_pred, y_train)
    
    if verbose:
        print('cv rmsle score is {:.6f}'.format(cv_score))    
        end_time = time.time()
        print("it takes %.3f seconds to perform cross validation" % (end_time - start_time))
 
    if scoreonly:
        #return cv_score # for the purpose of bayesian optimisation, we only need to return the CV score
        return cv_score
    else:
        return (cv_score,train_pred,test_pred)

In [10]:
# only do 3 fold CV here so that we save some running time on Kaggle Kernel
kf=StratifiedKFold(n_splits=10, shuffle=True, random_state=2018)

# 1. Modeling based on train_1

__Cross validation with xgboost__

In [78]:
xgb_params = {
    "objective" : "reg:linear",
    #"num_class" : 3,
    #"tree_method" : "hist",
    "eval_metric" : "rmse",
    "nthread": 4,
    "seed" : 0,
    'silent': 1,

    "eta":0.05,  # default 0.3
    "max_depth" : 5, # default 6
    "subsample" : 0.8, # default 1
    "colsample_bytree" : 0.6, # default 1
    "gamma": 0.5
}

print('Start training...')

cv_score =cross_validate_xgb(xgb_params, train_1, y_1, test_1, kf, verbose=False, verbose_eval=50, scoreonly=True)

print('cv rmsle score is {:.6f}'.format(cv_score))

Start training...
[0]	train-rmse:2.33018	val-rmse:2.34643
Multiple eval metrics have been passed: 'val-rmse' will be used for early stopping.

Will train until val-rmse hasn't improved in 100 rounds.
[50]	train-rmse:0.531852	val-rmse:0.543664
[100]	train-rmse:0.489632	val-rmse:0.50142
[150]	train-rmse:0.483156	val-rmse:0.495893
[200]	train-rmse:0.47874	val-rmse:0.492569
[250]	train-rmse:0.475566	val-rmse:0.490667
[300]	train-rmse:0.472716	val-rmse:0.489322
[350]	train-rmse:0.470267	val-rmse:0.487797
[400]	train-rmse:0.468569	val-rmse:0.486893
[450]	train-rmse:0.467194	val-rmse:0.486348
[500]	train-rmse:0.465902	val-rmse:0.485863
[550]	train-rmse:0.464913	val-rmse:0.485571
[600]	train-rmse:0.464201	val-rmse:0.485316
[650]	train-rmse:0.463184	val-rmse:0.484829
[700]	train-rmse:0.462553	val-rmse:0.484527
[750]	train-rmse:0.461746	val-rmse:0.484139
[800]	train-rmse:0.460964	val-rmse:0.483947
[850]	train-rmse:0.460234	val-rmse:0.483817
[900]	train-rmse:0.459619	val-rmse:0.483697
[950]	train

__Bayesian Optimsation - Setup__

In [79]:
params={'max_depth':(4,10),
        'learning_rate':(0.05,0.3),
        'subsample': (0.4, 1),
        'colsample_bytree': (0.4, 1),
        'gamma': (0.001, 10.0),
        'min_child_weight': (0, 20),
        'max_delta_step': (0, 10),
        'n_estimators': (10, 25),
        'min_samples_split': (2, 20),
        'max_features': (0.1, 0.999)
       }

In [82]:
# reload(xgb_wrapper)
def xgbcv_func(max_depth, learning_rate, subsample, 
               colsample_bytree, gamma, min_child_weight, 
               max_delta_step, n_estimators, 
               min_samples_split, max_features,nthread=4, seed=0):
    params = {
        "objective" : "reg:linear",
        #"num_class" : 3,
        #"tree_method" : "hist",
        "eval_metric" : "rmse",
        "nthread": nthread,
        "seed" : 0,
        'silent': 1,

        "eta":learning_rate,  # default 0.3
        "max_depth" : int(max_depth), # default 6
        "subsample" : subsample, # default 1
        "colsample_bytree" : colsample_bytree, # default 1

        'gamma': gamma,
        'min_child_weight': min_child_weight,
        'max_delta_step': max_delta_step,
        'n_estimators': n_estimators,
        'min_samples_split': min_samples_split,
        'max_features': max_features    

    }
    
    # for a more ideal out-of-fold model prediction for this dataset, we use 10-fold CV
    kf=StratifiedKFold(n_splits=10, shuffle=True, random_state=2018)
    
    # we will disable all the verbose setting in this functional call, so that we don't have too much information 
    # to read during the bayesian optimisation process.
    return 1-cross_validate_xgb(params, train_1, y_1, test_1, kf, verbose=False, verbose_eval=False, scoreonly=True)

In [83]:
xgb_bo=BayesianOptimization(xgbcv_func, params)

In [84]:
xgb_bo.maximize(init_points=5, n_iter=15)

[31mInitialization[0m
[94m------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   colsample_bytree |     gamma |   learning_rate |   max_delta_step |   max_depth |   max_features |   min_child_weight |   min_samples_split |   n_estimators |   subsample | 
    1 | 27m07s | [35m   0.51771[0m | [32m            0.7050[0m | [32m   6.9771[0m | [32m         0.1239[0m | [32m          9.4892[0m | [32m     6.6818[0m | [32m        0.4071[0m | [32m            5.7733[0m | [32m            13.9950[0m | [32m       20.4022[0m | [32m     0.5829[0m | 
    2 | 14m44s | [35m   0.52164[0m | [32m            0.9858[0m | [32m   3.2518[0m | [32m         0.1789[0m | [32m          4.3073[0m | [32m     7.1484[0m | [32m        0.7845[0m | [32m            6.8698[0m | [32m            10.9129[0

In [85]:
print('-'*30)
print('Maximum value: %f' % xgb_bo.res['max']['max_val'])
print('Best parameters: ', xgb_bo.res['max']['max_params'])

------------------------------
Maximum value: 0.521641
Best parameters:  {'max_depth': 7.1484196343297413, 'learning_rate': 0.17887629868730259, 'subsample': 0.50453440255149851, 'colsample_bytree': 0.98576451058596715, 'gamma': 3.2517796225471769, 'min_child_weight': 6.8697889345516128, 'max_delta_step': 4.3072885672804917, 'n_estimators': 10.364974451950365, 'min_samples_split': 10.91289701587583, 'max_features': 0.78454358066050689}


__Velification__

In [13]:
xgb_params = {

    'max_depth': int(7.1484196343297413), 'learning_rate': 0.17887629868730259, 'subsample': 0.50453440255149851, 'colsample_bytree': 0.98576451058596715, 'gamma': 3.2517796225471769, 'min_child_weight': 6.8697889345516128, 'max_delta_step': 4.3072885672804917, 'n_estimators': 10.364974451950365, 'min_samples_split': 10.91289701587583, 'max_features': 0.78454358066050689
}
print("Starting xgboost...")
outcomes=cross_validate_xgb(xgb_params, train_1, y_1, test_1, kf, verbose_eval=False)

xgb_cv=outcomes[0]
xgb_train_1_pred=outcomes[1]
xgb_test_1_pred=outcomes[2]

xgb_train_1_pred_df=pd.DataFrame(columns=['visitors'], data=xgb_train_1_pred)
xgb_test_1_pred_df=pd.DataFrame(columns=['visitors'], data=xgb_test_1_pred)
print("Finished.")

Starting xgboost...
fold cv 0 rmsle score is 0.480301
fold cv 1 rmsle score is 0.478021
fold cv 2 rmsle score is 0.476746
fold cv 3 rmsle score is 0.478559
fold cv 4 rmsle score is 0.478631
fold cv 5 rmsle score is 0.478513
fold cv 6 rmsle score is 0.482232
fold cv 7 rmsle score is 0.479451
fold cv 8 rmsle score is 0.473449
fold cv 9 rmsle score is 0.477626
cv rmsle score is 0.478359
it takes 930.909 seconds to perform cross validation
Finished.


# 2. Modeling based on train_2

__Cross validation with xgboost__

In [131]:
# only do 3 fold CV here so that we save some running time on Kaggle Kernel
kf=StratifiedKFold(n_splits=3, shuffle=True, random_state=2018)

xgb_params = {
    "objective" : "reg:linear",
    #"num_class" : 3,
    #"tree_method" : "hist",
    "eval_metric" : "rmse",
    "nthread": 4,
    "seed" : 0,
    'silent': 1,

    "eta":0.05,  # default 0.3
    "max_depth" : 5, # default 6
    "subsample" : 0.8, # default 1
    "colsample_bytree" : 0.6, # default 1
    "gamma": 0.5
}

print('Start training...')

cv_score =cross_validate_xgb(xgb_params, train_2, y_2, test_2, kf, verbose=False, verbose_eval=50, scoreonly=True)

print('cv rmsle score is {:.6f}'.format(cv_score))

Start training...
[0]	train-rmse:2.3282	val-rmse:2.32892
Multiple eval metrics have been passed: 'val-rmse' will be used for early stopping.

Will train until val-rmse hasn't improved in 100 rounds.
[50]	train-rmse:0.523935	val-rmse:0.528063
[100]	train-rmse:0.48115	val-rmse:0.487976
[150]	train-rmse:0.474884	val-rmse:0.483455
[200]	train-rmse:0.471059	val-rmse:0.481107
[250]	train-rmse:0.46766	val-rmse:0.479155
[300]	train-rmse:0.465053	val-rmse:0.478059
[350]	train-rmse:0.462606	val-rmse:0.477149
[400]	train-rmse:0.460382	val-rmse:0.476361
[450]	train-rmse:0.458584	val-rmse:0.475764
[500]	train-rmse:0.456891	val-rmse:0.475273
[550]	train-rmse:0.455432	val-rmse:0.474972
[600]	train-rmse:0.4541	val-rmse:0.47464
[650]	train-rmse:0.452775	val-rmse:0.474391
[700]	train-rmse:0.451427	val-rmse:0.474113
[750]	train-rmse:0.450118	val-rmse:0.473927
[800]	train-rmse:0.448668	val-rmse:0.473666
[850]	train-rmse:0.447437	val-rmse:0.4735
[900]	train-rmse:0.446173	val-rmse:0.47339
[950]	train-rmse:0

__Bayesian Optimsation - Setup__

In [132]:
params={'max_depth':(4,10),
        'learning_rate':(0.05,0.3),
        'subsample': (0.4, 1),
        'colsample_bytree': (0.4, 1),
        'gamma': (0.001, 10.0),
        'min_child_weight': (0, 20),
        'max_delta_step': (0, 10),
        'n_estimators': (10, 25),
        'min_samples_split': (2, 20),
        'max_features': (0.1, 0.999)
       }

In [133]:
# reload(xgb_wrapper)
def xgbcv_func(max_depth, learning_rate, subsample, 
               colsample_bytree, gamma, min_child_weight, 
               max_delta_step, n_estimators, 
               min_samples_split, max_features,nthread=4, seed=0):
    params = {
        "objective" : "reg:linear",
        #"num_class" : 3,
        #"tree_method" : "hist",
        "eval_metric" : "rmse",
        "nthread": nthread,
        "seed" : 0,
        'silent': 1,

        "eta":learning_rate,  # default 0.3
        "max_depth" : int(max_depth), # default 6
        "subsample" : subsample, # default 1
        "colsample_bytree" : colsample_bytree, # default 1

        'gamma': gamma,
        'min_child_weight': min_child_weight,
        'max_delta_step': max_delta_step,
        'n_estimators': n_estimators,
        'min_samples_split': min_samples_split,
        'max_features': max_features    

    }
    
    # for a more ideal out-of-fold model prediction for this dataset, we use 10-fold CV
    kf=StratifiedKFold(n_splits=10, shuffle=True, random_state=2018)
    
    # we will disable all the verbose setting in this functional call, so that we don't have too much information 
    # to read during the bayesian optimisation process.
    return 1-cross_validate_xgb(params, train_2, y_2, test_2, kf, verbose=False, verbose_eval=False, scoreonly=True)

In [134]:
xgb_bo=BayesianOptimization(xgbcv_func, params)

In [135]:
xgb_bo.maximize(init_points=5, n_iter=15)

[31mInitialization[0m
[94m------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   colsample_bytree |     gamma |   learning_rate |   max_delta_step |   max_depth |   max_features |   min_child_weight |   min_samples_split |   n_estimators |   subsample | 
    1 | 24m28s | [35m   0.52110[0m | [32m            0.5915[0m | [32m   8.9828[0m | [32m         0.2511[0m | [32m          1.3957[0m | [32m     9.1532[0m | [32m        0.2781[0m | [32m            8.8760[0m | [32m            18.5186[0m | [32m       23.5401[0m | [32m     0.4587[0m | 
    2 | 19m11s | [35m   0.52532[0m | [32m            0.7042[0m | [32m   4.3236[0m | [32m         0.2002[0m | [32m          6.2094[0m | [32m     4.8409[0m | [32m        0.6829[0m | [32m           19.6131[0m | [32m            12.3590[0

In [136]:
print('-'*30)
print('Maximum value: %f' % xgb_bo.res['max']['max_val'])
print('Best parameters: ', xgb_bo.res['max']['max_params'])

------------------------------
Maximum value: 0.529510
Best parameters:  {'max_depth': 6.8960955944176909, 'learning_rate': 0.1067125144540214, 'subsample': 0.93984945674997611, 'colsample_bytree': 0.70093842006487073, 'gamma': 1.3228005298535754, 'min_child_weight': 19.086672690291731, 'max_delta_step': 9.6928478134941152, 'n_estimators': 10.643264156851325, 'min_samples_split': 19.397089382225694, 'max_features': 0.26451327557648185}


__Velification__

In [14]:
kf=StratifiedKFold(n_splits=10, shuffle=True, random_state=2018)

xgb_params = {
    'max_depth': int(6.8960955944176909), 'learning_rate': 0.1067125144540214, 'subsample': 0.93984945674997611, 'colsample_bytree': 0.70093842006487073, 'gamma': 1.3228005298535754, 'min_child_weight': 19.086672690291731, 'max_delta_step': 9.6928478134941152, 'n_estimators': 10.643264156851325, 'min_samples_split': 19.397089382225694, 'max_features': 0.26451327557648185
}
print("Starting xgboost...")
outcomes=cross_validate_xgb(xgb_params, train_2, y_2, test_2, kf, verbose_eval=False)

xgb_cv=outcomes[0]
xgb_train_2_pred=outcomes[1]
xgb_test_2_pred=outcomes[2]

xgb_train_2_pred_df=pd.DataFrame(columns=['visitors'], data=xgb_train_2_pred)
xgb_test_2_pred_df=pd.DataFrame(columns=['visitors'], data=xgb_test_2_pred)
print("Finished.")

Starting xgboost...
fold cv 0 rmsle score is 0.477364
fold cv 1 rmsle score is 0.473759
fold cv 2 rmsle score is 0.470892
fold cv 3 rmsle score is 0.471985
fold cv 4 rmsle score is 0.463737
fold cv 5 rmsle score is 0.467912
fold cv 6 rmsle score is 0.472384
fold cv 7 rmsle score is 0.472876
fold cv 8 rmsle score is 0.465027
fold cv 9 rmsle score is 0.468725
cv rmsle score is 0.470490
it takes 2023.403 seconds to perform cross validation
Finished.


# Saving data

In [15]:
# Total train: (252108, 81)
# 316 for period 1: (47699, 81)
# 316 for period 1 & 2: (126700, 81) <=== To be used
# 513 for period 2 (125408, 81)
# 316 for period 1 + 513 for period 2 (173107, 81) <=== To be used

# Total test: (32019, 81)
# test with 316(12207, 81)
# test with 513(19812, 81)

In [16]:
# trainが正しく整理できていない。

_tra_316 = tra_316_all.drop(["visitors"],axis=1)
#_tra_513 = tra_513_316.drop(["visitors"],axis=1)
_tra_513 = tra_513.drop(["visitors"],axis=1)

_tes_316 = tes_316.drop(["visitors"],axis=1)
_tes_513 = tes_513.drop(["visitors"],axis=1)

# 316 for period 1 & 2: (126700, 81) <=== To be used
lv1_xgb_train_1_pred = xgb_train_1_pred_df.copy()
lv1_xgb_train_1_pred.to_csv('lv1_xgb_train_1_pred.csv', index=False)

lv1_xgb_test_1_pred = xgb_test_1_pred_df.copy()
lv1_xgb_test_1_pred.to_csv('lv1_xgb_test_1_pred.csv', index=False)

_tra_316 = pd.concat([_tra_316,lv1_xgb_train_1_pred], axis=1)
_tes_316 = pd.concat([_tes_316,lv1_xgb_test_1_pred], axis=1)

# 316 for period 1 + 513 for period 2 (173107, 81) <=== To be used
lv1_xgb_train_2_pred = xgb_train_2_pred_df.copy()
lv1_xgb_train_2_pred.to_csv('lv1_xgb_train_2_pred.csv', index=False)

lv1_xgb_test_2_pred = xgb_test_2_pred_df.copy()
lv1_xgb_test_2_pred.to_csv('lv1_xgb_test_2_pred.csv', index=False)

#_tra_513 = pd.concat([_tra_513,lv1_xgb_train_2_pred], axis=1)
_tra_513 = pd.concat([_tra_513,lv1_xgb_train_2_pred], axis=1,join_axes=[_tra_513.index])
_tes_513 = pd.concat([_tes_513,lv1_xgb_test_2_pred], axis=1)

# Merge
xgb_train_pred_df = pd.concat([_tra_316,_tra_513])
xgb_test_pred_df = pd.concat([_tes_316,_tes_513])

lv1_xgb_train_input = xgb_train_pred_df[['id','visitors']].copy()
lv1_xgb_test_input = xgb_test_pred_df[['id','visitors']].copy()

lv1_xgb_train_input = lv1_xgb_train_input.sort_values(by=['id'])
lv1_xgb_test_input = lv1_xgb_test_input.sort_values(by=['id'])

In [43]:
train_origin[train_origin['air_store_id'].isin(id)].shape

(126700, 81)

In [44]:
train_origin[~train_origin['air_store_id'].isin(id)].shape

(125408, 81)

In [95]:
train_origin =  pd.concat([tra_store_316_all,tra_store_513])

In [110]:
train_origin.shape

(252108, 81)

In [111]:
train_origin["pred_visitors"] = 0

In [112]:
train_origin.shape

(252108, 82)

In [132]:
tra_index = train_origin[train_origin['air_store_id'].isin(id)].index

In [138]:
tra_index

Int64Index([     0,      1,      2,      3,      4,      5,      6,      7,
                 8,      9,
            ...
            251864, 251865, 251866, 251867, 251868, 251869, 251870, 251871,
            251872, 251873],
           dtype='int64', length=126700)

In [139]:
train_origin_ = np.zeros((train_origin.shape[0]))

In [141]:
train_origin_.shape

(252108,)

In [142]:
train_origin_[tra_index] = lv1_xgb_train_1_pred

ValueError: shape mismatch: value array of shape (126700,1) could not be broadcast to indexing result of shape (126700,)

In [144]:
train_origin[train_origin['air_store_id'].isin(id)].pred_visitors = lv1_xgb_train_1_pred.values

In [145]:
train_origin.shape

(252108, 82)

In [146]:
train_origin.pred_visitors.value_counts()

0    252108
Name: pred_visitors, dtype: int64

In [98]:
train_origin_513 = pd.concat([tra_store_316_6,tra_store_513])

In [99]:
train_origin_513.shape

(173107, 81)

In [100]:
train_origin_513["pred_visitors"] = lv1_xgb_train_2_pred.values

In [101]:
train_origin_513[~train_origin_513['air_store_id'].isin(id)].pred_visitors.shape

(125408,)

In [104]:
temp_val = train_origin_513[~train_origin_513['air_store_id'].isin(id)].pred_visitors

In [105]:
temp_val.shape

(125408,)

In [106]:
train_origin[~train_origin['air_store_id'].isin(id)]["pred_visitors"] = temp_val.values

In [107]:
train_origin.shape

(252108, 81)

In [276]:
# Remove id 失敗。あとで整理。
lv1_xgb_train_input = lv1_xgb_train_input.drop(["id"],axis=1)

# Submission

In [278]:
lv1_xgb_train_input.to_csv('lv1_xgb_train_pred.csv', index=False)
lv1_xgb_test_input.to_csv('lv1_xgb_test_pred.csv', index=False)

lv1_xgb_test_input.to_csv('submission_rs_recruit_v14_xgb_linear_fe_suzukiry_01.csv', index=False)

print('Good luck :)')

#fe
#xgboost
#Bopt
#LB NOT SUBMITTED

Good luck :)


__Consider weight__

In [187]:
sub1 = sub_.copy()

In [188]:
# from hklee
# https://www.kaggle.com/zeemeen/weighted-mean-comparisons-lb-0-497-1st/code
dfs = { re.search('/([^/\.]*)\.csv', fn).group(1):
    pd.read_csv(fn)for fn in glob.glob('../../../mltestdata/05_recruit/*.csv')}

for k, v in dfs.items(): locals()[k] = v

wkend_holidays = date_info.apply(
    (lambda x:(x.day_of_week=='Sunday' or x.day_of_week=='Saturday') and x.holiday_flg==1), axis=1)
date_info.loc[wkend_holidays, 'holiday_flg'] = 0
date_info['weight'] = ((date_info.index + 1) / len(date_info)) ** 5  

visit_data = air_visit_data.merge(date_info, left_on='visit_date', right_on='calendar_date', how='left')
visit_data.drop('calendar_date', axis=1, inplace=True)
visit_data['visitors'] = visit_data.visitors.map(pd.np.log1p)

wmean = lambda x:( (x.weight * x.visitors).sum() / x.weight.sum() )
visitors = visit_data.groupby(['air_store_id', 'day_of_week', 'holiday_flg']).apply(wmean).reset_index()
visitors.rename(columns={0:'visitors'}, inplace=True) # cumbersome, should be better ways.

sample_submission['air_store_id'] = sample_submission.id.map(lambda x: '_'.join(x.split('_')[:-1]))
sample_submission['calendar_date'] = sample_submission.id.map(lambda x: x.split('_')[2])
sample_submission.drop('visitors', axis=1, inplace=True)
sample_submission = sample_submission.merge(date_info, on='calendar_date', how='left')
sample_submission = sample_submission.merge(visitors, on=[
    'air_store_id', 'day_of_week', 'holiday_flg'], how='left')

missings = sample_submission.visitors.isnull()
sample_submission.loc[missings, 'visitors'] = sample_submission[missings].merge(
    visitors[visitors.holiday_flg==0], on=('air_store_id', 'day_of_week'), 
    how='left')['visitors_y'].values

missings = sample_submission.visitors.isnull()
sample_submission.loc[missings, 'visitors'] = sample_submission[missings].merge(
    visitors[['air_store_id', 'visitors']].groupby('air_store_id').mean().reset_index(), 
    on='air_store_id', how='left')['visitors_y'].values

sample_submission['visitors'] = sample_submission.visitors.map(pd.np.expm1)
sub2 = sample_submission[['id', 'visitors']].copy()
sub_merge = pd.merge(sub1, sub2, on='id', how='inner')

In [189]:
sub_merge['visitors'] = (sub_merge['visitors_x'] + sub_merge['visitors_y']* 1.1)/2
sub_merge[['id', 'visitors']].to_csv('submission_rs_recruit_v14_xgb_linear_fe_suzukiry_02.csv', index=False)

# fe
# xgb: linear
# Bopt
# weight
# train data separation
# LB 0.480