In [2]:
import glob, re
import numpy as np
import pandas as pd
from sklearn import *
import datetime as dt
import gc
import seaborn as sns
import matplotlib.pyplot as plt
import xgboost as xgb
import pickle



In [3]:
train = pd.read_csv("../Input/train_proc2.csv")
test = pd.read_csv("../Input/test_proc2.csv")
submission = pd.read_csv("../Input/sample_submission.csv")
air_store = pd.read_csv("../Input/allstore_info_proc1.csv").rename(columns={'store_id':'air_store_id'})
date_info = pd.read_csv("../Input/date_info.csv").rename(columns={"calendar_date" : 'visit_date'})

In [4]:
train_weather = pd.read_csv("../Input/train_weather_01.csv")
test_weather = pd.read_csv("../Input/test_weather_01.csv")

In [5]:
train_weather.head()

Unnamed: 0,air_store_id,visit_date,station_id,avg_temperature,high_temperature,low_temperature,precipitation,hours_sunlight
0,air_00a91d42b08b08d9,2016-07-01,tokyo__tokyo-kana__tonokyo,25.6,30.6,22.0,,4.4
1,air_00a91d42b08b08d9,2016-07-02,tokyo__tokyo-kana__tonokyo,27.0,31.8,23.4,0.0,6.9
2,air_00a91d42b08b08d9,2016-07-04,tokyo__tokyo-kana__tonokyo,27.8,33.8,23.6,1.5,7.1
3,air_00a91d42b08b08d9,2016-07-05,tokyo__tokyo-kana__tonokyo,21.7,25.6,20.2,0.0,0.0
4,air_00a91d42b08b08d9,2016-07-06,tokyo__tokyo-kana__tonokyo,23.1,26.9,20.5,,4.1


In [6]:
all_weather = pd.concat([train_weather, test_weather])
all_weather['visit_date'] = pd.to_datetime(all_weather['visit_date'])
all_weather.isnull().sum()

air_store_id            0
visit_date              0
station_id              0
avg_temperature     27898
high_temperature    27902
low_temperature     27902
precipitation       89120
hours_sunlight      40087
dtype: int64

In [7]:
submission['visit_date'] = submission['id'].map(lambda x: str(x).split('_')[2])
submission['air_store_id'] = submission['id'].map(lambda x: '_'.join(x.split('_')[:2]))
submission['visit_date'] = pd.to_datetime(submission['visit_date'])
submission['visitors'] = np.nan

In [8]:
train['visit_date'] = pd.to_datetime(train['visit_date'])
test['visit_date'] = pd.to_datetime(test['visit_date'])
test['visitors'] = -1

In [9]:
date_info['visit_date'] = pd.to_datetime(date_info['visit_date'])

In [10]:
date_info['date_seq_id'] = date_info['visit_date'].dt.date - date_info['visit_date'].dt.date.min() + dt.timedelta(1)
date_info['date_seq_id'] = date_info['date_seq_id']/dt.timedelta(1)

In [11]:
date_info['week_seq_id'] = ((date_info['date_seq_id']+4)/7).astype(np.int64)

In [12]:
stores_to_drop = list(set(train['air_store_id']) - set(test['air_store_id']))
train = train.where(~(train['air_store_id'].isin(stores_to_drop)))
train = train.dropna(axis=0,subset=['air_store_id'])

In [13]:
all_data = pd.concat([train,test])

In [14]:
train_len = len(train)

In [15]:
del train; del test;
gc.collect();

In [16]:
all_data = pd.merge(all_data,air_store,how='left',on='air_store_id',)

In [17]:
all_data = pd.merge(all_data,date_info,how='left',on='visit_date')

In [18]:
all_data['visit_month'] = all_data['visit_date'].dt.month
all_data['visit_month'] = all_data['visit_date'].dt.year

In [19]:
# dont know why this would help ??
all_data['diff_max_lat'] = all_data['latitude'].max() - all_data['latitude']
all_data['diff_min_lat'] = all_data['latitude'].min() - all_data['latitude']
all_data['diff_max_long'] = all_data['longitude'].max() - all_data['longitude']
all_data['diff_min_long'] = all_data['longitude'].min() - all_data['longitude']
all_data['lat_plus_long'] = all_data['latitude'] + all_data['longitude']

In [20]:
stores_to_drop = ['air_b2d8bc9c88b85f96',
 'air_cf22e368c1a71d53',
 'air_229d7e508d9f1b5e',
 'air_d0a7bd3339c3d12a',
 'air_cb083b4789a8d3a2',
 'air_2703dcb33192b181',
 'air_0ead98dd07e7a82a',
 'air_d63cfa6d6ab78446']

In [21]:
all_data = all_data.where(~(all_data['air_store_id'].isin(stores_to_drop)))
all_data = all_data.dropna(axis=0,subset=['air_store_id'])

In [22]:
all_data = pd.merge(all_data,all_weather, on=['air_store_id','visit_date'], how='left')

In [23]:
all_data['holiday_flg'] = all_data[['day_of_week','holiday_flg']].apply(lambda x: 1 if x['holiday_flg'] == 1 or 
    x['day_of_week'] == 'Friday' or x['day_of_week'] == 'Saturday' or x['day_of_week'] == 'Sunday' else 0, axis=1)

In [24]:
all_data.shape

(282487, 113)

In [25]:
train = all_data[:train_len]

In [26]:
test = all_data[train_len:]

In [27]:
train.shape

(250468, 113)

In [28]:
test.shape

(32019, 113)

In [79]:
def calc_shifted_ewm(series, alpha, adjust=True):
    return series.shift().ewm(alpha=alpha, adjust=adjust).mean()

In [84]:
x = (all_data.groupby(['air_store_id', 'day_of_week'], as_index=False)
                  #.apply(lambda g: calc_shifted_ewm(g['visitors'], 0.1))
                  #.sort_index(level=['air_store_id', 'visit_date'])
                  )

In [91]:
x.head()

Unnamed: 0,air_store_id,visit_date,visitors,vmax_6weekago_10,vmax_6weekago_11,vmax_6weekago_12,vmax_6weekago_13,vmax_6weekago_14,vmax_6weekago_15,vmax_6weekago_7,...,diff_min_lat,diff_max_long,diff_min_long,lat_plus_long,station_id,avg_temperature,high_temperature,low_temperature,precipitation,hours_sunlight
0,air_00a91d42b08b08d9,2016-07-01,35.0,35.0,35.0,35.0,35.0,35.0,35.0,35.0,...,-2.482035,4.519803,-9.558040,175.447598,tokyo__tokyo-kana__tonokyo,25.6,30.6,22.0,,4.4
1,air_00a91d42b08b08d9,2016-07-02,9.0,35.0,35.0,35.0,35.0,35.0,35.0,35.0,...,-2.482035,4.519803,-9.558040,175.447598,tokyo__tokyo-kana__tonokyo,27.0,31.8,23.4,0.0,6.9
2,air_00a91d42b08b08d9,2016-07-04,20.0,35.0,35.0,35.0,35.0,35.0,35.0,35.0,...,-2.482035,4.519803,-9.558040,175.447598,tokyo__tokyo-kana__tonokyo,27.8,33.8,23.6,1.5,7.1
3,air_00a91d42b08b08d9,2016-07-05,25.0,35.0,35.0,35.0,35.0,35.0,35.0,35.0,...,-2.482035,4.519803,-9.558040,175.447598,tokyo__tokyo-kana__tonokyo,21.7,25.6,20.2,0.0,0.0
4,air_00a91d42b08b08d9,2016-07-06,29.0,35.0,35.0,35.0,35.0,35.0,35.0,35.0,...,-2.482035,4.519803,-9.558040,175.447598,tokyo__tokyo-kana__tonokyo,23.1,26.9,20.5,,4.1
5,air_00a91d42b08b08d9,2016-07-07,34.0,35.0,35.0,35.0,35.0,35.0,35.0,35.0,...,-2.482035,4.519803,-9.558040,175.447598,tokyo__tokyo-kana__tonokyo,27.4,36.7,21.9,,12.8
6,air_00a91d42b08b08d9,2016-07-08,42.0,42.0,42.0,42.0,42.0,42.0,42.0,42.0,...,-2.482035,4.519803,-9.558040,175.447598,tokyo__tokyo-kana__tonokyo,25.3,29.8,22.1,0.0,2.1
7,air_00a91d42b08b08d9,2016-07-09,11.0,42.0,42.0,42.0,42.0,42.0,42.0,42.0,...,-2.482035,4.519803,-9.558040,175.447598,tokyo__tokyo-kana__tonokyo,22.4,24.9,20.3,12.5,0.0
8,air_00a91d42b08b08d9,2016-07-11,25.0,42.0,42.0,42.0,42.0,42.0,42.0,42.0,...,-2.482035,4.519803,-9.558040,175.447598,tokyo__tokyo-kana__tonokyo,28.0,33.5,22.3,,12.5
9,air_00a91d42b08b08d9,2016-07-12,24.0,42.0,42.0,42.0,42.0,42.0,42.0,42.0,...,-2.482035,4.519803,-9.558040,175.447598,tokyo__tokyo-kana__tonokyo,27.1,31.8,24.6,,6.3


In [None]:
del air_store; del date_info; del all_data; del all_weather; gc.collect();

In [None]:
train['visit_date'] = pd.to_datetime(train['visit_date'])

In [None]:
train['visit_date'] = train['visit_date'].dt.date

In [None]:
train.sort_values('visit_date').iloc[-1]['visit_date']-dt.timedelta(days=90) #38days validation set

In [None]:
test = train[train['visit_date'] > dt.date(2016, 12, 23)]

In [None]:
train = train[train['visit_date'] <= dt.date(2016, 12, 23)]

In [None]:
train = train.fillna(-1)
test = test.fillna(-1)

In [None]:
# %load prepareData.py
def prepareData(trainIn, testIn):
    stat1 = trainIn.groupby(["air_store_id","day_of_week"],as_index=False)['visitors'].sum().rename(columns={'visitors':'visitors_sum_store_day'})
    stat2 = trainIn.groupby(["air_store_id","day_of_week"],as_index=False)['visitors'].mean().rename(columns={'visitors':'visitors_mean_store_day'})
        
    stat3 = trainIn.groupby(["area_id","day_of_week"],as_index=False)['visitors'].sum().rename(columns={'visitors':'visitors_sum_area'})
    stat4 = trainIn.groupby(["area_id","day_of_week"],as_index=False)['visitors'].mean().rename(columns={'visitors':'visitors_mean_area'})

    stat5 = trainIn.groupby(["n200mt_cluster_id","day_of_week"],as_index=False)['visitors'].sum().rename(columns={'visitors':'visitors_sum_200mt'})
    stat6 = trainIn.groupby(["n200mt_cluster_id","day_of_week"],as_index=False)['visitors'].mean().rename(columns={'visitors':'visitors_mean_200mt'})

    stat7 = trainIn.groupby(["n400mt_cluster_id","day_of_week"],as_index=False)['visitors'].sum().rename(columns={'visitors':'visitors_sum_400mt'})
    stat8 = trainIn.groupby(["n400mt_cluster_id","day_of_week"],as_index=False)['visitors'].mean().rename(columns={'visitors':'visitors_mean_400mt'})
    
    stat9 = trainIn.groupby(["n1000mt_cluster_id","day_of_week"],as_index=False)['visitors'].sum().rename(columns={'visitors':'visitors_sum_1000mt'})
    stat10 = trainIn.groupby(["n1000mt_cluster_id","day_of_week"],as_index=False)['visitors'].mean().rename(columns={'visitors':'visitors_mean_1000mt'})
    
    stat11 = trainIn.groupby(["n200mt_cluster_id","genre_name"],as_index=False)['visitors'].sum().rename(columns={'visitors':'visitors_sum_genre_200mt'})
    stat12 = trainIn.groupby(["n200mt_cluster_id","genre_name"],as_index=False)['visitors'].mean().rename(columns={'visitors':'visitors_mean_genre_200mt'})
    
    stat13 = trainIn.groupby(["n400mt_cluster_id","genre_name"],as_index=False)['visitors'].sum().rename(columns={'visitors':'visitors_sum_genre_400mt'})
    stat14 = trainIn.groupby(["n400mt_cluster_id","genre_name"],as_index=False)['visitors'].mean().rename(columns={'visitors':'visitors_mean_genre_400mt'})
    
    stat15 = trainIn.groupby(["n1000mt_cluster_id","genre_name"],as_index=False)['visitors'].sum().rename(columns={'visitors':'visitors_sum_genre_1000mt'})
    stat16 = trainIn.groupby(["n1000mt_cluster_id","genre_name"],as_index=False)['visitors'].mean().rename(columns={'visitors':'visitors_mean_genre_1000mt'})
    #####################################
    stat17 = trainIn.groupby(["air_store_id","day_of_week",'week_seq_id'],as_index=False)['visitors'].sum().rename(columns={'visitors':'visitors_sum_store_day'})
    stat18 = trainIn.groupby(["air_store_id","day_of_week",'week_seq_id'],as_index=False)['visitors'].mean().rename(columns={'visitors':'visitors_mean_store_day'})
    stat17 = pd.merge(stat17, stat18, on=['air_store_id','day_of_week','week_seq_id'])
    #del stat18; gc.collect()
    
    stat_6wago_day = stat17.copy()
    stat_6wago_day.loc[:,'week_seq_id'] = np.nan
    stat_6wago_day.loc[:,'week_seq_id'] = stat17.loc[:,'week_seq_id'] + 6
    stat_6wago_day = stat_6wago_day.rename(columns={'visitors_sum_store_day' : 'visitors_sum_6wks_wago'})
    stat_6wago_day = stat_6wago_day.rename(columns={'visitors_mean_store_day' : 'visitors_mean_6wks_wago'})
    
    stat_7wago_day = stat17.copy()
    stat_7wago_day.loc[:,'week_seq_id'] = np.nan
    stat_7wago_day.loc[:,'week_seq_id'] = stat17.loc[:,'week_seq_id'] + 7
    stat_7wago_day = stat_7wago_day.rename(columns={'visitors_sum_store_day' : 'visitors_sum_7wks_wago'})
    stat_7wago_day = stat_7wago_day.rename(columns={'visitors_mean_store_day' : 'visitors_mean_7wks_wago'})
   
    stat_8wago_day = stat17.copy()
    stat_8wago_day.loc[:,'week_seq_id'] = np.nan
    stat_8wago_day.loc[:,'week_seq_id'] = stat17.loc[:,'week_seq_id'] + 8
    stat_8wago_day = stat_8wago_day.rename(columns={'visitors_sum_store_day' : 'visitors_sum_8wks_wago'})
    stat_8wago_day = stat_8wago_day.rename(columns={'visitors_mean_store_day' : 'visitors_mean_8wks_wago'})

    stat_9wago_day = stat17.copy()
    stat_9wago_day.loc[:,'week_seq_id'] = np.nan
    stat_9wago_day.loc[:,'week_seq_id'] = stat17.loc[:,'week_seq_id'] + 9
    stat_9wago_day = stat_9wago_day.rename(columns={'visitors_sum_store_day' : 'visitors_sum_9wks_wago'})
    stat_9wago_day = stat_9wago_day.rename(columns={'visitors_mean_store_day' : 'visitors_mean_9wks_wago'})
    
    stat_10wago_day = stat17.copy()
    stat_10wago_day.loc[:,'week_seq_id'] = np.nan
    stat_10wago_day.loc[:,'week_seq_id'] = stat17.loc[:,'week_seq_id'] + 10
    stat_10wago_day = stat_10wago_day.rename(columns={'visitors_sum_store_day' : 'visitors_sum_10wks_wago'})
    stat_10wago_day = stat_10wago_day.rename(columns={'visitors_mean_store_day' : 'visitors_mean_10wks_wago'})
    
    #################################
    stat18 = trainIn.groupby(["air_store_id","week_seq_id"],as_index=False)['visitors'].sum().rename(columns={'visitors':'visitors_sum_store_week'})
    stat19 = trainIn.groupby(["air_store_id","week_seq_id"],as_index=False)['visitors'].mean().rename(columns={'visitors':'visitors_mean_store_week'})                  
    stat18 = pd.merge(stat18, stat19, on=['air_store_id','week_seq_id'])
    del stat19;
    
    stat_6wago = stat18.copy()
    stat_6wago.loc[:,'week_seq_id'] = np.nan
    stat_6wago.loc[:,'week_seq_id'] = stat18.loc[:,'week_seq_id'] + 6
    stat_6wago = stat_6wago.rename(columns={'visitors_sum_store_week' : 'visitors_sum_store_6wago'})
    stat_6wago = stat_6wago.rename(columns={'visitors_mean_store_week' : 'visitors_mean_store_6wago'})
    
    stat_7wago = stat18.copy()
    stat_7wago.loc[:,'week_seq_id'] = np.nan
    stat_7wago.loc[:,'week_seq_id'] = stat18.loc[:,'week_seq_id'] + 7
    stat_7wago = stat_7wago.rename(columns={'visitors_sum_store_week' : 'visitors_sum_store_7wago'})
    stat_7wago = stat_7wago.rename(columns={'visitors_mean_store_week' : 'visitors_mean_store_8wago'})
    
    stat_8wago = stat18.copy()
    stat_8wago.loc[:,'week_seq_id'] = np.nan
    stat_8wago.loc[:,'week_seq_id'] = stat18.loc[:,'week_seq_id'] + 8
    stat_8wago = stat_8wago.rename(columns={'visitors_sum_store_week' : 'visitors_sum_store_day_6wago'})
    stat_8wago = stat_8wago.rename(columns={'visitors_mean_store_week' : 'visitors_mean_store_day_6wago'}) 
    
    stat_9wago = stat18.copy()
    stat_9wago.loc[:,'week_seq_id'] = np.nan
    stat_9wago.loc[:,'week_seq_id'] = stat18.loc[:,'week_seq_id'] + 9
    stat_9wago = stat_9wago.rename(columns={'visitors_sum_store_week' : 'visitors_sum_store_9wago'})
    stat_9wago = stat_9wago.rename(columns={'visitors_mean_store_week' : 'visitors_mean_store_9wago'})
    
    stat_10wago = stat18.copy()
    stat_10wago.loc[:,'week_seq_id'] = np.nan
    stat_10wago.loc[:,'week_seq_id'] = stat18.loc[:,'week_seq_id'] + 10
    stat_10wago = stat_10wago.rename(columns={'visitors_sum_store_week' : 'visitors_sum_store_10wago'})
    stat_10wago = stat_10wago.rename(columns={'visitors_mean_store_week' : 'visitors_mean_store_10wago'}) 
    ##############################
    
    trainIn = pd.merge(trainIn, stat1, on = ["air_store_id", "day_of_week"], how='left')
    trainIn = pd.merge(trainIn, stat2, on = ["air_store_id", "day_of_week"], how='left')
    trainIn = pd.merge(trainIn, stat3, on = ["area_id", "day_of_week"], how='left')
    trainIn = pd.merge(trainIn, stat4, on = ["area_id", "day_of_week"], how='left')
    
    trainIn = pd.merge(trainIn, stat_6wago_day, on = ["air_store_id", "day_of_week", "week_seq_id"], how='left')
    trainIn = pd.merge(trainIn, stat_7wago_day, on = ["air_store_id", "day_of_week", "week_seq_id"], how='left')
    trainIn = pd.merge(trainIn, stat_8wago_day, on = ["air_store_id", "day_of_week", "week_seq_id"], how='left')
    trainIn = pd.merge(trainIn, stat_9wago_day, on = ["air_store_id", "day_of_week", "week_seq_id"], how='left')
    trainIn = pd.merge(trainIn, stat_10wago_day, on = ["air_store_id", "day_of_week", "week_seq_id"], how='left')
    
    trainIn = pd.merge(trainIn, stat_6wago, on = ["air_store_id", "week_seq_id"], how='left')
    trainIn = pd.merge(trainIn, stat_7wago, on = ["air_store_id", "week_seq_id"], how='left')
    trainIn = pd.merge(trainIn, stat_8wago, on = ["air_store_id", "week_seq_id"], how='left')
    trainIn = pd.merge(trainIn, stat_9wago, on = ["air_store_id", "week_seq_id"], how='left')
    trainIn = pd.merge(trainIn, stat_10wago, on = ["air_store_id", "week_seq_id"], how='left')
    
    trainIn = pd.merge(trainIn, stat5, on = ["n200mt_cluster_id", "day_of_week"], how='left')
    trainIn = pd.merge(trainIn, stat6, on = ["n200mt_cluster_id", "day_of_week"], how='left')
    trainIn = pd.merge(trainIn, stat7, on = ["n400mt_cluster_id", "day_of_week"], how='left')
    trainIn = pd.merge(trainIn, stat8, on = ["n400mt_cluster_id", "day_of_week"], how='left')
    trainIn = pd.merge(trainIn, stat9, on = ["n1000mt_cluster_id", "day_of_week"], how='left')
    trainIn = pd.merge(trainIn, stat10, on = ["n1000mt_cluster_id", "day_of_week"], how='left')
    
    trainIn = pd.merge(trainIn, stat11, on = ["n200mt_cluster_id", "genre_name"], how='left')
    trainIn = pd.merge(trainIn, stat12, on = ["n200mt_cluster_id", "genre_name"], how='left')
    trainIn = pd.merge(trainIn, stat13, on = ["n400mt_cluster_id", "genre_name"], how='left')
    trainIn = pd.merge(trainIn, stat14, on = ["n400mt_cluster_id", "genre_name"], how='left')
    trainIn = pd.merge(trainIn, stat15, on = ["n1000mt_cluster_id", "genre_name"], how='left')
    trainIn = pd.merge(trainIn, stat16, on = ["n1000mt_cluster_id", "genre_name"], how='left')
    
    ############################
    testIn = pd.merge(testIn, stat1, on = ["air_store_id", "day_of_week"], how='left')
    testIn = pd.merge(testIn, stat2, on = ["air_store_id", "day_of_week"], how='left')
    testIn = pd.merge(testIn, stat3, on = ["area_id", "day_of_week"], how='left')
    testIn = pd.merge(testIn, stat4, on = ["area_id", "day_of_week"], how='left')
    
    testIn = pd.merge(testIn, stat_6wago_day, on = ["air_store_id", "day_of_week", "week_seq_id"], how='left')
    testIn = pd.merge(testIn, stat_7wago_day, on = ["air_store_id", "day_of_week", "week_seq_id"], how='left')
    testIn = pd.merge(testIn, stat_8wago_day, on = ["air_store_id", "day_of_week", "week_seq_id"], how='left')
    testIn = pd.merge(testIn, stat_9wago_day, on = ["air_store_id", "day_of_week", "week_seq_id"], how='left')
    testIn = pd.merge(testIn, stat_10wago_day, on = ["air_store_id", "day_of_week", "week_seq_id"], how='left')

    testIn = pd.merge(testIn, stat_6wago, on = ["air_store_id", "week_seq_id"], how='left')
    testIn = pd.merge(testIn, stat_7wago, on = ["air_store_id", "week_seq_id"], how='left')
    testIn = pd.merge(testIn, stat_8wago, on = ["air_store_id", "week_seq_id"], how='left')
    testIn = pd.merge(testIn, stat_9wago, on = ["air_store_id", "week_seq_id"], how='left')
    testIn = pd.merge(testIn, stat_10wago, on = ["air_store_id", "week_seq_id"], how='left')
    
    testIn = pd.merge(testIn, stat5, on = ["n200mt_cluster_id", "day_of_week"], how='left')
    testIn = pd.merge(testIn, stat6, on = ["n200mt_cluster_id", "day_of_week"], how='left')
    testIn = pd.merge(testIn, stat7, on = ["n400mt_cluster_id", "day_of_week"], how='left')
    testIn = pd.merge(testIn, stat8, on = ["n400mt_cluster_id", "day_of_week"], how='left')
    testIn = pd.merge(testIn, stat9, on = ["n1000mt_cluster_id", "day_of_week"], how='left')
    testIn = pd.merge(testIn, stat10, on = ["n1000mt_cluster_id", "day_of_week"], how='left')
    
    testIn = pd.merge(testIn, stat11, on = ["n200mt_cluster_id", "genre_name"], how='left')
    testIn = pd.merge(testIn, stat12, on = ["n200mt_cluster_id", "genre_name"], how='left')
    testIn = pd.merge(testIn, stat13, on = ["n400mt_cluster_id", "genre_name"], how='left')
    testIn = pd.merge(testIn, stat14, on = ["n400mt_cluster_id", "genre_name"], how='left')
    testIn = pd.merge(testIn, stat15, on = ["n1000mt_cluster_id", "genre_name"], how='left')
    testIn = pd.merge(testIn, stat16, on = ["n1000mt_cluster_id", "genre_name"], how='left')
 
    return (trainIn, testIn)

In [None]:
col = [c for c in train if c not in ['id', 'air_store_id', 'visit_date','visitors','prefecture','city']]

In [None]:
params = {}
params['objective'] = 'reg:linear'
params['booster'] = 'gbtree'
params['eval_metric'] = 'rmse'
#params['eta'] = 0.1
params['max_depth'] = 10
params['silent'] = 1
params['subsample'] = 0.8
params['colsample_bytree'] = 0.8
params['tree_method'] = "exact"

#watchlist = [(d_train, 'train'), (d_valid, 'valid')]

In [None]:
#X = train[train['visit_date'] < dt.date(2017, 2, 21)]
#X = train[col].copy()
y = np.log1p(train['visitors']).copy()

In [None]:
X = train.copy()

In [None]:
test = test[col]

In [None]:
K = 10
kf = model_selection.KFold(n_splits = K, shuffle = True)

In [None]:
def RMSLE(y, pred):
    return metrics.mean_squared_error(y, pred)**0.5

In [None]:
y_test_pred = 0
#K-Fold Validation for xgboost
for i, (train_index, test_index) in enumerate(kf.split(X)):
    train, val = prepareData(X.iloc[train_index, :].copy(), X.iloc[test_index, :].copy())
    # Create data for this fold
    #y_train, y_valid = y.iloc[train_index].copy(), y.iloc[test_index]
    #X_train, X_valid = .iloc[train_index, :].copy(), X.iloc[test_index, :].copy()
    
    X_train, y_train = train[col], np.log1p(train['visitors'])
    X_valid, y_valid = val[col], np.log1p(val['visitors'])
    print("\nFold ", i)
    
    d_train = xgb.DMatrix(X_train[col],y_train ,missing = np.nan,)
    d_valid = xgb.DMatrix(X_valid[col], y_valid, missing = np.nan)
    watchlist = [(d_train, 'train'), (d_valid, 'valid')]

    model = (xgb.train(params,d_train,num_boost_round=5000,evals=watchlist,
                   early_stopping_rounds=1000,verbose_eval=100,))
    pred = model.predict(xgb.DMatrix(X_valid))
    print('RMSLE XGB Regressor, validation set, fold ', i, ': ', RMSLE(y_valid, pred))

    pred = model.predict(xgb.DMatrix(test[col]))
    print('Prediction length on test set, XGB Regressor, fold ', i, ': ', len(pred))
    y_test_pred += pred

    del X_train, X_valid, y_train, y_valid

In [None]:
y_test_pred /= (K)

In [None]:
print('RMSLE XGB Regressor, full validtion, fold  ' + str(RMSLE(np.log1p(test['visitors']).values, y_test_pred)))