In [1]:
import glob, re
import numpy as np
import pandas as pd
from sklearn import *
import datetime as dt
import gc
import seaborn as sns
import matplotlib.pyplot as plt
import lightgbm as lgb
from keras.models import Sequential
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers import Dense, Dropout, PReLU, BatchNormalization
from keras.optimizers import Adam

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
train = pd.read_csv("../input/train_proc5.csv")
test = pd.read_csv("../input/test_proc5.csv")
submission = pd.read_csv("../input/sample_submission.csv")
air_store = pd.read_csv("../input/allstore_info_proc1.csv").rename(columns={'store_id':'air_store_id'})
date_info = pd.read_csv("../input/date_info.csv").rename(columns={"calendar_date" : 'visit_date'})
train_weather = pd.read_csv("../input/train_weather_01.csv")
test_weather = pd.read_csv("../input/test_weather_01.csv")

In [3]:
submission['visit_date'] = submission['id'].map(lambda x: str(x).split('_')[2])
submission['air_store_id'] = submission['id'].map(lambda x: '_'.join(x.split('_')[:2]))
submission['visit_date'] = pd.to_datetime(submission['visit_date'])
submission['visitors'] = np.nan

In [4]:
all_weather = pd.concat([train_weather, test_weather])
all_weather['visit_date'] = pd.to_datetime(all_weather['visit_date'])

In [5]:
train['visit_date'] = pd.to_datetime(train['visit_date'])
test['visit_date'] = pd.to_datetime(test['visit_date'])
#test['visitors'] = -1
test['visitors'] = np.nan

In [6]:
date_info['visit_date'] = pd.to_datetime(date_info['visit_date'])

In [7]:
date_info['date_seq_id'] = date_info['visit_date'].dt.date - date_info['visit_date'].dt.date.min() + dt.timedelta(1)
date_info['date_seq_id'] = date_info['date_seq_id']/dt.timedelta(1)

In [8]:
date_info['week_seq_id'] = ((date_info['date_seq_id']+4)/7).astype(np.int64)

In [9]:
stores_to_drop = list(set(train['air_store_id']) - set(test['air_store_id']))
train = train.where(~(train['air_store_id'].isin(stores_to_drop)))
train = train.dropna(axis=0,subset=['air_store_id'])

In [10]:
all_data = pd.concat([train,test])

In [11]:
train_len = len(train)

In [12]:
del train; del test;
gc.collect();

In [13]:
all_data = pd.merge(all_data,air_store,how='left',on='air_store_id',)

In [14]:
all_data = pd.merge(all_data,date_info,how='left',on='visit_date')

In [15]:
all_data['visit_month'] = all_data['visit_date'].dt.month
all_data['visit_year'] = all_data['visit_date'].dt.year

In [16]:
all_data['diff_max_lat'] = all_data['latitude'].max() - all_data['latitude']
all_data['diff_min_lat'] = all_data['latitude'].min() - all_data['latitude']
all_data['diff_max_long'] = all_data['longitude'].max() - all_data['longitude']
all_data['diff_min_long'] = all_data['longitude'].min() - all_data['longitude']
all_data['lat_plus_long'] = all_data['latitude'] + all_data['longitude']

In [17]:
stores_to_drop = ['air_b2d8bc9c88b85f96',
 'air_cf22e368c1a71d53',
 'air_229d7e508d9f1b5e',
 'air_d0a7bd3339c3d12a',
 'air_cb083b4789a8d3a2',
 'air_2703dcb33192b181',
 'air_0ead98dd07e7a82a',
 'air_d63cfa6d6ab78446']

In [18]:
all_data = all_data.where(~(all_data['air_store_id'].isin(stores_to_drop)))
all_data = all_data.dropna(axis=0,subset=['air_store_id'])

In [19]:
all_data = pd.merge(all_data,all_weather, on=['air_store_id','visit_date'], how='left')

In [20]:
all_data.shape

(282487, 73)

In [21]:
del air_store;del date_info;gc.collect();

In [22]:
all_data['visit_date'] = pd.to_datetime(all_data['visit_date'])

In [23]:
#train.sort_values('visit_date').iloc[-1]['visit_date']-dt.timedelta(days=90) #38days validation set
print(all_data['visit_date'].min())
print(all_data['visit_date'].max())

2016-01-01 00:00:00
2017-05-31 00:00:00


In [24]:
#all_data = all_data.sort_values(['air_store_id','visit_date'])

In [25]:
train = all_data[:train_len]
test = all_data[train_len:]

In [26]:
train.shape

(250468, 73)

In [27]:
train = train.reset_index().drop('index',axis=1)
test = test.reset_index().drop('index',axis=1)

In [28]:
test.shape

(32019, 73)

In [29]:
dt.date(2017, 4, 22) + dt.timedelta(days=39)

datetime.date(2017, 5, 31)

In [30]:
cols_to_encode = ([i for i,j in zip(all_data.dtypes.index,all_data.dtypes.values) if j == 'object'])
cols_to_encode.remove('air_store_id')
cols_to_encode.remove('visit_date')
#lbl = preprocessing.LabelEncoder()
for i in cols_to_encode:
    #all_data[i] = lbl.fit_transform(all_data[i])
    print(pd.get_dummies(all_data[i]).columns, "\n\n")
    all_data = pd.concat([all_data, pd.get_dummies(all_data[i])],axis=1)
    if i not in ["day_of_week", 'genre_name']:
        all_data.drop(i, axis=1, inplace=True)

Index(['Asian', 'Bar/Cocktail', 'Cafe/Sweets', 'Creative cuisine',
       'Dining bar', 'International cuisine', 'Italian/French', 'Izakaya',
       'Japanese food', 'Karaoke/Party', 'Okonomiyaki/Monja/Teppanyaki',
       'Other', 'Western food', 'Yakiniku/Korean food'],
      dtype='object') 


Index(['Fukuoka-ken Fukuoka-shi Daimyō',
       'Fukuoka-ken Fukuoka-shi Hakata Ekimae',
       'Fukuoka-ken Fukuoka-shi Imaizumi', 'Fukuoka-ken Fukuoka-shi Momochi',
       'Fukuoka-ken Fukuoka-shi Shiobaru', 'Fukuoka-ken Fukuoka-shi Takatori',
       'Fukuoka-ken Fukuoka-shi Tenjin', 'Fukuoka-ken Fukuoka-shi Torikai',
       'Fukuoka-ken Itoshima-shi Maebarunishi',
       'Fukuoka-ken Kitakyūshū-shi Konyamachi',
       ...
       'Ōsaka-fu Sakai-shi Minamikawaramachi', 'Ōsaka-fu Suita-shi Izumichō',
       'Ōsaka-fu Ōsaka-shi Fuminosato', 'Ōsaka-fu Ōsaka-shi Kyōmachibori',
       'Ōsaka-fu Ōsaka-shi Kyūtarōmachi', 'Ōsaka-fu Ōsaka-shi Nakanochō',
       'Ōsaka-fu Ōsaka-shi Nanbasennichimae', '

In [31]:
print(cols_to_encode)

['genre_name', 'area_name', 'prefecture', 'city', 'day_of_week', 'station_id']


In [32]:
def calc_shifted_ewm(series, alpha, adjust=True, days=0): #
    return series.shift(periods=days).ewm(alpha=alpha, adjust=adjust).mean()

In [33]:
tmp = (train.groupby(['air_store_id', 'day_of_week'])
                  .apply(lambda g: calc_shifted_ewm(g['visitors'], 0.1, days=39))
                  .reset_index().set_index('level_2').sort_index()
                  .rename(columns={'visitors' : 'ewm'})
                  .drop(['air_store_id','day_of_week'], axis=1))

In [34]:
tmp = pd.concat([train, tmp], axis=1)[['air_store_id', 'visit_date', 'ewm']]

In [35]:
tmp.head()

Unnamed: 0,air_store_id,visit_date,ewm
0,air_00a91d42b08b08d9,2016-07-01,
1,air_00a91d42b08b08d9,2016-07-02,
2,air_00a91d42b08b08d9,2016-07-04,
3,air_00a91d42b08b08d9,2016-07-05,
4,air_00a91d42b08b08d9,2016-07-06,


In [36]:
all_data['visit_date'] = pd.to_datetime(all_data['visit_date'])
tmp['visit_date'] = pd.to_datetime(tmp['visit_date'])

In [37]:
#all_data['visit_date'] = all_data['visit_date'].dt.date
#tmp['visit_date'] = tmp['visit_date'].dt.date

In [38]:
train = train.reset_index().drop('index',axis=1)
test = test.reset_index().drop('index',axis=1)

In [39]:
print("Train min: " + str(train['visit_date'].min()))
print("Train max:" + str(train['visit_date'].max()))
print("Test min: " + str(test['visit_date'].min()))
print("Test max:" + str(test['visit_date'].max()))
print("tmp min: " + str(tmp['visit_date'].min()))
print("tmp max:" + str(tmp['visit_date'].max()))
print("Difference: " + str(test['visit_date'].max() - train['visit_date'].max()))

Train min: 2016-01-01 00:00:00
Train max:2017-04-22 00:00:00
Test min: 2017-04-23 00:00:00
Test max:2017-05-31 00:00:00
tmp min: 2016-01-01 00:00:00
tmp max:2017-04-22 00:00:00
Difference: 39 days 00:00:00


In [40]:
tmp['visit_date'] += dt.timedelta(days=39)

In [41]:
all_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 282487 entries, 0 to 282486
Columns: 295 entries, air_store_id to yamaguchi__shimonoseki-kana__shimonoseki
dtypes: datetime64[ns](1), float64(53), int64(12), object(3), uint8(226)
memory usage: 211.7+ MB


In [42]:
tmp.shape

(250468, 3)

In [43]:
#all_data = pd.merge(all_data, tmp, on=['air_store_id', 'visit_date'], how='left')

In [44]:
train = all_data[:train_len]
test = all_data[train_len:]

In [45]:
train = train.reset_index().drop('index',axis=1)
test = test.reset_index().drop('index',axis=1)

In [46]:
all_data['visit_date'] = pd.to_datetime(all_data['visit_date'])
#all_data['visit_date'] = all_data['visit_date'].dt.date

In [47]:
del all_data; gc.collect();

In [48]:
test.isnull().sum()[test.isnull().sum()>0]

diff_8_7                 13
diff_9_8                  6
eight_weeks_ago        4377
eleven_weeks_ago       4589
nine_weeks_ago         4439
priorMax               1632
priorMean              1632
priorMin               1632
seven_weeks_ago        4395
six_weeks_ago          4349
ten_weeks_ago          4545
visitors              32019
vmax_6weekago_7          13
vmax_6weekago_8           6
vmean_6weekago_7         13
vmean_6weekago_8          6
vmedian_6weekago_7       13
vmedian_6weekago_8        6
vmin_6weekago_7          13
vmin_6weekago_8           6
avg_temperature        3276
high_temperature       3276
low_temperature        3276
precipitation         12730
hours_sunlight         4641
dtype: int64

In [49]:
train.columns

Index(['air_store_id', 'diff_10_9', 'diff_11_10', 'diff_12_11', 'diff_8_7',
       'diff_9_8', 'eight_weeks_ago', 'eleven_weeks_ago', 'nine_weeks_ago',
       'priorMax',
       ...
       'shizuoka__hamamatsu-kana__hamamatsu',
       'shizuoka__mishima-kana__mishima', 'shizuoka__shizuoka-kana__shizuoka',
       'tokyo__edogawa-seaside-kana__edgawawinkai', 'tokyo__fuchu-kana__fuku',
       'tokyo__haneda-kana__haneda', 'tokyo__nerima-kana__nerima',
       'tokyo__setagaya-kana__setagaya', 'tokyo__tokyo-kana__tonokyo',
       'yamaguchi__shimonoseki-kana__shimonoseki'],
      dtype='object', length=295)

In [50]:
test.columns

Index(['air_store_id', 'diff_10_9', 'diff_11_10', 'diff_12_11', 'diff_8_7',
       'diff_9_8', 'eight_weeks_ago', 'eleven_weeks_ago', 'nine_weeks_ago',
       'priorMax',
       ...
       'shizuoka__hamamatsu-kana__hamamatsu',
       'shizuoka__mishima-kana__mishima', 'shizuoka__shizuoka-kana__shizuoka',
       'tokyo__edogawa-seaside-kana__edgawawinkai', 'tokyo__fuchu-kana__fuku',
       'tokyo__haneda-kana__haneda', 'tokyo__nerima-kana__nerima',
       'tokyo__setagaya-kana__setagaya', 'tokyo__tokyo-kana__tonokyo',
       'yamaguchi__shimonoseki-kana__shimonoseki'],
      dtype='object', length=295)

In [51]:
#train = train.fillna(0) ##0.5171185851031399
#test = test.fillna(0)
train = train.fillna(np.nan) ##0.5185937979429444
test = test.fillna(np.nan)

In [52]:
# %load prepareData.py
def prepareData(trainIn, valIn, testIn = None):
    stat1 = trainIn.groupby(["air_store_id","day_of_week"],as_index=False)['visitors'].sum().rename(columns={'visitors':'visitors_sum_store_day'})
    stat2 = trainIn.groupby(["air_store_id","day_of_week"],as_index=False)['visitors'].mean().rename(columns={'visitors':'visitors_mean_store_day'})
        
    stat3 = trainIn.groupby(["area_id","day_of_week"],as_index=False)['visitors'].sum().rename(columns={'visitors':'visitors_sum_area'})
    stat4 = trainIn.groupby(["area_id","day_of_week"],as_index=False)['visitors'].mean().rename(columns={'visitors':'visitors_mean_area'})

    stat5 = trainIn.groupby(["n200mt_cluster_id","day_of_week"],as_index=False)['visitors'].sum().rename(columns={'visitors':'visitors_sum_200mt'})
    stat6 = trainIn.groupby(["n200mt_cluster_id","day_of_week"],as_index=False)['visitors'].mean().rename(columns={'visitors':'visitors_mean_200mt'})

    stat7 = trainIn.groupby(["n400mt_cluster_id","day_of_week"],as_index=False)['visitors'].sum().rename(columns={'visitors':'visitors_sum_400mt'})
    stat8 = trainIn.groupby(["n400mt_cluster_id","day_of_week"],as_index=False)['visitors'].mean().rename(columns={'visitors':'visitors_mean_400mt'})
    
    stat9 = trainIn.groupby(["n1000mt_cluster_id","day_of_week"],as_index=False)['visitors'].sum().rename(columns={'visitors':'visitors_sum_1000mt'})
    stat10 = trainIn.groupby(["n1000mt_cluster_id","day_of_week"],as_index=False)['visitors'].mean().rename(columns={'visitors':'visitors_mean_1000mt'})
    
    stat11 = trainIn.groupby(["n200mt_cluster_id","genre_name"],as_index=False)['visitors'].sum().rename(columns={'visitors':'visitors_sum_genre_200mt'})
    stat12 = trainIn.groupby(["n200mt_cluster_id","genre_name"],as_index=False)['visitors'].mean().rename(columns={'visitors':'visitors_mean_genre_200mt'})
    
    stat13 = trainIn.groupby(["n400mt_cluster_id","genre_name"],as_index=False)['visitors'].sum().rename(columns={'visitors':'visitors_sum_genre_400mt'})
    stat14 = trainIn.groupby(["n400mt_cluster_id","genre_name"],as_index=False)['visitors'].mean().rename(columns={'visitors':'visitors_mean_genre_400mt'})
    
    stat15 = trainIn.groupby(["n1000mt_cluster_id","genre_name"],as_index=False)['visitors'].sum().rename(columns={'visitors':'visitors_sum_genre_1000mt'})
    stat16 = trainIn.groupby(["n1000mt_cluster_id","genre_name"],as_index=False)['visitors'].mean().rename(columns={'visitors':'visitors_mean_genre_1000mt'})
    
    #####################################
    stat17 = trainIn.groupby(["air_store_id","day_of_week",'week_seq_id'],as_index=False)['visitors'].sum().rename(columns={'visitors':'visitors_sum_store_day'})
    #stat18 = trainIn.groupby(["air_store_id","day_of_week",'week_seq_id'],as_index=False)['visitors'].mean().rename(columns={'visitors':'visitors_mean_store_day'})
    #stat17 = pd.merge(stat17, stat18, on=['air_store_id','day_of_week','week_seq_id'])
    #del stat18; gc.collect()
    
    stat_6wago_day = stat17.copy()
    stat_6wago_day.loc[:,'week_seq_id'] = np.nan
    stat_6wago_day.loc[:,'week_seq_id'] = stat17.loc[:,'week_seq_id'] + 6
    stat_6wago_day = stat_6wago_day.rename(columns={'visitors_sum_store_day' : 'visitors_6wks_wago'})
    
    stat_7wago_day = stat17.copy()
    stat_7wago_day.loc[:,'week_seq_id'] = np.nan
    stat_7wago_day.loc[:,'week_seq_id'] = stat17.loc[:,'week_seq_id'] + 7
    stat_7wago_day = stat_7wago_day.rename(columns={'visitors_sum_store_day' : 'visitors_7wks_wago'})
   
    stat_8wago_day = stat17.copy()
    stat_8wago_day.loc[:,'week_seq_id'] = np.nan
    stat_8wago_day.loc[:,'week_seq_id'] = stat17.loc[:,'week_seq_id'] + 8
    stat_8wago_day = stat_8wago_day.rename(columns={'visitors_sum_store_day' : 'visitors_8wks_wago'})

    stat_9wago_day = stat17.copy()
    stat_9wago_day.loc[:,'week_seq_id'] = np.nan
    stat_9wago_day.loc[:,'week_seq_id'] = stat17.loc[:,'week_seq_id'] + 9
    stat_9wago_day = stat_9wago_day.rename(columns={'visitors_sum_store_day' : 'visitors_9wks_wago'})
    
    stat_10wago_day = stat17.copy()
    stat_10wago_day.loc[:,'week_seq_id'] = np.nan
    stat_10wago_day.loc[:,'week_seq_id'] = stat17.loc[:,'week_seq_id'] + 10
    stat_10wago_day = stat_10wago_day.rename(columns={'visitors_sum_store_day' : 'visitors_10wks_wago'})
    
    #################################
    stat18 = trainIn.groupby(["air_store_id","week_seq_id"],as_index=False)['visitors'].sum().rename(columns={'visitors':'visitors_sum_store_week'})
    stat19 = trainIn.groupby(["air_store_id","week_seq_id"],as_index=False)['visitors'].mean().rename(columns={'visitors':'visitors_mean_store_week'})                  
    stat18 = pd.merge(stat18, stat19, on=['air_store_id','week_seq_id'])
    del stat19;
    
    stat_6wago = stat18.copy()
    stat_6wago.loc[:,'week_seq_id'] = np.nan
    stat_6wago.loc[:,'week_seq_id'] = stat18.loc[:,'week_seq_id'] + 6
    stat_6wago = stat_6wago.rename(columns={'visitors_sum_store_week' : 'visitors_sum_store_6wago'})
    stat_6wago = stat_6wago.rename(columns={'visitors_mean_store_week' : 'visitors_mean_store_6wago'})
    
    stat_7wago = stat18.copy()
    stat_7wago.loc[:,'week_seq_id'] = np.nan
    stat_7wago.loc[:,'week_seq_id'] = stat18.loc[:,'week_seq_id'] + 7
    stat_7wago = stat_7wago.rename(columns={'visitors_sum_store_week' : 'visitors_sum_store_7wago'})
    stat_7wago = stat_7wago.rename(columns={'visitors_mean_store_week' : 'visitors_mean_store_8wago'})
    
    stat_8wago = stat18.copy()
    stat_8wago.loc[:,'week_seq_id'] = np.nan
    stat_8wago.loc[:,'week_seq_id'] = stat18.loc[:,'week_seq_id'] + 8
    stat_8wago = stat_8wago.rename(columns={'visitors_sum_store_week' : 'visitors_sum_store_day_6wago'})
    stat_8wago = stat_8wago.rename(columns={'visitors_mean_store_week' : 'visitors_mean_store_day_6wago'}) 
    
    stat_9wago = stat18.copy()
    stat_9wago.loc[:,'week_seq_id'] = np.nan
    stat_9wago.loc[:,'week_seq_id'] = stat18.loc[:,'week_seq_id'] + 9
    stat_9wago = stat_9wago.rename(columns={'visitors_sum_store_week' : 'visitors_sum_store_9wago'})
    stat_9wago = stat_9wago.rename(columns={'visitors_mean_store_week' : 'visitors_mean_store_9wago'})
    
    stat_10wago = stat18.copy()
    stat_10wago.loc[:,'week_seq_id'] = np.nan
    stat_10wago.loc[:,'week_seq_id'] = stat18.loc[:,'week_seq_id'] + 10
    stat_10wago = stat_10wago.rename(columns={'visitors_sum_store_week' : 'visitors_sum_store_10wago'})
    stat_10wago = stat_10wago.rename(columns={'visitors_mean_store_week' : 'visitors_mean_store_10wago'}) 
    
    ##############################
    trainIn = pd.merge(trainIn, stat1, on = ["air_store_id", "day_of_week"], how='left')
    trainIn = pd.merge(trainIn, stat2, on = ["air_store_id", "day_of_week"], how='left')
    trainIn = pd.merge(trainIn, stat3, on = ["area_id", "day_of_week"], how='left')
    trainIn = pd.merge(trainIn, stat4, on = ["area_id", "day_of_week"], how='left')
    
    trainIn = pd.merge(trainIn, stat_6wago_day, on = ["air_store_id", "day_of_week", "week_seq_id"], how='left')
    trainIn = pd.merge(trainIn, stat_7wago_day, on = ["air_store_id", "day_of_week", "week_seq_id"], how='left')
    trainIn = pd.merge(trainIn, stat_8wago_day, on = ["air_store_id", "day_of_week", "week_seq_id"], how='left')
    trainIn = pd.merge(trainIn, stat_9wago_day, on = ["air_store_id", "day_of_week", "week_seq_id"], how='left')
    trainIn = pd.merge(trainIn, stat_10wago_day, on = ["air_store_id", "day_of_week", "week_seq_id"], how='left')
    
    trainIn = pd.merge(trainIn, stat_6wago, on = ["air_store_id", "week_seq_id"], how='left')
    trainIn = pd.merge(trainIn, stat_7wago, on = ["air_store_id", "week_seq_id"], how='left')
    trainIn = pd.merge(trainIn, stat_8wago, on = ["air_store_id", "week_seq_id"], how='left')
    trainIn = pd.merge(trainIn, stat_9wago, on = ["air_store_id", "week_seq_id"], how='left')
    trainIn = pd.merge(trainIn, stat_10wago, on = ["air_store_id", "week_seq_id"], how='left')
    
    trainIn = pd.merge(trainIn, stat5, on = ["n200mt_cluster_id", "day_of_week"], how='left')
    trainIn = pd.merge(trainIn, stat6, on = ["n200mt_cluster_id", "day_of_week"], how='left')
    trainIn = pd.merge(trainIn, stat7, on = ["n400mt_cluster_id", "day_of_week"], how='left')
    trainIn = pd.merge(trainIn, stat8, on = ["n400mt_cluster_id", "day_of_week"], how='left')
    trainIn = pd.merge(trainIn, stat9, on = ["n1000mt_cluster_id", "day_of_week"], how='left')
    trainIn = pd.merge(trainIn, stat10, on = ["n1000mt_cluster_id", "day_of_week"], how='left')
    
    trainIn = pd.merge(trainIn, stat11, on = ["n200mt_cluster_id", "genre_name"], how='left')
    trainIn = pd.merge(trainIn, stat12, on = ["n200mt_cluster_id", "genre_name"], how='left')
    trainIn = pd.merge(trainIn, stat13, on = ["n400mt_cluster_id", "genre_name"], how='left')
    trainIn = pd.merge(trainIn, stat14, on = ["n400mt_cluster_id", "genre_name"], how='left')
    trainIn = pd.merge(trainIn, stat15, on = ["n1000mt_cluster_id", "genre_name"], how='left')
    trainIn = pd.merge(trainIn, stat16, on = ["n1000mt_cluster_id", "genre_name"], how='left')
    
    ############################
    testIn = pd.merge(testIn, stat1, on = ["air_store_id", "day_of_week"], how='left')
    testIn = pd.merge(testIn, stat2, on = ["air_store_id", "day_of_week"], how='left')
    testIn = pd.merge(testIn, stat3, on = ["area_id", "day_of_week"], how='left')
    testIn = pd.merge(testIn, stat4, on = ["area_id", "day_of_week"], how='left')

    testIn = pd.merge(testIn, stat_6wago_day, on = ["air_store_id", "day_of_week", "week_seq_id"], how='left')
    testIn = pd.merge(testIn, stat_7wago_day, on = ["air_store_id", "day_of_week", "week_seq_id"], how='left')
    testIn = pd.merge(testIn, stat_8wago_day, on = ["air_store_id", "day_of_week", "week_seq_id"], how='left')
    testIn = pd.merge(testIn, stat_9wago_day, on = ["air_store_id", "day_of_week", "week_seq_id"], how='left')
    testIn = pd.merge(testIn, stat_10wago_day, on = ["air_store_id", "day_of_week", "week_seq_id"], how='left')

    testIn = pd.merge(testIn, stat_6wago, on = ["air_store_id", "week_seq_id"], how='left')
    testIn = pd.merge(testIn, stat_7wago, on = ["air_store_id", "week_seq_id"], how='left')
    testIn = pd.merge(testIn, stat_8wago, on = ["air_store_id", "week_seq_id"], how='left')
    testIn = pd.merge(testIn, stat_9wago, on = ["air_store_id", "week_seq_id"], how='left')
    testIn = pd.merge(testIn, stat_10wago, on = ["air_store_id", "week_seq_id"], how='left')

    testIn = pd.merge(testIn, stat5, on = ["n200mt_cluster_id", "day_of_week"], how='left')
    testIn = pd.merge(testIn, stat6, on = ["n200mt_cluster_id", "day_of_week"], how='left')
    testIn = pd.merge(testIn, stat7, on = ["n400mt_cluster_id", "day_of_week"], how='left')
    testIn = pd.merge(testIn, stat8, on = ["n400mt_cluster_id", "day_of_week"], how='left')
    testIn = pd.merge(testIn, stat9, on = ["n1000mt_cluster_id", "day_of_week"], how='left')
    testIn = pd.merge(testIn, stat10, on = ["n1000mt_cluster_id", "day_of_week"], how='left')

    testIn = pd.merge(testIn, stat11, on = ["n200mt_cluster_id", "genre_name"], how='left')
    testIn = pd.merge(testIn, stat12, on = ["n200mt_cluster_id", "genre_name"], how='left')
    testIn = pd.merge(testIn, stat13, on = ["n400mt_cluster_id", "genre_name"], how='left')
    testIn = pd.merge(testIn, stat14, on = ["n400mt_cluster_id", "genre_name"], how='left')
    testIn = pd.merge(testIn, stat15, on = ["n1000mt_cluster_id", "genre_name"], how='left')
    testIn = pd.merge(testIn, stat16, on = ["n1000mt_cluster_id", "genre_name"], how='left')
    
    ##################################
    valIn = pd.merge(valIn, stat1, on = ["air_store_id", "day_of_week"], how='left')
    valIn = pd.merge(valIn, stat2, on = ["air_store_id", "day_of_week"], how='left')
    valIn = pd.merge(valIn, stat3, on = ["area_id", "day_of_week"], how='left')
    valIn = pd.merge(valIn, stat4, on = ["area_id", "day_of_week"], how='left')
    
    valIn = pd.merge(valIn, stat_6wago_day, on = ["air_store_id", "day_of_week", "week_seq_id"], how='left')
    valIn = pd.merge(valIn, stat_7wago_day, on = ["air_store_id", "day_of_week", "week_seq_id"], how='left')
    valIn = pd.merge(valIn, stat_8wago_day, on = ["air_store_id", "day_of_week", "week_seq_id"], how='left')
    valIn = pd.merge(valIn, stat_9wago_day, on = ["air_store_id", "day_of_week", "week_seq_id"], how='left')
    valIn = pd.merge(valIn, stat_10wago_day, on = ["air_store_id", "day_of_week", "week_seq_id"], how='left')

    valIn = pd.merge(valIn, stat_6wago, on = ["air_store_id", "week_seq_id"], how='left')
    valIn = pd.merge(valIn, stat_7wago, on = ["air_store_id", "week_seq_id"], how='left')
    valIn = pd.merge(valIn, stat_8wago, on = ["air_store_id", "week_seq_id"], how='left')
    valIn = pd.merge(valIn, stat_9wago, on = ["air_store_id", "week_seq_id"], how='left')
    valIn = pd.merge(valIn, stat_10wago, on = ["air_store_id", "week_seq_id"], how='left')
    
    valIn = pd.merge(valIn, stat5, on = ["n200mt_cluster_id", "day_of_week"], how='left')
    valIn = pd.merge(valIn, stat6, on = ["n200mt_cluster_id", "day_of_week"], how='left')
    valIn = pd.merge(valIn, stat7, on = ["n400mt_cluster_id", "day_of_week"], how='left')
    valIn = pd.merge(valIn, stat8, on = ["n400mt_cluster_id", "day_of_week"], how='left')
    valIn = pd.merge(valIn, stat9, on = ["n1000mt_cluster_id", "day_of_week"], how='left')
    valIn = pd.merge(valIn, stat10, on = ["n1000mt_cluster_id", "day_of_week"], how='left')
    
    valIn = pd.merge(valIn, stat11, on = ["n200mt_cluster_id", "genre_name"], how='left')
    valIn = pd.merge(valIn, stat12, on = ["n200mt_cluster_id", "genre_name"], how='left')
    valIn = pd.merge(valIn, stat13, on = ["n400mt_cluster_id", "genre_name"], how='left')
    valIn = pd.merge(valIn, stat14, on = ["n400mt_cluster_id", "genre_name"], how='left')
    valIn = pd.merge(valIn, stat15, on = ["n1000mt_cluster_id", "genre_name"], how='left')
    valIn = pd.merge(valIn, stat16, on = ["n1000mt_cluster_id", "genre_name"], how='left')
 
    return (trainIn, valIn, testIn)

In [53]:
col = [c for c in train if c not in ['id', 'air_store_id', 'visit_date','visitors','visit_date', 'day_of_week', 'genre_name']]

In [54]:
X = train.sort_values(['air_store_id', 'visit_date']).copy()

In [55]:
X.fillna(0, inplace=True)

In [56]:
train = X[col]

In [57]:
y = np.log1p(X['visitors'])

In [58]:
#test = test[col]

In [59]:

K = 5
kf = model_selection.KFold(n_splits = K, shuffle = True)

In [60]:
def RMSLE(y, pred):
    return metrics.mean_squared_error(np.log1p(y), np.log1p(pred))**0.5

In [61]:
y_test_pred = 0
y_train_pred = np.zeros(len(X))
#y_train_pred = X['visitors'].copy
y_train_pred
bestIters = []

In [62]:
y_train_pred

array([0., 0., 0., ..., 0., 0., 0.])

In [63]:
train.shape

(250468, 290)

In [64]:
y_train_pred.shape

(250468,)

In [65]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 250468 entries, 0 to 250467
Columns: 295 entries, air_store_id to yamaguchi__shimonoseki-kana__shimonoseki
dtypes: datetime64[ns](1), float64(53), int64(12), object(3), uint8(226)
memory usage: 187.7+ MB


In [66]:
for i, (train_index, val_index) in enumerate(kf.split(X)):
    
    train, val, test2 = prepareData(X.iloc[train_index, :].copy(), X.iloc[val_index, :].copy(), test.copy())
    train = train.sort_values(['air_store_id', 'visit_date'])
    val = val.sort_values(['air_store_id', 'visit_date'])
    test2 = test2.sort_values(['air_store_id', 'visit_date'])
    train = train.fillna(0)
    val = val.fillna(0)
    test2 = test.fillna(0)
    
    X_train, y_train = train[col], np.log1p(train['visitors'])
    X_valid, y_valid = val[col], np.log1p(val['visitors'])
    print("\nFold ", i)
    
    sc = preprocessing.StandardScaler()
    X_train = sc.fit_transform(X_train)
    test2 = sc.transform(test2[col])
    X_valid = sc.fit_transform(X_valid)
    #x_val = np.array(x_valid)
    #y_val = np.array(y_valid)
    

    model = Sequential()
    model.add(Dense(units = 400 , kernel_initializer = 'normal', input_dim = X_train.shape[1]))
    model.add(PReLU())
    model.add(Dropout(.4))
    model.add(Dense(units = 160 , kernel_initializer = 'normal'))
    model.add(PReLU())
    model.add(BatchNormalization())
    model.add(Dropout(.6))
    model.add(Dense(units = 64 , kernel_initializer = 'normal'))
    model.add(PReLU())
    model.add(BatchNormalization())
    model.add(Dropout(.5))
    model.add(Dense(units = 26, kernel_initializer = 'normal'))
    model.add(Dense(units = 64 , kernel_initializer = 'normal'))
    model.add(Dense(units = 64 , kernel_initializer = 'normal'))
    model.add(PReLU())
    model.add(BatchNormalization())
    model.add(Dropout(.6))
    model.add(Dense(1, kernel_initializer='normal'))
    model.compile(loss='mean_squared_error', optimizer=Adam(lr=5e-3,decay=1e-4), metrics=['mean_squared_error'])#lr=0.1,decay=1e-4

    
    wtpath = 'weights.hdf5'
    bestepoch = ModelCheckpoint( filepath=wtpath, verbose=1, save_best_only=True )
    early_stop = EarlyStopping(monitor='val_loss', patience=2, verbose=1) 
    
    network_history = (model.fit(X_train, y_train, validation_data = (X_valid, y_valid), epochs=1000, 
          batch_size=256, verbose=True, callbacks=[bestepoch, early_stop])) 
    
    val_pred = model.predict(X_valid)
    val_pred = [item for sublist in val_pred for item in sublist]
    val_pred = np.array(val_pred)
    val_pred[val_pred < 0] = 0
    val_pred = np.expm1(val_pred)
    val_pred[val_pred < 1] = 1
    y_train_pred[val_index] = np.array(val_pred) 
    print('RMSLE Keras Regressor, validation set, fold ', i, ': ', RMSLE(val['visitors'], val_pred))

    test_pred = model.predict(test2)
    test_pred = [item for sublist in test_pred for item in sublist]
    test_pred = np.array(test_pred)
    test_pred[test_pred < 0] = 0
    test_pred = np.expm1(test_pred)
    test_pred[test_pred < 1] = 1
    y_test_pred += test_pred

    del X_train, X_valid, y_train, y_valid, train, test2


Fold  0
Train on 200374 samples, validate on 50094 samples
Epoch 1/50
Epoch 00001: val_loss improved from inf to 0.28557, saving model to weights.hdf5
Epoch 2/50
Epoch 00002: val_loss improved from 0.28557 to 0.28316, saving model to weights.hdf5
Epoch 3/50
Epoch 00003: val_loss did not improve
Epoch 4/50
Epoch 00004: val_loss improved from 0.28316 to 0.27804, saving model to weights.hdf5
Epoch 5/50
Epoch 00005: val_loss improved from 0.27804 to 0.27512, saving model to weights.hdf5
Epoch 6/50
Epoch 00006: val_loss improved from 0.27512 to 0.27308, saving model to weights.hdf5
Epoch 7/50
Epoch 00007: val_loss did not improve
Epoch 8/50
Epoch 00008: val_loss did not improve
Epoch 00008: early stopping
RMSLE Keras Regressor, validation set, fold  0 :  0.5311813467697177

Fold  1
Train on 200374 samples, validate on 50094 samples
Epoch 1/50
Epoch 00001: val_loss improved from inf to 0.29950, saving model to weights.hdf5
Epoch 2/50
Epoch 00002: val_loss improved from 0.29950 to 0.29210, s

In [67]:
y_test_pred /= K

In [68]:
print('RMSLE Keras, full validtion, fold  ' + str(RMSLE(X['visitors'].values, y_train_pred)))

RMSLE Keras, full validtion, fold  0.5323928754746718


In [69]:
print(y_test_pred)

[ 6.180293  28.868744  31.92031   ...  3.4842389  4.114339   4.1106715]


In [95]:
submission = pd.DataFrame()
submission['id'] = test['air_store_id'] + "_" + test['visit_date'].dt.date.astype('str')

In [96]:
submission['visitors'] = y_test_pred

In [97]:
submission.to_csv('../Submission/submission11_1.csv', float_format='%.6f', index=False)

In [108]:
submission = pd.DataFrame()
submission['air_store_id'] = X['air_store_id']
submission['visit_date'] =  X['visit_date'].dt.date
submission['visitors'] = y_train_pred
submission['actual_visitors'] = X['visitors']

In [111]:
submission.head()

Unnamed: 0,air_store_id,visit_date,visitors,actual_visitors,rmsle
0,air_00a91d42b08b08d9,2016-07-01,47.332245,35.0,0.532393
1,air_00a91d42b08b08d9,2016-07-02,21.128527,9.0,0.532393
2,air_00a91d42b08b08d9,2016-07-04,18.165964,20.0,0.532393
3,air_00a91d42b08b08d9,2016-07-05,21.84672,25.0,0.532393
4,air_00a91d42b08b08d9,2016-07-06,28.32258,29.0,0.532393


In [110]:
submission['rmsle'] = RMSLE(submission['actual_visitors'], submission['visitors'])

In [115]:
submission.groupby(['actual_visitors']).mean()

Unnamed: 0_level_0,visitors,rmsle
actual_visitors,Unnamed: 1_level_1,Unnamed: 2_level_1
1.0,7.301833,0.532393
2.0,8.034849,0.532393
3.0,8.191721,0.532393
4.0,8.882400,0.532393
5.0,9.251229,0.532393
6.0,9.894290,0.532393
7.0,10.529503,0.532393
8.0,11.197293,0.532393
9.0,11.827628,0.532393
10.0,12.534514,0.532393
