In [1]:
import glob, re
import numpy as np
import pandas as pd
from sklearn import *
import datetime as dt
import gc
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf
from keras.models import Sequential
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers import Dense, Dropout, PReLU, BatchNormalization
from keras.optimizers import Adam

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Load the pre-processed visit tables, the sample submission, the store
# metadata, the calendar and the weather extracts from ../input.
train = pd.read_csv("../input/train_proc5.csv")
test = pd.read_csv("../input/test_proc5.csv")
submission = pd.read_csv("../input/sample_submission.csv")
# store_id is renamed so the store table shares the air_store_id key used when it is merged into all_data.
air_store = pd.read_csv("../input/allstore_info_proc1.csv").rename(columns={'store_id':'air_store_id'})
# calendar_date is renamed so the calendar can be merged on visit_date.
date_info = pd.read_csv("../input/date_info.csv").rename(columns={"calendar_date" : 'visit_date'})
train_weather = pd.read_csv("../input/train_weather_01.csv")
test_weather = pd.read_csv("../input/test_weather_01.csv")

In [3]:
# Inclusive first/last dates of the two windows that are flagged as
# golden_week on the calendar table further down (one window per year).
gw1_start = dt.date(2016,4,29)
gw1_end = dt.date(2016,5,5)
gw2_start = dt.date(2017,4,29)
gw2_end = dt.date(2017,5,5)

In [4]:
# Split each submission id on '_': the third token becomes visit_date and the
# first two tokens, re-joined with '_', become air_store_id.
# NOTE(review): `submission` is replaced by a brand-new empty DataFrame in a
# later cell, so nothing visible in this notebook consumes these columns.
submission['visit_date'] = submission['id'].map(lambda x: str(x).split('_')[2])
submission['air_store_id'] = submission['id'].map(lambda x: '_'.join(x.split('_')[:2]))
submission['visit_date'] = pd.to_datetime(submission['visit_date'])
# Placeholder target column.
submission['visitors'] = np.nan

In [5]:
# Stack the train and test weather extracts and parse the date column.
# NOTE(review): the only visible use of all_weather is a merge into all_data
# that is commented out, so this frame is currently unused.
all_weather = pd.concat([train_weather, test_weather])
all_weather['visit_date'] = pd.to_datetime(all_weather['visit_date'])

In [6]:
# Parse visit dates to datetime64 in both frames.
train['visit_date'] = pd.to_datetime(train['visit_date'])
test['visit_date'] = pd.to_datetime(test['visit_date'])
# The test frame gets an all-NaN target column before it is concatenated with
# train into all_data.
test['visitors'] = np.nan

In [7]:
date_info['visit_date'] = pd.to_datetime(date_info['visit_date'])

In [8]:
# 1-based day counter over the calendar: the earliest date gets 1.0, the next
# day 2.0, and so on.
# The arithmetic is done on datetime64 values (normalised to midnight, which
# matches the previous truncation to calendar dates) instead of on an
# object-dtype column of datetime.date, so the result is a float64 column
# regardless of how the installed pandas handles object arithmetic.
date_info['date_seq_id'] = date_info['visit_date'].dt.normalize()
date_info['date_seq_id'] = (date_info['date_seq_id'] - date_info['date_seq_id'].min() + pd.Timedelta(days=1)) / pd.Timedelta(days=1)

In [9]:
# Week counter: (date_seq_id + 4) / 7 truncated to an integer, so the value
# advances once every 7 calendar days.
# NOTE(review): the +4 offset decides on which day the counter rolls over;
# presumably chosen to line up with calendar weeks — confirm.
date_info['week_seq_id'] = ((date_info['date_seq_id']+4)/7).astype(np.int64)

In [10]:
# Flag calendar days that fall inside a Golden Week window and, separately,
# the seven days that follow each window. Both flags default to 0.
# The bounds are converted to pd.Timestamp because ordering comparisons
# between a datetime64 column and plain datetime.date objects are deprecated /
# rejected by newer pandas versions. Series.between is inclusive on both ends,
# matching the previous >= / <= pair.
date_info['golden_week'] = 0
date_info['after_golden_week'] = 0
date_info.loc[date_info['visit_date'].between(pd.Timestamp(gw1_start), pd.Timestamp(gw1_end)), 'golden_week'] = 1
date_info.loc[date_info['visit_date'].between(pd.Timestamp(2016, 5, 6), pd.Timestamp(2016, 5, 12)), 'after_golden_week'] = 1

date_info.loc[date_info['visit_date'].between(pd.Timestamp(gw2_start), pd.Timestamp(gw2_end)), 'golden_week'] = 1
date_info.loc[date_info['visit_date'].between(pd.Timestamp(2017, 5, 6), pd.Timestamp(2017, 5, 12)), 'after_golden_week'] = 1

In [11]:
# Keep only training rows whose store also appears in the test set (rows with
# a missing store id are removed as before).
# A boolean row filter replaces DataFrame.where() + dropna(): where() first
# blanks every column of the rejected rows to NaN, which upcasts integer
# columns to float for the whole frame and does needless work.
stores_to_drop = list(set(train['air_store_id']) - set(test['air_store_id']))
train = train[train['air_store_id'].notna() & ~train['air_store_id'].isin(stores_to_drop)]

In [12]:
all_data = pd.concat([train,test])

In [13]:
# Number of training rows sitting at the top of all_data (train is
# concatenated first); later cells slice all_data[:train_len] /
# all_data[train_len:] to recover train and test by position.
# NOTE(review): that positional split assumes train_len still equals the
# number of training rows at the top of all_data when the split runs, so any
# row filtering applied to all_data in between has to preserve it.
train_len = len(train)

In [14]:
del train; del test;
gc.collect();

In [15]:
all_data = pd.merge(all_data,air_store,how='left',on='air_store_id',)

In [16]:
all_data = pd.merge(all_data,date_info,how='left',on='visit_date')

In [17]:
all_data['visit_day'] = all_data['visit_date'].dt.day
all_data['visit_month'] = all_data['visit_date'].dt.month
all_data['visit_year'] = all_data['visit_date'].dt.year

In [18]:
# Coordinate features relative to the extremes over all rows of all_data
# (train and test together).
# diff_max_* = max - value (>= 0); diff_min_* = min - value (<= 0).
all_data['diff_max_lat'] = all_data['latitude'].max() - all_data['latitude']
all_data['diff_min_lat'] = all_data['latitude'].min() - all_data['latitude']
all_data['diff_max_long'] = all_data['longitude'].max() - all_data['longitude']
all_data['diff_min_long'] = all_data['longitude'].min() - all_data['longitude']
# Plain sum of latitude and longitude.
all_data['lat_plus_long'] = all_data['latitude'] + all_data['longitude']

In [19]:
# Hard-coded store ids that the next cell removes from all_data. This
# overwrites the stores_to_drop list computed earlier from the train/test
# store sets.
# NOTE(review): the reason these eight stores are excluded is not visible in
# this notebook — document it or derive the list from data.
stores_to_drop = ['air_b2d8bc9c88b85f96',
 'air_cf22e368c1a71d53',
 'air_229d7e508d9f1b5e',
 'air_d0a7bd3339c3d12a',
 'air_cb083b4789a8d3a2',
 'air_2703dcb33192b181',
 'air_0ead98dd07e7a82a',
 'air_d63cfa6d6ab78446']

In [20]:
# Remove the listed stores (and rows without a store id) from the combined
# frame with a boolean row filter. DataFrame.where() + dropna() would first
# blank whole rows to NaN and thereby upcast integer columns to float.
keep_rows = all_data['air_store_id'].notna() & ~all_data['air_store_id'].isin(stores_to_drop)
# train_len marks where the training rows end inside all_data. Shrink it by
# the number of training rows removed here so the later positional
# all_data[:train_len] / all_data[train_len:] split stays aligned.
train_len = int(keep_rows.iloc[:train_len].sum())
all_data = all_data[keep_rows]

In [21]:
#all_data = pd.merge(all_data,all_weather, on=['air_store_id','visit_date'], how='left')

In [22]:
all_data.shape

(282487, 70)

In [23]:
del air_store;del date_info;gc.collect();

In [24]:
all_data['visit_date'] = pd.to_datetime(all_data['visit_date'])

In [25]:
# NOTE(review): neither constant is referenced anywhere else in the visible
# notebook. Presumably intended as post-hoc scaling factors for predictions in
# and after Golden Week — confirm the origin of the values or remove them.
golden_week_multiplier = 1.1688571428571
after_golden_week_multiplier = 0.85

In [26]:
all_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 282487 entries, 0 to 282486
Data columns (total 70 columns):
air_store_id            282487 non-null object
diff_10_9               282487 non-null float64
diff_11_10              282487 non-null float64
diff_12_11              282487 non-null float64
diff_8_7                282474 non-null float64
diff_9_8                282481 non-null float64
eight_weeks_ago         221561 non-null float64
eleven_weeks_ago        207193 non-null float64
nine_weeks_ago          216946 non-null float64
priorMax                249026 non-null float64
priorMean               249026 non-null float64
priorMin                249026 non-null float64
priorSum                282487 non-null float64
seven_weeks_ago         226112 non-null float64
six_weeks_ago           230884 non-null float64
ten_weeks_ago           211815 non-null float64
visit_date              282487 non-null datetime64[ns]
visitors                250468 non-null float64
vmax_6weekago_10   

In [27]:
#all_data = all_data.sort_values(['air_store_id','visit_date'])

In [28]:
# Positional split of the combined frame: the first train_len rows are treated
# as training data, the remainder as test data.
train = all_data[:train_len]
test = all_data[train_len:]

In [29]:
train = train.reset_index().drop('index',axis=1)
test = test.reset_index().drop('index',axis=1)

In [30]:
train.shape

(250468, 70)

In [31]:
test.shape

(32019, 70)

In [32]:
# Every object-dtype column except the store id is treated as categorical.
cols_to_encode = [name for name, kind in all_data.dtypes.items() if kind == 'object']
cols_to_encode.remove('air_store_id')

# Raw columns listed here are kept next to their indicator columns; the
# fold-level statistics computed later group by several of them.
keep_raw = ["day_of_week", 'genre_name', 'area_id','n200mt_cluster_id','n400mt_cluster_id','n1000mt_cluster_id']
for i in cols_to_encode:
    # Indicator columns are named after the category values (no prefix).
    dummies = pd.get_dummies(all_data[i])
    all_data = pd.concat([all_data, dummies], axis=1)
    if i in keep_raw:
        continue
    all_data = all_data.drop(i, axis=1)

In [33]:
print(cols_to_encode)

['genre_name', 'area_name', 'prefecture', 'city', 'day_of_week']


In [34]:
train = train.reset_index().drop('index',axis=1)
test = test.reset_index().drop('index',axis=1)

In [35]:
print("Train min: " + str(train['visit_date'].min()))
print("Train max:" + str(train['visit_date'].max()))
print("Test min: " + str(test['visit_date'].min()))
print("Test max:" + str(test['visit_date'].max()))

print("Difference: " + str(test['visit_date'].max() - train['visit_date'].max()))

Train min: 2016-01-01 00:00:00
Train max:2017-04-22 00:00:00
Test min: 2017-04-23 00:00:00
Test max:2017-05-31 00:00:00
Difference: 39 days 00:00:00


In [36]:
# Split again by position: the encoding cell above added indicator columns to
# all_data after the first split, so train/test are rebuilt to include them.
train = all_data[:train_len]
test = all_data[train_len:]

In [37]:
all_data['visit_date'] = pd.to_datetime(all_data['visit_date'])
#all_data['visit_date'] = all_data['visit_date'].dt.date

In [38]:
del all_data; gc.collect();

In [39]:
# Earliest visit date of each store, stored under the name of the final
# feature so the merge below creates that column directly.
tmp = (
    train.groupby(by=['air_store_id'])['visit_date']
    .min()
    .reset_index()
    .rename(columns={'visit_date' : 'days_since_first_obs'})
)
train = train.merge(tmp, how='left', on=['air_store_id'])
# Replace the merged first-visit date by the whole days elapsed since it.
train['days_since_first_obs'] = train['visit_date'].sub(train['days_since_first_obs']).dt.days

In [None]:
# Keep only rows recorded more than 38 days after the store's first observed
# visit.
# NOTE(review): 38 presumably relates to the 39-day gap between the last
# training date and the last test date printed above — confirm.
train = train[train['days_since_first_obs'] > 38]

In [None]:
# %load prepareData.py
def _visitor_stat(frame, keys, how, name):
    """Aggregate ``visitors`` of ``frame`` per ``keys`` with ``how`` ('sum' or 'mean') into column ``name``."""
    return frame.groupby(keys)['visitors'].agg(how).reset_index().rename(columns={'visitors': name})


def _lag_by_weeks(stats, weeks, renames):
    """Copy ``stats`` with ``week_seq_id`` moved ``weeks`` forward and its value columns renamed.

    After the shift, a row keyed by week ``w`` carries the value observed in week ``w - weeks``.
    """
    return stats.rename(columns=renames).assign(week_seq_id=stats['week_seq_id'] + weeks)


def _merge_all(frame, merges):
    """Left-merge every ``(stats, keys)`` pair of ``merges`` onto ``frame``, in order."""
    for stats, keys in merges:
        frame = pd.merge(frame, stats, on=keys, how='left')
    return frame


def prepareData(trainIn, valIn, testIn = None, lag_weeks = (6, 7, 8, 9, 10)):
    """Attach visitor statistics computed from ``trainIn`` to the train, validation and test frames.

    All statistics are derived from ``trainIn`` only and then left-merged onto each
    frame, so the row count and row order of every input are preserved.

    Columns added (in this order):
      * ``visitors_sum_store_day`` / ``visitors_mean_store_day`` per store and weekday
      * ``visitors_sum_area`` / ``visitors_mean_area`` per area and weekday
      * ``visitors_<k>wks_wago``: the store's visitor sum for the same weekday ``k`` weeks earlier
      * ``visitors_sum_store_<k>wago`` / ``visitors_mean_store_<k>wago``: the store's weekly
        sum / mean ``k`` weeks earlier
      * ``visitors_sum_<r>`` / ``visitors_mean_<r>`` per cluster and weekday, r in 200mt, 400mt, 1000mt
      * ``visitors_sum_genre_<r>`` / ``visitors_mean_genre_<r>`` per cluster and genre

    Naming fix: the 7-week weekly mean used to be emitted as ``visitors_mean_store_8wago``
    and the 8-week weekly columns as ``visitors_sum_store_day_6wago`` /
    ``visitors_mean_store_day_6wago``; they now follow the ``<k>wago`` pattern of the other lags.

    Parameters
    ----------
    trainIn : DataFrame
        Training rows. Must contain ``visitors``, ``air_store_id``, ``day_of_week``,
        ``week_seq_id``, ``area_id``, ``genre_name`` and the three ``n*mt_cluster_id`` columns.
    valIn : DataFrame
        Validation rows with the same key columns.
    testIn : DataFrame or None
        Test rows with the same key columns. ``None`` is passed through unchanged
        (previously the default value crashed inside ``pd.merge``).
    lag_weeks : iterable of int
        Week offsets for the lagged features; the default reproduces the original 6..10.

    Returns
    -------
    tuple
        ``(trainIn, valIn, testIn)`` with the statistic columns appended.
    """
    lag_weeks = tuple(lag_weeks)
    merges = []

    def add_sum_and_mean(keys, sum_name, mean_name):
        # Sum first, mean second: this fixes the column order of the output.
        merges.append((_visitor_stat(trainIn, keys, 'sum', sum_name), keys))
        merges.append((_visitor_stat(trainIn, keys, 'mean', mean_name), keys))

    add_sum_and_mean(["air_store_id", "day_of_week"], 'visitors_sum_store_day', 'visitors_mean_store_day')
    add_sum_and_mean(["area_id", "day_of_week"], 'visitors_sum_area', 'visitors_mean_area')

    # Visitor sum per store / weekday / week, re-keyed so it joins k weeks later.
    day_week_keys = ["air_store_id", "day_of_week", "week_seq_id"]
    store_day_week = _visitor_stat(trainIn, day_week_keys, 'sum', 'visitors_sum_store_day')
    for weeks in lag_weeks:
        lagged = _lag_by_weeks(store_day_week, weeks,
                               {'visitors_sum_store_day': 'visitors_%dwks_wago' % weeks})
        merges.append((lagged, day_week_keys))

    # Weekly sum and mean per store, re-keyed the same way.
    week_keys = ["air_store_id", "week_seq_id"]
    store_week = pd.merge(
        _visitor_stat(trainIn, week_keys, 'sum', 'visitors_sum_store_week'),
        _visitor_stat(trainIn, week_keys, 'mean', 'visitors_mean_store_week'),
        on=week_keys,
    )
    for weeks in lag_weeks:
        lagged = _lag_by_weeks(store_week, weeks, {
            'visitors_sum_store_week': 'visitors_sum_store_%dwago' % weeks,
            'visitors_mean_store_week': 'visitors_mean_store_%dwago' % weeks,
        })
        merges.append((lagged, week_keys))

    radii = ('200mt', '400mt', '1000mt')
    for radius in radii:
        add_sum_and_mean(["n%s_cluster_id" % radius, "day_of_week"],
                         'visitors_sum_%s' % radius, 'visitors_mean_%s' % radius)
    for radius in radii:
        add_sum_and_mean(["n%s_cluster_id" % radius, "genre_name"],
                         'visitors_sum_genre_%s' % radius, 'visitors_mean_genre_%s' % radius)

    trainOut = _merge_all(trainIn, merges)
    valOut = _merge_all(valIn, merges)
    testOut = None if testIn is None else _merge_all(testIn, merges)
    return (trainOut, valOut, testOut)

In [None]:
# Feature columns fed to the network: every current column of train except
# identifiers, the date, the target and the raw categorical columns.
# NOTE(review): this list is built before prepareData() is called inside the
# training loop, so the statistic columns prepareData() adds per fold are not
# part of col and never reach the model — confirm whether that is intended.
col = [c for c in train if c not in ['id', 'air_store_id', 'visit_date','visitors','days_since_first_obs', 'day_of_week', 'genre_name']]

In [None]:
# Working copy of the training rows ordered by store and then by date; the
# cross-validation loop indexes into X by position.
X = train.sort_values(['air_store_id', 'visit_date']).copy()

In [None]:
X.fillna(0, inplace=True)

In [None]:
train = X[col]

In [None]:
# log1p-transformed target for all rows of X.
# NOTE(review): y is not referenced by the visible training loop, which
# derives its targets from each fold's own visitors column.
y = np.log1p(X['visitors'])

In [None]:
#test = test[col]

In [None]:
def RMSLE(y, pred):
    """Root mean squared logarithmic error between ``y`` and ``pred``.

    Both arguments are passed through ``log1p`` here, so callers supply values
    on the raw (un-logged) scale.
    """
    log_truth = np.log1p(y)
    log_pred = np.log1p(pred)
    mse = metrics.mean_squared_error(log_truth, log_pred)
    return mse**0.5

In [None]:
np.random.seed(37)
#tf.set_random_seed(344)

In [None]:
# num_loops: how many times the whole K-fold procedure is repeated.
# K: number of folds per repetition.
num_loops = 3
K = 10
# Shuffled K-fold splitter without an explicit random_state.
# NOTE(review): reproducibility of the shuffling presumably relies on the
# np.random.seed call in the previous cell — confirm, or pass random_state.
kf = model_selection.KFold(n_splits = K, shuffle = True)

In [None]:
# Accumulator for test-set predictions; starts as a scalar and is added to in
# the training loop.
y_test_pred = 0
# Accumulator for validation predictions, one slot per row of X (by position).
y_train_pred = np.zeros(len(X))
#y_train_pred = X['visitors'].copy
# Bare expression in the middle of a cell: it has no effect and is not displayed.
y_train_pred
# NOTE(review): bestIters is never filled or read in the visible notebook.
bestIters = []

In [None]:
y_train_pred

In [None]:
train.shape

In [None]:
y_train_pred.shape

In [None]:
X.info()

In [None]:
# Repeated K-fold training of the dense network.
#
# For every repetition `a` and fold `i`:
#   1. prepareData() derives visitor statistics from the fold's training rows
#      and merges them onto the train / validation / test frames.
#   2. Features are standardised with a scaler fitted on the training rows only.
#   3. The network is trained with early stopping on the validation loss and
#      the best checkpoint is restored before predicting.
#   4. Validation predictions are accumulated into y_train_pred (by row
#      position in X) and test predictions into y_test_pred.
#
# NOTE(review): `col` was built before prepareData() ran, so the statistic
# columns added in step 1 are not part of the model input — confirm intent.
for a in range(num_loops):
    for i, (train_index, val_index) in enumerate(kf.split(X)):
        fold_train, fold_val, fold_test = prepareData(X.iloc[train_index, :].copy(), X.iloc[val_index, :].copy(), test.copy())
        fold_train = fold_train.sort_values(['air_store_id', 'visit_date']).fillna(0)
        # Validation and test rows are deliberately NOT re-sorted: the left
        # merges in prepareData() keep the input row order, so validation
        # predictions line up with val_index and test predictions line up
        # with the rows of `test` that the submission is built from.
        fold_val = fold_val.fillna(0)
        # Fixed: the prepared test frame was previously thrown away by
        # re-assigning from the raw `test`.
        fold_test = fold_test.fillna(0)

        X_train, y_train = fold_train[col], np.log1p(fold_train['visitors'])
        X_valid, y_valid = fold_val[col], np.log1p(fold_val['visitors'])
        print("\nFold ", i)

        sc = preprocessing.StandardScaler()
        X_train = sc.fit_transform(X_train)
        # Fixed: validation data is transformed with the training statistics
        # instead of re-fitting the scaler on the validation rows.
        X_valid = sc.transform(X_valid)
        X_test = sc.transform(fold_test[col])

        model = Sequential()
        model.add(Dense(units = 160 , kernel_initializer = 'normal', input_dim = X_train.shape[1]))
        model.add(PReLU())
        model.add(Dropout(.2))
        model.add(Dense(units = 100 , kernel_initializer = 'normal'))
        model.add(PReLU())
        model.add(BatchNormalization())
        model.add(Dropout(.2))
        model.add(Dense(units = 64 , kernel_initializer = 'normal'))
        model.add(PReLU())
        model.add(BatchNormalization())
        model.add(Dropout(.1))
        model.add(Dense(units = 26, kernel_initializer = 'normal'))
        model.add(PReLU())
        model.add(BatchNormalization())
        model.add(Dropout(.1))
        model.add(Dense(1, kernel_initializer='normal'))
        model.compile(loss='mean_squared_error', optimizer=Adam(lr=1e-2,decay=1e-4), metrics=['mean_squared_error'])

        wtpath = 'weights.hdf5'
        bestepoch = ModelCheckpoint( filepath=wtpath, verbose=1, save_best_only=True )
        early_stop = EarlyStopping(monitor='val_loss', patience=2, verbose=1)

        model.fit(X_train, y_train, validation_data = (X_valid, y_valid), epochs=500,
                  batch_size=256, verbose=True, callbacks=[bestepoch, early_stop])
        # Early stopping halts `patience` epochs after the best one; reload the
        # checkpoint so predictions come from the best epoch rather than the last.
        model.load_weights(wtpath)

        # The model predicts log1p(visitors): clamp negatives, invert the
        # transform, then enforce a minimum of one visitor.
        val_pred = np.asarray(model.predict(X_valid)).ravel()
        val_pred = np.maximum(np.expm1(np.maximum(val_pred, 0)), 1)
        # Every row is validated once per repetition; dividing by num_loops
        # makes y_train_pred the average over repetitions instead of their sum.
        y_train_pred[val_index] += val_pred / num_loops
        print('RMSLE Keras Regressor, validation set, fold ', i, ': ', RMSLE(fold_val['visitors'], val_pred))

        # Fixed: the flattened predictions are converted to an array
        # (previously `list += array` extended the list and broke the masking).
        test_pred = np.asarray(model.predict(X_test)).ravel()
        test_pred = np.maximum(np.expm1(np.maximum(test_pred, 0)), 1)
        y_test_pred += test_pred

        del X_train, X_valid, X_test, y_train, y_valid, fold_train, fold_val, fold_test, model

In [None]:
# One test prediction is added to y_test_pred per fold and per repetition, so
# the average is taken over K * num_loops models (the original line was also
# missing its closing parenthesis).
y_test_pred /= (K * num_loops)

In [None]:
print('RMSLE Keras, full validtion, fold  ' + str(RMSLE(X['visitors'].values, y_train_pred)))

In [None]:
print(y_test_pred)

In [None]:
# Start a fresh submission frame (this replaces the sample submission loaded
# at the top) and rebuild the id as "<air_store_id>_<visit date>".
submission = pd.DataFrame()
submission['id'] = test['air_store_id'] + "_" + test['visit_date'].dt.date.astype('str')

In [None]:
# NOTE(review): if y_test_pred is a NumPy array it is assigned by position, so
# it has to be in the same row order as `test`, which the ids were built from.
submission['visitors'] = y_test_pred

In [None]:
#submission.to_csv('../submissions/submission11_3.csv', float_format='%.6f', index=False)

In [None]:
# Validation table: store, date and actual visitors taken from X, plus the
# accumulated validation predictions (assigned by row position).
valdf = X[['air_store_id', 'visit_date', 'visitors']].copy()
valdf['prediction'] = y_train_pred

In [None]:
#valdf.to_csv('../submissions/val11_3.csv', float_format='%.6f', index=False)

In [None]:
y_train_pred