In [432]:
%matplotlib inline
import math, keras, datetime, pandas as pd, numpy as np, keras.backend as K
import matplotlib.pyplot as plt, xgboost, operator, random, pickle
from sklearn import *
from datetime import datetime
import glob, re
from IPython.core.debugger import set_trace

In [None]:
# 1 Introduction
# This is an initial Exploratory Data Analysis for the Recruit Restaurant Visitor Forecasting 

# The aim of this challenge is to predict the future numbers of restaurant visitors. 
# This makes it a Time Series Forecasting problem. The data was collected from Japanese restaurants.
# As we will see, the data set is small and easily accessible without requiring much memory or computing power. 
# Therefore, this competition is particularly suited for beginners.

# The data comes in the shape of 8 relational files which are derived from two separate Japanese websites that 
# collect user information: “Hot Pepper Gourmet (hpg): similar to Yelp” (search and reserve) 
# and “AirREGI / Restaurant Board (air): similar to Square” (reservation control and cash register). 
# The training data is based on the time range of Jan 2016 - most of Apr 2017, 
# while the test set includes the last week of Apr plus May 2017. 
# The test data “intentionally spans a holiday week in Japan called the ‘Golden Week.’ 
# The data description further notes that:”There are days in the test set where the restaurant were 
# closed and had no visitors. These are ignored in scoring. The training set omits days where the restaurants 
# were closed."

# Those are the individual files:

# air_visit_data.csv: historical visit data for the air restaurants. 
# This is essentially the main training data set.

# air_reserve.csv / hpg_reserve.csv: reservations made through the air / hpg systems.

# air_store_info.csv / hpg_store_info.csv: details about the air / hpg restaurants including genre and location.

# store_id_relation.csv: connects the air and hpg ids

# date_info.csv: essentially flags the Japanese holidays.

# sample_submission.csv: serves as the test set. The id is formed by combining the air id with the visit date.


In [433]:
def join_df(left, right, left_on, right_on=None):
    if right_on is None: right_on = left_on
    return left.merge(right, how='left', left_on=left_on, right_on=right_on, 
                      suffixes=("", "_y"))

In [434]:
data = {
    'air_visits':  pd.read_csv('../input/air_visit_data.csv'),
    'air_reserve': pd.read_csv('../input/air_reserve.csv'),
    'hpg_reserve': pd.read_csv('../input/hpg_reserve.csv'),
    'air_store':   pd.read_csv('../input/air_store_info.csv'),
    'hpg_store':   pd.read_csv('../input/hpg_store_info.csv'),
    'holidays':    pd.read_csv('../input/date_info.csv').rename(columns={'calendar_date':'visit_date'}),
    'store_ids':   pd.read_csv('../input/store_id_relation.csv'),
    'test':        pd.read_csv('../input/sample_submission.csv')
# test_df = pd.read_csv('../input/sample_submission.csv') 
    }

## Associate hpg store id with air store id
data['hpg_reserve'] = pd.merge(data['hpg_reserve'], data['store_ids'], how='inner', on=['hpg_store_id'])

In [None]:
display(data['test'].head())

In [435]:
for df in ['air_reserve', 'hpg_reserve']:
    data[df]['visit_datetime'] = pd.to_datetime(data[df]['visit_datetime'])
    data[df]['visit_datetime'] = data[df]['visit_datetime'].dt.date
    data[df]['reserve_datetime'] = pd.to_datetime(data[df]['reserve_datetime'])
    data[df]['reserve_datetime'] = data[df]['reserve_datetime'].dt.date
    data[df]['reserve_datetime_diff'] = data[df].apply(lambda r: (r['visit_datetime'] - r['reserve_datetime']).days, axis=1)
    tmp1 = data[df].groupby(['air_store_id','visit_datetime'], as_index=False)[['reserve_datetime_diff', 'reserve_visitors']].sum().rename(columns={'visit_datetime':'visit_date', 'reserve_datetime_diff': 'rs1', 'reserve_visitors':'rv1'})
    tmp2 = data[df].groupby(['air_store_id','visit_datetime'], as_index=False)[['reserve_datetime_diff', 'reserve_visitors']].mean().rename(columns={'visit_datetime':'visit_date', 'reserve_datetime_diff': 'rs2', 'reserve_visitors':'rv2'})
    data[df] = pd.merge(tmp1, tmp2, how='inner', on=['air_store_id','visit_date'])

In [427]:
display(data['hpg_reserve'].head())
np.shape(data['hpg_reserve'])

Unnamed: 0,air_store_id,visit_date,rs1,rv1,rs2,rv2
0,air_00a91d42b08b08d9,2016-01-14,3,2,3.0,2.0
1,air_00a91d42b08b08d9,2016-01-15,6,4,6.0,4.0
2,air_00a91d42b08b08d9,2016-01-16,3,2,3.0,2.0
3,air_00a91d42b08b08d9,2016-01-22,3,2,3.0,2.0
4,air_00a91d42b08b08d9,2016-01-29,6,5,6.0,5.0


(18620, 6)

In [428]:
display(data['air_reserve'].head())
np.shape(data['air_reserve'])

Unnamed: 0,air_store_id,visit_date,rs1,rv1,rs2,rv2
0,air_00a91d42b08b08d9,2016-10-31,0,2,0.0,2.0
1,air_00a91d42b08b08d9,2016-12-05,4,9,4.0,9.0
2,air_00a91d42b08b08d9,2016-12-14,6,18,6.0,18.0
3,air_00a91d42b08b08d9,2016-12-17,6,2,6.0,2.0
4,air_00a91d42b08b08d9,2016-12-20,2,4,2.0,4.0


(29830, 6)

In [436]:
"""preprocess training data spliting out the dates """
data['air_visits']['visit_date'] = pd.to_datetime(data['air_visits']['visit_date'])
data['air_visits']['dow'] = data['air_visits']['visit_date'].dt.dayofweek
data['air_visits']['year'] = data['air_visits']['visit_date'].dt.year
data['air_visits']['month'] = data['air_visits']['visit_date'].dt.month
data['air_visits']['visit_date'] = data['air_visits']['visit_date'].dt.date

In [437]:
cols = data['air_visits'].columns.tolist()
cols =  cols[0:2] + cols[3:] + cols[2:3] 
data['air_visits'] = data['air_visits'][cols]
display(data['air_visits'].head())

Unnamed: 0,air_store_id,visit_date,dow,year,month,visitors
0,air_ba937bf13d40fb24,2016-01-13,2,2016,1,25
1,air_ba937bf13d40fb24,2016-01-14,3,2016,1,32
2,air_ba937bf13d40fb24,2016-01-15,4,2016,1,29
3,air_ba937bf13d40fb24,2016-01-16,5,2016,1,22
4,air_ba937bf13d40fb24,2016-01-18,0,2016,1,6


In [439]:
"""prepocess test set to match training dimensions"""
data['test']['air_store_id'] = data['test']['id'].map(lambda x: '_'.join(x.split('_')[:2]))
data['test']['visit_date'] = data['test']['id'].map(lambda x: str(x).split('_')[2])
data['test']['visit_date'] = pd.to_datetime(data['test']['visit_date'])
data['test']['dow'] = data['test']['visit_date'].dt.dayofweek
data['test']['year'] = data['test']['visit_date'].dt.year
data['test']['month'] = data['test']['visit_date'].dt.month
data['test']['visit_date'] = data['test']['visit_date'].dt.date

unique_stores = data['test']['air_store_id'].unique()
stores = pd.concat([pd.DataFrame({'air_store_id': unique_stores,
                                  'dow': [i]*len(unique_stores)}) 
                    for i in range(7)], axis=0, ignore_index=True).reset_index(drop=True)

In [440]:
# columnsTitles=["visitors","A"]
# df=df.reindex(columns=columnsTitles)
cols = data['test'].columns.tolist()
cols =  cols[2:] + cols[1:2] 
data['test'] = data['test'][cols]
display(data['test'].head())

Unnamed: 0,air_store_id,visit_date,dow,year,month,visitors
0,air_00a91d42b08b08d9,2017-04-23,6,2017,4,0
1,air_00a91d42b08b08d9,2017-04-24,0,2017,4,0
2,air_00a91d42b08b08d9,2017-04-25,1,2017,4,0
3,air_00a91d42b08b08d9,2017-04-26,2,2017,4,0
4,air_00a91d42b08b08d9,2017-04-27,3,2017,4,0


In [441]:
"""We can compress the rows into meaningful features and combine genre, location datasets into our training sets
    save into stores as a temp variable"""

tmp = data['air_visits'].groupby(['air_store_id','dow'], as_index=False)['visitors'].min().rename(columns={'visitors':'min_visitors'})
stores = pd.merge(stores, tmp, how='left', on=['air_store_id','dow']) 
tmp = data['air_visits'].groupby(['air_store_id','dow'], as_index=False)['visitors'].mean().rename(columns={'visitors':'mean_visitors'})
stores = pd.merge(stores, tmp, how='left', on=['air_store_id','dow'])
tmp = data['air_visits'].groupby(['air_store_id','dow'], as_index=False)['visitors'].median().rename(columns={'visitors':'median_visitors'})
stores = pd.merge(stores, tmp, how='left', on=['air_store_id','dow'])
tmp = data['air_visits'].groupby(['air_store_id','dow'], as_index=False)['visitors'].max().rename(columns={'visitors':'max_visitors'})
stores = pd.merge(stores, tmp, how='left', on=['air_store_id','dow'])
tmp = data['air_visits'].groupby(['air_store_id','dow'], as_index=False)['visitors'].count().rename(columns={'visitors':'count_observations'})
stores = pd.merge(stores, tmp, how='left', on=['air_store_id','dow']) 

stores = pd.merge(stores, data['air_store'], how='left', on=['air_store_id']) 

In [None]:
display(stores.head())

In [None]:
display(data['holidays'].head())

In [442]:
data['holidays']['visit_date'] = pd.to_datetime(data['holidays']['visit_date'])
data['holidays']['visit_date'] = data['holidays']['visit_date'].dt.date

In [443]:
data['holidays'].holiday_flg = data['holidays'].holiday_flg != 0

In [None]:
# data['holidays']['calendar_date'] = pd.to_datetime(data['holidays']['calendar_date'])
# data['holidays']['calendar_date'] = data['holidays']['calendar_date'].dt.date

In [444]:
train = pd.merge(data['air_visits'], data['holidays'], how='left', on=['visit_date']) 
test = pd.merge(data['test'], data['holidays'], how='left', on=['visit_date'])

In [None]:
display(train.head())

In [None]:
display(test.head())

In [None]:
# def join_df(left, right, left_on, right_on=None):
#     if right_on is None: right_on = left_on
#     return left.merge(right, how='left', left_on=left_on, right_on=right_on, 
#                       suffixes=("", "_y"))

In [None]:
# train = join_df(data['air_visits'], data['holidays'], "visit_date", "calendar_date")

In [None]:
# test = join_df(data['test'], data['holidays'], "visit_date", "calendar_date")

In [None]:
# display(train.head())

In [None]:
# test.drop('calendar_date',1,inplace=True) 
# test.columns
# display(test.head())

In [None]:
# train.drop('calendar_date',1,inplace=True) 
# train.columns
    

In [445]:
tr = pd.merge(train, stores, how='left', on=['air_store_id','dow'])

In [None]:
display(tr.head())

In [446]:
print(np.shape(tr))
cols = tr.columns.tolist()
cols =  cols[0:5] + cols[6:] + cols[5:6]
cols
len(cols)
final_train = tr[cols]
display(final_train.head())

(252108, 17)


Unnamed: 0,air_store_id,visit_date,dow,year,month,day_of_week,holiday_flg,min_visitors,mean_visitors,median_visitors,max_visitors,count_observations,air_genre_name,air_area_name,latitude,longitude,visitors
0,air_ba937bf13d40fb24,2016-01-13,2,2016,1,Wednesday,False,7.0,23.84375,25.0,57.0,64.0,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,25
1,air_ba937bf13d40fb24,2016-01-14,3,2016,1,Thursday,False,2.0,20.292308,21.0,54.0,65.0,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,32
2,air_ba937bf13d40fb24,2016-01-15,4,2016,1,Friday,False,4.0,34.738462,35.0,61.0,65.0,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,29
3,air_ba937bf13d40fb24,2016-01-16,5,2016,1,Saturday,False,6.0,27.651515,27.0,53.0,66.0,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,22
4,air_ba937bf13d40fb24,2016-01-18,0,2016,1,Monday,False,2.0,13.754386,12.0,34.0,57.0,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,6


In [447]:
te = pd.merge(test, stores, how='left', on=['air_store_id','dow'])

In [None]:
print(np.shape(te))
display(te.head())


In [448]:
np.shape(te)
cols = te.columns.tolist()
cols =  cols[0:5] + cols[6:] + cols[5:6]
# cols
# len(cols)
final_test = te[cols]
display(final_test.head())

Unnamed: 0,air_store_id,visit_date,dow,year,month,day_of_week,holiday_flg,min_visitors,mean_visitors,median_visitors,max_visitors,count_observations,air_genre_name,air_area_name,latitude,longitude,visitors
0,air_00a91d42b08b08d9,2017-04-23,6,2017,4,Sunday,False,2.0,2.0,2.0,2.0,1.0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,0
1,air_00a91d42b08b08d9,2017-04-24,0,2017,4,Monday,False,1.0,22.457143,19.0,47.0,35.0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,0
2,air_00a91d42b08b08d9,2017-04-25,1,2017,4,Tuesday,False,1.0,24.35,24.5,43.0,40.0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,0
3,air_00a91d42b08b08d9,2017-04-26,2,2017,4,Wednesday,False,15.0,28.125,28.0,52.0,40.0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,0
4,air_00a91d42b08b08d9,2017-04-27,3,2017,4,Thursday,False,15.0,29.868421,30.0,47.0,38.0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,0


In [449]:
for df in ['air_reserve','hpg_reserve']:
    train = pd.merge(final_train, data[df], how='left', on=['air_store_id','visit_date']) 
    test = pd.merge(final_test, data[df], how='left', on=['air_store_id','visit_date'])

In [450]:
train['id'] = train.apply(lambda r: '_'.join([str(r['air_store_id']), str(r['visit_date'])]), axis=1)



In [452]:
train['total_reserv_sum'] = train['rv1_x'] + train['rv1_y']
train['total_reserv_mean'] = (train['rv2_x'] + train['rv2_y']) / 2
train['total_reserv_dt_diff_mean'] = (train['rs2_x'] + train['rs2_y']) / 2

test['total_reserv_sum'] = test['rv1_x'] + test['rv1_y']
test['total_reserv_mean'] = (test['rv2_x'] + test['rv2_y']) / 2
test['total_reserv_dt_diff_mean'] = (test['rs2_x'] + test['rs2_y']) / 2

col = [c for c in train if c not in ['id', 'air_store_id','visit_date','visitors']]
train = train.fillna(-1)
test = test.fillna(-1)

KeyError: 'rv1_x'

In [451]:
display(train.head())

Unnamed: 0,air_store_id,visit_date,dow,year,month,day_of_week,holiday_flg,min_visitors,mean_visitors,median_visitors,...,air_genre_name,air_area_name,latitude,longitude,visitors,rs1,rv1,rs2,rv2,id
0,air_ba937bf13d40fb24,2016-01-13,2,2016,1,Wednesday,False,7.0,23.84375,25.0,...,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,25,,,,,air_ba937bf13d40fb24_2016-01-13
1,air_ba937bf13d40fb24,2016-01-14,3,2016,1,Thursday,False,2.0,20.292308,21.0,...,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,32,,,,,air_ba937bf13d40fb24_2016-01-14
2,air_ba937bf13d40fb24,2016-01-15,4,2016,1,Friday,False,4.0,34.738462,35.0,...,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,29,,,,,air_ba937bf13d40fb24_2016-01-15
3,air_ba937bf13d40fb24,2016-01-16,5,2016,1,Saturday,False,6.0,27.651515,27.0,...,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,22,,,,,air_ba937bf13d40fb24_2016-01-16
4,air_ba937bf13d40fb24,2016-01-18,0,2016,1,Monday,False,2.0,13.754386,12.0,...,Dining bar,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,6,,,,,air_ba937bf13d40fb24_2016-01-18


In [None]:
display(test.head())

In [None]:
display(train.head())

In [None]:
""" We have categorical variables we need to transform using entity embeddings"""

In [None]:
# display(stores.head())
display(test.head())

In [None]:
tables = [data['air_visits'], data['air_reserve'], data['hpg_reserve'], data['air_store'], 
               data['hpg_store'], data['holidays'], data['store_ids'], data['test']]

In [None]:
for t in tables: display(t.head())

In [None]:
# We want to combine the hgc data to the air data?? 
air_store_id	visit_date	visitors 	 air_genre_name		 air_area_name	latitude	longitude
day_of_week	 holiday_flg


In [None]:
for t in tables: display(DataFrameSummary(t).summary())

In [None]:
# tables = [air_visits, air_reserve, hpg_reserve, air_store, 
#                hpg_store, holidays, store_ids, test]

# air_visits, air_store, holidays easy to join
# air reserve < air_visits
# store_ids !== total air stores
# hpg_store > air_store
# hpg_reserve > air_visits

test needs to get added with


In [None]:
len(air_visits), len(test), len(store_ids)

In [None]:
# holidays.holiday_flg = holidays.holiday_flg != 0

In [None]:
# def join_df(left, right, left_on, right_on=None):
#     if right_on is None: right_on = left_on
#     return left.merge(right, how='left', left_on=left_on, right_on=right_on, 
#                       suffixes=("", "_y"))

In [None]:
# you sort of remember this but you create a inner join between two files based on identical identifier
# in the case of holidays you can join based on the visit_date and calender date.

training = join_df(air_visits, holidays, "visit_date", "calendar_date")

In [None]:
testing = join_df(test, holidays, "visit_date", "calendar_date")

In [None]:
# air_visits = join_df(air_visits, air_reserve, "air_store_id", "air_store_id")
# num of unique air_reserve ids !== num of unique air_visit ids

In [None]:
training2 = join_df(training, air_store, "air_store_id", "air_store_id")

In [None]:
training2 

In [None]:
len(training[training.holiday_flg.isnull()])

In [None]:
len(training2[training2.air_genre_name.isnull()])

In [None]:
from utils2 import *
np.set_printoptions(threshold=50, edgeitems=20)

In [None]:
from isoweek import Week
from pandas_summary import DataFrameSummary

In [None]:
def concat_csvs(dirname):
    os.chdir(dirname)
    filenames=glob.glob("*.csv")

    wrote_header = False
    with open("../"+dirname+".csv","w") as outputfile:
        for filename in filenames:
            name = filename.split(".")[0]
            with open(filename) as f:
                line = f.readline()
                if not wrote_header:
                    wrote_header = True
                    outputfile.write("file,"+line)
                for line in f:
                     outputfile.write(name + "," + line)
                outputfile.write("\n")

    os.chdir("..")




In [None]:
concat_csvs('input')

In [None]:
df = pd.read_csv('/Users/midas/pcc/input/air_visit_data.csv')
df = pd.read_csv('/Users/midas/pcc/input/air_store_info.csv')

df


In [None]:
# Group by id and day of week - Median of the visitors is taken
agg_data = train.groupby(['air_store_id', 'dow']).agg(aggregation).reset_index()
agg_data.columns = ['air_store_id','dow','visitors']
agg_data['visitors'] = agg_data['visitors']


In [None]:
# Create the first intermediate submission file:
merged = pd.merge(test_df, agg_data, how='left', left_on=[
    'store_id','dow'], right_on=['air_store_id','dow'])
final = merged[['id','visitors']]
final.fillna(0, inplace=True)


In [None]:
# originally from this kernel:
# https://www.kaggle.com/zeemeen/weighted-mean-running-10-sec-lb-0-509
dfs = { re.search('/([^/\.]*)\.csv', fn).group(1):pd.read_csv(
    fn) for fn in glob.glob('../input/*.csv')}
for k, v in dfs.items(): locals()[k] = v

weekend_hdays = date_info.apply(
    (lambda x:(x.day_of_week=='Sunday' or x.day_of_week=='Saturday') 
    and x.holiday_flg==1), axis=1)
date_info.loc[weekend_hdays, 'holiday_flg'] = 0
date_info['weight'] = (date_info.index + 1) / len(date_info) 

visit_data = air_visit_data.merge(
    date_info, left_on='visit_date', right_on='calendar_date', how='left')
visit_data.drop('calendar_date', axis=1, inplace=True)
visit_data['visitors'] = visit_data.visitors.map(pd.np.log1p)

wmean = lambda x:( (x.weight * x.visitors).sum() / x.weight.sum() )
visitors = visit_data.groupby(
    ['air_store_id', 'day_of_week', 'holiday_flg']).apply(wmean).reset_index()
visitors.rename(columns={0:'visitors'}, inplace=True) 

sample_submission['air_store_id'] = sample_submission.id.map(
    lambda x: '_'.join(x.split('_')[:-1]))
sample_submission['calendar_date'] = sample_submission.id.map(lambda x: x.split('_')[2])
sample_submission.drop('visitors', axis=1, inplace=True)
sample_submission = sample_submission.merge(date_info, on='calendar_date', how='left')
sample_submission = sample_submission.merge(
    visitors, on=['air_store_id', 'day_of_week', 'holiday_flg'], how='left')



In [None]:
# fill missings with (air_store_id, day_of_week)
missings = sample_submission.visitors.isnull()
sample_submission.loc[missings, 'visitors'] = sample_submission[missings].merge(
    visitors[visitors.holiday_flg==0], on=(
        'air_store_id', 'day_of_week'), how='left')['visitors_y'].values


In [None]:
# fill missings with (air_store_id)
missings = sample_submission.visitors.isnull()
sample_submission.loc[missings, 'visitors'] = sample_submission[missings].merge(
    visitors[['air_store_id', 'visitors']].groupby('air_store_id').mean().reset_index(), 
    on='air_store_id', how='left')['visitors_y'].values
    
sample_submission['visitors'] = sample_submission.visitors.map(pd.np.expm1)
sample_submission = sample_submission[['id', 'visitors']]
final['visitors'][final['visitors'] ==0] = sample_submission['visitors'][final['visitors'] ==0]
sub_file = final.copy()



In [None]:
sub_file

In [None]:
## Arithmetric Mean 
sub_file['visitors'] = np.mean([final['visitors'], sample_submission['visitors']], axis = 0)
sub_file.to_csv('sub_math_mean.csv', index=False)

## Geometric Mean  
sub_file['visitors'] = (final['visitors'] * sample_submission['visitors']) ** (1/2)
sub_file.to_csv('sub_geo_mean.csv', index=False)

## Harmonic Mean 
sub_file['visitors'] = 2/(1/final['visitors'] + 1/sample_submission['visitors'])
sub_file.to_csv('sub_hrm_mean.csv', index=False)