In [None]:
%matplotlib inline
import math, keras, datetime, pandas as pd, numpy as np, keras.backend as K
import matplotlib.pyplot as plt, xgboost, operator, random, pickle
import glob, re

In [None]:
# 1 Introduction
# This is an initial Exploratory Data Analysis for the Recruit Restaurant Visitor Forecasting competition.

# The aim of this challenge is to predict the future numbers of restaurant visitors. 
# This makes it a Time Series Forecasting problem. The data was collected from Japanese restaurants.
# As we will see, the data set is small and easily accessible without requiring much memory or computing power. 
# Therefore, this competition is particularly suited for beginners.

# The data comes in the shape of 8 relational files which are derived from two separate Japanese websites that 
# collect user information: “Hot Pepper Gourmet (hpg): similar to Yelp” (search and reserve) 
# and “AirREGI / Restaurant Board (air): similar to Square” (reservation control and cash register). 
# The training data is based on the time range of Jan 2016 - most of Apr 2017, 
# while the test set includes the last week of Apr plus May 2017. 
# The test data "intentionally spans a holiday week in Japan called the 'Golden Week'."
# The data description further notes that: "There are days in the test set where the restaurants were
# closed and had no visitors. These are ignored in scoring. The training set omits days where the restaurants
# were closed."

# Those are the individual files:

# air_visit_data.csv: historical visit data for the air restaurants. 
# This is essentially the main training data set.

# air_reserve.csv / hpg_reserve.csv: reservations made through the air / hpg systems.

# air_store_info.csv / hpg_store_info.csv: details about the air / hpg restaurants including genre and location.

# store_id_relation.csv: connects the air and hpg ids

# date_info.csv: essentially flags the Japanese holidays.

# sample_submission.csv: serves as the test set. The id is formed by combining the air id with the visit date.


In [None]:
# Directory that holds the competition CSV files. Kept in one place so the
# notebook can be pointed at another location by editing a single line.
INPUT_DIR = '/Users/midas/pcc/input'

# One DataFrame per relational file described in the introduction.
air_visits = pd.read_csv(INPUT_DIR + '/air_visit_data.csv')
air_reserve = pd.read_csv(INPUT_DIR + '/air_reserve.csv')
hpg_reserve = pd.read_csv(INPUT_DIR + '/hpg_reserve.csv')
air_store = pd.read_csv(INPUT_DIR + '/air_store_info.csv')
hpg_store = pd.read_csv(INPUT_DIR + '/hpg_store_info.csv')
holidays = pd.read_csv(INPUT_DIR + '/date_info.csv')
store_ids = pd.read_csv(INPUT_DIR + '/store_id_relation.csv')
test = pd.read_csv(INPUT_DIR + '/sample_submission.csv')
# test_df = pd.read_csv('../input/sample_submission.csv')

In [None]:
# Every loaded frame, in load order, so later cells can loop over them.
tables = [
    air_visits,
    air_reserve,
    hpg_reserve,
    air_store,
    hpg_store,
    holidays,
    store_ids,
    test,
]

In [None]:
# Show the first rows of each table.
for table in tables:
    display(table.head())

In [None]:
# We want to combine the hpg data with the air data??
# Target columns for the combined table:
#   air_store_id, visit_date, visitors, air_genre_name, air_area_name, latitude, longitude,
#   day_of_week, holiday_flg


In [None]:
# DataFrameSummary is only imported in a later cell of this notebook; import it
# here as well so this cell does not raise NameError on a fresh top-to-bottom run.
from pandas_summary import DataFrameSummary

# Show the per-column summary of each table.
for table in tables:
    display(DataFrameSummary(table).summary())

In [None]:
# tables = [air_visits, air_reserve, hpg_reserve, air_store, 
#                hpg_store, holidays, store_ids, test]

# air_visits, air_store, holidays easy to join
# air reserve < air_visits
# store_ids !== total air stores
# hpg_store > air_store
# hpg_reserve > air_visits

# TODO: test needs to get added with ... (note left unfinished)


In [None]:
# Row counts of the visit history, the submission template and the id mapping.
tuple(len(frame) for frame in (air_visits, test, store_ids))

In [None]:
# Turn the holiday flag into a boolean column (True wherever the flag is non-zero).
holidays['holiday_flg'] = holidays['holiday_flg'].ne(0)

In [None]:
def join_df(left, right, left_on, right_on=None):
    """Left-join ``right`` onto ``left`` and return the merged frame.

    Every row of ``left`` is kept. ``left_on`` names the key column(s) in
    ``left``; ``right_on`` names the key column(s) in ``right`` and falls back
    to ``left_on`` when omitted. Columns of ``right`` whose names clash with
    columns of ``left`` receive the suffix ``"_y"``; the left-hand columns keep
    their names.
    """
    right_key = left_on if right_on is None else right_on
    return left.merge(
        right,
        how='left',
        left_on=left_on,
        right_on=right_key,
        suffixes=("", "_y"),
    )

In [None]:
# Reminder: join_df performs a LEFT join between two tables on a shared identifier (all rows of the left table are kept).
# For the holidays table, the join matches visit_date against calendar_date.

# Attach the calendar/holiday columns to every visit record.
training = join_df(
    air_visits,
    holidays,
    left_on="visit_date",
    right_on="calendar_date",
)

In [None]:
# Elsewhere in this notebook the sample-submission frame is only used through
# its `id` and `visitors` columns, with `id` being "<air_store_id>_<visit_date>".
# Derive the join key from `id` (split on the last underscore) instead of
# assuming a `visit_date` column exists. `test` itself is left unmodified.
id_parts = test['id'].str.rsplit('_', n=1, expand=True)
test_keyed = test.assign(air_store_id=id_parts[0], visit_date=id_parts[1])
testing = join_df(test_keyed, holidays, "visit_date", "calendar_date")

In [None]:
# air_visits = join_df(air_visits, air_reserve, "air_store_id", "air_store_id")
# num of unique air_reserve ids !== num of unique air_visit ids

In [None]:
# Add the store metadata; both frames use the same key name, so join_df's
# default right_on (= left_on) applies.
training2 = join_df(training, air_store, "air_store_id")

In [None]:
# Display the joined training frame for inspection.
training2 

In [None]:
# Number of visit rows that found no matching calendar entry.
int(training['holiday_flg'].isnull().sum())

In [None]:
# Number of visit rows that found no matching store-info entry.
int(training2['air_genre_name'].isnull().sum())

In [None]:
from utils2 import *
# Abbreviate printed NumPy arrays that have more than 50 elements, showing
# 20 items at each edge of the abbreviated output.
np.set_printoptions(threshold=50, edgeitems=20)

In [None]:
from isoweek import Week
from pandas_summary import DataFrameSummary

In [None]:
def concat_csvs(dirname):
    """Concatenate every ``*.csv`` file in ``dirname`` into ``<dirname>.csv``.

    The output file is written next to the ``dirname`` directory. Its header is
    ``file,`` followed by the header of the first (alphabetically sorted) input
    file; the header line of every other input file is skipped. Each data row
    is prefixed with the source file's name up to its first ``.``.

    Unlike a ``chdir``-based implementation, the working directory is never
    changed, so a failure while reading or writing cannot leave the process in
    a different directory. Empty input files are skipped, and exactly one
    newline terminates every emitted row regardless of whether the source file
    ends with one.

    Parameters
    ----------
    dirname : str
        Path of the directory containing the CSV files.

    Returns
    -------
    None
    """
    # `os` is not among the notebook's visible top-level imports.
    import os

    source_dir = os.path.normpath(dirname)
    output_path = source_dir + ".csv"
    # Sort so that the header source and the row order are deterministic.
    csv_paths = sorted(glob.glob(os.path.join(source_dir, "*.csv")))

    wrote_header = False
    with open(output_path, "w") as outputfile:
        for csv_path in csv_paths:
            name = os.path.basename(csv_path).split(".")[0]
            with open(csv_path) as f:
                header = f.readline()
                if not header:
                    # Completely empty file: nothing to contribute.
                    continue
                if not wrote_header:
                    wrote_header = True
                    outputfile.write("file," + header.rstrip("\n") + "\n")
                for line in f:
                    # Normalise the line ending so a missing final newline in
                    # one file cannot glue two rows together.
                    outputfile.write(name + "," + line.rstrip("\n") + "\n")




In [None]:
# Concatenate the CSV files of the `input` directory; the path is relative to
# the kernel's current working directory.
concat_csvs('input')

In [None]:
# Load the air store metadata for a quick look. (A preceding read of
# air_visit_data.csv into the same name was dropped: it was overwritten
# immediately and never used.)
df = pd.read_csv('/Users/midas/pcc/input/air_store_info.csv')

df


In [None]:
# Group by id and day of week - Median of the visitors is taken
# NOTE(review): neither `train` nor `aggregation` is defined in the cells
# visible here. Presumably `train` is the visit data with a `dow` column and
# `aggregation` selects the median of `visitors` -- confirm before running.
agg_data = train.groupby(['air_store_id', 'dow']).agg(aggregation).reset_index()
# Give the three resulting columns plain names.
agg_data.columns = ['air_store_id','dow','visitors']
# NOTE(review): self-assignment; has no effect as written.
agg_data['visitors'] = agg_data['visitors']


In [None]:
# Create the first intermediate submission file:
# NOTE(review): `test_df` is not defined in the visible cells (only a
# commented-out read of sample_submission.csv exists); it is assumed to carry
# `id`, `store_id` and `dow` columns -- confirm before running.
merged = pd.merge(
    test_df,
    agg_data,
    how='left',
    left_on=['store_id', 'dow'],
    right_on=['air_store_id', 'dow'],
)
# Rows without a matching (store, weekday) aggregate get 0 visitors. The filled
# frame is built in a single expression rather than with fillna(inplace=True)
# on a column slice, which can trigger SettingWithCopyWarning.
final = merged[['id', 'visitors']].fillna(0)


In [None]:
# originally from this kernel:
# https://www.kaggle.com/zeemeen/weighted-mean-running-10-sec-lb-0-509

# Read every CSV in ../input into a dict keyed by the file name without
# extension. The pattern is a raw string so `\.` is not an invalid escape.
dfs = {
    re.search(r'/([^/\.]*)\.csv', fn).group(1): pd.read_csv(fn)
    for fn in glob.glob('../input/*.csv')
}
# Expose each frame as a notebook-level variable named after its file
# (date_info, air_visit_data, sample_submission, ...). Updating globals()
# explicitly replaces writing through locals(), which Python documents as a
# dictionary that should not be modified.
globals().update(dfs)

# Holidays that fall on a weekend are treated as ordinary weekend days.
weekend_hdays = (
    date_info.day_of_week.isin(['Saturday', 'Sunday'])
    & (date_info.holiday_flg == 1)
)
date_info.loc[weekend_hdays, 'holiday_flg'] = 0
# Weight grows linearly with row position in date_info, so later rows of the
# calendar count more in the weighted mean.
date_info['weight'] = (date_info.index + 1) / len(date_info)

visit_data = air_visit_data.merge(
    date_info, left_on='visit_date', right_on='calendar_date', how='left'
).drop('calendar_date', axis=1)
# Work in log1p space; np.log1p replaces the removed `pd.np` alias.
visit_data['visitors'] = np.log1p(visit_data.visitors)

# Weighted mean of (log) visitors per store / weekday / holiday flag.
wmean = lambda x: ((x.weight * x.visitors).sum() / x.weight.sum())
visitors = visit_data.groupby(
    ['air_store_id', 'day_of_week', 'holiday_flg']).apply(wmean).reset_index()
visitors = visitors.rename(columns={0: 'visitors'})

# The submission id is "<air_store_id>_<date>": everything before the last
# underscore is the store id, the last piece is the date.
sample_submission['air_store_id'] = sample_submission.id.map(
    lambda x: '_'.join(x.split('_')[:-1]))
sample_submission['calendar_date'] = sample_submission.id.map(
    lambda x: x.split('_')[-1])
sample_submission = sample_submission.drop('visitors', axis=1)
sample_submission = sample_submission.merge(
    date_info, on='calendar_date', how='left')
sample_submission = sample_submission.merge(
    visitors, on=['air_store_id', 'day_of_week', 'holiday_flg'], how='left')



In [None]:
# fill missings with (air_store_id, day_of_week)
# Rows still lacking a prediction fall back to the non-holiday weighted mean
# of the same store and weekday.
missings = sample_submission.visitors.isnull()
non_holiday_means = visitors[visitors.holiday_flg==0]
fallback = sample_submission[missings].merge(
    non_holiday_means, on=('air_store_id', 'day_of_week'), how='left')
sample_submission.loc[missings, 'visitors'] = fallback['visitors_y'].values


In [None]:
# fill missings with (air_store_id)
# Last fallback: the mean over all of a store's group-level weighted means.
missings = sample_submission.visitors.isnull()
store_means = (
    visitors[['air_store_id', 'visitors']]
    .groupby('air_store_id').mean().reset_index()
)
sample_submission.loc[missings, 'visitors'] = sample_submission[missings].merge(
    store_means, on='air_store_id', how='left')['visitors_y'].values

# Map the predictions back with expm1; np.expm1 replaces the removed `pd.np`
# alias.
sample_submission['visitors'] = np.expm1(sample_submission.visitors)
sample_submission = sample_submission[['id', 'visitors']]

# Wherever the first model predicted 0, take the weighted-mean prediction
# instead. A single .loc assignment on an explicit copy replaces the chained
# `final['visitors'][mask] = ...`, which raises SettingWithCopyWarning and is
# not guaranteed to write through to `final`.
zero_mask = final['visitors'] == 0
final = final.copy()
final.loc[zero_mask, 'visitors'] = sample_submission.loc[zero_mask, 'visitors']
sub_file = final.copy()



In [None]:
# Display the submission frame for inspection.
sub_file

In [None]:
# Blend the two prediction columns three ways and write one submission file
# per blend. `sub_file` is reused, so it ends up holding the harmonic mean.
pred_a = final['visitors']
pred_b = sample_submission['visitors']

## Arithmetic Mean
arithmetic = np.mean([pred_a, pred_b], axis = 0)
sub_file['visitors'] = arithmetic
sub_file.to_csv('sub_math_mean.csv', index=False)

## Geometric Mean
geometric = (pred_a * pred_b) ** (1/2)
sub_file['visitors'] = geometric
sub_file.to_csv('sub_geo_mean.csv', index=False)

## Harmonic Mean
harmonic = 2/(1/pred_a + 1/pred_b)
sub_file['visitors'] = harmonic
sub_file.to_csv('sub_hrm_mean.csv', index=False)