In [41]:
import pandas as pd
import numpy as np
%matplotlib inline

In [42]:
#%ls -l

In [43]:
air_reserve = pd.read_csv('air_reserve.csv',parse_dates=[1,2])

In [44]:
print ('Dataset shape %s'%str(air_reserve.shape))
print ('Dataset columns %s'%str(air_reserve.columns))
print ('Dataset NAN check %s'%air_reserve.isna().any())
print ('Dataset size %s'%air_reserve.size)

Dataset shape (92378, 4)
Dataset columns Index(['air_store_id', 'visit_datetime', 'reserve_datetime',
       'reserve_visitors'],
      dtype='object')
Dataset NAN check air_store_id        False
visit_datetime      False
reserve_datetime    False
reserve_visitors    False
dtype: bool
Dataset size 369512


In [45]:
air_store_info = pd.read_csv('air_store_info.csv')

In [46]:
print ('Dataset shape %s'%str(air_store_info.shape))
print ('Dataset columns %s'%str(air_store_info.columns))
print ('Dataset NAN check %s'%air_store_info.isna().any())
print ('Dataset size %s'%air_store_info.size)

Dataset shape (829, 5)
Dataset columns Index(['air_store_id', 'air_genre_name', 'air_area_name', 'latitude',
       'longitude'],
      dtype='object')
Dataset NAN check air_store_id      False
air_genre_name    False
air_area_name     False
latitude          False
longitude         False
dtype: bool
Dataset size 4145


In [47]:
print (air_reserve.dtypes)
#air_reserve.head(n=5)

air_store_id                object
visit_datetime      datetime64[ns]
reserve_datetime    datetime64[ns]
reserve_visitors             int64
dtype: object


In [48]:
print (air_store_info.dtypes)
#air_store_info.head(n=5)

air_store_id       object
air_genre_name     object
air_area_name      object
latitude          float64
longitude         float64
dtype: object


In [49]:
air_merged_data = air_reserve.merge(air_store_info,on='air_store_id',how='left')

In [50]:
air_merged_data.dtypes
#air_merged_data.head(n=5)

air_store_id                object
visit_datetime      datetime64[ns]
reserve_datetime    datetime64[ns]
reserve_visitors             int64
air_genre_name              object
air_area_name               object
latitude                   float64
longitude                  float64
dtype: object

In [51]:
hpg_reserve = pd.read_csv('hpg_reserve.csv',parse_dates=[1,2])

In [52]:
hpg_reserve.dtypes
#hpg_reserve.head(n=5)

hpg_store_id                object
visit_datetime      datetime64[ns]
reserve_datetime    datetime64[ns]
reserve_visitors             int64
dtype: object

In [53]:
print ('Dataset shape %s'%str(hpg_reserve.shape))
print ('Dataset columns %s'%str(hpg_reserve.columns))
print ('Dataset NAN check %s'%hpg_reserve.isna().any())
print ('Dataset size %s'%hpg_reserve.size)

Dataset shape (2000320, 4)
Dataset columns Index(['hpg_store_id', 'visit_datetime', 'reserve_datetime',
       'reserve_visitors'],
      dtype='object')
Dataset NAN check hpg_store_id        False
visit_datetime      False
reserve_datetime    False
reserve_visitors    False
dtype: bool
Dataset size 8001280


In [54]:
hpg_store_info = pd.read_csv('hpg_store_info.csv')

In [55]:
hpg_store_info.dtypes
#hpg_store_info.head(n=5)

hpg_store_id       object
hpg_genre_name     object
hpg_area_name      object
latitude          float64
longitude         float64
dtype: object

In [56]:
print ('Dataset shape %s'%str(hpg_store_info.shape))
print ('Dataset columns %s'%str(hpg_store_info.columns))
print ('Dataset NAN check %s'%hpg_store_info.isna().any())
print ('Dataset size %s'%hpg_store_info.size)

Dataset shape (4690, 5)
Dataset columns Index(['hpg_store_id', 'hpg_genre_name', 'hpg_area_name', 'latitude',
       'longitude'],
      dtype='object')
Dataset NAN check hpg_store_id      False
hpg_genre_name    False
hpg_area_name     False
latitude          False
longitude         False
dtype: bool
Dataset size 23450


In [57]:
hpg_merged_data = hpg_reserve.merge(hpg_store_info,on='hpg_store_id',how='left')

In [58]:
hpg_merged_data.dtypes
#hpg_merged_data.head(n=5)

hpg_store_id                object
visit_datetime      datetime64[ns]
reserve_datetime    datetime64[ns]
reserve_visitors             int64
hpg_genre_name              object
hpg_area_name               object
latitude                   float64
longitude                  float64
dtype: object

In [59]:
common_col_names = ['store_id', 'visit_datetime', 'reserve_datetime',
       'reserve_visitors', 'genre_name', 'area_name', 'latitude',
       'longitude']

In [60]:
air_merged_data.columns = common_col_names
hpg_merged_data.columns = common_col_names

In [61]:
air_merged_data['source_type'] = 0 # for air
hpg_merged_data['source_type'] = 1 # for hpg

In [62]:
print(air_merged_data.shape)
print(hpg_merged_data.shape)

(92378, 9)
(2000320, 9)


In [63]:
final_dataset = pd.concat([air_merged_data,hpg_merged_data])

In [64]:
final_dataset.shape[0] == air_merged_data.shape[0] + hpg_merged_data.shape[0]

True

In [65]:
final_dataset.dtypes
#final_dataset.head(5)

store_id                    object
visit_datetime      datetime64[ns]
reserve_datetime    datetime64[ns]
reserve_visitors             int64
genre_name                  object
area_name                   object
latitude                   float64
longitude                  float64
source_type                  int64
dtype: object

In [66]:
del hpg_merged_data,air_merged_data
import gc
gc.enable()
gc.collect()

123

In [67]:
final_dataset=pd.get_dummies(final_dataset,columns=['genre_name'])

In [68]:
def weekend(date):
    if date:
        return 1 if pd.to_datetime(date).weekday_name in ['Saturday','Sunday'] else 0
    else:
        return np.NAN

In [69]:
vectorized_weekend = np.vectorize(weekend)
print (final_dataset.shape)

(2092698, 52)


In [70]:
%%time
final_dataset['Weekend_Or_Week_day'] = vectorized_weekend(final_dataset['visit_datetime'].values)

CPU times: user 8min 44s, sys: 260 ms, total: 8min 44s
Wall time: 8min 48s


In [71]:
final_dataset=final_dataset.reset_index()

In [72]:
train_data = final_dataset[final_dataset['visit_datetime']<=pd.to_datetime('23-04-2017')]
test_data = final_dataset[final_dataset['visit_datetime']>pd.to_datetime('23-04-2017')]

del final_dataset
import gc
gc.collect()

210

In [73]:
print(train_data.shape)
print(test_data.shape)

(2055268, 54)
(37430, 54)


In [75]:
train_data = train_data[:int(train_data.shape[0]/10)]

In [80]:
total_columns = train_data.columns.values.tolist()
total_columns.remove('store_id')
total_columns.remove('visit_datetime')
total_columns.remove('reserve_datetime')
total_columns.remove('reserve_visitors')
total_columns.remove('area_name')

train_X = train_data[total_columns].values
train_y = train_data['reserve_visitors'].values
test_X = test_data[total_columns].values
test_y = test_data['reserve_visitors'].values

In [81]:
print(train_X.shape)
print(train_y.shape)
print(test_X.shape)
print(test_y.shape)

(205526, 49)
(205526,)
(37430, 49)
(37430,)


In [85]:
from sklearn.linear_model import LinearRegression,Lasso

lr = LinearRegression()

lasso = Lasso()

lasso.fit(train_X,train_y)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').