In [85]:
#import all related library
import pandas as pd
from sklearn.ensemble import RandomForestRegressor as RF
from sklearn.cross_validation import train_test_split
import numpy as np
from __future__ import division

In [86]:
def ToWeight(y):
    w = np.zeros(y.shape, dtype=float)
    ind = y != 0
    w[ind] = 1./(y[ind]**2)
    return w

def RMSPE(y, yhat):
    w = ToWeight(y)
    rmspe = np.sqrt(np.mean( w * (y - yhat)**2 ))
    return rmspe

In [87]:
folder = 'data/'
real_train=pd.read_csv(folder + 'train.csv')
real_test=pd.read_csv(folder + 'test.csv')

In [88]:
#check the missing value in real_test table
print real_test.isnull().sum()

Id                0
Store             0
DayOfWeek         0
Date              0
Open             11
Promo             0
StateHoliday      0
SchoolHoliday     0
dtype: int64


In [89]:
#there are some missing value in Open columns, at this point we just simply fill in with 1
real_test.Open.fillna(1, inplace=True)

In [90]:
#check missing value again
print real_test.isnull().sum()

Id               0
Store            0
DayOfWeek        0
Date             0
Open             0
Promo            0
StateHoliday     0
SchoolHoliday    0
dtype: int64


In [91]:
#check the column type 
print real_test.dtypes

Id                 int64
Store              int64
DayOfWeek          int64
Date              object
Open             float64
Promo              int64
StateHoliday      object
SchoolHoliday      int64
dtype: object


In [92]:
#convert float type to int for less trouble in future
real_test['Open']=real_test['Open'].astype(int)

In [93]:
#check type again
print real_test.dtypes

Id                int64
Store             int64
DayOfWeek         int64
Date             object
Open              int64
Promo             int64
StateHoliday     object
SchoolHoliday     int64
dtype: object


In [94]:
#feature enginnering
real_train['Month'] = pd.DatetimeIndex(real_train['Date']).month
real_test['Month']=pd.DatetimeIndex(real_test['Date']).month

In [95]:
#convert some columns type to object for dummies
for col in ['DayOfWeek','Month','Promo','SchoolHoliday','Open','StateHoliday']:
    real_train[col]=real_train[col].astype(str)
    real_test[col]=real_test[col].astype(str)

In [96]:
#check the type
real_test.dtypes

Id                int64
Store             int64
DayOfWeek        object
Date             object
Open             object
Promo            object
StateHoliday     object
SchoolHoliday    object
Month            object
dtype: object

In [97]:
real_test.head()

Unnamed: 0,Id,Store,DayOfWeek,Date,Open,Promo,StateHoliday,SchoolHoliday,Month
0,1,1,4,2015-09-17,1,1,0,0,9
1,2,3,4,2015-09-17,1,1,0,0,9
2,3,7,4,2015-09-17,1,1,0,0,9
3,4,8,4,2015-09-17,1,1,0,0,9
4,5,9,4,2015-09-17,1,1,0,0,9


In [98]:
#get dummy table for train ready
categorical_variables=['DayOfWeek','Month','Promo','SchoolHoliday','Open','StateHoliday']
regular_variables=['Store','Date','Sales','Customers']

dummy_table=pd.DataFrame()
for var in categorical_variables:
    dummy_table=pd.concat([dummy_table,pd.get_dummies(real_train[var], prefix=var)], axis=1) 
    
real_train=pd.concat([dummy_table,real_train[regular_variables]],axis=1)


#get dummy table for test ready
categorical_variables=['DayOfWeek','Month','Promo','SchoolHoliday','Open','StateHoliday']
regular_variables=['Store','Date']

dummy_table=pd.DataFrame()
for var in categorical_variables:
    dummy_table=pd.concat([dummy_table,pd.get_dummies(real_test[var], prefix=var)], axis=1) 
    
real_test=pd.concat([dummy_table,real_test[regular_variables]],axis=1)

In [99]:
real_test.head()

Unnamed: 0,DayOfWeek_1,DayOfWeek_2,DayOfWeek_3,DayOfWeek_4,DayOfWeek_5,DayOfWeek_6,DayOfWeek_7,Month_8,Month_9,Promo_0,Promo_1,SchoolHoliday_0,SchoolHoliday_1,Open_0,Open_1,StateHoliday_0,StateHoliday_a,Store,Date
0,0,0,0,1,0,0,0,0,1,0,1,1,0,0,1,1,0,1,2015-09-17
1,0,0,0,1,0,0,0,0,1,0,1,1,0,0,1,1,0,3,2015-09-17
2,0,0,0,1,0,0,0,0,1,0,1,1,0,0,1,1,0,7,2015-09-17
3,0,0,0,1,0,0,0,0,1,0,1,1,0,0,1,1,0,8,2015-09-17
4,0,0,0,1,0,0,0,0,1,0,1,1,0,0,1,1,0,9,2015-09-17


In [100]:
# add some dummies columns to real_test table to make sure the size
# is consistant with training table
for varMonth in [1,2,3,4,5,6,7,10,11,12]:
    real_test['Month_'+str(varMonth)]=0

for varhol in ['b','c']:
    real_test['StateHoliday_'+varhol]=0

In [101]:
#check size consistancy
print real_test.shape
print real_train.shape

(41088, 31)
(1017209, 33)


In [102]:
#train RF model
X=real_train.drop(['Store','Sales','Customers','Date'],axis=1)
y=real_train['Sales']
RF_model=RF(n_jobs=-1)
RF_model.fit(X,y)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=-1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [103]:
#create submission file
myt=real_test.drop(['Store','Date'],axis=1)
preds=RF_model.predict(myt)

In [104]:
test=pd.read_csv(folder + 'test.csv')
result = pd.DataFrame({'Id': test.Id})
result['Sales']=preds
result.to_csv('baseline10-26.csv', index=False, sep=',')