In [1]:
import pandas as pd, numpy as np
import pickle
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import TimeSeriesSplit
from sklearn.feature_selection import SelectFromModel, VarianceThreshold
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import cross_val_score
from sklearn.ensemble import RandomForestRegressor
import warnings
warnings.filterwarnings('ignore')

  from numpy.core.umath_tests import inner1d


### Import the datasets

In [2]:
# Load the test set from its CSV file.
test = pd.read_csv(filepath_or_buffer='../assets/test_df.csv')

In [3]:
# Load the pickled X_train object; its columns are compared against the test
# frame's columns further down.
# NOTE: pickle.load can execute arbitrary code, so only load trusted files.
with open('../assets/X_train.pkl', mode='rb') as train_file:
    X_train = pickle.load(train_file)

### Imputing missing values

In [4]:
# Replace missing PromoInterval values with the 'N/A' sentinel that later cells
# test for.
# The filled column is assigned back explicitly: calling fillna(inplace=True)
# on the attribute-accessed column is a chained in-place operation, which acts
# on a temporary under pandas Copy-on-Write and is not guaranteed to update
# `test`.
test['PromoInterval'] = test['PromoInterval'].fillna('N/A')

### Date as index

In [5]:
# Parse the Date strings into timestamps, then index the frame by them.
# drop=False keeps Date available as a regular column as well.
test['Date'] = pd.to_datetime(test['Date'])
test = test.set_index('Date', drop=False)

### Holidays

In [9]:
# Relabel the '0' code as 'NoHoliday'; every other value is left untouched.
state_holiday = test['StateHoliday']
test['StateHoliday'] = state_holiday.mask(state_holiday == '0', 'NoHoliday')

We can use the same format that we used for the training dataset. In fact, the test and training data should have matching columns.

### New promo 2

Since promo2 is improperly merged, we need to create a new feature that indicates when promo2 actually starts.

In [10]:
def _promo2_on(df):
    """Return a boolean Series: True where Promo2 is active on the row's Date.

    Promo2 counts as active when the date falls in the Promo2SinceYear at or
    after Promo2SinceWeek, or in any later year. The function uses only the
    frame it is given, so it does not depend on the global `test` name.
    """
    year = df.Date.map(lambda ts: ts.year)  # calendar year, computed once
    week = df.Date.map(lambda ts: ts.week)  # week number of each timestamp
    started_this_year = (year == df.Promo2SinceYear) & (week >= df.Promo2SinceWeek)
    started_earlier = year > df.Promo2SinceYear
    return started_this_year | started_earlier


test = test.assign(Promo2On=_promo2_on)

In [11]:
# Remove the Promo2 column.
test = test.drop(columns=['Promo2'])

Since we have an actual feature that is called Promo2On, we can drop the Promo2 feature.

### Change Data Type

In [12]:
# Cast the "since" month/week/year columns to integers in one astype call.
int_columns = [
    'CompetitionOpenSinceMonth',
    'CompetitionOpenSinceYear',
    'Promo2SinceWeek',
    'Promo2SinceYear',
]
test = test.astype({name: int for name in int_columns})

#### Create new features

In [13]:
# Split each PromoInterval string on commas into a list of month labels.
test['Old'] = test['PromoInterval'].map(lambda interval: interval.split(','))

In [14]:
# First month label of the interval, or 0 when the interval is the 'N/A' sentinel.
test['FirstPromo2'] = [
    0 if 'N/A' in months else months[0]
    for months in test['Old']
]

In [15]:
# Second month label of the interval, or 0 when the interval is the 'N/A' sentinel.
test['SecondPromo2'] = [
    0 if 'N/A' in months else months[1]
    for months in test['Old']
]

In [16]:
# Third month label of the interval, or 0 when the interval is the 'N/A' sentinel.
test['ThirdPromo2'] = [
    0 if 'N/A' in months else months[2]
    for months in test['Old']
]

In [17]:
# Fourth month label of the interval, or 0 when the interval is the 'N/A' sentinel.
test['FourthPromo2'] = [
    0 if 'N/A' in months else months[3]
    for months in test['Old']
]

In [18]:
# Map each month label to its month number (1-12).
# September is spelled 'Sept' here, unlike the other three-letter labels.
month_convert = {
    label: number
    for number, label in enumerate(
        ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
         'Jul', 'Aug', 'Sept', 'Oct', 'Nov', 'Dec'],
        start=1,
    )
}

In [19]:
def set_month(row):
    """Return the Promo2 start-month label that applies to the row's Date.

    Rows whose PromoInterval is 'N/A' yield 'N/A'. Otherwise the first, second
    and third start months are checked in order, and the one whose window
    [its month, next start month) contains the row's calendar month is
    returned. If none matches, the fourth start month is returned, which
    includes dates earlier in the year than the first start month.
    """
    if row.PromoInterval == 'N/A':
        return 'N/A'

    current_month = row.Date.month
    starts = (row.FirstPromo2, row.SecondPromo2, row.ThirdPromo2, row.FourthPromo2)

    # Pair each start month with the following one and test the half-open window.
    for begin, following in zip(starts, starts[1:]):
        if month_convert[begin] <= current_month < month_convert[following]:
            return begin

    return row.FourthPromo2

In [20]:
# Apply set_month to every row to get the Promo2 start month in effect.
test = test.assign(Promo2BeginMonth=test.apply(set_month, axis='columns'))

In [21]:
# Flag rows whose CompetitionDistance is at or above the column median (1),
# otherwise 0. The median is computed once up front; evaluating it inside a
# per-row lambda would rescan the whole column for every row.
# Rows with a missing distance compare False and therefore get 0.
competition_distance_median = test.CompetitionDistance.median()
test['LongDistance'] = (
    test.CompetitionDistance >= competition_distance_median
).astype('int64')

#### Drop unneeded columns

In [22]:
# Remove the date columns (the DatetimeIndex is kept) and the helper columns
# that were only needed to derive Promo2BeginMonth.
unneeded_columns = [
    'Date.1', 'Date', 'Old',
    'FirstPromo2', 'SecondPromo2', 'ThirdPromo2', 'FourthPromo2',
]
test = test.drop(columns=unneeded_columns)

#### New feature from index

In [23]:
# Derive the calendar month from the DatetimeIndex.
test = test.assign(Month=test.index.month)

### One-hot encoding

In [24]:
# One-hot encode the categorical columns; each is replaced by its dummy columns.
categorical_columns = [
    'StateHoliday', 'StoreType', 'Assortment',
    'Month', 'Promo2BeginMonth', 'PromoInterval',
]
test = pd.get_dummies(test, columns=categorical_columns)

In [26]:
# Columns present in the test frame but absent from X_train.
set(test.columns).difference(X_train.columns)

{'Id'}

In [32]:
# Columns that X_train has but the test frame lacks.
missing_cols = set(X_train.columns).difference(test.columns)

In [33]:
# Add every training-only column to the test frame as an all-zero feature.
# Walk X_train's columns rather than the set itself: a set iterates in
# arbitrary order, which would make the position of the appended columns
# differ from run to run.
for col in X_train.columns:
    if col in missing_cols:
        test[col] = 0

In [34]:
# Rows with Open == 1, which get a model prediction.
# Take an explicit copy so that adding a column to this subset later does not
# write into a slice of `test` (the SettingWithCopy situation).
sales_lasso = test[test.Open == 1].copy()

In [35]:
# Rows with Open == 0, which are assigned a prediction of 0 later on.
# Take an explicit copy so that adding a column to this subset later does not
# write into a slice of `test` (the SettingWithCopy situation).
# NOTE(review): rows where Open is missing fall into neither subset; confirm
# the input has no missing Open values.
no_sales_lasso = test[test.Open == 0].copy()

### Prediction

##### Lasso

In [30]:
# Load the pickled Lasso estimator used for prediction below.
# NOTE: pickle.load can execute arbitrary code, so only load trusted files.
with open('../assets/gs_lasso.pkl', mode='rb') as model_file:
    gs_lasso = pickle.load(model_file)

In [36]:
# Predict for the open-store rows.
# The features are selected by X_train's column list so they reach the
# estimator in X_train's column order (presumably the order the model was
# fitted on — confirm). The test frame's own order differs because the
# training-only columns were appended at the end, and an estimator that reads
# its input positionally would silently mix the features up. Selecting these
# columns also leaves out 'Id'.
lasso_preds = gs_lasso.predict(sales_lasso[X_train.columns])

In [37]:
# Attach the Lasso predictions to the open-store rows.
sales_lasso = sales_lasso.assign(Prediction=lasso_preds)

In [38]:
# Rows with Open == 0 get a prediction of 0 directly.
no_sales_lasso = no_sales_lasso.assign(Prediction=0)

In [39]:
# Stack the open and closed rows and keep only the two submission columns.
lasso_combined = pd.concat([sales_lasso, no_sales_lasso], axis=0)
lasso_submission = lasso_combined[['Id', 'Prediction']]

###### Random Forest

In [40]:
# Load the pickled random-forest estimator used for prediction below.
# NOTE: pickle.load can execute arbitrary code, so only load trusted files.
with open('../assets/gs_rf.pkl', mode='rb') as model_file:
    gs_rf = pickle.load(model_file)

In [41]:
# Rows with Open == 1, which get a random-forest prediction.
# Take an explicit copy so that adding a column to this subset later does not
# write into a slice of `test` (the SettingWithCopy situation).
sales_rf = test[test.Open == 1].copy()

In [42]:
# Rows with Open == 0, which are assigned a prediction of 0 later on.
# Take an explicit copy so that adding a column to this subset later does not
# write into a slice of `test` (the SettingWithCopy situation).
# NOTE(review): rows where Open is missing fall into neither subset; confirm
# the input has no missing Open values.
no_sales_rf = test[test.Open == 0].copy()

In [43]:
# Predict for the open-store rows.
# The features are selected by X_train's column list so they reach the
# estimator in X_train's column order (presumably the order the model was
# fitted on — confirm). The test frame's own order differs because the
# training-only columns were appended at the end, and an estimator that reads
# its input positionally would silently mix the features up. Selecting these
# columns also leaves out 'Id'.
rf_preds = gs_rf.predict(sales_rf[X_train.columns])

In [44]:
# Attach the random-forest predictions to the open-store rows.
sales_rf = sales_rf.assign(Prediction=rf_preds)

In [45]:
# Rows with Open == 0 get a prediction of 0 directly.
no_sales_rf = no_sales_rf.assign(Prediction=0)

In [46]:
# Stack the open and closed rows and keep only the two submission columns.
rf_combined = pd.concat([sales_rf, no_sales_rf], axis=0)
rf_submission = rf_combined[['Id', 'Prediction']]