In [342]:
import pandas as pd, numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import pickle
from sklearn.model_selection import TimeSeriesSplit

### Import data

In [343]:
# Read the training frame from the assets folder.
df = pd.read_csv(filepath_or_buffer='../assets/date_df.csv')

In [344]:
# Read the store table and the test set; the two reads are independent.
store = pd.read_csv(filepath_or_buffer='../assets/clean_store.csv')
test = pd.read_csv(filepath_or_buffer='../data/test.csv')

In [345]:
# Convert the 'date' column to datetimes, then use it as the index.
# set_index receives the Series itself (not the column label), so 'date'
# also remains available as a regular column.
df = df.assign(date=pd.to_datetime(df['date']))
df = df.set_index(df['date'])

In [346]:
# Replace missing promointerval values with the placeholder 'N/A'.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on df.promointerval: that chained in-place call acts on
# an intermediate Series and does not update df under pandas Copy-on-Write.
df['promointerval'] = df['promointerval'].fillna('N/A')

### Changing test column name

In [347]:
# Lower-case every column label of the test frame.
test.columns = test.columns.str.lower()

#### Checking test dataset

In [348]:
# Display the test rows whose 'open' value is missing.
test.loc[test['open'].isna()]

Unnamed: 0,id,store,dayofweek,date,open,promo,stateholiday,schoolholiday
479,480,622,4,2015-09-17,,1,0,0
1335,1336,622,3,2015-09-16,,1,0,0
2191,2192,622,2,2015-09-15,,1,0,0
3047,3048,622,1,2015-09-14,,1,0,0
4759,4760,622,6,2015-09-12,,0,0,0
5615,5616,622,5,2015-09-11,,0,0,0
6471,6472,622,4,2015-09-10,,0,0,0
7327,7328,622,3,2015-09-09,,0,0,0
8183,8184,622,2,2015-09-08,,0,0,0
9039,9040,622,1,2015-09-07,,0,0,0


Based on observations from the training dataset, store 622 is not under renovation. Also, the store usually closes when dayofweek is 7, but day 7 does not appear among these rows. Therefore, it is safe to impute the missing `open` values with 1.

In [349]:
# Impute the missing 'open' values with 1.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on test.open: that chained in-place call acts on an
# intermediate Series and does not update test under pandas Copy-on-Write.
test['open'] = test['open'].fillna(1)

### Merging the test data with store

In [350]:
# Attach the store attributes to every test row.
# The join key is named explicitly so an accidentally shared column can never
# become part of the key, and validate='many_to_one' raises a MergeError if
# the store table holds duplicate 'store' rows (which would multiply test rows).
# NOTE(review): how='inner' drops test rows whose store is absent from the
# store table — confirm no test ids are lost.
new_test = pd.merge(test, store, how='inner', on='store', validate='many_to_one')

In [351]:
# Remove the 'Unnamed: 0' column from the merged frame.
new_test = new_test.drop(columns=['Unnamed: 0'])

In [352]:
# Count the promo2 values among rows whose promointerval is missing.
new_test.loc[new_test['promointerval'].isna(), 'promo2'].value_counts()

0    17232
Name: promo2, dtype: int64

In [353]:
# Replace missing promointerval values with the placeholder 'N/A'.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on new_test.promointerval: that chained in-place call
# acts on an intermediate Series and does not update new_test under pandas
# Copy-on-Write.
new_test['promointerval'] = new_test['promointerval'].fillna('N/A')

As observed above, the missing values in the promointerval feature occur only where the promo2 column is 0. Therefore, I impute them with 'N/A'.

#### Change Index

In [354]:
# Convert the 'date' column to datetimes, then use it as the index.
# set_index receives the Series itself (not the column label), so 'date'
# also remains available as a regular column.
new_test = new_test.assign(date=pd.to_datetime(new_test['date']))
new_test = new_test.set_index(new_test['date'])

### New features from date

In [355]:
# Derive calendar features from the datetime index.
df = df.assign(
    year=df.index.year,
    month=df.index.month,
    quarter=df.index.quarter,
)

In [356]:
# Derive calendar features from the datetime index.
new_test = new_test.assign(
    year=new_test.index.year,
    month=new_test.index.month,
    quarter=new_test.index.quarter,
)

### Create new features

In [357]:
# Months between the competitor's opening (year/month) and the row's date:
# year difference converted to months, plus the month difference.
df['compete_duration'] = (
    (df['year'] - df['competitionopensinceyear']) * 12
    + (df['month'] - df['competitionopensincemonth'])
)

In [358]:
# Months between the competitor's opening (year/month) and the row's date:
# year difference converted to months, plus the month difference.
new_test['compete_duration'] = (
    (new_test['year'] - new_test['competitionopensinceyear']) * 12
    + (new_test['month'] - new_test['competitionopensincemonth'])
)

Based on the length of competition, it might be possible that companies have more aggressive promotions. 

In [359]:
# Summary statistics of the competitor distance in the training frame.
df['competitiondistance'].describe()

count    1.017209e+06
mean     5.422021e+03
std      7.706918e+03
min      2.000000e+01
25%      7.100000e+02
50%      2.325000e+03
75%      6.880000e+03
max      7.586000e+04
Name: competitiondistance, dtype: float64

In [360]:
# Flag rows whose competitor distance is strictly above 6880.0 (the 75%
# value reported by describe() on this column). Missing distances compare
# as False and therefore become 0.
df['long_distance'] = (df['competitiondistance'] > 6880.0).astype('int64')

In [361]:
# Show the 75% entry of the competitor-distance summary.
df['competitiondistance'].describe().loc['75%']

6880.0

In [362]:
# Flag test rows whose competitor distance is strictly above 6880.0.
# This is the same cut-off used for df['long_distance'] (the 75% value of the
# training competitiondistance). The previous 27190.0 gave the feature a
# different meaning in the test frame than in the training frame.
# Missing distances compare as False and therefore become 0.
new_test['long_distance'] = (new_test['competitiondistance'] > 6880.0).astype('int64')

Based on the distance to competitors, companies might have a higher chance of having promotions. Therefore, I create a new feature called long_distance using the 75th percentile of the competitors' distance (6880.0 in the training data) as the cut-off.

In [363]:
# Approximate number of months since promo2 started: year difference in
# months, plus the current month minus the start week divided by 4.
# NOTE(review): week / 4 is a rough week-to-month conversion (week 52 maps
# to 13) — confirm this approximation is intended.
df['promo2_duration'] = (
    (df['year'] - df['promo2sinceyear']) * 12
    + (df['month'] - df['promo2sinceweek'] / 4)
)

In [364]:
# Approximate number of months since promo2 started: year difference in
# months, plus the current month minus the start week divided by 4.
# NOTE(review): week / 4 is a rough week-to-month conversion (week 52 maps
# to 13) — confirm this approximation is intended.
new_test['promo2_duration'] = (
    (new_test['year'] - new_test['promo2sinceyear']) * 12
    + (new_test['month'] - new_test['promo2sinceweek'] / 4)
)

Since the length of promo2 is related to the total sales of each store, calculating the length of promo2 will be beneficial.

In [365]:
# Persist the training frame (with the engineered features so far) as CSV.
df.to_csv(path_or_buf='../assets/new_df.csv')

#### Create new features

The two datasets have different features; therefore, we need to match their column names.

In [366]:
# Columns present in the test frame but absent from the training frame.
set(new_test.columns).difference(df.columns)

{'id'}

In [367]:
# Add a constant-zero 'stateholiday_0' column to the training frame, so it is
# part of the column comparison performed in the next cells.
# NOTE(review): the later get_dummies call on 'stateholiday' may emit a column
# with this same name, which would leave duplicate labels — confirm this
# placeholder is still needed.
df['stateholiday_0']= 0

In [368]:
# Columns present in the training frame but absent from the test frame.
missing_cols = set(df.columns).difference(new_test.columns)

In [369]:
# Give the test frame every column it lacks relative to the training frame,
# filled with 0. The columns are appended in the iteration order of
# missing_cols.
new_test = new_test.assign(**{absent: 0 for absent in missing_cols})

### One hot coding

In order to use machine learning, all the values need to be either discrete or continuous variables, which means that we need to change string values to some sort of numerical values. Based on the observations in the EDA notebook, some of these features contain important information.

In [370]:
# One-hot encode the listed categorical columns of the training frame;
# each source column is replaced by its '<column>_<value>' indicators.
df = pd.get_dummies(
    df,
    columns=['stateholiday', 'storetype', 'assortment', 'promointerval', 'dayofweek'],
)

In [371]:
# One-hot encode the listed categorical columns of the test frame;
# each source column is replaced by its '<column>_<value>' indicators.
# NOTE(review): indicator columns are created only for values that occur in
# this frame, and no re-alignment with the training columns happens after
# this step — confirm the two frames end up with matching columns.
new_test = pd.get_dummies(
    new_test,
    columns=['stateholiday', 'storetype', 'assortment', 'promointerval', 'dayofweek'],
)

In [372]:
# Persist the encoded test frame as CSV.
new_test.to_csv(path_or_buf='../assets/new_test.csv')

### Dropping columns

Dropping columns that I do not need later, including the customers feature. Although customers has the highest correlation with sales, the test dataset does not include a customers column. Also dropping some features that carry redundant information, such as date, year, month, and quarter (note: the drop calls below do not actually remove month).

In [308]:
# Remove the columns that are not used as model inputs from the training
# frame. drop() raises if any listed label is missing.
df = df.drop(columns=[
    'date',
    'customers',
    'year',
    'quarter',
    'competitionopensincemonth',
    'schoolholiday',
    'competitionopensinceyear',
    'promo2sinceweek',
    'promo2sinceyear',
    'competitiondistance',
])

In [309]:
# Remove the columns that are not used as model inputs from the test frame.
# NOTE(review): unlike the training-frame drop, 'customers' and
# 'schoolholiday' are not listed here — confirm that difference is intended.
new_test = new_test.drop(columns=[
    'date',
    'year',
    'quarter',
    'competitionopensincemonth',
    'competitionopensinceyear',
    'promo2sinceweek',
    'promo2sinceyear',
    'competitiondistance',
])

In [312]:
# Separate the target ('sales') from the feature matrix.
y = df['sales']
X = df.drop(columns=['sales'])

#### Export the data

In [314]:
# Serialize the feature matrix to disk with pickle.
with open('../assets/X.pkl', 'wb+') as handle:
    pickle.dump(X, handle)

In [315]:
# Serialize the target series to disk with pickle.
with open('../assets/y.pkl', 'wb+') as handle:
    pickle.dump(y, handle)