In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import warnings

warnings.filterwarnings('ignore')

#### 1. Import and merge data

In [2]:
# Load the per-store, per-date feature table and preview its first five rows.
features = pd.read_csv(filepath_or_buffer="data/features.csv")
features.head(n=5)

Unnamed: 0,Store,Date,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,IsHoliday
0,1,2010-02-05,42.31,2.572,,,,,,211.096358,8.106,False
1,1,2010-02-12,38.51,2.548,,,,,,211.24217,8.106,True
2,1,2010-02-19,39.93,2.514,,,,,,211.289143,8.106,False
3,1,2010-02-26,46.63,2.561,,,,,,211.319643,8.106,False
4,1,2010-03-05,46.5,2.625,,,,,,211.350143,8.106,False


In [3]:
# Load the store metadata (Type and Size per Store) and preview its first five rows.
stores = pd.read_csv(filepath_or_buffer="data/stores.csv")
stores.head(n=5)

Unnamed: 0,Store,Type,Size
0,1,A,151315
1,2,A,202307
2,3,B,37392
3,4,A,205863
4,5,B,34875


In [4]:
# Load the rows to preprocess and preview the first five.
# NOTE: despite its name, `train` holds the contents of data/test.csv.
train = pd.read_csv(filepath_or_buffer="data/test.csv")
train.head(n=5)

Unnamed: 0,Store,Dept,Date,IsHoliday
0,1,1,2012-11-02,False
1,1,1,2012-11-09,False
2,1,1,2012-11-16,False
3,1,1,2012-11-23,True
4,1,1,2012-11-30,False


In [5]:
# Attach each store's Type and Size to its feature rows (inner join on Store).
features_join_stores = features.merge(stores, on="Store")
features_join_stores.head(n=5)

Unnamed: 0,Store,Date,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,IsHoliday,Type,Size
0,1,2010-02-05,42.31,2.572,,,,,,211.096358,8.106,False,A,151315
1,1,2010-02-12,38.51,2.548,,,,,,211.24217,8.106,True,A,151315
2,1,2010-02-19,39.93,2.514,,,,,,211.289143,8.106,False,A,151315
3,1,2010-02-26,46.63,2.561,,,,,,211.319643,8.106,False,A,151315
4,1,2010-03-05,46.5,2.625,,,,,,211.350143,8.106,False,A,151315


In [6]:
# Join the (Store, Dept, Date) rows with the store-level features for the same
# store and date; IsHoliday is part of the key because both frames carry it.
merge_keys = ["Store", "Date", "IsHoliday"]
data = train.merge(features_join_stores, on=merge_keys)
data

Unnamed: 0,Store,Dept,Date,IsHoliday,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,Type,Size
0,1,1,2012-11-02,False,55.32,3.386,6766.44,5147.70,50.82,3639.90,2737.42,223.462779,6.573,A,151315
1,1,2,2012-11-02,False,55.32,3.386,6766.44,5147.70,50.82,3639.90,2737.42,223.462779,6.573,A,151315
2,1,3,2012-11-02,False,55.32,3.386,6766.44,5147.70,50.82,3639.90,2737.42,223.462779,6.573,A,151315
3,1,4,2012-11-02,False,55.32,3.386,6766.44,5147.70,50.82,3639.90,2737.42,223.462779,6.573,A,151315
4,1,5,2012-11-02,False,55.32,3.386,6766.44,5147.70,50.82,3639.90,2737.42,223.462779,6.573,A,151315
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115059,45,93,2013-07-26,False,76.06,3.804,212.02,851.73,2.06,10.88,1864.57,,,B,118221
115060,45,94,2013-07-26,False,76.06,3.804,212.02,851.73,2.06,10.88,1864.57,,,B,118221
115061,45,95,2013-07-26,False,76.06,3.804,212.02,851.73,2.06,10.88,1864.57,,,B,118221
115062,45,97,2013-07-26,False,76.06,3.804,212.02,851.73,2.06,10.88,1864.57,,,B,118221


#### 2. Check descriptive statistics

In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns of the merged frame.
data.describe() # Check descriptive statistics

Unnamed: 0,Store,Dept,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,Size
count,115064.0,115064.0,115064.0,115064.0,114915.0,86437.0,105235.0,102176.0,115064.0,76902.0,76902.0,115064.0
mean,22.238207,44.339524,53.941804,3.581546,7689.216439,3734.051729,2403.088666,3356.219071,3922.681189,176.961347,6.868733,136497.688921
std,12.80993,30.65641,18.724153,0.239442,10698.760716,8323.495014,13767.939313,7570.501545,19445.150745,41.239967,1.583427,61106.926438
min,1.0,1.0,-7.29,2.872,-2781.45,-35.74,-179.26,0.22,-185.17,131.236226,3.684,34875.0
25%,11.0,18.0,39.82,3.431,1966.46,180.35,15.1,155.46,1309.3,138.402033,5.771,93638.0
50%,22.0,37.0,54.47,3.606,4842.29,742.59,78.26,840.94,2390.43,192.304445,6.806,140167.0
75%,33.0,74.0,67.35,3.766,9439.14,2735.67,272.58,3096.92,4227.27,223.244532,8.036,202505.0
max,45.0,99.0,101.95,4.125,103184.98,71074.17,149483.31,65344.64,771448.1,228.976456,10.199,219622.0


#### 3. Check missing values

In [8]:
# Percentage of missing values per column: NaN count * 100 / number of rows.
data.isna().sum().mul(100).div(len(data))

Store            0.000000
Dept             0.000000
Date             0.000000
IsHoliday        0.000000
Temperature      0.000000
Fuel_Price       0.000000
MarkDown1        0.129493
MarkDown2       24.879198
MarkDown3        8.542203
MarkDown4       11.200723
MarkDown5        0.000000
CPI             33.165890
Unemployment    33.165890
Type             0.000000
Size             0.000000
dtype: float64

The `MarkDown1`–`MarkDown4` columns contain missing values (up to about 25% for `MarkDown2` in the table above; `MarkDown5` has none). We will drop all five `MarkDown` columns.

In [9]:
# Remove the five promotional markdown columns from the working frame.
markdown_columns = ["MarkDown1", "MarkDown2", "MarkDown3", "MarkDown4", "MarkDown5"]
data = data.drop(columns=markdown_columns)

In [10]:
# Fill remaining gaps (CPI and Unemployment) with each column's mean.
# numeric_only=True is required: the frame still holds text columns (Date, Type),
# and pandas >= 2.0 raises TypeError instead of silently skipping them.
# NOTE(review): the means are computed on this (test) data itself — confirm that is intended.
data = data.fillna(data.mean(numeric_only=True))

In [11]:
# Re-check the percentage of missing values per column after dropping and imputing.
data.isna().sum().mul(100).div(len(data))

Store           0.0
Dept            0.0
Date            0.0
IsHoliday       0.0
Temperature     0.0
Fuel_Price      0.0
CPI             0.0
Unemployment    0.0
Type            0.0
Size            0.0
dtype: float64

#### 4. Check Correlation

In [12]:
# Pairwise Pearson correlation of the numeric (and boolean) columns.
# The text columns (Date, Type) are excluded explicitly: pandas >= 2.0 no longer
# drops them silently and raises when asked to correlate them.
corr = data.select_dtypes(include=["number", "bool"]).corr()
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,Store,Dept,IsHoliday,Temperature,Fuel_Price,CPI,Unemployment,Size
Store,1.0,0.019627,-0.001166,-0.043495,0.153425,-0.175702,0.204689,-0.186845
Dept,0.019627,1.0,0.001249,0.00397,0.000554,-0.005176,0.003339,0.001502
IsHoliday,-0.001166,0.001249,1.0,-0.187428,-0.126443,-0.001444,0.010071,-0.000443
Temperature,-0.043495,0.00397,-0.187428,1.0,0.073938,0.17426,0.013734,-0.061256
Fuel_Price,0.153425,0.000554,-0.126443,0.073938,1.0,-0.431481,0.386459,0.055088
CPI,-0.175702,-0.005176,-0.001444,0.17426,-0.431481,1.0,-0.289842,-0.002384
Unemployment,0.204689,0.003339,0.010071,0.013734,0.386459,-0.289842,1.0,-0.001626
Size,-0.186845,0.001502,-0.000443,-0.061256,0.055088,-0.002384,-0.001626,1.0


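Focus on the Weekly_Sales column (or row) of the correlation matrix to see how it correlates with the other columns. Note that this test set has no Weekly_Sales column, so the matrix above does not show these correlations; the observations below presumably come from the same analysis on the training data.

The `Positive Correlation`s are Size, Dept, IsHoliday. If these columns increase, Weekly_Sales increases.

The `Negative Correlation`s are Store, CPI, Unemployment. If these columns increase, Weekly_Sales decreases.

The `No Correlation`s are Temperature, Fuel_Price. We drop them because they carry no useful signal for predicting Weekly_Sales.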

In [13]:
# Remove the two columns judged uninformative in the correlation step.
uncorrelated_columns = ["Temperature", "Fuel_Price"]
data = data.drop(columns=uncorrelated_columns)

#### 5. Feature Extractions

In [14]:
# Parse the textual Date column once, then derive calendar features from it.
data["DateTime"] = pd.to_datetime(data["Date"])
# `Series.dt.week` is deprecated and removed in pandas 2.0; isocalendar().week
# yields the same ISO week number. It comes back as UInt32, so cast to int64.
data["Week"] = data["DateTime"].dt.isocalendar().week.astype("int64")
data["Month"] = data["DateTime"].dt.month  # calendar month, 1-12
data["Year"] = data["DateTime"].dt.year  # four-digit year

In [15]:
# Dates (as 'YYYY-MM-DD' strings) of the four named holiday weeks, 2010-2013.
holiday_dict = {
    "SuperBowl": [
        '2010-02-12', '2011-02-11', '2012-02-10', '2013-02-08',
    ],
    "LaborDay": [
        '2010-09-10', '2011-09-09', '2012-09-07', '2013-09-06',
    ],
    "ThanksGiving": [
        '2010-11-26', '2011-11-25', '2012-11-23', '2013-11-29',
    ],
    "Christmas": [
        '2010-12-31', '2011-12-30', '2012-12-28', '2013-12-27',
    ],
}

In [16]:
# Encode which named holiday (if any) each row's date falls on:
#   0 = SuperBowl, 1 = LaborDay, 2 = ThanksGiving, 3 = Christmas, -1 = none.
holiday_codes = {"SuperBowl": 0, "LaborDay": 1, "ThanksGiving": 2, "Christmas": 3}

# Flatten holiday_dict into a date -> code lookup. setdefault keeps the first
# code seen for a date, matching the priority of the original if/elif chain.
date_to_code = {}
for holiday_name, code in holiday_codes.items():
    for holiday_date in holiday_dict[holiday_name]:
        date_to_code.setdefault(holiday_date, code)

# Vectorised lookup instead of a per-row Python loop; dates that are not in the
# lookup come back as NaN and are turned into -1.
special_day = data["Date"].map(date_to_code).fillna(-1).astype("int64")
data["SpecialDay"] = special_day

# The raw text date is no longer needed once DateTime and SpecialDay exist.
data = data.drop(columns=["Date"])

In [17]:
# Display the frame after the date-derived columns and SpecialDay were added and Date was dropped.
data

Unnamed: 0,Store,Dept,IsHoliday,CPI,Unemployment,Type,Size,DateTime,Week,Month,Year,SpecialDay
0,1,1,False,223.462779,6.573000,A,151315,2012-11-02,44,11,2012,-1
1,1,2,False,223.462779,6.573000,A,151315,2012-11-02,44,11,2012,-1
2,1,3,False,223.462779,6.573000,A,151315,2012-11-02,44,11,2012,-1
3,1,4,False,223.462779,6.573000,A,151315,2012-11-02,44,11,2012,-1
4,1,5,False,223.462779,6.573000,A,151315,2012-11-02,44,11,2012,-1
...,...,...,...,...,...,...,...,...,...,...,...,...
115059,45,93,False,176.961347,6.868733,B,118221,2013-07-26,30,7,2013,-1
115060,45,94,False,176.961347,6.868733,B,118221,2013-07-26,30,7,2013,-1
115061,45,95,False,176.961347,6.868733,B,118221,2013-07-26,30,7,2013,-1
115062,45,97,False,176.961347,6.868733,B,118221,2013-07-26,30,7,2013,-1


#### 6. Dummies and Label Encoders

In [18]:
# Turn the boolean holiday flag into 0/1 integers.
holiday_flag_codes = {False: 0, True: 1}
data = data.assign(IsHoliday=data["IsHoliday"].map(holiday_flag_codes))

In [19]:
# One-hot encode the store Type into Type_<value> indicator columns.
# `np.int` was only an alias for the builtin `int` and was removed in NumPy 1.24,
# so the builtin is used directly.
dummies = pd.get_dummies(data["Type"], prefix="Type", dtype=int)
# Append the indicator columns; the original Type column is kept for now.
data = pd.concat([data, dummies], axis=1)
data

Unnamed: 0,Store,Dept,IsHoliday,CPI,Unemployment,Type,Size,DateTime,Week,Month,Year,SpecialDay,Type_A,Type_B,Type_C
0,1,1,0,223.462779,6.573000,A,151315,2012-11-02,44,11,2012,-1,1,0,0
1,1,2,0,223.462779,6.573000,A,151315,2012-11-02,44,11,2012,-1,1,0,0
2,1,3,0,223.462779,6.573000,A,151315,2012-11-02,44,11,2012,-1,1,0,0
3,1,4,0,223.462779,6.573000,A,151315,2012-11-02,44,11,2012,-1,1,0,0
4,1,5,0,223.462779,6.573000,A,151315,2012-11-02,44,11,2012,-1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115059,45,93,0,176.961347,6.868733,B,118221,2013-07-26,30,7,2013,-1,0,1,0
115060,45,94,0,176.961347,6.868733,B,118221,2013-07-26,30,7,2013,-1,0,1,0
115061,45,95,0,176.961347,6.868733,B,118221,2013-07-26,30,7,2013,-1,0,1,0
115062,45,97,0,176.961347,6.868733,B,118221,2013-07-26,30,7,2013,-1,0,1,0


In [20]:
# Summary statistics after encoding IsHoliday as 0/1 and adding the Type_* indicator columns.
data.describe()

Unnamed: 0,Store,Dept,IsHoliday,CPI,Unemployment,Size,Week,Month,Year,SpecialDay,Type_A,Type_B,Type_C
count,115064.0,115064.0,115064.0,115064.0,115064.0,115064.0,115064.0,115064.0,115064.0,115064.0,115064.0,115064.0,115064.0
mean,22.238207,44.339524,0.077592,176.961347,6.868733,136497.688921,23.050963,5.729855,2012.767182,-0.792776,0.510264,0.386741,0.102995
std,12.80993,30.65641,0.267529,33.714479,1.294483,61106.926438,15.750407,3.590331,0.422629,0.794411,0.499897,0.487006,0.303953
min,1.0,1.0,0.0,131.236226,3.684,34875.0,1.0,1.0,2012.0,-1.0,0.0,0.0,0.0
25%,11.0,18.0,0.0,138.900429,6.228,93638.0,10.0,3.0,2013.0,-1.0,0.0,0.0,0.0
50%,22.0,37.0,0.0,176.961347,6.868733,140167.0,20.0,5.0,2013.0,-1.0,1.0,0.0,0.0
75%,33.0,74.0,0.0,201.21223,7.293,202505.0,30.0,7.0,2013.0,-1.0,1.0,1.0,0.0
max,45.0,99.0,1.0,228.976456,10.199,219622.0,52.0,12.0,2013.0,3.0,1.0,1.0,1.0


In [21]:
# Inspect column dtypes; Type (object) and DateTime (datetime64) are the only non-numeric columns left.
data.dtypes

Store                    int64
Dept                     int64
IsHoliday                int64
CPI                    float64
Unemployment           float64
Type                    object
Size                     int64
DateTime        datetime64[ns]
Week                     int64
Month                    int64
Year                     int64
SpecialDay               int64
Type_A                   int64
Type_B                   int64
Type_C                   int64
dtype: object

In [22]:
# Persist the encoded but unscaled frame (still containing Type) without the index column...
data.to_csv(path_or_buf="data/walmart-data-test.csv", index=False)
# ...then remove the raw Type column, which the Type_* indicators now replace.
data = data.drop(columns="Type")

#### 7. Features Scaling

In [23]:
from sklearn.preprocessing import StandardScaler

# Continuous columns to standardise to zero mean and unit variance.
columns_to_scale = ["CPI","Unemployment","Size","Year"]

# NOTE(review): the scaler is fit on this test data itself. If a model was trained
# on features scaled with a different fit, the two feature spaces will not match —
# confirm whether a scaler fitted on the training data should be reused here.
scaler = StandardScaler()
scaled = scaler.fit_transform(data[columns_to_scale])

# Reuse data's index so the concat below aligns row-for-row even if the frame's
# index is not a default 0..n-1 range (a fresh RangeIndex would introduce NaN rows).
scaled_df = pd.DataFrame(data=scaled, columns=columns_to_scale, index=data.index)

# Swap the raw columns for their scaled versions (appended at the end of the frame).
data = pd.concat([data.drop(columns=columns_to_scale), scaled_df], axis=1)
data

Unnamed: 0,Store,Dept,IsHoliday,DateTime,Week,Month,SpecialDay,Type_A,Type_B,Type_C,CPI,Unemployment,Size,Year
0,1,1,0,2012-11-02,44,11,-1,1,0,0,1.379278,-2.284576e-01,0.242483,-1.815267
1,1,2,0,2012-11-02,44,11,-1,1,0,0,1.379278,-2.284576e-01,0.242483,-1.815267
2,1,3,0,2012-11-02,44,11,-1,1,0,0,1.379278,-2.284576e-01,0.242483,-1.815267
3,1,4,0,2012-11-02,44,11,-1,1,0,0,1.379278,-2.284576e-01,0.242483,-1.815267
4,1,5,0,2012-11-02,44,11,-1,1,0,0,1.379278,-2.284576e-01,0.242483,-1.815267
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115059,45,93,0,2013-07-26,30,7,-1,0,1,0,0.000000,2.058387e-15,-0.299095,0.550883
115060,45,94,0,2013-07-26,30,7,-1,0,1,0,0.000000,2.058387e-15,-0.299095,0.550883
115061,45,95,0,2013-07-26,30,7,-1,0,1,0,0.000000,2.058387e-15,-0.299095,0.550883
115062,45,97,0,2013-07-26,30,7,-1,0,1,0,0.000000,2.058387e-15,-0.299095,0.550883


In [24]:
# Write the fully preprocessed (encoded and scaled) frame to disk without the index column.
data.to_csv(path_or_buf="data/walmart-preprocessing-test.csv", index=False)