# Capstone Two - 2. Data wrangling<a id='2._Data_wrangling'></a>

**0. Imports**

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from library.sb_utils import save_file

**1. Load data**

https://www.kaggle.com/competitions/store-sales-time-series-forecasting/data

In [2]:
# Directory of input data: relative path to the CSV files downloaded from the
# Kaggle store-sales-time-series-forecasting competition.
# Keep the trailing slash: file names are appended directly to this string.
input_dir = '../input_data/store-sales-time-series-forecasting/'

In [3]:
# Training set: one row per date / store / product family, with the sales target
train = pd.read_csv(f'{input_dir}train.csv')
print(train.shape)
train.head(3)

(3000888, 6)


Unnamed: 0,id,date,store_nbr,family,sales,onpromotion
0,0,2013-01-01,1,AUTOMOTIVE,0.0,0
1,1,2013-01-01,1,BABY CARE,0.0,0
2,2,2013-01-01,1,BEAUTY,0.0,0


In [4]:
# Test set: same layout as the training set, but without the sales column
test = pd.read_csv(f'{input_dir}test.csv')
test.head(3)

Unnamed: 0,id,date,store_nbr,family,onpromotion
0,3000888,2017-08-16,1,AUTOMOTIVE,0
1,3000889,2017-08-16,1,BABY CARE,0
2,3000890,2017-08-16,1,BEAUTY,2


In [5]:
# Per-store metadata: city, state, store type and cluster
stores = pd.read_csv(f'{input_dir}stores.csv')
stores.head(3)

Unnamed: 0,store_nbr,city,state,type,cluster
0,1,Quito,Pichincha,D,13
1,2,Quito,Pichincha,D,13
2,3,Quito,Pichincha,D,8


In [6]:
# Oil price by date (column dcoilwtico); some dates have no quoted price
oil = pd.read_csv(f'{input_dir}oil.csv')
oil.head(3)

Unnamed: 0,date,dcoilwtico
0,2013-01-01,
1,2013-01-02,93.14
2,2013-01-03,92.97


In [7]:
# Number of transactions recorded per store and date
transactions = pd.read_csv(f'{input_dir}transactions.csv')
transactions.head()

Unnamed: 0,date,store_nbr,transactions
0,2013-01-01,25,770
1,2013-01-02,1,2111
2,2013-01-02,2,2358
3,2013-01-02,3,3487
4,2013-01-02,4,1922


In [8]:
# Holidays and events, with their type, locale, description and transferred flag
holidays_events = pd.read_csv(f'{input_dir}holidays_events.csv')
holidays_events.head(3)

Unnamed: 0,date,type,locale,locale_name,description,transferred
0,2012-03-02,Holiday,Local,Manta,Fundacion de Manta,False
1,2012-04-01,Holiday,Regional,Cotopaxi,Provincializacion de Cotopaxi,False
2,2012-04-12,Holiday,Local,Cuenca,Fundacion de Cuenca,False


**2. Data dimensions**

In [9]:
# How many distinct stores and product families does the training set contain?
train.loc[:, ['store_nbr', 'family']].nunique()

store_nbr    54
family       33
dtype: int64

In [10]:
# Rows per (store, family) pair in the training set, i.e. the length of each series
train.value_counts(subset=['store_nbr', 'family'])

store_nbr  family                 
1          AUTOMOTIVE                 1684
36         PREPARED FOODS             1684
37         CLEANING                   1684
           CELEBRATION                1684
           BREAD/BAKERY               1684
                                      ... 
18         PREPARED FOODS             1684
           POULTRY                    1684
           PLAYERS AND ELECTRONICS    1684
           PET SUPPLIES               1684
54         SEAFOOD                    1684
Length: 1782, dtype: int64

In [11]:
# Rows per (store, family) pair in the test set, i.e. the forecasting horizon in days
test.value_counts(subset=['store_nbr', 'family'])

store_nbr  family                 
1          AUTOMOTIVE                 16
36         PREPARED FOODS             16
37         CLEANING                   16
           CELEBRATION                16
           BREAD/BAKERY               16
                                      ..
18         PREPARED FOODS             16
           POULTRY                    16
           PLAYERS AND ELECTRONICS    16
           PET SUPPLIES               16
54         SEAFOOD                    16
Length: 1782, dtype: int64

We have 54 stores and 33 product families, giving a total of 54 × 33 = 1782 time series.

Resolution of the time series: daily

Forecasting horizon: 16 days

**3. Target and Exogenous Variables**

Target: Sales per product family per store per day from 2017-08-16 to 2017-08-31 (16 days); non-categorical.

In [12]:
# Stack the train and test rows (sort=True orders the columns alphabetically;
# test rows have no sales value), then left-join every auxiliary table.
data = (
    pd.concat([train, test], sort=True)
    .merge(stores, how='left', on='store_nbr')
    .merge(oil, how='left', on='date')
    .merge(transactions, how='left', on=['date', 'store_nbr'])
    # NOTE: a date can carry several holiday/event rows, so this join can
    # produce more rows than train + test combined.
    .merge(holidays_events, how='left', on='date')
    # stores and holidays_events both have a `type` column; the merge suffixes
    # them as type_x / type_y, so rename the columns to say what they describe.
    .rename(columns={'type_x': 'store_type', 'type_y': 'holiday_type'})
)

data.head()

Unnamed: 0,date,family,id,onpromotion,sales,store_nbr,city,state,store_type,cluster,dcoilwtico,transactions,holiday_type,locale,locale_name,description,transferred
0,2013-01-01,AUTOMOTIVE,0,0,0.0,1,Quito,Pichincha,D,13,,,Holiday,National,Ecuador,Primer dia del ano,False
1,2013-01-01,BABY CARE,1,0,0.0,1,Quito,Pichincha,D,13,,,Holiday,National,Ecuador,Primer dia del ano,False
2,2013-01-01,BEAUTY,2,0,0.0,1,Quito,Pichincha,D,13,,,Holiday,National,Ecuador,Primer dia del ano,False
3,2013-01-01,BEVERAGES,3,0,0.0,1,Quito,Pichincha,D,13,,,Holiday,National,Ecuador,Primer dia del ano,False
4,2013-01-01,BOOKS,4,0,0.0,1,Quito,Pichincha,D,13,,,Holiday,National,Ecuador,Primer dia del ano,False


In [13]:
# Row counts: merged frame, train + test combined, train alone, test alone
n_train, n_test = len(train), len(test)
print(len(data), n_train + n_test, n_train, n_test)

3082860 3029400 3000888 28512


!!! Problems in holidays_events data !!!
1. The length of `data` is not equal to the sum of the train and test lengths, because some dates have two kinds of holidays (one national and one local, or two with different locales), so the merge duplicates those rows.
2. Some holiday locales have an equivalent city name, and some other locales have an equivalent state name. One locale (Imbabura) does not have an equivalent city or state name.

In [14]:
# Count merged rows per holiday locale name and list the names alphabetically.
# The index and the count column are named explicitly: on pandas >= 2.0,
# value_counts().reset_index() yields the columns 'locale_name' / 'count', so
# sorting on a column called 'index' would raise a KeyError. With explicit
# names the table has the columns 'index' and 'locale_name' on every version.
(
    data['locale_name']
    .value_counts()
    .rename_axis('index')
    .reset_index(name='locale_name')
    .sort_values('index')
)

Unnamed: 0,index,locale_name
3,Ambato,16038
10,Cayambe,8910
17,Cotopaxi,8910
7,Cuenca,10692
0,Ecuador,261954
12,El Carmen,8910
9,Esmeraldas,8910
4,Guaranda,16038
1,Guayaquil,19602
18,Ibarra,7128


In [15]:
# Count merged rows per store city and list the cities alphabetically.
# The index and the count column are named explicitly: on pandas >= 2.0,
# value_counts().reset_index() yields the columns 'city' / 'count', so
# sorting on a column called 'index' would raise a KeyError. With explicit
# names the table has the columns 'index' and 'city' on every version.
(
    data['city']
    .value_counts()
    .rename_axis('index')
    .reset_index(name='city')
    .sort_values('index')
)

Unnamed: 0,index,city
7,Ambato,114180
14,Babahoyo,57090
15,Cayambe,57090
3,Cuenca,171270
13,Daule,57090
21,El Carmen,57090
9,Esmeraldas,57090
18,Guaranda,57090
1,Guayaquil,456720
19,Ibarra,57090


In [16]:
# Count merged rows per store state and list the states alphabetically.
# The index and the count column are named explicitly: on pandas >= 2.0,
# value_counts().reset_index() yields the columns 'state' / 'count', so
# sorting on a column called 'index' would raise a KeyError. With explicit
# names the table has the columns 'index' and 'state' on every version.
(
    data['state']
    .value_counts()
    .rename_axis('index')
    .reset_index(name='state')
    .sort_values('index')
)

Unnamed: 0,index,state
3,Azuay,171270
11,Bolivar,57090
9,Chimborazo,57090
5,Cotopaxi,114180
8,El Oro,114180
15,Esmeraldas,57090
1,Guayas,627990
10,Imbabura,57090
14,Loja,57090
7,Los Rios,114180


 Break down the date into different columns:

In [17]:
# Parse the date strings once, then derive the calendar features from the
# parsed values; the dict order fixes the order of the new columns.
parsed_dates = pd.to_datetime(data['date'])
data['date'] = parsed_dates
calendar_features = {
    'year': parsed_dates.dt.year,
    'month': parsed_dates.dt.month,
    'week': parsed_dates.dt.isocalendar().week,  # ISO week number
    'quarter': parsed_dates.dt.quarter,
    'day_of_week': parsed_dates.dt.day_name(),   # weekday name, e.g. 'Tuesday'
}
for feature_name, feature_values in calendar_features.items():
    data[feature_name] = feature_values
data.head()

Unnamed: 0,date,family,id,onpromotion,sales,store_nbr,city,state,store_type,cluster,...,holiday_type,locale,locale_name,description,transferred,year,month,week,quarter,day_of_week
0,2013-01-01,AUTOMOTIVE,0,0,0.0,1,Quito,Pichincha,D,13,...,Holiday,National,Ecuador,Primer dia del ano,False,2013,1,1,1,Tuesday
1,2013-01-01,BABY CARE,1,0,0.0,1,Quito,Pichincha,D,13,...,Holiday,National,Ecuador,Primer dia del ano,False,2013,1,1,1,Tuesday
2,2013-01-01,BEAUTY,2,0,0.0,1,Quito,Pichincha,D,13,...,Holiday,National,Ecuador,Primer dia del ano,False,2013,1,1,1,Tuesday
3,2013-01-01,BEVERAGES,3,0,0.0,1,Quito,Pichincha,D,13,...,Holiday,National,Ecuador,Primer dia del ano,False,2013,1,1,1,Tuesday
4,2013-01-01,BOOKS,4,0,0.0,1,Quito,Pichincha,D,13,...,Holiday,National,Ecuador,Primer dia del ano,False,2013,1,1,1,Tuesday


In [18]:
# Column dtypes of the merged frame, to check the parsed date and the derived calendar columns
data.dtypes

date            datetime64[ns]
family                  object
id                       int64
onpromotion              int64
sales                  float64
store_nbr                int64
city                    object
state                   object
store_type              object
cluster                  int64
dcoilwtico             float64
transactions           float64
holiday_type            object
locale                  object
locale_name             object
description             object
transferred             object
year                     int64
month                    int64
week                    UInt32
quarter                  int64
day_of_week             object
dtype: object

In [19]:
# Missing values per column: absolute count and share of all rows in percent,
# listed with the most incomplete columns first
null_mask = data.isnull()
missing = pd.DataFrame({'count': null_mask.sum(), '%': 100 * null_mask.mean()})
missing.sort_values(by='count', ascending=False)

Unnamed: 0,count,%
holiday_type,2578554,83.641618
transferred,2578554,83.641618
description,2578554,83.641618
locale_name,2578554,83.641618
locale,2578554,83.641618
dcoilwtico,962280,31.213873
transactions,277629,9.005566
sales,28512,0.924855
quarter,0,0.0
week,0,0.0


**Save data**

In [20]:
# Save the merged frame as data.csv inside `datapath`, using the project's
# save_file helper (imported from library.sb_utils).
# NOTE(review): how save_file treats an existing file is not visible here — confirm.
datapath = '../data'
save_file(data, 'data.csv', datapath)

Writing file.  "../data/data.csv"
