## Config

In [48]:
import pandas as pd 
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.options.display.float_format = '{:.2f}'.format
import numpy as np 
from datetime import timedelta

## Data

### Train

In [49]:
df_train = pd.read_csv('train.csv')
df_train['date'] = pd.to_datetime(df_train['date'])
df_train.head()

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion
0,0,2013-01-01,1,AUTOMOTIVE,0.0,0
1,1,2013-01-01,1,BABY CARE,0.0,0
2,2,2013-01-01,1,BEAUTY,0.0,0
3,3,2013-01-01,1,BEVERAGES,0.0,0
4,4,2013-01-01,1,BOOKS,0.0,0


### Store info

In [50]:
df_stores = pd.read_csv('stores.csv')
df_stores.head()

Unnamed: 0,store_nbr,city,state,type,cluster
0,1,Quito,Pichincha,D,13
1,2,Quito,Pichincha,D,13
2,3,Quito,Pichincha,D,8
3,4,Quito,Pichincha,D,9
4,5,Santo Domingo,Santo Domingo de los Tsachilas,D,4


### Calendar information

In [51]:
df_holiday = pd.read_csv('holidays_events.csv')
df_holiday['date'] = pd.to_datetime(df_holiday['date'])
df_holiday.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 350 entries, 0 to 349
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   date         350 non-null    datetime64[ns]
 1   type         350 non-null    object        
 2   locale       350 non-null    object        
 3   locale_name  350 non-null    object        
 4   description  350 non-null    object        
 5   transferred  350 non-null    bool          
dtypes: bool(1), datetime64[ns](1), object(4)
memory usage: 14.1+ KB


In [52]:
# ['Holiday' nan 'Work Day' 'Additional' 'Event' 'Transfer' 'Bridge']
# Mark transferred holidays as normal days and find the correct celebration dates
for index, row in df_holiday[df_holiday['transferred']].iterrows():
    # Find the new date for this transferred holiday
    transfer_row = df_holiday[(df_holiday['description'] == row['description']) & (df_holiday['type'] == 'Transfer')]
    if not transfer_row.empty:
        # Assuming only one row matches, otherwise adjust as necessary
        new_date_index = transfer_row.index[0]

df_holiday['bridge'] = False
df_holiday['holiday_national'] = False
df_holiday['holiday_local'] = False
df_holiday['holiday_regional'] = False 

df_holiday.loc[df_holiday['type'] == 'Bridge', 'bridge'] = True
df_holiday.loc[df_holiday['locale'] == 'National', 'holiday_national'] = True  # Mark regular holidays
df_holiday.loc[df_holiday['type'] == 'Local', 'holiday_local'] = True  # Mark bridge days as holidays
df_holiday.loc[df_holiday['type'] == 'Regional', 'holiday_regional'] = True

In [53]:
df_holiday['transferred'] = df_holiday['transferred'].astype(int)
df_holiday['holiday_national'] = df_holiday['holiday_national'].astype(int)
df_holiday['holiday_regional'] = df_holiday['holiday_regional'].astype(int)
df_holiday['holiday_local'] = df_holiday['holiday_local'].astype(int)
df_holiday['bridge'] = df_holiday['bridge'].astype(int)
df_holiday['type'] = df_holiday['type'].astype('category')
df_holiday['locale'] = df_holiday['locale'].astype('category')
# df_holiday['work_day'] = df_holiday['work_day'].astype(int)
# df_holiday['event'] = df_holiday['event'].astype(int)

df_holiday.head()

Unnamed: 0,date,type,locale,locale_name,description,transferred,bridge,holiday_national,holiday_local,holiday_regional
0,2012-03-02,Holiday,Local,Manta,Fundacion de Manta,0,0,0,0,0
1,2012-04-01,Holiday,Regional,Cotopaxi,Provincializacion de Cotopaxi,0,0,0,0,0
2,2012-04-12,Holiday,Local,Cuenca,Fundacion de Cuenca,0,0,0,0,0
3,2012-04-14,Holiday,Local,Libertad,Cantonizacion de Libertad,0,0,0,0,0
4,2012-04-21,Holiday,Local,Riobamba,Cantonizacion de Riobamba,0,0,0,0,0


In [54]:
df_holiday.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 350 entries, 0 to 349
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   date              350 non-null    datetime64[ns]
 1   type              350 non-null    category      
 2   locale            350 non-null    category      
 3   locale_name       350 non-null    object        
 4   description       350 non-null    object        
 5   transferred       350 non-null    int64         
 6   bridge            350 non-null    int64         
 7   holiday_national  350 non-null    int64         
 8   holiday_local     350 non-null    int64         
 9   holiday_regional  350 non-null    int64         
dtypes: category(2), datetime64[ns](1), int64(5), object(2)
memory usage: 23.0+ KB


### Oil prices

In [55]:
df_oil = pd.read_csv('oil.csv')
df_oil['date'] = pd.to_datetime(df_oil['date'])
df_oil.head()

Unnamed: 0,date,dcoilwtico
0,2013-01-01,
1,2013-01-02,93.14
2,2013-01-03,92.97
3,2013-01-04,93.12
4,2013-01-07,93.2


In [56]:
df_oil.loc[df_oil['dcoilwtico'] == 0]
# No values are zero

Unnamed: 0,date,dcoilwtico


In [57]:
n_obs_oil = len(df_oil['dcoilwtico'])
print(f'Number of obs oil: {n_obs_oil}')

n_obs_na = df_oil['dcoilwtico'].isna().sum()

print(f'Number of obs isna: {n_obs_na}')
print(f'Proportion missing: {n_obs_na/n_obs_oil}')

Number of obs oil: 1218
Number of obs isna: 43
Proportion missing: 0.035303776683087026


#### Simple interpolation

In [58]:
df_oil = df_oil.set_index(df_oil['date'])
df_oil = df_oil['dcoilwtico'].interpolate(method = 'time')

In [59]:
# Set first manually, just set to the next day 93.14
df_oil.iloc[0] = 93.14

In [60]:
df_oil.isna().sum()

0

In [61]:
df_oil.info()

<class 'pandas.core.series.Series'>
DatetimeIndex: 1218 entries, 2013-01-01 to 2017-08-31
Series name: dcoilwtico
Non-Null Count  Dtype  
--------------  -----  
1218 non-null   float64
dtypes: float64(1)
memory usage: 19.0 KB


### Transactions

In [62]:
df_transactions = pd.read_csv('transactions.csv')
df_transactions['date'] = pd.to_datetime(df_transactions['date'])
df_transactions.head()

Unnamed: 0,date,store_nbr,transactions
0,2013-01-01,25,770
1,2013-01-02,1,2111
2,2013-01-02,2,2358
3,2013-01-02,3,3487
4,2013-01-02,4,1922


## Merge dataframes

In [63]:
df_complete = pd.merge(df_train, df_oil, on = 'date', how = 'left')
df_complete.head()

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,dcoilwtico
0,0,2013-01-01,1,AUTOMOTIVE,0.0,0,93.14
1,1,2013-01-01,1,BABY CARE,0.0,0,93.14
2,2,2013-01-01,1,BEAUTY,0.0,0,93.14
3,3,2013-01-01,1,BEVERAGES,0.0,0,93.14
4,4,2013-01-01,1,BOOKS,0.0,0,93.14


In [64]:
df_complete = pd.merge(df_complete, df_holiday, on = 'date', how = 'left')
df_complete.head()

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,dcoilwtico,type,locale,locale_name,description,transferred,bridge,holiday_national,holiday_local,holiday_regional
0,0,2013-01-01,1,AUTOMOTIVE,0.0,0,93.14,Holiday,National,Ecuador,Primer dia del ano,0.0,0.0,1.0,0.0,0.0
1,1,2013-01-01,1,BABY CARE,0.0,0,93.14,Holiday,National,Ecuador,Primer dia del ano,0.0,0.0,1.0,0.0,0.0
2,2,2013-01-01,1,BEAUTY,0.0,0,93.14,Holiday,National,Ecuador,Primer dia del ano,0.0,0.0,1.0,0.0,0.0
3,3,2013-01-01,1,BEVERAGES,0.0,0,93.14,Holiday,National,Ecuador,Primer dia del ano,0.0,0.0,1.0,0.0,0.0
4,4,2013-01-01,1,BOOKS,0.0,0,93.14,Holiday,National,Ecuador,Primer dia del ano,0.0,0.0,1.0,0.0,0.0


In [65]:
df_complete = pd.merge(df_complete, df_stores, on = 'store_nbr', how = 'left')
df_complete.head()

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,dcoilwtico,type_x,locale,locale_name,description,transferred,bridge,holiday_national,holiday_local,holiday_regional,city,state,type_y,cluster
0,0,2013-01-01,1,AUTOMOTIVE,0.0,0,93.14,Holiday,National,Ecuador,Primer dia del ano,0.0,0.0,1.0,0.0,0.0,Quito,Pichincha,D,13
1,1,2013-01-01,1,BABY CARE,0.0,0,93.14,Holiday,National,Ecuador,Primer dia del ano,0.0,0.0,1.0,0.0,0.0,Quito,Pichincha,D,13
2,2,2013-01-01,1,BEAUTY,0.0,0,93.14,Holiday,National,Ecuador,Primer dia del ano,0.0,0.0,1.0,0.0,0.0,Quito,Pichincha,D,13
3,3,2013-01-01,1,BEVERAGES,0.0,0,93.14,Holiday,National,Ecuador,Primer dia del ano,0.0,0.0,1.0,0.0,0.0,Quito,Pichincha,D,13
4,4,2013-01-01,1,BOOKS,0.0,0,93.14,Holiday,National,Ecuador,Primer dia del ano,0.0,0.0,1.0,0.0,0.0,Quito,Pichincha,D,13


In [66]:
df_complete = pd.merge(df_complete, df_transactions, on = ['store_nbr', 'date'], how = 'left')
df_complete.head()

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,dcoilwtico,type_x,locale,locale_name,description,transferred,bridge,holiday_national,holiday_local,holiday_regional,city,state,type_y,cluster,transactions
0,0,2013-01-01,1,AUTOMOTIVE,0.0,0,93.14,Holiday,National,Ecuador,Primer dia del ano,0.0,0.0,1.0,0.0,0.0,Quito,Pichincha,D,13,
1,1,2013-01-01,1,BABY CARE,0.0,0,93.14,Holiday,National,Ecuador,Primer dia del ano,0.0,0.0,1.0,0.0,0.0,Quito,Pichincha,D,13,
2,2,2013-01-01,1,BEAUTY,0.0,0,93.14,Holiday,National,Ecuador,Primer dia del ano,0.0,0.0,1.0,0.0,0.0,Quito,Pichincha,D,13,
3,3,2013-01-01,1,BEVERAGES,0.0,0,93.14,Holiday,National,Ecuador,Primer dia del ano,0.0,0.0,1.0,0.0,0.0,Quito,Pichincha,D,13,
4,4,2013-01-01,1,BOOKS,0.0,0,93.14,Holiday,National,Ecuador,Primer dia del ano,0.0,0.0,1.0,0.0,0.0,Quito,Pichincha,D,13,


In [67]:
df_complete.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3054348 entries, 0 to 3054347
Data columns (total 21 columns):
 #   Column            Dtype         
---  ------            -----         
 0   id                int64         
 1   date              datetime64[ns]
 2   store_nbr         int64         
 3   family            object        
 4   sales             float64       
 5   onpromotion       int64         
 6   dcoilwtico        float64       
 7   type_x            category      
 8   locale            category      
 9   locale_name       object        
 10  description       object        
 11  transferred       float64       
 12  bridge            float64       
 13  holiday_national  float64       
 14  holiday_local     float64       
 15  holiday_regional  float64       
 16  city              object        
 17  state             object        
 18  type_y            object        
 19  cluster           int64         
 20  transactions      float64       
dtypes: categ

## Processing and conversion

### Filter out entries where the stores have not opened yet

In [68]:
# Filter rows where sales are greater than zero
sales_not_zero = df_complete[df_complete['sales'] > 0]

# Group by 'store_nbr' and find the first date of non-zero sales for each store
first_non_zero_sales_date_per_store = sales_not_zero.groupby('store_nbr')['date'].min().reset_index()

# Rename columns for clarity
first_non_zero_sales_date_per_store.rename(columns={'date': 'first_sale_date'}, inplace=True)

# Display the result
first_non_zero_sales_date_per_store.head()

Unnamed: 0,store_nbr,first_sale_date
0,1,2013-01-02
1,2,2013-01-02
2,3,2013-01-02
3,4,2013-01-02
4,5,2013-01-02


In [70]:
# Merge the main DataFrame with the first sale date information
df_with_first_sale_date = pd.merge(df_complete, first_non_zero_sales_date_per_store, on='store_nbr', how='left')

# Filter rows where the store's sale date is on or after the first sale date
df_opened_stores = df_with_first_sale_date[df_with_first_sale_date['date'] >= df_with_first_sale_date['first_sale_date']]

df_opened_stores = df_opened_stores.drop(columns=['first_sale_date'])

In [72]:
df_opened_stores.head()

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,dcoilwtico,type_x,locale,locale_name,description,transferred,bridge,holiday_national,holiday_local,holiday_regional,city,state,type_y,cluster,transactions
561,561,2013-01-01,25,AUTOMOTIVE,0.0,0,93.14,Holiday,National,Ecuador,Primer dia del ano,0.0,0.0,1.0,0.0,0.0,Salinas,Santa Elena,D,1,770.0
562,562,2013-01-01,25,BABY CARE,0.0,0,93.14,Holiday,National,Ecuador,Primer dia del ano,0.0,0.0,1.0,0.0,0.0,Salinas,Santa Elena,D,1,770.0
563,563,2013-01-01,25,BEAUTY,2.0,0,93.14,Holiday,National,Ecuador,Primer dia del ano,0.0,0.0,1.0,0.0,0.0,Salinas,Santa Elena,D,1,770.0
564,564,2013-01-01,25,BEVERAGES,810.0,0,93.14,Holiday,National,Ecuador,Primer dia del ano,0.0,0.0,1.0,0.0,0.0,Salinas,Santa Elena,D,1,770.0
565,565,2013-01-01,25,BOOKS,0.0,0,93.14,Holiday,National,Ecuador,Primer dia del ano,0.0,0.0,1.0,0.0,0.0,Salinas,Santa Elena,D,1,770.0


### Pivot dataframe to have sales amount per product family

In [75]:
# Pivot the DataFrame to get sales amounts for each family category as separate columns
sales_pivot = df_opened_stores.pivot_table(index=['date', 'store_nbr'], 
                                      columns='family', 
                                      values='sales', 
                                      fill_value=0).reset_index()

# Reset index to make the DataFrame flat
sales_pivot.reset_index(inplace=True, drop=True)

subset_df = df_opened_stores.drop(columns=['sales', 'family']).drop_duplicates(subset=['date', 'store_nbr'])
# Optionally, rename the columns if you want to add a prefix or modify them in any way
# sales_pivot.columns = ['CustomPrefix_' + str(col) for col in sales_pivot.columns]
final_df = pd.merge(subset_df, sales_pivot, on=['date', 'store_nbr'], how='left')
# Display the result
final_df.head()

Unnamed: 0,id,date,store_nbr,onpromotion,dcoilwtico,type_x,locale,locale_name,description,transferred,bridge,holiday_national,holiday_local,holiday_regional,city,state,type_y,cluster,transactions,AUTOMOTIVE,BABY CARE,BEAUTY,BEVERAGES,BOOKS,BREAD/BAKERY,CELEBRATION,CLEANING,DAIRY,DELI,EGGS,FROZEN FOODS,GROCERY I,GROCERY II,HARDWARE,HOME AND KITCHEN I,HOME AND KITCHEN II,HOME APPLIANCES,HOME CARE,LADIESWEAR,LAWN AND GARDEN,LINGERIE,"LIQUOR,WINE,BEER",MAGAZINES,MEATS,PERSONAL CARE,PET SUPPLIES,PLAYERS AND ELECTRONICS,POULTRY,PREPARED FOODS,PRODUCE,SCHOOL AND OFFICE SUPPLIES,SEAFOOD
0,561,2013-01-01,25,0,93.14,Holiday,National,Ecuador,Primer dia del ano,0.0,0.0,1.0,0.0,0.0,Salinas,Santa Elena,D,1,770.0,0.0,0.0,2.0,810.0,0.0,180.59,0.0,186.0,143.0,71.09,46.0,29.65,700.0,15.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,5.0,105.0,0.0,110.8,25.0,0.0,0.0,42.64,37.85,0.0,0.0,0.0
1,1782,2013-01-02,1,0,93.14,,,,,,,,,,Quito,Pichincha,D,13,2111.0,2.0,0.0,2.0,1091.0,0.0,470.65,0.0,1060.0,579.0,164.07,246.0,131.0,2652.0,31.0,3.0,0.0,0.0,0.0,0.0,0.0,10.0,13.0,67.0,0.0,369.1,194.0,0.0,0.0,247.3,47.0,0.0,0.0,38.03
2,1815,2013-01-02,10,0,93.14,,,,,,,,,,Quito,Pichincha,C,15,1293.0,3.0,0.0,6.0,396.0,0.0,151.0,0.0,1110.0,101.0,276.0,57.0,41.16,2579.0,8.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,6.0,0.0,291.14,223.0,0.0,0.0,146.78,26.0,0.0,0.0,3.0
3,1848,2013-01-02,11,0,93.14,,,,,,,,,,Cayambe,Pichincha,B,6,3547.0,12.0,0.0,7.0,3443.0,0.0,763.0,0.0,3260.0,296.0,527.29,140.0,57.77,7736.0,15.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,20.0,29.0,0.0,832.54,604.0,0.0,0.0,548.03,31.0,0.0,0.0,5.0
4,1881,2013-01-02,12,0,93.14,,,,,,,,,,Latacunga,Cotopaxi,C,15,1362.0,4.0,0.0,1.0,682.0,0.0,337.0,0.0,1092.0,151.0,176.0,75.0,14.0,2677.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,36.0,0.0,232.35,210.0,0.0,0.0,103.17,41.0,0.0,0.0,9.0


#### Drop unecessary columns

In [77]:
final_df.drop(columns = ['type_x', 'locale', 'locale_name'], inplace = True)
final_df.head(10)

Unnamed: 0,id,date,store_nbr,onpromotion,dcoilwtico,description,transferred,bridge,holiday_national,holiday_local,holiday_regional,city,state,type_y,cluster,transactions,AUTOMOTIVE,BABY CARE,BEAUTY,BEVERAGES,BOOKS,BREAD/BAKERY,CELEBRATION,CLEANING,DAIRY,DELI,EGGS,FROZEN FOODS,GROCERY I,GROCERY II,HARDWARE,HOME AND KITCHEN I,HOME AND KITCHEN II,HOME APPLIANCES,HOME CARE,LADIESWEAR,LAWN AND GARDEN,LINGERIE,"LIQUOR,WINE,BEER",MAGAZINES,MEATS,PERSONAL CARE,PET SUPPLIES,PLAYERS AND ELECTRONICS,POULTRY,PREPARED FOODS,PRODUCE,SCHOOL AND OFFICE SUPPLIES,SEAFOOD
0,561,2013-01-01,25,0,93.14,Primer dia del ano,0.0,0.0,1.0,0.0,0.0,Salinas,Santa Elena,D,1,770.0,0.0,0.0,2.0,810.0,0.0,180.59,0.0,186.0,143.0,71.09,46.0,29.65,700.0,15.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,5.0,105.0,0.0,110.8,25.0,0.0,0.0,42.64,37.85,0.0,0.0,0.0
1,1782,2013-01-02,1,0,93.14,,,,,,,Quito,Pichincha,D,13,2111.0,2.0,0.0,2.0,1091.0,0.0,470.65,0.0,1060.0,579.0,164.07,246.0,131.0,2652.0,31.0,3.0,0.0,0.0,0.0,0.0,0.0,10.0,13.0,67.0,0.0,369.1,194.0,0.0,0.0,247.3,47.0,0.0,0.0,38.03
2,1815,2013-01-02,10,0,93.14,,,,,,,Quito,Pichincha,C,15,1293.0,3.0,0.0,6.0,396.0,0.0,151.0,0.0,1110.0,101.0,276.0,57.0,41.16,2579.0,8.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,6.0,0.0,291.14,223.0,0.0,0.0,146.78,26.0,0.0,0.0,3.0
3,1848,2013-01-02,11,0,93.14,,,,,,,Cayambe,Pichincha,B,6,3547.0,12.0,0.0,7.0,3443.0,0.0,763.0,0.0,3260.0,296.0,527.29,140.0,57.77,7736.0,15.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,20.0,29.0,0.0,832.54,604.0,0.0,0.0,548.03,31.0,0.0,0.0,5.0
4,1881,2013-01-02,12,0,93.14,,,,,,,Latacunga,Cotopaxi,C,15,1362.0,4.0,0.0,1.0,682.0,0.0,337.0,0.0,1092.0,151.0,176.0,75.0,14.0,2677.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,36.0,0.0,232.35,210.0,0.0,0.0,103.17,41.0,0.0,0.0,9.0
5,1914,2013-01-02,13,0,93.14,,,,,,,Latacunga,Cotopaxi,C,15,1102.0,7.0,0.0,3.0,767.0,0.0,231.0,0.0,1000.0,111.0,195.0,48.0,28.54,2587.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,17.0,0.0,137.91,241.0,0.0,0.0,102.38,25.0,0.0,0.0,1.0
6,1947,2013-01-02,14,0,93.14,,,,,,,Riobamba,Chimborazo,C,7,2002.0,8.0,0.0,3.0,744.0,0.0,409.0,0.0,1716.0,154.0,204.0,53.0,119.34,3257.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,23.0,10.0,0.0,368.73,493.0,0.0,0.0,98.18,24.0,0.0,0.0,9.0
7,1980,2013-01-02,15,0,93.14,,,,,,,Ibarra,Imbabura,C,15,1622.0,4.0,0.0,5.0,938.0,0.0,270.0,0.0,1300.0,152.0,219.0,73.0,33.0,3130.0,9.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,16.0,0.0,84.81,312.0,0.0,0.0,0.0,22.0,0.0,0.0,4.0
8,2013,2013-01-02,16,0,93.14,,,,,,,Santo Domingo,Santo Domingo de los Tsachilas,C,3,1167.0,5.0,0.0,0.0,389.0,0.0,227.0,0.0,1361.0,122.0,295.0,155.0,29.0,3176.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,17.0,0.0,12.58,275.0,0.0,0.0,117.99,37.0,0.0,0.0,5.0
9,2046,2013-01-02,17,0,93.14,,,,,,,Quito,Pichincha,C,12,1580.0,8.0,0.0,2.0,1011.0,0.0,282.0,0.0,1329.0,185.0,378.0,139.0,48.52,3431.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,120.0,0.0,226.37,214.0,0.0,0.0,223.62,47.0,0.0,0.0,16.0


#### Replace nan values with zero

In [78]:
final_df.fillna(0, inplace = True)
final_df.head()

Unnamed: 0,id,date,store_nbr,onpromotion,dcoilwtico,description,transferred,bridge,holiday_national,holiday_local,holiday_regional,city,state,type_y,cluster,transactions,AUTOMOTIVE,BABY CARE,BEAUTY,BEVERAGES,BOOKS,BREAD/BAKERY,CELEBRATION,CLEANING,DAIRY,DELI,EGGS,FROZEN FOODS,GROCERY I,GROCERY II,HARDWARE,HOME AND KITCHEN I,HOME AND KITCHEN II,HOME APPLIANCES,HOME CARE,LADIESWEAR,LAWN AND GARDEN,LINGERIE,"LIQUOR,WINE,BEER",MAGAZINES,MEATS,PERSONAL CARE,PET SUPPLIES,PLAYERS AND ELECTRONICS,POULTRY,PREPARED FOODS,PRODUCE,SCHOOL AND OFFICE SUPPLIES,SEAFOOD
0,561,2013-01-01,25,0,93.14,Primer dia del ano,0.0,0.0,1.0,0.0,0.0,Salinas,Santa Elena,D,1,770.0,0.0,0.0,2.0,810.0,0.0,180.59,0.0,186.0,143.0,71.09,46.0,29.65,700.0,15.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,5.0,105.0,0.0,110.8,25.0,0.0,0.0,42.64,37.85,0.0,0.0,0.0
1,1782,2013-01-02,1,0,93.14,0,0.0,0.0,0.0,0.0,0.0,Quito,Pichincha,D,13,2111.0,2.0,0.0,2.0,1091.0,0.0,470.65,0.0,1060.0,579.0,164.07,246.0,131.0,2652.0,31.0,3.0,0.0,0.0,0.0,0.0,0.0,10.0,13.0,67.0,0.0,369.1,194.0,0.0,0.0,247.3,47.0,0.0,0.0,38.03
2,1815,2013-01-02,10,0,93.14,0,0.0,0.0,0.0,0.0,0.0,Quito,Pichincha,C,15,1293.0,3.0,0.0,6.0,396.0,0.0,151.0,0.0,1110.0,101.0,276.0,57.0,41.16,2579.0,8.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,6.0,0.0,291.14,223.0,0.0,0.0,146.78,26.0,0.0,0.0,3.0
3,1848,2013-01-02,11,0,93.14,0,0.0,0.0,0.0,0.0,0.0,Cayambe,Pichincha,B,6,3547.0,12.0,0.0,7.0,3443.0,0.0,763.0,0.0,3260.0,296.0,527.29,140.0,57.77,7736.0,15.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,20.0,29.0,0.0,832.54,604.0,0.0,0.0,548.03,31.0,0.0,0.0,5.0
4,1881,2013-01-02,12,0,93.14,0,0.0,0.0,0.0,0.0,0.0,Latacunga,Cotopaxi,C,15,1362.0,4.0,0.0,1.0,682.0,0.0,337.0,0.0,1092.0,151.0,176.0,75.0,14.0,2677.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,36.0,0.0,232.35,210.0,0.0,0.0,103.17,41.0,0.0,0.0,9.0


#### Add flag column for workday

In [79]:
final_df['description'] = final_df['description'].replace(0, 'Work Day')

Unnamed: 0,id,date,store_nbr,onpromotion,dcoilwtico,description,transferred,bridge,holiday_national,holiday_local,holiday_regional,city,state,type_y,cluster,transactions,AUTOMOTIVE,BABY CARE,BEAUTY,BEVERAGES,BOOKS,BREAD/BAKERY,CELEBRATION,CLEANING,DAIRY,DELI,EGGS,FROZEN FOODS,GROCERY I,GROCERY II,HARDWARE,HOME AND KITCHEN I,HOME AND KITCHEN II,HOME APPLIANCES,HOME CARE,LADIESWEAR,LAWN AND GARDEN,LINGERIE,"LIQUOR,WINE,BEER",MAGAZINES,MEATS,PERSONAL CARE,PET SUPPLIES,PLAYERS AND ELECTRONICS,POULTRY,PREPARED FOODS,PRODUCE,SCHOOL AND OFFICE SUPPLIES,SEAFOOD
0,561,2013-01-01,25,0,93.14,Primer dia del ano,0.0,0.0,1.0,0.0,0.0,Salinas,Santa Elena,D,1,770.0,0.0,0.0,2.0,810.0,0.0,180.59,0.0,186.0,143.0,71.09,46.0,29.65,700.0,15.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,5.0,105.0,0.0,110.8,25.0,0.0,0.0,42.64,37.85,0.0,0.0,0.0
1,1782,2013-01-02,1,0,93.14,Work Day,0.0,0.0,0.0,0.0,0.0,Quito,Pichincha,D,13,2111.0,2.0,0.0,2.0,1091.0,0.0,470.65,0.0,1060.0,579.0,164.07,246.0,131.0,2652.0,31.0,3.0,0.0,0.0,0.0,0.0,0.0,10.0,13.0,67.0,0.0,369.1,194.0,0.0,0.0,247.3,47.0,0.0,0.0,38.03
2,1815,2013-01-02,10,0,93.14,Work Day,0.0,0.0,0.0,0.0,0.0,Quito,Pichincha,C,15,1293.0,3.0,0.0,6.0,396.0,0.0,151.0,0.0,1110.0,101.0,276.0,57.0,41.16,2579.0,8.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,6.0,0.0,291.14,223.0,0.0,0.0,146.78,26.0,0.0,0.0,3.0
3,1848,2013-01-02,11,0,93.14,Work Day,0.0,0.0,0.0,0.0,0.0,Cayambe,Pichincha,B,6,3547.0,12.0,0.0,7.0,3443.0,0.0,763.0,0.0,3260.0,296.0,527.29,140.0,57.77,7736.0,15.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,20.0,29.0,0.0,832.54,604.0,0.0,0.0,548.03,31.0,0.0,0.0,5.0
4,1881,2013-01-02,12,0,93.14,Work Day,0.0,0.0,0.0,0.0,0.0,Latacunga,Cotopaxi,C,15,1362.0,4.0,0.0,1.0,682.0,0.0,337.0,0.0,1092.0,151.0,176.0,75.0,14.0,2677.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,36.0,0.0,232.35,210.0,0.0,0.0,103.17,41.0,0.0,0.0,9.0


In [80]:
final_df['is_workday'] = final_df['description'].apply(lambda x: 1 if 'Work Day' in x else 0)

Unnamed: 0,id,date,store_nbr,onpromotion,dcoilwtico,description,transferred,bridge,holiday_national,holiday_local,holiday_regional,city,state,type_y,cluster,transactions,AUTOMOTIVE,BABY CARE,BEAUTY,BEVERAGES,BOOKS,BREAD/BAKERY,CELEBRATION,CLEANING,DAIRY,DELI,EGGS,FROZEN FOODS,GROCERY I,GROCERY II,HARDWARE,HOME AND KITCHEN I,HOME AND KITCHEN II,HOME APPLIANCES,HOME CARE,LADIESWEAR,LAWN AND GARDEN,LINGERIE,"LIQUOR,WINE,BEER",MAGAZINES,MEATS,PERSONAL CARE,PET SUPPLIES,PLAYERS AND ELECTRONICS,POULTRY,PREPARED FOODS,PRODUCE,SCHOOL AND OFFICE SUPPLIES,SEAFOOD,is_workday
0,561,2013-01-01,25,0,93.14,Primer dia del ano,0.0,0.0,1.0,0.0,0.0,Salinas,Santa Elena,D,1,770.0,0.0,0.0,2.0,810.0,0.0,180.59,0.0,186.0,143.0,71.09,46.0,29.65,700.0,15.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,5.0,105.0,0.0,110.8,25.0,0.0,0.0,42.64,37.85,0.0,0.0,0.0,0
1,1782,2013-01-02,1,0,93.14,Work Day,0.0,0.0,0.0,0.0,0.0,Quito,Pichincha,D,13,2111.0,2.0,0.0,2.0,1091.0,0.0,470.65,0.0,1060.0,579.0,164.07,246.0,131.0,2652.0,31.0,3.0,0.0,0.0,0.0,0.0,0.0,10.0,13.0,67.0,0.0,369.1,194.0,0.0,0.0,247.3,47.0,0.0,0.0,38.03,1
2,1815,2013-01-02,10,0,93.14,Work Day,0.0,0.0,0.0,0.0,0.0,Quito,Pichincha,C,15,1293.0,3.0,0.0,6.0,396.0,0.0,151.0,0.0,1110.0,101.0,276.0,57.0,41.16,2579.0,8.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,6.0,0.0,291.14,223.0,0.0,0.0,146.78,26.0,0.0,0.0,3.0,1
3,1848,2013-01-02,11,0,93.14,Work Day,0.0,0.0,0.0,0.0,0.0,Cayambe,Pichincha,B,6,3547.0,12.0,0.0,7.0,3443.0,0.0,763.0,0.0,3260.0,296.0,527.29,140.0,57.77,7736.0,15.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,20.0,29.0,0.0,832.54,604.0,0.0,0.0,548.03,31.0,0.0,0.0,5.0,1
4,1881,2013-01-02,12,0,93.14,Work Day,0.0,0.0,0.0,0.0,0.0,Latacunga,Cotopaxi,C,15,1362.0,4.0,0.0,1.0,682.0,0.0,337.0,0.0,1092.0,151.0,176.0,75.0,14.0,2677.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,36.0,0.0,232.35,210.0,0.0,0.0,103.17,41.0,0.0,0.0,9.0,1


#### Calender information
1. Wages in the public sector are paid every two weeks on the 15 th and on the last day of the month. Supermarket sales could be affected by this.

2. A magnitude 7.8 earthquake struck Ecuador on April 16, 2016. People rallied in relief efforts donating water and other first need products which greatly affected supermarket sales for several weeks after the earthquake.

In [81]:
# Add public payday flag column on the 15th and the last day of every month and a flag column for the three weeks after earthquake on april 16 2016
final_df['public_payday_flag'] = final_df['date'].apply(
    lambda x: 1 if (x.day == 15 or x.day == x.days_in_month) else 0)

earthquake_date = pd.Timestamp('2016-04-16')
earthquake_end_date = earthquake_date + pd.Timedelta(weeks=3)

# This flag is 1 for dates within three weeks after the earthquake
final_df['earthquake_aftermath_flag'] = final_df['date'].apply(
    lambda x: 1 if earthquake_date <= x <= earthquake_end_date else 0)

final_df.head()

Unnamed: 0,id,date,store_nbr,onpromotion,dcoilwtico,description,transferred,bridge,holiday_national,holiday_local,holiday_regional,city,state,type_y,cluster,transactions,AUTOMOTIVE,BABY CARE,BEAUTY,BEVERAGES,BOOKS,BREAD/BAKERY,CELEBRATION,CLEANING,DAIRY,DELI,EGGS,FROZEN FOODS,GROCERY I,GROCERY II,HARDWARE,HOME AND KITCHEN I,HOME AND KITCHEN II,HOME APPLIANCES,HOME CARE,LADIESWEAR,LAWN AND GARDEN,LINGERIE,"LIQUOR,WINE,BEER",MAGAZINES,MEATS,PERSONAL CARE,PET SUPPLIES,PLAYERS AND ELECTRONICS,POULTRY,PREPARED FOODS,PRODUCE,SCHOOL AND OFFICE SUPPLIES,SEAFOOD,is_workday,public_payday_flag,earthquake_aftermath_flag
0,561,2013-01-01,25,0,93.14,Primer dia del ano,0.0,0.0,1.0,0.0,0.0,Salinas,Santa Elena,D,1,770.0,0.0,0.0,2.0,810.0,0.0,180.59,0.0,186.0,143.0,71.09,46.0,29.65,700.0,15.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,5.0,105.0,0.0,110.8,25.0,0.0,0.0,42.64,37.85,0.0,0.0,0.0,0,0,0
1,1782,2013-01-02,1,0,93.14,Work Day,0.0,0.0,0.0,0.0,0.0,Quito,Pichincha,D,13,2111.0,2.0,0.0,2.0,1091.0,0.0,470.65,0.0,1060.0,579.0,164.07,246.0,131.0,2652.0,31.0,3.0,0.0,0.0,0.0,0.0,0.0,10.0,13.0,67.0,0.0,369.1,194.0,0.0,0.0,247.3,47.0,0.0,0.0,38.03,1,0,0
2,1815,2013-01-02,10,0,93.14,Work Day,0.0,0.0,0.0,0.0,0.0,Quito,Pichincha,C,15,1293.0,3.0,0.0,6.0,396.0,0.0,151.0,0.0,1110.0,101.0,276.0,57.0,41.16,2579.0,8.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,6.0,0.0,291.14,223.0,0.0,0.0,146.78,26.0,0.0,0.0,3.0,1,0,0
3,1848,2013-01-02,11,0,93.14,Work Day,0.0,0.0,0.0,0.0,0.0,Cayambe,Pichincha,B,6,3547.0,12.0,0.0,7.0,3443.0,0.0,763.0,0.0,3260.0,296.0,527.29,140.0,57.77,7736.0,15.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,20.0,29.0,0.0,832.54,604.0,0.0,0.0,548.03,31.0,0.0,0.0,5.0,1,0,0
4,1881,2013-01-02,12,0,93.14,Work Day,0.0,0.0,0.0,0.0,0.0,Latacunga,Cotopaxi,C,15,1362.0,4.0,0.0,1.0,682.0,0.0,337.0,0.0,1092.0,151.0,176.0,75.0,14.0,2677.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,36.0,0.0,232.35,210.0,0.0,0.0,103.17,41.0,0.0,0.0,9.0,1,0,0


### Datatypes conversion for memory saving

In [82]:
final_df['description'] = final_df['description'].astype('category')
final_df['city'] = final_df['city'].astype('category')
final_df['state'] = final_df['state'].astype('category')
final_df['type_y'] = final_df['type_y'].astype('category')
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84207 entries, 0 to 84206
Data columns (total 52 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   id                          84207 non-null  int64         
 1   date                        84207 non-null  datetime64[ns]
 2   store_nbr                   84207 non-null  int64         
 3   onpromotion                 84207 non-null  int64         
 4   dcoilwtico                  84207 non-null  float64       
 5   description                 84207 non-null  category      
 6   transferred                 84207 non-null  float64       
 7   bridge                      84207 non-null  float64       
 8   holiday_national            84207 non-null  float64       
 9   holiday_local               84207 non-null  float64       
 10  holiday_regional            84207 non-null  float64       
 11  city                        84207 non-null  category  

In [83]:
# final_df.to_parquet('store_train_complete.parquet', compression='snappy', engine='pyarrow')