# 1. IMPORTS 

In [12]:
import pandas as pd
from inflection import underscore

## 1.1 HELPER FUNCTIONS

In [84]:
# Let pandas render up to 100 rows before truncating a displayed frame/series.
pd.options.display.max_rows = 100

## 1.2 LOADING DATA

In [10]:
# Read the two raw CSV files; low_memory=False makes pandas read each file in
# one pass instead of inferring dtypes chunk by chunk.
df_store = pd.read_csv(filepath_or_buffer='./datasets/store.csv', low_memory=False)
df_train = pd.read_csv(filepath_or_buffer='./datasets/train.csv', low_memory=False)

# Left join keyed on 'Store', with the store table on the left-hand side, so the
# store attribute columns come first in the merged frame.
df_raw = pd.merge(left=df_store, right=df_train, how='left', on='Store')

# 2. DATA DESCRIPTION 

## 2.1 RENAME COLUMNS

In [13]:
# Transform the column names into snake_case.
# `old_cols` keeps the original labels, `new_cols` the converted ones.
old_cols = list(df_raw.columns)
new_cols = list(map(underscore, old_cols))
df_raw.columns = new_cols

## 2.2 DATA DIMENSIONS

In [20]:
# Report the size of the merged frame: shape is (number of rows, number of columns).
n_rows, n_cols = df_raw.shape
print(f'Dataset has {n_rows} lines and {n_cols} columns')

Dataset has 1017209 lines and 18 columns


## 2.3 CHECK DTYPES

In [31]:
# Show the dtype pandas holds for each column (bare last expression -> cell output).
# NOTE(review): the saved output lists `date` as datetime64[ns], although the
# explicit datetime conversion appears further down the notebook - this output
# was presumably captured after out-of-order execution; re-check on a fresh run.
df_raw.dtypes

store                                    int64
store_type                              object
assortment                              object
competition_distance                   float64
competition_open_since_month           float64
competition_open_since_year            float64
promo2                                   int64
promo2_since_week                      float64
promo2_since_year                      float64
promo_interval                          object
day_of_week                              int64
date                            datetime64[ns]
sales                                    int64
customers                                int64
open                                     int64
promo                                    int64
state_holiday                           object
school_holiday                           int64
dtype: object

## 2.4 CHECK NA

In [32]:
# Count the missing values in every column (reduce down the rows, one total per column).
df_raw.isna().sum(axis=0)

store                                0
store_type                           0
assortment                           0
competition_distance              2642
competition_open_since_month    323348
competition_open_since_year     323348
promo2                               0
promo2_since_week               508031
promo2_since_year               508031
promo_interval                  508031
day_of_week                          0
date                                 0
sales                                0
customers                            0
open                                 0
promo                                0
state_holiday                        0
school_holiday                       0
dtype: int64

## 2.5 FILL NA VALUES

In [52]:
# Every date-based default below needs `date` as a real datetime column. The CSV
# loader yields strings, so convert here; the call is idempotent and keeps this
# cell runnable on a fresh kernel instead of depending on a later cell.
df_raw['date'] = pd.to_datetime(df_raw['date'])

# competition_distance: replace NA with a very large distance, on the assumption
# that a missing value means there is no competitor nearby.
# NOTE(review): confirm that 20000 really exceeds the largest distance observed
# in the data; otherwise "no competitor" rows look closer than real ones.
df_raw['competition_distance'] = df_raw['competition_distance'].fillna(20000).astype('int64')

# competition_open_since_month: when NA, fall back to the month of the row's own date.
df_raw['competition_open_since_month'] = df_raw['competition_open_since_month'].fillna(df_raw['date'].dt.month)

# competition_open_since_year: when NA, fall back to the year of the row's own date.
df_raw['competition_open_since_year'] = df_raw['competition_open_since_year'].fillna(df_raw['date'].dt.year)

# promo2_since_week: when NA, fall back to the ISO week of the row's own date
# (same value as Timestamp.week). isocalendar() yields a nullable UInt32 column,
# so cast it to float64 to match the column being filled.
df_raw['promo2_since_week'] = df_raw['promo2_since_week'].fillna(df_raw['date'].dt.isocalendar().week.astype('float64'))

# promo2_since_year: when NA, fall back to the year of the row's own date.
df_raw['promo2_since_year'] = df_raw['promo2_since_year'].fillna(df_raw['date'].dt.year)

# promo_interval: 0 marks "no recurring promotion". Assign the result back rather
# than calling fillna(inplace=True) on a column selection, which is deprecated and
# does not update the frame under pandas Copy-on-Write.
month_map = {1: 'Jan', 2:'Feb', 3:'Mar', 4:'Apr', 5:'May', 6:'Jun', 7: 'Jul', 8: 'Aug', 9: 'Sept', 10: 'Oct', 11: 'Nov', 12: 'Dec'}
df_raw['promo_interval'] = df_raw['promo_interval'].fillna(0)
df_raw['month_map'] = df_raw['date'].dt.month.map( month_map )

# is_promo: 1 when the row's month abbreviation occurs in its promo_interval
# string, 0 otherwise (including rows whose promo_interval is the 0 placeholder).
# A single zip pass replaces the row-wise DataFrame.apply with the same result.
df_raw['is_promo'] = [
    0 if interval == 0 else 1 if str( month ) in str( interval ) else 0
    for month, interval in zip( df_raw['month_map'], df_raw['promo_interval'] )
]

## 2.6 CHANGE DTYPES

In [98]:
# Make sure the 'date' column is a datetime column.
df_raw['date'] = pd.to_datetime(df_raw['date'])

# The competition / promo2 "since" columns hold whole numbers but arrive as
# floats; cast each of them to int64.
for since_col in (
    'competition_open_since_month',
    'competition_open_since_year',
    'promo2_since_week',
    'promo2_since_year',
):
    df_raw[since_col] = df_raw[since_col].astype('int64')

## 2.7 DESCRIPTIVE STATISTICS