# 0.0 IMPORTS

In [None]:
import datetime
import math
import pickle
import warnings

import inflection
import matplotlib.pyplot                as plt
import numpy                            as np
import pandas                           as pd
import seaborn                          as sns
import xgboost                          as xgb

from IPython.display       import Image
from scipy                 import stats as ss
from tabulate              import tabulate

from sklearn.linear_model    import LinearRegression
from sklearn.metrics         import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing   import RobustScaler, MinMaxScaler, LabelEncoder

## 0.1 Helper Functions

In [None]:
# Suppress every warning for the rest of the session (this also hides deprecation notices).
warnings.filterwarnings('ignore')

In [None]:
def cramer_v( x, y ):
    """Bias-corrected Cramer's V between two categorical variables.

    The contingency table of `x` against `y` is built with `pd.crosstab` and
    the chi-squared statistic is taken from `ss.chi2_contingency`. The bias
    correction is applied to phi2 = chi2 / n (not to chi2 itself) and to the
    number of rows and columns of the table.

    Parameters
    ----------
    x, y : array-like or Series
        Category labels, one entry per observation, same length.

    Returns
    -------
    float
        The corrected Cramer's V, or `np.nan` when it is undefined: fewer
        than two observations, a variable with a single level, or a
        corrected dimension that collapses to zero.
    """
    cm = pd.crosstab( x, y ).values
    n = cm.sum()
    r, k = cm.shape

    # no association can be measured on a degenerate table
    if n < 2 or min( r, k ) < 2:
        return np.nan

    chi2 = ss.chi2_contingency( cm )[0]

    # the correction term is subtracted from phi2 = chi2/n
    phi2corr = max( 0, chi2/n - (k-1)*(r-1)/(n-1) )
    kcorr = k - (k-1)**2/(n-1)
    rcorr = r - (r-1)**2/(n-1)

    denominator = min( kcorr-1, rcorr-1 )
    if denominator <= 0:
        # happens when a variable has as many levels as there are observations
        return np.nan

    return np.sqrt( phi2corr / denominator )

# Model Evaluation
def ml_error( model_name, y, yhat ):
    """Summarise the prediction errors of one model as a single-row DataFrame.

    Parameters
    ----------
    model_name : label stored in the 'Model Name' column.
    y : observed values.
    yhat : predicted values.

    Returns
    -------
    DataFrame with columns 'Model Name', 'MAE', 'MAPE' and 'RMSE' (index 0).
    """
    metrics = {
        'Model Name': model_name,
        'MAE': mean_absolute_error( y, yhat ),
        'MAPE': mean_absolute_percentage_error( y, yhat ),
        # RMSE is the square root of the mean squared error
        'RMSE': np.sqrt( mean_squared_error( y, yhat ) ),
    }

    return pd.DataFrame( metrics, index=[0] )

# Cross Validation
def cross_validation( x_training, kfold, model_name, model, verbose=False, validation_days=6*7 ):
    """Time-based cross validation over the last `kfold` windows of `x_training`.

    For k = kfold, ..., 1 the validation window runs from
    `max(date) - k*validation_days` to `max(date) - (k-1)*validation_days`
    (both bounds inclusive); every row dated before the window start is used
    for fitting. Errors come from `ml_error`, evaluated on `np.expm1` of the
    target and of the predictions.

    Parameters
    ----------
    x_training : DataFrame
        Features plus the 'date' and 'sales' columns. 'date' must support
        subtraction of a `datetime.timedelta`.
    kfold : int
        Number of folds.
    model_name : str
        Label copied into the 'Model Name' column of the result.
    model : estimator
        Object whose `fit(X, y)` returns the fitted estimator and which
        exposes `predict(X)`.
    verbose : bool
        When true, print the fold number of each iteration.
    validation_days : int
        Length of each validation window in days (default: six weeks).

    Returns
    -------
    DataFrame
        One row with 'Model Name', 'MAE CV', 'MAPE CV' and 'RMSE CV', each
        metric formatted as "<mean> +/- <std>" over the folds.
    """
    mae_list = []
    mape_list = []
    rmse_list = []

    # the last observed date anchors every window; it does not change between folds
    last_date = x_training['date'].max()

    for k in reversed( range( 1, kfold+1 ) ):
        if verbose:
            print('\nKFold Number: ', k)

        # start and end date for validation: the window ends one step after it starts
        validation_start_date = last_date - datetime.timedelta( days=k*validation_days )
        validation_end_date = last_date - datetime.timedelta( days=(k-1)*validation_days )

        # filtering dataset
        training = x_training[x_training['date'] < validation_start_date]
        validation = x_training[(x_training['date'] >= validation_start_date) & (x_training['date'] <= validation_end_date)]

        # training and validation dataset
        xtraining = training.drop(['date', 'sales'], axis=1)
        ytraining = training['sales']

        xvalidation = validation.drop(['date', 'sales'], axis=1)
        yvalidation = validation['sales']

        # model
        m = model.fit(xtraining, ytraining)

        # prediction
        yhat = m.predict(xvalidation)

        # performance
        m_result = ml_error(model_name, np.expm1(yvalidation), np.expm1(yhat))

        # store performance of each kfold iteration as plain scalars
        mae_list.append( m_result['MAE'].iloc[0] )
        mape_list.append( m_result['MAPE'].iloc[0] )
        rmse_list.append( m_result['RMSE'].iloc[0] )

    return pd.DataFrame( {
            'Model Name': model_name,
            'MAE CV': np.round( np.mean(mae_list), 2 ).astype(str) + " +/- " + np.round( np.std(mae_list), 2 ).astype(str),
            'MAPE CV': np.round( np.mean(mape_list), 2 ).astype(str) + " +/- " + np.round( np.std(mape_list), 2 ).astype(str),
            'RMSE CV': np.round( np.mean(rmse_list), 2 ).astype(str) + " +/- " + np.round( np.std(rmse_list), 2 ).astype(str)
        }, index=[0] )


def mean_percentage_error( y, yhat ):
    """Return the mean of the signed relative errors (y - yhat) / y."""
    relative_error = (y - yhat) / y
    return np.mean( relative_error )

## 0.2 Loading Data

In [None]:
# Folder and file name of the raw dataset; fill in file_data before loading.
dir_data = "../data/raw/"
file_data = ""

# Uncomment exactly one of the loaders below so that df_raw exists for the next sections.

# CSV
# df_raw = pd.read_csv(dir_data + file_data, low_memory=False)

# EXCEL (low_memory is a read_csv option, so it is not passed here)
# df_raw = pd.read_excel(dir_data + file_data)

# PARQUET (low_memory is a read_csv option, so it is not passed here)
# df_raw = pd.read_parquet(dir_data + file_data)

In [None]:
# merge (if necessary)
# df_raw = pd.merge( df_raw1, df_raw2, how='left', on='<key>' )

# 1.0 DATA DESCRIPTIONS

In [None]:
# Work on a copy of the raw data. NOTE(review): df_raw only exists once one of the
# loaders in section 0.2 has been uncommented.
df1 = df_raw.copy()

## 1.1 Rename Columns

In [None]:
# original column names, in dataframe order (fill in before running)
cols_old = []

# inflection.underscore converts a name to snake_case
snakecase = inflection.underscore

cols_new = [snakecase( col ) for col in cols_old]

# rename
df1.columns = cols_new

## 1.2 Data Dimensions

In [None]:
# shape is (rows, columns)
print( f'Number of Rows: {df1.shape[0]}' )
print( f'Number of Columns: {df1.shape[1]}' )

## 1.3 Data Types

In [None]:
# dtype of every column
df1.dtypes

## 1.4 Check NA

In [None]:
# number of missing values per column
df1.isna().sum()

## 1.5 Fillout NA

In [None]:
# FLOAT or INT: replace missing values with a constant
df1['<var>'] = df1['<var>'].fillna( 200000.0 )

# BINARY (TRUE or FALSE): the lambda receives a whole row, so it is applied on the
# dataframe with axis=1; anything different from 0 becomes 1
df1['<var>'] = df1.apply( lambda x: 0 if x['<var>'] == 0 else 1, axis=1 )

# maybe useful: month number -> abbreviated month name
month_map = {1: 'Jan', 2: 'Feb', 3: 'Mar', 4: 'Apr', 5: 'May', 6: 'Jun', 7: 'Jul', 8: 'Aug', 9: 'Sept', 10: 'Oct', 11: 'Nov', 12: 'Dec'}
df1['<var>'] = df1['date'].dt.month.map( month_map )

## 1.6 Check Types

In [None]:
# Casting templates: keep the line matching the target dtype and replace '<var>'
# with the column name.

# INT
df1['<var>'] = df1['<var>'].astype(int)

# FLOAT
df1['<var>'] = df1['<var>'].astype(float)

# STR
df1['<var>'] = df1['<var>'].astype(str)

# DATETIME
df1['<var>'] = pd.to_datetime(df1['<var>'])

## 1.7 Descriptive Statistics

In [None]:
# int64/float64 columns are numerical; everything else except datetimes is categorical
numeric_dtypes = ['int64', 'float64']

num_attributes = df1.select_dtypes( include=numeric_dtypes )
cat_attributes = df1.select_dtypes( exclude=numeric_dtypes + ['datetime64[ns]'] )

### 1.7.1 Numerical Attributes

In [None]:
# One row per numerical attribute with its central tendency (mean, median) and
# dispersion (std, min, max, skew, kurtosis); the index is exposed as 'attributes'.
num_attributes.agg(["mean","median","std","min","max","skew","kurtosis"]).T.reset_index().rename(columns={'index': 'attributes'})

### 1.7.2 Categorical Attributes

In [None]:
# number of distinct values in each categorical column
cat_attributes.apply( lambda x: x.unique().shape[0] )

# 2.0 FEATURE ENGINEERING

In [None]:
# work on a copy so the dataframe of the previous section stays untouched
df2 = df1.copy()

## 2.1 - Mapa Mental de Hipoteses

In [None]:
# Folder and file name of the hypothesis mind-map image (fill in before running).
# The folder variable is not called `dir`, which would shadow the builtin.
dir_img = ''
img = ''
Image( dir_img + img )

## 2.2 - Criacao das Hipoteses

### 2.2.1 - Hipoteses #1

**1.** 

**2.** 


## 2.3 - Lista Final de Hipoteses

**1.** 

**2.** 


## 2.4 - Feature Engineering

In [None]:
# year
df2['year'] = df2['date'].dt.year

# month
df2['month'] = df2['date'].dt.month

# day
df2['day'] = df2['date'].dt.day

# week of year: Series.dt.weekofyear no longer exists in current pandas, so the ISO
# week is read from isocalendar() and cast back to a plain integer column
df2['week_of_year'] = df2['date'].dt.isocalendar().week.astype( int )

# year week
df2['year_week'] = df2['date'].dt.strftime( '%Y-%W' )

# categories
df2['<var>'] = df2['<var>'].apply( lambda x: 'example1' if x == 'a' else 'example2' if x == 'b' else 'example3' )

In [None]:
# first rows, transposed so that every column shows up as a row
df2.head().T

# 3.0 DATA FILTERING

In [None]:
# work on a copy so the dataframe of the previous section stays untouched
df3 = df2.copy()

## 3.1 - Filtragem das Linhas

## 3.2 - Selecao das Colunas

In [None]:
# columns to discard (fill in as needed)
cols_drop = []
df3 = df3.drop( columns=cols_drop )

# 4.0 EXPLORATORY DATA ANALYSIS

In [None]:
# work on a copy so the dataframe of the previous section stays untouched
df4 = df3.copy()

## 4.1 - Univariate Analysis

### 4.1.1 - Response Variable

In [None]:
# EXAMPLE 1
# plt.figure(figsize=(15,8))
# sns.histplot( df4['<var>'] )

### 4.1.2 - Numerical Variable

In [None]:
# One histogram per numerical attribute.
# NOTE(review): num_attributes was selected from df1 in section 1.7, so columns
# created afterwards are not part of it.
num_attributes.hist(figsize=(20,12), bins=25);

### 4.1.3 - Categorical Variable

## 4.2 - Bivariate Analysis

### H1 - Example
**FALSE** or **TRUE** - Explanation

## 4.3 - Multivariate Analysis

### 4.3.1 - Numerical Attributes

In [None]:
# canvas first, then the Pearson correlation matrix drawn on it
plt.figure( figsize=(16, 10) )

correlation = num_attributes.corr( method='pearson' )
sns.heatmap( correlation, annot=True );

### 4.3.2 - Categorical Attributes

In [None]:
# keep only the object-typed (categorical) columns and display them
a = df4.select_dtypes( include='object' )
a

In [None]:
# only categorical data
a = df4.select_dtypes( include='object' )

# cramer_v measures the association between two columns, so it takes one pair at a time
cramer_v( a['<col1>'], a['<col2>'] )


## 4.4 - Resumo das Hipoteses

In [None]:
# header row followed by one row per hypothesis
header = ['Hipoteses', 'Conclusao', 'Relevancia']
rows = [
    ['H1', 'Falsa', 'Baixa'],
    ['H2', 'Falsa', 'Media'],
    ['H3', 'Verdadeira', 'Alta'],
]
tab = [header] + rows
print( tabulate( tab, headers='firstrow' ) )

# 5.0 DATA PREPARATION

In [None]:
# work on a copy so the dataframe of the previous section stays untouched
df5 = df4.copy()

## 5.1 - Normalizacao

## 5.2 - Rescaling

In [None]:
# boxplot of the variable about to be rescaled
plt.figure(figsize=(16,10))
sns.boxplot(y=df5['<var>']);

In [None]:
# folder where the fitted scalers are stored
dir_parameter = 'src/parameter/'

# Example RobustScaler
rs = RobustScaler()
df5['<var>'] = rs.fit_transform( df5[['<var>']].values )
# the context manager closes the file even if pickling fails
with open( dir_parameter + '<archive>.pkl', 'wb' ) as scaler_file:
    pickle.dump( rs, scaler_file )

# Example MinMaxScaler
mms = MinMaxScaler()
df5['<var>'] = mms.fit_transform( df5[['<var>']].values )
with open( dir_parameter + '<archive>.pkl', 'wb' ) as scaler_file:
    pickle.dump( mms, scaler_file )

## 5.3 - Transformacao

### 5.3.1 - Encoding

In [None]:
# folder where the fitted encoders are stored
dir_parameter = 'src/parameter/'

# Example One Hot Encoding
df5 = pd.get_dummies( df5, prefix=['<var>'], columns=['<var>'] )

# Example Label Encoding
le = LabelEncoder()
df5['<var>'] = le.fit_transform( df5['<var>'] )
# the context manager closes the file even if pickling fails
with open( dir_parameter + '<archive>.pkl', 'wb' ) as encoder_file:
    pickle.dump( le, encoder_file )

# Example Ordinal Encoding: categories are replaced by their rank
assortment_dict = {'basic': 1, 'extra': 2, 'extented': 3}
df5['<var>'] = df5['<var>'].map( assortment_dict )

### 5.3.2 - Response Variable Transformation

In [None]:
# Example: log1p compresses the right tail of the response variable
df5['<var>'] = np.log1p( df5['<var>'] )

# sns.distplot is deprecated; histplot with a KDE on the density scale is its replacement
plt.figure( figsize=(16,10) )
sns.histplot( df5['<var>'], kde=True, stat='density' )

### 5.3.3 - Nature Transformation

In [None]:
# Cyclical encoding: each periodic feature is projected on the unit circle through
# a sine and a cosine column, using the period listed next to it.
cyclical_periods = [
    ('day_of_week', 7),
    ('month', 12),
    ('day', 30),
    ('week_of_year', 52),
]

for feature, period in cyclical_periods:
    angle = df5[feature] * ( 2. * np.pi/period )
    df5[feature + '_sin'] = np.sin( angle )
    df5[feature + '_cos'] = np.cos( angle )

# 6.0 FEATURE SELECTION

In [None]:
# work on a copy so the dataframe of the previous section stays untouched
df6 = df5.copy()

# columns to discard before modelling (fill in as needed)
cols_drop = []
df6 = df6.drop( columns=cols_drop )

## 6.1 - Split Dataframe into training and test dataset

In [None]:
# features (everything except the target column) and target
X = df6.drop('<var>', axis=1)
Y = df6['<var>']

# 20% of the rows are held out for testing; random_state fixes the shuffle.
# NOTE(review): this split is random, while the cross_validation helper splits by
# date — confirm a random hold-out is intended for this dataset.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2 ,random_state=42)

# 7.0 MACHINE LEARNING MODELLING

## 7.1 Example Model

In [None]:
# model: create the estimator, then fit it on the training split
lr = LinearRegression()
lr.fit( x_train, y_train )

# prediction
yhat_lr = lr.predict( x_test )

# performance, measured after undoing the log1p transformation with expm1
lr_result = ml_error(
    'Linear Regression',
    np.expm1( y_test ),
    np.expm1( yhat_lr ),
)
lr_result

### 7.1.1 Example Model - Cross Validation

In [None]:
# NOTE(review): `x_training` is not defined in the visible part of this notebook. The
# cross_validation helper reads its 'date' and 'sales' columns, so it has to be a
# dataframe that still contains both; build it before running this cell.
lr_result_cv = cross_validation( x_training, 5, 'Linear Regression', lr )
lr_result_cv

# 8.0 HYPERPARAMETER FINE TUNING

# 9.0 ERROR TRANSLATION AND INTERPRETATION

## 9.1 Business Performance

## 9.2 Total Performance

## 9.3 Machine Learning Performance