# Importando Bibliotecas

In [1]:
import pandas as pd
pd.set_option('display.float_format', lambda x: '%.5f' % x)

import numpy as np
import scipy

import plotly.graph_objects as go
import matplotlib.pyplot as plt

from sklearn import metrics
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RepeatedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import AdaBoostRegressor, ExtraTreesRegressor, GradientBoostingRegressor, RandomForestRegressor, HistGradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR

from mlxtend.feature_selection import SequentialFeatureSelector

from lightgbm import LGBMRegressor

from catboost import CatBoostRegressor

from prophet import Prophet
from prophet.diagnostics import cross_validation, performance_metrics

import itertools

import warnings
warnings.filterwarnings('ignore')

# Funções

In [2]:
def shapiro_test(data, alpha=0.05):
    stat, p = scipy.stats.shapiro(data)
    print('p-valor: p=%.3f' % (p))
    alpha = alpha
    if p > alpha:
        print('A amostra parece ser normalmente distribuída Gaussian. Não rejeitamos H0')
    else:
        print('A amostra NÃO parece ser normalmente distribuiída Gaussian. Rejeitamos H0')
    return stat, p

# Lendo os Dados

In [3]:
df = pd.read_csv('../Dados/df_final.csv')
df.head()

Unnamed: 0,data,vendas,vendas_Outra_Faculdade,vendas_PUCRIO,vendas_UFABC,vendas_UFF,vendas_UFMG,vendas_UFPE,vendas_UFRJ,vendas_UFSC,...,vendas_Outra_Cidade,vendas_Porto_Alegre,vendas_Recife,vendas_Rio_de_Janeiro,vendas_Salvador,vendas_Santo_Andre,vendas_Sao_Carlos,vendas_Sao_Paulo,trend,seasonal
0,2018-01-01,1819.0,618.0,100.0,17.0,194.0,34.0,71.0,407.0,23.0,...,587.0,11.0,51.0,521.0,25.0,9.0,14.0,187.0,,-2401.78443
1,2018-02-01,2012.0,652.0,98.0,22.0,199.0,34.0,85.0,492.0,28.0,...,654.0,11.0,62.0,592.0,23.0,13.0,14.0,205.0,,-1835.22888
2,2018-03-01,4035.0,926.0,229.0,16.0,357.0,46.0,163.0,1473.0,30.0,...,1103.0,14.0,123.0,1544.0,22.0,12.0,29.0,282.0,,935.86834
3,2018-04-01,4305.0,1204.0,189.0,186.0,512.0,96.0,134.0,1098.0,151.0,...,1299.0,21.0,105.0,1300.0,38.0,40.0,27.0,359.0,,670.7989
4,2018-05-01,3956.0,1462.0,176.0,130.0,349.0,212.0,158.0,721.0,122.0,...,1250.0,20.0,110.0,1065.0,41.0,39.0,31.0,300.0,,267.66001


---
# Time Series

## Prophet

In [4]:
df_prophet = df[['data', 'vendas']]
df_prophet.rename(columns={
    'data': 'ds',
    'vendas': 'y'
}, inplace=True)

In [5]:
model = Prophet()
model.fit(df_prophet)

INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.


<prophet.forecaster.Prophet at 0x27df34958b0>

In [6]:
forecast = model.predict(df_prophet)
forecast.head()

Unnamed: 0,ds,trend,yhat_lower,yhat_upper,trend_lower,trend_upper,additive_terms,additive_terms_lower,additive_terms_upper,yearly,yearly_lower,yearly_upper,multiplicative_terms,multiplicative_terms_lower,multiplicative_terms_upper,yhat
0,2018-01-01,3032.78846,-366.12326,1888.12984,3032.78846,3032.78846,-2302.05514,-2302.05514,-2302.05514,-2302.05514,-2302.05514,-2302.05514,0.0,0.0,0.0,730.73332
1,2018-02-01,3198.9746,516.15894,2761.43339,3198.9746,3198.9746,-1617.85773,-1617.85773,-1617.85773,-1617.85773,-1617.85773,-1617.85773,0.0,0.0,0.0,1581.11687
2,2018-03-01,3349.07821,2561.35247,4802.79365,3349.07821,3349.07821,342.37187,342.37187,342.37187,342.37187,342.37187,342.37187,0.0,0.0,0.0,3691.45008
3,2018-04-01,3515.26435,2896.38819,5114.67313,3515.26435,3515.26435,491.2571,491.2571,491.2571,491.2571,491.2571,491.2571,0.0,0.0,0.0,4006.52145
4,2018-05-01,3676.08965,3211.65501,5543.9298,3676.08965,3676.08965,722.67142,722.67142,722.67142,722.67142,722.67142,722.67142,0.0,0.0,0.0,4398.76107


In [7]:
fig = go.Figure(data=[
    go.Scatter(name='Previsto',x=forecast['ds'], y=forecast['yhat'], marker={'color': 'orange'}),
    go.Scatter(name='Real', x=df_prophet['ds'], y=df_prophet['y'], marker={'color': 'gray'})
    ])
fig.update_layout(font=dict(color='black'), title_text='Real vs. Previsto (Prophet)')
fig.show()

In [8]:
mse = metrics.mean_squared_error(df_prophet['y'], forecast['yhat'])
rmse = metrics.mean_squared_error(df_prophet['y'], forecast['yhat'], squared=False)
mad = metrics.mean_absolute_error(df_prophet['y'], forecast['yhat'])
mape = metrics.mean_absolute_percentage_error(df_prophet['y'], forecast['yhat'])

print(f'''
      EQM: {round(mse,2)}
      REQM: {round(rmse,2)}
      MAD: {round(mad,2)}
      MAPE: {round(mape*100,2)}%
      ''')


      EQM: 764752.27
      REQM: 874.5
      MAD: 696.18
      MAPE: 10.57%
      


### Tuning

In [9]:
param_grid = {  
    'changepoint_prior_scale': [0.8, 1.0],
    'seasonality_prior_scale': [0.8, 0.9, 1.0],
    'seasonality_mode': ['additive', 'multiplicative']
}

In [10]:
all_params = [dict(zip(param_grid.keys(), v)) for v in itertools.product(*param_grid.values())]

In [11]:
mape = []

for params in all_params:
    m = Prophet(**params).fit(df_prophet) 
    df_cv = cross_validation(m, initial='1460 days', horizon='60 days', period='30 days', parallel="processes")
    df_p = performance_metrics(df_cv, rolling_window=1)
    mape.append(df_p['mape'].values[0])

INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
INFO:prophet:Making 2 forecasts with cutoffs between 2022-01-01 00:00:00 and 2022-01-31 00:00:00
INFO:prophet:Applying in parallel with <concurrent.futures.process.ProcessPoolExecutor object at 0x0000027DFB7B9E80>
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
INFO:prophet:Making 2 forecasts with cutoffs between 2022-01-01 00:00:00 and 2022-01-31 00:00:00
INFO:prophet:Applying in parallel with <concurrent.futures.process.ProcessPoolExecutor object at 0x0000027DFB4DB340>
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_s

In [12]:
tuning_results = pd.DataFrame(all_params)
tuning_results['mape'] = mape
tuning_results

Unnamed: 0,changepoint_prior_scale,seasonality_prior_scale,seasonality_mode,mape
0,0.8,0.8,additive,0.08027
1,0.8,0.8,multiplicative,0.35809
2,0.8,0.9,additive,0.07317
3,0.8,0.9,multiplicative,0.34797
4,0.8,1.0,additive,0.08055
5,0.8,1.0,multiplicative,0.36011
6,1.0,0.8,additive,0.03692
7,1.0,0.8,multiplicative,0.24479
8,1.0,0.9,additive,0.03703
9,1.0,0.9,multiplicative,0.23986


In [13]:
best_params = all_params[np.argmin(mape)]
print(np.min(mape))
print(best_params)

0.0369151211521806
{'changepoint_prior_scale': 1.0, 'seasonality_prior_scale': 0.8, 'seasonality_mode': 'additive'}


In [14]:
model = Prophet(
    seasonality_mode=best_params['seasonality_mode'],
    seasonality_prior_scale=best_params['seasonality_prior_scale'],
    changepoint_prior_scale=best_params['changepoint_prior_scale'])

In [15]:
model.fit(df_prophet)

INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.


<prophet.forecaster.Prophet at 0x27dfb4a03a0>

### Performance

In [16]:
forecast = model.predict(df_prophet)

In [17]:
fig = go.Figure(data=[
    go.Scatter(name='Previsto',x=forecast['ds'], y=forecast['yhat'], marker={'color': 'orange'}),
    go.Scatter(name='Real', x=df_prophet['ds'], y=df_prophet['y'], marker={'color': 'gray'})
    ])
fig.update_layout(font=dict(color='black'), title_text='Real vs. Previsto (Prophet)')
fig.show()

In [18]:
mse = metrics.mean_squared_error(df_prophet['y'], forecast['yhat'])
rmse = metrics.mean_squared_error(df_prophet['y'], forecast['yhat'], squared=False)
mad = metrics.mean_absolute_error(df_prophet['y'], forecast['yhat'])
mape = metrics.mean_absolute_percentage_error(df_prophet['y'], forecast['yhat'])

print(f'''
      EQM: {round(mse,2)}
      REQM: {round(rmse,2)}
      MAD: {round(mad,2)}
      MAPE: {round(mape*100,2)}%
      ''')


      EQM: 47189.23
      REQM: 217.23
      MAD: 160.92
      MAPE: 2.76%
      


---
# Regressões

## Feature Engineering

### Criando LAGs

In [19]:
df_r = df.copy().drop(['vendas_Outra_Faculdade', 'vendas_Outra_Cidade', 'trend'], axis=1)
df_r.dropna(axis=0, inplace=True)
df_r.head()

Unnamed: 0,data,vendas,vendas_PUCRIO,vendas_UFABC,vendas_UFF,vendas_UFMG,vendas_UFPE,vendas_UFRJ,vendas_UFSC,vendas_UNICAMP,...,vendas_Natal,vendas_Niteroi,vendas_Porto_Alegre,vendas_Recife,vendas_Rio_de_Janeiro,vendas_Salvador,vendas_Santo_Andre,vendas_Sao_Carlos,vendas_Sao_Paulo,seasonal
0,2018-01-01,1819.0,100.0,17.0,194.0,34.0,71.0,407.0,23.0,200.0,...,3.0,79.0,11.0,51.0,521.0,25.0,9.0,14.0,187.0,-2401.78443
1,2018-02-01,2012.0,98.0,22.0,199.0,34.0,85.0,492.0,28.0,219.0,...,2.0,81.0,11.0,62.0,592.0,23.0,13.0,14.0,205.0,-1835.22888
2,2018-03-01,4035.0,229.0,16.0,357.0,46.0,163.0,1473.0,30.0,577.0,...,9.0,171.0,14.0,123.0,1544.0,22.0,12.0,29.0,282.0,935.86834
3,2018-04-01,4305.0,189.0,186.0,512.0,96.0,134.0,1098.0,151.0,434.0,...,4.0,232.0,21.0,105.0,1300.0,38.0,40.0,27.0,359.0,670.7989
4,2018-05-01,3956.0,176.0,130.0,349.0,212.0,158.0,721.0,122.0,376.0,...,8.0,155.0,20.0,110.0,1065.0,41.0,39.0,31.0,300.0,267.66001


In [20]:
lag_columns = ['vendas_PUCRIO', 'vendas_UFABC', 'vendas_UFF', 'vendas_UFMG', 'vendas_UFPE', 'vendas_UFRJ', 'vendas_UFSC', 'vendas_UNICAMP', 'vendas_USP', 'vendas_UTFPR', 'vendas_Belo_Horizonte',
       'vendas_Brasilia', 'vendas_Campinas', 'vendas_Campo_Grande',
       'vendas_Curitiba', 'vendas_Florianopolis', 'vendas_Fortaleza',
       'vendas_Goiania', 'vendas_Joinville', 'vendas_Juiz_de_Fora',
       'vendas_Manaus', 'vendas_Natal', 'vendas_Niteroi',
       'vendas_Porto_Alegre', 'vendas_Recife', 'vendas_Rio_de_Janeiro',
       'vendas_Salvador', 'vendas_Santo_Andre', 'vendas_Sao_Carlos',
       'vendas_Sao_Paulo']

In [21]:
for column in lag_columns:
    for lag in range(1, 4):
        novo_nome = f'{column}_LAG_{lag}'
        df_r[novo_nome] = df_r[column].shift(periods=lag, fill_value=0)

In [22]:
df_r['seasonal_LAG_12'] = df_r['seasonal'].shift(periods=12, fill_value=0)
df_r['vendas_LAG_12'] = df_r['vendas'].shift(periods=12, fill_value=0)

In [23]:
df_r.drop(lag_columns, axis=1, inplace=True)
df_r.drop('seasonal', axis=1, inplace=True)
df_r.head()

Unnamed: 0,data,vendas,vendas_PUCRIO_LAG_1,vendas_PUCRIO_LAG_2,vendas_PUCRIO_LAG_3,vendas_UFABC_LAG_1,vendas_UFABC_LAG_2,vendas_UFABC_LAG_3,vendas_UFF_LAG_1,vendas_UFF_LAG_2,...,vendas_Santo_Andre_LAG_2,vendas_Santo_Andre_LAG_3,vendas_Sao_Carlos_LAG_1,vendas_Sao_Carlos_LAG_2,vendas_Sao_Carlos_LAG_3,vendas_Sao_Paulo_LAG_1,vendas_Sao_Paulo_LAG_2,vendas_Sao_Paulo_LAG_3,seasonal_LAG_12,vendas_LAG_12
0,2018-01-01,1819.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2018-02-01,2012.0,100.0,0.0,0.0,17.0,0.0,0.0,194.0,0.0,...,0.0,0.0,14.0,0.0,0.0,187.0,0.0,0.0,0.0,0.0
2,2018-03-01,4035.0,98.0,100.0,0.0,22.0,17.0,0.0,199.0,194.0,...,9.0,0.0,14.0,14.0,0.0,205.0,187.0,0.0,0.0,0.0
3,2018-04-01,4305.0,229.0,98.0,100.0,16.0,22.0,17.0,357.0,199.0,...,13.0,9.0,29.0,14.0,14.0,282.0,205.0,187.0,0.0,0.0
4,2018-05-01,3956.0,189.0,229.0,98.0,186.0,16.0,22.0,512.0,357.0,...,12.0,13.0,27.0,29.0,14.0,359.0,282.0,205.0,0.0,0.0


### Divisão entre Treino e Teste

In [24]:
X_r = df_r.drop(['data', 'vendas'], axis=1)
y_r = df_r['vendas']

In [25]:
X_r_treino, X_r_teste, y_r_treino, y_r_teste = train_test_split(X_r, y_r, train_size=0.65, random_state=123)

## Modelos

In [26]:
modelos = [LinearRegression(), Ridge(), DecisionTreeRegressor(), SVR(), AdaBoostRegressor(), ExtraTreesRegressor(), GradientBoostingRegressor(), RandomForestRegressor(), HistGradientBoostingRegressor(), LGBMRegressor(), CatBoostRegressor(verbose=False)]

In [27]:
modelos_score = []

for modelo in modelos:
    cv = cross_val_score(modelo, X_r_treino, y_r_treino, scoring='neg_mean_absolute_percentage_error')
    modelos_score.append({'modelo': modelo, 'mape': cv.mean()*(-100)})

In [28]:
modelos_score = pd.DataFrame(modelos_score)
modelos_score.sort_values(by='mape', ascending=True, inplace=True)
modelos_score

Unnamed: 0,modelo,mape
1,Ridge(),9.24343
0,LinearRegression(),9.24356
5,ExtraTreesRegressor(),14.63388
7,RandomForestRegressor(),15.74659
4,AdaBoostRegressor(),15.9355
10,<catboost.core.CatBoostRegressor object at 0x0...,16.04957
6,GradientBoostingRegressor(),16.52352
2,DecisionTreeRegressor(),20.0589
8,HistGradientBoostingRegressor(),35.51489
9,LGBMRegressor(),35.51489


### Linear Regression

In [29]:
linreg = LinearRegression()

linreg.fit(X_r_treino, y_r_treino)

##### Teste

In [30]:
linreg_pred_teste = linreg.predict(X_r_teste)

r2 = metrics.r2_score(y_r_teste, linreg_pred_teste)
mse = metrics.mean_squared_error(y_r_teste, linreg_pred_teste)
rmse = metrics.mean_squared_error(y_r_teste, linreg_pred_teste, squared=False)
mad = metrics.mean_absolute_error(y_r_teste, linreg_pred_teste)
mape = metrics.mean_absolute_percentage_error(y_r_teste, linreg_pred_teste)

print(f'''
      Linear Regression - Teste
      R2: {round(r2,2)}
      EQM: {round(mse,2)}
      REQM: {round(rmse,2)}
      MAD: {round(mad,2)}
      MAPE: {round(mape*100,2)}%
      ''')


      Linear Regression - Teste
      R2: 0.66
      EQM: 2199636.55
      REQM: 1483.12
      MAD: 1053.04
      MAPE: 32.27%
      


##### Série Completa

In [31]:
linreg_pred = linreg.predict(X_r)

r2 = metrics.r2_score(y_r, linreg_pred)
mse = metrics.mean_squared_error(y_r, linreg_pred)
rmse = metrics.mean_squared_error(y_r, linreg_pred, squared=False)
mad = metrics.mean_absolute_error(y_r, linreg_pred)
mape = metrics.mean_absolute_percentage_error(y_r, linreg_pred)

print(f'''
      Linear Regression - Série Completa
      R2: {round(r2,2)}
      EQM: {round(mse,2)}
      REQM: {round(rmse,2)}
      MAD: {round(mad,2)}
      MAPE: {round(mape*100,2)}%
      ''')


      Linear Regression - Série Completa
      R2: 0.91
      EQM: 803713.35
      REQM: 896.5
      MAD: 384.76
      MAPE: 11.79%
      


### Ridge

In [32]:
ridge = Ridge()

ridge.fit(X_r_treino, y_r_treino)

##### Teste

In [33]:
ridge_pred_teste = ridge.predict(X_r_teste)

r2 = metrics.r2_score(y_r_teste, ridge_pred_teste)
mse = metrics.mean_squared_error(y_r_teste, ridge_pred_teste)
rmse = metrics.mean_squared_error(y_r_teste, ridge_pred_teste, squared=False)
mad = metrics.mean_absolute_error(y_r_teste, ridge_pred_teste)
mape = metrics.mean_absolute_percentage_error(y_r_teste, ridge_pred_teste)

print(f'''
      Ridge Regression - Teste
      R2: {round(r2,2)}
      EQM: {round(mse,2)}
      REQM: {round(rmse,2)}
      MAD: {round(mad,2)}
      MAPE: {round(mape*100,2)}%
      ''')


      Ridge Regression - Teste
      R2: 0.66
      EQM: 2199509.41
      REQM: 1483.07
      MAD: 1053.02
      MAPE: 32.27%
      


##### Série Completa

In [34]:
ridge_pred = ridge.predict(X_r)

r2 = metrics.r2_score(y_r, ridge_pred)
mse = metrics.mean_squared_error(y_r, ridge_pred)
rmse = metrics.mean_squared_error(y_r, ridge_pred, squared=False)
mad = metrics.mean_absolute_error(y_r, ridge_pred)
mape = metrics.mean_absolute_percentage_error(y_r, ridge_pred)

print(f'''
      Ridge Regression - Série Completa
      R2: {round(r2,2)}
      EQM: {round(mse,2)}
      REQM: {round(rmse,2)}
      MAD: {round(mad,2)}
      MAPE: {round(mape*100,2)}%
      ''')


      Ridge Regression - Série Completa
      R2: 0.91
      EQM: 803666.9
      REQM: 896.47
      MAD: 384.77
      MAPE: 11.79%
      


### Decision Tree Regressor

In [35]:
dtreg = DecisionTreeRegressor()

dtreg.fit(X_r_treino, y_r_treino)

##### Teste

In [36]:
dtreg_pred_teste = dtreg.predict(X_r_teste)

r2 = metrics.r2_score(y_r_teste, dtreg_pred_teste)
mse = metrics.mean_squared_error(y_r_teste, dtreg_pred_teste)
rmse = metrics.mean_squared_error(y_r_teste, dtreg_pred_teste, squared=False)
mad = metrics.mean_absolute_error(y_r_teste, dtreg_pred_teste)
mape = metrics.mean_absolute_percentage_error(y_r_teste, dtreg_pred_teste)

print(f'''
      Decision Tree Regressor - Teste
      R2: {round(r2,2)}
      EQM: {round(mse,2)}
      REQM: {round(rmse,2)}
      MAD: {round(mad,2)}
      MAPE: {round(mape*100,2)}%
      ''')


      Decision Tree Regressor - Teste
      R2: 0.48
      EQM: 3384089.58
      REQM: 1839.59
      MAD: 1560.11
      MAPE: 38.73%
      


##### Série Completa

In [37]:
dtreg_pred = dtreg.predict(X_r)

r2 = metrics.r2_score(y_r, dtreg_pred)
mse = metrics.mean_squared_error(y_r, dtreg_pred)
rmse = metrics.mean_squared_error(y_r, dtreg_pred, squared=False)
mad = metrics.mean_absolute_error(y_r, dtreg_pred)
mape = metrics.mean_absolute_percentage_error(y_r, dtreg_pred)

print(f'''
      Decision Tree Regressor - Série Completa
      R2: {round(r2,2)}
      EQM: {round(mse,2)}
      REQM: {round(rmse,2)}
      MAD: {round(mad,2)}
      MAPE: {round(mape*100,2)}%
      ''')


      Decision Tree Regressor - Série Completa
      R2: 0.86
      EQM: 1236494.27
      REQM: 1111.98
      MAD: 570.04
      MAPE: 14.15%
      


### SVR

In [38]:
svr = SVR()

svr.fit(X_r_treino, y_r_treino)

##### Teste

In [39]:
svr_pred_teste = svr.predict(X_r_teste)

r2 = metrics.r2_score(y_r_teste, svr_pred_teste)
mse = metrics.mean_squared_error(y_r_teste, svr_pred_teste)
rmse = metrics.mean_squared_error(y_r_teste, svr_pred_teste, squared=False)
mad = metrics.mean_absolute_error(y_r_teste, svr_pred_teste)
mape = metrics.mean_absolute_percentage_error(y_r_teste, svr_pred_teste)

print(f'''
      SVR - Teste
      R2: {round(r2,2)}
      EQM: {round(mse,2)}
      REQM: {round(rmse,2)}
      MAD: {round(mad,2)}
      MAPE: {round(mape*100,2)}%
      ''')


      SVR - Teste
      R2: -0.94
      EQM: 12687724.8
      REQM: 3561.98
      MAD: 3018.04
      MAPE: 91.69%
      


##### Série Completa

In [40]:
svr_pred = svr.predict(X_r)

r2 = metrics.r2_score(y_r, svr_pred)
mse = metrics.mean_squared_error(y_r, svr_pred)
rmse = metrics.mean_squared_error(y_r, svr_pred, squared=False)
mad = metrics.mean_absolute_error(y_r, svr_pred)
mape = metrics.mean_absolute_percentage_error(y_r, svr_pred)

print(f'''
      SVR - Série Completa
      R2: {round(r2,2)}
      EQM: {round(mse,2)}
      REQM: {round(rmse,2)}
      MAD: {round(mad,2)}
      MAPE: {round(mape*100,2)}%
      ''')


      SVR - Série Completa
      R2: -0.06
      EQM: 9145401.07
      REQM: 3024.14
      MAD: 2608.6
      MAPE: 54.73%
      


### AdaBoost

In [41]:
adaboost = AdaBoostRegressor()

adaboost.fit(X_r_treino, y_r_treino)

##### Teste

In [42]:
adaboost_pred_teste = adaboost.predict(X_r_teste)

r2 = metrics.r2_score(y_r_teste, adaboost_pred_teste)
mse = metrics.mean_squared_error(y_r_teste, adaboost_pred_teste)
rmse = metrics.mean_squared_error(y_r_teste, adaboost_pred_teste, squared=False)
mad = metrics.mean_absolute_error(y_r_teste, adaboost_pred_teste)
mape = metrics.mean_absolute_percentage_error(y_r_teste, adaboost_pred_teste)

print(f'''
      AdaBoost - Teste
      R2: {round(r2,2)}
      EQM: {round(mse,2)}
      REQM: {round(rmse,2)}
      MAD: {round(mad,2)}
      MAPE: {round(mape*100,2)}%
      ''')


      AdaBoost - Teste
      R2: 0.38
      EQM: 4047613.75
      REQM: 2011.87
      MAD: 1803.23
      MAPE: 46.02%
      


##### Série Completa

In [43]:
adaboost_pred = adaboost.predict(X_r)

r2 = metrics.r2_score(y_r, adaboost_pred)
mse = metrics.mean_squared_error(y_r, adaboost_pred)
rmse = metrics.mean_squared_error(y_r, adaboost_pred, squared=False)
mad = metrics.mean_absolute_error(y_r, adaboost_pred)
mape = metrics.mean_absolute_percentage_error(y_r, adaboost_pred)

print(f'''
      AdaBoost - Série Completa
      R2: {round(r2,2)}
      EQM: {round(mse,2)}
      REQM: {round(rmse,2)}
      MAD: {round(mad,2)}
      MAPE: {round(mape*100,2)}%
      ''')


      AdaBoost - Série Completa
      R2: 0.82
      EQM: 1524773.05
      REQM: 1234.82
      MAD: 775.05
      MAPE: 18.56%
      


### Extra Trees Regressor

In [44]:
et_reg = ExtraTreesRegressor()

et_reg.fit(X_r_treino, y_r_treino)

##### Teste

In [45]:
et_reg_pred_teste = et_reg.predict(X_r_teste)

r2 = metrics.r2_score(y_r_teste, et_reg_pred_teste)
mse = metrics.mean_squared_error(y_r_teste, et_reg_pred_teste)
rmse = metrics.mean_squared_error(y_r_teste, et_reg_pred_teste, squared=False)
mad = metrics.mean_absolute_error(y_r_teste, et_reg_pred_teste)
mape = metrics.mean_absolute_percentage_error(y_r_teste, et_reg_pred_teste)

print(f'''
      Extra Trees - Teste
      R2: {round(r2,2)}
      EQM: {round(mse,2)}
      REQM: {round(rmse,2)}
      MAD: {round(mad,2)}
      MAPE: {round(mape*100,2)}%
      ''')


      Extra Trees - Teste
      R2: 0.47
      EQM: 3468655.68
      REQM: 1862.43
      MAD: 1638.02
      MAPE: 43.44%
      


##### Série Completa

In [46]:
et_reg_pred = et_reg.predict(X_r)

r2 = metrics.r2_score(y_r, et_reg_pred)
mse = metrics.mean_squared_error(y_r, et_reg_pred)
rmse = metrics.mean_squared_error(y_r, et_reg_pred, squared=False)
mad = metrics.mean_absolute_error(y_r, et_reg_pred)
mape = metrics.mean_absolute_percentage_error(y_r, et_reg_pred)

print(f'''
      Extra Trees - Série Completa
      R2: {round(r2,2)}
      EQM: {round(mse,2)}
      REQM: {round(rmse,2)}
      MAD: {round(mad,2)}
      MAPE: {round(mape*100,2)}%
      ''')


      Extra Trees - Série Completa
      R2: 0.85
      EQM: 1267393.42
      REQM: 1125.79
      MAD: 598.51
      MAPE: 15.87%
      


### Gradient Boosting Regressor

In [47]:
gb_reg = GradientBoostingRegressor()

gb_reg.fit(X_r_treino, y_r_treino)

##### Teste

In [48]:
gb_reg_pred_teste = gb_reg.predict(X_r_teste)

r2 = metrics.r2_score(y_r_teste, gb_reg_pred_teste)
mse = metrics.mean_squared_error(y_r_teste, gb_reg_pred_teste)
rmse = metrics.mean_squared_error(y_r_teste, gb_reg_pred_teste, squared=False)
mad = metrics.mean_absolute_error(y_r_teste, gb_reg_pred_teste)
mape = metrics.mean_absolute_percentage_error(y_r_teste, gb_reg_pred_teste)

print(f'''
      Gradient Boosting - Teste
      R2: {round(r2,2)}
      EQM: {round(mse,2)}
      REQM: {round(rmse,2)}
      MAD: {round(mad,2)}
      MAPE: {round(mape*100,2)}%
      ''')


      Gradient Boosting - Teste
      R2: 0.46
      EQM: 3515672.25
      REQM: 1875.01
      MAD: 1637.79
      MAPE: 40.02%
      


##### Série Completa

In [49]:
gb_reg_pred = gb_reg.predict(X_r)

r2 = metrics.r2_score(y_r, gb_reg_pred)
mse = metrics.mean_squared_error(y_r, gb_reg_pred)
rmse = metrics.mean_squared_error(y_r, gb_reg_pred, squared=False)
mad = metrics.mean_absolute_error(y_r, gb_reg_pred)
mape = metrics.mean_absolute_percentage_error(y_r, gb_reg_pred)

print(f'''
      Gradient Boosting - Série Completa
      R2: {round(r2,2)}
      EQM: {round(mse,2)}
      REQM: {round(rmse,2)}
      MAD: {round(mad,2)}
      MAPE: {round(mape*100,2)}%
      ''')


      Gradient Boosting - Série Completa
      R2: 0.85
      EQM: 1284573.02
      REQM: 1133.39
      MAD: 598.87
      MAPE: 14.63%
      


### Random Forest Regressor

In [50]:
rf_reg = RandomForestRegressor()

rf_reg.fit(X_r_treino, y_r_treino)

##### Teste

In [51]:
rf_reg_pred_teste = rf_reg.predict(X_r_teste)

r2 = metrics.r2_score(y_r_teste, rf_reg_pred_teste)
mse = metrics.mean_squared_error(y_r_teste, rf_reg_pred_teste)
rmse = metrics.mean_squared_error(y_r_teste, rf_reg_pred_teste, squared=False)
mad = metrics.mean_absolute_error(y_r_teste, rf_reg_pred_teste)
mape = metrics.mean_absolute_percentage_error(y_r_teste, rf_reg_pred_teste)

print(f'''
      Random Forest - Teste
      R2: {round(r2,2)}
      EQM: {round(mse,2)}
      REQM: {round(rmse,2)}
      MAD: {round(mad,2)}
      MAPE: {round(mape*100,2)}%
      ''')


      Random Forest - Teste
      R2: 0.42
      EQM: 3791034.42
      REQM: 1947.06
      MAD: 1729.47
      MAPE: 45.17%
      


##### Série Completa

In [52]:
rf_reg_pred = rf_reg.predict(X_r)

r2 = metrics.r2_score(y_r, rf_reg_pred)
mse = metrics.mean_squared_error(y_r, rf_reg_pred)
rmse = metrics.mean_squared_error(y_r, rf_reg_pred, squared=False)
mad = metrics.mean_absolute_error(y_r, rf_reg_pred)
mape = metrics.mean_absolute_percentage_error(y_r, rf_reg_pred)

print(f'''
      Random Forest - Série Completa
      R2: {round(r2,2)}
      EQM: {round(mse,2)}
      REQM: {round(rmse,2)}
      MAD: {round(mad,2)}
      MAPE: {round(mape*100,2)}%
      ''')


      Random Forest - Série Completa
      R2: 0.82
      EQM: 1558452.75
      REQM: 1248.38
      MAD: 892.58
      MAPE: 19.92%
      


### Histocastic Gradient Boosting

In [53]:
histgrad = HistGradientBoostingRegressor()

histgrad.fit(X_r_treino, y_r_treino)

##### Teste

In [54]:
histgrad_pred_teste = histgrad.predict(X_r_teste)

r2 = metrics.r2_score(y_r_teste, histgrad_pred_teste)
mse = metrics.mean_squared_error(y_r_teste, histgrad_pred_teste)
rmse = metrics.mean_squared_error(y_r_teste, histgrad_pred_teste, squared=False)
mad = metrics.mean_absolute_error(y_r_teste, histgrad_pred_teste)
mape = metrics.mean_absolute_percentage_error(y_r_teste, histgrad_pred_teste)

print(f'''
      Histocastic Gradient Boosting - Teste
      R2: {round(r2,2)}
      EQM: {round(mse,2)}
      REQM: {round(rmse,2)}
      MAD: {round(mad,2)}
      MAPE: {round(mape*100,2)}%
      ''')


      Histocastic Gradient Boosting - Teste
      R2: -1.19
      EQM: 14318693.29
      REQM: 3784.0
      MAD: 3231.58
      MAPE: 97.97%
      


##### Série Completa

In [55]:
histgrad_pred = histgrad.predict(X_r)

r2 = metrics.r2_score(y_r, histgrad_pred)
mse = metrics.mean_squared_error(y_r, histgrad_pred)
rmse = metrics.mean_squared_error(y_r, histgrad_pred, squared=False)
mad = metrics.mean_absolute_error(y_r, histgrad_pred)
mape = metrics.mean_absolute_percentage_error(y_r, histgrad_pred)

print(f'''
      Histocastic Gradient Boosting - Série Completa
      R2: {round(r2,2)}
      EQM: {round(mse,2)}
      REQM: {round(rmse,2)}
      MAD: {round(mad,2)}
      MAPE: {round(mape*100,2)}%
      ''')


      Histocastic Gradient Boosting - Série Completa
      R2: -0.12
      EQM: 9697360.69
      REQM: 3114.06
      MAD: 2694.91
      MAPE: 57.95%
      


### LGBM

In [56]:
lgbm = LGBMRegressor()

lgbm.fit(X_r_treino, y_r_treino)

##### Teste

In [57]:
lgbm_pred_teste = lgbm.predict(X_r_teste)

r2 = metrics.r2_score(y_r_teste, lgbm_pred_teste)
mse = metrics.mean_squared_error(y_r_teste, lgbm_pred_teste)
rmse = metrics.mean_squared_error(y_r_teste, lgbm_pred_teste, squared=False)
mad = metrics.mean_absolute_error(y_r_teste, lgbm_pred_teste)
mape = metrics.mean_absolute_percentage_error(y_r_teste, lgbm_pred_teste)

print(f'''
      LGBM - Teste
      R2: {round(r2,2)}
      EQM: {round(mse,2)}
      REQM: {round(rmse,2)}
      MAD: {round(mad,2)}
      MAPE: {round(mape*100,2)}%
      ''')


      LGBM - Teste
      R2: -1.19
      EQM: 14318693.29
      REQM: 3784.0
      MAD: 3231.58
      MAPE: 97.97%
      


##### Série Completa

In [58]:
lgbm_pred = lgbm.predict(X_r)

r2 = metrics.r2_score(y_r, lgbm_pred)
mse = metrics.mean_squared_error(y_r, lgbm_pred)
rmse = metrics.mean_squared_error(y_r, lgbm_pred, squared=False)
mad = metrics.mean_absolute_error(y_r, lgbm_pred)
mape = metrics.mean_absolute_percentage_error(y_r, lgbm_pred)

print(f'''
      LGBM - Série Completa
      R2: {round(r2,2)}
      EQM: {round(mse,2)}
      REQM: {round(rmse,2)}
      MAD: {round(mad,2)}
      MAPE: {round(mape*100,2)}%
      ''')


      LGBM - Série Completa
      R2: -0.12
      EQM: 9697360.69
      REQM: 3114.06
      MAD: 2694.91
      MAPE: 57.95%
      


### CatBoost

In [59]:
catboost = CatBoostRegressor(verbose=False)

catboost.fit(X_r_treino, y_r_treino)

<catboost.core.CatBoostRegressor at 0x27dfb7fe1c0>

##### Teste

In [60]:
catboost_pred_teste = catboost.predict(X_r_teste)

r2 = metrics.r2_score(y_r_teste, catboost_pred_teste)
mse = metrics.mean_squared_error(y_r_teste, catboost_pred_teste)
rmse = metrics.mean_squared_error(y_r_teste, catboost_pred_teste, squared=False)
mad = metrics.mean_absolute_error(y_r_teste, catboost_pred_teste)
mape = metrics.mean_absolute_percentage_error(y_r_teste, catboost_pred_teste)

print(f'''
      CatBoost - Teste
      R2: {round(r2,2)}
      EQM: {round(mse,2)}
      REQM: {round(rmse,2)}
      MAD: {round(mad,2)}
      MAPE: {round(mape*100,2)}%
      ''')


      CatBoost - Teste
      R2: 0.4
      EQM: 3923827.49
      REQM: 1980.87
      MAD: 1767.14
      MAPE: 46.29%
      


##### Série Completa

In [61]:
catboost_pred = catboost.predict(X_r)

r2 = metrics.r2_score(y_r, catboost_pred)
mse = metrics.mean_squared_error(y_r, catboost_pred)
rmse = metrics.mean_squared_error(y_r, catboost_pred, squared=False)
mad = metrics.mean_absolute_error(y_r, catboost_pred)
mape = metrics.mean_absolute_percentage_error(y_r, catboost_pred)

print(f'''
      CatBoost - Série Completa
      R2: {round(r2,2)}
      EQM: {round(mse,2)}
      REQM: {round(rmse,2)}
      MAD: {round(mad,2)}
      MAPE: {round(mape*100,2)}%
      ''')


      CatBoost - Série Completa
      R2: 0.83
      EQM: 1433708.15
      REQM: 1197.38
      MAD: 646.58
      MAPE: 16.93%
      


# Ridge

In [62]:
pipe = Pipeline(steps=[
    ('preprocessor', StandardScaler()),
    ('model', Ridge())
])

In [63]:
pipe.fit(X_r_treino, y_r_treino)

In [64]:
pred_teste = pipe.predict(X_r_teste)

r2 = metrics.r2_score(y_r_teste, pred_teste)
mse = metrics.mean_squared_error(y_r_teste, pred_teste)
rmse = metrics.mean_squared_error(y_r_teste, pred_teste, squared=False)
mad = metrics.mean_absolute_error(y_r_teste, pred_teste)
mape = metrics.mean_absolute_percentage_error(y_r_teste, pred_teste)

print(f'''
      Ridge Regression Standard - Teste
      R2: {round(r2,2)}
      EQM: {round(mse,2)}
      REQM: {round(rmse,2)}
      MAD: {round(mad,2)}
      MAPE: {round(mape*100,2)}%
      ''')


      Ridge Regression Standard - Teste
      R2: 0.74
      EQM: 1728008.93
      REQM: 1314.54
      MAD: 960.39
      MAPE: 28.66%
      


### Tuning

In [65]:
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=123)

In [66]:
grid = dict(model__alpha=np.linspace(1, 10, 21))
grid

{'model__alpha': array([ 1.  ,  1.45,  1.9 ,  2.35,  2.8 ,  3.25,  3.7 ,  4.15,  4.6 ,
         5.05,  5.5 ,  5.95,  6.4 ,  6.85,  7.3 ,  7.75,  8.2 ,  8.65,
         9.1 ,  9.55, 10.  ])}

In [67]:
gscv = GridSearchCV(pipe, grid, scoring='neg_mean_absolute_percentage_error', n_jobs=1, cv=cv)
gscv.fit(X_r_treino, y_r_treino)

In [68]:
gscv.best_params_

{'model__alpha': 1.0}

In [69]:
pipe_tuned = Pipeline(steps=[
    ('preprocessor', StandardScaler()),
    ('model', Ridge(alpha=5.5))
])

In [70]:
pipe_tuned.fit(X_r_treino, y_r_treino)

In [71]:
pred_teste = pipe_tuned.predict(X_r_teste)

r2 = metrics.r2_score(y_r_teste, pred_teste)
mse = metrics.mean_squared_error(y_r_teste, pred_teste)
rmse = metrics.mean_squared_error(y_r_teste, pred_teste, squared=False)
mad = metrics.mean_absolute_error(y_r_teste, pred_teste)
mape = metrics.mean_absolute_percentage_error(y_r_teste, pred_teste)

print(f'''
      Ridge Regression Standard Tuned - Teste
      R2: {round(r2,2)}
      EQM: {round(mse,2)}
      REQM: {round(rmse,2)}
      MAD: {round(mad,2)}
      MAPE: {round(mape*100,2)}%
      ''')


      Ridge Regression Standard Tuned - Teste
      R2: 0.73
      EQM: 1765652.92
      REQM: 1328.78
      MAD: 1073.55
      MAPE: 30.82%
      


### Feature Selection

In [72]:
bfs = SequentialFeatureSelector(pipe_tuned, k_features='best', forward=False, n_jobs=1, cv=cv)

In [73]:
bfs.fit(X_r_treino, y_r_treino)

In [74]:
bfs_features = list(bfs.k_feature_names_)
bfs_features

['vendas_PUCRIO_LAG_2',
 'vendas_PUCRIO_LAG_3',
 'vendas_UFABC_LAG_2',
 'vendas_UFF_LAG_2',
 'vendas_UFMG_LAG_1',
 'vendas_UFRJ_LAG_1',
 'vendas_UFSC_LAG_1',
 'vendas_UNICAMP_LAG_1',
 'vendas_UNICAMP_LAG_3',
 'vendas_UTFPR_LAG_1',
 'vendas_Belo_Horizonte_LAG_1',
 'vendas_Campinas_LAG_1',
 'vendas_Campo_Grande_LAG_3',
 'vendas_Curitiba_LAG_3',
 'vendas_Florianopolis_LAG_2',
 'vendas_Goiania_LAG_3',
 'vendas_Joinville_LAG_2',
 'vendas_Manaus_LAG_2',
 'vendas_Manaus_LAG_3',
 'vendas_Natal_LAG_3',
 'vendas_Niteroi_LAG_2',
 'vendas_Recife_LAG_3',
 'vendas_Salvador_LAG_2',
 'vendas_Santo_Andre_LAG_1',
 'vendas_Sao_Carlos_LAG_3',
 'seasonal_LAG_12']

In [75]:
X_r_treino_bfs = X_r_treino.loc[:, bfs_features]
X_r_teste_bfs = X_r_teste.loc[:, bfs_features]
X_r_bfs = X_r.loc[:, bfs_features]

In [76]:
pipe_tuned_bfs = Pipeline(steps=[
    ('preprocessor', StandardScaler()),
    ('model', Ridge(alpha=5.5))
])

In [77]:
pipe_tuned_bfs.fit(X_r_treino_bfs, y_r_treino)

In [78]:
pred_teste_bfs = pipe_tuned_bfs.predict(X_r_teste_bfs)

r2 = metrics.r2_score(y_r_teste, pred_teste_bfs)
mse = metrics.mean_squared_error(y_r_teste, pred_teste_bfs)
rmse = metrics.mean_squared_error(y_r_teste, pred_teste_bfs, squared=False)
mad = metrics.mean_absolute_error(y_r_teste, pred_teste_bfs)
mape = metrics.mean_absolute_percentage_error(y_r_teste, pred_teste_bfs)

print(f'''
      Ridge Regression Standard Tuned - Teste
      R2: {round(r2,2)}
      EQM: {round(mse,2)}
      REQM: {round(rmse,2)}
      MAD: {round(mad,2)}
      MAPE: {round(mape*100,2)}%
      ''')


      Ridge Regression Standard Tuned - Teste
      R2: 0.71
      EQM: 1882527.91
      REQM: 1372.05
      MAD: 1055.05
      MAPE: 30.93%
      


In [79]:
pred_bfs = pipe_tuned_bfs.predict(X_r_bfs)

r2 = metrics.r2_score(y_r, pred_bfs)
mse = metrics.mean_squared_error(y_r, pred_bfs)
rmse = metrics.mean_squared_error(y_r, pred_bfs, squared=False)
mad = metrics.mean_absolute_error(y_r, pred_bfs)
mape = metrics.mean_absolute_percentage_error(y_r, pred_bfs)

print(f'''
      Ridge Regression Standard Tuned - Teste
      R2: {round(r2,2)}
      EQM: {round(mse,2)}
      REQM: {round(rmse,2)}
      MAD: {round(mad,2)}
      MAPE: {round(mape*100,2)}%
      ''')


      Ridge Regression Standard Tuned - Teste
      R2: 0.91
      EQM: 752418.48
      REQM: 867.42
      MAD: 547.13
      MAPE: 13.47%
      


In [80]:
fig = go.Figure(data=[
    go.Scatter(name='Previsto',x=df_r['data'], y=pred_bfs, marker={'color': 'orange'}),
    go.Scatter(name='Real', x=df_r['data'], y=y_r, marker={'color': 'gray'})
    ])
fig.update_layout(font=dict(color='black'), title_text='Real vs. Previsto (Ridge Regression)')
fig.show()

# Extra Trees

In [81]:
pipe_et = Pipeline(steps=[
    ('preprocessor', StandardScaler()),
    ('model', ExtraTreesRegressor())
])