# Importando Bibliotecas

In [1]:
import pandas as pd
pd.set_option('display.float_format', lambda x: '%.5f' % x)

import numpy as np
import scipy

import plotly.graph_objects as go

from sklearn import metrics
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RepeatedKFold, TimeSeriesSplit, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import AdaBoostRegressor, ExtraTreesRegressor, GradientBoostingRegressor, RandomForestRegressor, HistGradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR

from mlxtend.feature_selection import SequentialFeatureSelector

from lightgbm import LGBMRegressor

from catboost import CatBoostRegressor

from prophet import Prophet
from prophet.diagnostics import cross_validation, performance_metrics

import itertools

import warnings
warnings.filterwarnings('ignore')

# Funções

In [2]:
def shapiro_test(data, alpha=0.05):
    """Run the Shapiro-Wilk normality test on ``data`` and print the verdict.

    Parameters
    ----------
    data : array-like
        Sample passed unchanged to ``scipy.stats.shapiro``.
    alpha : float, default 0.05
        Significance level. H0 (normality) is not rejected when ``p > alpha``.

    Returns
    -------
    tuple
        ``(stat, p)``: the test statistic and the p-value.
    """
    # Import the submodule explicitly: a bare ``import scipy`` does not
    # guarantee that ``scipy.stats`` is loaded on every SciPy version.
    from scipy import stats

    stat, p = stats.shapiro(data)
    print('p-valor: p=%.3f' % (p))
    if p > alpha:
        print('A amostra parece ser normalmente distribuída Gaussian. Não rejeitamos H0')
    else:
        print('A amostra NÃO parece ser normalmente distribuída Gaussian. Rejeitamos H0')
    return stat, p

# Lendo os Dados

In [3]:
# Load the final dataset from a path relative to the notebook's working directory.
df = pd.read_csv('../Dados/df_final.csv')
# Preview the first rows.
df.head()

Unnamed: 0,data,vendas,vendas_Outra_Faculdade,vendas_PUCRIO,vendas_UFABC,vendas_UFF,vendas_UFMG,vendas_UFPE,vendas_UFRJ,vendas_UFSC,...,vendas_Outra_Cidade,vendas_Porto_Alegre,vendas_Recife,vendas_Rio_de_Janeiro,vendas_Salvador,vendas_Santo_Andre,vendas_Sao_Carlos,vendas_Sao_Paulo,trend,seasonal
0,2018-01-01,1819.0,618.0,100.0,17.0,194.0,34.0,71.0,407.0,23.0,...,587.0,11.0,51.0,521.0,25.0,9.0,14.0,187.0,,-2401.78443
1,2018-02-01,2012.0,652.0,98.0,22.0,199.0,34.0,85.0,492.0,28.0,...,654.0,11.0,62.0,592.0,23.0,13.0,14.0,205.0,,-1835.22888
2,2018-03-01,4035.0,926.0,229.0,16.0,357.0,46.0,163.0,1473.0,30.0,...,1103.0,14.0,123.0,1544.0,22.0,12.0,29.0,282.0,,935.86834
3,2018-04-01,4305.0,1204.0,189.0,186.0,512.0,96.0,134.0,1098.0,151.0,...,1299.0,21.0,105.0,1300.0,38.0,40.0,27.0,359.0,,670.7989
4,2018-05-01,3956.0,1462.0,176.0,130.0,349.0,212.0,158.0,721.0,122.0,...,1250.0,20.0,110.0,1065.0,41.0,39.0,31.0,300.0,,267.66001


---
# Time Series

## Prophet

In [4]:
# Prophet expects the time column to be named 'ds' and the target 'y'.
# `rename` without `inplace` returns a new frame, so nothing is mutated on a
# slice of `df` (the in-place form is the chained-assignment pattern that
# raises SettingWithCopyWarning).
df_prophet = df[['data', 'vendas']].rename(columns={
    'data': 'ds',
    'vendas': 'y'
})

In [5]:
# Chronological hold-out: shuffle=False keeps row order, so the first 75% of rows train and the rest test.
df_prophet_treino, df_prophet_teste = train_test_split(df_prophet, train_size=0.75, shuffle=False)

In [6]:
# Prophet with default settings, fitted on the training portion only.
model = Prophet()
model.fit(df_prophet_treino)

21:09:39 - cmdstanpy - INFO - Chain [1] start processing
21:09:40 - cmdstanpy - INFO - Chain [1] done processing


<prophet.forecaster.Prophet at 0x7fe5c878df10>

### Performance

In [7]:
# Predict for the dates in the hold-out frame.
forecast = model.predict(df_prophet_teste)

mse = metrics.mean_squared_error(df_prophet_teste['y'], forecast['yhat'])
# RMSE is the square root of the MSE; this avoids `squared=False`, which was
# deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(mse)
mad = metrics.mean_absolute_error(df_prophet_teste['y'], forecast['yhat'])
mape = metrics.mean_absolute_percentage_error(df_prophet_teste['y'], forecast['yhat'])

# MAPE is returned as a fraction, hence the *100 for display.
print(f'''
      EQM: {round(mse,2)}
      REQM: {round(rmse,2)}
      MAD: {round(mad,2)}
      MAPE: {round(mape*100,2)}%
      ''')


      EQM: 2432356.32
      REQM: 1559.6
      MAD: 1415.49
      MAPE: 13.46%
      


In [8]:
# Overlay the Prophet forecast (orange) and the observed test values (gray).
fig = go.Figure()
fig.add_trace(go.Scatter(
    name='Previsto',
    x=forecast['ds'],
    y=forecast['yhat'],
    marker=dict(color='orange'),
))
fig.add_trace(go.Scatter(
    name='Real',
    x=df_prophet_teste['ds'],
    y=df_prophet_teste['y'],
    marker=dict(color='gray'),
))
fig.update_layout(title_text='Real vs. Previsto (Prophet)', font=dict(color='black'))
fig.show()

---
# Regressões

## Feature Engineering

### Criando LAGs

In [9]:
# Regression frame: remove the two "other" aggregates and the trend column,
# then discard any row that still has a missing value. The final copy keeps
# df_r independent of df for the column assignments that follow.
colunas_descartadas = ['vendas_Outra_Faculdade', 'vendas_Outra_Cidade', 'trend']
df_r = df.drop(columns=colunas_descartadas).dropna(axis=0).copy()
df_r.head()

Unnamed: 0,data,vendas,vendas_PUCRIO,vendas_UFABC,vendas_UFF,vendas_UFMG,vendas_UFPE,vendas_UFRJ,vendas_UFSC,vendas_UNICAMP,...,vendas_Natal,vendas_Niteroi,vendas_Porto_Alegre,vendas_Recife,vendas_Rio_de_Janeiro,vendas_Salvador,vendas_Santo_Andre,vendas_Sao_Carlos,vendas_Sao_Paulo,seasonal
0,2018-01-01,1819.0,100.0,17.0,194.0,34.0,71.0,407.0,23.0,200.0,...,3.0,79.0,11.0,51.0,521.0,25.0,9.0,14.0,187.0,-2401.78443
1,2018-02-01,2012.0,98.0,22.0,199.0,34.0,85.0,492.0,28.0,219.0,...,2.0,81.0,11.0,62.0,592.0,23.0,13.0,14.0,205.0,-1835.22888
2,2018-03-01,4035.0,229.0,16.0,357.0,46.0,163.0,1473.0,30.0,577.0,...,9.0,171.0,14.0,123.0,1544.0,22.0,12.0,29.0,282.0,935.86834
3,2018-04-01,4305.0,189.0,186.0,512.0,96.0,134.0,1098.0,151.0,434.0,...,4.0,232.0,21.0,105.0,1300.0,38.0,40.0,27.0,359.0,670.7989
4,2018-05-01,3956.0,176.0,130.0,349.0,212.0,158.0,721.0,122.0,376.0,...,8.0,155.0,20.0,110.0,1065.0,41.0,39.0,31.0,300.0,267.66001


In [10]:
# Columns that receive lag features, in the original order: the
# institution-named series first, then the city-named ones.
_sufixos_faculdades = [
    'PUCRIO', 'UFABC', 'UFF', 'UFMG', 'UFPE',
    'UFRJ', 'UFSC', 'UNICAMP', 'USP', 'UTFPR',
]
_sufixos_cidades = [
    'Belo_Horizonte', 'Brasilia', 'Campinas', 'Campo_Grande',
    'Curitiba', 'Florianopolis', 'Fortaleza', 'Goiania',
    'Joinville', 'Juiz_de_Fora', 'Manaus', 'Natal',
    'Niteroi', 'Porto_Alegre', 'Recife', 'Rio_de_Janeiro',
    'Salvador', 'Santo_Andre', 'Sao_Carlos', 'Sao_Paulo',
]
lag_columns = [f'vendas_{sufixo}' for sufixo in _sufixos_faculdades + _sufixos_cidades]

In [11]:
# For every series in lag_columns, add its values shifted by 1, 2 and 3 rows.
# Rows with no earlier observation are filled with 0 rather than NaN.
for column, lag in itertools.product(lag_columns, range(1, 4)):
    novo_nome = f'{column}_LAG_{lag}'
    df_r[novo_nome] = df_r[column].shift(periods=lag, fill_value=0)

In [12]:
# 12-row lags of the seasonal component and of the target itself;
# rows without a value 12 rows earlier are filled with 0.
for coluna_base in ('seasonal', 'vendas'):
    df_r[f'{coluna_base}_LAG_12'] = df_r[coluna_base].shift(periods=12, fill_value=0)

In [13]:
# Keep only the lagged versions: remove the contemporaneous series and the
# unlagged seasonal component in a single drop.
df_r = df_r.drop(columns=[*lag_columns, 'seasonal'])
df_r.head()

Unnamed: 0,data,vendas,vendas_PUCRIO_LAG_1,vendas_PUCRIO_LAG_2,vendas_PUCRIO_LAG_3,vendas_UFABC_LAG_1,vendas_UFABC_LAG_2,vendas_UFABC_LAG_3,vendas_UFF_LAG_1,vendas_UFF_LAG_2,...,vendas_Santo_Andre_LAG_2,vendas_Santo_Andre_LAG_3,vendas_Sao_Carlos_LAG_1,vendas_Sao_Carlos_LAG_2,vendas_Sao_Carlos_LAG_3,vendas_Sao_Paulo_LAG_1,vendas_Sao_Paulo_LAG_2,vendas_Sao_Paulo_LAG_3,seasonal_LAG_12,vendas_LAG_12
0,2018-01-01,1819.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2018-02-01,2012.0,100.0,0.0,0.0,17.0,0.0,0.0,194.0,0.0,...,0.0,0.0,14.0,0.0,0.0,187.0,0.0,0.0,0.0,0.0
2,2018-03-01,4035.0,98.0,100.0,0.0,22.0,17.0,0.0,199.0,194.0,...,9.0,0.0,14.0,14.0,0.0,205.0,187.0,0.0,0.0,0.0
3,2018-04-01,4305.0,229.0,98.0,100.0,16.0,22.0,17.0,357.0,199.0,...,13.0,9.0,29.0,14.0,14.0,282.0,205.0,187.0,0.0,0.0
4,2018-05-01,3956.0,189.0,229.0,98.0,186.0,16.0,22.0,512.0,357.0,...,12.0,13.0,27.0,29.0,14.0,359.0,282.0,205.0,0.0,0.0


### Criando Dummies - Mês

In [14]:
# Month name of each row's date, used next to build the month dummies.
datas = pd.to_datetime(df_r['data'])
df_r['mes'] = datas.dt.month_name()
df_r.head()

Unnamed: 0,data,vendas,vendas_PUCRIO_LAG_1,vendas_PUCRIO_LAG_2,vendas_PUCRIO_LAG_3,vendas_UFABC_LAG_1,vendas_UFABC_LAG_2,vendas_UFABC_LAG_3,vendas_UFF_LAG_1,vendas_UFF_LAG_2,...,vendas_Santo_Andre_LAG_3,vendas_Sao_Carlos_LAG_1,vendas_Sao_Carlos_LAG_2,vendas_Sao_Carlos_LAG_3,vendas_Sao_Paulo_LAG_1,vendas_Sao_Paulo_LAG_2,vendas_Sao_Paulo_LAG_3,seasonal_LAG_12,vendas_LAG_12,mes
0,2018-01-01,1819.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,January
1,2018-02-01,2012.0,100.0,0.0,0.0,17.0,0.0,0.0,194.0,0.0,...,0.0,14.0,0.0,0.0,187.0,0.0,0.0,0.0,0.0,February
2,2018-03-01,4035.0,98.0,100.0,0.0,22.0,17.0,0.0,199.0,194.0,...,0.0,14.0,14.0,0.0,205.0,187.0,0.0,0.0,0.0,March
3,2018-04-01,4305.0,229.0,98.0,100.0,16.0,22.0,17.0,357.0,199.0,...,9.0,29.0,14.0,14.0,282.0,205.0,187.0,0.0,0.0,April
4,2018-05-01,3956.0,189.0,229.0,98.0,186.0,16.0,22.0,512.0,357.0,...,13.0,27.0,29.0,14.0,359.0,282.0,205.0,0.0,0.0,May


In [15]:
# One indicator column per month name ('mes_<Month>'); the 'mes' column itself is replaced.
df_r = pd.get_dummies(df_r, columns=['mes'], prefix='mes', prefix_sep='_')
df_r.head()

Unnamed: 0,data,vendas,vendas_PUCRIO_LAG_1,vendas_PUCRIO_LAG_2,vendas_PUCRIO_LAG_3,vendas_UFABC_LAG_1,vendas_UFABC_LAG_2,vendas_UFABC_LAG_3,vendas_UFF_LAG_1,vendas_UFF_LAG_2,...,mes_December,mes_February,mes_January,mes_July,mes_June,mes_March,mes_May,mes_November,mes_October,mes_September
0,2018-01-01,1819.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,1,0,0,0,0,0,0,0
1,2018-02-01,2012.0,100.0,0.0,0.0,17.0,0.0,0.0,194.0,0.0,...,0,1,0,0,0,0,0,0,0,0
2,2018-03-01,4035.0,98.0,100.0,0.0,22.0,17.0,0.0,199.0,194.0,...,0,0,0,0,0,1,0,0,0,0
3,2018-04-01,4305.0,229.0,98.0,100.0,16.0,22.0,17.0,357.0,199.0,...,0,0,0,0,0,0,0,0,0,0
4,2018-05-01,3956.0,189.0,229.0,98.0,186.0,16.0,22.0,512.0,357.0,...,0,0,0,0,0,0,1,0,0,0


### Divisão entre Treino e Teste

In [16]:
# Chronological 75/25 split of the full frame (dates included); shuffle=False preserves row order.
df_r_treino, df_r_teste = train_test_split(df_r, train_size=0.75, shuffle=False)

In [17]:
# Target is the total sales column; features are every remaining column
# except the date and the target itself.
y_r = df_r['vendas']
X_r = df_r.drop(columns=['data', 'vendas'])

In [18]:
# Same chronological 75/25 split applied to features and target (no shuffling).
X_r_treino, X_r_teste, y_r_treino, y_r_teste = train_test_split(X_r, y_r, train_size=0.75, shuffle=False)

In [19]:
# Time-ordered cross-validation splitter with 10 splits, used for the model comparison below.
tss = TimeSeriesSplit(n_splits=10)

## Modelos

In [20]:
# Candidate regressors compared under time-series cross-validation.
# Estimators whose fitting is randomised get a fixed seed (42) so the
# resulting ranking is reproducible across runs.
modelos = [
    LinearRegression(),
    Ridge(),
    Lasso(),
    DecisionTreeRegressor(random_state=42),
    SVR(),
    AdaBoostRegressor(random_state=42),
    ExtraTreesRegressor(random_state=42),
    GradientBoostingRegressor(random_state=42),
    RandomForestRegressor(random_state=42),
    HistGradientBoostingRegressor(),
    LGBMRegressor(),
    CatBoostRegressor(verbose=False),
]

In [21]:
modelos_score = []

for modelo in modelos:
    # The scaler lives inside the pipeline, so it is fitted on each training fold only.
    pipe = Pipeline(steps=[('preprocessor', StandardScaler()), ('model', modelo)])
    cv = cross_val_score(
        pipe,
        X_r_treino,
        y_r_treino,
        cv=tss,
        scoring='neg_mean_absolute_percentage_error',
    )
    # Scores are negated MAPE fractions: flip the sign and express as percentages.
    modelos_score.append(dict(modelo=modelo, mape=cv.mean()*(-100), std=cv.std()*(100)))

In [22]:
# Tabulate the cross-validation results, best (lowest) mean MAPE first.
modelos_score = pd.DataFrame(modelos_score).sort_values(by='mape', ascending=True)
modelos_score

Unnamed: 0,modelo,mape,std
5,AdaBoostRegressor(),20.925,6.64164
6,ExtraTreesRegressor(),21.18214,8.17018
8,RandomForestRegressor(),21.24283,7.63492
7,GradientBoostingRegressor(),21.46235,9.35069
1,Ridge(),22.01866,10.03976
11,<catboost.core.CatBoostRegressor object at 0x7...,22.54341,8.48459
0,LinearRegression(),23.75999,9.96971
2,Lasso(),26.80369,15.32121
3,DecisionTreeRegressor(),28.91705,18.90759
9,HistGradientBoostingRegressor(),29.81422,9.92709


### Linear Regression

In [23]:
# Ordinary least squares with default settings, fitted on the training split.
# NOTE(review): fitted on unscaled features, whereas the cross-validation
# comparison wrapped each model in a StandardScaler pipeline.
linreg = LinearRegression()

linreg.fit(X_r_treino, y_r_treino)

##### Treino

In [24]:
# In-sample predictions and error metrics.
linreg_pred_treino = linreg.predict(X_r_treino)

mse = metrics.mean_squared_error(y_r_treino, linreg_pred_treino)
# RMSE is the square root of the MSE; this avoids `squared=False`, which was
# deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(mse)
mad = metrics.mean_absolute_error(y_r_treino, linreg_pred_treino)
mape = metrics.mean_absolute_percentage_error(y_r_treino, linreg_pred_treino)

print(f'''
      Linear Regression - Treino
      EQM: {round(mse,2)}
      REQM: {round(rmse,2)}
      MAD: {round(mad,2)}
      MAPE: {round(mape*100,2)}%
      ''')


      Linear Regression - Treino
      EQM: 0.0
      REQM: 0.0
      MAD: 0.0
      MAPE: 0.0%
      


##### Teste

In [25]:
# Hold-out predictions and error metrics.
linreg_pred_teste = linreg.predict(X_r_teste)

mse = metrics.mean_squared_error(y_r_teste, linreg_pred_teste)
# RMSE is the square root of the MSE; this avoids `squared=False`, which was
# deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(mse)
mad = metrics.mean_absolute_error(y_r_teste, linreg_pred_teste)
mape = metrics.mean_absolute_percentage_error(y_r_teste, linreg_pred_teste)

print(f'''
      Linear Regression - Teste
      EQM: {round(mse,2)}
      REQM: {round(rmse,2)}
      MAD: {round(mad,2)}
      MAPE: {round(mape*100,2)}%
      ''')


      Linear Regression - Teste
      EQM: 13559960.23
      REQM: 3682.39
      MAD: 2991.56
      MAPE: 27.68%
      


### Ridge

In [26]:
# Ridge regression with default settings, fitted on the training split.
# NOTE(review): fitted on unscaled features, whereas the cross-validation
# comparison wrapped each model in a StandardScaler pipeline.
ridge = Ridge()

ridge.fit(X_r_treino, y_r_treino)

##### Treino

In [27]:
# In-sample predictions and error metrics.
ridge_pred_treino = ridge.predict(X_r_treino)

mse = metrics.mean_squared_error(y_r_treino, ridge_pred_treino)
# RMSE is the square root of the MSE; this avoids `squared=False`, which was
# deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(mse)
mad = metrics.mean_absolute_error(y_r_treino, ridge_pred_treino)
mape = metrics.mean_absolute_percentage_error(y_r_treino, ridge_pred_treino)

print(f'''
      Ridge Regression - Treino
      EQM: {round(mse,2)}
      REQM: {round(rmse,2)}
      MAD: {round(mad,2)}
      MAPE: {round(mape*100,2)}%
      ''')


      Ridge Regression - Treino
      EQM: 0.01
      REQM: 0.09
      MAD: 0.07
      MAPE: 0.0%
      


##### Teste

In [28]:
# Hold-out predictions and error metrics.
ridge_pred_teste = ridge.predict(X_r_teste)

mse = metrics.mean_squared_error(y_r_teste, ridge_pred_teste)
# RMSE is the square root of the MSE; this avoids `squared=False`, which was
# deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(mse)
mad = metrics.mean_absolute_error(y_r_teste, ridge_pred_teste)
mape = metrics.mean_absolute_percentage_error(y_r_teste, ridge_pred_teste)

print(f'''
      Ridge Regression - Teste
      EQM: {round(mse,2)}
      REQM: {round(rmse,2)}
      MAD: {round(mad,2)}
      MAPE: {round(mape*100,2)}%
      ''')


      Ridge Regression - Teste
      EQM: 13555033.59
      REQM: 3681.72
      MAD: 2991.01
      MAPE: 27.68%
      


### Decision Tree Regressor

In [29]:
# Decision tree with default hyper-parameters. The seed is fixed because the
# tree permutes features at each split, so unseeded fits can differ between runs.
dtreg = DecisionTreeRegressor(random_state=42)

dtreg.fit(X_r_treino, y_r_treino)

##### Treino

In [30]:
# In-sample predictions and error metrics.
dtreg_pred_treino = dtreg.predict(X_r_treino)

mse = metrics.mean_squared_error(y_r_treino, dtreg_pred_treino)
# RMSE is the square root of the MSE; this avoids `squared=False`, which was
# deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(mse)
mad = metrics.mean_absolute_error(y_r_treino, dtreg_pred_treino)
mape = metrics.mean_absolute_percentage_error(y_r_treino, dtreg_pred_treino)

print(f'''
      Decision Tree Regressor - Treino
      EQM: {round(mse,2)}
      REQM: {round(rmse,2)}
      MAD: {round(mad,2)}
      MAPE: {round(mape*100,2)}%
      ''')


      Decision Tree Regressor - Treino
      EQM: 0.0
      REQM: 0.0
      MAD: 0.0
      MAPE: 0.0%
      


##### Teste

In [31]:
# Hold-out predictions and error metrics.
dtreg_pred_teste = dtreg.predict(X_r_teste)

mse = metrics.mean_squared_error(y_r_teste, dtreg_pred_teste)
# RMSE is the square root of the MSE; this avoids `squared=False`, which was
# deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(mse)
mad = metrics.mean_absolute_error(y_r_teste, dtreg_pred_teste)
mape = metrics.mean_absolute_percentage_error(y_r_teste, dtreg_pred_teste)

print(f'''
      Decision Tree Regressor - Teste
      EQM: {round(mse,2)}
      REQM: {round(rmse,2)}
      MAD: {round(mad,2)}
      MAPE: {round(mape*100,2)}%
      ''')


      Decision Tree Regressor - Teste
      EQM: 7053662.0
      REQM: 2655.87
      MAD: 2188.46
      MAPE: 20.35%
      


### SVR

In [32]:
# Support vector regression with default settings, fitted on the training split.
# NOTE(review): fitted on unscaled features, whereas the cross-validation
# comparison wrapped each model in a StandardScaler pipeline.
svr = SVR()

svr.fit(X_r_treino, y_r_treino)

##### Treino

In [33]:
# In-sample predictions and error metrics.
svr_pred_treino = svr.predict(X_r_treino)

mse = metrics.mean_squared_error(y_r_treino, svr_pred_treino)
# RMSE is the square root of the MSE; this avoids `squared=False`, which was
# deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(mse)
mad = metrics.mean_absolute_error(y_r_treino, svr_pred_treino)
mape = metrics.mean_absolute_percentage_error(y_r_treino, svr_pred_treino)

print(f'''
      SVR - Treino
      EQM: {round(mse,2)}
      REQM: {round(rmse,2)}
      MAD: {round(mad,2)}
      MAPE: {round(mape*100,2)}%
      ''')


      SVR - Treino
      EQM: 5752669.49
      REQM: 2398.47
      MAD: 1926.36
      MAPE: 39.4%
      


##### Teste

In [34]:
# Hold-out predictions and error metrics.
svr_pred_teste = svr.predict(X_r_teste)

mse = metrics.mean_squared_error(y_r_teste, svr_pred_teste)
# RMSE is the square root of the MSE; this avoids `squared=False`, which was
# deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(mse)
mad = metrics.mean_absolute_error(y_r_teste, svr_pred_teste)
mape = metrics.mean_absolute_percentage_error(y_r_teste, svr_pred_teste)

print(f'''
      SVR - Teste
      EQM: {round(mse,2)}
      REQM: {round(rmse,2)}
      MAD: {round(mad,2)}
      MAPE: {round(mape*100,2)}%
      ''')


      SVR - Teste
      EQM: 26473260.98
      REQM: 5145.22
      MAD: 4980.37
      MAPE: 45.79%
      


### AdaBoost

In [35]:
# AdaBoost with default hyper-parameters. The seed is fixed because boosting
# resamples the training data, so unseeded fits differ between runs.
adaboost = AdaBoostRegressor(random_state=42)

adaboost.fit(X_r_treino, y_r_treino)

##### Treino

In [36]:
# In-sample predictions and error metrics.
adaboost_pred_treino = adaboost.predict(X_r_treino)

mse = metrics.mean_squared_error(y_r_treino, adaboost_pred_treino)
# RMSE is the square root of the MSE; this avoids `squared=False`, which was
# deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(mse)
mad = metrics.mean_absolute_error(y_r_treino, adaboost_pred_treino)
mape = metrics.mean_absolute_percentage_error(y_r_treino, adaboost_pred_treino)

print(f'''
      AdaBoost - Treino
      EQM: {round(mse,2)}
      REQM: {round(rmse,2)}
      MAD: {round(mad,2)}
      MAPE: {round(mape*100,2)}%
      ''')


      AdaBoost - Treino
      EQM: 119575.75
      REQM: 345.8
      MAD: 277.27
      MAPE: 6.08%
      


##### Teste

In [37]:
# Hold-out predictions and error metrics.
adaboost_pred_teste = adaboost.predict(X_r_teste)

mse = metrics.mean_squared_error(y_r_teste, adaboost_pred_teste)
# RMSE is the square root of the MSE; this avoids `squared=False`, which was
# deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(mse)
mad = metrics.mean_absolute_error(y_r_teste, adaboost_pred_teste)
mape = metrics.mean_absolute_percentage_error(y_r_teste, adaboost_pred_teste)

print(f'''
      AdaBoost - Teste
      EQM: {round(mse,2)}
      REQM: {round(rmse,2)}
      MAD: {round(mad,2)}
      MAPE: {round(mape*100,2)}%
      ''')


      AdaBoost - Teste
      EQM: 7844136.08
      REQM: 2800.74
      MAD: 2566.54
      MAPE: 23.43%
      


### Extra Trees Regressor

In [38]:
# Extra-trees ensemble with default hyper-parameters. The seed is fixed because
# split thresholds are drawn at random, so unseeded fits differ between runs.
et_reg = ExtraTreesRegressor(random_state=42)

et_reg.fit(X_r_treino, y_r_treino)

##### Treino

In [39]:
# In-sample predictions and error metrics.
et_reg_pred_treino = et_reg.predict(X_r_treino)

mse = metrics.mean_squared_error(y_r_treino, et_reg_pred_treino)
# RMSE is the square root of the MSE; this avoids `squared=False`, which was
# deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(mse)
mad = metrics.mean_absolute_error(y_r_treino, et_reg_pred_treino)
mape = metrics.mean_absolute_percentage_error(y_r_treino, et_reg_pred_treino)

print(f'''
      Extra Trees - Treino
      EQM: {round(mse,2)}
      REQM: {round(rmse,2)}
      MAD: {round(mad,2)}
      MAPE: {round(mape*100,2)}%
      ''')


      Extra Trees - Treino
      EQM: 0.0
      REQM: 0.0
      MAD: 0.0
      MAPE: 0.0%
      


##### Teste

In [40]:
# Hold-out predictions and error metrics.
et_reg_pred_teste = et_reg.predict(X_r_teste)

mse = metrics.mean_squared_error(y_r_teste, et_reg_pred_teste)
# RMSE is the square root of the MSE; this avoids `squared=False`, which was
# deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(mse)
mad = metrics.mean_absolute_error(y_r_teste, et_reg_pred_teste)
mape = metrics.mean_absolute_percentage_error(y_r_teste, et_reg_pred_teste)

print(f'''
      Extra Trees - Teste
      EQM: {round(mse,2)}
      REQM: {round(rmse,2)}
      MAD: {round(mad,2)}
      MAPE: {round(mape*100,2)}%
      ''')


      Extra Trees - Teste
      EQM: 4890245.15
      REQM: 2211.39
      MAD: 1944.66
      MAPE: 17.5%
      


### Gradient Boosting Regressor

In [41]:
# Gradient boosting with default hyper-parameters. The seed is fixed because
# the underlying trees permute features at each split, so unseeded fits can
# differ between runs.
gb_reg = GradientBoostingRegressor(random_state=42)

gb_reg.fit(X_r_treino, y_r_treino)

##### Treino

In [42]:
# In-sample predictions and error metrics.
gb_reg_pred_treino = gb_reg.predict(X_r_treino)

mse = metrics.mean_squared_error(y_r_treino, gb_reg_pred_treino)
# RMSE is the square root of the MSE; this avoids `squared=False`, which was
# deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(mse)
mad = metrics.mean_absolute_error(y_r_treino, gb_reg_pred_treino)
mape = metrics.mean_absolute_percentage_error(y_r_treino, gb_reg_pred_treino)

print(f'''
      Gradient Boosting - Treino
      EQM: {round(mse,2)}
      REQM: {round(rmse,2)}
      MAD: {round(mad,2)}
      MAPE: {round(mape*100,2)}%
      ''')


      Gradient Boosting - Treino
      EQM: 6.11
      REQM: 2.47
      MAD: 2.07
      MAPE: 0.04%
      


##### Teste

In [43]:
# Hold-out predictions and error metrics.
gb_reg_pred_teste = gb_reg.predict(X_r_teste)

mse = metrics.mean_squared_error(y_r_teste, gb_reg_pred_teste)
# RMSE is the square root of the MSE; this avoids `squared=False`, which was
# deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(mse)
mad = metrics.mean_absolute_error(y_r_teste, gb_reg_pred_teste)
mape = metrics.mean_absolute_percentage_error(y_r_teste, gb_reg_pred_teste)

print(f'''
      Gradient Boosting - Teste
      EQM: {round(mse,2)}
      REQM: {round(rmse,2)}
      MAD: {round(mad,2)}
      MAPE: {round(mape*100,2)}%
      ''')


      Gradient Boosting - Teste
      EQM: 6021339.05
      REQM: 2453.84
      MAD: 2123.2
      MAPE: 18.99%
      


### Random Forest Regressor

In [44]:
# Random forest with default hyper-parameters. The seed is fixed because
# bootstrap sampling and feature selection are random, so unseeded fits
# differ between runs.
rf_reg = RandomForestRegressor(random_state=42)

rf_reg.fit(X_r_treino, y_r_treino)

##### Treino

In [45]:
# In-sample predictions and error metrics.
rf_reg_pred_treino = rf_reg.predict(X_r_treino)

mse = metrics.mean_squared_error(y_r_treino, rf_reg_pred_treino)
# RMSE is the square root of the MSE; this avoids `squared=False`, which was
# deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(mse)
mad = metrics.mean_absolute_error(y_r_treino, rf_reg_pred_treino)
mape = metrics.mean_absolute_percentage_error(y_r_treino, rf_reg_pred_treino)

print(f'''
      Random Forest - Treino
      EQM: {round(mse,2)}
      REQM: {round(rmse,2)}
      MAD: {round(mad,2)}
      MAPE: {round(mape*100,2)}%
      ''')


      Random Forest - Treino
      EQM: 197994.93
      REQM: 444.97
      MAD: 361.51
      MAPE: 7.19%
      


##### Teste

In [46]:
# Hold-out predictions and error metrics.
rf_reg_pred_teste = rf_reg.predict(X_r_teste)

mse = metrics.mean_squared_error(y_r_teste, rf_reg_pred_teste)
# RMSE is the square root of the MSE; this avoids `squared=False`, which was
# deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(mse)
mad = metrics.mean_absolute_error(y_r_teste, rf_reg_pred_teste)
mape = metrics.mean_absolute_percentage_error(y_r_teste, rf_reg_pred_teste)

print(f'''
      Random Forest - Teste
      EQM: {round(mse,2)}
      REQM: {round(rmse,2)}
      MAD: {round(mad,2)}
      MAPE: {round(mape*100,2)}%
      ''')


      Random Forest - Teste
      EQM: 6309048.69
      REQM: 2511.78
      MAD: 2265.34
      MAPE: 20.31%
      


### Histogram-based Gradient Boosting

In [47]:
# Histogram-based gradient boosting with default hyper-parameters, fitted on the training split.
histgrad = HistGradientBoostingRegressor()

histgrad.fit(X_r_treino, y_r_treino)

##### Treino

In [48]:
# In-sample predictions and error metrics.
histgrad_pred_treino = histgrad.predict(X_r_treino)

mse = metrics.mean_squared_error(y_r_treino, histgrad_pred_treino)
# RMSE is the square root of the MSE; this avoids `squared=False`, which was
# deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(mse)
mad = metrics.mean_absolute_error(y_r_treino, histgrad_pred_treino)
mape = metrics.mean_absolute_percentage_error(y_r_treino, histgrad_pred_treino)

# Label corrected: the estimator is histogram-based gradient boosting.
print(f'''
      Histogram-based Gradient Boosting - Treino
      EQM: {round(mse,2)}
      REQM: {round(rmse,2)}
      MAD: {round(mad,2)}
      MAPE: {round(mape*100,2)}%
      ''')


      Histocastic Gradient Boosting - Treino
      EQM: 5648125.62
      REQM: 2376.58
      MAD: 1942.34
      MAPE: 42.26%
      


##### Teste

In [49]:
# Hold-out predictions and error metrics.
histgrad_pred_teste = histgrad.predict(X_r_teste)

mse = metrics.mean_squared_error(y_r_teste, histgrad_pred_teste)
# RMSE is the square root of the MSE; this avoids `squared=False`, which was
# deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(mse)
mad = metrics.mean_absolute_error(y_r_teste, histgrad_pred_teste)
mape = metrics.mean_absolute_percentage_error(y_r_teste, histgrad_pred_teste)

# Label corrected: the estimator is histogram-based gradient boosting.
print(f'''
      Histogram-based Gradient Boosting - Teste
      EQM: {round(mse,2)}
      REQM: {round(rmse,2)}
      MAD: {round(mad,2)}
      MAPE: {round(mape*100,2)}%
      ''')


      Histocastic Gradient Boosting - Teste
      EQM: 23023330.41
      REQM: 4798.26
      MAD: 4620.77
      MAPE: 42.36%
      


### LGBM

In [50]:
# LightGBM regressor with default hyper-parameters, fitted on the training split.
lgbm = LGBMRegressor()

lgbm.fit(X_r_treino, y_r_treino)

##### Treino

In [51]:
# In-sample predictions and error metrics.
lgbm_pred_treino = lgbm.predict(X_r_treino)

mse = metrics.mean_squared_error(y_r_treino, lgbm_pred_treino)
# RMSE is the square root of the MSE; this avoids `squared=False`, which was
# deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(mse)
mad = metrics.mean_absolute_error(y_r_treino, lgbm_pred_treino)
mape = metrics.mean_absolute_percentage_error(y_r_treino, lgbm_pred_treino)

print(f'''
      LGBM - Treino
      EQM: {round(mse,2)}
      REQM: {round(rmse,2)}
      MAD: {round(mad,2)}
      MAPE: {round(mape*100,2)}%
      ''')


      LGBM - Treino
      EQM: 5648125.62
      REQM: 2376.58
      MAD: 1942.34
      MAPE: 42.26%
      


##### Teste

In [52]:
# Hold-out predictions and error metrics.
lgbm_pred_teste = lgbm.predict(X_r_teste)

mse = metrics.mean_squared_error(y_r_teste, lgbm_pred_teste)
# RMSE is the square root of the MSE; this avoids `squared=False`, which was
# deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(mse)
mad = metrics.mean_absolute_error(y_r_teste, lgbm_pred_teste)
mape = metrics.mean_absolute_percentage_error(y_r_teste, lgbm_pred_teste)

print(f'''
      LGBM - Teste
      EQM: {round(mse,2)}
      REQM: {round(rmse,2)}
      MAD: {round(mad,2)}
      MAPE: {round(mape*100,2)}%
      ''')


      LGBM - Teste
      EQM: 23023330.41
      REQM: 4798.26
      MAD: 4620.77
      MAPE: 42.36%
      


### CatBoost

In [53]:
# CatBoost regressor with training logs silenced, fitted on the training split.
catboost = CatBoostRegressor(verbose=False)

catboost.fit(X_r_treino, y_r_treino)

<catboost.core.CatBoostRegressor at 0x7fe5ab693fd0>

##### Treino

In [54]:
# In-sample predictions and error metrics.
catboost_pred_treino = catboost.predict(X_r_treino)

mse = metrics.mean_squared_error(y_r_treino, catboost_pred_treino)
# RMSE is the square root of the MSE; this avoids `squared=False`, which was
# deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(mse)
mad = metrics.mean_absolute_error(y_r_treino, catboost_pred_treino)
mape = metrics.mean_absolute_percentage_error(y_r_treino, catboost_pred_treino)

print(f'''
      CatBoost - Treino
      EQM: {round(mse,2)}
      REQM: {round(rmse,2)}
      MAD: {round(mad,2)}
      MAPE: {round(mape*100,2)}%
      ''')


      CatBoost - Treino
      EQM: 12.4
      REQM: 3.52
      MAD: 3.01
      MAPE: 0.06%
      


##### Teste

In [55]:
# Hold-out predictions and error metrics.
catboost_pred_teste = catboost.predict(X_r_teste)

mse = metrics.mean_squared_error(y_r_teste, catboost_pred_teste)
# RMSE is the square root of the MSE; this avoids `squared=False`, which was
# deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(mse)
mad = metrics.mean_absolute_error(y_r_teste, catboost_pred_teste)
mape = metrics.mean_absolute_percentage_error(y_r_teste, catboost_pred_teste)

print(f'''
      CatBoost - Teste
      EQM: {round(mse,2)}
      REQM: {round(rmse,2)}
      MAD: {round(mad,2)}
      MAPE: {round(mape*100,2)}%
      ''')


      CatBoost - Teste
      EQM: 8120071.33
      REQM: 2849.57
      MAD: 2622.77
      MAPE: 23.71%
      


# Extra Trees Regressor

In [56]:
# Standardise the features, then fit an extra-trees ensemble. The seed is
# fixed so the reported metrics and the plot are reproducible across runs.
pipe_et = Pipeline(steps=[
    ('preprocessor', StandardScaler()),
    ('model', ExtraTreesRegressor(random_state=42))
])

In [57]:
# Fit scaler and model together on the training split only.
pipe_et.fit(X_r_treino, y_r_treino)

In [58]:
# Hold-out predictions and error metrics for the scaled extra-trees pipeline.
pred_teste_et = pipe_et.predict(X_r_teste)

mse = metrics.mean_squared_error(y_r_teste, pred_teste_et)
# RMSE is the square root of the MSE; this avoids `squared=False`, which was
# deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(mse)
mad = metrics.mean_absolute_error(y_r_teste, pred_teste_et)
mape = metrics.mean_absolute_percentage_error(y_r_teste, pred_teste_et)

print(f'''
      ExtraTrees Regression Standard - Teste
      EQM: {round(mse,2)}
      REQM: {round(rmse,2)}
      MAD: {round(mad,2)}
      MAPE: {round(mape*100,2)}%
      ''')


      ExtraTrees Regression Standard - Teste
      EQM: 5185066.18
      REQM: 2277.07
      MAD: 1993.31
      MAPE: 17.87%
      


In [59]:
# Overlay the extra-trees predictions (orange) and the observed test values (gray).
fig = go.Figure()
fig.add_trace(go.Scatter(
    name='Previsto',
    x=df_r_teste['data'],
    y=pred_teste_et,
    marker=dict(color='orange'),
))
fig.add_trace(go.Scatter(
    name='Real',
    x=df_r_teste['data'],
    y=y_r_teste,
    marker=dict(color='gray'),
))
fig.update_layout(title_text='Real vs. Previsto (ExtraTrees Regression)', font=dict(color='black'))
fig.show()

# Random Forest

In [60]:
# Standardise the features, then fit a random forest. The seed is fixed so
# the reported metrics and the plot are reproducible across runs.
pipe_rf = Pipeline(steps=[
    ('preprocessor', StandardScaler()),
    ('model', RandomForestRegressor(random_state=42))
])

In [61]:
# Fit scaler and model together on the training split only.
pipe_rf.fit(X_r_treino, y_r_treino)

In [62]:
# Hold-out predictions and error metrics for the scaled random-forest pipeline.
pred_teste_rf = pipe_rf.predict(X_r_teste)

mse = metrics.mean_squared_error(y_r_teste, pred_teste_rf)
# RMSE is the square root of the MSE; this avoids `squared=False`, which was
# deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(mse)
mad = metrics.mean_absolute_error(y_r_teste, pred_teste_rf)
mape = metrics.mean_absolute_percentage_error(y_r_teste, pred_teste_rf)

print(f'''
      Random Forest Regression Standard - Teste
      EQM: {round(mse,2)}
      REQM: {round(rmse,2)}
      MAD: {round(mad,2)}
      MAPE: {round(mape*100,2)}%
      ''')


      Random Forest Regression Standard - Teste
      EQM: 6360267.59
      REQM: 2521.96
      MAD: 2282.98
      MAPE: 20.52%
      


In [63]:
# Overlay the random-forest predictions (orange) and the observed test values (gray).
fig = go.Figure()
fig.add_trace(go.Scatter(
    name='Previsto',
    x=df_r_teste['data'],
    y=pred_teste_rf,
    marker=dict(color='orange'),
))
fig.add_trace(go.Scatter(
    name='Real',
    x=df_r_teste['data'],
    y=y_r_teste,
    marker=dict(color='gray'),
))
fig.update_layout(title_text='Real vs. Previsto (Random Forest Regression)', font=dict(color='black'))
fig.show()

# Plot dos Três Métodos

In [65]:
# Compare the three forecasts against the observed test values on one chart.
fig = go.Figure()
fig.add_trace(go.Scatter(
    name='Previsto Prophet',
    x=forecast['ds'],
    y=forecast['yhat'],
    marker=dict(color='green'),
))
fig.add_trace(go.Scatter(
    name='Previsto ExtraTrees',
    x=df_r_teste['data'],
    y=pred_teste_et,
    marker=dict(color='orange'),
))
fig.add_trace(go.Scatter(
    name='Previsto RandomForest',
    x=df_r_teste['data'],
    y=pred_teste_rf,
    marker=dict(color='blue'),
))
fig.add_trace(go.Scatter(
    name='Real',
    x=df_r_teste['data'],
    y=y_r_teste,
    marker=dict(color='gray'),
))
fig.update_layout(title_text='Real vs. Previsto', font=dict(color='black'))
fig.show()