# **Imports**

In [1]:
# Mount Google Drive in the Colab runtime; the dataset is later read from a path
# under /content/drive.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
# ------------------------------------------
# disable warnings
# ------------------------------------------
import warnings
# NOTE(review): called without a category or module filter, so every warning is
# silenced from here on — confirm that hiding library warnings is intended.
warnings.filterwarnings('ignore')

In [3]:
# ------------------------------------------
# core libraries
# ------------------------------------------
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [4]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import classification_report

In [5]:
# Load df-oficial.csv from the shared folder on the mounted Google Drive.
df = pd.read_csv(
    filepath_or_buffer='/content/drive/Shareddrives/grupo4-rappi-hour/bases-rappi/df-oficial.csv',
)

In [6]:
# Remove the 'Unnamed: 0' column (presumably an index saved by an earlier
# to_csv call — verify against the export step).
# errors='ignore' keeps this cell idempotent: re-running it after the column has
# already been dropped no longer raises KeyError.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [7]:
# Print the column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 157470 entries, 0 to 157469
Data columns (total 33 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   IS_ACTIVE                   157470 non-null  int64  
 1   AUTO_ACEITE                 157470 non-null  int64  
 2   COOKING_TIME_MEDIO          157470 non-null  float64
 3   ITENS_MEDIO                 157470 non-null  float64
 4   DISTANCE_TO_USER            157470 non-null  float64
 5   CANCELS_OPS_RT              157470 non-null  float64
 6   ACCEPTANCE_RATE             157470 non-null  float64
 7   PUNISHMENT_MINUTES          157470 non-null  float64
 8   TEMPORARY_BLOCKS            157470 non-null  float64
 10  ORDERS_PER_HOURS_CONNECTED  157470 non-null  float64
 11  age-1st-qtile               157470 non-null  int64  
 12  age-2nd-qtile               157470 non-null  int64  
 13  age-3rd-qtile               157470 non-null  int64  
 14  age-4th-qtile 

# Features / Target


In [8]:
# Target: the IS_ACTIVE column. Features: every other column of df.
y = df.loc[:, "IS_ACTIVE"]
x = df.drop(columns=["IS_ACTIVE"])

In [None]:
# Display the target Series.
# NOTE(review): squeeze() only changes a Series that has a single element, and
# the result is not assigned, so this cell just shows y — confirm it can be
# replaced by a plain `y` (or `y.value_counts()` to inspect class balance).
y.squeeze()

0         1
1         1
2         1
3         1
4         1
         ..
157465    0
157466    0
157467    0
157468    0
157469    0
Name: IS_ACTIVE, Length: 157470, dtype: int64

In [9]:
# Metrics passed to every cross_validate call in this notebook:
# macro-averaged precision and macro-averaged recall.
scoring = [
    'precision_macro',
    'recall_macro',
]

In [10]:
from sklearn.model_selection import cross_validate

# Modelos - Cross Validation 

Nesta seção faremos a validação cruzada (cross-validation) dos algoritmos de machine learning escolhidos: AdaBoost, Random Forest e LightGBM.

## AdaBoost Cross Validation

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# Fixed seed so the cross-validation scores can be reproduced on a re-run
# (13 is the seed this notebook already uses for RandomUnderSampler).
adaboost = AdaBoostClassifier(random_state=13)

In [None]:
# Cross-validate AdaBoost on the full x / y with cv=5, scored with the metrics
# in `scoring`; error_score='raise' re-raises any fit/score failure instead of
# recording a placeholder score.
ada_scores = cross_validate(
    estimator=adaboost,
    X=x,
    y=y,
    scoring=scoring,
    cv=5,
    error_score='raise',
)

In [None]:
# Show the per-fold fit/score times and macro precision/recall for AdaBoost.
ada_scores

{'fit_time': array([15.11308432,  9.51332951,  9.46014214,  9.44577193, 10.58348393]),
 'score_time': array([0.34097242, 0.35418534, 0.36006856, 0.35078287, 0.34775138]),
 'test_precision_macro': array([0.71428585, 0.88704187, 0.83881698, 0.87473748, 0.87449879]),
 'test_recall_macro': array([0.69588705, 0.79010995, 0.81340696, 0.78468703, 0.78559256])}

## RandomForest Cross Validation

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fixed seed so the bootstrap samples and feature sub-sampling — and therefore
# the cross-validation scores — are reproducible on a re-run
# (13 is the seed this notebook already uses for RandomUnderSampler).
ranfor = RandomForestClassifier(random_state=13)

In [None]:
# Cross-validate the random forest on the full x / y with cv=5, scored with the
# metrics in `scoring`; error_score='raise' re-raises any fit/score failure.
ranfor_scores = cross_validate(
    estimator=ranfor,
    X=x,
    y=y,
    scoring=scoring,
    cv=5,
    error_score='raise',
)

In [None]:
# Show the per-fold fit/score times and macro precision/recall for the random forest.
ranfor_scores

{'fit_time': array([25.48084664, 27.60660076, 27.29132056, 26.0064764 , 26.10364151]),
 'score_time': array([0.90615225, 0.93442965, 1.00150537, 0.96426654, 0.94860005]),
 'test_precision_macro': array([0.86148803, 0.89675985, 0.87479634, 0.8784946 , 0.87770068]),
 'test_recall_macro': array([0.79310455, 0.83411478, 0.8493921 , 0.83415747, 0.82754613])}

## LightGBM Cross Validation


In [21]:
from lightgbm import LGBMClassifier

# Import the class directly instead of `import lightgbm as lgb` followed by
# `lgb = lgb.LGBMClassifier()`, which replaced the module alias with the
# estimator instance in the same cell.
# The estimator keeps the name `lgb` because the cross-validation cells that
# follow reference it.
lgb = LGBMClassifier()

In [12]:
# Cross-validate the LightGBM classifier on the full x / y with cv=5, scored
# with the metrics in `scoring`; error_score='raise' re-raises any failure.
scores_lgb = cross_validate(
    estimator=lgb,
    X=x,
    y=y,
    scoring=scoring,
    cv=5,
    error_score='raise',
)

In [13]:
# Show the per-fold fit/score times and macro precision/recall for LightGBM.
scores_lgb

{'fit_time': array([5.03361535, 5.51448822, 8.10115027, 7.29791737, 4.95582581]),
 'score_time': array([0.42752242, 0.39631033, 0.90471911, 0.39884233, 0.40124345]),
 'test_precision_macro': array([0.86681858, 0.90165403, 0.87691125, 0.88097714, 0.87797645]),
 'test_recall_macro': array([0.8279951 , 0.84859355, 0.85780917, 0.84447514, 0.83471705])}

# Seção de Balanceamento e Nova Rodada do Método de Validação Cruzada

Nesta seção repetimos a validação cruzada dos modelos, desta vez após balancear as classes do target por subamostragem aleatória (random undersampling), isto é, reduzindo o número de amostras da classe majoritária.

In [None]:
from imblearn.under_sampling import RandomUnderSampler

# Randomly under-sample x / y with a fixed seed (13) to balance the target classes.
# NOTE(review): the resampled data is what the cross_validate cells below receive,
# so their validation folds are balanced too — confirm that scoring on a balanced
# distribution (rather than the original one) is intended.
rus = RandomUnderSampler(random_state=13)
x_resampled, y_resampled = rus.fit_resample(X=x, y=y)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate

# `logreg` was not defined anywhere in the visible notebook, so this cell raised
# NameError on a fresh "Restart & Run All". Define it here.
# NOTE(review): default hyperparameters are an assumption — confirm them against
# the configuration that produced the recorded scores.
logreg = LogisticRegression()

# Cross-validate logistic regression on the under-sampled data with cv=5;
# error_score='raise' re-raises any fit/score failure.
scores_logreg_resampled = cross_validate(logreg, x_resampled, y_resampled, cv=5, scoring=scoring, error_score='raise')

In [None]:
# Show the per-fold fit/score times and macro precision/recall for logistic regression.
# NOTE(review): the recorded macro precision (about 0.11-0.16) and recall
# (about 0.17-0.26) are far below every other model in this notebook — confirm
# the output was produced with the same x_resampled / y_resampled.
scores_logreg_resampled

{'fit_time': array([0.58538127, 0.66056943, 0.58818173, 0.5420711 , 0.54924512]),
 'score_time': array([0.01806712, 0.01891398, 0.01867294, 0.01741815, 0.0174768 ]),
 'test_precision_macro': array([0.15814015, 0.1462286 , 0.13550834, 0.12824378, 0.11316322]),
 'test_recall_macro': array([0.25574266, 0.23006631, 0.20756191, 0.19408953, 0.16966536])}

## LGBM Resampled Cross Validation


In [None]:
# Cross-validate the LightGBM classifier on the under-sampled data with cv=5,
# scored with the metrics in `scoring`; error_score='raise' re-raises any failure.
scores_lgb_resampled = cross_validate(
    estimator=lgb,
    X=x_resampled,
    y=y_resampled,
    scoring=scoring,
    cv=5,
    error_score='raise',
)

In [None]:
# Show the per-fold fit/score times and macro precision/recall for LightGBM on the under-sampled data.
scores_lgb_resampled

{'fit_time': array([2.05402064, 2.0207746 , 1.98676229, 2.02315831, 1.98045421]),
 'score_time': array([0.13877368, 0.1425364 , 0.1385622 , 0.14075255, 0.13506341]),
 'test_precision_macro': array([0.8661568 , 0.86622151, 0.87163214, 0.86739217, 0.87220057]),
 'test_recall_macro': array([0.86380102, 0.86599055, 0.87085295, 0.86710126, 0.87196871])}

## AdaBoost Resampled Cross Validation

In [None]:
# Cross-validate AdaBoost on the under-sampled data with cv=5, scored with the
# metrics in `scoring`; error_score='raise' re-raises any fit/score failure.
scores_ada_resampled = cross_validate(
    estimator=adaboost,
    X=x_resampled,
    y=y_resampled,
    scoring=scoring,
    cv=5,
    error_score='raise',
)

In [None]:
# Show the per-fold fit/score times and macro precision/recall for AdaBoost on the under-sampled data.
scores_ada_resampled

{'fit_time': array([3.33178544, 3.46661139, 3.31442618, 3.77578759, 3.96159816]),
 'score_time': array([0.163908  , 0.15446496, 0.16484952, 0.15984869, 0.16902947]),
 'test_precision_macro': array([0.74632501, 0.82260446, 0.82958718, 0.82538933, 0.82973512]),
 'test_recall_macro': array([0.70647826, 0.81289982, 0.8220039 , 0.81399392, 0.82199044])}

## RandomForest Resampled Cross Validation

In [None]:
# Cross-validate the random forest on the under-sampled data with cv=5, scored
# with the metrics in `scoring`; error_score='raise' re-raises any failure.
scores_ranfor_resampled = cross_validate(
    estimator=ranfor,
    X=x_resampled,
    y=y_resampled,
    scoring=scoring,
    cv=5,
    error_score='raise',
)

In [None]:
# Show the per-fold fit/score times and macro precision/recall for the random forest on the under-sampled data.
scores_ranfor_resampled

{'fit_time': array([ 7.65265751,  7.36240768,  7.65677857,  9.59747124, 11.40754175]),
 'score_time': array([0.32046914, 0.29967475, 0.30920959, 0.33145308, 0.33207035]),
 'test_precision_macro': array([0.85366173, 0.86135902, 0.85972192, 0.86061424, 0.86249916]),
 'test_recall_macro': array([0.85093705, 0.86086357, 0.85868539, 0.86023468, 0.86197306])}