# Modelo de machine learning

In [1]:
pip install xgboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [5]:
import pandas as pd

# Load the pre-encoded dataset, using the saved 'Unnamed: 0' column as the row index.
df = pd.read_csv(
    'df_encoded.csv',
    index_col='Unnamed: 0',
)
# Preview the first five rows as the cell output.
df.head(n=5)

Unnamed: 0,Survived,Pclass,Age,Fare,FamilyMembers,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,1,1,38.0,71.2833,1,1,0,1,0,0
1,1,3,26.0,7.925,0,1,0,0,0,1
2,1,1,35.0,53.1,1,1,0,0,0,1
3,1,3,27.0,11.1333,2,1,0,0,0,1
4,1,2,14.0,30.0708,1,1,0,1,0,0


In [6]:
from sklearn.model_selection import train_test_split

# Seed shared by the split (and by the seeded models further down).
SEED = 42

# Target column and feature matrix (everything except the target).
y = df['Survived']
X = df.drop(columns='Survived')

# 70/30 hold-out split, stratified on the target so both parts keep the same
# class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=SEED,
    stratify=y,
)

In [7]:
def executa_modelo(modelo):
    """Fit ``modelo`` on the training split and predict the test split.

    The function reads the notebook-level ``X_train``, ``y_train`` and
    ``X_test`` variables, so the train/test split cell must have run first.

    Parameters
    ----------
    modelo : estimator exposing ``fit(X, y)`` and ``predict(X)``.
        It is fitted in place.

    Returns
    -------
    Whatever ``modelo.predict(X_test)`` returns.
    """
    modelo.fit(X_train, y_train)
    return modelo.predict(X_test)

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.dummy import DummyClassifier
from xgboost import XGBClassifier


# Same seed value as the train/test split cell; repeated so this cell states
# the seed used by the models below.
SEED = 42

# Candidate models. Every estimator that accepts a seed receives SEED so the
# comparison is reproducible across runs.
dummy = DummyClassifier(random_state = SEED)
lr = LogisticRegression(max_iter = 1000, random_state = SEED)
rf = RandomForestClassifier(random_state = SEED)
# Previously the only unseeded model; seeded like its siblings for consistency.
xgboost = XGBClassifier(random_state = SEED)

modelos = [dummy, lr, rf, xgboost]
# Maps each fitted estimator object to its predictions on the test split.
resultados = {}


for modelo in modelos:
    y_pred = executa_modelo(modelo)
    resultados[modelo] = y_pred

resultados

{DummyClassifier(random_state=42): array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0]),
 LogisticRegression(max_iter=1000, random_state=42): arr

In [16]:
from sklearn import metrics

def valida_modelo(modelo, y_test, y_pred):
    """Compute accuracy, precision, recall and F1 for one set of predictions.

    Parameters
    ----------
    modelo : unused; kept so existing callers that pass the estimator still work.
    y_test : true labels.
    y_pred : predicted labels, aligned with ``y_test``.

    Returns
    -------
    list
        ``[accuracy, precision, recall, f1]``, each rounded to 4 decimals.
    """
    # zero_division=0 keeps the 0.0 score for a model that never predicts the
    # positive class (e.g. the dummy baseline) without emitting an
    # undefined-metric warning.
    pontuacoes = [
        metrics.accuracy_score(y_test, y_pred),
        metrics.precision_score(y_test, y_pred, zero_division=0),
        metrics.recall_score(y_test, y_pred, zero_division=0),
        metrics.f1_score(y_test, y_pred, zero_division=0),
    ]

    # Builtin round() works whether the metric comes back as a NumPy scalar or
    # a plain Python float; the ``.round`` method exists only on the former.
    return [round(pontuacao, 4) for pontuacao in pontuacoes]


# Row labels of the metrics table, in the order valida_modelo returns them.
index = ['Acurácia', 'Precisão', 'Recall', 'F1']
df_metricas = pd.DataFrame(index=index)

# One column per fitted model, labelled (for now) by the estimator object itself.
for modelo in resultados:
    resultado = resultados[modelo]
    df_metricas[modelo] = valida_modelo(modelo, y_test, resultado)

  _warn_prf(average, modifier, msg_start, len(result))


In [18]:
# Replace the estimator-object column labels with short names; the order
# matches the order in which the models were added to the table.
df_metricas.columns = [
    'Dummy',
    'LogisticRegression',
    'RandomForest',
    'XgBoost',
]
# One row per model; highlight the best value of each metric.
df_metricas.transpose().style.highlight_max()

Unnamed: 0,Acurácia,Precisão,Recall,F1
Dummy,0.6157,0.0,0.0,0.0
LogisticRegression,0.8022,0.7451,0.7379,0.7415
RandomForest,0.8097,0.7826,0.699,0.7385
XgBoost,0.8582,0.8736,0.7379,0.8


Dá para perceber que o XGBoost é o melhor dos modelos; por isso, ele será mantido para análises futuras. É possível melhorar, mas devemos analisar com calma.

In [21]:
def imprime_resultado(resultados):
    """Print the mean test and train scores as percentages.

    Parameters
    ----------
    resultados : mapping with ``'test_score'`` and ``'train_score'`` entries,
        each exposing ``.mean()`` (e.g. NumPy arrays of per-fold scores).
    """
    # Average each score array and scale it to a percentage.
    media_teste, media_treino = (
        resultados[chave].mean() * 100
        for chave in ('test_score', 'train_score')
    )

    print(f'Acurácia: teste = {media_teste:.2f}, treino = {media_treino:.2f}')

In [24]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate


def validacao_cruzada(modelo):
    """Cross-validate ``modelo`` with 10 stratified folds and print the means.

    Uses the notebook-level ``X`` and ``y``. Train scores are requested as
    well so that ``imprime_resultado`` can report both averages.

    Parameters
    ----------
    modelo : estimator accepted by ``cross_validate``.
    """
    pontuacoes = cross_validate(
        modelo,
        X,
        y,
        cv=StratifiedKFold(n_splits=10),
        return_train_score=True,
    )
    imprime_resultado(pontuacoes)

In [25]:
SEED = 42

# Pass the seed to the model: it was previously defined here but never used.
modelo = XGBClassifier(random_state = SEED)
validacao_cruzada(modelo)

Acurácia: teste = 86.53, treino = 89.94


As métricas de treino e teste estão muito boas. É possível melhorar esses números, caso desejemos. No momento, o modelo será utilizado sem otimização.