In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score
from imblearn.under_sampling import AllKNN

In [None]:
# Load the Titanic dataset from the public datasciencedojo mirror.
# Requires network access at execution time.
url = (
    "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
)
df = pd.read_csv(filepath_or_buffer=url)

In [None]:
# Pre-processing.
# Drop the free-text columns that are not used as model features.
df = df.drop(columns=['Name', 'Ticket', 'Cabin'])

# Encode the categorical columns as integers; any value missing from a
# mapping (including NaN) becomes NaN.
df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})
df['Embarked'] = df['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})

# Impute missing values by assigning the filled column back to the frame.
# The previous `df[col].fillna(..., inplace=True)` form is a chained in-place
# call on a temporary column object: it triggers a FutureWarning on recent
# pandas and does not modify `df` at all once Copy-on-Write is enabled.
# NOTE(review): the mean/mode are computed on the full dataset, i.e. before
# the train/test split performed later in the notebook.
df['Age'] = df['Age'].fillna(df['Age'].mean())
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

In [None]:
# Separate the target column (y) from the feature matrix (X), then show
# the class proportions of the target before any resampling.
y = df['Survived']
X = df.drop(columns='Survived')
print("Verificação da Variável Alvo sem balanceamento")
print(y.value_counts(normalize=True).to_frame())

In [None]:
# Hold out 20% of the rows as the test set; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Rebalance the training classes with AllKNN under-sampling.
# Only the training split is resampled; the test split is left untouched.
allknn = AllKNN(
    sampling_strategy='auto',
    n_neighbors=4,
    allow_minority=True,
)
X_train_resampled, y_train_resampled = allknn.fit_resample(X_train, y_train)

In [None]:
# Fit a Random Forest classifier (fixed seed) on the resampled training data.
# `fit` returns the estimator itself, so `model` is the fitted classifier.
model = RandomForestClassifier(random_state=42).fit(
    X_train_resampled,
    y_train_resampled,
)

In [None]:
# Score the held-out test set.
# y_proba: second column of predict_proba (index 1).
# y_pred: class labels returned by predict.
y_proba = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)

In [None]:
# Evaluation metrics on the test set.
# Accuracy and F1 use the hard predictions; AUC uses the probabilities.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
f1 = f1_score(y_true=y_test, y_pred=y_pred)
auc_score = roc_auc_score(y_true=y_test, y_score=y_proba)

In [None]:
# Print (as a table, not a chart) the class proportions of the training
# target after AllKNN resampling.
print("Verificação da Variável Alvo com balanceamento")
print(y_train_resampled.value_counts(normalize=True).to_frame())

In [None]:
# Find the probability cut-off that maximises F1.
# Scans 1000 evenly spaced thresholds in [0, 1]; np.argmax returns the first
# maximum, so ties resolve to the lowest threshold.
# NOTE(review): the threshold is selected on the test set itself, so the
# resulting F1 is an optimistic estimate.
thresholds = np.linspace(start=0, stop=1, num=1000)
f1_scores = [f1_score(y_test, y_proba >= cutoff) for cutoff in thresholds]
optimal_threshold = thresholds[int(np.argmax(f1_scores))]

In [None]:
# Summary of the evaluation: default-threshold metrics plus the F1-optimal
# probability cut-off.
print("Resultados:")
for label, value in (
    ("AUC Score:", auc_score),
    ("F1 Score:", f1),
    ("Acurácia:", accuracy),
    ("Limiar de probabilidade ótimo para F1:", optimal_threshold),
):
    print(label, value)