In [None]:
import sys
from pathlib import Path

import os
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, RepeatedKFold, RepeatedStratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.experimental import enable_iterative_imputer 
from sklearn.impute import IterativeImputer
from imblearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector as selector
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import accuracy_score
from sklearn.experimental import enable_iterative_imputer

from imblearn.over_sampling import SMOTE
from category_encoders import BinaryEncoder
from ydata_profiling import ProfileReport
import optuna

# Modelos de Machine Learning
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier

from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
import xgboost as xgb
from xgboost import XGBClassifier

from data.io import load_data

import pandas as pd
import numpy as np

%matplotlib inline

# Adiciona o diretório raiz do projeto (titanic/) ao sys.path
sys.path.append(str(Path().resolve().parent))

In [None]:
# Load the training data through the project-local `load_data` helper
# (imported from `data.io`).

df_train = load_data('train.csv')

# Display the result of `.head()` as the cell output.
# NOTE(review): presumably a pandas DataFrame preview — `load_data`'s return
# type is not visible in this notebook; confirm.
df_train.head()

In [None]:
# Column-wise preprocessing shared by every model pipeline:
#   * "num": columns selected by dtype_include="number" are standardized.
#   * "cat": every remaining (non-numeric) column is one-hot encoded.
# handle_unknown="ignore" encodes categories that were not seen during `fit`
# as an all-zero row instead of raising. Without it, a category that appears
# only in a validation fold or the test split makes `transform` fail.
Preprocessing = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), selector(dtype_include="number")),
        ("cat", OneHotEncoder(handle_unknown="ignore"), selector(dtype_exclude="number")),
    ]
)

In [None]:
# Candidate classifiers, keyed by the display name used when reporting results.
# Estimators that draw random numbers while fitting are given a fixed
# `random_state` so the comparison is reproducible across kernel restarts
# (42 matches the seed already used for SMOTE in this notebook).
# K-Nearest Neighbors and Gaussian Naive Bayes take no `random_state`;
# Logistic Regression and SVM are left at their defaults.
models = {
    'Logistic Regression': LogisticRegression(max_iter=200),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'SVM': SVC(),
    'Random Forest': RandomForestClassifier(random_state=42),
    'XGBoost': xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Gaussian Naive Bayes': GaussianNB(),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'AdaBoost': AdaBoostClassifier(random_state=42),
    'Extra Trees': ExtraTreesClassifier(random_state=42),
    'MLP': MLPClassifier(max_iter=1000, random_state=42)
}

# Repeated, stratified 3-fold cross-validation (10 repeats).
# A fixed integer seed makes every call to `cv.split` yield the same
# partitions, so all models are scored on identical folds and the ranking is
# reproducible. Stratification keeps the class proportions of `target`
# roughly equal in every fold.
cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=10, random_state=42)

# NOTE(review): `TransformData`, `features` and `target` are not defined in
# the visible cells — confirm they are created before this cell runs.
results = {}  # model display name -> mean cross-validated score
for name, model in models.items():
    pipeline = Pipeline([
        ("Organizing", TransformData()),
        ("Preprocessing", Preprocessing),
        ("Balancing", SMOTE(sampling_strategy='auto', random_state=42)),
        ("Model", model),
    ])

    scores = cross_val_score(pipeline, features, target, cv=cv)
    results[name] = scores.mean()

    print(f"{name} CV Accuracy: {results[name]:.4f}")


def _rank_key(model_name):
    """Return the model's mean CV score, ranking NaN (failed fits) last."""
    mean_score = results[model_name]
    # Comparisons with NaN are always False, so a NaN score could otherwise
    # be returned by max() as the "best" model.
    return float("-inf") if np.isnan(mean_score) else mean_score


best_model_name = max(results, key=_rank_key)
print(f"\nBest model based on CV: {best_model_name}")

# Train the best model on the full training set and evaluate it on the test set.
# NOTE(review): `TransformData`, `X_train`, `y_train`, `X_test` and `y_test`
# are not defined in the visible cells — confirm they are created before this
# cell runs, and that the test split was not part of the data used for
# cross-validation.
best_model = models[best_model_name]

# Use the same steps as the cross-validated pipeline, including "Balancing".
# That step was previously missing here, so the refit model differed from the
# configuration that was selected. The imbalanced-learn Pipeline applies
# samplers only during `fit`, so scoring on the test set is not resampled.
pipeline = Pipeline([
    ("Organizing", TransformData()),
    ("Preprocessing", Preprocessing),
    ("Balancing", SMOTE(sampling_strategy='auto', random_state=42)),
    ("Model", best_model),
])

pipeline.fit(X_train, y_train)
test_accuracy = pipeline.score(X_test, y_test)
print(f"Test set Accuracy: {test_accuracy:.4f}")