In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import yaml
import shap
import joblib
import os


from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

from src.tabular.processing import split_data, build_pipeline

# Load the experiment configuration. The encoding is pinned to UTF-8 so the
# file is decoded the same way on every platform instead of falling back to
# the locale-dependent default of open().
with open("../config.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# The dataset location is read from the config and resolved against the
# parent of the working directory, like the config file itself.
df = pd.read_csv(f"../{config['data']['tabular_path']}")


In [None]:
# Build the feature matrix by discarding the target column and the columns
# that are not used as predictors ("id" and "Unnamed: 32").
# errors="ignore" lets the drop proceed when any of them is absent.
X = df.drop(columns=["diagnosis", "id", "Unnamed: 32"], errors="ignore")
y = df["diagnosis"]

# Split through the project helper; size and seed come from the "split"
# section of the config.
split_params = {
    "test_size": config["split"]["test_size"],
    "random_state": config["split"]["random_state"],
}
X_train, X_test, y_train, y_test = split_data(X, y, **split_params)


In [None]:
# Logistic regression with the iteration cap taken from the config.
lr_max_iter = config["models"]["logistic_regression"]["max_iter"]
lr = LogisticRegression(max_iter=lr_max_iter)

# Wrap the estimator with the project's build_pipeline helper and fit it on
# the training split.
lr_pipeline = build_pipeline(lr)
lr_pipeline.fit(X_train, y_train)

# Predict on the test split and print the per-class metrics report.
y_pred_lr = lr_pipeline.predict(X_test)
lr_report = classification_report(y_test, y_pred_lr)
print(lr_report)


In [None]:
# Random forest hyperparameters come from the config. The seed is read from
# an optional "random_state" entry of the same section and falls back to 42
# when that entry is absent, so configs without it behave exactly as a
# hard-coded 42 would.
rf_config = config["models"]["random_forest"]
rf = RandomForestClassifier(
    n_estimators=rf_config["n_estimators"],
    max_depth=rf_config["max_depth"],
    random_state=rf_config.get("random_state", 42),
)

# Wrap the estimator with the project's build_pipeline helper and fit it on
# the training split.
rf_pipeline = build_pipeline(rf)
rf_pipeline.fit(X_train, y_train)

# Predict on the test split and print the per-class metrics report.
y_pred_rf = rf_pipeline.predict(X_test)
print(classification_report(y_test, y_pred_rf))


In [None]:
# Confusion matrix of the random forest predictions on the test split.
# The class order is fixed explicitly (sorted labels seen in either vector,
# which is also what confusion_matrix uses by default) so the same list can
# label the ticks; without it the heatmap does not show which row/column
# belongs to which class.
class_labels = sorted(set(y_test) | set(y_pred_rf))
cm = confusion_matrix(y_test, y_pred_rf, labels=class_labels)

ax = sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    xticklabels=class_labels,
    yticklabels=class_labels,
)
# confusion_matrix puts true classes on rows and predicted classes on columns.
ax.set_xlabel("Classe prevista")
ax.set_ylabel("Classe real")
ax.set_title("Matriz de Confusão – Random Forest")
plt.show()


In [None]:
# Pair each predictor column with the importance score stored on the forest.
# NOTE(review): reading from `rf` assumes build_pipeline fits this very
# estimator object (no clone) and that the model sees the columns of X in
# the same number and order — confirm against build_pipeline.
features = X.columns
importances = rf.feature_importances_

# Table of importances, most important feature first.
fi = pd.DataFrame({"Feature": features, "Importance": importances})
fi = fi.sort_values(by="Importance", ascending=False)

# Horizontal bar chart: one bar per feature.
sns.barplot(data=fi, x="Importance", y="Feature")
plt.title("Importância das Variáveis Clínicas")
plt.show()


In [None]:
# Destination of the serialized pipeline.
# NOTE(review): the file name says "maternal_risk" while the target column
# used in this notebook is "diagnosis" — confirm this is the name downstream
# loaders expect before renaming anything.
model_path = "../models/maternal_risk_model.pkl"

# Create the models directory if it does not exist yet.
os.makedirs(os.path.dirname(model_path), exist_ok=True)

# Persist the complete fitted pipeline.
joblib.dump(rf_pipeline, model_path)

print(f"Modelo salvo em: {model_path}")
