In [None]:
import os

import joblib
import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler


In [None]:
import inspect
import sys

# The notebook imports src/ from its parent directory, so the parent must be on
# sys.path before the first `import src...`. Guarded so that re-running the
# cell does not append duplicate entries.
if ".." not in sys.path:
    sys.path.append("..")

import src.data as d

# Sanity check: show which file `src.data` was loaded from and the source
# of `load_data` as currently imported.
print(d.__file__)
print(inspect.getsource(d.load_data))



In [None]:
import importlib
import sys

# The parent directory must be importable before `src.data` can be imported on
# a fresh kernel. Guarded so that re-running the cell adds no duplicates.
if ".." not in sys.path:
    sys.path.append("..")

import src.data

# Re-execute the module so that the import below binds the reloaded `load_data`.
importlib.reload(src.data)

from src.data import load_data

df = load_data()
df.shape


In [None]:
import sys

# Make src/ importable from notebooks/. The membership check keeps the cell
# idempotent: re-running it no longer appends ".." to sys.path again.
if ".." not in sys.path:
    sys.path.append("..")

from src.data import load_data

df = load_data()
df.shape



In [None]:
# Print the dtype of every column, then preview the first rows of the frame.
column_dtypes = df.dtypes
print(column_dtypes)
df.head()


In [None]:
TARGET = "nivel_obesidade"  # adjust if your target column has a different name

# Fail fast, listing the available columns, when the target column is missing.
target_is_present = TARGET in df.columns
if not target_is_present:
    raise ValueError(f"Coluna alvo '{TARGET}' não existe. Colunas: {list(df.columns)}")

# Absolute class counts and the corresponding percentages (2 decimal places).
target_counts = df[TARGET].value_counts()
target_percent = (df[TARGET].value_counts(normalize=True) * 100).round(2)

print("Distribuição do alvo:")
print(target_counts)
print("\nPercentual:")
print(target_percent)


In [None]:
# Separate features (X) from the target (y), then split into train and test.

y = df[TARGET]
X = df.drop(columns=[TARGET])

# 80/20 split with a fixed seed; stratify=y is passed so the split is
# stratified on the target.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

X_train.shape, X_test.shape


In [None]:
# Preprocessing (numeric + categorical)

# Detect numeric columns by dtype family instead of comparing against "object".
# The old `dtype == "object"` test sent category columns (and pandas' dedicated
# string dtype) into the median-imputer/scaler branch. Boolean columns stay
# with the numeric group, as before.
num_cols = X.select_dtypes(include=["number", "bool"]).columns.tolist()
_numeric_names = set(num_cols)
cat_cols = [c for c in X.columns if c not in _numeric_names]

# Numeric columns: fill missing values with the median, then standardize.
numeric_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# Categorical columns: fill missing values with the most frequent value, then
# one-hot encode. handle_unknown="ignore" keeps categories unseen during fit
# from raising an error at transform time.
categorical_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

# Route each column group through its own pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_pipe, num_cols),
        ("cat", categorical_pipe, cat_cols),
    ]
)

print("Numéricas:", num_cols)
print("Categóricas:", cat_cols)


In [None]:
# Try baseline models

# Candidate estimators. These act as unfitted templates: each one is cloned
# before fitting, so re-running this cell always starts from a clean state.
models = {
    "LogReg": LogisticRegression(max_iter=3000),
    "RandomForest": RandomForestClassifier(n_estimators=500, random_state=42, n_jobs=-1),
    "GradientBoosting": GradientBoostingClassifier(random_state=42),
}

results = []  # (model name, test accuracy) for every candidate

best_name = None
best_pipe = None
best_acc = -1

# NOTE(review): the winner is chosen by accuracy on the test split, so the test
# accuracy reported for it is optimistically biased. Consider selecting with
# cross-validation on the training split instead.
for name, model in models.items():
    # Pipeline does not copy its steps, so without clone() every pipeline would
    # share (and refit in place) the same preprocessor object. Cloning gives
    # each pipeline, including the one that gets saved, its own fitted steps.
    pipe = Pipeline(steps=[("prep", clone(preprocessor)), ("model", clone(model))])
    pipe.fit(X_train, y_train)
    preds = pipe.predict(X_test)
    acc = accuracy_score(y_test, preds)

    results.append((name, acc))
    # Strict '>' keeps the first model in dict order when accuracies tie.
    if acc > best_acc:
        best_name, best_pipe, best_acc = name, pipe, acc

results


In [None]:
# Metrics for the best model

# Predict on the test split with the selected pipeline, then build the
# per-class report and the confusion matrix.
preds = best_pipe.predict(X_test)
report_text = classification_report(y_test, preds)
confusion = confusion_matrix(y_test, preds)

print("Melhor modelo:", best_name)
print("Acurácia:", round(best_acc, 4))

print("\nRelatório de Classificação:")
print(report_text)

print("\nMatriz de Confusão:")
print(confusion)


In [None]:
# Save the model

# Single source of truth for the artifact location; its parent directory is
# created on demand before the fitted pipeline is serialized with joblib.
MODEL_PATH = "../models/model.joblib"

os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
joblib.dump(best_pipe, MODEL_PATH)

print(f"Modelo salvo em {MODEL_PATH}")
