In [6]:
# 1. Imports
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, f1_score
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.utils import resample

# Modelos
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import (
    GradientBoostingClassifier, RandomForestClassifier,
    BaggingClassifier, AdaBoostClassifier
)

# 2. Carregamento de dados
df = pd.read_csv('C:/Users/Windows/Desktop/Carreira/CAIXAVERSO/deploy_app/MOCK_DATA.csv')

# 3. Pré-processamento
def preprocess_data(df):
    df = df.copy()

    # Binárias -> int
    bool_cols = [
        "usa_app_caixa", "tem_seguro_vida", "tem_cartao_credito", "inadimplente",
        "recebeu_oferta_seguro", "clicou_oferta_app", "aderiu_produto"
    ]
    df[bool_cols] = df[bool_cols].astype(bool).astype(int)

    # Categóricas -> LabelEncoder
    cat_cols = ["sexo", "estado_civil", "escolaridade", "regiao", "produto_principal"]
    le = LabelEncoder()
    for col in cat_cols:
        df[col] = le.fit_transform(df[col])

    # Normalização
    num_cols = [
        "idade", "renda_mensal", "tempo_clientes_anos", "limite_credito",
        "frequencia_acesso_ap_p_mes", "qte_transacoes_pix_mes",
        "saldo_medio_6m", "score_credito", "tempo_resposta_oferta"
    ]
    df[num_cols] = StandardScaler().fit_transform(df[num_cols])

    return df

# 4. Seleção de Atributos
def select_best_features(X, y, k=2):
    selector = SelectKBest(score_func=f_classif, k=k)
    X_new = selector.fit_transform(X, y)
    selected_features = X.columns[selector.get_support()].tolist()
    print("✅ Features selecionadas:", selected_features)
    return X[selected_features]

# 5. Balanceamento
def balance_classes(X, y):
    df = pd.concat([X, y], axis=1)
    target_col = y.name

    class_counts = df[target_col].value_counts()
    min_class = class_counts.idxmin()
    max_class = class_counts.idxmax()

    df_min = df[df[target_col] == min_class]
    df_max = df[df[target_col] == max_class]

    df_max_downsampled = resample(
        df_max,
        replace=False,
        n_samples=len(df_min),
        random_state=42
    )

    balanced_df = pd.concat([df_min, df_max_downsampled]).sample(frac=1, random_state=42).reset_index(drop=True)
    X_bal = balanced_df.drop(columns=target_col)
    y_bal = balanced_df[target_col]

    return X_bal, y_bal

# 6. Divisão dos dados
def split_data(X, y):
    return train_test_split(X, y, test_size=0.2, random_state=42)

# 7. Treinamento e Avaliação
def train_and_evaluate_models(X_train, X_test, y_train, y_test):
    models = {
        "KNN": (KNeighborsClassifier(), {"n_neighbors": [3, 5, 7]}),
        "Decision Tree": (DecisionTreeClassifier(), {"max_depth": [3, 5, 10]}),
        "SVM": (SVC(), {"C": [0.1, 1, 10], "kernel": ["linear", "rbf"]}),
        "Gradient Boosting": (GradientBoostingClassifier(), {"n_estimators": [50, 100], "learning_rate": [0.05, 0.1]}),
        "Random Forest": (RandomForestClassifier(), {"n_estimators": [50, 100], "max_depth": [5, 10]}),
        "Bagging": (BaggingClassifier(), {"n_estimators": [10, 50]}),
        "AdaBoost": (AdaBoostClassifier(), {"n_estimators": [50, 100], "learning_rate": [0.05, 0.1]})
    }

    best_model_name = None
    best_model_instance = None
    best_f1 = 0

    for name, (model, params) in models.items():
        print(f"\n🔍 Treinando modelo: {name}")
        grid = GridSearchCV(model, params, cv=5, scoring="f1", n_jobs=1, error_score='raise')
        grid.fit(X_train, y_train)
        best_model = grid.best_estimator_
        preds = best_model.predict(X_test)
        f1 = f1_score(y_test, preds)

        print(classification_report(y_test, preds))
        print(f"✅ Treino: {best_model.score(X_train, y_train):.2f} | Teste: {best_model.score(X_test, y_test):.2f} | F1: {f1:.2f}")

        if f1 > best_f1:
            best_f1 = f1
            best_model_name = name
            best_model_instance = best_model

    print(f"\n🏆 Melhor modelo para produção: {best_model_name} (F1 = {best_f1:.2f})")
    return best_model_instance

# 8. Pipeline
def pipeline_ml(df):
    df = preprocess_data(df)
    X = df.drop(columns="aderiu_produto")
    y = df["aderiu_produto"]
    X = select_best_features(X, y, k=2)
    X_bal, y_bal = balance_classes(X, y)
    X_train, X_test, y_train, y_test = split_data(X_bal, y_bal)
    best_model = train_and_evaluate_models(X_train, X_test, y_train, y_test)
    return best_model

# 9. Executar e salvar
modelo_final = pipeline_ml(df)
joblib.dump(modelo_final, 'modelo_treinado.pkl', compress=3)
print("✅ Modelo salvo como 'modelo_treinado.pkl' 🎯")


✅ Features selecionadas: ['idade', 'usa_app_caixa']

🔍 Treinando modelo: KNN
              precision    recall  f1-score   support

           0       0.59      0.57      0.58       105
           1       0.54      0.56      0.55        94

    accuracy                           0.57       199
   macro avg       0.57      0.57      0.57       199
weighted avg       0.57      0.57      0.57       199

✅ Treino: 0.64 | Teste: 0.57 | F1: 0.55

🔍 Treinando modelo: Decision Tree
              precision    recall  f1-score   support

           0       0.62      0.33      0.43       105
           1       0.51      0.78      0.62        94

    accuracy                           0.54       199
   macro avg       0.57      0.55      0.53       199
weighted avg       0.57      0.54      0.52       199

✅ Treino: 0.56 | Teste: 0.54 | F1: 0.62

🔍 Treinando modelo: SVM
              precision    recall  f1-score   support

           0       0.56      0.50      0.53       105
           1       0

In [7]:
df.head(5)

Unnamed: 0,idade,sexo,renda_mensal,estado_civil,escolaridade,regiao,tempo_clientes_anos,produto_principal,usa_app_caixa,tem_seguro_vida,...,limite_credito,frequencia_acesso_ap_p_mes,qte_transacoes_pix_mes,saldo_medio_6m,inadimplente,score_credito,recebeu_oferta_seguro,clicou_oferta_app,aderiu_produto,tempo_resposta_oferta
0,22,M,4385,viuvo,fundamental,sul,13,cartao-credito,True,False,...,7014,24,31,5353,True,67,True,True,True,15
1,27,F,9994,solteiro,superior,nordeste,5,seguro,False,False,...,13125,29,1,1372,True,62,True,False,False,16
2,21,M,8715,casado,fundamental,centro-oeste,10,investimento,False,False,...,18276,1,18,1440,False,50,True,False,True,3
3,31,M,2107,solteiro,fundamental,centro-oeste,6,emprestimo,False,False,...,5548,47,31,1963,True,92,True,False,True,30
4,34,M,2169,casado,medio,nordeste,8,seguro,True,True,...,23640,15,31,6413,True,70,True,True,True,24
