In [3]:
# 15_1: Stacking Logit + LightGBM + XGBoost con SMOTE
# ====================================================

# 🚗 Montar Google Drive
from google.colab import drive
drive.mount('/content/drive')

# 🔐 Reproducibilidad
import os, random
import numpy as np
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
os.environ["PYTHONHASHSEED"] = str(SEED)

# 📚 Librerías
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, log_loss
)
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from imblearn.over_sampling import SMOTE
import shap
import matplotlib.pyplot as plt

# 📂 Rutas
ruta_base = "/content/drive/MyDrive/Datos/6_Base_Modelos_Predictivos.parquet"
ruta_csv = "/content/drive/MyDrive/Resultados/resultados_comparativos_modelos_turismo.csv"
ruta_roc = "/content/drive/MyDrive/Resultados/15_1_Curva_ROC_Stacking_SMOTE.png"
ruta_prc = "/content/drive/MyDrive/Resultados/15_1_Curva_PRC_Stacking_SMOTE.png"
ruta_shap = "/content/drive/MyDrive/Resultados/15_1_Importancia_SHAP_Stacking.png"

# 📥 Cargar base
df = pd.read_parquet(ruta_base).drop_duplicates()
df = pd.get_dummies(df, columns=['DEP', 'CIIU_Letra'], drop_first=False)
y = df["RQ"]
X = df.drop(columns=["RQ", "NIT", "Año"])

# 🧼 Reemplazo de infs y NaN
X = X.replace([np.inf, -np.inf], np.nan)
X = pd.DataFrame(SimpleImputer(strategy='mean').fit_transform(X), columns=X.columns)
X = pd.DataFrame(StandardScaler().fit_transform(X), columns=X.columns)

# 🔄 Validación cruzada con SMOTE + Stacking
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=SEED)
resultados, y_true_total, y_prob_total = [], [], []

for train_idx, test_idx in tqdm(skf.split(X, y), total=10, desc="Stacking CV"):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    sm = SMOTE(random_state=SEED)
    X_res, y_res = sm.fit_resample(X_train, y_train)

    estimadores = [
        ('logit', LogisticRegression(max_iter=1000, class_weight='balanced')),
        ('lgbm', LGBMClassifier(n_estimators=100, random_state=SEED)),
        ('xgb', XGBClassifier(n_estimators=100, eval_metric='logloss', random_state=SEED))
    ]

    meta_model = LogisticRegression(max_iter=1000)
    model = StackingClassifier(estimators=estimadores, final_estimator=meta_model, passthrough=True)
    model.fit(X_res, y_res)

    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]

    resultados.append([
        accuracy_score(y_test, y_pred),
        precision_score(y_test, y_pred, zero_division=0),
        recall_score(y_test, y_pred),
        f1_score(y_test, y_pred),
        roc_auc_score(y_test, y_prob),
        log_loss(y_test, y_prob)
    ])
    y_true_total.extend(y_test)
    y_prob_total.extend(y_prob)

# 📈 Promedio de métricas
res = np.array(resultados)
acc_m, prec_m, rec_m, f1_m, auc_m, log_m = res.mean(axis=0)
acc_s, prec_s, rec_s, f1_s, auc_s, log_s = res.std(axis=0)

# 🧠 Entrenar stacking
model_final = StackingClassifier(estimators=estimadores, final_estimator=meta_model, passthrough=True)
model_final.fit(X, y)

# 🔍 Explicar el meta-modelo (Logit) con SHAP usando las predicciones base
X_meta = model_final.transform(X)
meta_explainer = shap.Explainer(model_final.final_estimator_, X_meta)
shap_vals = meta_explainer(X_meta)

# 📊 Gráfico SHAP
shap.summary_plot(shap_vals, X_meta, show=False)
plt.tight_layout()
plt.savefig(ruta_shap)
plt.close()

# 📄 Guardar resultados
if os.path.exists(ruta_csv):
    resumen = pd.read_csv(ruta_csv)
    if 'Observación' in resumen.columns:
        resumen = resumen.drop(columns='Observación')
else:
    resumen = pd.DataFrame(columns=[
        'Base', 'Modelo', 'Naturaleza', 'Temporalidad', 'Tipo de aprendizaje',
        'Accuracy', 'Desv. Accuracy', 'Precision', 'Desv. Precision',
        'Recall', 'Desv. Recall', 'F1-score promedio', 'Desviación F1',
        'AUC', 'Desv. AUC', 'LogLoss', 'Desv. LogLoss',
        'Top 1 variable', 'Top 2 variable', 'Top 3 variable'
    ])

idx = resumen[
    (resumen['Base'] == 'Turismo') & (resumen['Modelo'] == 'Stacking (Logit + LGBM + XGB + SMOTE)')
].index
idx = idx[0] if len(idx) > 0 else len(resumen)

X_meta = pd.DataFrame(model_final.transform(X), columns=[
    f"logit_pred", f"lgbm_pred", f"xgb_pred"
] + list(X.columns))  # si passthrough=True

meta_explainer = shap.Explainer(model_final.final_estimator_, X_meta)
shap_vals = meta_explainer(X_meta)

shap_importance = np.abs(shap_vals.values).mean(axis=0)
top_vars = pd.Series(shap_importance, index=X_meta.columns).sort_values(ascending=False).head(3).index.tolist()

top1, top2, top3 = (top_vars + [None]*3)[:3]

resumen.loc[idx] = [
    'Turismo', 'Stacking (Logit + LGBM + XGB + SMOTE)', 'Híbrido', 'Estática', 'Supervisado',
    round(acc_m, 4), round(acc_s, 4),
    round(prec_m, 4), round(prec_s, 4),
    round(rec_m, 4), round(rec_s, 4),
    round(f1_m, 4), f"±{f1_s:.4f}",
    round(auc_m, 4), round(auc_s, 4),
    round(log_m, 4), round(log_s, 4),
    top1, top2, top3
]

resumen.to_csv(ruta_csv, index=False)
print(f"\n✅ Resultados guardados en: {ruta_csv}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Stacking CV:   0%|          | 0/10 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 40943, number of negative: 40943
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.036980 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12687
[LightGBM] [Info] Number of data points in the train set: 81886, number of used features: 92
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 32754, number of negative: 32754
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.046107 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12657
[LightGBM] [Info] Number of data points in the train set: 65508, number of used features: 92
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0

Stacking CV:  10%|█         | 1/10 [01:16<11:27, 76.35s/it]

[LightGBM] [Info] Number of positive: 40943, number of negative: 40943
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.059064 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12708
[LightGBM] [Info] Number of data points in the train set: 81886, number of used features: 92
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 32754, number of negative: 32754
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.039227 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12670
[LightGBM] [Info] Number of data points in the train set: 65508, number of used features: 91
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0

Stacking CV:  20%|██        | 2/10 [02:31<10:03, 75.40s/it]

[LightGBM] [Info] Number of positive: 40943, number of negative: 40943
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.036342 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12646
[LightGBM] [Info] Number of data points in the train set: 81886, number of used features: 92
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 32754, number of negative: 32754
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.047109 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12616
[LightGBM] [Info] Number of data points in the train set: 65508, number of used features: 92
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0

Stacking CV:  30%|███       | 3/10 [03:45<08:44, 74.91s/it]

[LightGBM] [Info] Number of positive: 40944, number of negative: 40944
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.036373 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12713
[LightGBM] [Info] Number of data points in the train set: 81888, number of used features: 92
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 32755, number of negative: 32755
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.036856 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12681
[LightGBM] [Info] Number of data points in the train set: 65510, number of used features: 92
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0

Stacking CV:  40%|████      | 4/10 [05:00<07:30, 75.15s/it]

[LightGBM] [Info] Number of positive: 40944, number of negative: 40944
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.035984 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12683
[LightGBM] [Info] Number of data points in the train set: 81888, number of used features: 92
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 32755, number of negative: 32755
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.028874 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12647
[LightGBM] [Info] Number of data points in the train set: 65510, number of used features: 91
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0

Stacking CV:  50%|█████     | 5/10 [06:17<06:18, 75.75s/it]

[LightGBM] [Info] Number of positive: 40944, number of negative: 40944
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.035761 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12630
[LightGBM] [Info] Number of data points in the train set: 81888, number of used features: 92
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 32755, number of negative: 32755
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.028298 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12603
[LightGBM] [Info] Number of data points in the train set: 65510, number of used features: 91
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0

Stacking CV:  60%|██████    | 6/10 [07:30<04:59, 74.87s/it]

[LightGBM] [Info] Number of positive: 40944, number of negative: 40944
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.059014 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12688
[LightGBM] [Info] Number of data points in the train set: 81888, number of used features: 92
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 32755, number of negative: 32755
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.028679 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12656
[LightGBM] [Info] Number of data points in the train set: 65510, number of used features: 91
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0

Stacking CV:  70%|███████   | 7/10 [08:46<03:45, 75.11s/it]

[LightGBM] [Info] Number of positive: 40944, number of negative: 40944
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.036117 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12672
[LightGBM] [Info] Number of data points in the train set: 81888, number of used features: 92
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 32755, number of negative: 32755
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.045526 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12644
[LightGBM] [Info] Number of data points in the train set: 65510, number of used features: 92
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0

Stacking CV:  80%|████████  | 8/10 [10:01<02:30, 75.01s/it]

[LightGBM] [Info] Number of positive: 40944, number of negative: 40944
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.049296 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12676
[LightGBM] [Info] Number of data points in the train set: 81888, number of used features: 91
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 32755, number of negative: 32755
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.028376 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12642
[LightGBM] [Info] Number of data points in the train set: 65510, number of used features: 91
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0

Stacking CV:  90%|█████████ | 9/10 [11:17<01:15, 75.25s/it]

[LightGBM] [Info] Number of positive: 40944, number of negative: 40944
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.037978 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12677
[LightGBM] [Info] Number of data points in the train set: 81888, number of used features: 92
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 32755, number of negative: 32755
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.046603 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12644
[LightGBM] [Info] Number of data points in the train set: 65510, number of used features: 90
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0

Stacking CV: 100%|██████████| 10/10 [12:31<00:00, 75.19s/it]


[LightGBM] [Info] Number of positive: 7012, number of negative: 45493
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.021305 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11607
[LightGBM] [Info] Number of data points in the train set: 52505, number of used features: 89
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.133549 -> initscore=-1.869936
[LightGBM] [Info] Start training from score -1.869936
[LightGBM] [Info] Number of positive: 5610, number of negative: 36394
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.017055 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11607
[LightGBM] [Info] Number of data points in the train set: 42004, number of used features: 89
[LightGBM] [Info] 