In [None]:
# 🚗 Mount Google Drive at /content/drive so the Drive paths used later in
# this cell (input parquet, results CSV) can be read and written.
from google.colab import drive
drive.mount('/content/drive')

# 📚 Librerías necesarias
import os
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, log_loss
)
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier

# 📍 Input path: the panel is read from Drive and ordered by company and year.
ruta_base = "/content/drive/MyDrive/Datos/7_Base_Modelos_Predictivos_Reducida.parquet"
df = pd.read_parquet(ruta_base).sort_values(by=["NIT", "Año"])

# 🎯 Per-company summaries computed in a single pass: the maximum RQ observed
# for the company becomes its label (RQ_final) and the maximum year becomes
# Año_final. Both are then attached to every row of that company.
maximos_por_nit = df.groupby("NIT")[["RQ", "Año"]].max()
df_rq_final = maximos_por_nit["RQ"].rename("RQ_final").reset_index()
df_last_year = maximos_por_nit["Año"].rename("Año_final").reset_index()
df = df.merge(df_rq_final, on="NIT").merge(df_last_year, on="NIT")

# 📦 Financial variables = every column that is not an identifier, a label
# or one of the metadata columns listed below.
col_excluir = ['NIT', 'Año', 'DEP', 'CIIU_Letra', 'RQ', 'RQ_final', 'Año_final']
es_financiera = ~df.columns.isin(col_excluir)
variables_financieras = list(df.columns[es_financiera])

# 🧱 Moving-window stack: one output row per company, holding the financial
# variables of the last `ventana` calendar years counted back from the
# company's most recent year. Column "<var>_-k" is the value of <var> exactly
# k years before that most recent year.
ventana = 5
min_anios = 4  # minimum number of distinct years required inside the window
df_ventanas = []

for nit, grupo in tqdm(df.groupby("NIT"), desc="⏳ Construyendo stacks"):
    año_final = grupo["Año"].max()

    # Restrict to the window, keep a single row per year and order newest
    # first. Without the de-duplication a repeated (NIT, Año) row would be
    # written on top of the same lag slot.
    # NOTE(review): keeping the first duplicate is a judgment call — confirm
    # which record should win if the source really contains repeated years.
    grupo = (
        grupo[grupo["Año"].between(año_final - ventana + 1, año_final)]
        .drop_duplicates(subset="Año", keep="first")
        .sort_values("Año", ascending=False)
    )
    if grupo.shape[0] < min_anios:
        continue

    fila = {}
    valores_por_anio = grupo[variables_financieras].to_numpy()
    for año, valores in zip(grupo["Año"], valores_por_anio):
        # The lag is the calendar distance to the last year, not the row
        # position: a gap in the series must leave its slot empty instead of
        # pulling an older year into it.
        lag = int(año_final - año)
        for var, valor in zip(variables_financieras, valores):
            fila[f"{var}_-{lag}"] = valor

    # Metadata is taken from the most recent year (first row after sorting).
    fila["RQ_final"] = grupo["RQ_final"].iloc[0]
    fila["DEP"] = grupo["DEP"].iloc[0]
    fila["CIIU_Letra"] = grupo["CIIU_Letra"].iloc[0]
    fila["Año_final"] = año_final
    df_ventanas.append(fila)

# Fail here with a clear message rather than later with a KeyError when the
# metadata columns are dropped from an empty frame.
if not df_ventanas:
    raise ValueError(
        f"Ninguna empresa tiene al menos {min_anios} años dentro de la ventana de {ventana} años."
    )

df_stack = pd.DataFrame(df_ventanas)
print(f"✅ Base construida: {df_stack.shape}")

# 🧹 Cleaning: infinities are treated as missing, columns that are not
# populated in at least half of the rows are dropped, and the remaining gaps
# in numeric columns are filled with the column median.
# NOTE(review): the medians are computed on the whole table, i.e. before the
# cross-validation split that follows.
df_stack = df_stack.replace({np.inf: np.nan, -np.inf: np.nan})
limite_nan = df_stack.shape[0] * 0.5  # minimum count of non-missing values per column
df_stack = df_stack.dropna(axis=1, thresh=limite_nan)
medianas = df_stack.median(numeric_only=True)
df_stack = df_stack.fillna(medianas)
print(f"✅ Base limpia: {df_stack.shape}")

# 🎯 Predictors and target: the label and the DEP / CIIU_Letra / Año_final
# metadata columns are removed so only the stacked variables feed the model.
X = df_stack.drop(columns=["RQ_final", "DEP", "CIIU_Letra", "Año_final"])
y = df_stack["RQ_final"]

# 🧪 Stratified cross-validation with SMOTE oversampling and XGBoost
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# One entry per fold for every metric.
accuracy_list, precision_list, recall_list = [], [], []
f1_list, auc_list, logloss_list = [], [], []

print("🚀 Ejecutando validación cruzada...")

for train_idx, test_idx in tqdm(kf.split(X, y), total=kf.get_n_splits()):
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # 🔢 Scaling is fitted on the training rows of this fold only and then
    # applied to the held-out rows, so test data never influences the
    # statistics that SMOTE and the model see.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X.iloc[train_idx])
    X_test = scaler.transform(X.iloc[test_idx])

    # Oversample the training part only; the test fold keeps its original
    # class balance.
    smote = SMOTE(random_state=42)
    X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)

    # `use_label_encoder` is intentionally not passed: the installed XGBoost
    # reports it as an unused parameter on every fit.
    model = XGBClassifier(
        n_estimators=100, max_depth=6, learning_rate=0.1,
        subsample=0.8, colsample_bytree=0.8,
        eval_metric="logloss",
        random_state=42
    )

    model.fit(X_train_sm, y_train_sm)
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]  # probability of the positive class

    accuracy_list.append(accuracy_score(y_test, y_pred))
    precision_list.append(precision_score(y_test, y_pred, zero_division=0))
    recall_list.append(recall_score(y_test, y_pred, zero_division=0))
    f1_list.append(f1_score(y_test, y_pred, zero_division=0))
    auc_list.append(roc_auc_score(y_test, y_prob))
    logloss_list.append(log_loss(y_test, y_prob))

# 📊 Final results: mean ± standard deviation of each metric over the folds.
print("\n📋 Resultados promedio (10 folds):")
metricas = [
    ("Accuracy:", accuracy_list),
    ("Precision:", precision_list),
    ("Recall:", recall_list),
    ("F1-score:", f1_list),
    ("AUC:", auc_list),
    ("LogLoss:", logloss_list),
]
for etiqueta, valores in metricas:
    # Labels are left-aligned in a 12-character field so the numbers line up.
    print(f"{etiqueta:<12}{np.mean(valores):.4f} ± {np.std(valores):.4f}")

# 🧠 Three most important variables, ranked by the feature importances of
# `model`, which at this point is the estimator fitted in the last
# cross-validation iteration.
feature_names = X.columns
importances = model.feature_importances_
top_idx = np.flip(np.argsort(importances))[:3]  # indices in descending importance
top_vars = feature_names[top_idx].tolist()

# 🧾 Append this run's summary to the CSV that collects the stacked-window
# model results; the file is created on the first run.
ruta_csv_stack = "/content/drive/MyDrive/Resultados/resultados_modelos_stack5_RQfinal.csv"

# Column layout of the summary file.
columnas_resumen = [
    'Base', 'Modelo', 'Naturaleza', 'Temporalidad', 'Tipo de aprendizaje',
    'Accuracy', 'Desv. Accuracy', 'Precision', 'Desv. Precision',
    'Recall', 'Desv. Recall', 'F1-score promedio', 'Desviación F1',
    'AUC', 'Desv. AUC', 'LogLoss', 'Desv. LogLoss',
    'Top 1 variable', 'Top 2 variable', 'Top 3 variable'
]

nueva_fila = {
    'Base': 'Turismo',
    'Modelo': 'XGBoost (Stack 5 años)',
    'Naturaleza': 'Avanzado',
    'Temporalidad': 'Estática con stack',
    'Tipo de aprendizaje': 'Supervisado',
    'Accuracy': round(np.mean(accuracy_list), 4),
    'Desv. Accuracy': round(np.std(accuracy_list), 4),
    'Precision': round(np.mean(precision_list), 4),
    'Desv. Precision': round(np.std(precision_list), 4),
    'Recall': round(np.mean(recall_list), 4),
    'Desv. Recall': round(np.std(recall_list), 4),
    'F1-score promedio': round(np.mean(f1_list), 4),
    # Stored as text with a leading "±", unlike the other deviation columns;
    # kept as-is so rows stay consistent with the existing file format.
    'Desviación F1': f"±{round(np.std(f1_list), 4)}",
    'AUC': round(np.mean(auc_list), 4),
    'Desv. AUC': round(np.std(auc_list), 4),
    'LogLoss': round(np.mean(logloss_list), 4),
    'Desv. LogLoss': round(np.std(logloss_list), 4),
    'Top 1 variable': top_vars[0] if len(top_vars) > 0 else None,
    'Top 2 variable': top_vars[1] if len(top_vars) > 1 else None,
    'Top 3 variable': top_vars[2] if len(top_vars) > 2 else None,
}
fila_df = pd.DataFrame([nueva_fila], columns=columnas_resumen)

# Only concatenate when there are previous rows: concatenating onto an empty
# frame is what triggers the pandas FutureWarning about empty entries.
previos = pd.read_csv(ruta_csv_stack) if os.path.exists(ruta_csv_stack) else None
if previos is None or previos.empty:
    resumen_stack = fila_df
else:
    resumen_stack = pd.concat([previos, fila_df], ignore_index=True)

# Make sure the destination folder exists before writing.
os.makedirs(os.path.dirname(ruta_csv_stack), exist_ok=True)
resumen_stack.to_csv(ruta_csv_stack, index=False)
print(f"\n✅ Resultados guardados en: {ruta_csv_stack}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


⏳ Construyendo stacks:   0%|          | 0/5770 [00:00<?, ?it/s]

✅ Base construida: (4067, 310)
✅ Base limpia: (4067, 87)
🚀 Ejecutando validación cruzada...


  0%|          | 0/10 [00:00<?, ?it/s]

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.




📋 Resultados promedio (10 folds):
Accuracy:   0.8397 ± 0.0179
Precision:  0.9091 ± 0.0147
Recall:     0.8824 ± 0.0186
F1-score:   0.8954 ± 0.0120
AUC:        0.8996 ± 0.0109
LogLoss:    0.3519 ± 0.0244

✅ Resultados guardados en: /content/drive/MyDrive/Resultados/resultados_modelos_stack5_RQfinal.csv


  resumen_stack = pd.concat([resumen_stack, pd.DataFrame([nueva_fila])], ignore_index=True)
