In [1]:
# 🚗 Montar Drive
from google.colab import drive
drive.mount('/content/drive')

# 📚 Librerías necesarias
import numpy as np
import pandas as pd
import os
from tqdm.notebook import tqdm
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, log_loss
)
from xgboost import XGBClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import set_random_seed

set_random_seed(42)

# 📥 Cargar base reducida
ruta_base = "/content/drive/MyDrive/Datos/7_Base_Modelos_Predictivos_Reducida.parquet"
df = pd.read_parquet(ruta_base)
df = df.sort_values(["NIT", "Año"]).copy()

# 🎯 Crear RQ_final y año final
df_rq_final = df.groupby("NIT")["RQ"].max().reset_index().rename(columns={"RQ": "RQ_final"})
df_last_year = df.groupby("NIT")["Año"].max().reset_index().rename(columns={"Año": "Año_final"})
df = df.merge(df_rq_final, on="NIT").merge(df_last_year, on="NIT")

# 🧱 Stack de los últimos 5 años
col_excluir = ['NIT', 'Año', 'DEP', 'CIIU_Letra', 'RQ', 'RQ_final', 'Año_final']
variables_financieras = [col for col in df.columns if col not in col_excluir]
ventana = 5
df_ventanas = []

for nit, grupo in tqdm(df.groupby("NIT"), desc="⏳ Construyendo stacks"):
    grupo = grupo.sort_values("Año", ascending=False)
    if grupo.shape[0] < ventana:
        continue
    año_final = grupo["Año"].max()
    grupo = grupo[grupo["Año"].between(año_final - ventana + 1, año_final)]
    if grupo.shape[0] < ventana:
        continue
    fila = {}
    for i, (_, fila_anio) in enumerate(grupo.sort_values("Año", ascending=False).iterrows()):
        for var in variables_financieras:
            fila[f"{var}_-{i}"] = fila_anio[var]
    fila["RQ_final"] = grupo["RQ_final"].iloc[0]
    df_ventanas.append(fila)

df_stack = pd.DataFrame(df_ventanas)
df_stack.replace([np.inf, -np.inf], np.nan, inplace=True)
df_stack.dropna(thresh=df_stack.shape[0] * 0.5, axis=1, inplace=True)
df_stack.fillna(df_stack.median(numeric_only=True), inplace=True)

# 🎯 Features y target
X = df_stack.drop(columns=["RQ_final"])
y = df_stack["RQ_final"]

# 🔢 Escalado
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# ⚙️ Validación cruzada
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
accuracy_list, precision_list, recall_list, f1_list, auc_list, logloss_list = [], [], [], [], [], []

print("🚀 Ejecutando validación cruzada...")

for train_idx, test_idx in tqdm(kf.split(X_scaled, y), total=10):
    X_train, X_test = X_scaled[train_idx], X_scaled[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # 🌲 XGBoost
    xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
    xgb.fit(X_train, y_train)
    xgb_train_probs = xgb.predict_proba(X_train)[:, 1].reshape(-1, 1)
    xgb_test_probs = xgb.predict_proba(X_test)[:, 1].reshape(-1, 1)

    # 🧠 ANN sobre las probabilidades
    ann = Sequential([
        Input(shape=(1,)),
        Dense(16, activation='relu'),
        Dropout(0.2),
        Dense(8, activation='relu'),
        Dense(1, activation='sigmoid')
    ])
    ann.compile(optimizer=Adam(0.001), loss='binary_crossentropy', metrics=['accuracy'])
    ann.fit(xgb_train_probs, y_train, epochs=20, batch_size=32, verbose=0)

    y_prob = ann.predict(xgb_test_probs).ravel()
    y_pred = (y_prob >= 0.5).astype(int)

    accuracy_list.append(accuracy_score(y_test, y_pred))
    precision_list.append(precision_score(y_test, y_pred, zero_division=0))
    recall_list.append(recall_score(y_test, y_pred, zero_division=0))
    f1_list.append(f1_score(y_test, y_pred, zero_division=0))
    auc_list.append(roc_auc_score(y_test, y_prob))
    logloss_list.append(log_loss(y_test, y_prob))

# 📈 Resultados promedio
print("\n📋 Resultados promedio (10 folds):")
print(f"Accuracy:   {np.mean(accuracy_list):.4f} ± {np.std(accuracy_list):.4f}")
print(f"Precision:  {np.mean(precision_list):.4f} ± {np.std(precision_list):.4f}")
print(f"Recall:     {np.mean(recall_list):.4f} ± {np.std(recall_list):.4f}")
print(f"F1-score:   {np.mean(f1_list):.4f} ± {np.std(f1_list):.4f}")
print(f"AUC:        {np.mean(auc_list):.4f} ± {np.std(auc_list):.4f}")
print(f"LogLoss:    {np.mean(logloss_list):.4f} ± {np.std(logloss_list):.4f}")

# 💾 Guardar resultados
ruta_csv = "/content/drive/MyDrive/Resultados/resultados_modelos_stack5_RQfinal.csv"
if os.path.exists(ruta_csv):
    resumen = pd.read_csv(ruta_csv)
else:
    resumen = pd.DataFrame(columns=[
        'Base', 'Modelo', 'Naturaleza', 'Temporalidad', 'Tipo de aprendizaje',
        'Accuracy', 'Desv. Accuracy', 'Precision', 'Desv. Precision',
        'Recall', 'Desv. Recall', 'F1-score promedio', 'Desviación F1',
        'AUC', 'Desv. AUC', 'LogLoss', 'Desv. LogLoss',
        'Top 1 variable', 'Top 2 variable', 'Top 3 variable'
    ])

resumen.loc[len(resumen)] = [
    'Turismo', 'XGBoost + ANN', 'Híbrido', 'Estática', 'Supervisado',
    round(np.mean(accuracy_list), 4), round(np.std(accuracy_list), 4),
    round(np.mean(precision_list), 4), round(np.std(precision_list), 4),
    round(np.mean(recall_list), 4), round(np.std(recall_list), 4),
    round(np.mean(f1_list), 4), f"±{np.std(f1_list):.4f}",
    round(np.mean(auc_list), 4), round(np.std(auc_list), 4),
    round(np.mean(logloss_list), 4), round(np.std(logloss_list), 4),
    None, None, None
]

resumen.to_csv(ruta_csv, index=False)
print(f"\n✅ Resultados guardados en: {ruta_csv}")


Mounted at /content/drive


⏳ Construyendo stacks:   0%|          | 0/5770 [00:00<?, ?it/s]

🚀 Ejecutando validación cruzada...


  0%|          | 0/10 [00:00<?, ?it/s]

Parameters: { "use_label_encoder" } are not used.



[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 


Parameters: { "use_label_encoder" } are not used.



[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step 


Parameters: { "use_label_encoder" } are not used.



[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step 


Parameters: { "use_label_encoder" } are not used.



[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step


Parameters: { "use_label_encoder" } are not used.



[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step 


Parameters: { "use_label_encoder" } are not used.



[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step 


Parameters: { "use_label_encoder" } are not used.



[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step 


Parameters: { "use_label_encoder" } are not used.



[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step


Parameters: { "use_label_encoder" } are not used.



[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step 


Parameters: { "use_label_encoder" } are not used.



[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step 

📋 Resultados promedio (10 folds):
Accuracy:   0.8029 ± 0.0181
Precision:  0.8237 ± 0.0144
Recall:     0.9345 ± 0.0185
F1-score:   0.8755 ± 0.0114
AUC:        0.8959 ± 0.0202
LogLoss:    1.0073 ± 0.1686

✅ Resultados guardados en: /content/drive/MyDrive/Resultados/resultados_modelos_stack5_RQfinal.csv
