In [1]:
# 🚗 Montar Drive
from google.colab import drive
drive.mount('/content/drive')

# 📚 Librerías
import numpy as np
import pandas as pd
import os
from tqdm.notebook import tqdm
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, log_loss
from lightgbm import LGBMClassifier

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import set_random_seed

set_random_seed(42)

# 📥 Cargar base
ruta_base = "/content/drive/MyDrive/Datos/7_Base_Modelos_Predictivos_Reducida.parquet"
df = pd.read_parquet(ruta_base)
df = df.sort_values(["NIT", "Año"]).copy()

# 🧱 Crear RQ_final + año final
df_rq_final = df.groupby("NIT")["RQ"].max().reset_index().rename(columns={"RQ": "RQ_final"})
df_last_year = df.groupby("NIT")["Año"].max().reset_index().rename(columns={"Año": "Año_final"})
df = df.merge(df_rq_final, on="NIT").merge(df_last_year, on="NIT")

# 🧱 Stack por empresa
col_excluir = ['NIT', 'Año', 'DEP', 'CIIU_Letra', 'RQ', 'RQ_final', 'Año_final']
variables = [col for col in df.columns if col not in col_excluir]
ventana = 5
df_stack = []

for nit, grupo in tqdm(df.groupby("NIT"), desc="⏳ Stack 5 años"):
    grupo = grupo.sort_values("Año", ascending=False)
    if grupo.shape[0] < ventana:
        continue
    año_final = grupo["Año"].max()
    grupo = grupo[grupo["Año"].between(año_final - ventana + 1, año_final)]
    if grupo.shape[0] < ventana:
        continue
    fila = {}
    for i, (_, fila_anio) in enumerate(grupo.sort_values("Año", ascending=False).iterrows()):
        for var in variables:
            fila[f"{var}_-{i}"] = fila_anio[var]
    fila["RQ_final"] = grupo["RQ_final"].iloc[0]
    df_stack.append(fila)

df_stack = pd.DataFrame(df_stack)
df_stack.replace([np.inf, -np.inf], np.nan, inplace=True)
df_stack.dropna(thresh=df_stack.shape[0]*0.5, axis=1, inplace=True)
df_stack.fillna(df_stack.median(numeric_only=True), inplace=True)

# 🎯 Features
X_real = df_stack.drop(columns=["RQ_final"])
y_real = df_stack["RQ_final"]

# 🔢 Escalado
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_real)

# 🏷️ Separar por clase
X_min = X_scaled[y_real == 1]
X_maj = X_scaled[y_real == 0]

# 🧠 GAN para clase minoritaria
latent_dim = 10
generator = Sequential([
    Input(shape=(latent_dim,)),
    Dense(64, activation='relu'),
    Dense(X_min.shape[1], activation='linear')
])
discriminator = Sequential([
    Input(shape=(X_min.shape[1],)),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])
discriminator.compile(optimizer=Adam(0.0002), loss='binary_crossentropy')
gan = Sequential([generator, discriminator])
discriminator.trainable = False
gan.compile(optimizer=Adam(0.0002), loss='binary_crossentropy')

# 🏋️ Entrenar GAN
batch_size = 32
for epoch in tqdm(range(300), desc="🔁 Entrenando GAN"):
    idx = np.random.randint(0, X_min.shape[0], batch_size)
    real_samples = X_min[idx]
    noise = np.random.normal(0, 1, (batch_size, latent_dim))
    fake_samples = generator.predict(noise, verbose=0)
    X_combined = np.concatenate([real_samples, fake_samples])
    y_combined = np.concatenate([np.ones(batch_size), np.zeros(batch_size)])
    discriminator.trainable = True
    discriminator.train_on_batch(X_combined, y_combined)
    noise = np.random.normal(0, 1, (batch_size, latent_dim))
    y_mislabeled = np.ones(batch_size)
    discriminator.trainable = False
    gan.train_on_batch(noise, y_mislabeled)

# 🎲 Generar muestras sintéticas
n_sinteticos = 2000
noise = np.random.normal(0, 1, (n_sinteticos, latent_dim))
X_sint = generator.predict(noise, verbose=0)
y_sint = np.ones(n_sinteticos)

# ⚙️ Entrenamiento con validación cruzada
X_total = np.vstack([X_scaled, X_sint])
y_total = np.concatenate([y_real.values, y_sint])

kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
acc, prec, rec, f1, auc, logl = [], [], [], [], [], []

print("🚀 Validación cruzada GAN+LGBM...")
for train_idx, test_idx in tqdm(kf.split(X_scaled, y_real), total=10):
    X_train_real, X_test = X_scaled[train_idx], X_scaled[test_idx]
    y_train_real, y_test = y_real.iloc[train_idx], y_real.iloc[test_idx]

    # Agregar muestras sintéticas al entrenamiento
    X_train = np.vstack([X_train_real, X_sint])
    y_train = np.concatenate([y_train_real, y_sint])

    model = LGBMClassifier(random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]

    acc.append(accuracy_score(y_test, y_pred))
    prec.append(precision_score(y_test, y_pred, zero_division=0))
    rec.append(recall_score(y_test, y_pred, zero_division=0))
    f1.append(f1_score(y_test, y_pred, zero_division=0))
    auc.append(roc_auc_score(y_test, y_prob))
    logl.append(log_loss(y_test, y_prob))

# 📈 Resultados
print("\n📋 Resultados promedio (10 folds):")
print(f"Accuracy:   {np.mean(acc):.4f} ± {np.std(acc):.4f}")
print(f"Precision:  {np.mean(prec):.4f} ± {np.std(prec):.4f}")
print(f"Recall:     {np.mean(rec):.4f} ± {np.std(rec):.4f}")
print(f"F1-score:   {np.mean(f1):.4f} ± {np.std(f1):.4f}")
print(f"AUC:        {np.mean(auc):.4f} ± {np.std(auc):.4f}")
print(f"LogLoss:    {np.mean(logl):.4f} ± {np.std(logl):.4f}")

# 💾 Guardar resultados
ruta_csv = "/content/drive/MyDrive/Resultados/resultados_modelos_stack5_RQfinal.csv"
if os.path.exists(ruta_csv):
    resumen = pd.read_csv(ruta_csv)
else:
    resumen = pd.DataFrame(columns=[
        'Base', 'Modelo', 'Naturaleza', 'Temporalidad', 'Tipo de aprendizaje',
        'Accuracy', 'Desv. Accuracy', 'Precision', 'Desv. Precision',
        'Recall', 'Desv. Recall', 'F1-score promedio', 'Desviación F1',
        'AUC', 'Desv. AUC', 'LogLoss', 'Desv. LogLoss',
        'Top 1 variable', 'Top 2 variable', 'Top 3 variable'
    ])

resumen.loc[len(resumen)] = [
    'Turismo', 'GAN + LightGBM', 'Híbrido', 'Estática', 'Supervisado',
    round(np.mean(acc), 4), round(np.std(acc), 4),
    round(np.mean(prec), 4), round(np.std(prec), 4),
    round(np.mean(rec), 4), round(np.std(rec), 4),
    round(np.mean(f1), 4), f"±{np.std(f1):.4f}",
    round(np.mean(auc), 4), round(np.std(auc), 4),
    round(np.mean(logl), 4), round(np.std(logl), 4),
    None, None, None
]

resumen.to_csv(ruta_csv, index=False)
print(f"\n✅ Resultados guardados en: {ruta_csv}")


Mounted at /content/drive


⏳ Stack 5 años:   0%|          | 0/5770 [00:00<?, ?it/s]

🔁 Entrenando GAN:   0%|          | 0/300 [00:00<?, ?it/s]

🚀 Validación cruzada GAN+LGBM...


  0%|          | 0/10 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 4129, number of negative: 742




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009244 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 21675
[LightGBM] [Info] Number of data points in the train set: 4871, number of used features: 85
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.847670 -> initscore=1.716441
[LightGBM] [Info] Start training from score 1.716441




[LightGBM] [Info] Number of positive: 4129, number of negative: 743
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010310 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 21675
[LightGBM] [Info] Number of data points in the train set: 4872, number of used features: 85
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.847496 -> initscore=1.715094
[LightGBM] [Info] Start training from score 1.715094




[LightGBM] [Info] Number of positive: 4129, number of negative: 743
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005409 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 21675
[LightGBM] [Info] Number of data points in the train set: 4872, number of used features: 85
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.847496 -> initscore=1.715094
[LightGBM] [Info] Start training from score 1.715094




[LightGBM] [Info] Number of positive: 4129, number of negative: 743
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005573 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 21675
[LightGBM] [Info] Number of data points in the train set: 4872, number of used features: 85
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.847496 -> initscore=1.715094
[LightGBM] [Info] Start training from score 1.715094




[LightGBM] [Info] Number of positive: 4129, number of negative: 743
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005522 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 21675
[LightGBM] [Info] Number of data points in the train set: 4872, number of used features: 85
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.847496 -> initscore=1.715094
[LightGBM] [Info] Start training from score 1.715094




[LightGBM] [Info] Number of positive: 4129, number of negative: 743
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006237 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 21675
[LightGBM] [Info] Number of data points in the train set: 4872, number of used features: 85
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.847496 -> initscore=1.715094
[LightGBM] [Info] Start training from score 1.715094




[LightGBM] [Info] Number of positive: 4130, number of negative: 742
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006566 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 21675
[LightGBM] [Info] Number of data points in the train set: 4872, number of used features: 85
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.847701 -> initscore=1.716683
[LightGBM] [Info] Start training from score 1.716683




[LightGBM] [Info] Number of positive: 4130, number of negative: 742
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005556 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 21675
[LightGBM] [Info] Number of data points in the train set: 4872, number of used features: 85
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.847701 -> initscore=1.716683
[LightGBM] [Info] Start training from score 1.716683




[LightGBM] [Info] Number of positive: 4130, number of negative: 742
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005570 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 21675
[LightGBM] [Info] Number of data points in the train set: 4872, number of used features: 85
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.847701 -> initscore=1.716683
[LightGBM] [Info] Start training from score 1.716683




[LightGBM] [Info] Number of positive: 4130, number of negative: 742
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008678 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 21675
[LightGBM] [Info] Number of data points in the train set: 4872, number of used features: 85
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.847701 -> initscore=1.716683
[LightGBM] [Info] Start training from score 1.716683





📋 Resultados promedio (10 folds):
Accuracy:   0.8317 ± 0.0176
Precision:  0.8841 ± 0.0162
Recall:     0.8901 ± 0.0199
F1-score:   0.8869 ± 0.0121
AUC:        0.9003 ± 0.0161
LogLoss:    0.3725 ± 0.0451

✅ Resultados guardados en: /content/drive/MyDrive/Resultados/resultados_modelos_stack5_RQfinal.csv
