In [1]:
# 🚗 Montar Drive
from google.colab import drive
drive.mount('/content/drive')

# 📚 Librerías
import os
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.utils import class_weight
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, log_loss
)

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, LSTM, Dropout, Dense, Input
from tensorflow.keras.optimizers import Adam

# 📥 Cargar base
ruta_base = "/content/drive/MyDrive/Datos/7_Base_Modelos_Predictivos_Reducida.parquet"
df = pd.read_parquet(ruta_base)
df = df.sort_values(["NIT", "Año"]).copy()

# 🎯 Crear RQ_final y año final
df_rq_final = df.groupby("NIT")["RQ"].max().reset_index().rename(columns={"RQ": "RQ_final"})
df_last_year = df.groupby("NIT")["Año"].max().reset_index().rename(columns={"Año": "Año_final"})
df = df.merge(df_rq_final, on="NIT").merge(df_last_year, on="NIT")

# 🧱 Variables y stack
col_excluir = ['NIT', 'Año', 'DEP', 'CIIU_Letra', 'RQ', 'RQ_final', 'Año_final']
variables_financieras = [col for col in df.columns if col not in col_excluir]
ventana = 5
X_seq, y_seq = [], []

for nit, grupo in tqdm(df.groupby("NIT"), desc="⏳ Construyendo ventanas"):
    grupo = grupo.sort_values("Año")
    if grupo.shape[0] < ventana:
        continue
    rq_final = grupo["RQ_final"].iloc[-1]
    datos = grupo[variables_financieras].values
    for i in range(len(grupo) - ventana + 1):
        ventana_datos = datos[i:i+ventana]
        if ventana_datos.shape[0] == ventana:
            X_seq.append(ventana_datos)
            y_seq.append(rq_final)

X = np.array(X_seq)
y = np.array(y_seq)
print(f"✔️ Dataset final: X={X.shape}, y={y.shape}")

# 🧼 Limpieza
X_flat = X.reshape(X.shape[0], -1)
for j in range(X_flat.shape[1]):
    col = X_flat[:, j]
    finite_vals = col[np.isfinite(col)]
    if len(finite_vals) > 0:
        p99 = np.percentile(finite_vals, 99)
        p01 = np.percentile(finite_vals, 1)
        col[np.isposinf(col)] = p99
        col[np.isneginf(col)] = p01
        X_flat[:, j] = col

X_flat = SimpleImputer(strategy='mean').fit_transform(X_flat)
X_scaled = StandardScaler().fit_transform(X_flat)
X = X_scaled.reshape(X.shape[0], ventana, len(variables_financieras))

# 📊 Partición
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

# ⚖️ Class weights
cw = class_weight.compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
cw_dict = {0: cw[0], 1: cw[1]}

# 🧠 CNN + LSTM
model = Sequential([
    Input(shape=(X.shape[1], X.shape[2])),
    Conv1D(filters=64, kernel_size=2, activation='relu'),
    LSTM(64, activation='tanh', dropout=0.2, recurrent_dropout=0.2),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=20, batch_size=32, validation_split=0.2, verbose=1, class_weight=cw_dict)

# 🔮 Predicción
y_prob = model.predict(X_test).ravel()
y_pred = (y_prob >= 0.5).astype(int)

# 📊 Métricas
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, zero_division=0)
rec = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)
auc = roc_auc_score(y_test, y_prob)
logl = log_loss(y_test, y_prob)

print("\n📋 Resultados CNN-LSTM:")
print(f"Accuracy:  {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall:    {rec:.4f}")
print(f"F1-score:  {f1:.4f}")
print(f"AUC:       {auc:.4f}")
print(f"LogLoss:   {logl:.4f}")

# 💾 Guardar resultados en CSV stack
ruta_csv_stack = "/content/drive/MyDrive/Resultados/resultados_modelos_stack5_RQfinal.csv"
if os.path.exists(ruta_csv_stack):
    resumen_stack = pd.read_csv(ruta_csv_stack)
else:
    resumen_stack = pd.DataFrame(columns=[
        'Base', 'Modelo', 'Naturaleza', 'Temporalidad', 'Tipo de aprendizaje',
        'Accuracy', 'Desv. Accuracy', 'Precision', 'Desv. Precision',
        'Recall', 'Desv. Recall', 'F1-score promedio', 'Desviación F1',
        'AUC', 'Desv. AUC', 'LogLoss', 'Desv. LogLoss',
        'Top 1 variable', 'Top 2 variable', 'Top 3 variable'
    ])

nueva_fila = {
    'Base': 'Turismo',
    'Modelo': 'CNN-LSTM (Stack 5 años)',
    'Naturaleza': 'Avanzado',
    'Temporalidad': 'Secuencial',
    'Tipo de aprendizaje': 'Supervisado',
    'Accuracy': round(acc, 4),
    'Desv. Accuracy': None,
    'Precision': round(prec, 4),
    'Desv. Precision': None,
    'Recall': round(rec, 4),
    'Desv. Recall': None,
    'F1-score promedio': round(f1, 4),
    'Desviación F1': "±N/A",
    'AUC': round(auc, 4),
    'Desv. AUC': None,
    'LogLoss': round(logl, 4),
    'Desv. LogLoss': None,
    'Top 1 variable': None,
    'Top 2 variable': None,
    'Top 3 variable': None,
}

resumen_stack = pd.concat([resumen_stack, pd.DataFrame([nueva_fila])], ignore_index=True)
resumen_stack.to_csv(ruta_csv_stack, index=False)
print(f"\n✅ Resultados guardados en: {ruta_csv_stack}")


Mounted at /content/drive


⏳ Construyendo ventanas:   0%|          | 0/5770 [00:00<?, ?it/s]

✔️ Dataset final: X=(32192, 5, 17), y=(32192,)
Epoch 1/20
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 10ms/step - accuracy: 0.6204 - loss: 0.6544 - val_accuracy: 0.6888 - val_loss: 0.6273
Epoch 2/20
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 10ms/step - accuracy: 0.7002 - loss: 0.6064 - val_accuracy: 0.6768 - val_loss: 0.6388
Epoch 3/20
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 12ms/step - accuracy: 0.7031 - loss: 0.6026 - val_accuracy: 0.7102 - val_loss: 0.5934
Epoch 4/20
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 9ms/step - accuracy: 0.7093 - loss: 0.5887 - val_accuracy: 0.6853 - val_loss: 0.6082
Epoch 5/20
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 10ms/step - accuracy: 0.7003 - loss: 0.5891 - val_accuracy: 0.7148 - val_loss: 0.5590
Epoch 6/20
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 9ms/step - accuracy: 0.6990 - loss: 0.5882 - val_accuracy:

  resumen_stack = pd.concat([resumen_stack, pd.DataFrame([nueva_fila])], ignore_index=True)
