In [1]:
# 14_3: CNN + LSTM con dummies de DEP y CIIU_Letra
# ================================================

# 🚗 Montar Drive
from google.colab import drive
drive.mount('/content/drive')

# 🔐 Reproducibilidad
import os, random
import numpy as np
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
os.environ["PYTHONHASHSEED"] = str(SEED)

# 📚 Librerías
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, log_loss
)
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.utils import class_weight

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (
    Conv1D, LSTM, Dense, Dropout, Input, MaxPooling1D, Flatten
)
from tensorflow.keras.optimizers import Adam

# 📥 Cargar base
ruta_base = "/content/drive/MyDrive/Datos/7_Base_Modelos_Predictivos_Reducida.parquet"
df = pd.read_parquet(ruta_base)
df = df.sort_values(['NIT', 'Año']).reset_index(drop=True)

# 🧠 One-hot encoding para DEP y CIIU_Letra
df_dummies = pd.get_dummies(df[['DEP', 'CIIU_Letra']], drop_first=False)
df = pd.concat([df, df_dummies], axis=1)

# 🎯 Variables numéricas + dummies
col_excluir = ['NIT', 'Año', 'RQ', 'DEP', 'CIIU_Letra']
var_num = [col for col in df.columns if col not in col_excluir and col != 'RQ']
n_ventana = 5

# 🌀 Rolling window + agregar dummies como metadata repetida
X_seq, y_seq = [], []
for nit in tqdm(df['NIT'].unique(), desc="🔄 Generando ventanas CNN-LSTM"):
    df_emp = df[df['NIT'] == nit].sort_values('Año')
    if len(df_emp) < n_ventana:
        continue
    datos = df_emp[var_num].values
    etiquetas = df_emp['RQ'].values
    dummy_vals = df_emp[df_dummies.columns].iloc[0].values
    dummy_repeated = np.tile(dummy_vals, (n_ventana, 1))
    for i in range(len(df_emp) - n_ventana + 1):
        ventana = datos[i:i+n_ventana]
        ventana_ext = np.concatenate([ventana, dummy_repeated], axis=1)
        etiqueta = etiquetas[i+n_ventana-1]
        X_seq.append(ventana_ext)
        y_seq.append(etiqueta)

X = np.array(X_seq).astype(np.float64)
y = np.array(y_seq)
print(f"✔️ Dataset final: X={X.shape}, y={y.shape}")

# 🔁 Reemplazo de inf/-inf antes de imputar
X_flat = X.reshape(X.shape[0], -1)
for j in range(X_flat.shape[1]):
    col = X_flat[:, j]
    finite_vals = col[np.isfinite(col)]
    if len(finite_vals) > 0:
        p99 = np.percentile(finite_vals, 99)
        p01 = np.percentile(finite_vals, 1)
        col[np.isposinf(col)] = p99
        col[np.isneginf(col)] = p01
        X_flat[:, j] = col

# 🧼 Imputación y escalado
X_flat = SimpleImputer(strategy='mean').fit_transform(X_flat)
X_scaled = StandardScaler().fit_transform(X_flat)
X = X_scaled.reshape(X.shape[0], n_ventana, -1)

# 📊 Partición
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=SEED)

# ⚖️ Ponderación de clases
cw = class_weight.compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
cw_dict = {0: cw[0], 1: cw[1]}

# 🧠 Modelo CNN + LSTM
model = Sequential([
    Input(shape=(X.shape[1], X.shape[2])),
    Conv1D(filters=32, kernel_size=2, activation='relu'),
    MaxPooling1D(pool_size=2),
    LSTM(64, activation='tanh', dropout=0.2, recurrent_dropout=0.2),
    Dropout(0.2),
    Dense(1, activation='sigmoid')
])
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# 🏋️ Entrenamiento
history = model.fit(X_train, y_train, epochs=20, batch_size=32, validation_split=0.2,
                    verbose=1, class_weight=cw_dict)

# 🔮 Predicción
y_prob = []
batch_size = 512
for i in tqdm(range(0, len(X_test), batch_size), desc="🔮 Prediciendo"):
    y_prob_batch = model.predict(X_test[i:i+batch_size]).ravel()
    y_prob.extend(y_prob_batch)
y_prob = np.array(y_prob)
y_pred = (y_prob >= 0.5).astype(int)

# 📈 Métricas
try:
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, zero_division=0)
    rec = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    auc = roc_auc_score(y_test, y_prob) if not np.isnan(y_prob).any() else np.nan
    logl = log_loss(y_test, y_prob) if not np.isnan(y_prob).any() else np.nan
except Exception as e:
    print(f"⚠️ Error en métricas: {e}")
    acc = prec = rec = f1 = auc = logl = np.nan

# 💾 Guardar resultados
ruta_csv = "/content/drive/MyDrive/Resultados/resultados_comparativos_modelos_turismo.csv"
if os.path.exists(ruta_csv):
    resumen = pd.read_csv(ruta_csv)
    if 'Observación' in resumen.columns:
        resumen = resumen.drop(columns='Observación')
else:
    resumen = pd.DataFrame(columns=[
        'Base', 'Modelo', 'Naturaleza', 'Temporalidad', 'Tipo de aprendizaje',
        'Accuracy', 'Desv. Accuracy',
        'Precision', 'Desv. Precision',
        'Recall', 'Desv. Recall',
        'F1-score promedio', 'Desviación F1',
        'AUC', 'Desv. AUC',
        'LogLoss', 'Desv. LogLoss',
        'Top 1 variable', 'Top 2 variable', 'Top 3 variable'
    ])

idx = resumen[
    (resumen['Base'] == 'Turismo') & (resumen['Modelo'] == 'CNN-LSTM + Dummies')
].index
idx = idx[0] if len(idx) > 0 else len(resumen)

resumen.loc[idx] = [
    'Turismo', 'CNN-LSTM + Dummies', 'Avanzado', 'Secuencial', 'Supervisado',
    round(acc, 4) if not np.isnan(acc) else None, None,
    round(prec, 4) if not np.isnan(prec) else None, None,
    round(rec, 4) if not np.isnan(rec) else None, None,
    round(f1, 4) if not np.isnan(f1) else None, "±N/A",
    round(auc, 4) if not np.isnan(auc) else None, None,
    round(logl, 4) if not np.isnan(logl) else None, None,
    None, None, None
]

resumen.to_csv(ruta_csv, index=False)
print(f"\n✅ Resultados guardados: {ruta_csv}")


Mounted at /content/drive


🔄 Generando ventanas CNN-LSTM: 100%|██████████| 5770/5770 [00:08<00:00, 692.43it/s]


✔️ Dataset final: X=(32192, 5, 121), y=(32192,)
Epoch 1/20
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 16ms/step - accuracy: 0.5003 - loss: 0.6910 - val_accuracy: 0.6702 - val_loss: 0.6544
Epoch 2/20
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 16ms/step - accuracy: 0.5940 - loss: 0.6739 - val_accuracy: 0.5836 - val_loss: 0.6774
Epoch 3/20
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 9ms/step - accuracy: 0.6081 - loss: 0.6513 - val_accuracy: 0.5898 - val_loss: 0.6555
Epoch 4/20
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step - accuracy: 0.6029 - loss: 0.6557 - val_accuracy: 0.5347 - val_loss: 0.6906
Epoch 5/20
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 8ms/step - accuracy: 0.5991 - loss: 0.6510 - val_accuracy: 0.6721 - val_loss: 0.6121
Epoch 6/20
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 7ms/step - accuracy: 0.6320 - loss: 0.6386 - val_accuracy: 0

🔮 Prediciendo:   0%|          | 0/13 [00:00<?, ?it/s]

[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step  


🔮 Prediciendo:   8%|▊         | 1/13 [00:01<00:16,  1.36s/it]

[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step


🔮 Prediciendo:  15%|█▌        | 2/13 [00:01<00:07,  1.48it/s]

[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step


🔮 Prediciendo:  23%|██▎       | 3/13 [00:01<00:04,  2.13it/s]

[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 


🔮 Prediciendo:  31%|███       | 4/13 [00:01<00:03,  2.85it/s]

[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step


🔮 Prediciendo:  38%|███▊      | 5/13 [00:02<00:02,  3.29it/s]

[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step 


🔮 Prediciendo:  46%|████▌     | 6/13 [00:02<00:01,  3.80it/s]

[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 


🔮 Prediciendo:  54%|█████▍    | 7/13 [00:02<00:01,  3.98it/s]

[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 


🔮 Prediciendo:  62%|██████▏   | 8/13 [00:02<00:01,  4.10it/s]

[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 


🔮 Prediciendo:  69%|██████▉   | 9/13 [00:03<00:00,  4.20it/s]

[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step


🔮 Prediciendo:  77%|███████▋  | 10/13 [00:03<00:00,  4.24it/s]

[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 


🔮 Prediciendo:  85%|████████▍ | 11/13 [00:03<00:00,  5.04it/s]

[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 


🔮 Prediciendo:  92%|█████████▏| 12/13 [00:03<00:00,  5.76it/s]

[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step


🔮 Prediciendo: 100%|██████████| 13/13 [00:04<00:00,  3.25it/s]



✅ Resultados guardados: /content/drive/MyDrive/Resultados/resultados_comparativos_modelos_turismo.csv


  resumen.loc[idx] = [
