In [4]:
# 14_1: LSTM mejorado con class_weight y dropout
# ==============================================

# 🚗 Montar Drive
from google.colab import drive
drive.mount('/content/drive')

# 🔐 Reproducibilidad
import os, random
import numpy as np
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
os.environ["PYTHONHASHSEED"] = str(SEED)

# 📚 Librerías
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, log_loss
)
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.utils import class_weight

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Input
from tensorflow.keras.optimizers import Adam

# 📥 Cargar base
ruta_base = "/content/drive/MyDrive/Datos/7_Base_Modelos_Predictivos_Reducida.parquet"
df = pd.read_parquet(ruta_base)
df = df.sort_values(['NIT', 'Año']).reset_index(drop=True)

# 🎯 Selección de columnas
col_excluir = ['NIT', 'Año', 'RQ', 'DEP', 'CIIU_Letra'] if 'DEP' in df.columns else ['NIT', 'Año', 'RQ']
var_num = [col for col in df.columns if col not in col_excluir]
n_ventana = 5

# 🌀 Rolling window
X_seq, y_seq = [], []
for nit in tqdm(df['NIT'].unique(), desc="🔄 Generando ventanas"):
    df_emp = df[df['NIT'] == nit].sort_values('Año')
    if len(df_emp) < n_ventana:
        continue
    datos = df_emp[var_num].values
    etiquetas = df_emp['RQ'].values
    for i in range(len(df_emp) - n_ventana + 1):
        ventana = datos[i:i+n_ventana]
        etiqueta = etiquetas[i+n_ventana-1]
        X_seq.append(ventana)
        y_seq.append(etiqueta)

X = np.array(X_seq)
y = np.array(y_seq)
print(f"✔️ Dataset final: X={X.shape}, y={y.shape}")

# 🔁 Reemplazo de inf/-inf antes de imputar
X_flat = X.reshape(X.shape[0], -1)
for j in range(X_flat.shape[1]):
    col = X_flat[:, j]
    finite_vals = col[np.isfinite(col)]
    if len(finite_vals) > 0:
        p99 = np.percentile(finite_vals, 99)
        p01 = np.percentile(finite_vals, 1)
        col[np.isposinf(col)] = p99
        col[np.isneginf(col)] = p01
        X_flat[:, j] = col

# 🧼 Imputación y escalado
X_flat = SimpleImputer(strategy='mean').fit_transform(X_flat)
X_scaled = StandardScaler().fit_transform(X_flat)
X = X_scaled.reshape(X.shape[0], n_ventana, len(var_num))

# 📊 Partición
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=SEED)

# ⚖️ Calcular class_weight
cw = class_weight.compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
cw_dict = {0: cw[0], 1: cw[1]}

# 🧠 Modelo LSTM mejorado
model = Sequential([
    Input(shape=(X.shape[1], X.shape[2])),
    LSTM(64, activation='tanh', dropout=0.2, recurrent_dropout=0.2),
    Dropout(0.2),
    Dense(1, activation='sigmoid')
])
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# 🏋️ Entrenamiento con class_weight
history = model.fit(X_train, y_train, epochs=20, batch_size=32, validation_split=0.2,
                    verbose=1, class_weight=cw_dict)

# 📈 Predicción con tqdm
from tqdm import tqdm
y_prob = []
batch_size = 512
for i in tqdm(range(0, len(X_test), batch_size), desc="🔮 Prediciendo"):
    y_prob_batch = model.predict(X_test[i:i+batch_size]).ravel()
    y_prob.extend(y_prob_batch)
y_prob = np.array(y_prob)
y_pred = (y_prob >= 0.5).astype(int)

# 🧮 Métricas
try:
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, zero_division=0)
    rec = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    auc = roc_auc_score(y_test, y_prob) if not np.isnan(y_prob).any() else np.nan
    logl = log_loss(y_test, y_prob) if not np.isnan(y_prob).any() else np.nan
except Exception as e:
    print(f"⚠️ Error en métricas: {e}")
    acc = prec = rec = f1 = auc = logl = np.nan

# 🧾 Guardar resumen
ruta_csv = "/content/drive/MyDrive/Resultados/resultados_comparativos_modelos_turismo.csv"
if os.path.exists(ruta_csv):
    resumen = pd.read_csv(ruta_csv)
    if 'Observación' in resumen.columns:
        resumen = resumen.drop(columns='Observación')
else:
    resumen = pd.DataFrame(columns=[
        'Base', 'Modelo', 'Naturaleza', 'Temporalidad', 'Tipo de aprendizaje',
        'Accuracy', 'Desv. Accuracy',
        'Precision', 'Desv. Precision',
        'Recall', 'Desv. Recall',
        'F1-score promedio', 'Desviación F1',
        'AUC', 'Desv. AUC',
        'LogLoss', 'Desv. LogLoss',
        'Top 1 variable', 'Top 2 variable', 'Top 3 variable'
    ])

idx = resumen[
    (resumen['Base'] == 'Turismo') & (resumen['Modelo'] == 'LSTM')
].index
idx = idx[0] if len(idx) > 0 else len(resumen)

resumen.loc[idx] = [
    'Turismo', 'LSTM', 'Avanzado', 'Secuencial', 'Supervisado',
    round(acc, 4) if not np.isnan(acc) else None, None,
    round(prec, 4) if not np.isnan(prec) else None, None,
    round(rec, 4) if not np.isnan(rec) else None, None,
    round(f1, 4) if not np.isnan(f1) else None, "±N/A",
    round(auc, 4) if not np.isnan(auc) else None, None,
    round(logl, 4) if not np.isnan(logl) else None, None,
    None, None, None
]

resumen.to_csv(ruta_csv, index=False)
print(f"\n✅ Resultados guardados: {ruta_csv}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


🔄 Generando ventanas: 100%|██████████| 5770/5770 [00:10<00:00, 566.72it/s] 


✔️ Dataset final: X=(32192, 5, 17), y=(32192,)
Epoch 1/20
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 11ms/step - accuracy: 0.5062 - loss: 0.6869 - val_accuracy: 0.5228 - val_loss: 0.6684
Epoch 2/20
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 9ms/step - accuracy: 0.5325 - loss: 0.6767 - val_accuracy: 0.5764 - val_loss: 0.6413
Epoch 3/20
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 8ms/step - accuracy: 0.5793 - loss: 0.6557 - val_accuracy: 0.5090 - val_loss: 0.6789
Epoch 4/20
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 8ms/step - accuracy: 0.5293 - loss: 0.6712 - val_accuracy: 0.5500 - val_loss: 0.6490
Epoch 5/20
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 10ms/step - accuracy: 0.5462 - loss: 0.6631 - val_accuracy: 0.4929 - val_loss: 0.6769
Epoch 6/20
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 11ms/step - accuracy: 0.5559 - loss: 0.6568 - val_accuracy:

🔮 Prediciendo:   0%|          | 0/13 [00:00<?, ?it/s]

[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step  


🔮 Prediciendo:   8%|▊         | 1/13 [00:00<00:05,  2.00it/s]

[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 


🔮 Prediciendo:  15%|█▌        | 2/13 [00:00<00:03,  3.63it/s]

[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 


🔮 Prediciendo:  23%|██▎       | 3/13 [00:00<00:02,  4.77it/s]

[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 


🔮 Prediciendo:  31%|███       | 4/13 [00:00<00:01,  5.84it/s]

[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 


🔮 Prediciendo:  38%|███▊      | 5/13 [00:00<00:01,  6.55it/s]

[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 


🔮 Prediciendo:  46%|████▌     | 6/13 [00:01<00:00,  7.11it/s]

[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 


🔮 Prediciendo:  54%|█████▍    | 7/13 [00:01<00:00,  7.56it/s]

[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 


🔮 Prediciendo:  62%|██████▏   | 8/13 [00:01<00:00,  7.90it/s]

[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 


🔮 Prediciendo:  69%|██████▉   | 9/13 [00:01<00:00,  8.23it/s]

[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 


🔮 Prediciendo:  77%|███████▋  | 10/13 [00:01<00:00,  7.97it/s]

[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 


🔮 Prediciendo:  85%|████████▍ | 11/13 [00:01<00:00,  8.16it/s]

[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 


🔮 Prediciendo:  92%|█████████▏| 12/13 [00:01<00:00,  8.31it/s]

[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step


🔮 Prediciendo: 100%|██████████| 13/13 [00:02<00:00,  5.23it/s]


✅ Resultados guardados: /content/drive/MyDrive/Resultados/resultados_comparativos_modelos_turismo.csv



