In [1]:
# Step 1: mount Google Drive in the Colab runtime so the input and output
# paths under /content/drive used later in this script are reachable.
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np
import os, random
from tqdm import tqdm
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, log_loss
)

# Reproducibility: a single seed is shared by the hash-seed environment
# variable, Python's `random` module and NumPy's global RNG.
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
# setting it here presumably only affects child processes -- confirm.
SEED = 42
os.environ['PYTHONHASHSEED'] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)

# Step 2: read the modelling base from the mounted Drive and report its shape.
ruta_base = "/content/drive/MyDrive/Datos/6_Base_Modelos_Predictivos.parquet"
df = pd.read_parquet(path=ruta_base)
print(f"✅ Base cargada: {df.shape}")

# Step 3: order the panel and choose the predictor columns.
# Rows are sorted by NIT and then by year (Año); the index is rebuilt so it
# follows that order.
df = df.sort_values(['NIT', 'Año']).reset_index(drop=True)

# Predictors are the numeric columns minus the year, the label (RQ) and NIT.
# NIT is the key the windows are grouped by; if it is stored as a number it
# would otherwise slip into the feature matrix as a meaningless magnitude.
# Excluding it is a no-op when NIT is not numeric.
columnas_excluidas = {'NIT', 'Año', 'RQ'}
columnas_validas = [
    col
    for col in df.select_dtypes(include=[np.number]).columns
    if col not in columnas_excluidas
]

# Step 4: build rolling windows of `n_ventana` consecutive rows per NIT.
# Each sample is the flattened block of predictor values of the window, and
# its label is the RQ value of the last row of that window.
# NOTE(review): windows are taken over consecutive *rows*, not consecutive
# calendar years; if a NIT has missing years a window can span a longer
# period -- confirm whether gaps exist in the data.
X_seq, y_seq = [], []
n_ventana = 5

# One groupby pass replaces a full-column `df['NIT'] == nit` scan per NIT.
# sort=False keeps the NITs in order of first appearance, which is the same
# order `df['NIT'].unique()` would produce.
empresas = df.groupby('NIT', sort=False)
for nit, df_emp in tqdm(empresas, total=empresas.ngroups, desc="🔄 Generando ventanas"):
    n_filas = len(df_emp)
    if n_filas < n_ventana:
        # Not enough rows to fill a single window.
        continue
    df_emp = df_emp.sort_values('Año')
    datos = df_emp[columnas_validas].values
    etiquetas = df_emp['RQ'].values
    for i in range(n_filas - n_ventana + 1):
        fin = i + n_ventana
        X_seq.append(datos[i:fin].flatten())
        y_seq.append(etiquetas[fin - 1])

# Fail here with a clear message instead of later inside scikit-learn.
if not X_seq:
    raise ValueError(
        f"Ningún NIT tiene al menos {n_ventana} registros; no se generaron ventanas."
    )

X = np.array(X_seq)
y = np.array(y_seq)
print(f"📦 Dataset generado: X={X.shape}, y={y.shape}")

# Step 5: turn +inf and -inf into NaN so they are treated as missing values
# (the model pipeline defined in this script starts with an imputer).
mascara_infinitos = np.isinf(X)
X = np.where(mascara_infinitos, np.nan, X)

# Step 6: model pipeline -- mean imputation, standardisation, then a
# class-weight-balanced logistic regression.
imputador = SimpleImputer(strategy='mean')
escalador = StandardScaler()
clasificador = LogisticRegression(
    class_weight='balanced',
    max_iter=1000,
    random_state=SEED,
)
modelo = Pipeline(steps=[
    ('imputer', imputador),
    ('scaler', escalador),
    ('logit', clasificador),
])

# Step 7: 10-fold cross-validation, stratified by label and grouped by NIT.
#
# Consecutive windows of one NIT share n_ventana - 1 of their rows, so a plain
# shuffled StratifiedKFold places near-duplicate samples of the same NIT in
# both the training and the test fold, which typically makes the scores
# optimistic. StratifiedGroupKFold keeps every window of a NIT in one fold
# while still balancing the class proportions across folds.
from sklearn.model_selection import StratifiedGroupKFold

n_splits = 10

# Rebuild the NIT of every row of X from the panel itself: the window loop
# emits (rows - n_ventana + 1) samples per NIT (none for shorter histories)
# and visits the NITs in order of first appearance in df.
filas_por_nit = df.groupby('NIT', sort=False).size()
ventanas_por_nit = (filas_por_nit - n_ventana + 1).clip(lower=0)
grupos = np.repeat(filas_por_nit.index.to_numpy(), ventanas_por_nit.to_numpy())
if len(grupos) != len(y):
    # Guards against the window generation and this reconstruction drifting apart.
    raise ValueError(
        f"Grupos ({len(grupos)}) y etiquetas ({len(y)}) no coinciden; "
        "revisar la generación de ventanas."
    )

skf = StratifiedGroupKFold(n_splits=n_splits, shuffle=True, random_state=SEED)
resultados, loglosses, aucs = [], [], []

particiones = skf.split(X, y, groups=grupos)
for train_idx, test_idx in tqdm(particiones, total=n_splits, desc="🔁 Validación cruzada"):
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # The whole pipeline is refit on the training fold only, so imputation and
    # scaling statistics never see the test fold.
    modelo.fit(X_train, y_train)
    y_pred = modelo.predict(X_test)
    # Column 1 holds the probability of the second class in `classes_`.
    y_prob = modelo.predict_proba(X_test)[:, 1]

    # Per-fold metrics, in the order: accuracy, precision, recall, F1.
    resultados.append([
        accuracy_score(y_test, y_pred),
        precision_score(y_test, y_pred),
        recall_score(y_test, y_pred),
        f1_score(y_test, y_pred)
    ])
    loglosses.append(log_loss(y_test, y_prob))
    aucs.append(roc_auc_score(y_test, y_prob))

# Step 8: aggregate the per-fold metrics.
# `res` has one row per fold and one column per metric
# (accuracy, precision, recall, F1), so axis=0 averages across folds.
res = np.array(resultados)
mean_vals, std_vals = res.mean(axis=0), res.std(axis=0)

logl_mean = np.mean(loglosses)
logl_std = np.std(loglosses)

auc_mean = np.mean(aucs)
auc_std = np.std(aucs)

# Step 9: insert or update this model's row in the shared comparison CSV.
ruta_csv = "/content/drive/MyDrive/Resultados/resultados_comparativos_modelos_turismo.csv"

# The row is keyed by column name, so it stays correct even when an existing
# CSV has its columns in a different order or carries extra columns.
# Insertion order here is also the column layout of a freshly created file.
# NOTE(review): 'Desviación F1' is stored as "±x.xxxx" text while the other
# deviations are numeric; kept as is -- presumably other rows of the shared
# file use the same format (confirm).
fila = {
    'Base': 'Turismo',
    'Modelo': 'Rolling Logit',
    'Naturaleza': 'Tradicional',
    'Temporalidad': 'Dinámica',
    'Tipo de aprendizaje': 'Supervisado',
    'Accuracy': round(mean_vals[0], 4),
    'Desv. Accuracy': round(std_vals[0], 4),
    'Precision': round(mean_vals[1], 4),
    'Desv. Precision': round(std_vals[1], 4),
    'Recall': round(mean_vals[2], 4),
    'Desv. Recall': round(std_vals[2], 4),
    'F1-score promedio': round(mean_vals[3], 4),
    'Desviación F1': f"±{std_vals[3]:.4f}",
    'AUC': round(auc_mean, 4),
    'Desv. AUC': round(auc_std, 4),
    'LogLoss': round(logl_mean, 4),
    'Desv. LogLoss': round(logl_std, 4),
    'Top 1 variable': None,
    'Top 2 variable': None,
    'Top 3 variable': None,
}

if os.path.exists(ruta_csv):
    resumen = pd.read_csv(ruta_csv)
    # The free-text 'Observación' column is discarded when present.
    resumen = resumen.drop(columns='Observación', errors='ignore')
    # Add any expected column the existing file lacks (filled with NaN).
    faltantes = [col for col in fila if col not in resumen.columns]
    if faltantes:
        resumen = resumen.reindex(columns=[*resumen.columns, *faltantes])
else:
    resumen = pd.DataFrame(columns=list(fila))

# Reuse the row of a previous run of this model if there is one; otherwise
# append at the next positional label.
coincidencias = resumen[
    (resumen['Base'] == 'Turismo') & (resumen['Modelo'] == 'Rolling Logit')
].index
ya_existe = len(coincidencias) > 0
idx = coincidencias[0] if ya_existe else len(resumen)

# Build the row in the frame's own column order. Columns this script does not
# produce keep their current value on update and stay empty on insert.
resumen.loc[idx] = [
    fila[col] if col in fila else (resumen.at[idx, col] if ya_existe else None)
    for col in resumen.columns
]

# Make sure the output folder exists so the results of the run are not lost.
os.makedirs(os.path.dirname(ruta_csv), exist_ok=True)
resumen.to_csv(ruta_csv, index=False)
print(f"\n✅ Resumen actualizado: {ruta_csv}")


Mounted at /content/drive
✅ Base cargada: (52575, 50)


🔄 Generando ventanas: 100%|██████████| 5770/5770 [00:06<00:00, 896.70it/s] 


📦 Dataset generado: X=(32192, 230), y=(32192,)


🔁 Validación cruzada: 100%|██████████| 10/10 [01:01<00:00,  6.17s/it]



✅ Resumen actualizado: /content/drive/MyDrive/Resultados/resultados_comparativos_modelos_turismo.csv
