In [None]:
# 13_13: XGBoost Dinámico con rolling de 5 años
# ============================================

# 🚗 Mount Google Drive: the input parquet and the results CSV used below
# both live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# 🔐 Reproducibility: use one seed for Python's RNG, NumPy's global RNG and
# the hash-seed environment variable.
import os
import random

import numpy as np

SEED = 42
# NOTE: Python reads PYTHONHASHSEED at interpreter start-up, so setting it
# here only affects subprocesses launched afterwards.
os.environ["PYTHONHASHSEED"] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)

# 📚 Librerías
import pandas as pd
from tqdm import tqdm
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, log_loss
)

# 📥 Load the modelling base and order it by company (NIT) and year (Año),
# discarding the original index.
ruta_base = "/content/drive/MyDrive/Datos/6_Base_Modelos_Predictivos.parquet"
df = (
    pd.read_parquet(ruta_base)
    .sort_values(['NIT', 'Año'])
    .reset_index(drop=True)
)

# 🎯 Feature columns: every numeric column except the year ('Año') and the
# target ('RQ').
columnas_validas = [
    nombre
    for nombre in df.select_dtypes(include=[np.number]).columns.tolist()
    if nombre not in ('Año', 'RQ')
]

# 🌀 Rolling window of 5 rows per company.
# Each sample is the flattened block of `n_ventana` consecutive rows of one
# company (time-major: all variables of the oldest row first), and its label
# is the 'RQ' value of the last row in the window.
# Windows are built from consecutive rows after sorting by 'Año'; gaps between
# years are not checked here.
n_ventana = 5
X_seq, y_seq = [], []
# A single groupby pass replaces filtering the whole frame once per company.
# sort=False keeps the companies in their order of first appearance, like
# iterating over df['NIT'].unique().
for nit, df_emp in tqdm(df.groupby('NIT', sort=False), desc="🔄 Generando ventanas"):
    if len(df_emp) < n_ventana:
        # Not enough history to build even one window.
        continue
    df_emp = df_emp.sort_values('Año')
    datos = df_emp[columnas_validas].values
    etiquetas = df_emp['RQ'].values
    for i in range(len(df_emp) - n_ventana + 1):
        X_seq.append(datos[i:i+n_ventana].flatten())
        y_seq.append(etiquetas[i + n_ventana - 1])

if not X_seq:
    # Fail early with a clear message instead of a shape error further down.
    raise ValueError(
        f"No se generó ninguna ventana: ninguna empresa tiene al menos {n_ventana} registros."
    )

X = np.array(X_seq)
y = np.array(y_seq)
print(f"✔️ Dataset: {X.shape}, Labels: {y.shape}")

# 🏷️ Build readable feature names as "<variable>_<lag>", with lags running
# from -(n_ventana - 1) to 0. The order is time-major (outer loop over lags,
# inner loop over variables), matching the row-major flatten used to build
# each window.
nombres_columnas = [
    f"{var}_{t}"
    for t in range(1 - n_ventana, 1)
    for var in columnas_validas
]

# 🧼 Turn +inf and -inf into NaN. Existing NaNs are left as they are; the
# pipeline's SimpleImputer step fills missing values later.
X = np.where(np.isinf(X), np.nan, X)

# ⚙️ Model: mean imputation -> standardisation -> XGBoost classifier.
# `use_label_encoder` is deliberately not passed: it is deprecated in
# XGBoost 1.6+ and no longer a recognised parameter in 2.x, where supplying it
# only produces an "unused parameter" warning on every fit.
modelo = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
    ('xgb', XGBClassifier(n_estimators=100, eval_metric='logloss', random_state=SEED)),
])

# 🔁 Stratified 10-fold cross-validation.
# For every fold the whole pipeline (imputer, scaler, classifier) is refit on
# the training part, and the held-out part yields accuracy, precision, recall,
# F1, log-loss and ROC-AUC.
# NOTE(review): windows of the same NIT overlap and this shuffled split does
# not group by company, so related windows can fall in both train and test —
# confirm this is acceptable for the comparison.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=SEED)
metrics = []
loglosses = []
aucs = []

for train_idx, test_idx in tqdm(skf.split(X, y), total=skf.n_splits, desc="🔁 CV XGBoost"):
    X_train, y_train = X[train_idx], y[train_idx]
    X_test, y_test = X[test_idx], y[test_idx]

    modelo.fit(X_train, y_train)
    y_pred = modelo.predict(X_test)
    # Probability of the positive class (second column).
    y_prob = modelo.predict_proba(X_test)[:, 1]

    # Label-based scores, in the fixed order: accuracy, precision, recall, F1.
    metrics.append([
        puntuar(y_test, y_pred)
        for puntuar in (accuracy_score, precision_score, recall_score, f1_score)
    ])
    # Probability-based scores.
    loglosses.append(log_loss(y_test, y_prob))
    aucs.append(roc_auc_score(y_test, y_prob))

# 📊 Final results: mean and (population) standard deviation across folds.
# Columns of `res` are accuracy, precision, recall and F1, in that order.
res = np.array(metrics)
mean_vals, std_vals = res.mean(axis=0), res.std(axis=0)
logl_mean = np.mean(loglosses)
logl_std = np.std(loglosses)
auc_mean = np.mean(aucs)
auc_std = np.std(aucs)

# 🔝 Top 3 variables: refit the pipeline on the full dataset and rank the
# features by the XGBoost importance scores.
modelo.fit(X, y)
importancias = modelo.named_steps['xgb'].feature_importances_

# SimpleImputer discards columns that are entirely NaN at fit time and marks
# them with a NaN entry in `statistics_`. Keep only the names of the columns
# that actually reached the classifier, so names and importances stay aligned
# instead of failing with a length mismatch.
estadisticos = modelo.named_steps['imputer'].statistics_
columnas_usadas = [
    nombre
    for nombre, estadistico in zip(nombres_columnas, estadisticos)
    if not np.isnan(estadistico)
]

importancia_series = pd.Series(importancias, index=columnas_usadas).sort_values(ascending=False)
top_vars = importancia_series.head(3).index.tolist()
# Pad with None so the unpacking works even with fewer than 3 features.
top1, top2, top3 = (top_vars + [None]*3)[:3]

# 🧾 Save into the comparative CSV.
# Inserts, or overwrites if already present, the row for
# ('Turismo', 'XGBoost Dinámico') and writes the file back.
ruta_csv = "/content/drive/MyDrive/Resultados/resultados_comparativos_modelos_turismo.csv"

# The row is keyed by column name so it is aligned by label, not by position:
# a CSV on disk with a different column order or set no longer breaks the
# assignment or puts values in the wrong columns.
fila = {
    'Base': 'Turismo',
    'Modelo': 'XGBoost Dinámico',
    'Naturaleza': 'Avanzado',
    'Temporalidad': 'Dinámica',
    'Tipo de aprendizaje': 'Supervisado',
    'Accuracy': round(mean_vals[0], 4),
    'Desv. Accuracy': round(std_vals[0], 4),
    'Precision': round(mean_vals[1], 4),
    'Desv. Precision': round(std_vals[1], 4),
    'Recall': round(mean_vals[2], 4),
    'Desv. Recall': round(std_vals[2], 4),
    'F1-score promedio': round(mean_vals[3], 4),
    # Stored as text with a leading "±", unlike the other deviation columns.
    # NOTE(review): presumably kept to match rows already in the CSV — confirm.
    'Desviación F1': f"±{std_vals[3]:.4f}",
    'AUC': round(auc_mean, 4),
    'Desv. AUC': round(auc_std, 4),
    'LogLoss': round(logl_mean, 4),
    'Desv. LogLoss': round(logl_std, 4),
    'Top 1 variable': top1,
    'Top 2 variable': top2,
    'Top 3 variable': top3,
}

if os.path.exists(ruta_csv):
    # The 'Observación' column, when present, is dropped before updating.
    resumen = pd.read_csv(ruta_csv).drop(columns='Observación', errors='ignore')
    # Add any column this row needs that the file on disk does not have yet.
    faltantes = [col for col in fila if col not in resumen.columns]
    if faltantes:
        resumen = resumen.reindex(columns=[*resumen.columns, *faltantes])
else:
    # First run: make sure the output folder exists before writing.
    os.makedirs(os.path.dirname(ruta_csv), exist_ok=True)
    resumen = pd.DataFrame(columns=list(fila))

# Reuse the first matching row if there is one; otherwise append a new row.
idx = resumen[
    (resumen['Base'] == 'Turismo') & (resumen['Modelo'] == 'XGBoost Dinámico')
].index
idx = idx[0] if len(idx) > 0 else len(resumen)

# Assigning a Series aligns its index with the DataFrame columns.
resumen.loc[idx] = pd.Series(fila, dtype=object)

resumen.to_csv(ruta_csv, index=False)
print(f"\n✅ Resumen actualizado: {ruta_csv}")
