In [2]:
# 🚀 Install the required libraries (notebook shell magics; --quiet trims pip output)
!pip install optuna --quiet
!pip install xgboost --quiet

# 🚗 Mount Google Drive at /content/drive
from google.colab import drive
drive.mount('/content/drive')

# 📚 Librerías
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import optuna
from tqdm import tqdm
import os, gc, warnings
warnings.filterwarnings("ignore")

from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, log_loss, roc_curve, precision_recall_curve,
    average_precision_score
)

# 📍 Paths on the mounted Drive, in order: input dataset (parquet), shared
# results summary (CSV), ROC figure and precision-recall figure for this run.
ruta_base, ruta_resumen, ruta_roc, ruta_prc = (
    "/content/drive/MyDrive/Datos/6_Base_Modelos_Predictivos.parquet",
    "/content/drive/MyDrive/Resultados/resultados_comparativos_modelos_turismo.csv",
    "/content/drive/MyDrive/Resultados/15_7_Curva_ROC_XGB_Optuna_SMOTE_Año.png",
    "/content/drive/MyDrive/Resultados/15_7_Curva_PRC_XGB_Optuna_SMOTE_Año.png",
)

# 📦 Load the dataset and build the feature matrix X and the target y
df = pd.read_parquet(ruta_base)
df = df.drop_duplicates()

# One-hot encode the categorical columns, keeping every level (drop_first=False).
categoricas = ["Año", "DEP", "CIIU_Letra"]
df = pd.get_dummies(df, columns=categoricas, drop_first=False)

# "RQ" is the target; it and "NIT" are both removed from the features.
y = df["RQ"]
# NOTE(review): +inf and -inf are both replaced by the same value, 1e6 —
# confirm that -inf is not meant to become -1e6.
X = df.drop(columns=["RQ", "NIT"]).replace([np.inf, -np.inf], 1e6)

# ⚙️ Objective function for Optuna
def objective(trial):
    """Return the mean F1 over 5 stratified folds for one sampled XGBoost setup.

    Reads the module-level ``X`` and ``y``. In every fold SMOTE is fitted on
    the training rows only, so the held-out rows used for scoring are never
    oversampled.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean of the per-fold F1 scores.
    """
    # Hyperparameters sampled by the trial (ranges are inclusive bounds).
    tuned = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
    }
    # Settings that are the same for every trial.
    fixed = {
        'eval_metric': 'logloss',
        'use_label_encoder': False,
        'random_state': 42,
    }
    params = {**tuned, **fixed}

    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    f1_por_fold = []

    for fit_rows, val_rows in folds.split(X, y):
        X_val, y_val = X.iloc[val_rows], y.iloc[val_rows]

        # Balance the classes on the training part of the fold only.
        balancer = SMOTE(random_state=42)
        X_bal, y_bal = balancer.fit_resample(X.iloc[fit_rows], y.iloc[fit_rows])

        clf = XGBClassifier(**params)
        clf.fit(X_bal, y_bal)
        f1_por_fold.append(f1_score(y_val, clf.predict(X_val)))

    return np.mean(f1_por_fold)

# 🔍 Search for the best hyperparameters
# The sampler is seeded so the search (and therefore every metric reported
# downstream) is reproducible, in line with the random_state=42 already used
# for the CV splitters, SMOTE and XGBoost. TPESampler is Optuna's default
# sampler, so only the seeding changes, not the search algorithm.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=30)

# study.best_params only contains the sampled hyperparameters, so the fixed
# settings used inside the objective are added back for the final model.
best_params = {
    **study.best_params,
    'eval_metric': 'logloss',
    'use_label_encoder': False,
    'random_state': 42,
}

# 🧠 Final evaluation: 10-fold stratified cross-validation with the tuned parameters
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Per-fold accumulators.
resultados = []      # [accuracy, precision, recall, f1] for each fold
roc_curves = []      # (fpr, tpr) for each fold
prc_curves = []      # (recall, precision) for each fold
loglosses = []
aucs = []
y_true_total = []    # held-out labels pooled over all folds
y_prob_total = []    # held-out positive-class probabilities pooled over all folds

# Metrics computed from the hard predictions, in the column order of `resultados`.
metricas_pred = (accuracy_score, precision_score, recall_score, f1_score)

for train_idx, test_idx in tqdm(skf.split(X, y), total=10, desc="XGBoost Optuna SMOTE + Año"):
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

    # Oversample the training part only; the test part keeps its original rows.
    smote = SMOTE(random_state=42)
    X_res, y_res = smote.fit_resample(X_train, y_train)

    model = XGBClassifier(**best_params)
    model.fit(X_res, y_res)

    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]  # column 1 = positive-class probability

    resultados.append([metrica(y_test, y_pred) for metrica in metricas_pred])
    loglosses.append(log_loss(y_test, y_prob))
    aucs.append(roc_auc_score(y_test, y_prob))

    # Keep each fold's curve points; they are interpolated and averaged later.
    fpr, tpr, _ = roc_curve(y_test, y_prob)
    roc_curves.append((fpr, tpr))
    precision, recall, _ = precision_recall_curve(y_test, y_prob)
    prc_curves.append((recall, precision))

    y_true_total.extend(y_test)
    y_prob_total.extend(y_prob)

# 📈 ROC curve: mean over folds with a ±1 std band
# Each fold's curve is resampled onto a common FPR grid so the curves can be
# averaged point by point.
fpr_interp = np.linspace(0, 1, 100)
tprs = [np.interp(fpr_interp, fpr, tpr) for fpr, tpr in roc_curves]
tpr_mean, tpr_std = np.mean(tprs, axis=0), np.std(tprs, axis=0)

plt.figure()
plt.plot(fpr_interp, tpr_mean, label=f'ROC promedio (AUC = {np.mean(aucs):.3f})')
plt.fill_between(fpr_interp, tpr_mean - tpr_std, tpr_mean + tpr_std, alpha=0.2)
plt.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal
plt.xlabel("Falsos Positivos")
plt.ylabel("Verdaderos Positivos")
plt.title("Curva ROC - XGBoost Optuna + Año")
# Without a legend the label above (which carries the mean AUC) is never drawn.
plt.legend(loc="lower right")
plt.grid()
plt.tight_layout()
plt.savefig(ruta_roc)
plt.close()

# 📈 Precision-recall curve: mean over folds with a ±1 std band
# Each fold's curve is resampled onto a common recall grid. The stored arrays
# are reversed because np.interp needs increasing x-coordinates, while
# precision_recall_curve returns recall in decreasing order.
recall_interp = np.linspace(0, 1, 100)
precisions = [np.interp(recall_interp, r[::-1], p[::-1]) for r, p in prc_curves]
prec_mean, prec_std = np.mean(precisions, axis=0), np.std(precisions, axis=0)

plt.figure()
plt.plot(recall_interp, prec_mean, label='PRC promedio')
plt.fill_between(recall_interp, prec_mean - prec_std, prec_mean + prec_std, alpha=0.2)
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Curva PRC - XGBoost Optuna + Año")
# Without a legend the curve label above is never drawn.
plt.legend()
plt.grid()
plt.tight_layout()
plt.savefig(ruta_prc)
plt.close()

# 📊 Final metrics: mean and standard deviation across the folds
# Columns of res_np follow the order accuracy, precision, recall, F1.
res_np = np.array(resultados)
mean, std = res_np.mean(axis=0), res_np.std(axis=0)

logl_mean = np.mean(loglosses)
logl_std = np.std(loglosses)

auc_mean = np.mean(aucs)
auc_std = np.std(aucs)

# 🧾 Store this run in the shared comparison summary (insert or overwrite its row)
if os.path.exists(ruta_resumen):
    # errors='ignore' makes the drop a no-op when the column is absent.
    resumen = pd.read_csv(ruta_resumen).drop(columns='Observación', errors='ignore')
else:
    # No summary file yet: start an empty table with the expected columns.
    resumen = pd.DataFrame(columns=[
        'Base', 'Modelo', 'Naturaleza', 'Temporalidad', 'Tipo de aprendizaje',
        'Accuracy', 'Desv. Accuracy', 'Precision', 'Desv. Precision',
        'Recall', 'Desv. Recall', 'F1-score promedio', 'Desviación F1',
        'AUC', 'Desv. AUC', 'LogLoss', 'Desv. LogLoss',
        'Top 1 variable', 'Top 2 variable', 'Top 3 variable'
    ])

# Fit one model on the full data to rank the features by importance.
# NOTE(review): this fit uses X, y directly (no SMOTE), unlike the
# cross-validation folds — confirm that is intended.
model_final = XGBClassifier(**best_params)
model_final.fit(X, y)
importancias = pd.Series(model_final.feature_importances_, index=X.columns)
top_vars = list(importancias.sort_values(ascending=False).index[:3])
# Pad with None so the unpacking works even with fewer than three features.
top1, top2, top3 = top_vars + [None] * (3 - len(top_vars))

# Reuse the row of a previous run of this same model if there is one;
# otherwise append a new row at the end.
nombre_modelo = 'XGBoost (Optuna + SMOTE + Año)'
es_esta_corrida = (resumen['Base'] == 'Turismo') & (resumen['Modelo'] == nombre_modelo)
idx = resumen[es_esta_corrida].index
idx = idx[0] if len(idx) > 0 else len(resumen)

fila = ['Turismo', nombre_modelo, 'Avanzado', 'Estática', 'Supervisado']
for k in range(3):  # accuracy, precision, recall: mean followed by std
    fila += [round(mean[k], 4), round(std[k], 4)]
# NOTE(review): the F1 deviation is stored as a "±x.xxxx" string while the
# other deviations are numeric — presumably to match the existing CSV; verify.
fila += [round(mean[3], 4), f"±{std[3]:.4f}"]
fila += [round(auc_mean, 4), round(auc_std, 4)]
fila += [round(logl_mean, 4), round(logl_std, 4)]
fila += [top1, top2, top3]
resumen.loc[idx] = fila

resumen.to_csv(ruta_resumen, index=False)
print(f"\n✅ Resultados guardados en: {ruta_resumen}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


[I 2025-05-19 01:06:48,985] A new study created in memory with name: no-name-4efd512a-5961-448e-8acc-ae53ee19431d
[I 2025-05-19 01:07:41,629] Trial 0 finished with value: 0.43116273574958264 and parameters: {'n_estimators': 150, 'max_depth': 7, 'learning_rate': 0.2982292719210495, 'subsample': 0.9950784730280301, 'colsample_bytree': 0.9066294903134606}. Best is trial 0 with value: 0.43116273574958264.
[I 2025-05-19 01:09:16,175] Trial 1 finished with value: 0.434176654574799 and parameters: {'n_estimators': 258, 'max_depth': 8, 'learning_rate': 0.1639148509367559, 'subsample': 0.7773148498553433, 'colsample_bytree': 0.844587899323772}. Best is trial 1 with value: 0.434176654574799.
[I 2025-05-19 01:09:58,188] Trial 2 finished with value: 0.4099533355588319 and parameters: {'n_estimators': 214, 'max_depth': 5, 'learning_rate': 0.12243994322641127, 'subsample': 0.6436534266524245, 'colsample_bytree': 0.6360102329789064}. Best is trial 1 with value: 0.434176654574799.
[I 2025-05-19 01:11:


✅ Resultados guardados en: /content/drive/MyDrive/Resultados/resultados_comparativos_modelos_turismo.csv
