In [2]:
# 🚗 Step 1: Mount Google Drive
# Exposes the Drive contents under /content/drive; the input dataset and every
# output path used later in this notebook live below that mount point.
from google.colab import drive
drive.mount('/content/drive')

# 📚 Step 2: Libraries
# IPython shell escape (`!` syntax, not plain Python): installs catboost into the
# running notebook environment with pip's output silenced.
!pip install catboost --quiet

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from catboost import CatBoostClassifier
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, log_loss, roc_curve, precision_recall_curve,
    average_precision_score
)
from tqdm import tqdm
import os, warnings
warnings.filterwarnings("ignore")

# 📁 Step 3: Paths
# Input dataset (parquet) read by the preparation step.
ruta_base = "/content/drive/MyDrive/Datos/6_Base_Modelos_Predictivos.parquet"

# Every artefact produced by this run is written into the same Drive folder.
_dir_resultados = "/content/drive/MyDrive/Resultados"
ruta_resultados = _dir_resultados + "/resultados_comparativos_modelos_turismo.csv"
ruta_roc = _dir_resultados + "/15_3_Curva_ROC_CatBoost_SMOTE.png"
ruta_prc = _dir_resultados + "/15_3_Curva_PRC_CatBoost_SMOTE.png"
ruta_importancia = _dir_resultados + "/15_3_Importancias_CatBoost_SMOTE.csv"

# 📦 Step 4: Load the dataset and prepare it
# Read the parquet file and discard exact duplicate rows.
df = pd.read_parquet(ruta_base)
df = df.drop_duplicates()

# One-hot encode the two categorical columns, keeping every level
# (drop_first=False).
columnas_categoricas = ["DEP", "CIIU_Letra"]
df = pd.get_dummies(df, columns=columnas_categoricas, drop_first=False)

# "RQ" is the target; "NIT" and "Año" are excluded from the feature matrix
# together with the target itself.
columnas_excluidas = ["RQ", "NIT", "Año"]
y = df["RQ"]
X = df.drop(columns=columnas_excluidas)

# 🧼 Step 5: Replace infinities
# Cap infinite values at a large finite magnitude while preserving their sign:
# +inf -> 1e6 and -inf -> -1e6. Mapping both to +1e6 would turn an extreme
# negative value into an extreme positive one.
X = X.replace([np.inf, -np.inf], [1e6, -1e6])

# 🔁 Step 6: Cross-validation with SMOTE
# Stratified, shuffled K-fold CV with a fixed seed. For each fold, SMOTE is
# fitted on the training split only; the test split is evaluated as-is, so no
# synthetic sample is ever part of the evaluation data.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Per-fold accumulators:
#   resultados   -> [accuracy, precision, recall, f1] per fold
#   roc_curves   -> (fpr, tpr) arrays per fold
#   prc_curves   -> (recall, precision) arrays per fold
#   loglosses / aucs -> scalar per fold
#   y_true_total / y_prob_total -> out-of-fold labels and probabilities, pooled
resultados, roc_curves, prc_curves = [], [], []
loglosses, aucs, y_true_total, y_prob_total = [], [], [], []

# The progress-bar total is taken from the splitter itself so it cannot drift
# out of sync with n_splits.
for train_idx, test_idx in tqdm(skf.split(X, y), total=skf.get_n_splits(), desc="CatBoost CV"):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # Oversample the training split only.
    smote = SMOTE(random_state=42)
    X_res, y_res = smote.fit_resample(X_train, y_train)

    model = CatBoostClassifier(verbose=0, random_state=42)
    model.fit(X_res, y_res)

    # Hard labels for the threshold metrics, column-1 probabilities for the
    # ranking / probabilistic metrics.
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]

    resultados.append([
        accuracy_score(y_test, y_pred),
        precision_score(y_test, y_pred),
        recall_score(y_test, y_pred),
        f1_score(y_test, y_pred)
    ])
    loglosses.append(log_loss(y_test, y_prob))
    aucs.append(roc_auc_score(y_test, y_prob))

    fpr, tpr, _ = roc_curve(y_test, y_prob)
    precision, recall, _ = precision_recall_curve(y_test, y_prob)
    roc_curves.append((fpr, tpr))
    prc_curves.append((recall, precision))

    y_true_total.extend(y_test)
    y_prob_total.extend(y_prob)

# 📊 Step 7: Final metrics
# Stack the per-fold metric rows into a 2-D array and aggregate column-wise,
# giving one mean and one standard deviation per metric across folds.
res_np = np.array(resultados)
mean, std = np.mean(res_np, axis=0), np.std(res_np, axis=0)

# Mean and standard deviation of the per-fold log-loss values.
logl_mean = np.mean(loglosses)
logl_std = np.std(loglosses)

# Mean and standard deviation of the per-fold ROC AUC values.
auc_mean = np.mean(aucs)
auc_std = np.std(aucs)

# 📈 Step 8: ROC curve
# Resample every fold's ROC curve onto a common FPR grid so the curves can be
# averaged point-wise.
fpr_interp = np.linspace(0, 1, 100)
tprs = []
for fpr_fold, tpr_fold in roc_curves:
    tpr_en_grilla = np.interp(fpr_interp, fpr_fold, tpr_fold)
    # A fold curve may contain several points at FPR = 0; force the resampled
    # curve to start at the origin so the mean curve does not start above 0.
    tpr_en_grilla[0] = 0.0
    tprs.append(tpr_en_grilla)
tpr_mean, tpr_std = np.mean(tprs, axis=0), np.std(tprs, axis=0)

plt.figure(figsize=(6, 5))
plt.plot(fpr_interp, tpr_mean, label=f'ROC promedio (AUC = {auc_mean:.3f})')
# Shaded band: +/- one standard deviation across folds.
plt.fill_between(fpr_interp, tpr_mean - tpr_std, tpr_mean + tpr_std, alpha=0.2)
# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('Falsos Positivos')
plt.ylabel('Verdaderos Positivos')
plt.title('Curva ROC promedio - CatBoost (SMOTE)')
# Without a legend the label (and the AUC value in it) is never drawn.
plt.legend(loc='lower right')
plt.grid()
plt.tight_layout()
plt.savefig(ruta_roc)
plt.close()

# 📈 Step 9: Precision-recall curve
# Resample every fold's PR curve onto a common recall grid. Each stored
# (recall, precision) pair is reversed before interpolation so the x-coordinates
# passed to np.interp run in increasing order.
recall_interp = np.linspace(0, 1, 100)
precisions = [np.interp(recall_interp, r[::-1], p[::-1]) for r, p in prc_curves]
prec_mean, prec_std = np.mean(precisions, axis=0), np.std(precisions, axis=0)

# Average precision is computed on the pooled out-of-fold predictions.
ap_global = average_precision_score(y_true_total, y_prob_total)

plt.figure(figsize=(6, 5))
plt.plot(recall_interp, prec_mean, label=f'PRC promedio (AP = {ap_global:.3f})')
# Shaded band: +/- one standard deviation across folds.
plt.fill_between(recall_interp, prec_mean - prec_std, prec_mean + prec_std, alpha=0.2)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Curva PRC promedio - CatBoost (SMOTE)')
# Without a legend the label (and the AP value in it) is never drawn.
plt.legend(loc='lower left')
plt.grid()
plt.tight_layout()
plt.savefig(ruta_prc)
plt.close()

# 🔝 Step 10: Feature importance (CatBoost native)
# The importances are saved and reported under the "CatBoost (SMOTE)" label, so
# the final model is trained with the same pipeline that was cross-validated:
# SMOTE oversampling followed by CatBoost. All rows are used because nothing is
# held out for evaluation at this point.
X_final, y_final = SMOTE(random_state=42).fit_resample(X, y)
model_final = CatBoostClassifier(verbose=0, random_state=42)
model_final.fit(X_final, y_final)

# One importance value per feature column, sorted from most to least important.
importancias = model_final.get_feature_importance()
importancia_df = pd.DataFrame({
    'variable': X.columns,
    'importancia': importancias
}).sort_values(by='importancia', ascending=False)
importancia_df.to_csv(ruta_importancia, index=False)

# Names of the three most important features; padded with None when there are
# fewer than three features.
top_vars = importancia_df['variable'].head(3).tolist()
top1, top2, top3 = (top_vars + [None]*3)[:3]

# 🧾 Step 11: Save the summary CSV
# Column layout used when the comparison file has to be created from scratch.
columnas_resumen = [
    'Base', 'Modelo', 'Naturaleza', 'Temporalidad', 'Tipo de aprendizaje',
    'Accuracy', 'Desv. Accuracy', 'Precision', 'Desv. Precision',
    'Recall', 'Desv. Recall', 'F1-score promedio', 'Desviación F1',
    'AUC', 'Desv. AUC', 'LogLoss', 'Desv. LogLoss',
    'Top 1 variable', 'Top 2 variable', 'Top 3 variable'
]

if not os.path.exists(ruta_resultados):
    # No previous file: start from an empty table.
    resumen = pd.DataFrame(columns=columnas_resumen)
else:
    # Reuse the existing table, discarding the 'Observación' column if present.
    resumen = pd.read_csv(ruta_resultados)
    resumen = resumen.drop(columns='Observación', errors='ignore')

# Overwrite this model's row if it already exists; otherwise append a new row
# labelled with the current row count.
coincide = (resumen['Base'] == 'Turismo') & (resumen['Modelo'] == 'CatBoost (SMOTE)')
idx = resumen.loc[coincide].index
idx = len(resumen) if len(idx) == 0 else idx[0]

# Row values are positional and follow the column order listed above.
fila = ['Turismo', 'CatBoost (SMOTE)', 'Avanzado', 'Estática', 'Supervisado']
# Accuracy, precision and recall: rounded mean followed by rounded std.
for media, desv in zip(mean[:3], std[:3]):
    fila.extend([round(media, 4), round(desv, 4)])
# F1: rounded mean, and the std formatted as a "±"-prefixed string.
fila.extend([round(mean[3], 4), f"±{std[3]:.4f}"])
fila.extend([
    round(auc_mean, 4), round(auc_std, 4),
    round(logl_mean, 4), round(logl_std, 4),
])
fila.extend([top1, top2, top3])

resumen.loc[idx] = fila

resumen.to_csv(ruta_resultados, index=False)
print("✅ Resultados guardados.")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25h

CatBoost CV: 100%|██████████| 10/10 [10:19<00:00, 61.98s/it]


✅ Resultados guardados.
