# XGBoost - Detección de Fraude
## Optimización de hiperparámetros y búsqueda de threshold

In [19]:
import numpy as np
import pandas as pd
import pickle
from scipy.io import arff

from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report

from xgboost import XGBClassifier

## 1. Carga y Preparación de Datos

In [20]:
# Cargar dataset
data = arff.loadarff("../data/credit_fraud.arff")
df = pd.DataFrame(data[0])
df = df.map(lambda x: x.decode() if isinstance(x, bytes) else x)
df.rename(columns={'class': 'status'}, inplace=True)

# Seleccionar features optimizadas del EDA
cat_var = ['over_draft','credit_history', 'purpose', 'Average_Credit_Balance', 
           'employment', 'personal_status', 'property_magnitude', 
           'other_payment_plans', 'housing', 'status']
num_var = ['credit_usage','current_balance','location','cc_age','existing_credits']

df = df[cat_var + num_var]

# Aplicar transformación log1p a current_balance
df['current_balance'] = np.log1p(df['current_balance'])

print(f"Dataset shape: {df.shape}")
print(f"\nDistribución del target:")
print(df.status.value_counts(normalize=True))

Dataset shape: (1000, 15)

Distribución del target:
status
good    0.7
bad     0.3
Name: proportion, dtype: float64


## 2. División Train / Val / Test

In [21]:
# Train+Val (80%) / Test (20%)
df_train_full, df_test = train_test_split(
    df, test_size=0.2, random_state=21, stratify=df['status'])

# Train (60%) / Val (20%)
df_train, df_val = train_test_split(
    df_train_full, test_size=0.25, random_state=21, stratify=df_train_full['status'])

print(f"Train: {len(df_train)} | Val: {len(df_val)} | Test: {len(df_test)}")

Train: 600 | Val: 200 | Test: 200


In [22]:
# Separar X e y
y_train = (df_train.status == 'bad').astype(int).values
y_val = (df_val.status == 'bad').astype(int).values
y_train_full = (df_train_full.status == 'bad').astype(int).values
y_test = (df_test.status == 'bad').astype(int).values

X_train = df_train.drop('status', axis=1).to_dict('records')
X_val = df_val.drop('status', axis=1).to_dict('records')
X_train_full = df_train_full.drop('status', axis=1).to_dict('records')
X_test = df_test.drop('status', axis=1).to_dict('records')

## 3. Búsqueda de Hiperparámetros

In [23]:
# Definir espacio de búsqueda
param_distributions = {
    'xgbclassifier__learning_rate': [0.01, 0.05, 0.1],
    'xgbclassifier__max_depth': [3, 4, 5, 6],
    'xgbclassifier__min_child_weight': [1, 3, 5],
    'xgbclassifier__subsample': [0.7, 0.8, 0.9],
    'xgbclassifier__colsample_bytree': [0.7, 0.8, 0.9],
    'xgbclassifier__gamma': [0, 0.1, 0.2],
    'xgbclassifier__reg_alpha': [0, 0.1, 0.5],
    'xgbclassifier__reg_lambda': [0.5, 1, 2],
    'xgbclassifier__n_estimators': range(150, 350, 10)
}

# Pipeline base
pipeline = make_pipeline(
    DictVectorizer(),
    XGBClassifier(
        objective='binary:logistic',
        eval_metric='auc',
        random_state=21,
        n_jobs=-1,
        scale_pos_weight=5
    )
)

# RandomizedSearch
search = RandomizedSearchCV(
    pipeline,
    param_distributions,
    n_iter=30,
    scoring='roc_auc',
    cv=5,
    random_state=21,
    n_jobs=-1,
    verbose=1
)

search.fit(X_train, y_train)

print(f"\nMejor AUC en CV: {search.best_score_:.4f}")
print(f"\nMejores hiperparámetros:")
for param, value in search.best_params_.items():
    print(f"  {param.replace('xgbclassifier__', '')}: {value}")

Fitting 5 folds for each of 30 candidates, totalling 150 fits



Mejor AUC en CV: 0.7833

Mejores hiperparámetros:
  subsample: 0.8
  reg_lambda: 1
  reg_alpha: 0.1
  n_estimators: 240
  min_child_weight: 5
  max_depth: 6
  learning_rate: 0.1
  gamma: 0.2
  colsample_bytree: 0.9


## 4. Evaluación con Train Full

In [24]:
# Obtener mejores parámetros 
best_params = {k.replace('xgbclassifier__', ''): v for k, v in search.best_params_.items()}

# Modelo con mejores parámetros
model_full = make_pipeline(
    DictVectorizer(),
    XGBClassifier(
        objective='binary:logistic',
        eval_metric='auc',
        random_state=21,
        n_jobs=-1,
        scale_pos_weight=5,
        **best_params
    )
)

# Cross-validation en train_full
cv_scores = cross_val_score(model_full, X_train_full, y_train_full, cv=5, scoring='roc_auc', n_jobs=-1)
print(f"AUC en Train Full (CV): {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

# Entrenar y evaluar en test
model_full.fit(X_train_full, y_train_full)
y_pred_test = model_full.predict_proba(X_test)[:, 1]
test_auc = roc_auc_score(y_test, y_pred_test)
print(f"AUC en Test: {test_auc:.4f}")

gap = np.abs(cv_scores.mean() - test_auc)
print(f"Gap: {100*gap:.2f}%")

AUC en Train Full (CV): 0.7580 ± 0.0467
AUC en Test: 0.7688
Gap: 1.08%


In [25]:
best_params

{'subsample': 0.8,
 'reg_lambda': 1,
 'reg_alpha': 0.1,
 'n_estimators': 240,
 'min_child_weight': 5,
 'max_depth': 6,
 'learning_rate': 0.1,
 'gamma': 0.2,
 'colsample_bytree': 0.9}

## 5. Búsqueda de Threshold Óptimo (Matriz de Costos)

In [26]:
# Matriz de costos: FN = 5x, FP = 1x
COST_FN = 5.0
COST_FP = 1.0

thresholds = np.arange(0.1, 0.95, 0.025)
results = []

for threshold in thresholds:
    y_pred_threshold = (y_pred_test >= threshold).astype(int)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred_threshold).ravel()
    
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    
    cost = (fn * COST_FN) + (fp * COST_FP)
    
    results.append({
        'threshold': threshold,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'cost': cost,
        'tp': tp, 'fp': fp, 'fn': fn, 'tn': tn
    })

df_results = pd.DataFrame(results)

# Mejor threshold (mínimo costo)
best_idx = df_results['cost'].idxmin()
best_threshold = df_results.loc[best_idx, 'threshold']

print(f"Threshold óptimo: {best_threshold:.3f}")
print(f"Costo mínimo: {df_results.loc[best_idx, 'cost']:.0f}")
print(f"Precision: {df_results.loc[best_idx, 'precision']:.3f}")
print(f"Recall: {df_results.loc[best_idx, 'recall']:.3f}")
print(f"F1: {df_results.loc[best_idx, 'f1']:.3f}")

print(f"\nTop 5 thresholds (menor costo):")
print(df_results.nsmallest(5, 'cost')[['threshold', 'cost', 'precision', 'recall', 'f1']])

Threshold óptimo: 0.100
Costo mínimo: 100
Precision: 0.412
Recall: 0.933
F1: 0.571

Top 5 thresholds (menor costo):
   threshold   cost  precision    recall        f1
0      0.100  100.0   0.411765  0.933333  0.571429
2      0.150  101.0   0.460177  0.866667  0.601156
4      0.200  101.0   0.476636  0.850000  0.610778
3      0.175  103.0   0.467890  0.850000  0.603550
1      0.125  106.0   0.427419  0.883333  0.576087


## 6. Métricas de Clasificación con Threshold Óptimo

In [27]:
y_pred_optimal = (y_pred_test >= best_threshold).astype(int)

print(classification_report(y_test, y_pred_optimal))
print(f"\nConfusion Matrix:")
print(pd.DataFrame(confusion_matrix(y_test, y_pred_optimal), 
                   columns=['Pred Good', 'Pred Bad'], 
                   index=['True Good', 'True Bad']))

              precision    recall  f1-score   support

           0       0.94      0.43      0.59       140
           1       0.41      0.93      0.57        60

    accuracy                           0.58       200
   macro avg       0.67      0.68      0.58       200
weighted avg       0.78      0.58      0.58       200


Confusion Matrix:
           Pred Good  Pred Bad
True Good         60        80
True Bad           4        56


## 7. Feature Importance

In [28]:
# Extraer modelo y vectorizer del pipeline
xgb_model = model_full.named_steps['xgbclassifier']
vectorizer = model_full.named_steps['dictvectorizer']

importance_df = pd.DataFrame({
    'feature': vectorizer.get_feature_names_out(),
    'importance': xgb_model.feature_importances_
}).sort_values('importance', ascending=False)

print("Top 15 features más importantes:")
print(importance_df.head(15).to_string(index=False))

Top 15 features más importantes:
                                      feature  importance
                       over_draft=no checking    0.124716
                Average_Credit_Balance=>=1000    0.038296
                             over_draft=>=200    0.036034
credit_history=critical/other existing credit    0.034637
                             purpose=used car    0.032839
                     other_payment_plans=none    0.029873
               property_magnitude=real estate    0.028387
                            purpose=education    0.027913
                                employment=<1    0.027453
                            employment=4<=X<7    0.026206
           personal_status=female div/dep/mar    0.023389
         property_magnitude=no known property    0.023092
                  Average_Credit_Balance=<100    0.022553
            credit_history=delayed previously    0.022141
                             purpose=radio/tv    0.021939


## 8. Modelo Final para Producción
Entrenar con todos los datos (train + test) usando mejores parámetros

In [29]:
# Preparar todos los datos
y_all = (df.status == 'bad').astype(int).values
X_all = df.drop('status', axis=1).to_dict('records')

# Pipeline final
pipeline_final = make_pipeline(
    DictVectorizer(),
    XGBClassifier(
        objective='binary:logistic',
        eval_metric='auc',
        random_state=21,
        n_jobs=-1,
        scale_pos_weight=5,
        **best_params
    )
)

# Cross-validation final
cv_scores_final = cross_val_score(pipeline_final, X_all, y_all, cv=5, scoring='roc_auc', n_jobs=-1)
print(f"AUC final (todos los datos, CV): {cv_scores_final.mean():.4f} ± {cv_scores_final.std():.4f}")

# Entrenar modelo final
pipeline_final.fit(X_all, y_all)
print("\nModelo final entrenado con todos los datos")

AUC final (todos los datos, CV): 0.7673 ± 0.0183

Modelo final entrenado con todos los datos


## 9. Guardar Modelo

In [30]:
#with open('../models/model_XGB_v0.bin', 'wb') as f_out:
#    pickle.dump(pipeline_final, f_out)

#print("Modelo guardado en: ../models/model_XGB_v0.bin")

## 10. Test de Predicción

In [31]:
# Cargar modelo
with open('../models/model_XGB.bin', 'rb') as f_in:
    loaded_pipeline = pickle.load(f_in)

# Cliente de prueba (valores originales, sin transformación)
client = {
    'over_draft': '0<=X<200',
    'credit_history': 'critical/other existing credit',
    'purpose': 'education',
    'Average_Credit_Balance': 'no known savings',
    'employment': '1<=X<4',
    'personal_status': 'female div/dep/mar',
    'property_magnitude': 'car',
    'other_payment_plans': 'none',
    'housing': 'own',
    'credit_usage': 24.0,
    'current_balance': 1926.0,  # Valor original (se aplicará log1p en producción)
    'location': 3.0,
    'cc_age': 33.0,
    'existing_credits': 2.0
}

# Aplicar transformación necesaria
client['current_balance'] = np.log1p(client['current_balance'])

# Predicción
y_pred = loaded_pipeline.predict_proba([client])[0, 1]
fraude = y_pred >= best_threshold

result = {
    'bad_probability': float(f'{y_pred:.3f}'),
    'fraude': bool(fraude),
    #'threshold_usado': best_threshold
}

print(result)

{'bad_probability': 0.486, 'fraude': True}
