In [97]:
import matplotlib.pyplot as plt
from scipy.io import arff
import seaborn as sns
import pandas as pd  
import numpy as np

TRANSFORMAR DATA 

In [98]:
data=arff.loadarff("../data/credit_fraud.arff")
df=pd.DataFrame(data[0])
df = df.map(lambda x: x.decode() if isinstance(x, bytes) else x)
df["status"]=df["class"]
del df["class"]
df['current_balance'] = np.log1p(df['current_balance'])

SELECCIONAR MEJORES FEATURES

In [99]:
cat=['over_draft','credit_history', 'purpose',
           'Average_Credit_Balance', 'employment',
           'personal_status',
           'property_magnitude', 'other_payment_plans', 
           'housing',
           'status'] # objetivo


num=['credit_usage','current_balance','location',
             'cc_age','existing_credits']


df=df[cat+num]

CONJUNTOS DE ENTRENAMIENTO Y EVALUACION

In [100]:
from sklearn.model_selection import train_test_split

In [101]:
df_train_full, df_test=train_test_split(
    df,test_size=0.2,random_state=11,stratify=df['status'])

df_train, df_val=train_test_split(
    df_train_full,test_size=0.25,random_state=11,stratify=df_train_full['status'])

VERIFICAMOS QUE STRATIFY PRODUCIÓ BLOQUE PORPROCIONANES RESPECTO AL OBJETIVO

In [103]:
df_train.status.value_counts(normalize=True)

status
good    0.7
bad     0.3
Name: proportion, dtype: float64

In [104]:
df_val.status.value_counts(normalize=True)

status
good    0.7
bad     0.3
Name: proportion, dtype: float64

In [105]:
df_train_full.status.value_counts(normalize=True)


status
good    0.7
bad     0.3
Name: proportion, dtype: float64

In [106]:
df_test.status.value_counts(normalize=True)

status
good    0.7
bad     0.3
Name: proportion, dtype: float64

SEPARAMOS LA CARACTERISTICAS DEL OBJETIVO

In [None]:
# OBJETIVO
y_train=(df_train.status=='bad').astype(int).values
y_val=(df_val.status=='bad').astype(int).values
y_train_full=(df_train_full.status=='bad').astype(int).values
y_test=(df_test.status=='bad').astype(int).values

# CARACTERISTICAS
X_train=df_train.drop('status',axis=1)
X_val=df_val.drop('status',axis=1)
X_train_full=df_train_full.drop('status',axis=1)
X_test=df_test.drop('status',axis=1)

Número de features en df_train: 15
Columnas: ['over_draft', 'credit_history', 'purpose', 'Average_Credit_Balance', 'employment', 'personal_status', 'property_magnitude', 'other_payment_plans', 'housing', 'status', 'credit_usage', 'current_balance', 'location', 'cc_age', 'existing_credits']


['over_draft',
 'credit_history',
 'purpose',
 'Average_Credit_Balance',
 'employment',
 'personal_status',
 'property_magnitude',
 'other_payment_plans',
 'housing',
 'credit_usage',
 'current_balance',
 'location',
 'cc_age',
 'existing_credits']

VECTORIZAMOS LAS CARACTERISTICAS

In [108]:
from sklearn.feature_extraction import DictVectorizer

In [None]:
dict_train=X_train.to_dict(orient='records')
dict_val=X_val.to_dict(orient='records')
dict_train_full=X_train_full.to_dict(orient='records')
dict_test=X_test.to_dict(orient='records')

dv=DictVectorizer(sparse=False)
X_train=dv.fit_transform(dict_train)
X_val=dv.transform(dict_val)

dv_full = DictVectorizer(sparse=False)
X_train_full = dv_full.fit_transform(dict_train_full)  
X_test = dv_full.transform(dict_test) 

PREPARAMOS UN MODELO GRADIENT BOOSTING PARA CLASIFICACION

In [111]:
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# ESTABLECEMOS RANGO DE BUSQUEDA PARA PARAMETROS DEL MODELO
param_distributions = {
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 4, 5, 6],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9],
    'gamma': [0, 0.1, 0.2],
    'reg_alpha': [0, 0.1, 0.5],
    'reg_lambda': [0.5, 1, 2],
    'n_estimators': range(190,250,5)
}

# ESPECIFICAMOS UN MODELO XGBOOST
modelv1 = XGBClassifier(
    objective='binary:logistic',
    eval_metric='auc',
    random_state=42,
    n_jobs=-1,
    scale_pos_weight=5 #desvalance de costo
)

# CONFIGURAMOS LA BUSQUEDA
search = RandomizedSearchCV(
    modelv1,
    param_distributions,
    n_iter=30,
    scoring='roc_auc',
    cv=5,  
    random_state=21,
    n_jobs=-1,
    verbose=1
)

# AJUSTAMOS LA BUSQUEDA A LOS DATOS  
search.fit(X_train, y_train)


print(f"Mejores parámetros: {search.best_params_}")
print(f"Mejor AUC: {search.best_score_:.3f}")

Fitting 5 folds for each of 30 candidates, totalling 150 fits
Mejores parámetros: {'subsample': 0.7, 'reg_lambda': 2, 'reg_alpha': 0.1, 'n_estimators': 230, 'min_child_weight': 5, 'max_depth': 3, 'learning_rate': 0.01, 'gamma': 0, 'colsample_bytree': 0.9}
Mejor AUC: 0.78056


REENTRENAMOS CON TRAIN + VAL PARA MODELO DE PRODUCCION

In [113]:
from sklearn.metrics import roc_auc_score

In [127]:
modelv2 = XGBClassifier(
    objective='binary:logistic',
    eval_metric='auc',
    random_state=42,
    n_jobs=-1,
    scale_pos_weight=5, #desvalance de costo,
    **search.best_params_
)

modelv2.fit(X_train_full, y_train_full)


y_pred_train_full = modelv2.predict_proba(X_train_full)[:, 1]
full_train_auc = roc_auc_score(y_train_full, y_pred_train_full)
print(f"AUC en FullTrain: {full_train_auc:.3f}")

y_pred_test = modelv2.predict_proba(X_test)[:, 1]
test_auc = roc_auc_score(y_test, y_pred_test)
print(f"AUC en Test: {test_auc:.3f}")

gap = full_train_auc - test_auc

if gap<=7:
    print(f"Gap: {gap:.4f} <----- Aceptable")
else: 
    print(f"Gap: {gap:.4f} <----- Overfitting")   

AUC en FullTrain: 0.863
AUC en Test: 0.800
Gap: 0.0623 <----- Aceptable


REVISAMOS LA METRICAS DE CLAFICACION

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

y_pred = (y_pred_test >= 0.5).astype(int)
print(classification_report(y_test, y_pred))

print(f"\nConfusion Matrix:")
print(pd.DataFrame(confusion_matrix(y_test, y_pred_binary)))

              precision    recall  f1-score   support

           0       0.90      0.39      0.55       140
           1       0.39      0.90      0.54        60

    accuracy                           0.55       200
   macro avg       0.65      0.65      0.54       200
weighted avg       0.75      0.55      0.55       200


Confusion Matrix:
    0   1
0  55  85
1   6  54


In [116]:
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
import numpy as np

# 1. Obtener predicciones probabilísticas (NO binarias)
y_pred_proba = modelv2.predict_proba(X_test)[:, 1]  # Probabilidad de malo

# 2. Probar diferentes thresholds
thresholds = np.arange(0.1, 0.95, 0.025)
results = []

for threshold in thresholds:
    # Aplicar threshold
    y_pred_threshold = (y_pred_proba >= threshold).astype(int)
    
    # Calcular métricas
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred_threshold).ravel()
    
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    
    # COST MATRIX: FN cuesta 5x, FP cuesta 1x
    cost = (fn * 5) + (fp * 1)
    
    results.append({
        'threshold': threshold,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'cost': cost,
        'tp': tp,
        'fp': fp,
        'fn': fn,
        'tn': tn
    })

# 3. Crear DataFrame
df_results = pd.DataFrame(results)
print(df_results.to_string())

# 4. Encontrar mejor threshold (por cost mínimo)
best_idx = df_results['cost'].idxmin()
best_threshold = df_results.loc[best_idx, 'threshold']
print(f"\n✓ Mejor threshold: {best_threshold:.2f}")
print(f"  Precision: {df_results.loc[best_idx, 'precision']:.3f}")
print(f"  Recall: {df_results.loc[best_idx, 'recall']:.3f}")
print(f"  Cost: {df_results.loc[best_idx, 'cost']:.0f}")

    threshold  precision    recall        f1  cost  tp   fp  fn   tn
0       0.100   0.300000  1.000000  0.461538   140  60  140   0    0
1       0.125   0.300000  1.000000  0.461538   140  60  140   0    0
2       0.150   0.300000  1.000000  0.461538   140  60  140   0    0
3       0.175   0.301508  1.000000  0.463320   139  60  139   0    1
4       0.200   0.303030  1.000000  0.465116   138  60  138   0    2
5       0.225   0.309278  1.000000  0.472441   134  60  134   0    6
6       0.250   0.319149  1.000000  0.483871   128  60  128   0   12
7       0.275   0.317204  0.983333  0.479675   132  59  127   1   13
8       0.300   0.325967  0.983333  0.489627   127  59  122   1   18
9       0.325   0.324022  0.966667  0.485356   131  58  121   2   19
10      0.350   0.327684  0.966667  0.489451   129  58  119   2   21
11      0.375   0.339181  0.966667  0.502165   123  58  113   2   27
12      0.400   0.364780  0.966667  0.529680   111  58  101   2   39
13      0.425   0.374194  0.966667

IMPORTANCIA DE LA FEATURES 

In [129]:
import matplotlib.pyplot as plt

importance = modelv2.feature_importances_
feature_names = dv_full.get_feature_names_out()

importance_df = pd.DataFrame({
    'feature': feature_names,
    'importance': importance}).sort_values('importance', ascending=False)

print(importance_df.head(15))

                                          feature  importance
29                         over_draft=no checking    0.104563
26                            over_draft=0<=X<200    0.065282
38                               purpose=business    0.037875
23                       other_payment_plans=bank    0.034887
7   credit_history=critical/other existing credit    0.034417
27                                  over_draft=<0    0.033876
8               credit_history=delayed previously    0.030020
37                 property_magnitude=real estate    0.027939
47                               purpose=used car    0.027303
18                               existing_credits    0.027216
15                                  employment=<1    0.025643
28                               over_draft=>=200    0.025561
24                       other_payment_plans=none    0.024884
19                               housing=for free    0.024829
11                                   credit_usage    0.023623
