# Modelo GB con hiperparametros

In [41]:
import pandas as pd
from collections import Counter
from sklearn.model_selection import train_test_split
import pandas as pd
import pickle
from sklearn.metrics import roc_auc_score, accuracy_score, classification_report
from sklearn.pipeline import Pipeline
import warnings
warnings.filterwarnings('ignore')
from sklearn.ensemble import GradientBoostingClassifier
import numpy as np
from imblearn.under_sampling import RandomUnderSampler



### Funciones

In [42]:
def evaluate_model(y_test, pred, pred_proba = None):
    if pred_proba is not None:
        print('ROC-AUC score of the model: {}'.format(roc_auc_score(y_test, pred_proba[:, 1])))
    print('Accuracy of the model: {}\n'.format(accuracy_score(y_test, pred)))
    print('Classification report: \n{}\n'.format(classification_report(y_test, pred)))

### Division de los datos en train, test y validacion

In [43]:
new_df_base = pd.read_csv("../data/new_df_base.csv")
new_df_base.head()


Unnamed: 0,fraud_bool,income,name_email_similarity,prev_address_months_count,current_address_months_count,customer_age,days_since_request,intended_balcon_amount,zip_count_4w,velocity_6h,...,housing_status_BE,housing_status_BF,housing_status_BG,source_INTERNET,source_TELEAPP,device_os_linux,device_os_macintosh,device_os_other,device_os_windows,device_os_x11
0,1,0.9,0.166828,,88.0,50,0.020925,,769,10650.765523,...,0,0,0,1,0,0,0,0,1,0
1,1,0.9,0.296286,,144.0,50,0.005418,,366,534.047319,...,0,0,0,1,0,0,0,0,1,0
2,1,0.9,0.044985,,132.0,40,3.108549,,870,4048.534263,...,0,0,0,1,0,0,0,1,0,0
3,1,0.9,0.159511,,22.0,50,0.019079,,810,3457.064063,...,0,0,0,1,0,1,0,0,0,0
4,1,0.9,0.596414,,218.0,50,0.004441,,890,5020.341679,...,0,0,0,1,0,0,1,0,0,0


In [44]:
# Separate the feature matrix and target variable
X = new_df_base.drop(['fraud_bool'], axis=1)
y = new_df_base['fraud_bool']

# Impute missing values with mean
X = X.apply(lambda x: x.fillna(x.mean()))

In [45]:
# Dropping constant features from Variance Threshold test
X.drop(['device_fraud_count'], axis=1, inplace=True)

# # Selecting features from ExtraTreesClassifier test
# selected_features = [feature.replace('scaled__', '') for feature in best_extra_features]
# X = X[selected_features]

In [46]:
X_train_0, X_test, y_train_0, y_test = train_test_split(X, y, test_size=0.2, random_state=12)


In [47]:
# Crear el objeto RandomUnderSampler con la proporción deseada
under_sampler = RandomUnderSampler(sampling_strategy=0.1, random_state=12)

# Aplicar undersampling a tus datos
X_train_2, y_train_2 = under_sampler.fit_resample(X_train_0, y_train_0)

# Verificar la proporción después del undersampling
print('Undersampled dataset shape %s' % Counter(y_train_2))


Undersampled dataset shape Counter({0: 88940, 1: 8894})


### Entrenamiento del modelo con hiperparametros

In [48]:
gb_model = Pipeline([
    ('classifier', GradientBoostingClassifier(random_state=12))
])

In [49]:
#Definir los parámetros a probar
from sklearn.model_selection import RandomizedSearchCV
gb_params = {
    'classifier__max_depth': [2, 4, 6, 8, 10],
    'classifier__learning_rate': [0.05, 0.1, 0.15, 0.20],
    'classifier__n_estimators': [20, 40, 60, 80, 100],
    'classifier__min_samples_split': [2, 3, 4],
    'classifier__min_samples_leaf': [1, 2, 3, 4],
    'classifier__subsample': [0.6, 0.8, 1.0]
}

CV= RandomizedSearchCV(estimator=gb_model, param_distributions=gb_params, cv=3, n_jobs=-1, verbose=2)

                                      

In [50]:
# Entrenar el modelo con los datos de entrenamiento
GB_models=CV.fit(X_train_2, y_train_2)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


In [51]:
# Lo guardamos en un archivo pickle

#with open('../models/GB_model_01.pkl', 'wb') as file:
#    pickle.dump(GB_models, file)

In [52]:
# Cargamos el modelo GB
with open('../models/GB_model_01.pkl', 'rb') as file:
    GB_model = pickle.load(file)

# Prediccion con el modelo
pred_gb = GB_model.best_estimator_.predict(X_test)

# Prediccion de probabilidades con el modelo
pred_proba_gb = GB_model.best_estimator_.predict_proba(X_test)

In [53]:
# Evaluación del modelo
evaluate_model(y_test, pred_gb)

Accuracy of the model: 0.97783

Classification report: 
              precision    recall  f1-score   support

           0       0.99      0.98      0.99    197865
           1       0.19      0.32      0.24      2135

    accuracy                           0.98    200000
   macro avg       0.59      0.65      0.61    200000
weighted avg       0.98      0.98      0.98    200000


