In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score, confusion_matrix, classification_report, accuracy_score, f1_score
from sklearn.metrics import roc_curve, auc, roc_auc_score

from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.feature_selection import SelectKBest, SelectPercentile, f_classif, chi2, VarianceThreshold

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB, ComplementNB

from xgboost import XGBClassifier

from sklearn.model_selection import GridSearchCV

from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import ADASYN
from imblearn.over_sampling import SMOTE, SVMSMOTE
from imblearn.over_sampling import RandomOverSampler

from imblearn.ensemble import BalancedBaggingClassifier

import pickle

# Dataset Description

You are provided with an anonymized dataset containing a large number of numeric variables. The "TARGET" column is the variable to predict. It equals one for unsatisfied customers and 0 for satisfied customers.

The task is to predict the probability that each customer in the test set is an unsatisfied customer.

# Load data

In [3]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [4]:
train_data

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
0,1,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39205.170000,0
1,3,2,34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49278.030000,0
2,4,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67333.770000,0
3,8,2,37,0.0,195.0,195.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64007.970000,0
4,10,2,39,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117310.979016,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76015,151829,2,48,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,60926.490000,0
76016,151830,2,39,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,118634.520000,0
76017,151835,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,74028.150000,0
76018,151836,2,25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,84278.160000,0


In [5]:
train_data.describe()

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
count,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,...,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0
mean,75964.050723,-1523.199277,33.212865,86.208265,72.363067,119.529632,3.55913,6.472698,0.412946,0.567352,...,7.935824,1.365146,12.21558,8.784074,31.505324,1.858575,76.026165,56.614351,117235.8,0.039569
std,43781.947379,39033.462364,12.956486,1614.757313,339.315831,546.266294,93.155749,153.737066,30.604864,36.513513,...,455.887218,113.959637,783.207399,538.439211,2013.125393,147.786584,4040.337842,2852.579397,182664.6,0.194945
min,1.0,-999999.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5163.75,0.0
25%,38104.75,2.0,23.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67870.61,0.0
50%,76043.0,2.0,28.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,106409.2,0.0
75%,113748.75,2.0,40.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,118756.3,0.0
max,151838.0,238.0,105.0,210000.0,12888.03,21024.81,8237.82,11073.57,6600.0,6600.0,...,50003.88,20385.72,138831.63,91778.73,438329.22,24650.01,681462.9,397884.3,22034740.0,1.0


In [6]:
data = train_data.drop(columns=['TARGET', 'ID'], axis=1)
target = train_data['TARGET']

**Replace value -999999 in var 3**

In [7]:
data.var3 = data.var3.replace(-999999, 2)

# Split train set and test set

In [8]:
data.var3.describe()

count    76020.000000
mean         2.716483
std          9.447971
min          0.000000
25%          2.000000
50%          2.000000
75%          2.000000
max        238.000000
Name: var3, dtype: float64

In [9]:
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.3, random_state=16)

print(X_train.shape, X_test.shape)

(53214, 369) (22806, 369)


In [10]:
print("Target count y_train:")
print(y_train.value_counts())
print("\nTarget count y_test:")
print(y_test.value_counts())

Target count y_train:
0    51107
1     2107
Name: TARGET, dtype: int64

Target count y_test:
0    21905
1      901
Name: TARGET, dtype: int64


**Drop constant column**

In [11]:
std_train = X_train.std()
drop_col = std_train[std_train == 0].index
drop_col

Index(['ind_var2_0', 'ind_var2', 'ind_var18_0', 'ind_var18', 'ind_var27_0',
       'ind_var28_0', 'ind_var28', 'ind_var27', 'ind_var41', 'ind_var46_0',
       'ind_var46', 'num_var18_0', 'num_var18', 'num_var27_0', 'num_var28_0',
       'num_var28', 'num_var27', 'num_var41', 'num_var46_0', 'num_var46',
       'saldo_var18', 'saldo_var28', 'saldo_var27', 'saldo_var41',
       'saldo_var46', 'delta_imp_amort_var18_1y3', 'delta_imp_reemb_var33_1y3',
       'delta_num_reemb_var33_1y3', 'imp_amort_var18_hace3',
       'imp_amort_var18_ult1', 'imp_amort_var34_hace3',
       'imp_reemb_var13_hace3', 'imp_reemb_var33_hace3',
       'imp_reemb_var33_ult1', 'imp_trasp_var17_out_hace3',
       'imp_trasp_var33_out_hace3', 'num_var2_0_ult1', 'num_var2_ult1',
       'num_reemb_var13_hace3', 'num_reemb_var33_hace3',
       'num_reemb_var33_ult1', 'num_trasp_var17_out_hace3',
       'num_trasp_var33_out_hace3', 'saldo_var2_ult1',
       'saldo_medio_var13_medio_hace3'],
      dtype='object')

In [12]:
X_train = X_train.drop(columns=drop_col, axis=1)
X_test = X_test.drop(columns=drop_col, axis=1)

print(X_train.shape, X_test.shape)

(53214, 324) (22806, 324)


**Drop duplicate columns**

In [13]:
def duplicate_columns(frame):
    groups = frame.columns.to_series().groupby(frame.dtypes).groups
    dups = []
    for t, v in groups.items():
        dcols = frame[v].to_dict(orient="list")

        vs = list(dcols.values())
        ks = list(dcols.keys())
        lvs = len(vs)

        for i in range(lvs):
            for j in range(i+1,lvs):
                if vs[i] == vs[j]: 
                    dups.append(ks[i])
                    break

    return dups

dup_col = duplicate_columns(X_train)

In [14]:
X_train = X_train.drop(columns=dup_col, axis=1)
X_test = X_test.drop(columns=dup_col, axis=1)

In [15]:
X_train

Unnamed: 0,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,imp_op_var40_ult1,...,saldo_medio_var29_ult3,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38
6363,2,62,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,33746.310000
28436,2,33,0.0,0.0,96.75,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,150825.420000
6944,2,86,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117310.979016
18699,2,24,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117310.979016
21200,2,30,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,486302.430000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47519,2,36,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,101840.760000
16765,2,23,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,62133.270000
37316,2,22,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,266126.340000
50497,2,28,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,62214.990000


**Drop quasi-constant feature use Variance Threshold**

In [16]:
thres = 0.002
sel = VarianceThreshold(threshold=thres)
sel.fit(X_train)

X_train_sel = sel.transform(X_train)
X_test_sel = sel.transform(X_test)

selected_columns = X_train.columns[sel.get_support(indices=True)]

X_train = pd.DataFrame(X_train_sel, columns=selected_columns)
X_test = pd.DataFrame(X_test_sel, columns=selected_columns)

In [17]:
X_train.describe()

Unnamed: 0,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,imp_op_var40_ult1,...,saldo_medio_var29_ult3,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38
count,53214.0,53214.0,53214.0,53214.0,53214.0,53214.0,53214.0,53214.0,53214.0,53214.0,...,53214.0,53214.0,53214.0,53214.0,53214.0,53214.0,53214.0,53214.0,53214.0,53214.0
mean,2.707088,33.183636,84.166756,71.916898,119.059727,3.895629,6.876538,0.503707,0.703992,3.683849,...,0.266614,5.931756,1.117222,9.226325,6.377532,35.295396,2.452425,85.785084,64.177261,117378.4
std,9.416571,12.947405,1463.713898,344.447465,556.044598,102.497931,161.535245,35.749521,42.801051,108.521963,...,38.10302,363.526971,87.387975,559.567481,372.086199,2282.510651,173.842642,4559.309473,3231.356683,190723.9
min,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5163.75
25%,2.0,23.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67912.06
50%,2.0,27.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,106523.4
75%,2.0,39.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,118547.0
max,235.0,105.0,135000.0,12888.03,21024.81,8237.82,10351.95,6600.0,6600.0,8237.82,...,7331.34,43406.22,11047.77,63317.19,42767.16,438329.22,24650.01,681462.9,397884.3,22034740.0


**Fit + Evaluate model**

In [18]:
def model_score(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    
    pred = model.predict(X_test)
    
    precision = precision_score(y_test,pred).round(3)
    recall = recall_score(y_test,pred).round(3)
    F1_score = f1_score(y_test,pred).round(3)
    auc = roc_auc_score(y_test, pred).round(3)
    
    rp = classification_report(y_test, pred)
    
    return precision, recall, F1_score, auc, rp

**Save model**

In [19]:
# Use to save the best model to submits on Kaggle
def save_model(model, sc, num_features):
    model_name = model.__class__.__name__
    
    if sc == 0:
        scaler = "noScale"
    else:
        scaler = sc.__class__.__name__
    
    model_path = f'model/{model_name}'
    
    filename = f"{model_path}/{model_name}_{scaler}_{num_features}.pkl" 
    with open(filename, 'wb') as f:
        pickle.dump(model, f)

**Scaler + Feature selection**

In [20]:
scaler = [StandardScaler(), MinMaxScaler(), RobustScaler()]
percentile = [90, 85, 80, 70, 60, 50]

def eval_scale(model, detail, X_train, y_train, X_test, y_test, table):
    for sc in scaler:
        _X_train = sc.fit_transform(X_train)
        _X_test = sc.transform(X_test)
        
        num_features = _X_train.shape[1]
            
        precision, recall, F1_score, auc, rp = model_score(model, _X_train, y_train, _X_test, y_test)
        
        print(f"{sc.__class__.__name__}:\n")
        print(rp)
        print('\n================================')
        
        save_model(model, sc, num_features)
            
        table.loc[table.shape[0]] = [model.__class__.__name__,
                                         detail,
                                         sc.__class__.__name__, '-',
                                         num_features, 
                                         precision, 
                                         recall, 
                                         F1_score, 
                                         auc]

#def eval_chi2(model, detail, X_train, X_test, y_train, y_test, table):
    
    
def eval_fclassif(model, detail, X_train, y_train, X_test, y_test, table):
    for p in percentile:
        for sc in scaler:
            _X_train = sc.fit_transform(X_train)
            _X_test = sc.transform(X_test)
                  
            fc = SelectPercentile(f_classif, percentile=p)
            fc.fit(_X_train, y_train)

            _X_train = fc.transform(_X_train)
            _X_test = fc.transform(_X_test)
            
            num_features = _X_train.shape[1]
            
            precision, recall, F1_score, auc, rp = model_score(model, _X_train, y_train, _X_test, y_test)
            
            print(f"{sc.__class__.__name__}, select {num_features} features (percentile = {p}%):\n")
            print(rp)
            print('\n================================')
            
            save_model(model, sc, num_features)
            
            table.loc[table.shape[0]] = [model.__class__.__name__,
                                         detail,
                                         sc.__class__.__name__, 'UFS(f_classif)',
                                         f"{num_features} ({p}%)",
                                         precision, 
                                         recall, 
                                         F1_score, 
                                         auc]

#def eval_rfecv(model, detail, X_train, y_train, X_test, y_test, table):
    
            
def eval_all(model, detail, X_train, y_train, X_test, y_test, table):
    
    # No scale + No use UFS
    precision, recall, F1_score, auc, rp= model_score(model, X_train, y_train, X_test, y_test)
    num_features = X_train.shape[1]
    save_model(model, 0, num_features)
    table.loc[table.shape[0]] = [model.__class__.__name__,
                                         detail,
                                         '-', '-',
                                         num_features, 
                                         precision, 
                                         recall, 
                                         F1_score, 
                                         auc]
    # Only scaler
    eval_scale(model, detail, X_train, y_train, X_test, y_test, table)
    
    # Univariate feature selection + scalers
    eval_fclassif(model, detail, X_train, y_train, X_test, y_test, table)

In [21]:
base_evaluation_table = pd.DataFrame({'Model': [],
                           'Details':[],
                           'Scaler':[],
                           'Feature selection':[],
                           'No. features': [],
                           'Precision':[],
                           'Recall':[],
                           'F1-score':[], 
                           'AUC':[]})

# Logistic Regression

In [226]:
logreg_evaluation = base_evaluation_table.copy()

In [119]:
logreg1 = LogisticRegression(max_iter=500, 
                             penalty='l2', 
                             solver = 'liblinear',
                             class_weight='balanced',
                             C = 100,
                             random_state=12)
detail = 'liblinear, l2, C = 100'
eval_all(logreg1, detail, X_train, y_train, X_test, y_test, logreg_evaluation)

StandardScaler:

              precision    recall  f1-score   support

           0       0.99      0.69      0.82     21905
           1       0.09      0.76      0.17       901

    accuracy                           0.70     22806
   macro avg       0.54      0.73      0.49     22806
weighted avg       0.95      0.70      0.79     22806


MinMaxScaler:

              precision    recall  f1-score   support

           0       0.99      0.69      0.81     21905
           1       0.09      0.76      0.16       901

    accuracy                           0.69     22806
   macro avg       0.54      0.73      0.49     22806
weighted avg       0.95      0.69      0.79     22806


RobustScaler:

              precision    recall  f1-score   support

           0       0.98      0.50      0.66     21905
           1       0.06      0.81      0.12       901

    accuracy                           0.51     22806
   macro avg       0.52      0.65      0.39     22806
weighted avg       0.95  

RobustScaler, select 133 features (percentile = 50%):

              precision    recall  f1-score   support

           0       0.99      0.62      0.76     21905
           1       0.08      0.80      0.15       901

    accuracy                           0.63     22806
   macro avg       0.53      0.71      0.46     22806
weighted avg       0.95      0.63      0.74     22806






In [121]:
logreg_evaluation = logreg_evaluation.sort_values(by='AUC', ascending=False, ignore_index=True)
logreg_evaluation

Unnamed: 0,Model,Details,Scaler,Feature selection,No. features,Precision,Recall,F1-score,AUC
0,LogisticRegression,"liblinear, l2, C = 100",StandardScaler,UFS(f_classif),187 (70%),0.093,0.764,0.166,0.729
1,LogisticRegression,"liblinear, l2, C = 100",StandardScaler,UFS(f_classif),240 (90%),0.093,0.761,0.166,0.728
2,LogisticRegression,"liblinear, l2, C = 100",StandardScaler,UFS(f_classif),213 (80%),0.093,0.761,0.166,0.728
3,LogisticRegression,"liblinear, l2, C = 100",StandardScaler,-,267,0.093,0.762,0.166,0.728
4,LogisticRegression,"liblinear, l2, C = 100",StandardScaler,UFS(f_classif),227 (85%),0.093,0.76,0.165,0.727
5,LogisticRegression,"liblinear, l2, C = 100",MinMaxScaler,UFS(f_classif),187 (70%),0.092,0.76,0.165,0.726
6,LogisticRegression,"liblinear, l2, C = 100",MinMaxScaler,-,267,0.092,0.759,0.164,0.726
7,LogisticRegression,"liblinear, l2, C = 100",StandardScaler,UFS(f_classif),160 (60%),0.093,0.758,0.165,0.726
8,LogisticRegression,"liblinear, l2, C = 100",MinMaxScaler,UFS(f_classif),213 (80%),0.092,0.758,0.164,0.725
9,LogisticRegression,"liblinear, l2, C = 100",MinMaxScaler,UFS(f_classif),240 (90%),0.092,0.759,0.164,0.725


In [122]:
logreg_evaluation.to_csv('logreg_evaluation.csv', index=False)

# KNN

In [218]:
knn_evaluation = base_evaluation_table.copy()

In [219]:
n_neighbors = 3
weights = 'distance'

knn = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights, algorithm = 'ball_tree', n_jobs=5)
detail = f"n_neighbors={n_neighbors}, weights={weights}"

eval_all(knn, detail, X_train, y_train, X_test, y_test, knn_evaluation)

StandardScaler:

              precision    recall  f1-score   support

           0       0.96      0.98      0.97     21905
           1       0.15      0.07      0.10       901

    accuracy                           0.95     22806
   macro avg       0.56      0.53      0.54     22806
weighted avg       0.93      0.95      0.94     22806


MinMaxScaler:

              precision    recall  f1-score   support

           0       0.96      0.98      0.97     21905
           1       0.14      0.08      0.10       901

    accuracy                           0.95     22806
   macro avg       0.55      0.53      0.54     22806
weighted avg       0.93      0.95      0.94     22806


RobustScaler:

              precision    recall  f1-score   support

           0       0.96      0.98      0.97     21905
           1       0.14      0.07      0.09       901

    accuracy                           0.95     22806
   macro avg       0.55      0.53      0.53     22806
weighted avg       0.93  

RobustScaler, select 133 features (percentile = 50%):

              precision    recall  f1-score   support

           0       0.96      0.98      0.97     21905
           1       0.15      0.07      0.10       901

    accuracy                           0.95     22806
   macro avg       0.56      0.53      0.54     22806
weighted avg       0.93      0.95      0.94     22806




In [220]:
knn_evaluation = knn_evaluation.sort_values(by='AUC', ascending=False, ignore_index=True)
knn_evaluation

Unnamed: 0,Model,Details,Scaler,Feature selection,No. features,Precision,Recall,F1-score,AUC
0,KNeighborsClassifier,"n_neighbors=3, weights=distance",StandardScaler,UFS(f_classif),227 (85%),0.153,0.077,0.102,0.53
1,KNeighborsClassifier,"n_neighbors=3, weights=distance",StandardScaler,UFS(f_classif),213 (80%),0.152,0.077,0.102,0.53
2,KNeighborsClassifier,"n_neighbors=3, weights=distance",StandardScaler,UFS(f_classif),133 (50%),0.144,0.075,0.099,0.529
3,KNeighborsClassifier,"n_neighbors=3, weights=distance",StandardScaler,UFS(f_classif),240 (90%),0.151,0.075,0.101,0.529
4,KNeighborsClassifier,"n_neighbors=3, weights=distance",MinMaxScaler,UFS(f_classif),213 (80%),0.141,0.075,0.098,0.528
5,KNeighborsClassifier,"n_neighbors=3, weights=distance",StandardScaler,UFS(f_classif),187 (70%),0.149,0.074,0.099,0.528
6,KNeighborsClassifier,"n_neighbors=3, weights=distance",RobustScaler,UFS(f_classif),160 (60%),0.143,0.074,0.098,0.528
7,KNeighborsClassifier,"n_neighbors=3, weights=distance",MinMaxScaler,UFS(f_classif),160 (60%),0.14,0.074,0.097,0.528
8,KNeighborsClassifier,"n_neighbors=3, weights=distance",StandardScaler,UFS(f_classif),160 (60%),0.143,0.073,0.097,0.528
9,KNeighborsClassifier,"n_neighbors=3, weights=distance",RobustScaler,UFS(f_classif),187 (70%),0.142,0.074,0.098,0.528


# SVM

In [70]:
svm_evaluation = base_evaluation_table.copy()

In [71]:
C = 10
penalty = 'l2' 

svm = LinearSVC(C=C, penalty = penalty, dual = False, class_weight='balanced')
detail = f"C={C}, penalty={penalty}"

eval_all(svm, detail, X_train, y_train, X_test, y_test, svm_evaluation)



StandardScaler:

              precision    recall  f1-score   support

           0       0.99      0.69      0.81     21905
           1       0.09      0.75      0.16       901

    accuracy                           0.69     22806
   macro avg       0.54      0.72      0.49     22806
weighted avg       0.95      0.69      0.78     22806


MinMaxScaler:

              precision    recall  f1-score   support

           0       0.99      0.69      0.81     21905
           1       0.09      0.75      0.16       901

    accuracy                           0.69     22806
   macro avg       0.54      0.72      0.49     22806
weighted avg       0.95      0.69      0.78     22806


RobustScaler:

              precision    recall  f1-score   support

           0       0.99      0.43      0.60     21905
           1       0.06      0.84      0.11       901

    accuracy                           0.45     22806
   macro avg       0.52      0.64      0.36     22806
weighted avg       0.95  



StandardScaler, select 240 features (percentile = 90%):

              precision    recall  f1-score   support

           0       0.99      0.69      0.81     21905
           1       0.09      0.75      0.16       901

    accuracy                           0.69     22806
   macro avg       0.54      0.72      0.49     22806
weighted avg       0.95      0.69      0.78     22806


MinMaxScaler, select 240 features (percentile = 90%):

              precision    recall  f1-score   support

           0       0.99      0.69      0.81     21905
           1       0.09      0.76      0.16       901

    accuracy                           0.69     22806
   macro avg       0.54      0.72      0.49     22806
weighted avg       0.95      0.69      0.78     22806


RobustScaler, select 240 features (percentile = 90%):

              precision    recall  f1-score   support

           0       0.99      0.46      0.62     21905
           1       0.06      0.83      0.11       901

    accuracy 



StandardScaler, select 227 features (percentile = 85%):

              precision    recall  f1-score   support

           0       0.99      0.69      0.81     21905
           1       0.09      0.76      0.16       901

    accuracy                           0.69     22806
   macro avg       0.54      0.72      0.49     22806
weighted avg       0.95      0.69      0.78     22806


MinMaxScaler, select 227 features (percentile = 85%):

              precision    recall  f1-score   support

           0       0.99      0.69      0.81     21905
           1       0.09      0.76      0.16       901

    accuracy                           0.69     22806
   macro avg       0.54      0.72      0.49     22806
weighted avg       0.95      0.69      0.78     22806


RobustScaler, select 227 features (percentile = 85%):

              precision    recall  f1-score   support

           0       0.98      0.45      0.61     21905
           1       0.06      0.83      0.11       901

    accuracy 



StandardScaler, select 213 features (percentile = 80%):

              precision    recall  f1-score   support

           0       0.99      0.69      0.81     21905
           1       0.09      0.76      0.16       901

    accuracy                           0.69     22806
   macro avg       0.54      0.72      0.49     22806
weighted avg       0.95      0.69      0.78     22806


MinMaxScaler, select 213 features (percentile = 80%):

              precision    recall  f1-score   support

           0       0.99      0.69      0.81     21905
           1       0.09      0.76      0.16       901

    accuracy                           0.69     22806
   macro avg       0.54      0.72      0.49     22806
weighted avg       0.95      0.69      0.78     22806


RobustScaler, select 213 features (percentile = 80%):

              precision    recall  f1-score   support

           0       0.98      0.45      0.62     21905
           1       0.06      0.83      0.11       901

    accuracy 



StandardScaler, select 187 features (percentile = 70%):

              precision    recall  f1-score   support

           0       0.99      0.69      0.81     21905
           1       0.09      0.76      0.16       901

    accuracy                           0.69     22806
   macro avg       0.54      0.72      0.49     22806
weighted avg       0.95      0.69      0.78     22806


MinMaxScaler, select 187 features (percentile = 70%):

              precision    recall  f1-score   support

           0       0.99      0.69      0.81     21905
           1       0.09      0.76      0.16       901

    accuracy                           0.69     22806
   macro avg       0.54      0.72      0.49     22806
weighted avg       0.95      0.69      0.78     22806


RobustScaler, select 187 features (percentile = 70%):

              precision    recall  f1-score   support

           0       0.98      0.43      0.60     21905
           1       0.06      0.84      0.11       901

    accuracy 



StandardScaler, select 160 features (percentile = 60%):

              precision    recall  f1-score   support

           0       0.99      0.69      0.81     21905
           1       0.09      0.75      0.16       901

    accuracy                           0.69     22806
   macro avg       0.54      0.72      0.49     22806
weighted avg       0.95      0.69      0.79     22806


MinMaxScaler, select 160 features (percentile = 60%):

              precision    recall  f1-score   support

           0       0.99      0.69      0.81     21905
           1       0.09      0.75      0.16       901

    accuracy                           0.69     22806
   macro avg       0.54      0.72      0.49     22806
weighted avg       0.95      0.69      0.79     22806


RobustScaler, select 160 features (percentile = 60%):

              precision    recall  f1-score   support

           0       0.99      0.42      0.59     21905
           1       0.06      0.85      0.11       901

    accuracy 



StandardScaler, select 133 features (percentile = 50%):

              precision    recall  f1-score   support

           0       0.99      0.69      0.81     21905
           1       0.09      0.75      0.16       901

    accuracy                           0.69     22806
   macro avg       0.54      0.72      0.49     22806
weighted avg       0.95      0.69      0.79     22806


MinMaxScaler, select 133 features (percentile = 50%):

              precision    recall  f1-score   support

           0       0.99      0.69      0.81     21905
           1       0.09      0.75      0.16       901

    accuracy                           0.69     22806
   macro avg       0.54      0.72      0.49     22806
weighted avg       0.95      0.69      0.79     22806


RobustScaler, select 133 features (percentile = 50%):

              precision    recall  f1-score   support

           0       0.99      0.64      0.78     21905
           1       0.08      0.79      0.15       901

    accuracy 



In [73]:
svm_evaluation = svm_evaluation.sort_values(by='AUC', ascending=False, ignore_index=True)
svm_evaluation

Unnamed: 0,Model,Details,Scaler,Feature selection,No. features,Precision,Recall,F1-score,AUC
0,LinearSVC,"C=10, penalty=l2",StandardScaler,UFS(f_classif),187 (70%),0.091,0.757,0.162,0.723
1,LinearSVC,"C=10, penalty=l2",MinMaxScaler,UFS(f_classif),187 (70%),0.091,0.756,0.162,0.722
2,LinearSVC,"C=10, penalty=l2",MinMaxScaler,UFS(f_classif),240 (90%),0.09,0.756,0.161,0.722
3,LinearSVC,"C=10, penalty=l2",StandardScaler,UFS(f_classif),227 (85%),0.091,0.757,0.162,0.722
4,LinearSVC,"C=10, penalty=l2",MinMaxScaler,UFS(f_classif),227 (85%),0.09,0.756,0.162,0.722
5,LinearSVC,"C=10, penalty=l2",StandardScaler,UFS(f_classif),213 (80%),0.091,0.757,0.162,0.722
6,LinearSVC,"C=10, penalty=l2",StandardScaler,-,267,0.091,0.755,0.162,0.722
7,LinearSVC,"C=10, penalty=l2",StandardScaler,UFS(f_classif),160 (60%),0.091,0.754,0.162,0.722
8,LinearSVC,"C=10, penalty=l2",MinMaxScaler,UFS(f_classif),213 (80%),0.09,0.756,0.161,0.721
9,LinearSVC,"C=10, penalty=l2",MinMaxScaler,-,267,0.09,0.755,0.161,0.721


# Kernel SVM

In [229]:
kernel_svm_evaluation = base_evaluation_table.copy()

In [234]:
C=10

rbf_svc = SVC(kernel='rbf', C=C, class_weight='balanced', random_state=12)

detail = f"kernel = rbf, C={C}"

eval_all(rbf_svc, detail, X_train, y_train, X_test, y_test, kernel_svm_evaluation)

StandardScaler:

              precision    recall  f1-score   support

           0       0.98      0.75      0.85     21905
           1       0.10      0.70      0.18       901

    accuracy                           0.75     22806
   macro avg       0.54      0.72      0.52     22806
weighted avg       0.95      0.75      0.83     22806


MinMaxScaler:

              precision    recall  f1-score   support

           0       0.98      0.69      0.81     21905
           1       0.09      0.74      0.16       901

    accuracy                           0.69     22806
   macro avg       0.54      0.72      0.49     22806
weighted avg       0.95      0.69      0.79     22806


RobustScaler:

              precision    recall  f1-score   support

           0       0.99      0.01      0.01     21905
           1       0.04      1.00      0.08       901

    accuracy                           0.05     22806
   macro avg       0.51      0.50      0.04     22806
weighted avg       0.95  

RobustScaler, select 133 features (percentile = 50%):

              precision    recall  f1-score   support

           0       0.99      0.09      0.16     21905
           1       0.04      0.99      0.08       901

    accuracy                           0.12     22806
   macro avg       0.52      0.54      0.12     22806
weighted avg       0.96      0.12      0.16     22806




In [235]:
C=10

poly_svc = SVC(kernel='poly', C=C, class_weight='balanced', random_state=12)

detail = f"kernel = poly, C={C}"

eval_all(poly_svc, detail, X_train, y_train, X_test, y_test, kernel_svm_evaluation)

StandardScaler:

              precision    recall  f1-score   support

           0       0.98      0.75      0.85     21905
           1       0.09      0.62      0.16       901

    accuracy                           0.74     22806
   macro avg       0.54      0.68      0.50     22806
weighted avg       0.94      0.74      0.82     22806


MinMaxScaler:

              precision    recall  f1-score   support

           0       0.98      0.69      0.81     21905
           1       0.09      0.74      0.16       901

    accuracy                           0.69     22806
   macro avg       0.54      0.72      0.49     22806
weighted avg       0.95      0.69      0.79     22806


RobustScaler:

              precision    recall  f1-score   support

           0       0.99      0.01      0.01     21905
           1       0.04      1.00      0.08       901

    accuracy                           0.05     22806
   macro avg       0.51      0.50      0.04     22806
weighted avg       0.95  

RobustScaler, select 133 features (percentile = 50%):

              precision    recall  f1-score   support

           0       0.99      0.06      0.11     21905
           1       0.04      0.99      0.08       901

    accuracy                           0.09     22806
   macro avg       0.52      0.52      0.09     22806
weighted avg       0.96      0.09      0.11     22806




In [236]:
C=10

sigmoid_svc = SVC(kernel='sigmoid', C=C, class_weight='balanced',random_state=12)

detail = f"kernel = sigmoid, C={C}"

eval_all(sigmoid_svc, detail, X_train, y_train, X_test, y_test, kernel_svm_evaluation)

StandardScaler:

              precision    recall  f1-score   support

           0       0.98      0.65      0.78     21905
           1       0.07      0.67      0.13       901

    accuracy                           0.65     22806
   macro avg       0.53      0.66      0.46     22806
weighted avg       0.94      0.65      0.76     22806


MinMaxScaler:

              precision    recall  f1-score   support

           0       0.98      0.57      0.72     21905
           1       0.06      0.72      0.12       901

    accuracy                           0.57     22806
   macro avg       0.52      0.64      0.42     22806
weighted avg       0.94      0.57      0.69     22806


RobustScaler:

              precision    recall  f1-score   support

           0       0.98      0.01      0.01     21905
           1       0.04      1.00      0.08       901

    accuracy                           0.05     22806
   macro avg       0.51      0.50      0.05     22806
weighted avg       0.94  

RobustScaler, select 133 features (percentile = 50%):

              precision    recall  f1-score   support

           0       0.99      0.08      0.15     21905
           1       0.04      0.97      0.08       901

    accuracy                           0.12     22806
   macro avg       0.51      0.53      0.12     22806
weighted avg       0.95      0.12      0.15     22806




In [237]:
kernel_svm_evaluation = kernel_svm_evaluation.sort_values(by='AUC', ascending=False, ignore_index=True)
kernel_svm_evaluation.to_csv('kernel_svm_evaluation.csv', index=False)
kernel_svm_evaluation

Unnamed: 0,Model,Details,Scaler,Feature selection,No. features,Precision,Recall,F1-score,AUC
0,SVC,"kernel = rbf, C=10",StandardScaler,UFS(f_classif),133 (50%),0.108,0.688,0.187,0.727
1,SVC,"kernel = rbf, C=10",StandardScaler,UFS(f_classif),187 (70%),0.106,0.685,0.184,0.724
2,SVC,"kernel = rbf, C=10",StandardScaler,-,267,0.103,0.697,0.180,0.724
3,SVC,"kernel = rbf, C=10",StandardScaler,UFS(f_classif),160 (60%),0.107,0.683,0.185,0.724
4,SVC,"kernel = rbf, C=10",StandardScaler,UFS(f_classif),227 (85%),0.105,0.686,0.183,0.723
...,...,...,...,...,...,...,...,...,...
61,SVC,"kernel = poly, C=10",RobustScaler,UFS(f_classif),213 (80%),0.040,0.998,0.076,0.502
62,SVC,"kernel = poly, C=10",RobustScaler,UFS(f_classif),187 (70%),0.040,0.998,0.076,0.502
63,SVC,"kernel = sigmoid, C=10",RobustScaler,-,267,0.040,0.997,0.076,0.502
64,SVC,"kernel = poly, C=10",RobustScaler,UFS(f_classif),160 (60%),0.040,0.998,0.076,0.502


# Naive Bayes

In [239]:
nb_evaluation = base_evaluation_table.copy()

In [251]:
nor = RobustScaler()
_X_train = pd.DataFrame(nor.fit_transform(X_train), columns= X_train.columns)
_X_train

Unnamed: 0,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,imp_op_var40_ult1,...,saldo_medio_var29_ult3,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38
0,0.0,2.1875,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.437292
1,0.0,0.3750,0.0,0.0,96.75,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.874930
2,0.0,3.6875,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.213045
3,0.0,-0.1875,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.213045
4,0.0,0.1875,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.500340
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53209,0.0,0.5625,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.092479
53210,0.0,-0.2500,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.876672
53211,0.0,-0.3125,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.152033
53212,0.0,0.0625,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.875058


In [242]:
alpha = 0.1

cnb = ComplementNB(alpha=alpha)

detail = f"alpha={alpha}"

eval_all(cnb, detail, X_train, y_train, X_test, y_test, nb_evaluation)

ValueError: Negative values in data passed to ComplementNB (input X)

In [None]:
nb_evaluation = nb_evaluation.sort_values(by='AUC', ascending=False, ignore_index=True)
nb_evaluation.to_csv('nb_evaluation.csv', index=False)
nb_evaluation

# Decision Tree

# Random Forest

# Ada Boost

# Gradient Boosting

# XGBoost

In [214]:
xgb_evaluation = base_evaluation_table.copy()

In [215]:
n_estimators = 100
max_depth = 5
learning_rate=0.05
subsample=1.0
colsample_bytree=0.5
min_child_weight = 4

xgb = XGBClassifier(n_estimators=n_estimators,
                    max_depth = max_depth,
                    learning_rate=learning_rate, 
                    subsample=subsample,
                    colsample_bytree=colsample_bytree,
                    min_child_weight = min_child_weight,
                    scale_pos_weight = 24)

detail = detail = f"n_estimators: {n_estimators}, max_depth: {max_depth}, learning_rate: {learning_rate}, subsample: {subsample}, colsample_bytree: {colsample_bytree}, min_child_weight: {min_child_weight}"


eval_all(xgb, detail, X_train, y_train, X_test, y_test, xgb_evaluation)

StandardScaler:

              precision    recall  f1-score   support

           0       0.99      0.80      0.88     21905
           1       0.13      0.75      0.22       901

    accuracy                           0.79     22806
   macro avg       0.56      0.77      0.55     22806
weighted avg       0.95      0.79      0.86     22806


MinMaxScaler:

              precision    recall  f1-score   support

           0       0.99      0.80      0.88     21905
           1       0.13      0.75      0.22       901

    accuracy                           0.79     22806
   macro avg       0.56      0.77      0.55     22806
weighted avg       0.95      0.79      0.86     22806


RobustScaler:

              precision    recall  f1-score   support

           0       0.99      0.80      0.88     21905
           1       0.13      0.75      0.22       901

    accuracy                           0.79     22806
   macro avg       0.56      0.77      0.55     22806
weighted avg       0.95  

RobustScaler, select 133 features (percentile = 50%):

              precision    recall  f1-score   support

           0       0.99      0.79      0.88     21905
           1       0.13      0.75      0.22       901

    accuracy                           0.79     22806
   macro avg       0.56      0.77      0.55     22806
weighted avg       0.95      0.79      0.85     22806




In [216]:
xgb_evaluation = xgb_evaluation.sort_values(by='AUC', ascending=False, ignore_index=True)
xgb_evaluation

Unnamed: 0,Model,Details,Scaler,Feature selection,No. features,Precision,Recall,F1-score,AUC
0,XGBClassifier,"n_estimators: 100, max_depth: 5, learning_rate...",-,-,267,0.132,0.751,0.224,0.774
1,XGBClassifier,"n_estimators: 100, max_depth: 5, learning_rate...",MinMaxScaler,-,267,0.132,0.751,0.224,0.774
2,XGBClassifier,"n_estimators: 100, max_depth: 5, learning_rate...",RobustScaler,-,267,0.132,0.751,0.224,0.774
3,XGBClassifier,"n_estimators: 100, max_depth: 5, learning_rate...",StandardScaler,UFS(f_classif),160 (60%),0.131,0.75,0.223,0.773
4,XGBClassifier,"n_estimators: 100, max_depth: 5, learning_rate...",StandardScaler,UFS(f_classif),187 (70%),0.133,0.747,0.225,0.773
5,XGBClassifier,"n_estimators: 100, max_depth: 5, learning_rate...",StandardScaler,UFS(f_classif),227 (85%),0.131,0.751,0.223,0.773
6,XGBClassifier,"n_estimators: 100, max_depth: 5, learning_rate...",MinMaxScaler,UFS(f_classif),227 (85%),0.131,0.751,0.223,0.773
7,XGBClassifier,"n_estimators: 100, max_depth: 5, learning_rate...",RobustScaler,UFS(f_classif),227 (85%),0.131,0.751,0.223,0.773
8,XGBClassifier,"n_estimators: 100, max_depth: 5, learning_rate...",StandardScaler,UFS(f_classif),213 (80%),0.131,0.748,0.223,0.772
9,XGBClassifier,"n_estimators: 100, max_depth: 5, learning_rate...",RobustScaler,UFS(f_classif),213 (80%),0.131,0.748,0.224,0.772


**Tuning parameters with GridSearch**

In [151]:
std = StandardScaler()
_X_train = std.fit_transform(X_train)
_X_test = std.transform(X_test)
                  
fc = SelectPercentile(f_classif, percentile=80)
fc.fit(_X_train, y_train)

_X_train = fc.transform(_X_train)
_X_test = fc.transform(_X_test)

In [206]:
parameters = {'n_estimators': [80, 100, 200],
              'learning_rate' : [0.05,0.08,0.1],
              'max_depth': [3,5],
              'min_child_weight': [2,3,4]}

tuning_xgb = GridSearchCV(xgb, param_grid=parameters, scoring='roc_auc',cv=5,verbose=3)
tuning_xgb.fit(X_train, y_train)

Fitting 5 folds for each of 54 candidates, totalling 270 fits
[CV 1/5] END learning_rate=0.05, max_depth=3, min_child_weight=2, n_estimators=80;, score=0.833 total time=   1.6s
[CV 2/5] END learning_rate=0.05, max_depth=3, min_child_weight=2, n_estimators=80;, score=0.828 total time=   1.6s
[CV 3/5] END learning_rate=0.05, max_depth=3, min_child_weight=2, n_estimators=80;, score=0.827 total time=   1.6s
[CV 4/5] END learning_rate=0.05, max_depth=3, min_child_weight=2, n_estimators=80;, score=0.837 total time=   1.6s
[CV 5/5] END learning_rate=0.05, max_depth=3, min_child_weight=2, n_estimators=80;, score=0.844 total time=   1.5s
[CV 1/5] END learning_rate=0.05, max_depth=3, min_child_weight=2, n_estimators=100;, score=0.833 total time=   1.9s
[CV 2/5] END learning_rate=0.05, max_depth=3, min_child_weight=2, n_estimators=100;, score=0.829 total time=   1.8s
[CV 3/5] END learning_rate=0.05, max_depth=3, min_child_weight=2, n_estimators=100;, score=0.828 total time=   1.9s
[CV 4/5] END le

[CV 2/5] END learning_rate=0.05, max_depth=5, min_child_weight=3, n_estimators=200;, score=0.827 total time=   4.6s
[CV 3/5] END learning_rate=0.05, max_depth=5, min_child_weight=3, n_estimators=200;, score=0.824 total time=   4.6s
[CV 4/5] END learning_rate=0.05, max_depth=5, min_child_weight=3, n_estimators=200;, score=0.834 total time=   4.6s
[CV 5/5] END learning_rate=0.05, max_depth=5, min_child_weight=3, n_estimators=200;, score=0.838 total time=   4.6s
[CV 1/5] END learning_rate=0.05, max_depth=5, min_child_weight=4, n_estimators=80;, score=0.830 total time=   2.1s
[CV 2/5] END learning_rate=0.05, max_depth=5, min_child_weight=4, n_estimators=80;, score=0.826 total time=   2.1s
[CV 3/5] END learning_rate=0.05, max_depth=5, min_child_weight=4, n_estimators=80;, score=0.829 total time=   2.1s
[CV 4/5] END learning_rate=0.05, max_depth=5, min_child_weight=4, n_estimators=80;, score=0.840 total time=   2.0s
[CV 5/5] END learning_rate=0.05, max_depth=5, min_child_weight=4, n_estimato

[CV 3/5] END learning_rate=0.08, max_depth=5, min_child_weight=2, n_estimators=100;, score=0.826 total time=   2.5s
[CV 4/5] END learning_rate=0.08, max_depth=5, min_child_weight=2, n_estimators=100;, score=0.834 total time=   2.5s
[CV 5/5] END learning_rate=0.08, max_depth=5, min_child_weight=2, n_estimators=100;, score=0.837 total time=   2.5s
[CV 1/5] END learning_rate=0.08, max_depth=5, min_child_weight=2, n_estimators=200;, score=0.824 total time=   4.6s
[CV 2/5] END learning_rate=0.08, max_depth=5, min_child_weight=2, n_estimators=200;, score=0.822 total time=   4.6s
[CV 3/5] END learning_rate=0.08, max_depth=5, min_child_weight=2, n_estimators=200;, score=0.818 total time=   4.6s
[CV 4/5] END learning_rate=0.08, max_depth=5, min_child_weight=2, n_estimators=200;, score=0.830 total time=   4.6s
[CV 5/5] END learning_rate=0.08, max_depth=5, min_child_weight=2, n_estimators=200;, score=0.829 total time=   4.6s
[CV 1/5] END learning_rate=0.08, max_depth=5, min_child_weight=3, n_esti

[CV 5/5] END learning_rate=0.1, max_depth=3, min_child_weight=4, n_estimators=80;, score=0.843 total time=   1.6s
[CV 1/5] END learning_rate=0.1, max_depth=3, min_child_weight=4, n_estimators=100;, score=0.833 total time=   1.9s
[CV 2/5] END learning_rate=0.1, max_depth=3, min_child_weight=4, n_estimators=100;, score=0.830 total time=   1.9s
[CV 3/5] END learning_rate=0.1, max_depth=3, min_child_weight=4, n_estimators=100;, score=0.829 total time=   1.8s
[CV 4/5] END learning_rate=0.1, max_depth=3, min_child_weight=4, n_estimators=100;, score=0.838 total time=   1.8s
[CV 5/5] END learning_rate=0.1, max_depth=3, min_child_weight=4, n_estimators=100;, score=0.842 total time=   1.8s
[CV 1/5] END learning_rate=0.1, max_depth=3, min_child_weight=4, n_estimators=200;, score=0.831 total time=   3.4s
[CV 2/5] END learning_rate=0.1, max_depth=3, min_child_weight=4, n_estimators=200;, score=0.830 total time=   3.4s
[CV 3/5] END learning_rate=0.1, max_depth=3, min_child_weight=4, n_estimators=200

In [207]:
tuning_xgb.best_params_

{'learning_rate': 0.1,
 'max_depth': 3,
 'min_child_weight': 3,
 'n_estimators': 80}

In [208]:
pred = tuning_xgb.predict(X_test)
print(classification_report(y_test, pred))
auc = roc_auc_score(y_test, pred).round(3)
print(auc)

              precision    recall  f1-score   support

           0       0.99      0.77      0.87     21905
           1       0.12      0.77      0.21       901

    accuracy                           0.77     22806
   macro avg       0.56      0.77      0.54     22806
weighted avg       0.95      0.77      0.84     22806

0.774


# Hist Gradient Boosting

# LightGBM

# Voting

# Stacking

# Neural Network

# Submission Kaggle

In [180]:
id_test = test_data['ID']
X_test_sub = pd.DataFrame(test_data.drop(['ID'], axis=1))

In [181]:
X_test_sub

Unnamed: 0,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,imp_op_var40_ult1,...,saldo_medio_var29_ult3,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38
0,2,32,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40532.100000
1,2,35,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,45486.720000
2,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,46993.950000
3,2,24,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,187898.610000
4,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,73649.730000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75813,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40243.200000
75814,2,26,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,146961.300000
75815,2,24,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,167299.770000
75816,2,40,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117310.979016


In [168]:
sc = StandardScaler()
_X_train = sc.fit_transform(X_train)
               
fc = SelectPercentile(f_classif, percentile=80)
fc.fit(_X_train, y_train)

In [182]:
X_test_sub = X_test_sub.drop(columns=dup_col, axis=1)
X_test_sub = X_test_sub.drop(columns=drop_col, axis=1)

X_test_sub = sel.transform(X_test_sub)
# X_test_sub = sc.transform(X_test_sub)
# X_test_sub = fc.transform(X_test_sub)

In [183]:
X_test_sub.shape

(75818, 267)

In [202]:
with open('model/XGBClassifier/XGBClassifier_noScale_267.pkl', 'rb') as f:
    clf = pickle.load(f)

In [203]:
y_pred= clf.predict_proba(X_test_sub)[:,1]

In [204]:
submission = pd.DataFrame({"ID":id_test, "TARGET":y_pred})
submission.to_csv("submission.csv", index=False)