In [22]:
import pandas as pd

df = pd.read_csv('bank+marketing/bank-additional/bank-additional-full.csv', delimiter=';')
df.head()


Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [23]:
cols_to_drop = ['duration', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']
df = df.drop(columns=cols_to_drop)
df = df.rename(columns={'job': 'job_type', 'default': 'default_status', 
                                                   'housing': 'housing_loan_status', 'loan': 'personal_loan_status', 
                                                   'contact': 'contact_type', 'month': 'contact_month', 
                                                   'day_of_week': 'contact_day_of_week', 'campaign': 'num_contacts', 
                                                   'pdays': 'days_last_contact', 'previous': 'previous_contacts', 
                                                   'poutcome': 'previous_outcome', 
                                                   'y': 'result'
                                                    })
df.head()

Unnamed: 0,age,job_type,marital,education,default_status,housing_loan_status,personal_loan_status,contact_type,contact_month,contact_day_of_week,num_contacts,days_last_contact,previous_contacts,previous_outcome,result
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,1,999,0,nonexistent,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,1,999,0,nonexistent,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,1,999,0,nonexistent,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,1,999,0,nonexistent,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,1,999,0,nonexistent,no


In [34]:
df['housing_loan_status'].value_counts()

housing_loan_status
yes        21576
no         18622
unknown      990
Name: count, dtype: int64

In [36]:
df['personal_loan_status'].value_counts()

personal_loan_status
no         33950
yes         6248
unknown      990
Name: count, dtype: int64

In [37]:
df['contact_type'].value_counts()

contact_type
cellular     26144
telephone    15044
Name: count, dtype: int64

In [39]:
df.value_counts('contact_month')

contact_month
may    13769
jul     7174
aug     6178
jun     5318
nov     4101
apr     2632
oct      718
sep      570
mar      546
dec      182
Name: count, dtype: int64

In [27]:
df['result'].value_counts()

result
no     36548
yes     4640
Name: count, dtype: int64

In [50]:
from sklearn.model_selection import train_test_split

X = df.drop(columns='result')
y = df['result']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=8)
X_train.columns


Index(['age', 'job_type', 'marital', 'education', 'default_status',
       'housing_loan_status', 'personal_loan_status', 'contact_type',
       'contact_month', 'contact_day_of_week', 'num_contacts',
       'days_last_contact', 'previous_contacts', 'previous_outcome'],
      dtype='object')

In [105]:
import numpy as np
from sklearn.pipeline import Pipeline
from category_encoders.target_encoder import TargetEncoder
from xgboost import XGBClassifier
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
import category_encoders as ce

from skopt import BayesSearchCV
from skopt.space import Real, Integer


ordinal_columns = ['contact_month', 'contact_day_of_week']
one_hot_columns = ['contact_type', 'marital', 'default_status', 'housing_loan_status', 'personal_loan_status', 'previous_outcome']
lbl_bin_columns = ['job_type', 'education']
ordinal_encoder = OrdinalEncoder(categories=[
                                              ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec'],
                                              ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']
                                            ]
                                )
one_hot_encoder = OneHotEncoder(sparse=False)
lbl_bin_encoder = ce.BinaryEncoder()
ct = ColumnTransformer(
        transformers=[
            ('ordinal', ordinal_encoder, ordinal_columns),
            ('onehot', one_hot_encoder, one_hot_columns),
            ('binary', lbl_bin_encoder, lbl_bin_columns),
        ],
        remainder='passthrough'
    )
pipe = Pipeline(steps=
                (
                  ('preprocess', ct),
                  ('clf', XGBClassifier(random_state=8))
                ))
search_space = {
    # 'clf__max_depth': np.arange(2,8),
    'clf__learning_rate': Real(0.001, 1.0, prior='log-uniform'),
    'clf__subsample': Real(0.5, 1.0),
    'clf__colsample_bytree': Real(0.5, 1.0),
    'clf__colsample_bylevel': Real(0.5, 1.0),
    'clf__colsample_bynode' : Real(0.5, 1.0),
    'clf__reg_alpha': Real(0.0, 10.0),
    'clf__reg_lambda': Real(0.0, 10.0),
    'clf__gamma': Real(0.0, 10.0)
}
opt = BayesSearchCV(pipe, search_space, cv=3, n_iter=10, scoring='roc_auc', random_state=8)
opt.fit(X_train, y_train)



ValueError: Invalid classes inferred from unique values of `y`.  Expected: [0 1], got ['no' 'yes']

In [115]:
ct = ColumnTransformer(
        transformers=[
            ('ordinal', ordinal_encoder, ordinal_columns),
            ('onehot', one_hot_encoder, one_hot_columns),
            ('binary', lbl_bin_encoder, lbl_bin_columns),
        ],
        remainder='passthrough'
    )
pipe = Pipeline(steps=
                (
                  ('preprocess', ct)
                #   ('clf', XGBClassifier(random_state=8))
                ))

transformed_data = ct.fit_transform(X_train)
d = pd.DataFrame(transformed_data)
d.columns = ct.get_feature_names_out()
d.head()
d.dtypes



ordinal__contact_month                  float64
ordinal__contact_day_of_week            float64
onehot__contact_type_cellular           float64
onehot__contact_type_telephone          float64
onehot__marital_divorced                float64
onehot__marital_married                 float64
onehot__marital_single                  float64
onehot__marital_unknown                 float64
onehot__default_status_no               float64
onehot__default_status_unknown          float64
onehot__default_status_yes              float64
onehot__housing_loan_status_no          float64
onehot__housing_loan_status_unknown     float64
onehot__housing_loan_status_yes         float64
onehot__personal_loan_status_no         float64
onehot__personal_loan_status_unknown    float64
onehot__personal_loan_status_yes        float64
onehot__previous_outcome_failure        float64
onehot__previous_outcome_nonexistent    float64
onehot__previous_outcome_success        float64
binary__job_type_0                      

In [111]:
X_train.head()

Unnamed: 0,age,job_type,marital,education,default_status,housing_loan_status,personal_loan_status,contact_type,contact_month,contact_day_of_week,num_contacts,days_last_contact,previous_contacts,previous_outcome
16436,32,technician,single,high.school,no,yes,no,cellular,jul,wed,1,999,0,nonexistent
28275,32,technician,single,university.degree,no,no,no,cellular,apr,wed,3,999,0,nonexistent
27189,44,housemaid,married,high.school,no,no,no,cellular,nov,fri,6,999,1,failure
33150,48,blue-collar,married,basic.9y,no,yes,yes,cellular,may,tue,1,999,1,failure
37445,33,admin.,single,university.degree,no,yes,no,cellular,aug,wed,2,999,2,failure
