In [None]:
import pandas as pd

# Load the semicolon-separated bank-marketing export and preview the first rows.
df = pd.read_csv(
    'bank+marketing/bank-additional/bank-additional-full.csv',
    sep=';',
)
df.head()


In [None]:
# Columns removed before modelling.
# NOTE(review): `duration` is presumably dropped because it is only known once
# a call has finished (target leakage) -- confirm the rationale for the others.
cols_to_drop = ['duration', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']

# Raw column name -> descriptive name used by every later cell.
column_renames = {
    'job': 'job_type',
    'default': 'default_status',
    'housing': 'housing_loan_status',
    'loan': 'personal_loan_status',
    'contact': 'contact_type',
    'month': 'contact_month',
    'day_of_week': 'contact_day_of_week',
    'campaign': 'num_contacts',
    'pdays': 'days_last_contact',
    'previous': 'previous_contacts',
    'poutcome': 'previous_outcome',
    'y': 'result',
}

df = (
    df.drop(columns=cols_to_drop)
      .rename(columns=column_renames)
      # Encode the target explicitly as int64. `Series.replace` on an object
      # column only produced integers through silent downcasting, which is
      # deprecated in pandas 2.2. With `map`, any label other than 'yes'/'no'
      # becomes NaN and makes `astype` raise instead of slipping through.
      .assign(result=lambda frame: frame['result'].map({'yes': 1, 'no': 0}).astype('int64'))
)
df.head()


In [None]:
from sklearn.model_selection import train_test_split

# Separate the binary target from the predictor columns.
y = df['result']
X = df.drop(columns='result')

# Hold out 20% of the rows for testing; stratifying on the target keeps the
# class ratio the same in both splits, and the fixed seed makes it repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=8,
    stratify=y,
)




In [None]:
import numpy as np
from sklearn.pipeline import Pipeline
from category_encoders.target_encoder import TargetEncoder
from xgboost import XGBClassifier
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
import category_encoders as ce

from skopt import BayesSearchCV
from skopt.space import Real, Integer


# Column groups, one per encoding strategy used by the ColumnTransformer.
ordinal_columns = ['contact_month', 'contact_day_of_week']
one_hot_columns = ['contact_type', 'marital', 'default_status', 'housing_loan_status', 'personal_loan_status', 'previous_outcome']
lbl_bin_columns = ['job_type', 'education']

# Category lists are positional: the first belongs to `contact_month`, the
# second to `contact_day_of_week`. Listing them in calendar order makes the
# encoded integers follow that order.
ordinal_encoder = OrdinalEncoder(
    categories=[
        ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec'],
        ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun'],
    ]
)

# Dense one-hot output; categories unseen during fit are encoded as all zeros.
# The keyword for dense output was renamed from `sparse` to `sparse_output` in
# scikit-learn 1.2 and the old name was removed in 1.4, so try the new name
# first and fall back for older installations.
try:
    one_hot_encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
except TypeError:
    one_hot_encoder = OneHotEncoder(sparse=False, handle_unknown="ignore")

lbl_bin_encoder = ce.BinaryEncoder()
# Route each column group to its encoder; columns not listed in any group are
# passed through unchanged.
ct = ColumnTransformer(
    [
        ('ordinal', ordinal_encoder, ordinal_columns),
        ('onehot', one_hot_encoder, one_hot_columns),
        ('binary', lbl_bin_encoder, lbl_bin_columns),
    ],
    remainder='passthrough',
)

# Preprocessing followed by the gradient-boosted classifier; verbose=True makes
# the pipeline report each step as it is fitted.
xgb_clf = XGBClassifier(random_state=8)
pipe = Pipeline(
    [
        ('preprocess', ct),
        ('clf', xgb_clf),
    ],
    verbose=True,
)

# Hyper-parameter ranges for the 'clf' pipeline step, grouped by kind.
# Row/column subsampling fractions, each searched uniformly in [0.5, 1.0].
sampling_space = {
    'clf__subsample': Real(0.5, 1.0),
    'clf__colsample_bytree': Real(0.5, 1.0),
    'clf__colsample_bylevel': Real(0.5, 1.0),
    'clf__colsample_bynode': Real(0.5, 1.0),
}
# Regularisation strengths, each searched uniformly in [0.0, 10.0].
regularisation_space = {
    'clf__reg_alpha': Real(0.0, 10.0),
    'clf__reg_lambda': Real(0.0, 10.0),
    'clf__gamma': Real(0.0, 10.0),
}
search_space = {
    # 'clf__max_depth': np.arange(2,8),
    'clf__learning_rate': Real(0.001, 1.0, prior='log-uniform'),
    **sampling_space,
    **regularisation_space,
}

# Bayesian search: 10 sampled configurations, 3-fold CV, ranked by ROC AUC.
opt = BayesSearchCV(
    estimator=pipe,
    search_spaces=search_space,
    n_iter=10,
    scoring='roc_auc',
    cv=3,
    random_state=8,
    verbose=True,
)
opt.fit(X_train, y_train)

In [None]:
# Show the pipeline configuration selected by the search.
opt.best_estimator_


In [None]:
# Score of the best candidate found during the search (the search was built
# with scoring='roc_auc' and cv=3).
opt.best_score_

In [None]:
# Evaluate the fitted search object on the 20% hold-out split.
# NOTE(review): presumably this reports ROC AUC because the search was
# configured with scoring='roc_auc' -- confirm.
opt.score(X_test, y_test)

In [None]:
# Predicted class labels for the hold-out rows.
opt.predict(X_test)


In [None]:
# Predicted class probabilities for the hold-out rows.
opt.predict_proba(X_test)

In [None]:
# List the (name, estimator) steps of the best pipeline.
opt.best_estimator_.steps

In [None]:
from xgboost import plot_importance

# Work only with objects taken from the refit best pipeline. The module-level
# `ct` is never fitted by the search (the search fits clones of `pipe`), so
# asking it for feature names fails on a fresh top-to-bottom run and only
# worked when a later cell had fitted `ct` beforehand.
best_pipeline = opt.best_estimator_
fitted_preprocessor = best_pipeline.named_steps['preprocess']

xgboost_step = best_pipeline.steps[1]  # the ('clf', classifier) tuple
xgboost_model = xgboost_step[1]

# Attach readable column names so the importance plot is labelled with the
# encoded feature names rather than positional f0, f1, ... identifiers.
xgboost_model.get_booster().feature_names = fitted_preprocessor.get_feature_names_out().tolist()
plot_importance(xgboost_model)

In [165]:
# Rebuild the preprocessing step on its own and fit it on the training
# predictors to inspect the encoded feature matrix.
ct = ColumnTransformer(
    [
        ('ordinal', ordinal_encoder, ordinal_columns),
        ('onehot', one_hot_encoder, one_hot_columns),
        ('binary', lbl_bin_encoder, lbl_bin_columns),
    ],
    remainder='passthrough',
)

transformed_data = ct.fit_transform(X_train)

# Label the encoded columns with the transformer-prefixed feature names.
d = pd.DataFrame(transformed_data, columns=ct.get_feature_names_out())

# Only the final d.info() call produces output in this cell; the values of
# d.head() and d.dtypes are not the last expression and are therefore discarded.
d.head()
d.dtypes
d.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32950 entries, 0 to 32949
Data columns (total 32 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   ordinal__contact_month                32950 non-null  float64
 1   ordinal__contact_day_of_week          32950 non-null  float64
 2   onehot__contact_type_cellular         32950 non-null  float64
 3   onehot__contact_type_telephone        32950 non-null  float64
 4   onehot__marital_divorced              32950 non-null  float64
 5   onehot__marital_married               32950 non-null  float64
 6   onehot__marital_single                32950 non-null  float64
 7   onehot__marital_unknown               32950 non-null  float64
 8   onehot__default_status_no             32950 non-null  float64
 9   onehot__default_status_unknown        32950 non-null  float64
 10  onehot__default_status_yes            32950 non-null  float64
 11  onehot__housing



In [None]:
# Inspect the training target. Only the last expression (the dtype) is shown
# as the cell output; the head() result on the line before it is discarded.
y_train.head()
y_train.dtypes