In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.model_selection import StratifiedKFold, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.metrics import f1_score, accuracy_score
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline

from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier, cv, Pool
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression

In [2]:
all_data = pd.read_csv('Bank Marketing.csv')

In [3]:
all_data.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day_of_week,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,,5,may,261,1,-1,0,,no
1,44,technician,single,secondary,no,29,yes,no,,5,may,151,1,-1,0,,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,,5,may,76,1,-1,0,,no
3,47,blue-collar,married,,no,1506,yes,no,,5,may,92,1,-1,0,,no
4,33,,single,,no,1,no,no,,5,may,198,1,-1,0,,no


### [Dataset Source](https://www.kaggle.com/datasets/alexkataev/bank-marketing-data-set)


### Attribute Information:
#### Bank client data:
<ul>
<li><b>Age</b> (numeric)</li>
<li><b>Job</b> : type of job (categorical: 'admin.', 'blue-collar', 'entrepreneur', 'housemaid', 'management', 'retired', 'self-employed', 'services', 'student', 'technician', 'unemployed', 'unknown')</li>
<li><b>Marital</b> : marital status (categorical: 'divorced', 'married', 'single', 'unknown' ; note: 'divorced' means divorced or widowed)</li>
<li><b>Education</b> (categorical: 'basic.4y', 'basic.6y', 'basic.9y', 'high.school', 'illiterate', 'professional.course', 'university.degree', 'unknown')</li>
<li><b>Default</b>: has credit in default? (categorical: 'no', 'yes', 'unknown')</li>
<li><b>Housing</b>: has housing loan? (categorical: 'no', 'yes', 'unknown')</li>
<li><b>Loan</b>: has personal loan? (categorical: 'no', 'yes', 'unknown')</li>
</ul>

#### Related with the last contact of the current campaign:
<ul>
<li><b>Contact</b>: contact communication type (categorical:
'cellular','telephone')</li>
<li><b>Month</b>: last contact month of year (categorical: 'jan', 'feb', 'mar',
…, 'nov', 'dec')</li>
<li><b>Day_of_week</b>: last contact day of the week (categorical:
'mon','tue','wed','thu','fri')</li>
<li><b>Duration</b>: last contact duration, in seconds (numeric). Important
note: this attribute highly affects the output target (e.g., if
duration=0 then y='no'). Yet, the duration is not known before a call
is performed. Also, after the end of the call y is obviously known.
Thus, this input should only be included for benchmark purposes and
should be discarded if the intention is to have a realistic
predictive model.</li>
</ul>

#### Other attributes:
</ul>
<li><b>Campaign</b>: number of contacts performed during this campaign and for
this client (numeric, includes last contact)</li>
<li><b>Pdays</b>: number of days that passed by after the client was last
contacted from a previous campaign (numeric; 999 means client was not
previously contacted)</li>
<li><b>Previous</b>: number of contacts performed before this campaign and for
this client (numeric)</li>
<li><b>Poutcome</b>: outcome of the previous marketing campaign (categorical:
'failure','nonexistent','success')</li>
</ul>

#### Output variable (desired target):
</ul>
<li><b>y</b> - has the client subscribed a term deposit? (binary: 'yes', 'no')</li>

We will drop 'contact', 'day_of_week', 'month', 'duration' because contact information of current compaign strongly correlate with target variable. We will also drop information about previous campaign because it has a lot of nan values.

In [4]:
all_data.drop(columns = ['contact', 'day_of_week', 'month', 'duration', 'pdays', 'previous', 'poutcome'], inplace = True)

To keep it easy lets also drop customers with blank information in either attribure.

In [5]:
all_data.dropna(inplace = True)
all_data

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,campaign,y
0,58,management,married,tertiary,no,2143,yes,no,1,no
1,44,technician,single,secondary,no,29,yes,no,1,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,1,no
5,35,management,married,tertiary,no,231,yes,no,1,no
6,28,management,single,tertiary,no,447,yes,yes,1,no
...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,tertiary,no,825,no,no,3,yes
45207,71,retired,divorced,primary,no,1729,no,no,2,yes
45208,72,retired,married,secondary,no,5715,no,no,5,yes
45209,57,blue-collar,married,secondary,no,668,no,no,4,no


Now this is the dataset we will be working with. First, lets perform ordinal encoding of our object-type attributes.

In [6]:
s = (all_data.dtypes == 'object')
object_cols = list(s[s].index)
ordinal_encoder = OrdinalEncoder(dtype = int)
tmp = all_data.copy()
tmp[object_cols] = ordinal_encoder.fit_transform(tmp[object_cols])

X, y = tmp.drop(columns = 'y'), tmp['y']

In [7]:
skf = StratifiedKFold(shuffle = True, random_state=42)

## Training the models

In [8]:
results = pd.DataFrame(columns = ['f1', 'accuracy'])

### Decision tree

In [9]:
dt_params = {'max_depth': [5, 10, 50, 100, 200],
             'min_samples_split': [2, 10, 20],
             'min_samples_leaf': [1, 5, 10]}

In [10]:
dt_clf = GridSearchCV(tree.DecisionTreeClassifier(),
                      dt_params,
                      cv = skf,
                      n_jobs = -1,
                      scoring = ['f1', 'accuracy'],
                      refit = 'f1',
                      verbose = 1)
dt_clf.fit(X, y)

Fitting 5 folds for each of 45 candidates, totalling 225 fits


In [11]:
dt_f1_mean = dt_clf.cv_results_['mean_test_f1'][dt_clf.cv_results_['rank_test_f1'].argmin()]
dt_acc_mean = dt_clf.cv_results_['mean_test_accuracy'][dt_clf.cv_results_['rank_test_f1'].argmin()]

results.loc['Decision Tree'] = [dt_f1_mean, dt_acc_mean]

In [12]:
results

Unnamed: 0,f1,accuracy
Decision Tree,0.272787,0.818443


### Random forest

In [13]:
rf_params = {'n_estimators': [100, 200],
             'max_depth': [10, None],
             'min_samples_split': [2, 10],
             'min_samples_leaf': [1, 10]}

In [14]:
rf_clf = GridSearchCV(RandomForestClassifier(),
                      rf_params,
                      cv = skf,
                      n_jobs = -1,
                      scoring = ['f1', 'accuracy'],
                      refit = 'f1',
                      verbose = 1)
rf_clf.fit(X, y)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


In [15]:
rf_f1_mean = rf_clf.cv_results_['mean_test_f1'][rf_clf.cv_results_['rank_test_f1'].argmin()]
rf_acc_mean = rf_clf.cv_results_['mean_test_accuracy'][rf_clf.cv_results_['rank_test_f1'].argmin()]

results.loc['Random Forest'] = [rf_f1_mean, rf_acc_mean]

### xGboost

In [16]:
xgb_params = {'min_child_weight': [1, 5, 10],
              'gamma': [0.5, 1, 1.5, 2, 5],
              'subsample': [0.6, 0.8, 1.0],
              'colsample_bytree': [0.6, 0.8, 1.0],
              'max_depth': [3, 4, 5]}

In [17]:
xgb = XGBClassifier(learning_rate=0.02,
                    n_estimators=600,
                    objective='binary:logistic',
                    silent=True)

In [18]:
xgb_rs = RandomizedSearchCV(xgb,
                            param_distributions = xgb_params,
                            n_iter = 30,
                            scoring = ['f1', 'accuracy'],
                            refit = 'f1',
                            n_jobs = -1,
                            cv = skf,
                            verbose = 1)
xgb_rs.fit(X, y)

Fitting 5 folds for each of 30 candidates, totalling 150 fits


  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):
Parameters: { "silent" } are not used.



In [19]:
xgb_f1_mean = xgb_rs.cv_results_['mean_test_f1'][xgb_rs.cv_results_['rank_test_f1'].argmin()]
xgb_acc_mean = xgb_rs.cv_results_['mean_test_accuracy'][xgb_rs.cv_results_['rank_test_f1'].argmin()]

results.loc['xGboost'] = [xgb_f1_mean, xgb_acc_mean]

### CatBoost

In [20]:
cat_features = X.select_dtypes(include=['object']).columns.tolist()
cat = CatBoostClassifier(cat_features=cat_features)

In [21]:
def cv_scores(cv_data):
    cv_data.head(10)

    best_acc_value = cv_data['test-Accuracy-mean'].max()
    best_acc_iter = cv_data['test-Accuracy-mean'].values.argmax()

    best_f1_value = cv_data['test-F1-mean'].max()
    best_f1_iter = cv_data['test-F1-mean'].values.argmax()
    return best_f1_value, best_acc_value

In [22]:
cat_params = {'loss_function': 'Logloss',
              'iterations': 100,
              'custom_loss': ['F1', 'Accuracy'],
              'learning_rate': 0.5,}

In [23]:
train_pool = Pool(data=X, label=y, cat_features=cat_features, has_header=True)
cv_data = cv(params = cat_params,
             pool = train_pool,
             verbose = False,
             folds = skf,
             plot=True)

cat_f1_mean, cat_acc_mean = cv_scores(cv_data)

results.loc['CatBoost'] = [cat_f1_mean, cat_acc_mean]

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Training on fold [0/5]

bestTest = 0.3276070129
bestIteration = 16

Training on fold [1/5]

bestTest = 0.3292231054
bestIteration = 11

Training on fold [2/5]

bestTest = 0.3274259987
bestIteration = 22

Training on fold [3/5]

bestTest = 0.3259496309
bestIteration = 25

Training on fold [4/5]

bestTest = 0.3331467701
bestIteration = 7



### kNN

Since KNN cant handle categorical features, we will one-hot-encode every categorical feature and then reduce dimensionality with svd decomposition.

In [24]:
oh_enc = OneHotEncoder(drop = 'first')
X_oh = oh_enc.fit_transform(X)

In [25]:
decomp = TruncatedSVD()
knn = KNeighborsClassifier()

knn_pipe = Pipeline(steps=[('svd', decomp),
                           ('knn', knn)])

knn_params = {"svd__n_components": [2, 5, 10],
              "svd__n_iter": [5],
              "knn__n_neighbors": [1, 2, 3, 4, 5],
              "knn__weights" : ["uniform", "distance"],
              "knn__metric" : ["euclidean"]}

In [26]:
knn_clf = GridSearchCV(knn_pipe,
                      knn_params,
                      cv = skf,
                      n_jobs = -1,
                      scoring = ['f1', 'accuracy'],
                      refit = 'f1',
                      verbose = 1)

knn_clf.fit(X_oh, y)

Fitting 5 folds for each of 30 candidates, totalling 150 fits


In [27]:
knn_f1_mean = knn_clf.cv_results_['mean_test_f1'][knn_clf.cv_results_['rank_test_f1'].argmin()]
knn_acc_mean = knn_clf.cv_results_['mean_test_accuracy'][knn_clf.cv_results_['rank_test_f1'].argmin()]

results.loc['KNN'] = [knn_f1_mean, knn_acc_mean]

### Naive Bayes

In [28]:
gnb = GaussianNB()
gnb_pipe = Pipeline(steps=[('svd', decomp),
                           ('gnb', gnb)])
gnb_params = {"svd__n_components": [2, 5, 10],
              "svd__n_iter": [5]}

In [29]:
gnb_clf = GridSearchCV(gnb_pipe,
                       gnb_params,
                       cv = skf,
                       n_jobs = -1,
                       scoring = ['f1', 'accuracy'],
                       refit = 'f1',
                       verbose = 1)

gnb_clf.fit(X_oh, y)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


In [30]:
gnb_f1_mean = gnb_clf.cv_results_['mean_test_f1'][gnb_clf.cv_results_['rank_test_f1'].argmin()]
gnb_acc_mean = gnb_clf.cv_results_['mean_test_accuracy'][gnb_clf.cv_results_['rank_test_f1'].argmin()]

results.loc['GaussianNB'] = [gnb_f1_mean, gnb_acc_mean]

### Logistic Regression

In [31]:
lr = LogisticRegression(max_iter=1000)
lr_params = [{'penalty': ['l1', 'l2'],
              'C': [0.1, 1, 10],
              'solver': ['liblinear']},

             {'penalty': [None],
              'solver': ['lbfgs']},]

lr_clf = GridSearchCV(lr,
                      lr_params,
                      cv = skf,
                      n_jobs = -1,
                      scoring = ['f1', 'accuracy'],
                      refit = 'f1',
                      verbose = 1)

lr_clf.fit(X_oh, y)

Fitting 5 folds for each of 7 candidates, totalling 35 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [32]:
lr_f1_mean = lr_clf.cv_results_['mean_test_f1'][lr_clf.cv_results_['rank_test_f1'].argmin()]
lr_acc_mean = lr_clf.cv_results_['mean_test_accuracy'][lr_clf.cv_results_['rank_test_f1'].argmin()]

results.loc['LogisticRegression'] = [lr_f1_mean, lr_acc_mean]

## Results

In [33]:
results

Unnamed: 0,f1,accuracy
Decision Tree,0.272787,0.818443
Random Forest,0.250509,0.878429
xGboost,0.06606,0.882944
CatBoost,0.118206,0.883847
KNN,0.232105,0.819137
GaussianNB,0.024373,0.882296
LogisticRegression,0.210693,0.87322


In [34]:
results.to_csv('bank_marketing_resuts.csv')

In [35]:
results.round(3)

Unnamed: 0,f1,accuracy
Decision Tree,0.273,0.818
Random Forest,0.251,0.878
xGboost,0.066,0.883
CatBoost,0.118,0.884
KNN,0.232,0.819
GaussianNB,0.024,0.882
LogisticRegression,0.211,0.873
