In [7]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

df = pd.read_csv('bank-full.csv', sep=';')

# checking for null values
df.isna().sum() # no null values found 

# checking for duplicates
df.duplicated().sum() # no duplicates found

X = df.drop(columns="y")
y = df['y'].map({'yes':1, "no":0})

categorical_features = X.select_dtypes(include=['object']).columns.tolist()
numerical_features = X.select_dtypes(exclude='object').columns.tolist()

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numerical_features),
        ("cat", OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ]
)
print("Categorical features:", categorical_features)
print("Numerical features:", numerical_features)






Categorical features: ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']
Numerical features: ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']


In [8]:
# fitting support vector machine model 
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

# splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42, stratify=y)

# initializing SVM models with different kernels

svm_rbf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ("svm", SVC(kernel="rbf", gamma='scale', C=1.0, class_weight="balanced", random_state=42))
])

# train the models
svm_rbf.fit(X_train,y_train)

# make predictions
y_pred_rbf = svm_rbf.predict(X_test)

# evaluating the models
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# evaluating the rbf kernel
accuracy_rbf = accuracy_score(y_test,y_pred_rbf)
cm_rbf = confusion_matrix(y_test, y_pred_rbf)
class_report_rbf = classification_report(y_test,y_pred_rbf)
print("Accuracy for RBF Kernel SVM: ", accuracy_rbf)
print("Confusion Matrix for RBF Kernel SVM:\n", cm_rbf)
print("Classification Report for RBF Kernel SVM:\n", class_report_rbf)

Accuracy for RBF Kernel SVM:  0.845626451398872
Confusion Matrix for RBF Kernel SVM:
 [[6725 1260]
 [ 136  922]]
Classification Report for RBF Kernel SVM:
               precision    recall  f1-score   support

           0       0.98      0.84      0.91      7985
           1       0.42      0.87      0.57      1058

    accuracy                           0.85      9043
   macro avg       0.70      0.86      0.74      9043
weighted avg       0.91      0.85      0.87      9043



In [9]:
# permutation importance on model with duration feature included
from sklearn.inspection import permutation_importance
# Transform the test data to match the feature names
pfi = permutation_importance(svm_rbf, X_test, y_test, n_repeats=10, random_state=42)

# getting feature names out after one-hot encoding

# creating a dataframe for feature importance
pfi_df = pd.DataFrame({
    'Feature': X_test.columns,
    "Importance_mean": pfi.importances_mean,
    "Importance_std": pfi.importances_std
}).sort_values(by="Importance_mean", ascending=False)

pfi_df.head(20)


Unnamed: 0,Feature,Importance_mean,Importance_std
10,month,0.102964,0.002025
11,duration,0.068716,0.002561
8,contact,0.046124,0.002026
9,day,0.039909,0.002611
13,pdays,0.019573,0.001199
15,poutcome,0.009543,0.001156
6,housing,0.008957,0.001251
14,previous,0.003627,0.001102
7,loan,0.000232,0.000711
4,default,-0.0001,0.000152


In [4]:
# model without duration 
# dropping the duration feature
from sklearn.inspection import permutation_importance


X_no_duration = X.drop(columns=['duration'])

# splitting data into training and testing sets
X_train_no_duration, X_test_no_duration, y_train, y_test = train_test_split(X_no_duration, y, test_size=0.2, random_state=42, stratify=y)

cat_no_duration = X_no_duration.select_dtypes(include='object').columns
num_no_duration = X_no_duration.select_dtypes(exclude='object').columns

preprocessor_no_duration = ColumnTransformer(
    [
        ("num", StandardScaler(), num_no_duration),
        ("cat", OneHotEncoder(handle_unknown='ignore'), cat_no_duration)

    ]
)

# svm model
svm_rbf_no_duration = Pipeline([
    ('preprocessor', preprocessor_no_duration),
    ("svm", SVC(kernel='rbf', gamma='scale', C=1.0, class_weight='balanced', random_state=42))
])

# fitting the model
svm_rbf_no_duration.fit(X_train_no_duration, y_train)


# pfi on model without duration feature 
pfi_no_duration = permutation_importance(svm_rbf_no_duration, X_test_no_duration,y_test, n_repeats=10, random_state=42)

# creating a dataframe for feature importance
pfi_no_duration_df = pd.DataFrame({
    "Feature": X_no_duration.columns,
    "Importance_mean": pfi_no_duration.importances_mean,
    "Importance_std": pfi_no_duration.importances_std
}).sort_values(by="Importance_mean", ascending = False)

pfi_no_duration_df.head(20)

Unnamed: 0,Feature,Importance_mean,Importance_std
10,month,0.060994,0.005178
8,contact,0.034033,0.006324
9,day,0.017127,0.005212
12,pdays,0.008508,0.004818
14,poutcome,0.008066,0.002617
2,marital,0.003646,0.004076
13,previous,0.003536,0.004245
5,balance,0.001989,0.006208
11,campaign,0.001436,0.002884
1,job,0.001215,0.004878


In [5]:
X_used = X_no_duration

# preprocessor for Logistic regression
cat_cols = X_used.select_dtypes(include='object').columns
num_cols = X_used.select_dtypes(exclude="object").columns


preprocessor_logistic = ColumnTransformer(
    [
        ("num", StandardScaler(), num_cols),
        ("cat", OneHotEncoder(handle_unknown='ignore'), cat_cols)
    ]
)

# preprocessor for random forest
preprocessor_rf = ColumnTransformer(
    [
        ('num','passthrough',num_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
    ]
)

# logistic regression & random forest models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

logistic_model = Pipeline([
    ("preprocessor", preprocessor_logistic),
    ("logistic regression", LogisticRegression(class_weight='balanced', random_state=42,max_iter=1000))
])

rf_model = Pipeline([
    ('preprocessor', preprocessor_rf),
    ("random forest", RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42))
])

# splitting data into training and testing sets
X_used_train, X_used_test, y_train, y_test = train_test_split(X_used, y, test_size=0.2, random_state=42, stratify=y)

# fitting the models
logistic_model.fit(X_used_train,y_train)
rf_model.fit(X_used_train, y_train)
svm_rbf_no_duration.fit(X_used_train, y_train)

# evaluating the models
y_pred_logistic = logistic_model.predict(X_used_test)
y_pred_rf = rf_model.predict(X_used_test)
y_pred_svm = svm_rbf_no_duration.predict(X_used_test)

print("Logistic Regression Classification Report:\n", classification_report(y_test, y_pred_logistic))
print("Random Forest Classification Report:\n", classification_report(y_test, y_pred_rf))
print("SVM Classification Report:\n", classification_report(y_test, y_pred_svm))
print("Random forest confusion matrix:\n" + str(confusion_matrix(y_test, y_pred_rf)))




Logistic Regression Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.71      0.81       801
           1       0.22      0.63      0.33       104

    accuracy                           0.70       905
   macro avg       0.58      0.67      0.57       905
weighted avg       0.86      0.70      0.76       905

Random Forest Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.99      0.94       801
           1       0.68      0.12      0.21       104

    accuracy                           0.89       905
   macro avg       0.79      0.56      0.58       905
weighted avg       0.87      0.89      0.86       905

SVM Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.82      0.87       801
           1       0.29      0.58      0.39       104

    accuracy                           0.79       905
   macro avg      

In [6]:
 # creating an ensemble model using model stacking 
from sklearn.ensemble import StackingClassifier
estimators = [
    ('logistic regression', logistic_model),
    ('random_forest', rf_model),
    ("svm", svm_rbf_no_duration)
]
final_estimator = LogisticRegression(class_weight='balanced', random_state=42, max_iter=1000)

stacking_model = StackingClassifier(estimators=estimators, final_estimator=final_estimator, cv=5)
stacking_model.fit(X_used_train, y_train)
y_pred_stacking = stacking_model.predict(X_used_test)
print("Stacking Classifier Classification Report:\n", classification_report(y_test, y_pred_stacking))

# permutation importance on the stacking model 
pfi_stacking = permutation_importance(stacking_model, X_used_test, y_test,n_repeats=10, random_state=42)

# creating a dataframe for feature importance
pfi_stacking_df = pd.DataFrame({
    "Feature": X_used.columns,
    "Importance_mean": pfi_stacking.importances_mean, 
    "Importance_std": pfi_stacking.importances_std
}).sort_values(by="Importance_mean", ascending= False)

pfi_stacking_df.head(20)

Stacking Classifier Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.76      0.84       801
           1       0.26      0.64      0.37       104

    accuracy                           0.75       905
   macro avg       0.60      0.70      0.60       905
weighted avg       0.86      0.75      0.79       905



Unnamed: 0,Feature,Importance_mean,Importance_std
10,month,0.074696,0.006428
8,contact,0.038343,0.003924
14,poutcome,0.036022,0.005463
9,day,0.023094,0.004377
12,pdays,0.019337,0.003538
1,job,0.014033,0.004165
13,previous,0.009282,0.004051
2,marital,0.009061,0.005386
5,balance,0.008177,0.004922
6,housing,0.00663,0.003922


In [36]:

preprocessor_logistic = ColumnTransformer(
    [
        ("num", StandardScaler(), num_cols),
        ("cat", OneHotEncoder(handle_unknown='ignore'), cat_cols)
    ]
)

# preprocessor for random forest
preprocessor_rf = ColumnTransformer(
    [
        ('num','passthrough',num_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
    ]
)

# logistic regression & random forest models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

logistic_model = Pipeline([
    ("preprocessor", preprocessor_logistic),
    ("logistic regression", LogisticRegression(class_weight='balanced', random_state=42,max_iter=1000))
])

rf_model = Pipeline([
    ('preprocessor', preprocessor_rf),
    ("random forest", RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42))
])

# splitting data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# fitting the models
logistic_model.fit(X_train,y_train)
rf_model.fit(X_train, y_train)
svm_rbf_no_duration.fit(X_train, y_train)

# evaluating the models
y_pred_logistic = logistic_model.predict(X_test)
y_pred_rf = rf_model.predict(X_test)
y_pred_svm = svm_rbf_no_duration.predict(X_test)

from sklearn.ensemble import StackingClassifier
estimators = [
    ('logistic regression', logistic_model),
    ('random_forest', rf_model),
    ("svm", svm_rbf_no_duration)
]
final_estimator = LogisticRegression(class_weight='balanced', random_state=42, max_iter=1000)

stacking_model = StackingClassifier(estimators=estimators, final_estimator=final_estimator, cv=5, stack_method="auto", passthrough=False)
stacking_model.fit(X_train, y_train)
y_pred_stacking = stacking_model.predict(X_test)
print("Stacking Classifier Classification Report:\n", classification_report(y_test, y_pred_stacking))

# permutation importance on the stacking model 
pfi_stacking = permutation_importance(stacking_model, X_test, y_test,n_repeats=10, random_state=42, scoring='f1')

# creating a dataframe for feature importance
pfi_stacking_df = pd.DataFrame({
    "Feature": X_test.columns,
    "Importance_mean": pfi_stacking.importances_mean, 
    "Importance_std": pfi_stacking.importances_std
}).sort_values(by="Importance_mean", ascending= False)

pfi_stacking_df.head(20)

Stacking Classifier Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.76      0.84       801
           1       0.26      0.64      0.37       104

    accuracy                           0.75       905
   macro avg       0.60      0.70      0.60       905
weighted avg       0.86      0.75      0.79       905



Unnamed: 0,Feature,Importance_mean,Importance_std
10,month,0.056702,0.008398
8,contact,0.040827,0.007735
15,poutcome,0.027892,0.007415
1,job,0.026027,0.005521
9,day,0.024759,0.00685
13,pdays,0.024105,0.006019
14,previous,0.014195,0.006088
0,age,0.01009,0.007685
5,balance,0.008674,0.006653
12,campaign,0.00865,0.007098
