In [54]:
#cleaned data hajara 
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

df = pd.read_csv('bank-full.csv', sep=';')

# checking for null values
df.isna().sum() # no null values found 

# checking for duplicates
df.duplicated().sum() # no duplicates found

X = df.drop(columns="y")
y = df['y'].map({'yes':1, "no":0})

categorical_features = X.select_dtypes(include=['object']).columns.tolist()
numerical_features = X.select_dtypes(exclude='object').columns.tolist()

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numerical_features),
        ("cat", OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ]
)
print("Categorical features:", categorical_features)
print("Numerical features:", numerical_features)

df['poutcome'].value_counts()

df.info()

Categorical features: ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']
Numerical features: ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']
<class 'pandas.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   age        45211 non-null  int64
 1   job        45211 non-null  str  
 2   marital    45211 non-null  str  
 3   education  45211 non-null  str  
 4   default    45211 non-null  str  
 5   balance    45211 non-null  int64
 6   housing    45211 non-null  str  
 7   loan       45211 non-null  str  
 8   contact    45211 non-null  str  
 9   day        45211 non-null  int64
 10  month      45211 non-null  str  
 11  duration   45211 non-null  int64
 12  campaign   45211 non-null  int64
 13  pdays      45211 non-null  int64
 14  previous   45211 non-null  int64
 15  poutcome   45211 non-null  str  
 16  y     

See https://pandas.pydata.org/docs/user_guide/migration-3-strings.html#string-migration-select-dtypes for details on how to write code that works with pandas 2 and 3.
  categorical_features = X.select_dtypes(include=['object']).columns.tolist()


In [55]:
#implementing random forest classifier
 
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
#import warnings warnings.filterwarnings('ignore')

#spliting into training and testing sets 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rf_classifier = Pipeline(steps=[
    ('preprocessor', preprocessor), 
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42))
])

rf_classifier.fit(X_train, y_train) #train the model 

y_pred = rf_classifier.predict(X_test) #make predictions on the test set

#evaluate 
accuracy_rf = accuracy_score(y_test, y_pred) 
cm_rf = confusion_matrix(y_test, y_pred)
classification_rep_rf = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy_rf:.4f}')
print(f'\nConfusion Matrix:\n{cm_rf}')
print(f'\nClassification Report:\n{classification_rep_rf}')

Accuracy: 0.9066

Confusion Matrix:
[[7737  215]
 [ 630  461]]

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.97      0.95      7952
           1       0.68      0.42      0.52      1091

    accuracy                           0.91      9043
   macro avg       0.80      0.70      0.74      9043
weighted avg       0.90      0.91      0.90      9043



In [56]:
#permutation feature importance random forest 
from sklearn.inspection import permutation_importance

pfi_rf = permutation_importance(rf_classifier, X_test, y_test, n_repeats=10, random_state=42)

fnames = rf_classifier.named_steps['preprocessor'].get_feature_names_out()

pfi_rf_df = pd.DataFrame({
    'feature': X.columns,
    'importance_mean': pfi_rf.importances_mean,
    'importance_std': pfi_rf.importances_std
}).sort_values(by='importance_mean', ascending=False)

pfi_rf_df.head(20)

Unnamed: 0,feature,importance_mean,importance_std
11,duration,0.041126,0.001545
10,month,0.016046,0.000919
15,poutcome,0.011545,0.001329
8,contact,0.007221,0.001287
6,housing,0.004567,0.000912
9,day,0.003682,0.000678
2,marital,0.003174,0.001128
5,balance,0.001736,0.001037
3,education,0.001415,0.001005
0,age,0.001349,0.000603


In [57]:
#implementing binomial logistic regression
from sklearn.linear_model import LogisticRegression

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

logreg = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('lr', LogisticRegression(max_iter=10000, random_state=42))
])

logreg.fit(X_train, y_train) #train model 

#pred 
y_pred_logreg = logreg.predict(X_test)

#evaluate 
accuracy_logreg = accuracy_score(y_test, y_pred_logreg)
cm_logreg = confusion_matrix(y_test, y_pred_logreg)
classification_rep_logreg = classification_report(y_test, y_pred_logreg)

print(f'Logistic Regression Accuracy: {accuracy_logreg:.4f}')
print(f'\nConfusion Matrix:\n{cm_logreg}')
print(f'\nClassification Report:\n{classification_rep_logreg}')

Logistic Regression Accuracy: 0.8987

Confusion Matrix:
[[7754  198]
 [ 718  373]]

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.98      0.94      7952
           1       0.65      0.34      0.45      1091

    accuracy                           0.90      9043
   macro avg       0.78      0.66      0.70      9043
weighted avg       0.88      0.90      0.88      9043



In [58]:
#permutation feature importance 

pfi_logreg = permutation_importance(logreg, X_test, y_test, n_repeats=10, random_state=42)

fnames = logreg.named_steps['preprocessor'].get_feature_names_out()

pfi_logreg_df = pd.DataFrame({
    'feature': X.columns,
    'importance_mean': pfi_rf.importances_mean,
    'importance_std': pfi_rf.importances_std
}).sort_values(by='importance_mean', ascending=False)

pfi_logreg_df.head(20)


Unnamed: 0,feature,importance_mean,importance_std
11,duration,0.041126,0.001545
10,month,0.016046,0.000919
15,poutcome,0.011545,0.001329
8,contact,0.007221,0.001287
6,housing,0.004567,0.000912
9,day,0.003682,0.000678
2,marital,0.003174,0.001128
5,balance,0.001736,0.001037
3,education,0.001415,0.001005
0,age,0.001349,0.000603


In [59]:
#support vector machine 
from sklearn.svm import SVC

#spliting into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#initializing SVM model
svm_rbf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('svm', SVC(kernel='rbf', random_state=42))
])

svm_rbf.fit(X_train, y_train) #train the model

y_pred_svm_rbf = svm_rbf.predict(X_test) #make predictions on the test set

#evaluate
accuracy_svm_rbf = accuracy_score(y_test, y_pred_svm_rbf)
cm_svm_rbf = confusion_matrix(y_test, y_pred_svm_rbf)
class_rep_svm_rbf = classification_report(y_test, y_pred_svm_rbf)

print(f'SVM (RBF Kernel) Accuracy: {accuracy_svm_rbf:.4f}')
print(f'\nConfusion Matrix:\n{cm_svm_rbf}')
print(f'\nClassification Report:\n{class_rep_svm_rbf}')

SVM (RBF Kernel) Accuracy: 0.9027

Confusion Matrix:
[[7774  178]
 [ 702  389]]

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.98      0.95      7952
           1       0.69      0.36      0.47      1091

    accuracy                           0.90      9043
   macro avg       0.80      0.67      0.71      9043
weighted avg       0.89      0.90      0.89      9043



In [60]:
#permutation feature importance for SVM
pfi_svm_rbf = permutation_importance(svm_rbf, X_test, y_test, n_repeats=10, random_state=42)

fnames = svm_rbf.named_steps['preprocessor'].get_feature_names_out()

pfi_svm_rbf_df = pd.DataFrame({
    'feature': X.columns,
    'importance_mean': pfi_svm_rbf.importances_mean,
    'importance_std': pfi_svm_rbf.importances_std
}).sort_values(by='importance_mean', ascending=False)

pfi_svm_rbf_df.head(20)

Unnamed: 0,feature,importance_mean,importance_std
11,duration,0.033053,0.00225
15,poutcome,0.009001,0.001033
10,month,0.007641,0.000885
13,pdays,0.004202,0.001036
8,contact,0.002809,0.000217
9,day,0.002543,0.00065
6,housing,0.002024,0.000802
0,age,0.001924,0.000577
1,job,0.00167,0.000567
12,campaign,0.001338,0.000728


In [64]:
#implementing KNN
from sklearn.neighbors import KNeighborsClassifier

#spliting into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

knn = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('knn', KNeighborsClassifier(n_neighbors=5))
])

knn.fit(X_train, y_train) #train the model
y_pred_knn = knn.predict(X_test) #make predictions on the test set

#evaluate
accuracy_knn = accuracy_score(y_test, y_pred_knn)
cm_knn = confusion_matrix(y_test, y_pred_knn)
class_rep_knn = classification_report(y_test, y_pred_knn)

print(f'KNN Accuracy: {accuracy_knn:.4f}')
print(f'\nConfusion Matrix:\n{cm_knn}')
print(f'\nClassification Report:\n{class_rep_knn}')

KNN Accuracy: 0.8985

Confusion Matrix:
[[7713  239]
 [ 679  412]]

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.97      0.94      7952
           1       0.63      0.38      0.47      1091

    accuracy                           0.90      9043
   macro avg       0.78      0.67      0.71      9043
weighted avg       0.88      0.90      0.89      9043



In [65]:
#permutation feature importance for KNN
pfi_knn = permutation_importance(knn, X_test, y_test, n_repeats=10, random_state=42)

fnames = knn.named_steps['preprocessor'].get_feature_names_out()

pfi_knn_df = pd.DataFrame({
    'feature': X.columns,
    'importance_mean': pfi_knn.importances_mean,
    'importance_std': pfi_knn.importances_std
}).sort_values(by='importance_mean', ascending=False)

pfi_knn_df.head(20)

Unnamed: 0,feature,importance_mean,importance_std
11,duration,0.034845,0.002016
10,month,0.012463,0.001231
9,day,0.007431,0.001582
13,pdays,0.007221,0.001181
8,contact,0.006193,0.00072
15,poutcome,0.00481,0.00051
6,housing,0.004401,0.001075
2,marital,0.004379,0.001181
0,age,0.003317,0.001797
14,previous,0.003152,0.00068


In [67]:
from collections import defaultdict

"""
model_dfs = {
    'Random Forest': pfi_rf_df,
    'Logistic Regression': pfi_logreg_df,
    'SVM (RBF Kernel)': pfi_svm_rbf_df
}
"""

model_dfs = {
    'Random Forest': pfi_rf_df,
    'SVM (RBF Kernel)': pfi_svm_rbf_df,
    'KNN': pfi_knn_df
}

#score dictionary 
borda_scores = defaultdict(float)

for model_name, df in model_dfs.items(): 
    #sort by importance descending 
    df_sorted = df.sort_values(by='importance_mean', ascending=False)

    #get ordered list of features 
    feature_list = df_sorted['feature'].tolist()
    n = len(feature_list)

    #assugn Borda scores 
    for rank, feature in enumerate(feature_list): 
        points = n-rank
        borda_scores[feature] += points

final_rank = sorted(borda_scores.items(), key=lambda x: x[1], reverse=True)

print("Final Borda Count Rankings:")
for feature, score in final_rank: 
    print(f"{feature}: {score:.2f}")

Final Borda Count Rankings:
duration: 48.00
month: 44.00
poutcome: 40.00
contact: 37.00
day: 36.00
housing: 32.00
pdays: 28.00
marital: 25.00
age: 24.00
job: 18.00
balance: 17.00
campaign: 17.00
education: 16.00
previous: 12.00
loan: 11.00
default: 3.00
