In [39]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Load the dataset; the file uses ';' as its field delimiter.
df = pd.read_csv('bank.csv', sep=';')

# Data-quality checks.
# A bare expression in the middle of a cell is evaluated and then thrown away
# (only the LAST expression of a cell is displayed), so the results are printed
# explicitly to make the checks visible in the cell output.
missing_per_column = df.isna().sum()
duplicate_row_count = df.duplicated().sum()
print("Missing values per column:\n", missing_per_column)
print("Duplicate rows:", duplicate_row_count)

# Split predictors from the target and encode the target as 1 (yes) / 0 (no).
# Note: any label other than 'yes'/'no' would be mapped to NaN by .map().
X = df.drop(columns="y")
y = df['y'].map({'yes':1, "no":0})

# Column groups are derived from dtypes: object columns are treated as
# categorical, everything else as numerical.
categorical_features = X.select_dtypes(include=['object']).columns.tolist()
numerical_features = X.select_dtypes(exclude='object').columns.tolist()

# Standardize numerical columns and one-hot encode categorical ones.
# handle_unknown='ignore' encodes categories unseen during fit as all zeros
# instead of raising; sparse_output=False makes the encoder return a dense array.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numerical_features),
        ("cat", OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ]
)
print("Categorical features:", categorical_features)
print("Numerical features:", numerical_features)

# Distribution of the previous-campaign outcome (printed so it is not discarded).
print(df['poutcome'].value_counts())

df.info()



Categorical features: ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']
Numerical features: ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4521 entries, 0 to 4520
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        4521 non-null   int64 
 1   job        4521 non-null   object
 2   marital    4521 non-null   object
 3   education  4521 non-null   object
 4   default    4521 non-null   object
 5   balance    4521 non-null   int64 
 6   housing    4521 non-null   object
 7   loan       4521 non-null   object
 8   contact    4521 non-null   object
 9   day        4521 non-null   int64 
 10  month      4521 non-null   object
 11  duration   4521 non-null   int64 
 12  campaign   4521 non-null   int64 
 13  pdays      4521 non-null   int64 
 14  previous   4521 non-null   int64 
 15  poutcome   4521 no

In [40]:
# Support vector machine (RBF kernel) trained on the full feature set
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

# Hold out 20% of the rows for testing; stratify on y so both splits keep the
# same class proportions, and fix the seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Preprocessing and classifier live in one pipeline, so the scaler/encoder are
# fitted on the training rows only.
svm_rbf = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        (
            "svm",
            SVC(C=1.0, kernel="rbf", gamma="scale", class_weight="balanced", random_state=42),
        ),
    ]
)

# Pipeline.fit returns the fitted pipeline, so fitting and predicting chain.
y_pred_rbf = svm_rbf.fit(X_train, y_train).predict(X_test)

# Hold-out metrics for the RBF kernel model
accuracy_rbf = accuracy_score(y_test, y_pred_rbf)
cm_rbf = confusion_matrix(y_test, y_pred_rbf)
class_report_rbf = classification_report(y_test, y_pred_rbf)

for heading, value in (
    ("Accuracy for RBF Kernel SVM: ", accuracy_rbf),
    ("Confusion Matrix for RBF Kernel SVM:\n", cm_rbf),
    ("Classification Report for RBF Kernel SVM:\n", class_report_rbf),
):
    print(heading, value)

Accuracy for RBF Kernel SVM:  0.8453038674033149
Confusion Matrix for RBF Kernel SVM:
 [[680 121]
 [ 19  85]]
Classification Report for RBF Kernel SVM:
               precision    recall  f1-score   support

           0       0.97      0.85      0.91       801
           1       0.41      0.82      0.55       104

    accuracy                           0.85       905
   macro avg       0.69      0.83      0.73       905
weighted avg       0.91      0.85      0.87       905



In [42]:
# permutation importance on model with duration feature included
# Imported in this cell so it runs on a fresh kernel (Restart & Run All)
# without relying on an import made by a later cell.
from sklearn.inspection import permutation_importance

# The whole pipeline is passed in together with the raw (untransformed) test
# frame, so each column of X_test is shuffled in turn and the result contains
# one importance per ORIGINAL input column.
pfi = permutation_importance(svm_rbf, X_test, y_test, n_repeats=10, random_state=42)

# Names of the columns produced by the preprocessor (after scaling / one-hot
# encoding). Kept for reference only: there is one entry per ENCODED column,
# so this does not line up with the per-input-column importances in `pfi`.
feature_names = svm_rbf.named_steps['preprocessor'].get_feature_names_out()

# One row per input column, most important first.
pfi_df = pd.DataFrame({
    'Feature': X_test.columns,
    "Importance_mean": pfi.importances_mean,
    "Importance_std": pfi.importances_std
}).sort_values(by="Importance_mean", ascending=False)

pfi_df.head(20)


Unnamed: 0,Feature,Importance_mean,Importance_std
11,duration,0.082431,0.006559
10,month,0.030608,0.005592
14,previous,0.014144,0.004524
9,day,0.013923,0.004771
15,poutcome,0.013039,0.002558
13,pdays,0.012818,0.00238
8,contact,0.010829,0.004937
12,campaign,0.006409,0.005759
5,balance,0.002652,0.005746
7,loan,0.001657,0.003135


In [49]:
# model without duration: refit the same SVM pipeline with `duration` removed
# and recompute permutation importance for comparison with the full model
from sklearn.inspection import permutation_importance


# dropping the duration feature
X_no_duration = X.drop(columns=['duration'])

# splitting data into training and testing sets
# NOTE: this reassigns y_train / y_test. The call uses the same y, test_size,
# random_state and stratify settings as the split for the full model, so the
# row assignment is expected to be the same.
X_train_no_duration, X_test_no_duration, y_train, y_test = train_test_split(
    X_no_duration, y, test_size=0.2, random_state=42, stratify=y
)

# column groups derived from dtypes, as for the full feature set
cat_no_duration = X_no_duration.select_dtypes(include='object').columns
num_no_duration = X_no_duration.select_dtypes(exclude='object').columns

# Same preprocessing as the full model. sparse_output=False is set explicitly
# so both preprocessors are configured identically and the encoder output is
# dense in both cases.
preprocessor_no_duration = ColumnTransformer(
    [
        ("num", StandardScaler(), num_no_duration),
        ("cat", OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_no_duration)
    ]
)

# svm model (same hyperparameters as the model trained with duration)
svm_rbf_no_duration = Pipeline([
    ('preprocessor', preprocessor_no_duration),
    ("svm", SVC(kernel='rbf', gamma='scale', C=1.0, class_weight='balanced', random_state=42))
])

# fitting the model
svm_rbf_no_duration.fit(X_train_no_duration, y_train)


# pfi on model without duration feature
pfi_no_duration = permutation_importance(
    svm_rbf_no_duration, X_test_no_duration, y_test, n_repeats=10, random_state=42
)

# creating a dataframe for feature importance
# Labels come from the frame that was actually passed to permutation_importance
# so names and importances are guaranteed to be in the same column order.
pfi_no_duration_df = pd.DataFrame({
    "Feature": X_test_no_duration.columns,
    "Importance_mean": pfi_no_duration.importances_mean,
    "Importance_std": pfi_no_duration.importances_std
}).sort_values(by="Importance_mean", ascending=False)

pfi_no_duration_df.head(20)

Unnamed: 0,Feature,Importance_mean,Importance_std
10,month,0.060994,0.005178
8,contact,0.034033,0.006324
9,day,0.017127,0.005212
12,pdays,0.008508,0.004818
14,poutcome,0.008066,0.002617
2,marital,0.003646,0.004076
13,previous,0.003536,0.004245
5,balance,0.001989,0.006208
11,campaign,0.001436,0.002884
1,job,0.001215,0.004878
