In [1]:
import numpy as np 
import pandas as pd

# Read the bank marketing data; fields in this CSV are separated by
# semicolons rather than commas.
df = pd.read_csv("bank-full.csv", delimiter=";")

# Display the column labels to confirm that the file was parsed into
# separate fields.
df.columns


Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'y'],
      dtype='object')

In [2]:
# Count missing entries per column (summing the boolean NA mask down the rows).
df.isna().sum(axis="index")

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [3]:
# Split the frame into predictors (X) and the binary target (y).
# The target is stored as the strings 'yes'/'no'; map it to 1/0.
X = df.drop(columns='y')
y = df['y'].map({'yes': 1, 'no': 0})

# Object-dtype columns are treated as categorical, all remaining columns
# as numerical.
categorical_features = X.select_dtypes(include=['object']).columns
numerical_features = X.select_dtypes(exclude=['object']).columns

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

# Standardize the numerical columns and one-hot encode the categorical ones.
# handle_unknown='ignore' keeps transform() from raising on a category that
# was not present when the encoder was fitted.
preprocessor = ColumnTransformer(
    transformers=[
        # The transformer name was previously 'num ' (with a trailing space).
        # Names are used as parameter / feature-name prefixes, so the stray
        # space is removed to match the 'num' name used by the other
        # preprocessors in this notebook.
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ]
)

# Show which columns ended up in each group.
print(numerical_features)
print(categorical_features)

Index(['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous'], dtype='object')
Index(['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact',
       'month', 'poutcome'],
      dtype='object')


In [4]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.pipeline import Pipeline


def _make_preprocessor():
    """Return a new, unfitted preprocessing ColumnTransformer.

    Numerical columns are standardized and categorical columns are one-hot
    encoded (categories unseen during fit are ignored at transform time).
    A fresh instance is built on every call so that each model pipeline
    owns its own transformer instead of sharing one object.
    """
    return ColumnTransformer([
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])


# One independent preprocessor per model (previously three copy-pasted,
# identical definitions).
rf_preprocessor = _make_preprocessor()
svm_preprocessor = _make_preprocessor()
knn_preprocessor = _make_preprocessor()

# Model pipelines: preprocessing followed by the classifier.
knn_model = Pipeline([
    ('preprocessor', knn_preprocessor),
    ("K nearest neighbors", KNeighborsClassifier(n_neighbors=5))
])

rf_model = Pipeline([
    ('preprocessor', rf_preprocessor),
    ("Random Forest", RandomForestClassifier(n_estimators=10, random_state=42))
])

svm_model = Pipeline([
    ('preprocessor', svm_preprocessor),
    ("Support Vector Machine", SVC(kernel='rbf', random_state=42))
])


In [5]:
# Stacked ensemble: the three base pipelines feed a logistic-regression
# meta-learner.
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

# Meta-learner; class_weight='balanced' reweights the classes inversely to
# their frequency in the training data.
final_estimator = LogisticRegression(
    max_iter=1000,
    random_state=42,
    class_weight='balanced',
)

# Base learners as (name, pipeline) pairs.
estimators = list(zip(
    ('knn', 'rf', 'svm'),
    (knn_model, rf_model, svm_model),
))

# cv=5: 5-fold cross-validation is used when fitting the stack.
stacking_model = StackingClassifier(
    estimators=estimators,
    final_estimator=final_estimator,
    cv=5,
)

# Fit the whole stack on the training split.
stacking_model.fit(X_train, y_train)


In [6]:
# Predict labels for the held-out test split with the fitted stack.
y_pred = stacking_model.predict(X_test)

# Report overall accuracy, the per-class precision/recall/F1 table and the
# confusion matrix for the test split.
print("Accuracy", accuracy_score(y_true=y_test, y_pred=y_pred))
print("Classification report\n", classification_report(y_true=y_test, y_pred=y_pred))
print("Confusion matrix\n", confusion_matrix(y_true=y_test, y_pred=y_pred))

Accuracy 0.8760367134800399
Classification report
               precision    recall  f1-score   support

           0       0.97      0.89      0.93      7952
           1       0.49      0.81      0.61      1091

    accuracy                           0.88      9043
   macro avg       0.73      0.85      0.77      9043
weighted avg       0.91      0.88      0.89      9043

Confusion matrix
 [[7042  910]
 [ 211  880]]


In [7]:
# Permutation feature importance of the stacked ensemble on the test split:
# each input column is shuffled n_repeats times and the resulting change in
# the model's score is recorded.
from sklearn.inspection import permutation_importance

pfi_ensemble = permutation_importance(stacking_model, X_test, y_test, n_repeats=5, random_state=42)

# Tabulate mean / std importance per feature, most important first.
# Labels are taken from X_test, the frame that was actually permuted, so the
# rows are guaranteed to line up with the importance arrays.
pfi_ensemble_df = pd.DataFrame({
    'feature': X_test.columns,
    'importance_mean': pfi_ensemble.importances_mean,
    'importance_std': pfi_ensemble.importances_std
}).sort_values(by='importance_mean', ascending=False)

# Backward-compatible alias for the original, misspelled variable name.
pfi_ensmeble_df = pfi_ensemble_df

pfi_ensemble_df

Unnamed: 0,feature,importance_mean,importance_std
10,month,0.106159,0.002576
11,duration,0.070773,0.002337
8,contact,0.045737,0.001965
13,pdays,0.022603,0.001944
9,day,0.021984,0.001889
15,poutcome,0.019595,0.001419
14,previous,0.012275,0.000848
6,housing,0.010151,0.001701
0,age,0.006281,0.00091
12,campaign,0.0023,0.000902


In [8]:
import time

# Measure how long one full prediction pass over the test split takes.
# perf_counter() is a monotonic, high-resolution clock meant for timing
# intervals; time.time() follows the wall clock, which can be adjusted
# while the measurement is running.
t0 = time.perf_counter()
# The predictions are discarded on purpose. They are not bound to `_`,
# because assigning `_` in IPython stops it from tracking the last output.
stacking_model.predict(X_test)
print("Time taken for prediction:", time.perf_counter() - t0)

Time taken for prediction: 19.343682050704956
