In [28]:
#cleaned data hajara 
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

df = pd.read_csv('bank-full.csv', sep=';')

# checking for null values
df.isna().sum() # no null values found 

# checking for duplicates
df.duplicated().sum() # no duplicates found

X = df.drop(columns="y")
y = df['y'].map({'yes':1, "no":0})

categorical_features = X.select_dtypes(include=['object']).columns.tolist()
numerical_features = X.select_dtypes(exclude='object').columns.tolist()

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numerical_features),
        ("cat", OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ]
)
print("Categorical features:", categorical_features)
print("Numerical features:", numerical_features)

df['poutcome'].value_counts()

df.info()

Categorical features: ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']
Numerical features: ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']
<class 'pandas.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   age        45211 non-null  int64
 1   job        45211 non-null  str  
 2   marital    45211 non-null  str  
 3   education  45211 non-null  str  
 4   default    45211 non-null  str  
 5   balance    45211 non-null  int64
 6   housing    45211 non-null  str  
 7   loan       45211 non-null  str  
 8   contact    45211 non-null  str  
 9   day        45211 non-null  int64
 10  month      45211 non-null  str  
 11  duration   45211 non-null  int64
 12  campaign   45211 non-null  int64
 13  pdays      45211 non-null  int64
 14  previous   45211 non-null  int64
 15  poutcome   45211 non-null  str  
 16  y     

See https://pandas.pydata.org/docs/user_guide/migration-3-strings.html#string-migration-select-dtypes for details on how to write code that works with pandas 2 and 3.
  categorical_features = X.select_dtypes(include=['object']).columns.tolist()


In [29]:
#implementing random forest classifier
 
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
#import warnings warnings.filterwarnings('ignore')

#spliting into training and testing sets 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rf_classifier = Pipeline(steps=[
    ('preprocessor', preprocessor), 
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42))
])

rf_classifier.fit(X_train, y_train) #train the model 

y_pred = rf_classifier.predict(X_test) #make predictions on the test set

#evaluate 
accuracy_rf = accuracy_score(y_test, y_pred) 
cm_rf = confusion_matrix(y_test, y_pred)
classification_rep_rf = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy_rf:.4f}')
print(f'\nConfusion Matrix:\n{cm_rf}')
print(f'\nClassification Report:\n{classification_rep_rf}')

Accuracy: 0.9066

Confusion Matrix:
[[7737  215]
 [ 630  461]]

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.97      0.95      7952
           1       0.68      0.42      0.52      1091

    accuracy                           0.91      9043
   macro avg       0.80      0.70      0.74      9043
weighted avg       0.90      0.91      0.90      9043



In [38]:
#permutation feature importance 
from sklearn.inspection import permutation_importance

pfi_rf = permutation_importance(rf_classifier, X_test, y_test, n_repeats=10, random_state=42)

fnames = rf_classifier.named_steps['preprocessor'].get_feature_names_out()

pfi_rf_df = pd.DataFrame({
    'feature': X.columns,
    'importance_mean': pfi_rf.importances_mean,
    'importance_std': pfi_rf.importances_std
}).sort_values(by='importance_mean', ascending=False)

pfi_rf_df.head(20)

Unnamed: 0,feature,importance_mean,importance_std
11,duration,0.041126,0.001545
10,month,0.016046,0.000919
15,poutcome,0.011545,0.001329
8,contact,0.007221,0.001287
6,housing,0.004567,0.000912
9,day,0.003682,0.000678
2,marital,0.003174,0.001128
5,balance,0.001736,0.001037
3,education,0.001415,0.001005
0,age,0.001349,0.000603


In [31]:
#implementing binomial logistic regression
from sklearn.linear_model import LogisticRegression

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

logreg = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('lr', LogisticRegression(max_iter=10000, random_state=42))
])

logreg.fit(X_train, y_train) #train model 

#pred 
y_pred_logreg = logreg.predict(X_test)

#evaluate 
accuracy_logreg = accuracy_score(y_test, y_pred_logreg)
cm_logreg = confusion_matrix(y_test, y_pred_logreg)
classification_rep_logreg = classification_report(y_test, y_pred_logreg)

print(f'Logistic Regression Accuracy: {accuracy_logreg:.4f}')
print(f'\nConfusion Matrix:\n{cm_logreg}')
print(f'\nClassification Report:\n{classification_rep_logreg}')

Logistic Regression Accuracy: 0.8987

Confusion Matrix:
[[7754  198]
 [ 718  373]]

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.98      0.94      7952
           1       0.65      0.34      0.45      1091

    accuracy                           0.90      9043
   macro avg       0.78      0.66      0.70      9043
weighted avg       0.88      0.90      0.88      9043



In [40]:
#permutation feature importance 

pfi_logreg = permutation_importance(logreg, X_test, y_test, n_repeats=10, random_state=42)

fnames = logreg.named_steps['preprocessor'].get_feature_names_out()

pfi_logreg_df = pd.DataFrame({
    'feature': X.columns,
    'importance_mean': pfi_rf.importances_mean,
    'importance_std': pfi_rf.importances_std
}).sort_values(by='importance_mean', ascending=False)

pfi_logreg_df.head(20)


Unnamed: 0,feature,importance_mean,importance_std
11,duration,0.041126,0.001545
10,month,0.016046,0.000919
15,poutcome,0.011545,0.001329
8,contact,0.007221,0.001287
6,housing,0.004567,0.000912
9,day,0.003682,0.000678
2,marital,0.003174,0.001128
5,balance,0.001736,0.001037
3,education,0.001415,0.001005
0,age,0.001349,0.000603
