In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
with open('bank_details.csv', encoding='utf-8') as f:
    lines = f.readlines()

# Clean line endings and load into a proper DataFrame
cleaned_lines = [line.strip().split(';') for line in lines]
header = cleaned_lines[0]
data = cleaned_lines[1:]

df = pd.DataFrame(data, columns=header)

In [3]:
# Remove extra quotation marks from every value in the DataFrame
df = df.applymap(lambda x: x.strip('"').strip("'") if isinstance(x, str) else x)

In [4]:
df.columns = df.columns.str.strip('"').str.strip("'")

In [5]:
df.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191,no


In [103]:
# df.info()

In [6]:
new_df = df.drop(columns=['contact', 'day_of_week', 'month', 'duration'], axis=1)
new_df.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191,no
1,57,services,married,high.school,unknown,no,no,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191,no
2,37,services,married,high.school,no,yes,no,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191,no
3,40,admin.,married,basic.6y,no,no,no,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191,no
4,56,services,married,high.school,no,no,yes,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191,no


In [105]:
# new_df.isnull().sum()

In [7]:
categorical_features = new_df.select_dtypes(include=object).columns
print("Categorical Features : \n",categorical_features)
numeric_features = new_df.select_dtypes(exclude=object).columns
print("Numeric Features : \n",numeric_features)

Categorical Features : 
 Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'campaign', 'pdays', 'previous', 'poutcome', 'emp.var.rate',
       'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed', 'y'],
      dtype='object')
Numeric Features : 
 Index([], dtype='object')


In [8]:
y = new_df['y']
X = new_df.drop(columns=['y'])

In [108]:
# X.head()

In [109]:
# print(X['age'].unique())

In [110]:
print(X['campaign'].unique())

['1' '2' '3' '4' '5' '6' '7' '8' '9' '10' '11' '12' '13' '19' '18' '23'
 '14' '22' '25' '16' '17' '15' '20' '56' '39' '35' '42' '28' '26' '27'
 '32' '21' '24' '29' '31' '30' '41' '37' '40' '33' '34' '43']


In [111]:
print(X['pdays'].unique())
print(X['previous'].unique())
print(X['emp.var.rate'].unique())
print(X['emp.var.rate'].unique())


['999' '6' '4' '3' '5' '1' '0' '10' '7' '8' '9' '11' '2' '12' '13' '14'
 '15' '16' '21' '17' '18' '22' '25' '26' '19' '27' '20']
['0' '1' '2' '3' '4' '5' '6' '7']
['1.1' '1.4' '-0.1' '-0.2' '-1.8' '-2.9' '-3.4' '-3' '-1.7' '-1.1']
['1.1' '1.4' '-0.1' '-0.2' '-1.8' '-2.9' '-3.4' '-3' '-1.7' '-1.1']


In [112]:
# X.head()

In [9]:
X['age'] = new_df['age'].astype(int)
X['campaign'] = new_df['campaign'].astype(int)
X['pdays'] = new_df['pdays'].astype(int)
X['previous'] = new_df['previous'].astype(int)
X['emp.var.rate'] = new_df['emp.var.rate'].astype(float)
X['cons.price.idx'] = new_df['cons.price.idx'].astype(float)
X['cons.conf.idx'] = new_df['cons.conf.idx'].astype(float)
X['euribor3m'] = new_df['euribor3m'].astype(float)
X['nr.employed'] = new_df['nr.employed'].astype(float)

In [114]:
# X.info()

In [10]:
y.unique()

array(['no', 'yes'], dtype=object)

In [11]:
y = y.map({'no' : 0, 'yes':1})

In [12]:
print(df['y'].value_counts())

y
no     36548
yes     4640
Name: count, dtype: int64


In [13]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, stratify=y, random_state=42)

In [119]:
# X_train

In [14]:
X_cat = [features for features in X.columns if X[features].dtype == 'O']
X_num = [features for features in X.columns if X[features].dtype != 'O']
print("Cat : ",X_cat)
print("Num : ",X_num)

Cat :  ['job', 'marital', 'education', 'default', 'housing', 'loan', 'poutcome']
Num :  ['age', 'campaign', 'pdays', 'previous', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']


In [15]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer

oe = OrdinalEncoder()
preprocessor = ColumnTransformer(
    [
        ('OrdinalEncoder', oe, X_cat)
    ]
)

In [16]:
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [17]:
from sklearn.tree import DecisionTreeClassifier

decisionTree = DecisionTreeClassifier(random_state=42)

In [20]:
decisionTree.fit(X_train, y_train)
y_train_pred = decisionTree.predict(X_train)
y_test_pred = decisionTree.predict(X_test)

In [24]:
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report, precision_score, recall_score

print("For Training Dataset :- \n")
print("Score : ",accuracy_score(y_train,y_train_pred))
print("Precision Score : ", precision_score(y_train, y_train_pred))
print("Recall Score : ", recall_score(y_train, y_train_pred))
print("Confusion matrix : \n",confusion_matrix(y_train,y_train_pred))
print(classification_report(y_train,y_train_pred))

print("="*60)

print("For Test Dataset :- \n")
print("Score : ",accuracy_score(y_test,y_test_pred))
print("Precision Score : ", precision_score(y_test, y_test_pred))
print("Recall Score : ", recall_score(y_test, y_test_pred))
print("Confusion matrix : \n",confusion_matrix(y_test,y_test_pred))
print(classification_report(y_test,y_test_pred))



For Training Dataset :- 

Score :  0.9044778190142555
Precision Score :  0.792654028436019
Recall Score :  0.2059729064039409
Confusion matrix : 
 [[25408   175]
 [ 2579   669]]
              precision    recall  f1-score   support

           0       0.91      0.99      0.95     25583
           1       0.79      0.21      0.33      3248

    accuracy                           0.90     28831
   macro avg       0.85      0.60      0.64     28831
weighted avg       0.89      0.90      0.88     28831

For Test Dataset :- 

Score :  0.8935825847697662
Precision Score :  0.6090651558073654
Recall Score :  0.15445402298850575
Confusion matrix : 
 [[10827   138]
 [ 1177   215]]
              precision    recall  f1-score   support

           0       0.90      0.99      0.94     10965
           1       0.61      0.15      0.25      1392

    accuracy                           0.89     12357
   macro avg       0.76      0.57      0.59     12357
weighted avg       0.87      0.89      0.86    

# Hyperparameter Tuning using GridSearchCV:-

In [25]:
params = {
    'criterion': ['gini', 'entropy', 'log_loss'],
    'max_depth': [3, 5, 7, None],
    'min_samples_split': [2, 5, 7],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [None, 'sqrt', 'log2'],
    'class_weight': [None, 'balanced']
}

In [27]:
#

from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(
    estimator=decisionTree,
    param_grid=params,
    scoring='f1',
    cv=5,
    verbose=1,
    n_jobs=-1
)

In [28]:
grid.fit(X_train, y_train)

Fitting 5 folds for each of 648 candidates, totalling 3240 fits


In [34]:
best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)

from sklearn.metrics import classification_report, accuracy_score
print("accuracy_score : ", accuracy_score(y_test, y_pred) )
print("Precision : ", precision_score(y_test, y_pred))
print("Recall : ", recall_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


accuracy_score :  0.7798818483450676
Precision :  0.24021909233176839
Recall :  0.4410919540229885
              precision    recall  f1-score   support

           0       0.92      0.82      0.87     10965
           1       0.24      0.44      0.31      1392

    accuracy                           0.78     12357
   macro avg       0.58      0.63      0.59     12357
weighted avg       0.84      0.78      0.81     12357



In [33]:
print(best_model)

DecisionTreeClassifier(class_weight='balanced', max_depth=7, min_samples_leaf=4,
                       random_state=42)
