In [13]:
import os

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier, export_text


In [17]:
# Location of the bank marketing CSV. Set the BANK_CSV_PATH environment
# variable to run the notebook on another machine; when it is unset the
# original author's local path is used, so existing behaviour is unchanged.
DATA_PATH = os.environ.get(
    "BANK_CSV_PATH",
    "C:/Users/srcha/OneDrive/Desktop/bank+marketing/bank/bank.csv",
)

# The file is semicolon-delimited, so the separator has to be given explicitly.
data = pd.read_csv(DATA_PATH, sep=';')

# read_csv already returns a DataFrame; take an explicit copy so that later
# modifications of `data` cannot alter the raw frame displayed here.
df = data.copy()
df

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,no
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4516,33,services,married,secondary,no,-333,yes,no,cellular,30,jul,329,5,-1,0,unknown,no
4517,57,self-employed,married,tertiary,yes,-3313,yes,yes,unknown,9,may,153,1,-1,0,unknown,no
4518,57,technician,married,secondary,no,295,no,no,cellular,19,aug,151,11,-1,0,unknown,no
4519,28,blue-collar,married,secondary,no,1137,no,no,cellular,6,feb,129,4,211,3,other,no


In [21]:
# Encode categorical variables.
# One LabelEncoder is kept per column so every integer code can be mapped back
# to its original label (e.g. label_encoders["month"].classes_). Re-fitting a
# single shared encoder would discard every mapping except the last one, which
# makes splits on encoded columns impossible to interpret.
# The encoding is written to a copy, leaving the raw `data` frame untouched so
# this cell can be re-run without losing the fitted encoders.
data_encoded = data.copy()
label_encoders = {}
le = LabelEncoder()  # placeholder; replaced below whenever a column is encoded
for col in data.select_dtypes(include='object').columns:
    le = LabelEncoder()
    data_encoded[col] = le.fit_transform(data[col])
    label_encoders[col] = le
# After the loop `le` is the encoder fitted on the last categorical column,
# which matches the state the shared encoder used to end up in.

# Define features (X) and target (y)
X = data_encoded.drop("y", axis=1)  # 'y' is the target column
y = data_encoded["y"]

# Split the data into training and testing sets (80% / 20%, fixed seed).
# NOTE(review): the classes are imbalanced (the recorded test split holds 98
# positives out of 905 rows); consider passing stratify=y. That would change
# the split and therefore every downstream metric, so it is not applied here.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [23]:
# Build a depth-limited Gini decision tree with a fixed seed, then fit it on
# the training split. The fit call is left as the last expression so the
# fitted estimator is shown as the cell output.
tree_params = dict(criterion="gini", max_depth=5, random_state=42)
clf = DecisionTreeClassifier(**tree_params)
clf.fit(X_train, y_train)


In [25]:
# Predict labels for the held-out test split
y_pred = clf.predict(X_test)

# Compute the three evaluation artefacts first, then report them
test_accuracy = accuracy_score(y_test, y_pred)
test_confusion = confusion_matrix(y_test, y_pred)
test_report = classification_report(y_test, y_pred)

print("Accuracy:", test_accuracy)
print("\nConfusion Matrix:\n", test_confusion)
print("\nClassification Report:\n", test_report)


Accuracy: 0.8983425414364641

Confusion Matrix:
 [[781  26]
 [ 66  32]]

Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.97      0.94       807
           1       0.55      0.33      0.41        98

    accuracy                           0.90       905
   macro avg       0.74      0.65      0.68       905
weighted avg       0.88      0.90      0.89       905



In [27]:
# Render the fitted tree as indented text rules, labelling each split with the
# corresponding feature column name instead of a positional index.
feature_names = X.columns.tolist()
tree_rules = export_text(clf, feature_names=feature_names)
print(tree_rules)


|--- duration <= 645.50
|   |--- duration <= 211.50
|   |   |--- month <= 9.50
|   |   |   |--- age <= 20.00
|   |   |   |   |--- balance <= 51.50
|   |   |   |   |   |--- class: 0
|   |   |   |   |--- balance >  51.50
|   |   |   |   |   |--- class: 1
|   |   |   |--- age >  20.00
|   |   |   |   |--- age <= 70.50
|   |   |   |   |   |--- class: 0
|   |   |   |   |--- age >  70.50
|   |   |   |   |   |--- class: 0
|   |   |--- month >  9.50
|   |   |   |--- day <= 29.50
|   |   |   |   |--- housing <= 0.50
|   |   |   |   |   |--- class: 0
|   |   |   |   |--- housing >  0.50
|   |   |   |   |   |--- class: 0
|   |   |   |--- day >  29.50
|   |   |   |   |--- class: 1
|   |--- duration >  211.50
|   |   |--- pdays <= 22.50
|   |   |   |--- age <= 60.50
|   |   |   |   |--- month <= 9.50
|   |   |   |   |   |--- class: 0
|   |   |   |   |--- month >  9.50
|   |   |   |   |   |--- class: 0
|   |   |   |--- age >  60.50
|   |   |   |   |--- day <= 16.50
|   |   |   |   |   |--- class: 0


In [29]:
# Define the parameter grid: 4 depths x 3 split thresholds x 2 criteria,
# i.e. 24 candidate trees.
param_grid = {
    'max_depth': [3, 5, 7, 10],
    'min_samples_split': [2, 5, 10],
    'criterion': ['gini', 'entropy']
}

# Perform Grid Search with 5-fold cross-validation on the training split only,
# so the test split stays untouched by model selection.
grid_search = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    param_grid,
    scoring="accuracy",  # explicit; same metric a classifier uses by default
    cv=5,
)
grid_search.fit(X_train, y_train)

# Best parameters and accuracy. best_score_ is the mean cross-validated score
# of the winning candidate, not a test-set score.
print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy:", grid_search.best_score_)


Best Parameters: {'criterion': 'gini', 'max_depth': 7, 'min_samples_split': 10}
Best Accuracy: 0.8926988529989378
