In [2]:
import numpy as np 
import pandas as pd

In [3]:
# Load the dataset; the path is resolved relative to the current working directory.
DATA_PATH = 'bank.csv'
df = pd.read_csv(DATA_PATH)

# Preview the first five rows as the cell output.
df.head(5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit
0,59,admin.,married,secondary,no,2343,yes,no,unknown,5,may,1042,1,-1,0,unknown,yes
1,56,admin.,married,secondary,no,45,no,no,unknown,5,may,1467,1,-1,0,unknown,yes
2,41,technician,married,secondary,no,1270,yes,no,unknown,5,may,1389,1,-1,0,unknown,yes
3,55,services,married,secondary,no,2476,yes,no,unknown,5,may,579,1,-1,0,unknown,yes
4,54,admin.,married,tertiary,no,184,no,no,unknown,5,may,673,2,-1,0,unknown,yes


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer


# Features are every column except the target; the target is the 'deposit'
# column encoded as 0 ('no') / 1 ('yes').
label_codes = {'no': 0, 'yes': 1}
X = df.drop(columns=['deposit'])
y = df['deposit'].map(label_codes).astype(int)

In [5]:
# Split the feature columns by dtype. Numeric columns are selected positively
# and everything else is treated as categorical, so text columns stored in a
# non-object dtype (e.g. a dedicated string or category dtype) are not
# mistakenly routed to the numeric (median-imputation) branch.
num_cols = X.select_dtypes(include='number').columns.tolist()
cat_cols = X.select_dtypes(exclude='number').columns.tolist()

In [6]:
# Numeric features are taken as they are.
X_nums = X.loc[:, num_cols]
# In the categorical features, the literal string 'unknown' is converted to NaN
# so that it is handled as a missing value by the imputer downstream.
X_cats = X.loc[:, cat_cols].replace(to_replace='unknown', value=np.nan)

In [7]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

In [8]:
# Numeric branch: fill missing values with the column median.
numeric_steps = [('imputer', SimpleImputer(strategy='median'))]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode into a dense array (categories unseen at fit time are ignored).
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
]
categorical_transformer = Pipeline(steps=categorical_steps)

In [9]:
# Route each group of columns to its own preprocessing pipeline.
column_routes = [
    ('num', numeric_transformer, num_cols),
    ('cat', categorical_transformer, cat_cols),
]
preprocess = ColumnTransformer(transformers=column_routes)

In [11]:
# Reassemble the feature frame (numeric columns first, then categorical) and
# hold out 20% of the rows for testing, stratified on the target.
features = pd.concat([X_nums, X_cats], axis=1)
X_train, X_test, y_train, y_test = train_test_split(
    features, y, test_size=0.2, stratify=y, random_state=42
)
X_train.shape, X_test.shape

((8929, 16), (2233, 16))

In [12]:
import pandas as pd
from sklearn.base import clone
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.tree import DecisionTreeClassifier

In [None]:
def train_and_evaluate(max_depth, criterion):
    """Fit a preprocessing + decision-tree pipeline and score it on the test split.

    Reads the notebook-level ``preprocess``, ``X_train``, ``y_train``,
    ``X_test`` and ``y_test``.

    Parameters
    ----------
    max_depth : int or None
        Passed to ``DecisionTreeClassifier(max_depth=...)``.
    criterion : str
        Passed to ``DecisionTreeClassifier(criterion=...)``.

    Returns
    -------
    pipe : Pipeline
        The pipeline fitted on ``X_train`` / ``y_train``.
    metrics : dict
        The two hyper-parameters plus accuracy, precision, recall and F1
        computed from the predictions on ``X_test``.
    cm : array
        Confusion matrix of ``y_test`` versus the predictions.

    NOTE(review): all scores come from the test split; if this table is used
    to choose hyper-parameters, consider a separate validation split or
    cross-validation so the final test estimate stays unbiased.
    """
    clf = DecisionTreeClassifier(max_depth=max_depth, criterion=criterion, random_state=42)
    pipe = Pipeline([
        # Use a fresh, unfitted copy of the shared preprocessor: fitting a
        # Pipeline fits its steps in place, so without the clone every
        # pipeline returned by this function would hold (and refit) the very
        # same ColumnTransformer object.
        ('preprocess', clone(preprocess)),
        ('clf', clf)
    ])
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    metrics = {
        'max_depth': max_depth,
        'criterion': criterion,
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred),
        'recall': recall_score(y_test, y_pred),
        'f1': f1_score(y_test, y_pred)
    }
    cm = confusion_matrix(y_test, y_pred)
    return pipe, metrics, cm


# Hyper-parameter grid: every depth is combined with every split criterion.
depth_grid = [3, 5, 7, 9, 11, 13, None]
criterion_grid = ['gini', 'entropy']

# `experiments` keeps [pipeline, metrics, confusion matrix] per run;
# `res` keeps only the metric dicts for the summary table.
experiments, res = [], []

for md in depth_grid:
    for crit in criterion_grid:
        pipe, metrics, cm = train_and_evaluate(md, crit)
        experiments.append([pipe, metrics, cm])
        res.append(metrics)

# One row per (max_depth, criterion) combination.
ans = pd.DataFrame(res)
print(ans)