## Importing Required Libraries

In [3]:
import os

import joblib
import pandas as pd
from sklearn.metrics import accuracy_score, recall_score, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

## Loading the Dataset

In [4]:
# Read the pre-split training and test sets from the local dataset folder.
train, test = (
    pd.read_csv("./dataset/train.csv"),
    pd.read_csv("./dataset/test.csv"),
)

## Splitting Features and Target

In [5]:
# Positional split: every column but the last is a feature, the last one is
# the label. NOTE(review): assumes the target is the final CSV column - confirm.
X_train, y_train = train.iloc[:, :-1], train.iloc[:, -1]

In [6]:
# Same positional split as for the training frame: all columns but the last
# are features, the last one is the label.
X_test, y_test = test.iloc[:, :-1], test.iloc[:, -1]

## GridSearch for Hyperparameter Tuning (Pre Pruning)

In [7]:
# Candidate hyperparameters for the decision tree: 3 criteria x 2 splitters,
# i.e. 6 combinations in total.
# NOTE(review): neither parameter limits tree growth (no max_depth /
# min_samples_* candidates), so this grid does not pre-prune by itself.
# NOTE(review): the scikit-learn docs describe 'entropy' and 'log_loss' as the
# same Shannon information gain criterion, so those two look redundant - confirm.
grid = dict(
    criterion=['gini', 'entropy', 'log_loss'],
    splitter=['best', 'random'],
)

In [8]:
def custom_scorer(y_test, y_hat):
    """Return the unweighted mean of accuracy and recall for a set of predictions.

    Averaging the two metrics rewards models that are accurate overall while
    still recovering the positive class.

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels.
    y_hat : array-like
        Predicted labels, aligned with ``y_test``.

    Returns
    -------
    float
        ``(accuracy + recall) / 2``.

    Notes
    -----
    ``recall_score`` is called with its default arguments.
    NOTE(review): presumably the target is binary with positive label 1 - confirm.
    """
    component_scores = (
        accuracy_score(y_test, y_hat),
        recall_score(y_test, y_hat),
    )
    return sum(component_scores) / len(component_scores)

# Wrap the metric so GridSearchCV can use it; higher values are better,
# which is also make_scorer's default.
scorer = make_scorer(custom_scorer, greater_is_better=True)

In [9]:
# Exhaustive search over `grid`, ranking candidates with `scorer` under
# 10-fold cross-validation.
# The tree is seeded so the search is reproducible: decision trees shuffle
# features at every split (and splitter='random' draws random thresholds), so
# without a fixed random_state the CV scores, best_params_ and the test metrics
# can change between runs.
grid_search = GridSearchCV(DecisionTreeClassifier(random_state = 42),
                         param_grid = grid,
                         scoring = scorer,
                         cv = 10)

## Training the Models

In [14]:
# Run the cross-validated search over every parameter combination in `grid`.
grid_search.fit(X_train, y_train)

In [15]:
# Show the parameter combination that achieved the best cross-validated score.
grid_search.best_params_

{'criterion': 'log_loss', 'splitter': 'best'}

## Evaluation on Test Data

In [17]:
# Keep the estimator that the grid search selected as best; it is the model
# evaluated and saved in the following cells.
decision_tree_model = grid_search.best_estimator_

In [18]:
# Predict on the held-out test set and report the same two metrics that the
# custom scorer combines.
y_hat = decision_tree_model.predict(X_test)

test_accuracy = accuracy_score(y_test, y_hat)
print('Accuracy:', test_accuracy)

test_recall = recall_score(y_test, y_hat)
print('Recall:', test_recall)

Accuracy: 0.9239766081871345
Recall: 0.9345794392523364


## Saving Model to File

In [19]:
# joblib.dump does not create missing parent directories, so make sure
# ./models exists first (e.g. on a fresh checkout where the folder is absent).
os.makedirs('./models', exist_ok=True)
# Persist the selected model; the call returns the list of written file paths.
joblib.dump(decision_tree_model, './models/decision_tree_model.joblib')

['./models/decision_tree_model.joblib']