# 2.4 Code Brief: Evaluate and Tune Decision Trees

Quick reference for tuning decision tree hyperparameters with a cross-validated grid search, evaluating the tuned model on the test set, and saving it.

## Setup

In [None]:
import pandas as pd
import numpy as np
import pickle
from sklearn.tree import DecisionTreeClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix

## Load Data

In [None]:
# Project directory layout: data files and saved models live under one root.
root_filepath = '/content/drive/MyDrive/projects/Applied-Data-Analytics-For-Higher-Education-Course-2/'
data_filepath = f'{root_filepath}data/'
models_path = f'{root_filepath}course_3/models/'

# Read the pre-split training and testing tables.
df_training = pd.read_csv(f'{data_filepath}training.csv')
df_testing = pd.read_csv(f'{data_filepath}testing.csv')

# The feature frames are the full tables, so they still contain the
# SEM_3_STATUS target column. The ColumnTransformer defined in this notebook
# uses remainder='drop', so only the explicitly listed feature columns are
# passed on to the classifier.
X_train = df_training
y_train = df_training['SEM_3_STATUS']

X_test = df_testing
y_test = df_testing['SEM_3_STATUS']

## Create Pipeline for Tuning

In [None]:
# Feature columns, grouped by how they are preprocessed.
numerical_columns = [
    'HS_GPA',
    'GPA_1', 'GPA_2',
    'DFW_RATE_1', 'DFW_RATE_2',
    'UNITS_ATTEMPTED_1', 'UNITS_ATTEMPTED_2',
]
categorical_columns = ['GENDER', 'RACE_ETHNICITY', 'FIRST_GEN_STATUS']

# One-hot encode the categorical columns. `drop` is positional: its i-th entry
# is the category removed from the i-th column of `categorical_columns`, so the
# two lists must stay in the same order.
# NOTE(review): with handle_unknown='ignore', a category unseen during fit is
# encoded as all zeros, the same as the dropped category — confirm that is acceptable.
categorical_encoder = OneHotEncoder(
    drop=['Female', 'Other', 'Unknown'],
    handle_unknown='ignore',
    sparse_output=False,
)

# Numerical columns pass through unchanged; every column not listed is dropped.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numerical_columns),
        ('cat', categorical_encoder, categorical_columns),
    ],
    remainder='drop',
)

# Untuned tree with a fixed seed; the grid search varies its hyperparameters.
decision_tree = DecisionTreeClassifier(random_state=42)

base_pipeline = Pipeline(steps=[
    ('preprocessing', preprocessor),
    ('classifier', decision_tree),
])

## Define Parameter Grid

In [None]:
# Search space for the grid search. The `classifier__` prefix addresses the
# pipeline step named 'classifier'. 6 * 4 * 4 * 2 = 192 candidate combinations.
param_grid = {
    # Tree depths 3 through 8 inclusive.
    'classifier__max_depth': list(range(3, 9)),
    # Minimum samples a node must hold before it may be split.
    'classifier__min_samples_split': [10, 20, 30, 50],
    # Minimum samples required in each resulting leaf.
    'classifier__min_samples_leaf': [5, 10, 15, 20],
    # Either reweight classes by inverse frequency or leave them unweighted.
    'classifier__class_weight': ['balanced', None],
}

## Run Grid Search

In [None]:
# Shuffled, stratified 5-fold split with a fixed seed so folds are reproducible.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Encode the target as 1 where SEM_3_STATUS == 'N' and 0 otherwise, so that
# scoring='f1' measures F1 for the 'N' class.
y_train_binary = (y_train == 'N').astype(int)

grid_search = GridSearchCV(
    estimator=base_pipeline,
    param_grid=param_grid,
    scoring='f1',
    cv=cv,
    n_jobs=-1,
    return_train_score=True,
)
grid_search.fit(X_train, y_train_binary)

# Report the winning combination without the pipeline step prefix.
print("Best Parameters:")
for param_name, param_value in grid_search.best_params_.items():
    short_name = param_name.replace('classifier__', '')
    print(f"  {short_name}: {param_value}")
print(f"\nBest CV F1 Score: {grid_search.best_score_:.4f}")

## Evaluate Best Model on Test Set

In [None]:
# Best pipeline found by the grid search.
best_model = grid_search.best_estimator_

# 0/1 encoding of the test target (1 where SEM_3_STATUS == 'N'). The metrics
# below are computed on the string labels, so this is not used in this cell.
y_test_binary = (y_test == 'N').astype(int)

# The model was fit on 0/1 targets; map its predictions back to string labels
# so they can be compared against y_test.
# NOTE(review): mapping 0 -> 'E' assumes SEM_3_STATUS takes only the values
# 'E' and 'N' — confirm against the data.
y_pred = best_model.predict(X_test)
y_pred_labels = np.where(y_pred == 1, 'N', 'E')

# Headline metrics treat 'N' as the positive class.
test_f1 = f1_score(y_test, y_pred_labels, pos_label='N')
print(f"Test F1 Score: {test_f1:.4f}")

test_precision = precision_score(y_test, y_pred_labels, pos_label='N')
print(f"Test Precision: {test_precision:.4f}")

test_recall = recall_score(y_test, y_pred_labels, pos_label='N')
print(f"Test Recall: {test_recall:.4f}")

test_accuracy = accuracy_score(y_test, y_pred_labels)
print(f"Test Accuracy: {test_accuracy:.4f}")

print("\nClassification Report:")
report_text = classification_report(y_test, y_pred_labels)
print(report_text)

## Save Tuned Model

In [None]:
# Retrain on the original string labels (y_train) rather than the 0/1 encoding
# used during tuning, so the saved model predicts SEM_3_STATUS values directly.
best_params = grid_search.best_params_

# Same structure as the tuning pipeline. The winning hyperparameters are
# applied with set_params, which understands the `classifier__<name>` keys, so
# every tuned parameter is carried over even if the search grid changes.
final_model = Pipeline([
    ('preprocessing', preprocessor),
    ('classifier', DecisionTreeClassifier(random_state=42))
])
final_model.set_params(**best_params)
final_model.fit(X_train, y_train)

# Use a context manager so the file handle is flushed and closed even if
# pickling fails; a bare open() here would leave the handle open.
with open(f'{models_path}tuned_decision_tree_final.pkl', 'wb') as model_file:
    pickle.dump(final_model, model_file)
print("Saved tuned model.")