# 2.2 Code Brief: Build Decision Tree Classification Models

Quick reference for building decision tree pipelines.

## Setup

In [None]:
import pandas as pd
import pickle
import os
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OneHotEncoder

## Load Data

In [None]:
# Project root; the path looks like a Colab Google Drive mount (verify for
# your environment). Every other location is derived from it.
root_filepath = '/content/drive/MyDrive/projects/Applied-Data-Analytics-For-Higher-Education-Course-2/'
data_filepath = root_filepath + 'data/'
course3_filepath = root_filepath + 'course_3/'

# Read the training split once; features and target both come from this frame.
df_training = pd.read_csv(data_filepath + 'training.csv')

# Target series, then the feature frame. X_train is the *whole* frame (the
# SEM_3_STATUS column is still in it), so whatever consumes X_train is
# responsible for selecting the feature columns it actually wants.
y_train = df_training['SEM_3_STATUS']
X_train = df_training

## Define Feature Groups

In [None]:
# Numeric features, listed in the order they are handed to the preprocessor.
numerical_columns = [
    'HS_GPA',
    'GPA_1',
    'GPA_2',
    'DFW_RATE_1',
    'DFW_RATE_2',
    'UNITS_ATTEMPTED_1',
    'UNITS_ATTEMPTED_2',
]

# Categorical features. Order matters: any per-column encoder setting that is
# given as a list is matched to these names by position.
categorical_columns = [
    'GENDER',
    'RACE_ETHNICITY',
    'FIRST_GEN_STATUS',
]

## Create Preprocessor (No Scaling Needed)

In [None]:
# Column-wise preprocessing for the tree models:
#   * 'num' - numeric columns are passed through unchanged (no scaler).
#   * 'cat' - categorical columns are one-hot encoded into a dense array.
# Any column not named in either group is discarded (remainder='drop').
preprocessor_dt = ColumnTransformer(
    remainder='drop',
    transformers=[
        ('num', 'passthrough', numerical_columns),
        (
            'cat',
            OneHotEncoder(
                # One reference level is dropped per categorical column, matched
                # by position to categorical_columns:
                # GENDER -> 'Female', RACE_ETHNICITY -> 'Other',
                # FIRST_GEN_STATUS -> 'Unknown'.
                drop=['Female', 'Other', 'Unknown'],
                # NOTE(review): with a dropped level, a category unseen at fit
                # time is encoded as all zeros, i.e. the same row as the dropped
                # reference level - confirm that is acceptable.
                handle_unknown='ignore',
                sparse_output=False,
            ),
            categorical_columns,
        ),
    ],
)

## Build Basic Decision Tree

In [None]:
# Baseline pipeline: shared preprocessing followed by an unrestricted tree.
# With no depth cap and the smallest split/leaf sizes the tree is free to keep
# splitting, so expect it to overfit the training data.
basic_dt_model = Pipeline(
    steps=[
        ('preprocessing', preprocessor_dt),
        (
            'classifier',
            DecisionTreeClassifier(
                random_state=42,  # fixed seed for reproducible trees
                criterion='gini',
                max_depth=None,  # no depth limit
                min_samples_leaf=1,
                min_samples_split=2,
            ),
        ),
    ]
)

## Build Constrained Decision Tree

In [None]:
# Regularised pipeline: same preprocessing, but the tree is held back by a
# depth cap and by minimum sample counts for splitting a node and for a leaf.
constrained_dt_model = Pipeline(
    steps=[
        ('preprocessing', preprocessor_dt),
        (
            'classifier',
            DecisionTreeClassifier(
                random_state=42,  # fixed seed for reproducible trees
                criterion='gini',
                max_depth=5,  # at most five levels of splits
                min_samples_leaf=10,
                min_samples_split=20,
            ),
        ),
    ]
)

## Build Balanced Decision Tree

In [None]:
# Class-weighted pipeline: identical size limits to the constrained tree, plus
# class_weight='balanced' so classes are weighted inversely to how often they
# occur in the training labels (compensates for class imbalance).
balanced_dt_model = Pipeline(
    steps=[
        ('preprocessing', preprocessor_dt),
        (
            'classifier',
            DecisionTreeClassifier(
                random_state=42,  # fixed seed for reproducible trees
                class_weight='balanced',
                criterion='gini',
                max_depth=5,
                min_samples_leaf=10,
                min_samples_split=20,
            ),
        ),
    ]
)

## Save Models

In [None]:
# Map of output file stem -> pipeline object to persist.
# NOTE(review): no .fit() call appears in this section, so these look like
# unfitted pipeline definitions at save time - confirm that is intended.
models_to_save = {
    'basic_decision_tree_model': basic_dt_model,
    'constrained_decision_tree_model': constrained_dt_model,
    'balanced_decision_tree_model': balanced_dt_model
}

# Create the output directory if needed; exist_ok avoids an error on re-runs.
models_path = f'{course3_filepath}models/'
os.makedirs(models_path, exist_ok=True)

for name, model in models_to_save.items():
    filepath = f'{models_path}{name}.pkl'
    # Use a context manager so the handle is flushed and closed even if
    # pickling raises; a bare open() here would leak the file handle.
    with open(filepath, 'wb') as model_file:
        pickle.dump(model, model_file)
    print(f"Saved: {filepath}")

## Key Parameters

| Parameter | Description | Effect |
|:----------|:------------|:-------|
| `max_depth` | Maximum tree depth | Lower = simpler |
| `min_samples_split` | Min samples to split | Higher = simpler |
| `min_samples_leaf` | Min samples in leaf | Higher = simpler |
| `class_weight` | Handle imbalance | 'balanced' = weight classes inversely to their frequency |