# 3.2 Code Brief: Build Random Forest Classification Models

Quick reference for building random forest pipelines.

## Setup

In [None]:
import os
import pickle

import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder

## Load Data

In [None]:
# Project directories. Every path ends with '/' so that file and folder
# names can be appended directly.
root_filepath = '/content/drive/MyDrive/projects/Applied-Data-Analytics-For-Higher-Education-Course-2/'
data_filepath = root_filepath + 'data/'
course3_filepath = root_filepath + 'course_3/'

# Read the training set from the data directory.
df_training = pd.read_csv(data_filepath + 'training.csv')

# Target vector is the SEM_3_STATUS column.
y_train = df_training['SEM_3_STATUS']

# X_train is the very same DataFrame object as df_training (no copy is
# made), so it still contains SEM_3_STATUS alongside the feature columns.
# Whatever consumes X_train is responsible for selecting feature columns.
X_train = df_training

## Define Feature Groups and Preprocessor

In [None]:
# Feature columns, grouped by the transformation each group receives.
minmax_columns = [
    'HS_GPA',
    'GPA_1',
    'GPA_2',
    'DFW_RATE_1',
    'DFW_RATE_2',
]
standard_columns = [
    'UNITS_ATTEMPTED_1',
    'UNITS_ATTEMPTED_2',
]
categorical_columns = [
    'GENDER',
    'RACE_ETHNICITY',
    'FIRST_GEN_STATUS',
]

# Column-wise preprocessing: min-max scaling, standardization, and one-hot
# encoding. Columns not named in any group are discarded (remainder='drop').
preprocessor = ColumnTransformer(
    [
        ('minmax', MinMaxScaler(), minmax_columns),
        ('standard', StandardScaler(), standard_columns),
        (
            'onehot',
            OneHotEncoder(
                # One dropped (reference) category per categorical column,
                # matched to categorical_columns by position.
                # NOTE(review): assumes 'Female', 'Other' and 'Unknown' are
                # actual values of GENDER, RACE_ETHNICITY and
                # FIRST_GEN_STATUS respectively -- confirm against the data.
                drop=['Female', 'Other', 'Unknown'],
                # NOTE(review): per the scikit-learn docs, an unseen category
                # is encoded as all zeros, which is the same encoding as the
                # dropped reference category -- confirm this is acceptable.
                handle_unknown='ignore',
                sparse_output=False,
            ),
            categorical_columns,
        ),
    ],
    remainder='drop',
)

## Build Baseline Random Forest

In [None]:
# Baseline random forest: 100 trees with no depth limit, wrapped in a
# pipeline so preprocessing and classification are fit together.
rf_baseline_model = Pipeline([
    # Pipeline keeps a reference to the step object it is given rather than
    # copying it. clone() hands this pipeline its own unfitted copy of the
    # preprocessor, so fitting this model never refits (or is refit by) a
    # transformer held by another pipeline built from the same object.
    ('preprocessing', clone(preprocessor)),
    ('classifier', RandomForestClassifier(
        n_estimators=100,
        max_depth=None,
        max_features='sqrt',
        min_samples_split=2,
        min_samples_leaf=1,
        bootstrap=True,
        oob_score=True,            # out-of-bag scoring; requires bootstrap=True
        class_weight='balanced',
        random_state=42,           # fixed seed for reproducible forests
        n_jobs=-1
    ))
])

## Build Large Random Forest (500 trees)

In [None]:
# Large random forest: same settings as the baseline forest except for
# 500 trees instead of 100.
rf_large_model = Pipeline([
    # Pipeline keeps a reference to the step object it is given rather than
    # copying it. clone() hands this pipeline its own unfitted copy of the
    # preprocessor, so fitting this model never refits (or is refit by) a
    # transformer held by another pipeline built from the same object.
    ('preprocessing', clone(preprocessor)),
    ('classifier', RandomForestClassifier(
        n_estimators=500,
        max_depth=None,
        max_features='sqrt',
        # Spelled out (these are the library defaults) so the only visible
        # difference from the baseline configuration is n_estimators.
        min_samples_split=2,
        min_samples_leaf=1,
        bootstrap=True,
        oob_score=True,            # out-of-bag scoring; requires bootstrap=True
        class_weight='balanced',
        random_state=42,           # fixed seed for reproducible forests
        n_jobs=-1
    ))
])

## Build Constrained Random Forest

In [None]:
# Constrained random forest: 100 trees limited to depth 10, with larger
# minimum split (5) and leaf (2) sizes than the baseline forest.
rf_constrained_model = Pipeline([
    # Pipeline keeps a reference to the step object it is given rather than
    # copying it. clone() hands this pipeline its own unfitted copy of the
    # preprocessor, so fitting this model never refits (or is refit by) a
    # transformer held by another pipeline built from the same object.
    ('preprocessing', clone(preprocessor)),
    ('classifier', RandomForestClassifier(
        n_estimators=100,
        max_depth=10,
        max_features='sqrt',
        min_samples_split=5,
        min_samples_leaf=2,
        bootstrap=True,
        oob_score=True,            # out-of-bag scoring; requires bootstrap=True
        class_weight='balanced',
        random_state=42,           # fixed seed for reproducible forests
        n_jobs=-1
    ))
])

## Save Models

In [None]:
# Pipelines to persist, keyed by the file stem used for each pickle.
models = {
    'rf_baseline_model': rf_baseline_model,
    'rf_large_500_model': rf_large_model,
    'rf_constrained_model': rf_constrained_model
}

# Create the output directory if it does not exist yet.
models_path = f'{course3_filepath}models/'
os.makedirs(models_path, exist_ok=True)

for name, model in models.items():
    filepath = f'{models_path}{name}.pkl'
    # The context manager closes (and therefore flushes) the file as soon as
    # the dump finishes, instead of leaving the handle open until garbage
    # collection.
    with open(filepath, 'wb') as model_file:
        pickle.dump(model, model_file)
    print(f"Saved: {filepath}")

## Key Parameters

| Parameter | Description | Default |
|:----------|:------------|:--------|
| `n_estimators` | Number of trees | 100 |
| `max_depth` | Maximum tree depth | None |
| `max_features` | Features per split | 'sqrt' |
| `bootstrap` | Use bootstrap sampling | True |
| `oob_score` | Calculate OOB score | False |
| `class_weight` | Handle imbalance | None |