# 3.4 Code Brief: Tune Random Forest Hyperparameters

Quick reference for hyperparameter tuning with Grid Search and Randomized Search.

## Setup

In [None]:
import pandas as pd
import numpy as np
import pickle
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report
from scipy.stats import randint

## Load Data

In [None]:
# Project directories: raw data and saved models live under one common root.
root_filepath = '/content/drive/MyDrive/projects/Applied-Data-Analytics-For-Higher-Education-Course-2/'
data_filepath = f'{root_filepath}data/'
models_path = f'{root_filepath}course_3/models/'

df_training = pd.read_csv(f'{data_filepath}training.csv')
df_testing = pd.read_csv(f'{data_filepath}testing.csv')

# Separate the label from the features. Leaving the target inside X is a
# leakage hazard: it would silently become a predictor the moment a
# preprocessing step passes extra columns through instead of dropping them.
target_column = 'SEM_3_STATUS'
X_train, y_train = df_training.drop(columns=[target_column]), df_training[target_column]
X_test, y_test = df_testing.drop(columns=[target_column]), df_testing[target_column]

## Create Pipeline

In [None]:
# Column groups, one per preprocessing strategy.
minmax_columns = ['HS_GPA', 'GPA_1', 'GPA_2', 'DFW_RATE_1', 'DFW_RATE_2']
standard_columns = ['UNITS_ATTEMPTED_1', 'UNITS_ATTEMPTED_2']
categorical_columns = ['GENDER', 'RACE_ETHNICITY', 'FIRST_GEN_STATUS']

# `drop` is given as a list, so it is matched positionally against
# `categorical_columns`: one reference category per categorical feature.
onehot_encoder = OneHotEncoder(
    handle_unknown='ignore',
    drop=['Female', 'Other', 'Unknown'],
    sparse_output=False,
)

# Each column group gets its own transformer; any column not listed in one of
# the three groups is discarded.
preprocessor = ColumnTransformer(
    transformers=[
        ('minmax', MinMaxScaler(), minmax_columns),
        ('standard', StandardScaler(), standard_columns),
        ('onehot', onehot_encoder, categorical_columns),
    ],
    remainder='drop',
)

# Base forest; the searches below override individual hyperparameters through
# the `classifier__` prefix.
forest = RandomForestClassifier(
    bootstrap=True,
    oob_score=True,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1,
)

rf_pipeline = Pipeline(steps=[
    ('preprocessing', preprocessor),
    ('classifier', forest),
])

## Grid Search

In [None]:
# Exhaustive grid: 2 * 3 * 2 * 2 = 24 candidate settings, each scored with
# the cross-validation splitter defined below.
param_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [10, 15, None],
    'classifier__max_features': ['sqrt', 'log2'],
    'classifier__min_samples_split': [2, 5]
}

# Seeded, shuffled, stratified 5-fold splitter.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

grid_search = GridSearchCV(
    estimator=rf_pipeline,
    param_grid=param_grid,
    cv=cv,
    scoring='roc_auc',
    n_jobs=-1,
    return_train_score=True,
)
grid_search.fit(X_train, y_train)

# Strip the pipeline step prefix so only the bare hyperparameter names print.
grid_best = {
    key.replace('classifier__', ''): setting
    for key, setting in grid_search.best_params_.items()
}

print("Grid Search Best Parameters:")
for name, setting in grid_best.items():
    print(f"  {name}: {setting}")
print(f"Best CV ROC-AUC: {grid_search.best_score_:.4f}")

## Randomized Search

In [None]:
# Sampling space for the randomized search. `randint(low, high)` draws
# integers from low up to, but not including, high.
param_distributions = {
    'classifier__n_estimators': randint(50, 500),
    'classifier__max_depth': [5, 10, 15, 20, 25, 30, None],
    'classifier__max_features': ['sqrt', 'log2', 0.2, 0.3, 0.4, 0.5],
    'classifier__min_samples_split': randint(2, 20),
    'classifier__min_samples_leaf': randint(1, 10)
}

# 50 sampled candidates, each scored on the `cv` splitter with ROC-AUC.
random_search = RandomizedSearchCV(
    estimator=rf_pipeline,
    param_distributions=param_distributions,
    n_iter=50,
    cv=cv,
    scoring='roc_auc',
    n_jobs=-1,
    random_state=42,
    return_train_score=True,
)
random_search.fit(X_train, y_train)

# Strip the pipeline step prefix so only the bare hyperparameter names print.
random_best = {
    key.replace('classifier__', ''): setting
    for key, setting in random_search.best_params_.items()
}

print("Randomized Search Best Parameters:")
for name, setting in random_best.items():
    print(f"  {name}: {setting}")
print(f"Best CV ROC-AUC: {random_search.best_score_:.4f}")

## Select and Evaluate Best Model

In [None]:
# Keep whichever search achieved the higher cross-validated ROC-AUC; a tie
# goes to the randomized search.
if random_search.best_score_ >= grid_search.best_score_:
    best_model = random_search.best_estimator_
else:
    best_model = grid_search.best_estimator_

# Hard class predictions, plus the probability column at index 1 for ROC-AUC.
y_pred = best_model.predict(X_test)
y_prob = best_model.predict_proba(X_test)[:, 1]

# Metrics computed from the hard predictions, in print order.
label_metrics = (
    ("Accuracy", accuracy_score),
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1-Score", f1_score),
)

print("Test Set Performance:")
for label, metric in label_metrics:
    print(f"  {label}: {metric(y_test, y_pred):.4f}")
# ROC-AUC is the only metric scored on probabilities rather than labels.
print(f"  ROC-AUC: {roc_auc_score(y_test, y_prob):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

## Save Tuned Model

In [None]:
# Persist the selected pipeline (preprocessing + tuned forest). The `with`
# block guarantees the file is flushed and closed even if pickling fails;
# the bare `open(...)` used previously left the handle open.
with open(f'{models_path}rf_tuned_best_model.pkl', 'wb') as model_file:
    pickle.dump(best_model, model_file)
print("Saved tuned model.")

## Key Hyperparameters

| Parameter | Description | Typical Range |
|:----------|:------------|:--------------|
| `n_estimators` | Number of trees | 100-500 |
| `max_depth` | Maximum tree depth | 5-50 or None |
| `max_features` | Features considered at each split | 'sqrt', 'log2', float |
| `min_samples_split` | Min samples required to split a node | 2-20 |
| `min_samples_leaf` | Min samples required in a leaf | 1-10 |