# 3.3 Code Brief: Train and Evaluate Random Forests

Quick reference for training, evaluating, and analyzing random forests.

## Setup

In [None]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report

## Load Data and Models

In [None]:
# Locations on the mounted Drive: datasets live under data/, pickled
# pipelines under course_3/models/.
root_filepath = '/content/drive/MyDrive/projects/Applied-Data-Analytics-For-Higher-Education-Course-2/'
data_filepath = f'{root_filepath}data/'
models_path = f'{root_filepath}course_3/models/'

df_training = pd.read_csv(f'{data_filepath}training.csv')
df_testing = pd.read_csv(f'{data_filepath}testing.csv')

# The target is the SEM_3_STATUS column of each split.
# NOTE(review): X_train / X_test are the full frames and therefore still
# contain SEM_3_STATUS. This is only leak-free if the pipeline's
# preprocessing step selects its feature columns explicitly -- confirm.
X_train, y_train = df_training, df_training['SEM_3_STATUS']
X_test, y_test = df_testing, df_testing['SEM_3_STATUS']

# Open the pickle inside a context manager so the file handle is closed
# deterministically instead of being left to the garbage collector.
# NOTE: unpickling executes arbitrary code; only load model files you trust.
with open(f'{models_path}rf_baseline_model.pkl', 'rb') as model_file:
    rf_baseline = pickle.load(model_file)

## Train Model

In [None]:
# Fit the whole pipeline on the training split, then pull out the step named
# 'classifier' so its attributes can be inspected directly.
rf = rf_baseline.fit(X_train, y_train).named_steps['classifier']

# NOTE(review): oob_score_ presumably requires the classifier to have been
# configured with out-of-bag scoring enabled -- confirm against the saved model.
print(f"Number of trees: {rf.n_estimators}")
print(f"OOB Score: {rf.oob_score_:.4f}")

## Cross-Validation

In [None]:
# Metrics to report; each name doubles as its scikit-learn scorer string.
metric_names = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
scoring = {name: name for name in metric_names}

# 5-fold stratified split, shuffled with a fixed seed for repeatability.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

results = cross_validate(
    rf_baseline,
    X_train,
    y_train,
    cv=cv,
    scoring=scoring,
    return_train_score=True,
)

print("Cross-Validation Results:")
for metric in metric_names:
    # Per-fold validation scores for this metric.
    fold_scores = results[f'test_{metric}']
    # Report the mean across folds with a +/- two-standard-deviation band.
    print(f"  {metric}: {fold_scores.mean():.4f} (+/- {fold_scores.std()*2:.4f})")

## Feature Importance

In [None]:
# Rebuild the feature names in the order the preprocessing step emits them:
# the first two transformers contribute their input column lists as-is, the
# third contributes the expanded names reported by get_feature_names_out.
preprocessor = rf_baseline.named_steps['preprocessing']
minmax_entry, standard_entry, onehot_entry = preprocessor.transformers_[:3]

# Each entry is indexed as (name, fitted transformer, columns).
minmax_features = minmax_entry[2]
standard_features = standard_entry[2]
onehot_encoder, onehot_columns = onehot_entry[1], onehot_entry[2]
onehot_features = list(onehot_encoder.get_feature_names_out(onehot_columns))

all_feature_names = [*minmax_features, *standard_features, *onehot_features]

# Pair every feature name with the forest's importance score and rank them.
importance_df = pd.DataFrame({
    'Feature': all_feature_names,
    'Importance': rf.feature_importances_,
})
importance_df = importance_df.sort_values('Importance', ascending=False)

print("Top 10 Feature Importances:")
display(importance_df.head(10))

## Test Set Evaluation

In [None]:
# Hard class predictions, plus the probability from column 1 of
# predict_proba, which is what the ROC-AUC computation consumes.
y_pred = rf_baseline.predict(X_test)
y_prob = rf_baseline.predict_proba(X_test)[:, 1]

# Metrics computed from the predicted labels, in report order.
label_metrics = [
    ('Accuracy', accuracy_score),
    ('Precision', precision_score),
    ('Recall', recall_score),
    ('F1', f1_score),
]
for label, metric_fn in label_metrics:
    print(f"Test {label}: {metric_fn(y_test, y_pred):.4f}")

# ROC-AUC is scored on the probabilities rather than the labels.
print(f"Test ROC-AUC: {roc_auc_score(y_test, y_prob):.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

## Save Trained Model

In [None]:
# Persist the fitted pipeline. Writing inside a context manager guarantees
# the file is flushed and closed before the success message is printed,
# rather than relying on garbage collection of an anonymous file handle.
with open(f'{models_path}rf_best_trained_model.pkl', 'wb') as model_file:
    pickle.dump(rf_baseline, model_file)
print("Saved trained model.")