# 4. Modeling

This notebook trains a machine learning model to predict treatment outcomes for OCD patients, using the `medications` column as the target. We perform a stratified train/test split, scale the features, fit a Random Forest classifier, evaluate its performance, and analyze the confusion matrix and feature importances.

In [None]:
# ---------- imports ----------
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler

import joblib

# reproducibility: a single seed shared by the stdlib and NumPy global RNGs
# (it is also passed explicitly to scikit-learn further down)
RANDOM_STATE = 42
random.seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)

# ---------- path setup ----------
# Output directory for fitted artifacts; created on first run, reused afterwards
models_dir = Path('../models')
models_dir.mkdir(exist_ok=True)

# Input: the processed dataset, read relative to the notebook's location
processed_path = Path('../data/processed/ocd_patient_data_processed.csv')

# ---------- load processed data ----------
df = pd.read_csv(processed_path)
print("Loaded processed dataset with shape:", df.shape)

# Bare last expression so the first rows render as a rich table
df.head()

In [None]:
# ---------- prepare features and target ----------
# Column the classifier learns to predict
TARGET = "medications"

# Fail fast, listing the available columns, when the target is absent
if TARGET not in df.columns:
    raise ValueError(f"Target column '{TARGET}' not found in dataset. Available: {df.columns.tolist()}")

# Target vector, then the feature matrix: every column except the target
# and the patient identifier
y = df[TARGET]
X = df.drop(columns=[TARGET, "patient_id"])

print("Features shape:", X.shape)
print("Target distribution:")
print(y.value_counts(normalize=True))

# Fill any remaining gaps with the per-column median of the feature matrix
column_medians = X.median()
X = X.fillna(column_medians)

In [None]:
# ---------- train/test split ----------
# Split the *un-imputed* feature columns taken straight from `df`, so the
# imputation statistics can be learned from the training rows only. Filling
# gaps with medians computed over the full dataset before splitting lets
# test rows influence the training features (data leakage) and makes the
# held-out evaluation optimistic.
X_unimputed = df[X.columns]

# Stratify on the target so both splits keep the same class proportions
X_train, X_test, y_train, y_test = train_test_split(
    X_unimputed, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
)

# Learn the fill values on the training split, then apply them to both splits
train_medians = X_train.median(numeric_only=True)
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)

print("Training set shape:", X_train.shape)
print("Test set shape:", X_test.shape)
print("\nTraining target distribution:")
print(y_train.value_counts(normalize=True))
print("\nTest target distribution:")
print(y_test.value_counts(normalize=True))

In [None]:
# ---------- feature scaling ----------
# Fit the scaler on the training split only, then apply the same fitted
# transform to both splits so the test set never contributes statistics.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Feature scaling completed.")

In [None]:
# ---------- random forest classifier implementation ----------
# Hyperparameters gathered in one place; class_weight='balanced' reweights
# classes inversely to their frequency in the training labels.
rf_params = dict(
    n_estimators=100,
    random_state=RANDOM_STATE,
    class_weight='balanced',
)

# Fit on the scaled training split (fit() returns the estimator itself)
rf_model = RandomForestClassifier(**rf_params).fit(X_train_scaled, y_train)

# Predicted labels for the held-out test split
y_pred = rf_model.predict(X_test_scaled)

print("Random Forest model trained successfully.")

In [None]:
# ---------- model evaluation ----------
# Overall fraction of test rows whose predicted label matches the true label
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.4f}")

# Per-class precision / recall / F1 / support as a text table
print("\nClassification Report:")
report_text = classification_report(y_test, y_pred)
print(report_text)

In [None]:
# ---------- confusion matrix analysis ----------
# Pin the row/column order of the matrix to the classifier's own class order.
# Without `labels=`, confusion_matrix uses the sorted labels that appear in
# y_test or y_pred, which can be a subset of rf_model.classes_ and would then
# no longer line up with the tick labels drawn below.
class_labels = rf_model.classes_
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

# Rows are the actual classes, columns the predicted classes
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()

In [None]:
# ---------- feature importance ----------
# Pair each feature name with the forest's importance score, highest first.
# Scaling kept the column order, so X.columns lines up with the scores.
feature_importance = (
    pd.DataFrame({
        'feature': X.columns,
        'importance': rf_model.feature_importances_,
    })
    .sort_values('importance', ascending=False)
)

print("Top 10 Most Important Features:")
print(feature_importance.head(10))

# Plot feature importance: horizontal bars for the 15 highest-ranked features
top_features = feature_importance.head(15)
bar_positions = range(len(top_features))

fig, ax = plt.subplots(figsize=(10, 8))
ax.barh(bar_positions, top_features['importance'])
ax.set_yticks(bar_positions)
ax.set_yticklabels(top_features['feature'])
ax.set_xlabel('Feature Importance')
ax.set_title('Top 15 Feature Importances')
ax.invert_yaxis()  # most important feature at the top
fig.tight_layout()
plt.show()

In [None]:
# ---------- save the model ----------
# The fitted scaler is persisted alongside the classifier because the model
# was trained on scaled features and needs the same transform at inference.
scaler_path = models_dir / 'feature_scaler.pkl'
model_path = models_dir / 'ocd_medication_predictor.pkl'

joblib.dump(value=rf_model, filename=model_path)
joblib.dump(value=scaler, filename=scaler_path)

print(f"Model saved to: {model_path}")
print(f"Scaler saved to: {scaler_path}")

## Summary

In this notebook, we've implemented and evaluated a machine learning model for predicting OCD treatment outcomes:
1. Prepared features and target variables from the processed dataset
2. Split the data into training and test sets
3. Standardized the features (zero mean, unit variance) with a scaler fit on the training set only
4. Implemented a Random Forest classifier
5. Evaluated model performance using accuracy, classification report, and confusion matrix
6. Analyzed feature importance to understand which variables are most predictive
7. Saved the trained model and scaler for future use

The next step is to compile a comprehensive report with our findings and visualizations.