# 3. Preprocessing & Feature Engineering

This notebook handles data cleaning, preprocessing, and feature engineering for the OCD patient dataset. We'll handle missing values, encode categorical variables, convert date columns to numeric format, and create new features that may be useful for modeling.

In [None]:
# ---------- imports ----------
import random
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

# ---------- reproducibility ----------
# One shared seed for both the stdlib RNG and NumPy's legacy global RNG.
RANDOM_STATE = 42
random.seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)

# ---------- path setup ----------
# Paths are relative to the notebook's working directory.
processed_dir = Path('../data/processed')
raw_path = Path('../data/raw/ocd_patient_data.csv')
processed_dir.mkdir(exist_ok=True)

# ---------- load ----------
df = pd.read_csv(raw_path)

# ---------- normalize column names ----------
# Trim and lowercase first, then collapse whitespace runs and '-' / '/' runs
# into a single underscore each (in that order). Parentheses are left as-is.
normalized_names = df.columns.str.strip().str.lower()
for separator_pattern in (r"\s+", r"[-/]+"):
    normalized_names = normalized_names.str.replace(separator_pattern, "_", regex=True)
df.columns = normalized_names

print("Dataset loaded with shape:", df.shape)
df.head()

In [None]:
# ---------- handle missing values ----------
# Show per-column null counts first so the effect of the imputation is visible.
print("Missing values before preprocessing:")
print(df.isnull().sum())

# Split columns by dtype. `cat_cols` is reused by the label-encoding cell below.
# NOTE(review): any date column still stored as strings (e.g. 'ocd_diagnosis_date',
# which is only parsed with pd.to_datetime in a later cell) presumably lands in
# `cat_cols` and is therefore mode-filled here - confirm this is intended.
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = df.select_dtypes(include=['object']).columns.tolist()

# Numeric columns -> median.
# The result is assigned back rather than using `df[col].fillna(..., inplace=True)`:
# that form mutates an intermediate Series (chained assignment), which pandas warns
# about and which leaves `df` unchanged once Copy-on-Write is active.
for col in num_cols:
    df[col] = df[col].fillna(df[col].median())

# Categorical columns -> mode (falls back to 'Unknown' when the column has no mode,
# i.e. every value is missing). The mode is computed once per column.
for col in cat_cols:
    col_modes = df[col].mode()
    fill_value = col_modes[0] if not col_modes.empty else 'Unknown'
    df[col] = df[col].fillna(fill_value)

print("\nMissing values after preprocessing:")
print(df.isnull().sum().sum(), "total missing values")

In [None]:
# ---------- encode categorical variables ----------
# Work on a copy so the original frame keeps its readable string values.
df_encoded = df.copy()

# Fit one LabelEncoder per categorical column and keep it, keyed by column name,
# so the integer codes can be mapped back later. 'patient_id' is an identifier,
# not a feature, so it is skipped.
label_encoders = {}
columns_to_encode = [name for name in cat_cols if name != 'patient_id']
for column in columns_to_encode:
    encoder = LabelEncoder()
    values_as_text = df_encoded[column].astype(str)
    df_encoded[column] = encoder.fit_transform(values_as_text)
    label_encoders[column] = encoder

print("Categorical variables encoded. Shape:", df_encoded.shape)
df_encoded.head()

In [None]:
# ---------- convert date columns to numeric ----------
# Parse the diagnosis date; values that cannot be parsed become NaT (errors='coerce').
df['ocd_diagnosis_date'] = pd.to_datetime(df['ocd_diagnosis_date'], errors='coerce')
diagnosis_date = df['ocd_diagnosis_date']

# Extract year, month, and day as separate features
df['diagnosis_year'] = diagnosis_date.dt.year
df['diagnosis_month'] = diagnosis_date.dt.month
df['diagnosis_day'] = diagnosis_date.dt.day

# Diagnosis date expressed as whole days elapsed since the earliest date in the column
df['days_since_first_diagnosis'] = (diagnosis_date - diagnosis_date.min()).dt.days

# Rows whose date failed to parse are NaN in the derived columns; fill with the
# column median. The result is assigned back rather than using
# `df[col].fillna(..., inplace=True)`: that form mutates an intermediate Series
# (chained assignment), which pandas warns about and which leaves `df` unchanged
# once Copy-on-Write is active.
date_cols = ['diagnosis_year', 'diagnosis_month', 'diagnosis_day', 'days_since_first_diagnosis']
for col in date_cols:
    df[col] = df[col].fillna(df[col].median())

print("Date features extracted:")
print(df[date_cols].head())

In [None]:
# ---------- create new features ----------
# Total Y-BOCS score: element-wise sum of the obsession and compulsion sub-scores
obsession_scores = df['y_bocs_score_(obsessions)']
compulsion_scores = df['y_bocs_score_(compulsions)']
df['total_y_bocs_score'] = obsession_scores.add(compulsion_scores)

# Severity category based on total Y-BOCS score
def categorize_severity(score):
    """Map a total Y-BOCS score to a severity label.

    Parameters
    ----------
    score : int, float or None
        Total score (obsessions + compulsions). May be missing (None / NaN).

    Returns
    -------
    str or None
        'Mild' if score <= 16, 'Moderate' if 16 < score <= 24,
        'Severe' if 24 < score <= 32, 'Extreme' if score > 32.
        None when the score is missing, so a missing value is never
        reported as a severity level.
    """
    # NaN fails every `<=` comparison and would otherwise fall through to
    # 'Extreme'; None would raise TypeError. Treat both as "no category".
    if pd.isna(score):
        return None
    # NOTE(review): these cut-offs are kept exactly as originally written;
    # verify them against the clinical Y-BOCS severity bands the project intends to use.
    if score <= 16:
        return 'Mild'
    elif score <= 24:
        return 'Moderate'
    elif score <= 32:
        return 'Severe'
    else:
        return 'Extreme'

# Severity label for each patient, derived from the total score
df['severity_category'] = df['total_y_bocs_score'].apply(categorize_severity)

# Comorbidity indicator: 1 when depression or anxiety (or both) is recorded as 'Yes'
diagnosis_is_yes = df[['depression_diagnosis', 'anxiety_diagnosis']].eq('Yes')
df['has_comorbidity'] = diagnosis_is_yes.any(axis=1).astype(int)

# Age groups: right-closed intervals (0, 30], (30, 45], (45, 60], (60, 100]
age_bin_edges = [0, 30, 45, 60, 100]
age_bin_labels = ['Young Adult', 'Middle Adult', 'Older Adult', 'Senior']
df['age_group'] = pd.cut(df['age'], bins=age_bin_edges, labels=age_bin_labels)

# Summary of what this cell added
for summary_line in (
    "New features created:",
    "- Total Y-BOCS Score",
    "- Severity Category",
    "- Comorbidity Indicator",
    "- Age Group",
):
    print(summary_line)

# Value counts for the two new discrete features
for heading, feature in (
    ("\nSeverity Category Distribution:", 'severity_category'),
    ("\nComorbidity Distribution:", 'has_comorbidity'),
):
    print(heading)
    print(df[feature].value_counts())

In [None]:
# ---------- one-hot encode categorical variables ----------
# Columns expanded into indicator columns (each prefixed with its source column name)
categorical_for_ohe = [
    'gender',
    'ethnicity',
    'marital_status',
    'education_level',
    'obsession_type',
    'compulsion_type',
    'severity_category',
    'age_group',
]

# One-hot encode into a new frame; `df` itself is left untouched
df_ohe = pd.get_dummies(df, columns=categorical_for_ohe, prefix=categorical_for_ohe)

# Collect every resulting column whose name contains one of the source column names
ohe_related_columns = []
for column_name in df_ohe.columns:
    for source_name in categorical_for_ohe:
        if source_name in column_name:
            ohe_related_columns.append(column_name)
            break

print("One-hot encoding completed. New shape:", df_ohe.shape)
print("New columns:", ohe_related_columns)

In [None]:
# ---------- save processed dataset ----------
# Write the one-hot-encoded frame without the index column.
processed_path = processed_dir / 'ocd_patient_data_processed.csv'
df_ohe.to_csv(processed_path, index=False)
print(f"Processed dataset saved to: {processed_path}")

# Save label encoders for potential future use.
# `joblib` is imported in the notebook's import cell rather than here, so all
# dependencies are visible in one place and the cell order cannot hide one.
# The file is a pickle: only load it back from a trusted location.
encoders_path = processed_dir / 'label_encoders.pkl'
joblib.dump(label_encoders, encoders_path)
print(f"Label encoders saved to: {encoders_path}")

# Display final dataset info
print("\nFinal dataset shape:", df_ohe.shape)
print("\nColumn types:")
print(df_ohe.dtypes.value_counts())

## Summary

In this notebook, we've completed preprocessing and feature engineering:
1. Handled missing values by imputing the median for numeric columns and the mode for categorical columns
2. Encoded categorical variables using both label encoding (the fitted encoders are saved separately) and one-hot encoding (used for the saved dataset)
3. Converted date columns to numeric features
4. Created new features including:
   - Total Y-BOCS score
   - Severity category
   - Comorbidity indicator
   - Age groups
5. Saved the processed dataset for use in modeling

The next step is to build and evaluate machine learning models using this processed dataset.