# 2. Exploratory Data Analysis

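This notebook performs exploratory data analysis on the OCD patient dataset. We examine the distributions of key variables (age, gender, obsession and compulsion types, and Y-BOCS scores), analyze correlations between numeric variables, and summarize comorbidities and prescribed medications. Outliers and anomalies are not analyzed in a dedicated section; look for them in the distribution plots and summary statistics below.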

In [None]:
# ---------- imports ----------
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# reproducibility: seed NumPy's global RNG and the stdlib RNG with one value
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
random.seed(RANDOM_STATE)

# ---------- path setup ----------
# Relative path, so it is resolved against the kernel's working directory.
csv_path = Path('../data/raw/ocd_patient_data.csv')

# ---------- load ----------
# NOTE(review): with default settings pandas parses the literal strings
# 'None', 'NA', 'null', ... as missing values. If a categorical column uses
# 'None' as a real category it arrives here as NaN -- confirm against the raw CSV.
df = pd.read_csv(csv_path)

# ---------- normalize column names ----------
# trim -> lower-case -> whitespace runs to '_' -> runs of '-' or '/' to '_'.
# Other punctuation (e.g. parentheses) is left untouched; later cells rely on
# names such as 'y_bocs_score_(obsessions)'.
df.columns = (
    df.columns
      .str.strip()
      .str.lower()
      .str.replace(r"\s+", "_", regex=True)
      .str.replace(r"[-/]+", "_", regex=True)
)

# ---------- schema check ----------
# Columns that the analysis cells below index by name. Failing here gives one
# clear error next to the load instead of a KeyError several cells later.
REQUIRED_COLUMNS = [
    'age',
    'gender',
    'obsession_type',
    'compulsion_type',
    'y_bocs_score_(obsessions)',
    'y_bocs_score_(compulsions)',
    'depression_diagnosis',
    'anxiety_diagnosis',
    'medications',
]
absent_columns = [col for col in REQUIRED_COLUMNS if col not in df.columns]
if absent_columns:
    raise KeyError(
        f"{csv_path} is missing expected columns after normalization: {absent_columns}. "
        f"Available columns: {list(df.columns)}"
    )

print("Dataset loaded with shape:", df.shape)

# ---------- missing values ----------
# The value_counts() calls in later cells drop NaN by default, so report any
# columns with missing entries up front.
na_counts = df.isna().sum()
na_counts = na_counts[na_counts > 0]
if not na_counts.empty:
    print("\nColumns with missing values:")
    print(na_counts)

df.head()

In [None]:
# ---------- distribution analysis ----------
# Histogram of the 'age' column (20 bins) with a KDE overlay, followed by
# its summary statistics.
print("Distribution of Age:")
age_fig, age_ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=df, x='age', bins=20, kde=True, ax=age_ax)
age_ax.set(
    title='Age Distribution of OCD Patients',
    xlabel='Age',
    ylabel='Frequency',
)
plt.show()

age_summary = df['age'].describe()
print("\nAge Statistics:")
print(age_summary)

In [None]:
# ---------- gender distribution ----------
# Pie chart of the share of each value in the 'gender' column, then the raw counts.
print("Gender Distribution:")

# value_counts() excludes NaN by default, so the percentages below are
# relative to rows with a recorded gender.
gender_counts = df['gender'].value_counts()

gender_fig, gender_ax = plt.subplots(figsize=(8, 6))
gender_ax.pie(
    gender_counts,
    labels=gender_counts.index,
    autopct='%1.1f%%',  # one decimal place followed by a literal percent sign
    startangle=90,
)
gender_ax.set_title('Gender Distribution of OCD Patients')
plt.show()

print(gender_counts)

In [None]:
# ---------- obsession and compulsion types ----------
# Side-by-side bar charts of category frequencies, followed by the raw counts.
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

obsession_counts = df['obsession_type'].value_counts()
compulsion_counts = df['compulsion_type'].value_counts()

# (counts, panel title, x-axis label) for the left and right panel respectively
type_panels = [
    (obsession_counts, 'Distribution of Obsession Types', 'Obsession Type'),
    (compulsion_counts, 'Distribution of Compulsion Types', 'Compulsion Type'),
]
for panel_ax, (counts, panel_title, x_label) in zip(axes, type_panels):
    panel_ax.bar(counts.index, counts.values)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel(x_label)
    panel_ax.set_ylabel('Count')
    # tilt the category labels on the x axis
    panel_ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

print("Obsession Types:")
print(obsession_counts)
print("\nCompulsion Types:")
print(compulsion_counts)

In [None]:
# ---------- Y-BOCS scores analysis ----------
# Side-by-side 10-bin histograms of the two Y-BOCS score columns, followed by
# their summary statistics.
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# (column, fill colour, panel title, x-axis label) for each panel
score_panels = [
    ('y_bocs_score_(obsessions)', 'skyblue',
     'Distribution of Y-BOCS Obsession Scores', 'Y-BOCS Obsession Score'),
    ('y_bocs_score_(compulsions)', 'lightcoral',
     'Distribution of Y-BOCS Compulsion Scores', 'Y-BOCS Compulsion Score'),
]
for panel_ax, (column, fill, panel_title, x_label) in zip(axes, score_panels):
    panel_ax.hist(df[column], bins=10, color=fill, edgecolor='black')
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel(x_label)
    panel_ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

obsession_score_stats = df['y_bocs_score_(obsessions)'].describe()
compulsion_score_stats = df['y_bocs_score_(compulsions)'].describe()
print("Y-BOCS Obsession Scores Statistics:")
print(obsession_score_stats)
print("\nY-BOCS Compulsion Scores Statistics:")
print(compulsion_score_stats)

In [None]:
# ---------- correlation analysis ----------
# Pairwise correlation heatmap over every numeric column of df.
# NOTE(review): select_dtypes picks up *all* numeric columns, including any
# identifier-like ones -- confirm that is intended.
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()

if len(numeric_cols) <= 1:
    # a correlation matrix needs at least two variables
    print("Not enough numeric columns for correlation analysis.")
else:
    corr = df[numeric_cols].corr()
    corr_fig, corr_ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(
        corr,
        annot=True,
        fmt='.2f',
        cmap='coolwarm',
        center=0,  # anchor the diverging colormap at zero correlation
        ax=corr_ax,
    )
    corr_ax.set_title('Correlation Matrix of Numeric Variables')
    plt.show()

In [None]:
# ---------- comorbidities analysis ----------
# Side-by-side pie charts for the depression and anxiety diagnosis columns,
# followed by the raw counts.
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

depression_counts = df['depression_diagnosis'].value_counts()
anxiety_counts = df['anxiety_diagnosis'].value_counts()

# (counts, panel title) for the left and right panel respectively
diagnosis_panels = [
    (depression_counts, 'Depression Diagnosis Distribution'),
    (anxiety_counts, 'Anxiety Diagnosis Distribution'),
]
for panel_ax, (counts, panel_title) in zip(axes, diagnosis_panels):
    panel_ax.pie(
        counts,
        labels=counts.index,
        autopct='%1.1f%%',
        startangle=90,
    )
    panel_ax.set_title(panel_title)

plt.tight_layout()
plt.show()

print("Depression Diagnosis:")
print(depression_counts)
print("\nAnxiety Diagnosis:")
print(anxiety_counts)

In [None]:
# ---------- medications analysis ----------
# Bar chart of how often each value of the 'medications' column occurs,
# with the count written above each bar, followed by the raw counts.
print("Medications Distribution:")

# value_counts() drops NaN by default, which would silently remove every
# record without a medication entry from both the chart and the printed counts.
# Count those records under an explicit label instead.
# NOTE(review): pandas' read_csv treats the literal string 'None' as missing by
# default, so a "no medication" category may arrive as NaN -- confirm against
# the raw CSV.
MISSING_LABEL = 'Missing'
medication_counts = df['medications'].fillna(MISSING_LABEL).value_counts()

# Cycle the base palette over however many categories exist (the same colours
# as a fixed three-colour list when there are no missing entries); draw the
# missing-value bar in neutral gray so it is not mistaken for a drug class.
base_palette = ['#FF6B6B', '#4ECDC4', '#45B7D1']
bar_colors = [
    'lightgray' if label == MISSING_LABEL else base_palette[pos % len(base_palette)]
    for pos, label in enumerate(medication_counts.index)
]

fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.bar(medication_counts.index, medication_counts.values, color=bar_colors)
ax.set_title('Distribution of Medications Prescribed')
ax.set_xlabel('Medication Type')
ax.set_ylabel('Count')
ax.tick_params(axis='x', rotation=45)

# Add value labels on bars: centred horizontally, sitting on the bar top
for bar in bars:
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width() / 2., height,
            f'{int(height)}',
            ha='center', va='bottom')

fig.tight_layout()
plt.show()

print(medication_counts)

## Summary

In this notebook, we've performed comprehensive exploratory data analysis:
1. Examined the distribution of patient ages
2. Analyzed gender distribution
3. Explored obsession and compulsion types
4. Investigated Y-BOCS scores for both obsessions and compulsions
5. Performed correlation analysis between numeric variables
6. Analyzed comorbidities (depression and anxiety diagnoses)
7. Examined medications prescribed

The next step is to preprocess the data and engineer features for machine learning modeling.