# 1. Data Overview

This notebook provides an initial exploration of the OCD Patient Demographics & Clinical Data dataset. We'll load the raw data, normalize the column names, inspect data types and missing values, apply basic cleaning (numeric coercion of text columns, simple imputation, duplicate removal), and generate a statistical summary.

In [None]:
# ---------- imports ----------
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# reproducibility: seed the stdlib and NumPy global generators with one constant
RANDOM_STATE = 42
random.seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)

# ---------- path setup ----------
# Relative location: it is resolved against the kernel's working directory,
# so the notebook has to be started from the folder it lives in.
csv_path = Path('../data/raw/ocd_patient_data.csv')

# ---------- load ----------
# NOTE(review): read_csv applies pandas' default NA markers (empty fields,
# 'NA', 'N/A', 'null', and in recent pandas versions 'None'); confirm that
# none of those strings is a legitimate category in this dataset.
df = pd.read_csv(filepath_or_buffer=csv_path)
print("Using file:", csv_path)
print("Shape:", df.shape)
df.head(n=5)

In [None]:
# ---------- normalize column names & quick schema ----------
# Canonical form: trimmed and lower-cased; then every run of whitespace, and
# after that every run of '-' or '/', is replaced by a single underscore.
# Any other punctuation in a header is left untouched.
orig_cols = df.columns.tolist()
normalized = df.columns.str.strip().str.lower()
normalized = normalized.str.replace(r"\s+", "_", regex=True)
normalized = normalized.str.replace(r"[-/]+", "_", regex=True)
df.columns = normalized

# Report only the headers that actually changed.
print("Renamed columns (if changed):")
renamed_pairs = [
    (old, new)
    for old, new in zip(orig_cols, df.columns.tolist())
    if old != new
]
for old, new in renamed_pairs:
    print(f'  "{old}" -> "{new}"')

# Schema snapshot: dtype per column, then the 20 columns with the most nulls.
print("\nDtypes:")
print(df.dtypes)
print("\nMissing values (top 20):")
missing_per_column = df.isnull().sum()
print(missing_per_column.sort_values(ascending=False).head(20))
df.head()

In [None]:
# ---------- coerce numeric-like object columns ----------
# An object column is converted to a numeric dtype when at least 80% of its
# first 200 non-null values look like plain numbers once ',' and '%' are
# removed. Columns with fewer than 5 non-null values are skipped.
#
# Values that do NOT parse become NaN (errors='coerce'). Because later
# imputation would hide that loss, every such value is counted and reported
# below instead of disappearing silently.
def _looks_numeric(text):
    """Return True if `text` is an optionally negative integer or decimal
    after removing ',' and '%' (same heuristic the cell has always used)."""
    cleaned = text.replace(',', '').replace('%', '').replace('.', '', 1).lstrip('-')
    return cleaned.isdigit()


obj_cols = df.select_dtypes(include=['object']).columns.tolist()
coerced = []
coercion_losses = {}  # column -> (count, examples) of non-null values that failed to parse
for col in obj_cols:
    sample = df[col].dropna().astype(str).head(200)
    if len(sample) < 5:
        continue
    # heuristic: count values that look numeric after removing common formatting
    numeric_like = sample.map(_looks_numeric).sum()
    if numeric_like >= 0.8 * len(sample):
        was_present = df[col].notna()
        # regex=False: ',' and '%' are literals; being explicit keeps the
        # behaviour identical across pandas versions with different defaults.
        stripped = (
            df[col].astype(str)
              .str.replace(',', '', regex=False)
              .str.replace('%', '', regex=False)
        )
        converted = pd.to_numeric(stripped, errors='coerce')
        # Entries that held a value before but are NaN after conversion.
        lost_mask = was_present & converted.isna()
        if lost_mask.any():
            examples = df.loc[lost_mask, col].astype(str).unique()[:5].tolist()
            coercion_losses[col] = (int(lost_mask.sum()), examples)
        df[col] = converted
        coerced.append(col)

print("Coerced to numeric:", coerced)
for col, (n_lost, examples) in coercion_losses.items():
    print(f'  WARNING: "{col}": {n_lost} non-numeric value(s) became NaN, e.g. {examples}')
# refresh types
df.dtypes

In [None]:
# ---------- missing value imputation ----------
# Numeric columns are filled with their median and object/category columns
# with the label 'Unknown'. Identifier columns are excluded from the numeric
# step: a median-imputed identifier would be a fabricated (and duplicated) key.
ID_COLS = ('id', 'patient_id')

num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = df.select_dtypes(include=['object','category']).columns.tolist()

# numeric -> median (identifiers left as they are)
impute_num_cols = [c for c in num_cols if c not in ID_COLS]
if impute_num_cols:
    medians = df[impute_num_cols].median()
    df[impute_num_cols] = df[impute_num_cols].fillna(medians)

# categorical -> 'Unknown'
for c in cat_cols:
    column = df[c]
    if not column.isna().any():
        continue
    # A categorical column rejects fill values that are not among its
    # categories, so register 'Unknown' first.
    if isinstance(column.dtype, pd.CategoricalDtype) and 'Unknown' not in column.cat.categories:
        column = column.cat.add_categories('Unknown')
    df[c] = column.fillna('Unknown')

print("Total missing values after basic imputation:", int(df.isnull().sum().sum()))

# Make any identifier gaps visible rather than papering over them.
missing_ids = {
    c: int(df[c].isna().sum())
    for c in ID_COLS
    if c in df.columns and df[c].isna().any()
}
if missing_ids:
    print("Identifier columns left un-imputed (missing counts):", missing_ids)

In [None]:
# ---------- patient_id & duplicates ----------
# Use 'patient_id' as the identifier name when the data only has 'id'.
if 'id' in df.columns and 'patient_id' not in df.columns:
    df.rename(columns={'id': 'patient_id'}, inplace=True)

# Drop exact duplicate rows BEFORE generating a synthetic identifier.
# A freshly numbered patient_id (1..n) is unique per row, so adding it first
# would make every row distinct and de-duplication could never remove anything.
before = len(df)
df.drop_duplicates(inplace=True)
print(f"Removed {before - len(df)} duplicate rows; remaining rows: {len(df)}")

if 'patient_id' not in df.columns:
    # Values are assigned positionally, so gaps left in the index by the
    # duplicate removal do not matter.
    df.insert(0, 'patient_id', range(1, len(df) + 1))
    print("Added patient_id column (1..n)")
else:
    # The identifier came from the data: report repeats instead of assuming
    # uniqueness (rows may differ in other columns yet share an id).
    repeated_ids = int(df['patient_id'].duplicated().sum())
    if repeated_ids:
        print(f"WARNING: {repeated_ids} row(s) repeat a patient_id seen in an earlier row")

In [None]:
# ---------- basic statistics ----------
# Summary of `df` in its current state, i.e. after whichever cleaning cells
# ran before this one. With default arguments describe() reports only the
# numeric columns of a mixed-dtype frame.
print("Basic statistical summary:")
summary_stats = df.describe()
summary_stats

In [None]:
# ---------- data info ----------
# Structural overview of the current `df`: index, per-column non-null count
# and dtype, and memory usage. df.info() writes directly to stdout and
# returns None, so this cell produces printed text only.
print("Dataset Info:")
df.info()

## Summary

In this notebook, we've:
1. Loaded the raw OCD patient dataset
2. Normalized column names for consistency
3. Checked data types and converted numeric-looking text columns to numeric
4. Imputed missing values (median for numeric columns, `'Unknown'` for categorical columns)
5. Removed exact duplicate rows and added a sequential `patient_id` when the dataset did not provide one
6. Generated basic statistical summaries

The next step is to perform exploratory data analysis to understand patterns and relationships in the data.