In [None]:
# Data Preparation for PD vs Healthy Control EEG
#
# This notebook:
# 1. Loads raw EEG (.vhdr) files from a mixed PD/Control folder
# 2. Preprocesses EEG and extracts features
# 3. Cleans the data (missing values, duplicates, outliers)
# 4. Runs basic EDA
# 5. Transforms data (scaling) and performs feature selection

import os
import glob
import numpy as np
import pandas as pd
import mne
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold, SelectFromModel
from sklearn.ensemble import RandomForestClassifier

# Display plots inline
%matplotlib inline


In [None]:
# Configuration

# Path to your mixed PD/Control EEG folder (with .vhdr/.vmrk/.eeg per subject).
# Set the PD_EEG_DATA_DIR environment variable to point at your own copy
# without editing the notebook; the path below is only the fallback.
DATA_DIR = os.environ.get(
    "PD_EEG_DATA_DIR",
    r"C:\Users\Usha Sri\OneDrive\Documents\Parkinson_Project\JamesCavanagh\PD_Dataset_timing",
)

# EEG preprocessing params (used as defaults by preprocess_raw and
# make_fixed_length_epochs)
L_FREQ = 1.0       # high-pass
H_FREQ = 40.0      # low-pass
NOTCH_FREQ = 50.0  # or 60.0 depending on mains; None skips the notch filter
RESAMPLE = 128     # Hz, or None to keep original
EPOCH_LENGTH = 2.0  # seconds

# Outlier handling
Z_THRESHOLD = 4.0  # rows whose largest |z-score| exceeds this are dropped



In [None]:
# Helpers: labels and subject IDs

def get_label_from_filename(fname):
    """Map an EEG file name to its class label.

    Only the lower-cased base name is inspected: a name starting with "pd"
    yields 1 (PD); a name starting with "control" -- or the misspelling
    "contorl" -- yields 0 (Control).

    Raises:
        ValueError: if the base name starts with none of those prefixes.
    """
    stem = os.path.basename(fname).lower()
    # Checked in order; the first matching prefix decides the label.
    prefix_labels = (("pd", 1), ("control", 0), ("contorl", 0))
    for prefix, label in prefix_labels:
        if stem.startswith(prefix):
            return label
    raise ValueError(f"Cannot determine label from filename: {fname}")


def get_subject_id_from_filename(fname):
    """Return the file's base name with its last extension removed.

    Example: "PD1325.vhdr" -> "PD1325".
    """
    stem, _extension = os.path.splitext(os.path.basename(fname))
    return stem



In [None]:
# EEG preprocessing and feature extraction

def preprocess_raw(vhdr_path,
                   l_freq=L_FREQ,
                   h_freq=H_FREQ,
                   notch_freq=NOTCH_FREQ,
                   resample=RESAMPLE):
    """Load one BrainVision recording and apply the preprocessing chain.

    Steps, in this order:
      1. read the file with ``preload=True``;
      2. set an average EEG reference (``projection=False``);
      3. band-pass filter between ``l_freq`` and ``h_freq``;
      4. notch-filter at ``notch_freq`` unless it is ``None``;
      5. resample to ``resample`` unless it is ``None`` or already equals the
         recording's ``info["sfreq"]``.

    The return values of the MNE calls are not used, so this code relies on
    them modifying the loaded object in place.

    Returns:
        The object returned by ``mne.io.read_raw_brainvision`` after the
        steps above.
    """
    print(f"Loading {vhdr_path}")
    recording = mne.io.read_raw_brainvision(vhdr_path, preload=True)

    # Re-reference, then band-pass.
    recording.set_eeg_reference("average", projection=False)
    recording.filter(l_freq=l_freq, h_freq=h_freq)

    # Optional mains-noise notch.
    wants_notch = notch_freq is not None
    if wants_notch:
        recording.notch_filter(freqs=[notch_freq])

    # Optional resampling; skipped when the rate already matches.
    needs_resample = resample is not None and recording.info["sfreq"] != resample
    if needs_resample:
        recording.resample(resample)

    return recording


def make_fixed_length_epochs(raw, epoch_length=EPOCH_LENGTH):
    """Cut ``raw`` into epochs of ``epoch_length`` duration.

    Thin wrapper around ``mne.make_fixed_length_epochs`` that always passes
    ``preload=True`` and returns that call's result.
    """
    return mne.make_fixed_length_epochs(raw, duration=epoch_length, preload=True)


def extract_features_from_epochs(epochs):
    """Build one feature row per epoch from ``epochs.get_data()``.

    The data array is unpacked as (n_epochs, n_channels, n_times). Each row
    of the result is, in this order:
      * the epoch's samples flattened to n_channels * n_times values,
      * the per-channel mean over the time axis (n_channels values),
      * the per-channel standard deviation over the time axis (n_channels
        values, numpy's default ``ddof=0``).

    Returns:
        Array of shape (n_epochs, n_channels * n_times + 2 * n_channels).
    """
    signal = epochs.get_data()
    epoch_count, channel_count, sample_count = signal.shape

    flattened = signal.reshape(epoch_count, channel_count * sample_count)
    channel_means = signal.mean(axis=2)
    channel_stds = signal.std(axis=2)

    # Column order matters downstream: raw samples, then means, then stds.
    return np.concatenate((flattened, channel_means, channel_stds), axis=1)


def process_subject(vhdr_path):
    """Turn one .vhdr file into a per-epoch feature DataFrame.

    The label and subject id are derived from the file name first, so a file
    with an unrecognised name raises ``ValueError`` before any EEG data is
    loaded. The recording is then preprocessed, epoched and converted to
    features.

    Returns:
        DataFrame with one row per epoch: columns ``feat_0`` ... ``feat_{k-1}``
        followed by ``label``, ``subject_id`` and ``epoch_idx`` (0-based
        position of the epoch within this file).
    """
    label = get_label_from_filename(vhdr_path)
    subject_id = get_subject_id_from_filename(vhdr_path)

    epochs = make_fixed_length_epochs(preprocess_raw(vhdr_path))
    feature_matrix = extract_features_from_epochs(epochs)
    row_count, column_count = feature_matrix.shape

    frame = pd.DataFrame(
        feature_matrix,
        columns=[f"feat_{idx}" for idx in range(column_count)],
    )
    # Metadata columns are appended after the features, in this order.
    return frame.assign(
        label=label,
        subject_id=subject_id,
        epoch_idx=np.arange(row_count),
    )


def build_raw_feature_dataset(data_dir=DATA_DIR):
    """Build the raw feature DataFrame from every .vhdr file in a folder.

    Files are processed in sorted path order with ``process_subject``. A file
    whose processing raises ``ValueError`` (for example an unrecognised file
    name) is reported and skipped; any other exception propagates.

    Args:
        data_dir: folder searched (non-recursively) for ``*.vhdr`` files.

    Returns:
        The per-file DataFrames concatenated row-wise with a fresh index.

    Raises:
        FileNotFoundError: if the folder contains no ``*.vhdr`` file.
        ValueError: if every file was skipped, so there is nothing to
            concatenate.
    """
    vhdr_files = sorted(glob.glob(os.path.join(data_dir, "*.vhdr")))
    if not vhdr_files:
        raise FileNotFoundError(f"No .vhdr files found in {data_dir}")

    all_dfs = []
    for f in vhdr_files:
        try:
            all_dfs.append(process_subject(f))
        except ValueError as e:
            print(f"Skipping file: {e}")

    # pd.concat([]) would fail with an opaque "No objects to concatenate";
    # raise the same exception type with a message that names the cause.
    if not all_dfs:
        raise ValueError(
            f"All {len(vhdr_files)} .vhdr files in {data_dir} were skipped; "
            "no features could be extracted"
        )

    full_df = pd.concat(all_dfs, axis=0, ignore_index=True)
    print("Raw feature DataFrame shape:", full_df.shape)
    return full_df



In [None]:
# Data cleaning: duplicates, missing values, outliers

def clean_data(df, z_threshold=None):
    """Remove duplicates, handle missing values and drop outlier rows.

    Steps:
      1. Drop fully duplicated rows. All columns are compared, including the
         ``label`` / ``subject_id`` / ``epoch_idx`` metadata.
      2. Drop feature columns with more than 40% missing values, then impute
         the remaining gaps with the column median.
      3. Z-score every feature (population std; zero std is replaced by 1 to
         avoid division by zero) and drop each row whose largest absolute
         z-score exceeds the threshold.

    The input frame is not modified.

    Args:
        df: frame with feature columns plus ``label``, ``subject_id`` and
            ``epoch_idx``.
        z_threshold: outlier cut-off for step 3. ``None`` (the default) uses
            the module-level ``Z_THRESHOLD``.

    Returns:
        ``(df_clean, feature_cols)``: the cleaned frame with a fresh index and
        the list of feature column names that were kept.
    """
    print("\n--- Data Cleaning ---")

    if z_threshold is None:
        z_threshold = Z_THRESHOLD

    meta_cols = ["label", "subject_id", "epoch_idx"]
    feature_cols = [c for c in df.columns if c not in meta_cols]

    # 1) Remove duplicates
    before = len(df)
    df = df.drop_duplicates()
    after = len(df)
    print(f"Removed duplicates: {before - after}")

    # 2) Handle missing values
    #    a) Drop features with > 40% missing
    missing_ratio = df[feature_cols].isna().mean()
    keep_features = missing_ratio[missing_ratio <= 0.4].index.tolist()
    # Set membership keeps this linear; the feature count can be very large.
    kept_lookup = set(keep_features)
    dropped_features = [f for f in feature_cols if f not in kept_lookup]
    if dropped_features:
        print(f"Dropping {len(dropped_features)} features with >40% missing values")
    # Explicit copy: assigning into a column subset of another frame can
    # otherwise trigger pandas' SettingWithCopyWarning.
    df = df[keep_features + meta_cols].copy()

    feature_cols = keep_features

    #    b) Impute remaining missing values with median
    imputer = SimpleImputer(strategy="median")
    df[feature_cols] = imputer.fit_transform(df[feature_cols])

    # 3) Outlier handling via z-score
    feature_data = df[feature_cols].values.astype(float)
    mean = feature_data.mean(axis=0)
    std = feature_data.std(axis=0)
    std[std == 0] = 1.0  # constant columns: avoid dividing by zero

    z_scores = (feature_data - mean) / std
    max_abs_z = np.max(np.abs(z_scores), axis=1)
    mask = max_abs_z <= z_threshold

    removed_outliers = np.sum(~mask)
    print(f"Removed outlier rows: {removed_outliers}")

    df_clean = df.loc[mask].reset_index(drop=True)

    print("Cleaned DataFrame shape:", df_clean.shape)
    return df_clean, feature_cols



In [None]:
# Exploratory Data Analysis (EDA)

def run_basic_eda(df, feature_cols, save_plots=False, out_dir="eda_plots"):
    """Print the class balance and draw a few exploratory plots.

    Produces, in order: the ``label`` value counts, one histogram per class
    for each of the first five entries of ``feature_cols``, and a correlation
    heatmap over the first thirty entries of ``feature_cols``.

    Args:
        df: frame containing the feature columns and a ``label`` column.
        feature_cols: feature column names; only the leading entries are
            plotted.
        save_plots: when true, each figure is also written as a PNG into
            ``out_dir``.
        out_dir: target folder for the PNGs; created if it does not exist
            (only when ``save_plots`` is true).

    Returns:
        None. The function only prints and plots.
    """
    print("\n--- Exploratory Data Analysis (EDA) ---")

    # Create the output folder only when plots are actually going to be saved.
    if save_plots and not os.path.exists(out_dir):
        os.makedirs(out_dir, exist_ok=True)

    # Class balance
    print("Class counts (0=Control, 1=PD):")
    print(df["label"].value_counts())

    # Histograms for a few features
    sample_features = feature_cols[:5]
    for col in sample_features:
        plt.figure(figsize=(6, 4))
        # common_norm=False: each class's density is normalised on its own,
        # so unequal class sizes do not distort the comparison.
        sns.histplot(data=df, x=col, hue="label", kde=True,
                     stat="density", common_norm=False)
        plt.title(f"Distribution of {col} by class")
        # The figure is saved before plt.show() is called.
        if save_plots:
            plt.savefig(os.path.join(out_dir, f"hist_{col}.png"), dpi=150)
        plt.show()

    # Correlation heatmap for a subset of features
    corr_features = feature_cols[:30]
    corr = df[corr_features].corr()
    plt.figure(figsize=(10, 8))
    sns.heatmap(corr, cmap="coolwarm", center=0)
    plt.title("Correlation heatmap (subset of features)")
    if save_plots:
        plt.savefig(os.path.join(out_dir, "corr_heatmap.png"), dpi=150)
    plt.show()

    print("EDA plots generated (and saved if save_plots=True).")



In [None]:
# Data transformation and feature selection

def transform_and_select_features(df, feature_cols):
    """Standardize the features, drop constant ones and keep the informative half.

    Pipeline, fitted on all rows of ``df``:
      1. ``StandardScaler`` on ``df[feature_cols]``;
      2. ``VarianceThreshold(threshold=1e-5)`` on the scaled data;
      3. a 200-tree ``RandomForestClassifier`` (``random_state=42``) fitted on
         the result, wrapped in ``SelectFromModel`` with ``threshold="median"``.

    NOTE(review): because step 2 runs after standardization, every
    non-constant column already has unit variance, so in practice it only
    removes constant columns. The order is kept because the returned fitted
    objects are meant to be applied in this same sequence.

    Returns:
        ``(X_final, y, selected_features, scaler, vt, selector)``: the selected
        feature matrix, the ``label`` values, the names of the selected
        features, and the three fitted transformers.
    """
    print("\n--- Data Transformation & Feature Selection ---")

    feature_matrix = df[feature_cols].values
    y = df["label"].values

    # 1) Standardize
    scaler = StandardScaler()
    standardized = scaler.fit_transform(feature_matrix)

    # 2) Variance filter; track which column names survive it
    vt = VarianceThreshold(threshold=1e-5)
    variance_filtered = vt.fit_transform(standardized)
    names_after_variance = [
        name for name, kept in zip(feature_cols, vt.get_support()) if kept
    ]
    print(f"Features after VarianceThreshold: {len(names_after_variance)}")

    # 3) Keep features whose forest importance reaches the median importance
    forest = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
    forest.fit(variance_filtered, y)
    selector = SelectFromModel(forest, prefit=True, threshold="median")
    X_final = selector.transform(variance_filtered)
    selected_features = [
        name
        for name, kept in zip(names_after_variance, selector.get_support())
        if kept
    ]

    print(f"Features after model-based selection: {len(selected_features)}")

    return X_final, y, selected_features, scaler, vt, selector



In [None]:
# Run the full data preparation pipeline
#
# Executes the stages defined in the cells above, in order, and leaves
# df_raw, df_clean, feature_cols, X_final, y, selected_features and the
# fitted scaler / vt / selector objects in the notebook namespace.

# 1) Build raw feature dataset from EEG
df_raw = build_raw_feature_dataset(DATA_DIR)

# 2) Data cleaning
#    (feature_cols is the list of feature columns that survived cleaning)
df_clean, feature_cols = clean_data(df_raw)

# 3) EDA (set save_plots=True to also save PNGs to disk)
run_basic_eda(df_clean, feature_cols, save_plots=False)

# 4) Transformation + feature selection
#    NOTE(review): scaler, vt and selector are fitted on every row of
#    df_clean, i.e. before any train/test split -- confirm this is intended
#    for the downstream evaluation.
X_final, y, selected_features, scaler, vt, selector = transform_and_select_features(
    df_clean, feature_cols
)

print("\nFinal data ready for modeling:")
print("X_final shape:", X_final.shape)
print("y shape:", y.shape)

# Optionally save cleaned dataset and selected feature list
# (both files are written to the current working directory).
# NOTE(review): Series.to_csv writes a header line by default, so
# selected_features.txt starts with a "0" line before the feature names;
# skip that line when reading the file back.
df_clean.to_csv("cleaned_eeg_features.csv", index=False)
pd.Series(selected_features).to_csv("selected_features.txt", index=False)
print("Saved cleaned data and selected feature list.")



### Next step: Model training

After running this notebook, you can use `X_final` and `y` to train your
classification model (e.g., RandomForest, SVM, deep learning). You can either:

- Add new cells **below** to train a classifier directly in this notebook, or
- Save `X_final` and `y` to disk and load them in a separate modeling notebook.
