In [17]:
import numpy as np
import pandas as pd

In [18]:
# --- File locations ---
INPUT_CSV = "./adhdata.csv"
OUTPUT_CSV = "processed_adhdata.csv"

# --- Sampling and windowing ---
# Sampling rate handed to every FFT / ICA step, in Hz.
SAMPLE_RATE = 128
# Duration of one analysis window, in seconds.
WINDOW_SECONDS = 4
# Fraction of a window shared with the next one (0.5 means 50% overlap).
WINDOW_OVERLAP = 0.5
# Number of samples that make up one window.
SAMPLES_PER_WINDOW = int(WINDOW_SECONDS * SAMPLE_RATE)
# Distance between consecutive window starts, in samples.
STEP = int((1 - WINDOW_OVERLAP) * SAMPLES_PER_WINDOW)

# --- Spectral grid ---
# Target frequency bins: 2.0 Hz up to and including 40.0 Hz, every 0.5 Hz.
FREQUENCIES = np.arange(start=2.0, stop=40.5, step=0.5)

In [19]:
# Load the raw recordings and report what was read.
df = pd.read_csv(INPUT_CSV)
print("Dataset shape:", df.shape)
print("Columns:", list(df.columns))

# Distinct values of the ID column, in order of first appearance.
indices = pd.unique(df['ID'])
print(f"Unique IDs ({len(indices)}): {indices}")

# Accumulators filled by the windowing cells further down.
results, window_count = [], 0

Dataset shape: (2166383, 21)
Columns: ['Fp1', 'Fp2', 'F3', 'F4', 'C3', 'C4', 'P3', 'P4', 'O1', 'O2', 'F7', 'F8', 'T7', 'T8', 'P7', 'P8', 'Fz', 'Cz', 'Pz', 'Class', 'ID']
Unique IDs (121): ['v10p' 'v12p' 'v14p' 'v15p' 'v173' 'v18p' 'v19p' 'v1p' 'v20p' 'v21p'
 'v22p' 'v24p' 'v25p' 'v27p' 'v28p' 'v29p' 'v30p' 'v31p' 'v32p' 'v33p'
 'v34p' 'v35p' 'v36p' 'v37p' 'v38p' 'v39p' 'v3p' 'v40p' 'v6p' 'v8p' 'v177'
 'v179' 'v181' 'v183' 'v190' 'v196' 'v198' 'v200' 'v204' 'v206' 'v209'
 'v213' 'v215' 'v219' 'v227' 'v231' 'v234' 'v236' 'v238' 'v244' 'v246'
 'v250' 'v254' 'v263' 'v265' 'v270' 'v274' 'v279' 'v284' 'v286' 'v288'
 'v107' 'v108' 'v109' 'v110' 'v111' 'v112' 'v113' 'v114' 'v115' 'v116'
 'v41p' 'v42p' 'v43p' 'v44p' 'v45p' 'v46p' 'v47p' 'v48p' 'v49p' 'v50p'
 'v51p' 'v52p' 'v53p' 'v54p' 'v55p' 'v56p' 'v57p' 'v58p' 'v59p' 'v60p'
 'v117' 'v118' 'v120' 'v121' 'v123' 'v125' 'v127' 'v129' 'v131' 'v133'
 'v134' 'v138' 'v140' 'v143' 'v147' 'v149' 'v151' 'v297' 'v298' 'v299'
 'v300' 'v302' 'v303' 'v304'

In [None]:
%pip install mne

import mne

def apply_ica_cleaning(
    df: pd.DataFrame,
    sfreq: float,
    random_state=97,
    hf_cutoff: float = 20.0,
    hf_ratio_threshold: float = 0.25,
) -> pd.DataFrame:
    """Band-pass an EEG recording, fit ICA and remove artifact-like components.

    Every column of ``df`` is treated as one EEG channel. The data is
    band-pass filtered (1-60 Hz), an ICA keeping 95% of the explained
    variance is fitted, and components flagged by either detector below are
    excluded before the signal is projected back to channel space.

    Parameters
    ----------
    df : pd.DataFrame
        One column per channel, one row per sample; all columns must be
        numeric because they are passed to MNE as the data matrix.
    sfreq : float
        Sampling frequency in Hz.
    random_state : int or None, default 97
        Seed forwarded to ``mne.preprocessing.ICA`` so that the FastICA
        decomposition (and therefore which components get removed) is
        reproducible across runs. Pass ``None`` for an unseeded fit.
    hf_cutoff : float, default 20.0
        Frequency (Hz) from which power counts as "high frequency" in the
        muscle heuristic.
    hf_ratio_threshold : float, default 0.25
        A component is flagged as muscle-like when more than this share of
        its Welch power lies at or above ``hf_cutoff``.

    Returns
    -------
    pd.DataFrame
        Filtered (and, if any component was excluded, ICA-cleaned) signal
        with the same column names as ``df`` and a fresh 0..n-1 index.
    """
    # Kept as a function-scope import, as in the original cell.
    from scipy.signal import welch

    ch_names = df.columns.tolist()
    info = mne.create_info(
        ch_names=ch_names, sfreq=sfreq, ch_types=["eeg"] * len(ch_names)
    )
    raw = mne.io.RawArray(df.to_numpy().T, info, verbose=False)

    # --- Band-pass 1-60 Hz before fitting ICA ---
    # NOTE(review): the 60 Hz edge assumes sfreq is comfortably above 120 Hz.
    raw.filter(1.0, 60.0, fir_design="firwin", verbose=False)

    # --- Fit ICA (seeded so the decomposition is repeatable) ---
    ica = mne.preprocessing.ICA(
        n_components=0.95,
        method="fastica",
        random_state=random_state,
        verbose=False,
    )
    ica.fit(raw, picks="eeg", decim=3)

    # --- Detect EOG-like (blink) components if present ---
    # Deliberately best-effort: any failure simply means "no EOG components".
    # NOTE(review): every channel is declared as "eeg" above, so this call
    # presumably has no EOG reference channel and always lands in the except
    # branch -- confirm whether a frontal channel should be passed as ch_name.
    try:
        eog_inds, _ = ica.find_bads_eog(raw)
    except Exception:
        eog_inds = []

    # --- Detect muscle-like components (high-frequency power share) ---
    sources = ica.get_sources(raw).get_data()
    source_sfreq = raw.info["sfreq"]
    muscle_candidates = []
    for ic_idx, source in enumerate(sources):
        freqs, pxx = welch(source, source_sfreq, nperseg=512)
        total_power = pxx.sum()
        if total_power <= 0:
            # A flat component has no defined power ratio; never flag it.
            continue
        hf_ratio = pxx[freqs >= hf_cutoff].sum() / total_power
        if hf_ratio > hf_ratio_threshold:
            muscle_candidates.append(ic_idx)

    # Union of both detectors, as plain ints in ascending order so the log
    # line and ``ica.exclude`` are stable from run to run.
    exclude = sorted({int(i) for i in eog_inds} | set(muscle_candidates))
    ica.exclude = exclude
    if exclude:
        print(f"ICA removed components: {exclude}")
        raw_clean = ica.apply(raw.copy())
    else:
        print("No ICA components removed.")
        raw_clean = raw

    # --- Convert back to pandas DataFrame (samples x channels) ---
    return pd.DataFrame(raw_clean.get_data().T, columns=ch_names)

Note: you may need to restart the kernel to use updated packages.


In [26]:
def apply_filter(df: pd.DataFrame, lower=0.5, upper=30, sample_rate=None):
    """Band-pass every numeric column with a brick-wall FFT filter.

    Each numeric column is transformed with a real FFT, every bin whose
    frequency lies outside ``[lower, upper]`` (both ends inclusive) is set to
    zero, and the result is transformed back to the time domain. Non-numeric
    columns are carried over unchanged. The input frame is not modified.

    Parameters
    ----------
    df : pd.DataFrame
        One row per sample; numeric columns are filtered.
    lower, upper : float
        Pass-band edges in Hz.
    sample_rate : float, optional
        Sampling frequency in Hz. Defaults to the module-level
        ``SAMPLE_RATE`` when omitted.

    Returns
    -------
    pd.DataFrame
        A copy of ``df`` whose numeric columns hold the filtered signals.
    """
    fs = SAMPLE_RATE if sample_rate is None else sample_rate
    filtered_df = df.copy()

    n = len(df)
    if n == 0:
        # No samples: a frequency axis cannot be built, nothing to filter.
        return filtered_df

    electrode_columns = df.select_dtypes(include=[np.number]).columns
    freqs = np.fft.rfftfreq(n, d=1 / fs)
    # The mask depends only on n and the band edges, so build it once.
    stop_band = ~((freqs >= lower) & (freqs <= upper))

    for electrode in electrode_columns:
        spectrum = np.fft.rfft(df[electrode].to_numpy())
        spectrum[stop_band] = 0
        # n is passed explicitly so odd-length signals keep their length.
        filtered_df[electrode] = np.fft.irfft(spectrum, n=n)

    return filtered_df

In [None]:
def process_window(window, saved_class, idx, window_count,
                   sample_rate=None, frequencies=None, sink=None):
    """Turn one time window into per-frequency power rows.

    For every numeric column of ``window`` the squared magnitude of the real
    FFT is computed (no tapering, no normalisation) and linearly interpolated
    onto the target frequency grid. One row (a dict) is produced per target
    frequency, holding the bookkeeping fields ``ID``, ``Class``, ``Window``
    and ``Frequency`` followed by one power value per numeric column.

    Parameters
    ----------
    window : pd.DataFrame
        Samples of one window; numeric columns are treated as electrodes.
    saved_class
        Value stored under ``"Class"`` in every row.
    idx
        Value stored under ``"ID"`` in every row.
    window_count
        Value stored under ``"Window"`` in every row.
    sample_rate : float, optional
        Sampling frequency in Hz; defaults to the module-level ``SAMPLE_RATE``.
    frequencies : array-like, optional
        Target frequency bins in Hz; defaults to the module-level
        ``FREQUENCIES``.
    sink : list, optional
        List the rows are appended to; defaults to the module-level
        ``results`` list, which keeps the original call style working.

    Returns
    -------
    list of dict
        The rows generated for this window (also appended to ``sink``).
        Empty when ``window`` has no samples.
    """
    fs = SAMPLE_RATE if sample_rate is None else sample_rate
    target_freqs = (
        FREQUENCIES if frequencies is None else np.asarray(frequencies, dtype=float)
    )
    # `is None` rather than truthiness: an empty list is a valid sink.
    out = results if sink is None else sink

    n = len(window)
    if n == 0:
        # No samples means no spectrum; emit nothing instead of crashing.
        return []

    # numeric columns = EEG electrodes
    electrode_columns = window.select_dtypes(include=[np.number]).columns
    original_freqs = np.fft.rfftfreq(n, d=1 / fs)

    # power spectrum per electrode, resampled onto the common frequency grid
    electrode_powers = {}
    for electrode in electrode_columns:
        power = np.abs(np.fft.rfft(window[electrode].to_numpy())) ** 2
        electrode_powers[electrode] = np.interp(target_freqs, original_freqs, power)

    # build rows: one per frequency bin
    rows = []
    for i, f in enumerate(target_freqs):
        row = {
            "ID": idx,
            "Class": saved_class,
            "Window": window_count,
            "Frequency": f
        }
        for electrode in electrode_columns:
            row[electrode] = electrode_powers[electrode][i]
        rows.append(row)

    out.extend(rows)
    return rows

In [28]:
def apply_ica_cleaning_to_dataset(df: pd.DataFrame):
    """Run ICA cleaning on the numeric columns of ``df`` and keep the rest.

    The numeric columns are passed to ``apply_ica_cleaning`` at
    ``SAMPLE_RATE``; the cleaned signals then replace those columns in a copy
    of ``df``. Non-numeric columns are returned unchanged, and the input
    frame is not modified.

    Parameters
    ----------
    df : pd.DataFrame
        One row per sample; numeric columns are treated as EEG channels.

    Returns
    -------
    pd.DataFrame
        Copy of ``df`` (same columns, same index) with cleaned numeric columns.
    """
    result_df = df.copy()
    numerical_df = df.select_dtypes(include=[np.number])
    cleaned_df = apply_ica_cleaning(numerical_df, SAMPLE_RATE)
    for col in numerical_df.columns:
        # Assign by position: cleaned_df carries a fresh 0..n-1 index, so a
        # label-aligned assignment would produce NaNs for any other index.
        result_df[col] = cleaned_df[col].to_numpy()

    # Return the merged frame (the original returned cleaned_df, which
    # discarded the non-numeric columns that result_df was built to keep).
    return result_df

In [29]:
# Start from empty accumulators so that re-running this cell does not append
# a second copy of every row to `results` or continue the old window count.
results = []
window_count = 0

for idx_i, idx in enumerate(indices):
    print(f"\nProcessing ID {idx_i+1}/{len(indices)}: {idx}")
    subset = df[df['ID'] == idx].reset_index(drop=True)

    # Validate the label first: there is no point paying for filtering and
    # ICA on a recording that is going to be rejected anyway.
    saved_class = subset['Class'].unique()
    if len(saved_class) != 1:
        raise ValueError(f"ID {idx} has multiple classes: {saved_class}")
    saved_class = saved_class[0]

    # Band-pass 0.5-30 Hz, then ICA-based artifact removal.
    subset = apply_filter(subset, 0.5, 30)
    subset = apply_ica_cleaning_to_dataset(subset)

    specific_window_count = 0
    n_samples = len(subset)

    # Sliding window with overlap; a trailing partial window is dropped, and
    # recordings shorter than one window produce no rows at all.
    for start in range(0, n_samples - SAMPLES_PER_WINDOW + 1, STEP):
        window = subset.iloc[start:start + SAMPLES_PER_WINDOW]
        process_window(window, saved_class, idx, window_count)

        window_count += 1
        specific_window_count += 1
    print(f"  Windows generated for ID {idx}: {specific_window_count}")


Processing ID 1/121: v10p
Fitting ICA to data using 19 channels (please be patient, this may take a while)
Selecting by explained variance: 12 components
Fitting ICA took 0.0s.
No ICA components removed.
  Windows generated for ID v10p: 54

Processing ID 2/121: v12p
Fitting ICA to data using 19 channels (please be patient, this may take a while)
Selecting by explained variance: 12 components
Fitting ICA took 0.1s.
No ICA components removed.
  Windows generated for ID v12p: 67

Processing ID 3/121: v14p
Fitting ICA to data using 19 channels (please be patient, this may take a while)
Selecting by explained variance: 9 components
Fitting ICA took 0.0s.
No ICA components removed.
  Windows generated for ID v14p: 67

Processing ID 4/121: v15p
Fitting ICA to data using 19 channels (please be patient, this may take a while)
Selecting by explained variance: 10 components
Fitting ICA took 0.2s.
No ICA components removed.
  Windows generated for ID v15p: 167

Processing ID 5/121: v173
Fitting I



No ICA components removed.
  Windows generated for ID v238: 37

Processing ID 50/121: v244
Fitting ICA to data using 19 channels (please be patient, this may take a while)
Selecting by explained variance: 9 components
Fitting ICA took 0.1s.
No ICA components removed.
  Windows generated for ID v244: 151

Processing ID 51/121: v246
Fitting ICA to data using 19 channels (please be patient, this may take a while)
Selecting by explained variance: 9 components
Fitting ICA took 0.0s.
ICA removed components: [8, 7]
Applying ICA to Raw instance
    Transforming to ICA space (9 components)
    Zeroing out 2 ICA components
    Projecting back using 19 PCA components
  Windows generated for ID v246: 82

Processing ID 52/121: v250
Fitting ICA to data using 19 channels (please be patient, this may take a while)
Selecting by explained variance: 9 components
Fitting ICA took 0.0s.
No ICA components removed.
  Windows generated for ID v250: 53

Processing ID 53/121: v254
Fitting ICA to data using 19 c



No ICA components removed.
  Windows generated for ID v121: 62

Processing ID 96/121: v123
Fitting ICA to data using 19 channels (please be patient, this may take a while)
Selecting by explained variance: 11 components
Fitting ICA took 0.0s.
No ICA components removed.
  Windows generated for ID v123: 55

Processing ID 97/121: v125
Fitting ICA to data using 19 channels (please be patient, this may take a while)
Selecting by explained variance: 12 components
Fitting ICA took 0.0s.
No ICA components removed.
  Windows generated for ID v125: 59

Processing ID 98/121: v127
Fitting ICA to data using 19 channels (please be patient, this may take a while)
Selecting by explained variance: 10 components
Fitting ICA took 0.0s.
ICA removed components: [5, 6]
Applying ICA to Raw instance
    Transforming to ICA space (10 components)
    Zeroing out 2 ICA components
    Projecting back using 19 PCA components
  Windows generated for ID v127: 56

Processing ID 99/121: v129
Fitting ICA to data using 1



No ICA components removed.
  Windows generated for ID v133: 57

Processing ID 102/121: v134
Fitting ICA to data using 19 channels (please be patient, this may take a while)
Selecting by explained variance: 12 components
Fitting ICA took 0.0s.
No ICA components removed.
  Windows generated for ID v134: 51

Processing ID 103/121: v138
Fitting ICA to data using 19 channels (please be patient, this may take a while)
Selecting by explained variance: 9 components
Fitting ICA took 0.0s.
No ICA components removed.
  Windows generated for ID v138: 47

Processing ID 104/121: v140
Fitting ICA to data using 19 channels (please be patient, this may take a while)
Selecting by explained variance: 8 components
Fitting ICA took 0.0s.
No ICA components removed.
  Windows generated for ID v140: 66

Processing ID 105/121: v143
Fitting ICA to data using 19 channels (please be patient, this may take a while)
Selecting by explained variance: 6 components
Fitting ICA took 0.0s.
No ICA components removed.
  Wi

In [None]:
# Assemble the rows accumulated in `results` (one dict per window and
# frequency bin) into a single table and write it to OUTPUT_CSV.
outer_df = pd.DataFrame(results)
print("\nFinal shape:", outer_df.shape)
# index=False: the positional row index is not written to the file.
# NOTE(review): the recorded output of this cell shows the shape line followed
# by an exception and no "Saved to" line -- confirm the CSV was written fully.
outer_df.to_csv(OUTPUT_CSV, index=False)
print(f"Saved to {OUTPUT_CSV}")


Final shape: (1274966, 23)


Exception: 