# EEG Spectrogram Generation and Dataset Splitting for Sleep Apnea Classification

In [1]:
import os
import glob

import numpy as np
import mne
from scipy.signal import spectrogram
import matplotlib

# Use a non-interactive backend to avoid GUI pop-ups during processing
matplotlib.use('Agg')
import matplotlib.pyplot as plt

In [2]:
# ─── CONFIGURATION ──────────────────────────────────────────────────────────────
raw_dir = "../data/eeg_signals"         # Directory containing EEG .edf files
out_dir = "../data/eeg_spectrograms"    # Directory to save spectrogram images
Fs = 128                                # Sampling frequency (Hz)
epoch_sec = 30                          # Length of each epoch (seconds)
L = Fs * epoch_sec                      # Total samples per epoch

# Short-time analysis window. It must be much shorter than the epoch: a window
# spanning all L samples would leave room for only a single STFT frame, i.e.
# a spectrogram with one time column and no time axis to speak of.
win_sec = 2                             # Window length (seconds) -> 1/win_sec Hz resolution
win_len = int(Fs * win_sec)             # Window length (samples)
window = np.hamming(win_len)            # Hamming taper applied to every STFT frame
noverlap = win_len // 2                 # 50% overlap between consecutive frames
nfft = 2 ** int(np.ceil(np.log2(win_len)))  # FFT length (next power-of-two >= win_len)

channel_name = 'C3A2'

# Make sure the image output directory exists before anything is written to it
os.makedirs(out_dir, exist_ok=True)

# Default colormap for every subsequent imshow call. It is set through rcParams
# rather than plt.set_cmap(), because set_cmap() also looks up the current
# image and thereby creates an empty figure when none is open yet.
plt.rcParams['image.cmap'] = 'viridis'

In [3]:
# ─── SPECTROGRAM GENERATION ────────────────────────────────────────────────────
# For every EDF recording in `raw_dir`: extract the configured channel, cut it
# into consecutive, non-overlapping epochs of L samples, compute the spectrogram
# of each epoch and save it as a PNG image in `out_dir`.
#
# Raises:
#   FileNotFoundError - no .edf file is present in `raw_dir`
#   ValueError        - a recording lacks `channel_name`, or its sampling rate
#                       differs from the configured `Fs`

# Find all EDF files in the specified directory (sorted for a stable order)
edf_list = sorted(glob.glob(os.path.join(raw_dir, "*.edf")))
if not edf_list:
    raise FileNotFoundError(f"No .edf files found in {raw_dir}")

# Dynamic range kept below each epoch's peak power: 1e-12 corresponds to 120 dB
floor_ratio = 1e-12

# Iterate over each EDF file
for edf_path in edf_list:
    # Load EDF file into MNE Raw object
    raw = mne.io.read_raw_edf(edf_path, preload=True, verbose=False)

    # List the channels MNE typed as EEG (informational output only)
    eeg_chs = mne.pick_types(raw.info, eeg=True)
    available = [raw.info['ch_names'][i] for i in eeg_chs]
    print(f"Detected EEG channels in {os.path.basename(edf_path)}: {available}")

    # Verify the specified EEG channel is present
    if channel_name not in raw.ch_names:
        raise ValueError(f"Channel '{channel_name}' not found in {edf_path}")

    # The epoch length L and the frequency axis are both derived from Fs, so a
    # recording with another sampling rate would be cut and labelled wrongly.
    sfreq = raw.info['sfreq']
    if not np.isclose(sfreq, Fs):
        raise ValueError(
            f"Sampling rate of {edf_path} is {sfreq} Hz, expected {Fs} Hz"
        )

    # Extract data for the specified channel as a 1D numpy array
    sig = raw.get_data(picks=[channel_name])[0]

    # Determine the file's base name and number of epochs
    base = os.path.splitext(os.path.basename(edf_path))[0]
    nseg = len(sig) // L  # complete epochs only; a trailing partial epoch is dropped

    # Process each epoch individually
    for i in range(nseg):
        # Extract epoch segment
        seg = sig[i*L:(i+1)*L]

        # Compute spectrogram using Short-Time Fourier Transform (STFT)
        f, t, Sxx = spectrogram(
            seg,
            fs=Fs,
            window=window,
            noverlap=noverlap,
            nfft=nfft
        )

        # Convert power to decibels (dB). The floor is relative to this epoch's
        # peak instead of a fixed additive offset: a constant such as 1e-12 is
        # not negligible when the signal is expressed in volts and would
        # flatten the low-power part of the image. An all-zero epoch falls back
        # to the smallest positive float so log10 never sees zero.
        peak = Sxx.max()
        floor = peak * floor_ratio if peak > 0 else np.finfo(float).tiny
        Sxx_db = 10 * np.log10(np.maximum(Sxx, floor))

        # Visualize spectrogram
        fig, ax = plt.subplots()
        im = ax.imshow(
            Sxx_db,
            origin='lower',
            aspect='auto',
            extent=[0, epoch_sec, f.min(), f.max()]
        )
        ax.set_xlabel('Time (s)')
        ax.set_ylabel('Frequency (Hz)')
        ax.set_title(f"{base} – {channel_name} Segment {i+1}")
        fig.colorbar(im, ax=ax, label='Power/Frequency (dB/Hz)')

        # Save spectrogram as PNG
        fig.tight_layout()
        out_fn = f"{base}_{channel_name}_spectrogram_{i+1}.png"
        fig.savefig(os.path.join(out_dir, out_fn), bbox_inches='tight', dpi=150)

        # Close the figure right away; one is created per epoch and they would
        # otherwise accumulate in memory
        plt.close(fig)

print("✓ Spectrogram generation complete.")

  raw = mne.io.read_raw_edf(edf_path, preload=True, verbose=False)
  raw = mne.io.read_raw_edf(edf_path, preload=True, verbose=False)


Detected EEG channels in ucddb002.edf: ['Lefteye', 'RightEye', 'EMG', 'C3A2', 'C4A1', 'ECG', 'SpO2', 'Sound', 'Flow', 'Sum', 'ribcage', 'abdo', 'BodyPos', 'Pulse']


  raw = mne.io.read_raw_edf(edf_path, preload=True, verbose=False)
  raw = mne.io.read_raw_edf(edf_path, preload=True, verbose=False)


Detected EEG channels in ucddb003.edf: ['Lefteye', 'RightEye', 'EMG', 'C3A2', 'C4A1', 'ECG', 'SpO2', 'Soud', 'Flow', 'Sum', 'ribcage', 'abdo', 'BodyPos', 'Pulse']


  raw = mne.io.read_raw_edf(edf_path, preload=True, verbose=False)
  raw = mne.io.read_raw_edf(edf_path, preload=True, verbose=False)


Detected EEG channels in ucddb005.edf: ['Lefteye', 'RightEye', 'EMG', 'C3A2', 'C4A1', 'ECG', 'SpO2', 'Sound', 'Flow', 'Sum', 'ribcage', 'abdo', 'BodyPos', 'Pulse']


  raw = mne.io.read_raw_edf(edf_path, preload=True, verbose=False)
  raw = mne.io.read_raw_edf(edf_path, preload=True, verbose=False)


Detected EEG channels in ucddb006.edf: ['Lefteye', 'RightEye', 'EMG', 'C3A2', 'C4A1', 'ECG', 'SpO2', 'Sound', 'Flow', 'Sum', 'ribcage', 'abdo', 'BodyPos', 'Pulse']


  raw = mne.io.read_raw_edf(edf_path, preload=True, verbose=False)
  raw = mne.io.read_raw_edf(edf_path, preload=True, verbose=False)


Detected EEG channels in ucddb007.edf: ['Lefteye', 'RightEye', 'EMG', 'C3A2', 'C4A1', 'ECG', 'SpO2', 'Sound', 'Flow', 'Sum', 'ribcage', 'abdo', 'BodyPos', 'Pulse']


  raw = mne.io.read_raw_edf(edf_path, preload=True, verbose=False)
  raw = mne.io.read_raw_edf(edf_path, preload=True, verbose=False)


Detected EEG channels in ucddb008.edf: ['Lefteye', 'RightEye', 'EMG', 'C3A2', 'C4A1', 'ECG', 'SpO2', 'Sound', 'Flow', 'Sum', 'ribcage', 'abdo', 'BodyPos', 'Pulse']


  raw = mne.io.read_raw_edf(edf_path, preload=True, verbose=False)
  raw = mne.io.read_raw_edf(edf_path, preload=True, verbose=False)


Detected EEG channels in ucddb009.edf: ['Lefteye', 'RightEye', 'EMG', 'C3A2', 'C4A1', 'ECG', 'SpO2', 'Sound', 'Flow', 'Sum', 'ribcage', 'abdo', 'BodyPos', 'Pulse']


  raw = mne.io.read_raw_edf(edf_path, preload=True, verbose=False)
  raw = mne.io.read_raw_edf(edf_path, preload=True, verbose=False)


Detected EEG channels in ucddb010.edf: ['Lefteye', 'RightEye', 'EMG', 'C3A2', 'C4A1', 'ECG', 'SpO2', 'Soud', 'Flow', 'Sum', 'ribcage', 'abdo', 'BodyPos', 'Pulse']


  raw = mne.io.read_raw_edf(edf_path, preload=True, verbose=False)
  raw = mne.io.read_raw_edf(edf_path, preload=True, verbose=False)


Detected EEG channels in ucddb011.edf: ['Lefteye', 'RightEye', 'EMG', 'C3A2', 'C4A1', 'Right leg', 'ECG', 'Left leg', 'SpO2', 'Sound', 'Flow', 'Sum', 'ribcage', 'abdo', 'BodyPos', 'Pulse']


  raw = mne.io.read_raw_edf(edf_path, preload=True, verbose=False)
  raw = mne.io.read_raw_edf(edf_path, preload=True, verbose=False)


Detected EEG channels in ucddb012.edf: ['Lefteye', 'RightEye', 'EMG', 'C3A2', 'C4A1', 'ECG', 'SpO2', 'Soud', 'Flow', 'Sum', 'ribcage', 'abdo', 'BodyPos', 'Pulse']


  raw = mne.io.read_raw_edf(edf_path, preload=True, verbose=False)
  raw = mne.io.read_raw_edf(edf_path, preload=True, verbose=False)


Detected EEG channels in ucddb013.edf: ['Lefteye', 'RightEye', 'EMG', 'C3A2', 'C4A1', 'ECG', 'SpO2', 'Soud', 'Flow', 'Sum', 'ribcage', 'abdo', 'BodyPos', 'Pulse']


  raw = mne.io.read_raw_edf(edf_path, preload=True, verbose=False)
  raw = mne.io.read_raw_edf(edf_path, preload=True, verbose=False)


Detected EEG channels in ucddb014.edf: ['Lefteye', 'RightEye', 'EMG', 'C3A2', 'C4A1', 'ECG', 'SpO2', 'Soud', 'Flow', 'Sum', 'ribcage', 'abdo', 'BodyPos', 'Pulse']


  raw = mne.io.read_raw_edf(edf_path, preload=True, verbose=False)
  raw = mne.io.read_raw_edf(edf_path, preload=True, verbose=False)


Detected EEG channels in ucddb015.edf: ['Lefteye', 'RightEye', 'EMG', 'C3A2', 'C4A1', 'ECG', 'SpO2', 'Sound', 'Flow', 'Sum', 'ribcage', 'abdo', 'BodyPos', 'Pulse']


  raw = mne.io.read_raw_edf(edf_path, preload=True, verbose=False)
  raw = mne.io.read_raw_edf(edf_path, preload=True, verbose=False)


Detected EEG channels in ucddb017.edf: ['Lefteye', 'RightEye', 'EMG', 'C3A2', 'C4A1', 'ECG', 'SpO2', 'Soud', 'Flow', 'Sum', 'ribcage', 'abdo', 'BodyPos', 'Pulse']


  raw = mne.io.read_raw_edf(edf_path, preload=True, verbose=False)
  raw = mne.io.read_raw_edf(edf_path, preload=True, verbose=False)


Detected EEG channels in ucddb018.edf: ['Lefteye', 'RightEye', 'EMG', 'C3A2', 'C4A1', 'ECG', 'SpO2', 'Sound', 'Flow', 'Sum', 'ribcage', 'abdo', 'BodyPos', 'Pulse']


  raw = mne.io.read_raw_edf(edf_path, preload=True, verbose=False)
  raw = mne.io.read_raw_edf(edf_path, preload=True, verbose=False)


Detected EEG channels in ucddb019.edf: ['Lefteye', 'RightEye', 'EMG', 'C3A2', 'C4A1', 'ECG', 'SpO2', 'Sound', 'Flow', 'Sum', 'ribcage', 'abdo', 'BodyPos', 'Pulse']


  raw = mne.io.read_raw_edf(edf_path, preload=True, verbose=False)
  raw = mne.io.read_raw_edf(edf_path, preload=True, verbose=False)


Detected EEG channels in ucddb020.edf: ['Lefteye', 'RightEye', 'EMG', 'C3A2', 'C4A1', 'ECG', 'SpO2', 'Soud', 'Flow', 'Sum', 'ribcage', 'abdo', 'BodyPos', 'Pulse']


  raw = mne.io.read_raw_edf(edf_path, preload=True, verbose=False)
  raw = mne.io.read_raw_edf(edf_path, preload=True, verbose=False)


Detected EEG channels in ucddb021.edf: ['Lefteye', 'RightEye', 'EMG', 'C3A2', 'C4A1', 'ECG', 'SpO2', 'Sound', 'Flow', 'Sum', 'ribcage', 'abdo', 'BodyPos', 'Pulse']


  raw = mne.io.read_raw_edf(edf_path, preload=True, verbose=False)
  raw = mne.io.read_raw_edf(edf_path, preload=True, verbose=False)


Detected EEG channels in ucddb022.edf: ['Lefteye', 'RightEye', 'EMG', 'C3A2', 'C4A1', 'ECG', 'SpO2', 'Soud', 'Flow', 'Sum', 'ribcage', 'abdo', 'BodyPos', 'Pulse']


  raw = mne.io.read_raw_edf(edf_path, preload=True, verbose=False)
  raw = mne.io.read_raw_edf(edf_path, preload=True, verbose=False)


Detected EEG channels in ucddb023.edf: ['Lefteye', 'RightEye', 'EMG', 'C3A2', 'C4A1', 'ECG', 'SpO2', 'Sound', 'Flow', 'Sum', 'ribcage', 'abdo', 'BodyPos', 'Pulse']


  raw = mne.io.read_raw_edf(edf_path, preload=True, verbose=False)
  raw = mne.io.read_raw_edf(edf_path, preload=True, verbose=False)


Detected EEG channels in ucddb024.edf: ['Lefteye', 'RightEye', 'EMG', 'C3A2', 'C4A1', 'ECG', 'SpO2', 'Sound', 'Flow', 'Sum', 'ribcage', 'abdo', 'BodyPos', 'Pulse']


  raw = mne.io.read_raw_edf(edf_path, preload=True, verbose=False)
  raw = mne.io.read_raw_edf(edf_path, preload=True, verbose=False)


Detected EEG channels in ucddb025.edf: ['Lefteye', 'RightEye', 'EMG', 'C3A2', 'C4A1', 'ECG', 'SpO2', 'Soud', 'Flow', 'Sum', 'ribcage', 'abdo', 'BodyPos', 'Pulse']


  raw = mne.io.read_raw_edf(edf_path, preload=True, verbose=False)
  raw = mne.io.read_raw_edf(edf_path, preload=True, verbose=False)


Detected EEG channels in ucddb026.edf: ['Lefteye', 'RightEye', 'EMG', 'C3A2', 'C4A1', 'ECG', 'SpO2', 'Sound', 'Flow', 'Sum', 'ribcage', 'abdo', 'BodyPos', 'Pulse']


  raw = mne.io.read_raw_edf(edf_path, preload=True, verbose=False)
  raw = mne.io.read_raw_edf(edf_path, preload=True, verbose=False)


Detected EEG channels in ucddb027.edf: ['Lefteye', 'RightEye', 'EMG', 'C3A2', 'C4A1', 'ECG', 'SpO2', 'Soud', 'Flow', 'Sum', 'ribcage', 'abdo', 'BodyPos', 'Pulse']


  raw = mne.io.read_raw_edf(edf_path, preload=True, verbose=False)
  raw = mne.io.read_raw_edf(edf_path, preload=True, verbose=False)


Detected EEG channels in ucddb028.edf: ['Lefteye', 'RightEye', 'EMG', 'C3A2', 'C4A1', 'ECG', 'SpO2', 'Sound', 'Flow', 'Sum', 'ribcage', 'abdo', 'BodyPos', 'Pulse']
✓ Spectrogram generation complete.


In [11]:
# ─── EXPLANATION OF OUTCOME ──────────────────────────────────────────────────────
# The script processes EEG recordings from EDF files, segments them into fixed
# 30-second intervals, computes the spectrograms for each segment, and saves
# each spectrogram as a high-resolution PNG image.
#
# Each image clearly visualizes how EEG signal power varies with frequency
# over the 30-second epoch. These spectrogram images are suitable inputs for
# deep learning models (e.g., CNNs or YOLO architectures) for tasks such as
# classifying sleep stages or detecting sleep-related disorders like apnea.
#
# Outcomes:
# - Directory filled with systematically named spectrogram images.
# - Each image can be directly utilized in machine learning workflows.

In [12]:
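# Subjects grouped by apnea–hypopnea index (AHI) severity class, with the
# number of spectrogram images generated for each recording.
#
# AHI < 5 (normal):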
# ucddb018 (ahi = 2, eeg_spectrograms_count = 822)
# total = 1 subject, 822 spectrograms
#
# 5 ≤ AHI < 15 (mild):
# ucddb005 (ahi = 13, eeg_spectrograms_count = 826)
# ucddb007 (ahi = 12, eeg_spectrograms_count = 813)
# ucddb008 (ahi = 5, eeg_spectrograms_count = 768)
# ucddb009 (ahi = 12, eeg_spectrograms_count = 925)
# ucddb011 (ahi = 8, eeg_spectrograms_count = 901)
# ucddb015 (ahi = 6, eeg_spectrograms_count = 916)
# ucddb017 (ahi = 12, eeg_spectrograms_count = 789)
# ucddb021 (ahi = 13, eeg_spectrograms_count = 913)
# ucddb022 (ahi = 7, eeg_spectrograms_count = 788)
# ucddb026 (ahi = 14, eeg_spectrograms_count = 838)
# total = 10 subjects, 8477 spectrograms
#
# 15 ≤ AHI < 30 (moderate):
# ucddb002 (ahi = 23, eeg_spectrograms_count = 749)
# ucddb012 (ahi = 25, eeg_spectrograms_count = 864)
# ucddb013 (ahi = 16, eeg_spectrograms_count = 811)
# ucddb019 (ahi = 16, eeg_spectrograms_count = 852)
# ucddb020 (ahi = 15, eeg_spectrograms_count = 752)
# ucddb024 (ahi = 24, eeg_spectrograms_count = 908)
# total = 6 subjects, 4936 spectrograms
#
# AHI ≥ 30 (severe):
# ucddb003 (ahi = 51, eeg_spectrograms_count = 882)
# ucddb006 (ahi = 31, eeg_spectrograms_count = 808)
# ucddb010 (ahi = 34, eeg_spectrograms_count = 907)
# ucddb014 (ahi = 36, eeg_spectrograms_count = 774)
# ucddb023 (ahi = 39, eeg_spectrograms_count = 861)
# ucddb025 (ahi = 91, eeg_spectrograms_count = 711)
# ucddb027 (ahi = 55, eeg_spectrograms_count = 893)
# ucddb028 (ahi = 46, eeg_spectrograms_count = 722)
# total = 8 subjects, 6558 spectrograms

In [13]:
import random
import shutil

In [14]:
# Share of each class's images assigned to the training and validation subsets.
# No test ratio is defined: every image left over after these two shares
# (the remaining 15%) goes to the test subset.
train_ratio, val_ratio = 0.70, 0.15

In [15]:
# ─── DATA SPLITTING ─────────────────────────────────────────────────────────────
# For every class subfolder of `out_dir`, move its PNG images into train/, val/
# and test/ subfolders according to `train_ratio` and `val_ratio`; whatever is
# left after those two shares goes to test/.
#
# NOTE(review): the split is made per image. If a class folder holds epochs of
# the same recording, they can land in different subsets; confirm this is
# acceptable for the intended evaluation.

# Fixed seed + sorted file lists make the split reproducible: os.listdir()
# returns names in arbitrary order and an unseeded shuffle differs on every run.
split_seed = 42
rng = random.Random(split_seed)

# Iterate through each class subfolder within the spectrograms directory
for feature in sorted(os.listdir(out_dir)):
    feature_dir = os.path.join(out_dir, feature)
    if not os.path.isdir(feature_dir):
        continue  # Skip if not a directory

    # Create directories for training, validation, and test subsets
    for split in ("train", "val", "test"):
        os.makedirs(os.path.join(feature_dir, split), exist_ok=True)

    # List (files only) and shuffle all spectrogram images. Images already
    # moved into a subset folder are not listed, so a re-run leaves them alone.
    pics = sorted(
        f for f in os.listdir(feature_dir)
        if f.endswith(".png") and os.path.isfile(os.path.join(feature_dir, f))
    )
    rng.shuffle(pics)

    # Compute the number of images for each subset (fractions are truncated,
    # so any rounding remainder ends up in the test subset)
    n_total = len(pics)
    n_train = int(n_total * train_ratio)
    n_val = int(n_total * val_ratio)

    # Distribute images into the respective subsets
    subsets = (
        ("train", pics[:n_train]),
        ("val", pics[n_train:n_train + n_val]),
        ("test", pics[n_train + n_val:]),
    )
    for dst, members in subsets:
        for pic in members:
            # Move the image to the corresponding subset folder
            src_path = os.path.join(feature_dir, pic)
            dst_path = os.path.join(feature_dir, dst, pic)
            shutil.move(src_path, dst_path)

print("✓ Dataset split into train/val/test subsets complete.")

✓ Dataset split into train/val/test subsets complete.


In [16]:
# ─── EXPLANATION OF OUTCOME ──────────────────────────────────────────────────────
# This script splits spectrogram images into training, validation, and test sets
# following the defined ratios. It ensures each class has proportionally divided
# subsets, facilitating robust training and evaluation of machine learning models.
#
# Outcomes:
# - Clear directory structure organized into train, validation, and test subsets.
# - Ready-to-use dataset structure suitable for machine learning tasks.