# Data Preprocessing

This notebook is for data acquisition(conversion to MNE) and preprocessing the data with basic steps as follows:
1. Data Aquisition
2. Exploratory analysis
3. Preprocessing
    - Filtering
    - Artifact Removal
    - Epoching
    - Normalisation
4. Optimizations

In [5]:
import os
import numpy as np
import matplotlib.pyplot as plt
import mne

# For loading .mat files
import scipy.io as sio
import h5py

# For artifact removal
from mne.preprocessing import ICA

# For visualising
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# Set aesthetic style for plots
sns.set(style="whitegrid", palette="viridis", font_scale=1.2)

# Check basic system details to monitor resource usage
import platform
print("Running on:", platform.platform())

Running on: macOS-15.3.2-arm64-arm-64bit


### Folder structure and file paths

In [3]:
subject_id = 2
data_root = "./data/raw/"
subject_folder = f"Sub{subject_id}"
eeg_folder = os.path.join(data_root, subject_folder, "EEG")

cnt_file = os.path.join(eeg_folder, "cnt.mat")
mrk_file = os.path.join(eeg_folder, "mrk.mat")

# Check existence
if not os.path.exists(cnt_file):
    raise FileNotFoundError(f"Cannot find {cnt_file}. Please check your dataset path.")
if not os.path.exists(mrk_file):
    raise FileNotFoundError(f"Cannot find {mrk_file}. Please check your dataset path.")

### Loading the mat files

loading data and extracting the following
- EEG data array
- sampling frequency
- channel names
- event information from mrk.mat files

In [6]:
try:
    cnt_mat = sio.loadmat(cnt_file, struct_as_record=False, squeeze_me=True)
    mrk_mat = sio.loadmat(mrk_file, struct_as_record=False, squeeze_me=True)
    print("Loaded CNT file with sio")
except NotImplementedError:
    try:
        cnt_mat = h5py.File(cnt_file, mode="r")
        mrk_mat = h5py.File(mrk_file, mode="r")
        print("Files loaded successfully with h5py")
    except Exception as e:
        print(f"Error loading files: {e}")


# Inspect top-level keys (sometimes the actual data is nested deeper)
print("CNT.mat keys:", cnt_mat.keys())
print("MRK.mat keys:", mrk_mat.keys())

Files loaded successfully with h5py
CNT.mat keys: <KeysViewHDF5 ['#refs#', 'cnt']>
MRK.mat keys: <KeysViewHDF5 ['#refs#', 'mrk']>


In [13]:
print(cnt_mat["cnt"].keys())

<KeysViewHDF5 ['EEG', 'Gender', 'HumanFactor', 'clab', 'fs']>


In [11]:
print(mrk_mat["mrk"].keys())

<KeysViewHDF5 ['className', 'event', 'time', 'y']>


In [14]:
# Extracting data from cnt.mat
eeg_data = np.array(cnt_mat["cnt"]["EEG"][()])
fs = np.array(cnt_mat["cnt"]["fs"][()]).item()  # Convert to a Python scalar
clab = np.array(cnt_mat["cnt"]["clab"][()])

In [15]:
# Decode channel labels if they're stored as bytes
if clab.dtype.kind == 'S':
    clab = [s.decode('utf-8') for s in clab]
else:
    clab = list(clab)

print("EEG data shape:", eeg_data.shape)
print("Sampling frequency:", fs)
print("Channel labels:", clab)

EEG data shape: (3691800, 63)
Sampling frequency: 1000.0
Channel labels: [array([<HDF5 object reference>], dtype=object), array([<HDF5 object reference>], dtype=object), array([<HDF5 object reference>], dtype=object), array([<HDF5 object reference>], dtype=object), array([<HDF5 object reference>], dtype=object), array([<HDF5 object reference>], dtype=object), array([<HDF5 object reference>], dtype=object), array([<HDF5 object reference>], dtype=object), array([<HDF5 object reference>], dtype=object), array([<HDF5 object reference>], dtype=object), array([<HDF5 object reference>], dtype=object), array([<HDF5 object reference>], dtype=object), array([<HDF5 object reference>], dtype=object), array([<HDF5 object reference>], dtype=object), array([<HDF5 object reference>], dtype=object), array([<HDF5 object reference>], dtype=object), array([<HDF5 object reference>], dtype=object), array([<HDF5 object reference>], dtype=object), array([<HDF5 object reference>], dtype=object), array([<HDF5 o

In [19]:
# Extracting marker information from mrk.mat
mrk_time = np.array(mrk_mat["mrk"]["time"][()])
mrk_y = np.array(mrk_mat["mrk"]["y"][()])
# mrk_event = np.array(mrk_mat["mrk"]["event"][()])
# because the above shows an error for the group
event_field = mrk_mat["mrk"]["event"]
if isinstance(event_field, h5py.Dataset):
    mrk_event = np.array(event_field[()])
elif isinstance(event_field, h5py.Group):
    # Inspect the group keys to decide what to extract
    event_keys = list(event_field.keys())
    print("Event group keys:", event_keys)
    # If there's only one key, we assume that's our data.
    if len(event_keys) == 1:
        mrk_event = np.array(event_field[event_keys[0]][()])
    else:
        # Otherwise, choose a key or combine data as needed.
        # For now, we select the first key.
        mrk_event = np.array(event_field[event_keys[0]][()])
else:
    mrk_event = None

Event group keys: ['chan', 'desc', 'length', 'type']


In [20]:
# Extracting classname info

mrk_className = np.array(mrk_mat["mrk"]["className"][()])
if mrk_className.dtype.kind == 'S':
    mrk_className = [s.decode('utf-8') for s in mrk_className]
else:
    mrk_className = list(mrk_className)

print("Marker times shape:", mrk_time.shape)
print("Marker y shape:", mrk_y.shape)
if mrk_event is not None:
    print("Marker event shape:", mrk_event.shape)
print("Marker class names:", mrk_className)

Marker times shape: (160, 1)
Marker y shape: (160, 2)
Marker event shape: (1, 160)
Marker class names: [array([<HDF5 object reference>], dtype=object), array([<HDF5 object reference>], dtype=object)]


In [24]:
# Sampling rate
sfreq = fs # creating a copy of the variable here
print("Sampling frequency:", sfreq)

Sampling frequency: 1000.0


In [26]:
n_channels = eeg_data.shape[1]
channel_names = [f"EEG{i}" for i in range(n_channels)]
print("Channel names:", channel_names)

Channel names: ['EEG0', 'EEG1', 'EEG2', 'EEG3', 'EEG4', 'EEG5', 'EEG6', 'EEG7', 'EEG8', 'EEG9', 'EEG10', 'EEG11', 'EEG12', 'EEG13', 'EEG14', 'EEG15', 'EEG16', 'EEG17', 'EEG18', 'EEG19', 'EEG20', 'EEG21', 'EEG22', 'EEG23', 'EEG24', 'EEG25', 'EEG26', 'EEG27', 'EEG28', 'EEG29', 'EEG30', 'EEG31', 'EEG32', 'EEG33', 'EEG34', 'EEG35', 'EEG36', 'EEG37', 'EEG38', 'EEG39', 'EEG40', 'EEG41', 'EEG42', 'EEG43', 'EEG44', 'EEG45', 'EEG46', 'EEG47', 'EEG48', 'EEG49', 'EEG50', 'EEG51', 'EEG52', 'EEG53', 'EEG54', 'EEG55', 'EEG56', 'EEG57', 'EEG58', 'EEG59', 'EEG60', 'EEG61', 'EEG62']


In [27]:
# MNE expects shape (n_channels, n_times)
print(f"eeg data shape: {eeg_data.shape}")

eeg data shape: (3691800, 63)


In [28]:
# because the data is (n_samples, n_channels) we transpose for mne
eeg_data = eeg_data.T
print("After transpose, EEG data shape:", eeg_data.shape)

After transpose, EEG data shape: (63, 3691800)
