# Feature extraction

In [8]:
import mne
import numpy as np
import matplotlib.pyplot as plt
import os
import re
import pandas as pd
import mne_features

import utils
logger = utils.get_logger()

In [4]:
def load_fif_file(filename):
    eeg_data = mne.read_epochs(filename)
    return eeg_data

In [5]:
rsEEG = load_fif_file("dataset-cleaned/TMS-EEG-H_02_S1b_rsEEG_post-epo.fif")
spTEP = load_fif_file("dataset-cleaned/TMS-EEG-H_02_S1b_spTEP_post-epo.fif")

print(type(rsEEG))

Reading /home/tomasgalle/UGent/thesis/tms-research/dataset-cleaned/TMS-EEG-H_02_S1b_rsEEG_post-epo.fif ...
    Found the data of interest:
        t =       0.00 ...    2000.00 ms
        0 CTF compensation matrices available
Not setting metadata
296 matching events found
No baseline correction applied
0 projection items activated
Reading /home/tomasgalle/UGent/thesis/tms-research/dataset-cleaned/TMS-EEG-H_02_S1b_spTEP_post-epo.fif ...
    Found the data of interest:
        t =       0.00 ...    2000.00 ms
        0 CTF compensation matrices available
Not setting metadata
149 matching events found
No baseline correction applied
0 projection items activated
<class 'mne.epochs.EpochsFIF'>


## Label file generator

Create one file containing the labels for all files, as labels are file related and would just be duplicated in every line of the csv of the corresponding file. This save storage and keeps a structured overview and seperation of features and labels.

In [12]:
def create_labels_csv(directory, metadata_csv, output_csv):
    metadata = pd.read_csv(metadata_csv, index_col=0, header=None)

    labels = {0: 'sham', 1: 'ctbs', 2: 'itbs'}
    data = []

    for filename in os.listdir(directory):
        match = re.match(r'TMS-EEG-H_(\d+)_(\w+)_(rsEEG|spTEP)_(pre|post)-epo.fif', filename)
        if match:
            logger.info(match.groups())
            patient_id, session, eeg_type, pre_post = match.groups()
            # session = int(session)

            # Get the label for the session from the metadata
            label = labels[metadata.loc[f'H{patient_id:02d}'][session]]

            data.append([filename, label, patient_id, eeg_type, pre_post])

    df = pd.DataFrame(data, columns=['filename', 'label', 'patient_id', 'eeg_type', 'pre_post'])
    df.to_csv(output_csv, index=False)
    
create_labels_csv("dataset-cleaned", "Randomisatielijst.csv", "labels.csv")

[32m[2024-06-11 15:23:44,683] - INFO - ('02', 'S1b', 'rsEEG', 'post')[0m


ValueError: invalid literal for int() with base 10: 'S1b'

## Using `mne_features` library

In [None]:
# Open fif file
def load_fif_file(filename):
    eeg_data = mne.read_epochs(filename)
    return eeg_data

# Extract features from epochs file
def get_features(epochs):
    """Extract features from epochs, returns a dictionary mapping feature names to values."""
    return {"mean": 2}

# Save features to csv file
def save_features(features, filename):
    """Save features to a csv file from a dataframe."""
    pd.DataFrame.to_csv()
    pass

In [7]:
# mean is useless because of the rereferencing to the average
selected_funcs = [
    # "variance",
    # "std",
    "ptp_amp",
    # "skewness",
    # "kurtosis",
    # "rms",
    # "hurst_exp",
    "pow_freq_bands",
    "zero_crossings",
    "line_length",
    # "app_entropy",
    "hjorth_mobility",
    "hjorth_complexity",
    "wavelet_coef_energy",
    "spect_slope",
    "spect_entropy",
    ]

logger.info(f"Extracting features from rsEEG data: {selected_funcs}")
rsEEG_feat = mne_features.feature_extraction.extract_features(rsEEG.get_data(copy=True), sfreq=rsEEG.info["sfreq"], selected_funcs=selected_funcs, n_jobs=2)

print(rsEEG_feat.shape)

# logger.info("Reshaping rsEEG features")
# num_epochs, num_features_times_channels = rsEEG_feat.shape
# num_channels = len(rsEEG.ch_names)
# num_features = len(selected_funcs)
# rsEEG_feat_reshaped = rsEEG_feat.reshape(num_epochs, num_features, num_channels)
# feature_dict = {} # Each entry has shape (num_epochs, num_channels)
# for i, feature in enumerate(selected_funcs):
#     feature_dict[feature] = rsEEG_feat_reshaped[:, i, :]

[32m[2024-06-11 14:36:03,050] - INFO - Extracting features from rsEEG data: ['ptp_amp', 'pow_freq_bands', 'zero_crossings', 'line_length', 'hjorth_mobility', 'hjorth_complexity', 'wavelet_coef_energy', 'spect_slope', 'spect_entropy'][0m


(296, 1302)


In [33]:
print(feature_dict)

{'mean': array([[-1.33257980e-05, -1.22196947e-05, -1.12201125e-05, ...,
         2.28906292e-05,  2.83082296e-05,  2.46540160e-05],
       [ 4.68221682e-07, -2.98352692e-07, -1.75293345e-06, ...,
         1.84813250e-05,  1.99884481e-05,  1.75306126e-05],
       [ 1.12163156e-05,  1.92017116e-05,  1.28856847e-05, ...,
        -4.43649305e-05, -4.15336375e-05, -4.88646255e-05],
       ...,
       [ 4.10698061e-07, -4.92459367e-06,  3.87440399e-07, ...,
         1.18961250e-05,  1.19634643e-05,  2.07024968e-05],
       [ 1.91989304e-05,  2.17594706e-05,  1.99962726e-05, ...,
        -6.20678959e-05, -7.19881544e-05, -6.53507950e-05],
       [-1.06956035e-05, -1.02976913e-05, -7.42197672e-06, ...,
         1.22044028e-05,  4.75971276e-06,  3.95664747e-06]]), 'variance': array([[2.20752919e-10, 1.74038338e-10, 1.69562604e-10, ...,
        9.47842457e-10, 1.25527175e-09, 8.97152983e-10],
       [1.91725061e-10, 1.69678426e-10, 1.67965545e-10, ...,
        1.33976689e-09, 1.82185938e-09, 1.

In [34]:
from sklearn.cluster import KMeans

# Flatten the feature matrices for each feature into 1D arrays
flattened_features = {feature: matrix.reshape(-1) for feature, matrix in feature_dict.items()}

# Stack the flattened feature arrays horizontally
X = np.column_stack(list(flattened_features.values()))

# Apply K-means clustering
kmeans = KMeans(n_clusters=3, random_state=0).fit(X)

# The labels_ attribute contains the labels of each point
labels = kmeans.labels_

In [35]:
labels

array([0, 0, 0, ..., 0, 1, 0], dtype=int32)

In [None]:
import matplotlib.pyplot as plt
 
#filter rows of original data
filtered_label0 = df[label == 0]
 
#plotting the results
plt.scatter(filtered_label0[:,0] , filtered_label0[:,1])
plt.show()

# Scrap
## rsEEG

## Statistical features
- Mean
- STD
- Peak amplitude

In [None]:
def mean_dict(eeg_data):
    data = eeg_data.get_data()
    ch_names = eeg_data.info['ch_names']
    avg_values = np.mean(data, axis=1)
    electrode_avg_dict = dict(zip(ch_names, avg_values))
    return electrode_avg_dict

def total_mean(mean_dict, electrodes=None):
    if electrodes is None:
        return np.mean(list(mean_dict.values()))
    else:
        return np.mean([mean_dict[electrode] for electrode in electrodes])
    
def std_dict(eeg_data):
    data = eeg_data.get_data()
    ch_names = eeg_data.info['ch_names']
    std_values = np.std(data, axis=1)
    electrode_std_dict = dict(zip(ch_names, std_values))
    return electrode_std_dict

def total_std(std_dict, electrodes=None):
    if electrodes is None:
        return np.mean(list(std_dict.values()))
    else:
        return np.mean([std_dict[electrode] for electrode in electrodes])
    
def peak_dict(eeg_data):
    data = eeg_data.get_data()
    ch_names = eeg_data.info['ch_names']
    peak_values = np.max(data, axis=1)
    electrode_peak_dict = dict(zip(ch_names, peak_values))
    return electrode_peak_dict

In [None]:
print("Mean dict: ", mean_dict(rsEEG))
print("Total mean: ", total_mean(mean_dict(rsEEG)))
print("STD dict: ", std_dict(rsEEG))
print("Total STD: ", total_std(std_dict(rsEEG)))

## Power spectrum

In [None]:
rsEEG.compute_psd(fmin=0.5, fmax=100, n_fft=2048, n_overlap=1024, verbose=True).plot()