# Feature extraction

In [6]:
%pip install mne mne-features colorlog

Collecting colorlog
  Downloading colorlog-6.8.2-py3-none-any.whl.metadata (10 kB)
Downloading colorlog-6.8.2-py3-none-any.whl (11 kB)
Installing collected packages: colorlog
Successfully installed colorlog-6.8.2
Note: you may need to restart the kernel to use updated packages.


In [7]:
import mne
import numpy as np
import matplotlib.pyplot as plt
import os
import re
import pandas as pd
import mne_features

import utils
logger = utils.get_logger()

In [31]:
def load_fif_file(filename):
    """Read an epochs ``.fif`` file and return what ``mne.read_epochs`` produces.

    Parameters
    ----------
    filename : str
        Path to the epochs file on disk.
    """
    return mne.read_epochs(filename)

In [47]:
# Load one resting-state and one single-pulse TEP epochs file for interactive inspection.
rsEEG, spTEP = map(
    load_fif_file,
    (
        "dataset-cleaned/TMS-EEG-H_02_s1b_rsEEG_pre-epo.fif",
        "dataset-cleaned/TMS-EEG-H_02_S1b_spTEP_post-epo.fif",
    ),
)

# Show which MNE class the loader returned.
print(type(rsEEG))

Reading /home/tomasgalle/UGent/thesis/tms-research/dataset-cleaned/TMS-EEG-H_02_s1b_rsEEG_pre-epo.fif ...
    Found the data of interest:
        t =       0.00 ...    2000.00 ms
        0 CTF compensation matrices available
Not setting metadata
317 matching events found
No baseline correction applied
0 projection items activated
Reading /home/tomasgalle/UGent/thesis/tms-research/dataset-cleaned/TMS-EEG-H_02_S1b_spTEP_post-epo.fif ...
    Found the data of interest:
        t =       0.00 ...    2000.00 ms
        0 CTF compensation matrices available
Not setting metadata
149 matching events found
No baseline correction applied
0 projection items activated
<class 'mne.epochs.EpochsFIF'>


## Label file generator

Create a single file containing the labels for all recordings. Labels are tied to a file, so storing them in the feature CSVs would only duplicate the same values on every row. Keeping them separate saves storage and maintains a clear, structured separation between features and labels.

In [3]:
def create_labels_csv(directory, metadata_csv, output_csv):
    """Write one CSV row of labels for every epochs file found in ``directory``.

    File names must look like
    ``TMS-EEG-H_<patient>_<S|s><session>[b]_<rsEEG|spTEP>_<pre|post>-epo.fif``;
    anything else in the directory is ignored.

    Parameters
    ----------
    directory : str
        Folder that is scanned for epochs files.
    metadata_csv : str
        Header-less CSV whose first column is the row key (``H<patient>``) and
        whose remaining columns are addressed by session number. Each cell
        holds a procedure code (0 = sham, 1 = ctbs, 2 = itbs).
    output_csv : str
        Destination of the ``;``-separated labels file.

    Raises
    ------
    KeyError
        If a patient/session is missing from the metadata or the procedure
        code is not one of the known codes.
    """
    metadata = pd.read_csv(metadata_csv, index_col=0, header=None)

    labels = {0: 'sham', 1: 'ctbs', 2: 'itbs'}

    # The session marker may be upper or lower case and the session number may
    # carry a trailing "b". The session is captured as digits only, so the
    # optional "b" is matched separately instead of being stripped afterwards,
    # and the dot of the extension is escaped.
    pattern = re.compile(r'TMS-EEG-H_(\d+)_[Ss](\d+)b?_(rsEEG|spTEP)_(pre|post)-epo\.fif')
    data = []

    # Sorted so the row order of the output does not depend on the file system.
    for filename in sorted(os.listdir(directory)):
        # fullmatch rejects names with trailing text such as "...-epo.fif.csv".
        match = pattern.fullmatch(filename)
        if match is None:
            continue

        patient_id, session, eeg_type, pre_post = match.groups()

        # Get the procedure for the session from the metadata
        procedure = labels[metadata.loc[f'H{patient_id}', int(session)]]

        data.append([filename, procedure, patient_id, eeg_type, pre_post])
        logger.info(f'Added entry to labels.csv: {filename}, {procedure}, {patient_id}, {eeg_type}, {pre_post}')

    df = pd.DataFrame(data, columns=['filename', 'procedure', 'patient_id', 'eeg_type', 'pre_post'])
    df.to_csv(output_csv, index=False, sep=";")
    
# Generate labels.csv for the files in dataset-cleaned, using Randomisatielijst.csv as the metadata table.
create_labels_csv("dataset-cleaned", "Randomisatielijst.csv", "labels.csv")

[32m[2024-06-11 17:35:26,194] - INFO - Added entry to labels.csv: TMS-EEG-H_02_S1b_rsEEG_post-epo.fif, itbs, 02, rsEEG, post[0m
[32m[2024-06-11 17:35:26,195] - INFO - Added entry to labels.csv: TMS-EEG-H_02_s1b_rsEEG_pre-epo.fif, itbs, 02, rsEEG, pre[0m
[32m[2024-06-11 17:35:26,196] - INFO - Added entry to labels.csv: TMS-EEG-H_02_S1b_spTEP_pre-epo.fif, itbs, 02, spTEP, pre[0m
[32m[2024-06-11 17:35:26,196] - INFO - Added entry to labels.csv: TMS-EEG-H_02_S1b_spTEP_post-epo.fif, itbs, 02, spTEP, post[0m


## Using `mne_features` library

In [None]:
# Open fif file
# NOTE(review): this redefines load_fif_file with the same behaviour as the
# definition earlier in the notebook; one of the two can be removed.
def load_fif_file(filename):
    """Load the epochs stored at ``filename`` via ``mne.read_epochs`` and return them."""
    return mne.read_epochs(filename)

# Extract features from epochs file
def get_features(epochs):
    """Return a dictionary mapping feature names to values.

    Placeholder: ``epochs`` is currently ignored and a fixed dictionary is
    returned.
    """
    placeholder = dict(mean=2)
    return placeholder

# Save features to csv file
def save_features(features, filename):
    """Save features to a csv file.

    Parameters
    ----------
    features : pandas.DataFrame or mapping
        A dataframe is written as is. Anything else is first converted with
        ``pd.DataFrame``; a mapping of feature name to a single scalar value
        becomes a one-row table.
    filename : str
        Path of the CSV file to write. The row index is not written.
    """
    if isinstance(features, pd.DataFrame):
        frame = features
    else:
        try:
            frame = pd.DataFrame(features)
        except ValueError:
            # pandas refuses a mapping of bare scalars without an index;
            # treat it as a single record instead.
            frame = pd.DataFrame([features])
    frame.to_csv(filename, index=False)

In [51]:
# List of features to extract: names of mne_features feature functions.
# Only the uncommented entries are computed; the others are kept as candidates.
# - mean is left out: considered useless because of the rereferencing to the average
# - app_entropy takes long to calculate
# - wavelet_coef_energy yields 6 values per channel, whereas epochs_to_feature_csv
#   reshapes the result assuming one value per channel for every feature
selected_funcs = [
    "variance",
    "std",
    # "ptp_amp",
    # "skewness",
    # "kurtosis",
    # "rms",
    # "hjorth_mobility",
    # "hjorth_complexity",
    # "zero_crossings",
    # "line_length",
    # "app_entropy",
    # "hurst_exp",
    # ==============
    # NOTE(review): the entries below look like the frequency/wavelet-domain group - confirm
    # "pow_freq_bands",
    # "wavelet_coef_energy",
    # "spect_slope",
    # "spect_entropy",
    ]

def epochs_to_feature_csv(epochs, selected_funcs, output_csv, n_jobs=2, normalize=True):
    """Extract per-channel features from ``epochs`` and write them to a CSV file.

    The CSV has one row per epoch and a two-level column header
    (feature name, channel position). Only features that produce exactly one
    value per channel are supported.

    Parameters
    ----------
    epochs : object
        Epochs object providing ``get_data(copy=True)``, ``info["sfreq"]`` and
        ``ch_names``.
    selected_funcs : list of str
        Names of the ``mne_features`` feature functions to compute.
    output_csv : str
        Path of the CSV file to write.
    n_jobs : int, default 2
        Number of parallel jobs handed to ``extract_features``.
    normalize : bool, default True
        When true, min-max scale every column before saving.
        NOTE(review): the scaler is fitted on this file only, so scaled values
        are not comparable across files - confirm this is intended.

    Raises
    ------
    ValueError
        If the extracted matrix does not have exactly one column per
        (feature, channel) pair.
    """
    # Extract features
    logger.info(f"Extracting features: {selected_funcs}")
    features = mne_features.feature_extraction.extract_features(
        epochs.get_data(copy=True),
        sfreq=epochs.info["sfreq"],
        selected_funcs=selected_funcs,
        n_jobs=n_jobs,
    )  # shape (num_epochs, num_features * num_channels)

    logger.info(f"Feature shape: {features.shape}")

    # Fit features to dictionary
    logger.info("Reshaping features to dictionary")
    num_epochs, num_columns = features.shape
    num_channels = len(epochs.ch_names)
    num_features = len(selected_funcs)
    if num_columns != num_features * num_channels:
        # Features that emit several values per channel break the layout the
        # reshape below relies on; fail with an explanation rather than with
        # an opaque reshape error or silently mislabelled columns.
        raise ValueError(
            f"Expected {num_features} features x {num_channels} channels = "
            f"{num_features * num_channels} columns but got {num_columns}; "
            "only features with one value per channel are supported."
        )
    features_by_func = features.reshape(num_epochs, num_features, num_channels)
    logger.info(f"Reshaped feature shape: {features_by_func.shape}")
    # Each entry has shape (num_epochs, num_channels)
    feature_dict = {
        feature: features_by_func[:, position, :]
        for position, feature in enumerate(selected_funcs)
    }

    # Save features to dataframe
    logger.info("Transforming dictionary to dataframe")
    df = pd.concat(
        [pd.DataFrame(values) for values in feature_dict.values()],
        axis=1,
        keys=feature_dict.keys(),
    )

    if normalize:
        # Imported locally so this function does not depend on another cell.
        from sklearn.preprocessing import MinMaxScaler

        # Normalize features
        logger.info("Normalizing features")
        scaler = MinMaxScaler()
        for feature in df.columns.levels[0]:
            df[feature] = scaler.fit_transform(df[feature])

    # Save features to csv
    logger.info("Saving features to csv")
    df.to_csv(output_csv, index=False)

def feat_extr_on_folder(source_folder, destination_folder, funcs=None):
    """Run feature extraction on every ``.fif`` file in ``source_folder``.

    For each input file ``<name>`` a CSV called ``<name>.csv`` is written to
    ``destination_folder``, which is created when it does not exist yet.

    Parameters
    ----------
    source_folder : str
        Folder containing the epochs files.
    destination_folder : str
        Folder that receives the feature CSV files.
    funcs : list of str, optional
        Feature function names to extract. Defaults to the notebook-level
        ``selected_funcs`` list.
    """
    if funcs is None:
        funcs = selected_funcs

    # Make sure the folder for csv files exists
    os.makedirs(destination_folder, exist_ok=True)

    # Sorted so the processing order is the same on every run.
    for filename in sorted(os.listdir(source_folder)):
        if not filename.endswith(".fif"):
            continue
        eeg_data = load_fif_file(os.path.join(source_folder, filename))
        epochs_to_feature_csv(eeg_data, funcs, os.path.join(destination_folder, f"{filename}.csv"))
            
# Extract features for every .fif file in dataset-cleaned and write the CSV files to the features folder.
feat_extr_on_folder("dataset-cleaned", "features")

Reading /home/tomasgalle/UGent/thesis/tms-research/dataset-cleaned/TMS-EEG-H_02_s1b_rsEEG_pre-epo.fif ...
    Found the data of interest:
        t =       0.00 ...    2000.00 ms
        0 CTF compensation matrices available
Not setting metadata
317 matching events found
No baseline correction applied
0 projection items activated
Reading /home/tomasgalle/UGent/thesis/tms-research/dataset-cleaned/TMS-EEG-H_02_S1b_rsEEG_post-epo.fif ...
    Found the data of interest:
        t =       0.00 ...    2000.00 ms
        0 CTF compensation matrices available
Not setting metadata
296 matching events found
No baseline correction applied
0 projection items activated


[32m[2024-06-12 17:18:19,644] - INFO - Extracting features: ['variance', 'std'][0m
[32m[2024-06-12 17:18:20,492] - INFO - Feature shape: (296, 124)[0m
[32m[2024-06-12 17:18:20,493] - INFO - Reshaping features to dictionary[0m
[32m[2024-06-12 17:18:20,494] - INFO - Reshaped feature shape: (296, 2, 62)[0m
[32m[2024-06-12 17:18:20,494] - INFO - Transforming dictionary to dataframe[0m
[32m[2024-06-12 17:18:20,495] - INFO - Normalizing features[0m
[32m[2024-06-12 17:18:20,500] - INFO - Saving features to csv[0m


Reading /home/tomasgalle/UGent/thesis/tms-research/dataset-cleaned/TMS-EEG-H_02_s1b_rsEEG_pre-epo.fif ...
    Found the data of interest:
        t =       0.00 ...    2000.00 ms
        0 CTF compensation matrices available
Not setting metadata
317 matching events found
No baseline correction applied
0 projection items activated


[32m[2024-06-12 17:18:21,830] - INFO - Extracting features: ['variance', 'std'][0m
[32m[2024-06-12 17:18:22,632] - INFO - Feature shape: (317, 124)[0m
[32m[2024-06-12 17:18:22,633] - INFO - Reshaping features to dictionary[0m
[32m[2024-06-12 17:18:22,633] - INFO - Reshaped feature shape: (317, 2, 62)[0m
[32m[2024-06-12 17:18:22,634] - INFO - Transforming dictionary to dataframe[0m
[32m[2024-06-12 17:18:22,635] - INFO - Normalizing features[0m
[32m[2024-06-12 17:18:22,641] - INFO - Saving features to csv[0m


Reading /home/tomasgalle/UGent/thesis/tms-research/dataset-cleaned/TMS-EEG-H_02_S1b_spTEP_pre-epo.fif ...
    Found the data of interest:
        t =       0.00 ...    2000.00 ms
        0 CTF compensation matrices available
Not setting metadata
150 matching events found
No baseline correction applied
0 projection items activated


[32m[2024-06-12 17:18:22,899] - INFO - Extracting features: ['variance', 'std'][0m
[32m[2024-06-12 17:18:23,144] - INFO - Feature shape: (150, 124)[0m
[32m[2024-06-12 17:18:23,144] - INFO - Reshaping features to dictionary[0m
[32m[2024-06-12 17:18:23,145] - INFO - Reshaped feature shape: (150, 2, 62)[0m
[32m[2024-06-12 17:18:23,145] - INFO - Transforming dictionary to dataframe[0m
[32m[2024-06-12 17:18:23,146] - INFO - Normalizing features[0m
[32m[2024-06-12 17:18:23,150] - INFO - Saving features to csv[0m


Reading /home/tomasgalle/UGent/thesis/tms-research/dataset-cleaned/TMS-EEG-H_02_S1b_spTEP_post-epo.fif ...
    Found the data of interest:
        t =       0.00 ...    2000.00 ms
        0 CTF compensation matrices available
Not setting metadata
149 matching events found
No baseline correction applied
0 projection items activated


[32m[2024-06-12 17:18:23,366] - INFO - Extracting features: ['variance', 'std'][0m
[32m[2024-06-12 17:18:23,657] - INFO - Feature shape: (149, 124)[0m
[32m[2024-06-12 17:18:23,658] - INFO - Reshaping features to dictionary[0m
[32m[2024-06-12 17:18:23,658] - INFO - Reshaped feature shape: (149, 2, 62)[0m
[32m[2024-06-12 17:18:23,659] - INFO - Transforming dictionary to dataframe[0m
[32m[2024-06-12 17:18:23,660] - INFO - Normalizing features[0m
[32m[2024-06-12 17:18:23,665] - INFO - Saving features to csv[0m


# Scrap
## rsEEG

## Statistical features
- Mean
- STD
- Peak amplitude

In [None]:
def mean_dict(eeg_data):
    """Return a dictionary mapping each channel name to its mean value.

    ``eeg_data`` must provide ``get_data()`` and ``info['ch_names']``.
    The channel axis is taken to be the second-to-last axis of the data, so
    both 2-D ``(channels, times)`` and 3-D ``(epochs, channels, times)``
    arrays are supported; all other axes are averaged out.
    """
    data = eeg_data.get_data()
    ch_names = eeg_data.info['ch_names']
    # Reducing over axis 1 alone would collapse the channel axis for 3-D epochs
    # data, so reduce over every axis except the channel axis instead.
    channel_axis = data.ndim - 2
    reduce_axes = tuple(axis for axis in range(data.ndim) if axis != channel_axis)
    avg_values = np.mean(data, axis=reduce_axes)
    return dict(zip(ch_names, avg_values))

def total_mean(mean_dict, electrodes=None):
    """Average the per-electrode values in ``mean_dict``.

    With ``electrodes`` left as ``None`` every entry is used; otherwise only
    the listed electrode names are averaged (a missing name raises
    ``KeyError``).
    """
    if electrodes is not None:
        chosen = [mean_dict[name] for name in electrodes]
    else:
        chosen = list(mean_dict.values())
    return np.mean(chosen)
    
def std_dict(eeg_data):
    """Return a dictionary mapping each channel name to its standard deviation.

    ``eeg_data`` must provide ``get_data()`` and ``info['ch_names']``.
    The channel axis is taken to be the second-to-last axis of the data, so
    both 2-D ``(channels, times)`` and 3-D ``(epochs, channels, times)``
    arrays are supported; the deviation is computed over all other axes
    pooled together.
    """
    data = eeg_data.get_data()
    ch_names = eeg_data.info['ch_names']
    # Reducing over axis 1 alone would collapse the channel axis for 3-D epochs
    # data, so reduce over every axis except the channel axis instead.
    channel_axis = data.ndim - 2
    reduce_axes = tuple(axis for axis in range(data.ndim) if axis != channel_axis)
    std_values = np.std(data, axis=reduce_axes)
    return dict(zip(ch_names, std_values))

def total_std(std_dict, electrodes=None):
    """Return the mean of the per-electrode values in ``std_dict``.

    Note that this is the average of the individual values, not a pooled
    standard deviation. With ``electrodes`` left as ``None`` every entry is
    used; otherwise only the listed electrode names are averaged.
    """
    if electrodes is not None:
        chosen = [std_dict[name] for name in electrodes]
    else:
        chosen = list(std_dict.values())
    return np.mean(chosen)
    
def peak_dict(eeg_data):
    """Return a dictionary mapping each channel name to its maximum value.

    ``eeg_data`` must provide ``get_data()`` and ``info['ch_names']``.
    The channel axis is taken to be the second-to-last axis of the data, so
    both 2-D ``(channels, times)`` and 3-D ``(epochs, channels, times)``
    arrays are supported. The peak is the signed maximum, not the largest
    absolute value.
    """
    data = eeg_data.get_data()
    ch_names = eeg_data.info['ch_names']
    # Reducing over axis 1 alone would collapse the channel axis for 3-D epochs
    # data, so reduce over every axis except the channel axis instead.
    channel_axis = data.ndim - 2
    reduce_axes = tuple(axis for axis in range(data.ndim) if axis != channel_axis)
    peak_values = np.max(data, axis=reduce_axes)
    return dict(zip(ch_names, peak_values))

In [None]:
# Compute each per-channel summary once and reuse it for the aggregate.
rs_channel_means = mean_dict(rsEEG)
rs_channel_stds = std_dict(rsEEG)

print("Mean dict: ", rs_channel_means)
print("Total mean: ", total_mean(rs_channel_means))
print("STD dict: ", rs_channel_stds)
print("Total STD: ", total_std(rs_channel_stds))

## Power spectrum

In [None]:
# n_fft and n_overlap are Welch parameters. For Epochs, compute_psd defaults to
# the multitaper method, which does not accept them, so Welch is selected explicitly.
# NOTE(review): the epochs span 0-2000 ms; confirm against the sampling rate
# that n_fft=2048 does not exceed the number of samples per epoch.
rsEEG.compute_psd(method="welch", fmin=0.5, fmax=100, n_fft=2048, n_overlap=1024, verbose=True).plot();