In [12]:
%pip install mne colorlog

Note: you may need to restart the kernel to use updated packages.


In [13]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn import metrics
import os
import re
import numpy as np
from scipy import stats
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import utils
from sklearn.preprocessing import StandardScaler
from sklearn.mixture import GaussianMixture
from mpl_toolkits.mplot3d import Axes3D
from collections import defaultdict
import seaborn as sns

logger = utils.get_logger()

## Feature Importance
The functions below compute and visualise how strongly each feature contributes to the principal components of a given set of feature files. They are reused in each of the analysis paradigms further down to compare feature importances.

In [None]:
def get_pca_attributions(filenames, source_folder="features-4", top_n=20):
    """Rank the features that load most strongly on the first three principal components.

    Every file is read from ``source_folder`` as a CSV with a two-row column
    header. The rows of all files are stacked, standardized with
    ``StandardScaler`` and decomposed with a 3-component PCA.

    Args:
        filenames: Iterable of CSV file names, relative to ``source_folder``.
        source_folder: Directory that contains the feature files.
        top_n: Number of features to keep per component.

    Returns:
        dict mapping the component index (0, 1, 2) to a list of
        ``(column_label, loading)`` tuples, sorted by descending absolute
        loading. The loadings keep their sign.

    Raises:
        ValueError: If ``filenames`` is empty.
    """
    filenames = list(filenames)
    if not filenames:
        raise ValueError("get_pca_attributions needs at least one file")

    logger.info(f"Getting PCA attributions of {len(filenames)} files")

    logger.info(f"Loading {len(filenames)} files")
    dfs = [
        pd.read_csv(os.path.join(source_folder, file), header=[0, 1])
        for file in filenames
    ]
    df = pd.concat(dfs, ignore_index=True)
    feature_names = df.columns

    # Normalization
    scaled = StandardScaler().fit_transform(df)

    # PCA: only the fitted components are needed, not the projected data
    logger.info("Executing PCA")
    pca = PCA(n_components=3)
    pca.fit(scaled)

    # Feature Loadings
    logger.info("Getting loadings")
    top_features = {}
    for i, component in enumerate(pca.components_):
        sorted_loadings = sorted(
            zip(feature_names, component),
            key=lambda pair: abs(pair[1]),
            reverse=True,
        )
        top_features[i] = sorted_loadings[:top_n]

    return top_features

def plot_hierarchical_importances(data):
    """Draw one horizontal bar chart of feature loadings per principal component.

    Args:
        data: Mapping of component key -> list of ``(feature, importance)``
            pairs. One panel is drawn per entry, in iteration order, so the
            figure adapts to however many components are passed.

    Returns:
        None. The figure is shown with ``plt.show()``. Nothing is drawn when
        ``data`` is empty.
    """
    n_panels = len(data)
    if n_panels == 0:
        return

    # 6 inches per panel reproduces the original 18x6 layout for three components.
    # squeeze=False keeps `axes` two-dimensional even for a single panel.
    fig, axes = plt.subplots(1, n_panels, figsize=(6 * n_panels, 6), squeeze=False)

    for i, (ax, features) in enumerate(zip(axes[0], data.values())):
        # Only the first element of each feature key is used as the bar label.
        # NOTE(review): entries that share this first element end up in the same
        # y-category, which seaborn presumably aggregates into one bar - confirm
        # that this is the intended presentation.
        feature_names = [f"{feature[0]}" for feature, _ in features]
        importances = [importance for _, importance in features]

        sns.barplot(x=importances, y=feature_names, ax=ax)
        ax.set_title(f'Principal Component {i+1}')
        ax.set_xlabel('Importance')
        ax.set_ylabel('Feature')

    plt.tight_layout()
    plt.show()

def combine_and_aggregate(data):
    """Sum the absolute importance of every feature over all components.

    Args:
        data: Mapping of component key -> iterable of ``(feature, importance)``
            pairs.

    Returns:
        List of ``(feature, total_absolute_importance)`` tuples ordered from
        the largest total to the smallest. Features with equal totals keep the
        order in which they were first seen.
    """
    totals = {}
    for loadings in data.values():
        for name, weight in loadings:
            totals[name] = totals.get(name, 0.0) + abs(weight)

    ranked = list(totals.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked

def plot_combined_importances(combined_importances):
    """Show a horizontal bar chart of the aggregated feature importances.

    Args:
        combined_importances: Sequence of ``(feature, importance)`` pairs.
            Only the first element of each feature key is used as bar label.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    bar_labels = []
    bar_lengths = []
    for feature, importance in combined_importances:
        bar_labels.append(f"{feature[0]}")
        bar_lengths.append(importance)

    # Plot
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(x=bar_lengths, y=bar_labels, ax=ax)
    ax.set_title('Top Features by Combined Importance')
    ax.set_xlabel('Aggregated Importance')
    ax.set_ylabel('Feature')
    fig.tight_layout()
    plt.show()

def top_feature_importance(filenames, source_folder="features-4", top_n=20):
    """Compute the PCA loadings of the given files and plot them per component and combined.

    Args:
        filenames: Feature file names, relative to ``source_folder``.
        source_folder: Directory that contains the feature files.
        top_n: Number of features kept per principal component.

    Returns:
        None. Two figures are shown.
    """
    per_component = get_pca_attributions(filenames, source_folder, top_n)
    plot_hierarchical_importances(per_component)
    plot_combined_importances(combine_and_aggregate(per_component))

## Clustering

After feature extraction there is a list of files, each containing one entry per epoch. Every file belongs to a paradigm defined by patient, procedure and timing. Clustering is applied to each individual procedure, using the data of both timings (pre & post). These clusters are generated once with all patients pooled and once per individual patient.

The desired outcome is two clusters per plot; one for data before the procedure and one for after the procedure (with the active control procedure being the exception since this should have no impact).

In [None]:
def cluster_df(df, n_clusters=2, plot_title="PCA of Clusters", algorithm='kmeans', random_state=None):
    """Cluster the rows of ``df`` and plot the result next to the ground-truth labels.

    The function removes rows that are outliers in too many feature columns,
    standardizes the remaining rows, fits the chosen clustering model, projects
    the data onto three principal components for a 2-D and a 3-D scatter plot,
    and prints the five strongest loadings of every component.

    Args:
        df: DataFrame holding all rows to cluster plus a ``"label"`` column
            with the ground-truth class of every row. Row labels may repeat;
            rows are selected by position.
        n_clusters: Number of clusters / mixture components.
        plot_title: Title of the panels that show the cluster assignment.
        algorithm: ``'kmeans'`` or ``'gmm'``.
        random_state: Optional seed forwarded to the clustering model and the
            PCA. ``None`` leaves the estimators unseeded.

    Returns:
        None. Results are shown as figures and printed text. When the
        ``"label"`` column is missing, an error is logged and nothing is done.

    Raises:
        ValueError: If ``algorithm`` is not ``'kmeans'`` or ``'gmm'``.
    """
    if "label" not in df.columns:
        logger.error("No 'label' column found in DataFrame")
        return

    # Validate before doing any work, and name the model correctly in plot titles
    algorithm_names = {"kmeans": "KMeans", "gmm": "GMM"}
    if algorithm not in algorithm_names:
        raise ValueError("Unsupported algorithm. Use 'kmeans' or 'gmm'.")

    features = df.drop(columns=["label"])
    feature_names = features.columns

    # Outlier removal: a value is an outlier when it lies more than 1.5 times the
    # 10%-90% quantile range outside that range. A row is dropped when more than
    # OUTLIER_THRESHOLD of its columns are outliers.
    OUTLIER_THRESHOLD = 0.05
    lower_quantile = features.quantile(0.10)
    upper_quantile = features.quantile(0.90)
    quantile_range = upper_quantile - lower_quantile
    lower_fence = lower_quantile - 1.5 * quantile_range
    upper_fence = upper_quantile + 1.5 * quantile_range

    outlier_counts = (
        features.lt(lower_fence, axis="columns")
        | features.gt(upper_fence, axis="columns")
    ).sum(axis=1)
    # Positional boolean mask: dropping by index label would also remove
    # non-outlier rows whenever the index contains repeated labels.
    keep = (outlier_counts <= len(feature_names) * OUTLIER_THRESHOLD).to_numpy()
    features_filtered = features.loc[keep]
    ground_truth = df.loc[keep]["label"]
    print(f"Original DataFrame shape: {features.shape}")
    print(f"Filtered DataFrame shape: {features_filtered.shape}")

    # Normalization
    scaled = StandardScaler().fit_transform(features_filtered)

    # Apply clustering algorithm
    if algorithm == 'kmeans':
        model = KMeans(n_clusters=n_clusters, n_init=10, random_state=random_state)
    else:
        model = GaussianMixture(n_components=n_clusters, n_init=10, random_state=random_state)

    clusters = model.fit_predict(scaled)

    # Apply PCA
    pca = PCA(n_components=3, random_state=random_state)
    reduced = pca.fit_transform(scaled)

    truth_codes = ground_truth.astype("category").cat.codes
    truth_title = f"{algorithm_names[algorithm]} Clustering Results - Ground Truth"

    # 2D Plot
    fig, (ax_clusters, ax_truth) = plt.subplots(1, 2, figsize=(12, 6))
    for ax, colors, title, colorbar_label in (
        (ax_clusters, clusters, plot_title, "Cluster Label"),
        (ax_truth, truth_codes, truth_title, "Ground Truth Label"),
    ):
        points = ax.scatter(reduced[:, 0], reduced[:, 1], c=colors, cmap="viridis")
        ax.set_title(title)
        ax.set_xlabel("Component 1")
        ax.set_ylabel("Component 2")
        fig.colorbar(points, ax=ax, label=colorbar_label)

    plt.show()

    # 3D Plot
    fig = plt.figure(figsize=(12, 6))
    for position, colors, title in (
        (121, clusters, plot_title),
        (122, truth_codes, truth_title),
    ):
        ax = fig.add_subplot(position, projection='3d')
        ax.scatter(reduced[:, 0], reduced[:, 1], reduced[:, 2], c=colors, cmap="viridis")
        ax.set_title(title)
        ax.set_xlabel("Component 1")
        ax.set_ylabel("Component 2")
        ax.set_zlabel("Component 3")

    plt.show()

    # Attributions: five strongest loadings (by absolute value) per component
    for i, component in enumerate(pca.components_):
        sorted_loadings = sorted(
            zip(feature_names, component),
            key=lambda pair: abs(pair[1]),
            reverse=True,
        )
        print(f"Principal Component {i+1}:")
        for feature, loading in sorted_loadings[:5]:
            print(f"{feature}: {loading}")
        print("\n")

In [14]:
# Per-file metadata table; the displayed frame has the columns filename,
# procedure, patient_id, eeg_type and pre_post. The later cells filter on it.
# NOTE(review): presumably built from the files in "features-4" and the
# randomisation list - confirm in utils.get_metadata_df.
labels = utils.get_metadata_df("features-4", "Randomisatielijst.csv")
labels

Unnamed: 0,filename,procedure,patient_id,eeg_type,pre_post
0,TMS-EEG-H_07_S3_rsEEG_pre-epo.csv,itbs,07,rsEEG,pre
1,TMS-EEG-H_06_S2_rsEEG_pre-epo.csv,itbs,06,rsEEG,pre
2,TMS-EEG-H_14_S1_rsEEG_pre-epo.csv,ctbs,14,rsEEG,pre
3,TMS-EEG-H_17_S2_rsEEG_pre-epo.csv,sham,17,rsEEG,pre
4,TMS-EEG-H_16_S3_rsEEG_pre-epo.csv,itbs,16,rsEEG,pre
...,...,...,...,...,...
83,TMS-EEG-H_02_S3_rsEEG_pre-epo.csv,ctbs,02,rsEEG,pre
84,TMS-EEG-H_09_S3_rsEEG_post-epo.csv,sham,09,rsEEG,post
85,TMS-EEG-H_08_s1_rsEEG_pre-epo.csv,sham,08,rsEEG,pre
86,TMS-EEG-H_16_S2_rsEEG_post-epo.csv,sham,16,rsEEG,post


In [None]:
# Start with every resting-state recording pooled together to expose the most general trend
filenames = labels.loc[labels["eeg_type"] == "rsEEG", "filename"]

top_feature_importance(filenames)

## Session variability
By clustering all pre sessions per patient, we can get a feel for the inter-session variability. If this is too high, it means that the distinction in feature values is primarily based upon this difference.

In [None]:
# Cluster the pre-procedure sessions of every patient, labelled by the procedure
# of the session they belong to, to gauge the variability between sessions.
for patientid in range(2, 19):
    selection = labels[
        (labels["eeg_type"] == "rsEEG")
        & (labels["patient_id"] == f"{patientid:02}")
        & (labels["pre_post"] == "pre")
    ]

    if selection.empty:
        continue

    filenames = selection["filename"]
    top_feature_importance(filenames)

    # Load dataframe
    dfs = [
        pd.read_csv(os.path.join("features-4", file), header=[0, 1])
        for file in filenames
    ]
    # ignore_index gives every epoch a unique row label. The per-file indices all
    # start at 0, so without it any label-based row operation downstream would
    # also hit the epochs of the other files.
    df = pd.concat(dfs, ignore_index=True)

    # Add procedure column: every epoch inherits the procedure of its file,
    # taken from the same metadata row as the filename
    df["label"] = [
        procedure_of_file
        for procedure_of_file, df_part in zip(selection["procedure"], dfs)
        for _ in range(len(df_part))
    ]

    cluster_df(df, n_clusters=3, plot_title=f"Procedure clustering patient {patientid}")

## Individual procedures

The code below applies the clustering function to dataframes for each **individual procedure**. First, it's applied to all patients and then to each individual patient. We expect two separate clusters for itbs and ctbs: one before and one after the procedure, marking the impact of the procedure. Sham shouldn't have any impact.

In [None]:
# For every procedure, cluster pre vs. post: first with all patients pooled,
# then for each patient separately.
for procedure in ["itbs", "ctbs", "sham"]:
    logger.info(f"Clustering {procedure}")
    procedure_mask = (labels["procedure"] == procedure) & (labels["eeg_type"] == "rsEEG")

    # None stands for "all patients pooled"; the same pipeline then runs per patient
    for patientid in [None, *range(2, 19)]:
        if patientid is None:
            selection = labels[procedure_mask]
            plot_title = procedure
        else:
            selection = labels[
                procedure_mask & (labels["patient_id"] == f"{patientid:02}")
            ]
            # Include the patient so the per-patient figures can be told apart
            plot_title = f"{procedure} - patient {patientid}"

        if selection.empty:
            continue

        filenames = selection["filename"]
        top_feature_importance(filenames)

        # Load dataframe
        dfs = [
            pd.read_csv(os.path.join("features-4", file), header=[0, 1])
            for file in filenames
        ]
        # ignore_index gives every epoch a unique row label. The per-file indices
        # all start at 0, so without it any label-based row operation downstream
        # would also hit the epochs of the other files.
        df = pd.concat(dfs, ignore_index=True)

        # Add timings column: every epoch inherits pre/post from its file,
        # taken from the same metadata row as the filename
        df["label"] = [
            timing
            for timing, df_part in zip(selection["pre_post"], dfs)
            for _ in range(len(df_part))
        ]

        # Pass algorithm="gmm" to cluster with a Gaussian mixture instead of k-means
        cluster_df(df, plot_title=plot_title)

## Post procedures

The code below applies the clustering function to dataframes after **all procedures** (timing is **post**). First, it's applied to all patients and then to each individual patient. We expect three separate clusters, one for each procedure.


In [None]:
logger.info("Clustering all post procedure data")

# Cluster the post-procedure recordings by procedure: first with all patients
# pooled, then for each patient separately.
post_mask = (labels["eeg_type"] == "rsEEG") & (labels["pre_post"] == "post")

# None stands for "all patients pooled"; the same pipeline then runs per patient
for patientid in [None, *range(2, 19)]:
    if patientid is None:
        selection = labels[post_mask]
        plot_title = "Procedure clustering"
    else:
        selection = labels[post_mask & (labels["patient_id"] == f"{patientid:02}")]
        plot_title = f"Procedure clustering patient {patientid}"

    if selection.empty:
        continue

    filenames = selection["filename"]
    top_feature_importance(filenames)

    # Load dataframe
    dfs = [
        pd.read_csv(os.path.join("features-4", file), header=[0, 1])
        for file in filenames
    ]
    # ignore_index gives every epoch a unique row label. The per-file indices all
    # start at 0, so without it any label-based row operation downstream would
    # also hit the epochs of the other files.
    df = pd.concat(dfs, ignore_index=True)

    # Add procedure column: every epoch inherits the procedure of its file,
    # taken from the same metadata row as the filename
    df["label"] = [
        procedure_of_file
        for procedure_of_file, df_part in zip(selection["procedure"], dfs)
        for _ in range(len(df_part))
    ]

    cluster_df(df, n_clusters=3, plot_title=plot_title)

## Procedure paired t-test

Another test that could be of interest is a paired t-test per procedure: for each feature, it tests whether the procedure changed the patients' average feature value between the pre and post recordings.

In [15]:
import pandas as pd
import os
from scipy.stats import ttest_rel

def process_and_average(pre_filenames, post_filenames, source_folder="features-4"):
    """Standardize pre and post data together and reduce each to its column means.

    The rows of all pre files and all post files are stacked into one frame,
    standardized with a single ``StandardScaler`` fit, split back at the
    pre/post boundary and averaged column-wise.

    Args:
        pre_filenames: Names of the pre-procedure feature files.
        post_filenames: Names of the post-procedure feature files.
        source_folder: Directory that contains the feature files.

    Returns:
        Tuple ``(pre_mean, post_mean)`` of one-row DataFrames holding the mean
        standardized value of every column.
    """
    def read_stacked(names):
        # Read every file with its two-row header and stack the rows
        frames = [
            pd.read_csv(os.path.join(source_folder, name), header=[0, 1])
            for name in names
        ]
        return pd.concat(frames, ignore_index=True)

    pre_rows = read_stacked(pre_filenames)
    post_rows = read_stacked(post_filenames)
    n_pre = len(pre_rows)

    # One scaler over pre and post together
    stacked = pd.concat([pre_rows, post_rows], ignore_index=True)
    scaled = pd.DataFrame(
        StandardScaler().fit_transform(stacked),
        columns=stacked.columns,
    )

    # The first n_pre rows are the pre data, the remainder the post data
    pre_mean = scaled.iloc[:n_pre].mean().to_frame().T
    post_mean = scaled.iloc[n_pre:].mean().to_frame().T

    return pre_mean, post_mean

def perform_t_test(normalize=True):
    """Run a paired t-test (pre vs. post) per procedure and feature column.

    For every procedure the per-patient averages returned by
    ``process_and_average`` are collected, one row per patient that has both
    pre and post files, and ``ttest_rel`` is applied column by column. The
    file selection reads the module-level ``labels`` table.

    Args:
        normalize: Accepted for interface compatibility; the body does not
            read it.

    Returns:
        dict mapping procedure -> column -> ``{"t_stat": ..., "p_value": ...}``.
        Procedures for which no patient has both pre and post files are absent.
    """
    results = {}
    is_resting_state = labels["eeg_type"] == "rsEEG"

    for procedure in ["itbs", "ctbs", "sham"]:
        logger.info(f"Performing t-test for {procedure}")
        in_procedure = is_resting_state & (labels["procedure"] == procedure)
        pre_rows = []
        post_rows = []

        # One averaged row per patient and timing
        for patientid in range(2, 19):
            of_patient = in_procedure & (labels["patient_id"] == f"{patientid:02}")
            pre_filenames = labels[of_patient & (labels["pre_post"] == "pre")]["filename"]
            post_filenames = labels[of_patient & (labels["pre_post"] == "post")]["filename"]

            # A pair needs both recordings
            if len(pre_filenames) == 0 or len(post_filenames) == 0:
                continue

            pre_avg, post_avg = process_and_average(pre_filenames, post_filenames)
            pre_rows.append(pre_avg)
            post_rows.append(post_avg)

        if not (pre_rows and post_rows):
            continue

        pre_table = pd.concat(pre_rows, ignore_index=True)
        post_table = pd.concat(post_rows, ignore_index=True)

        # Independent test per feature column
        per_column = {}
        for column in pre_table.columns:
            t_stat, p_value = ttest_rel(pre_table[column], post_table[column])
            per_column[column] = {"t_stat": t_stat, "p_value": p_value}
        results[procedure] = per_column

    return results

In [16]:
# Run the paired t-tests; the result maps procedure -> feature column ->
# {"t_stat", "p_value"} and is reused by the reporting cells below.
t_test_results = perform_t_test()

[32m[2024-08-18 12:47:46,967] - INFO - Performing t-test for itbs[0m
[32m[2024-08-18 12:49:17,055] - INFO - Performing t-test for ctbs[0m
[32m[2024-08-18 12:50:38,484] - INFO - Performing t-test for sham[0m


Significant features for itbs:
- variance
- std
- skewness
- rms
- hjorth_mobility
- hjorth_complexity
- zero_crossings
- line_length
- app_entropy
- spect_entropy
- hurst_exp
- pow_freq_bands
- phase_lock_val
- time_corr
- spect_corr
Significant features for ctbs:
- variance
- std
- ptp_amp
- skewness
- kurtosis
- rms
- hjorth_mobility
- zero_crossings
- app_entropy
- pow_freq_bands
- phase_lock_val
- time_corr
- spect_corr
Significant features for sham:
- variance
- std
- ptp_amp
- skewness
- rms
- line_length
- hurst_exp
- pow_freq_bands
- phase_lock_val
- time_corr
- spect_corr


In [26]:
def filter_significant_results(t_test_results, significance_level=0.05):
    """Group p-values by feature name and track where each feature's minimum occurs.

    Column keys are indexed as ``key[0]`` (feature name) and ``key[1]``
    (detail reported next to the lowest p-value). No p-value is discarded:
    ``significance_level`` is accepted but never read.

    Args:
        t_test_results: Mapping procedure -> column key -> dict with a
            ``"p_value"`` entry.
        significance_level: Unused.

    Returns:
        Tuple ``(p_values, lowest)`` where ``p_values`` maps procedure ->
        feature name -> list of p-values, and ``lowest`` maps procedure ->
        feature name -> ``(detail, p_value)`` of the smallest p-value (the
        first one wins on ties). Procedures without any column are left out
        of both dicts.
    """
    grouped_by_procedure = {}
    minima_by_procedure = {}

    for procedure, per_column in t_test_results.items():
        p_values_by_feature = {}
        minimum_by_feature = {}

        for column_key, outcome in per_column.items():
            name = column_key[0]
            p_value = outcome["p_value"]

            current_minimum = minimum_by_feature.get(name)
            if current_minimum is None or p_value < current_minimum[1]:
                minimum_by_feature[name] = (column_key[1], p_value)

            # Every p-value is recorded, significant or not
            p_values_by_feature.setdefault(name, []).append(p_value)

        if p_values_by_feature:
            grouped_by_procedure[procedure] = p_values_by_feature
            minima_by_procedure[procedure] = minimum_by_feature

    return grouped_by_procedure, minima_by_procedure

def calculate_average_p_values(significant_results):
    """Average the p-values of every feature within each procedure.

    Args:
        significant_results: Mapping procedure -> feature -> list of p-values.

    Returns:
        Mapping procedure -> feature -> arithmetic mean of the list, or
        ``None`` for a feature whose list is empty.
    """
    return {
        procedure: {
            feature: (sum(p_values) / len(p_values)) if p_values else None
            for feature, p_values in features.items()
        }
        for procedure, features in significant_results.items()
    }

# Group the p-values per feature and remember where each feature's minimum occurred
significant_results, lowest_p_value_features = filter_significant_results(t_test_results)

# Mean p-value per feature
avg_p_values = calculate_average_p_values(significant_results)

# Report the mean and the lowest p-value of every feature, per procedure
for procedure, features in avg_p_values.items():
    print(f"Features for {procedure}:")
    for feature, avg_p_value in features.items():
        if avg_p_value is None:
            # No p-values were recorded for this feature
            continue
        print(f"- {feature}: Average p-value = {avg_p_value:.5f}")
        lowest_feature_detail, lowest_p_value = lowest_p_value_features[procedure][feature]
        print(f"  * Lowest p-value observed at: {lowest_feature_detail} (p-value = {lowest_p_value:.5f})")


Features for itbs:
- variance: Average p-value = 0.58940
  * Lowest p-value observed at: Fp1 (p-value = 0.02290)
- std: Average p-value = 0.58830
  * Lowest p-value observed at: Fp1 (p-value = 0.03359)
- ptp_amp: Average p-value = 0.61729
  * Lowest p-value observed at: Fp1 (p-value = 0.06698)
- skewness: Average p-value = 0.54334
  * Lowest p-value observed at: PO3 (p-value = 0.02991)
- kurtosis: Average p-value = 0.62806
  * Lowest p-value observed at: TP8 (p-value = 0.06036)
- rms: Average p-value = 0.59975
  * Lowest p-value observed at: Fp1 (p-value = 0.03266)
- hjorth_mobility: Average p-value = 0.29200
  * Lowest p-value observed at: P7 (p-value = 0.00116)
- hjorth_complexity: Average p-value = 0.27386
  * Lowest p-value observed at: CP1 (p-value = 0.00154)
- zero_crossings: Average p-value = 0.25533
  * Lowest p-value observed at: TP10 (p-value = 0.00065)
- line_length: Average p-value = 0.38005
  * Lowest p-value observed at: TP10 (p-value = 0.00323)
- app_entropy: Average p-v

In [24]:
def gather_all_p_values(t_test_results):
    """Collect the p-values of every procedure, grouped by feature name.

    The feature name is the first element of each column key.

    Args:
        t_test_results: Mapping procedure -> column key -> dict with a
            ``"p_value"`` entry.

    Returns:
        Mapping procedure -> feature name -> list of p-values in column order.
        A procedure without columns maps to an empty dict.
    """
    collected = {}

    for procedure, per_column in t_test_results.items():
        by_feature = {}
        for column_key, outcome in per_column.items():
            by_feature.setdefault(column_key[0], []).append(outcome["p_value"])
        collected[procedure] = by_feature

    return collected

# Collect the p-values per procedure and feature name, then average them per feature
all_p_values = gather_all_p_values(t_test_results)
avg_p_values = calculate_average_p_values(all_p_values)

# One line per feature with its mean p-value, grouped by procedure
for procedure, features in avg_p_values.items():
    print(f"Average p-values for features in {procedure}:")
    for feature, avg_p_value in features.items():
        print(f"- {feature}: Average p-value = {avg_p_value:.5f}")

Average p-values for features in itbs:
- variance: Average p-value = 0.58940
- std: Average p-value = 0.58830
- ptp_amp: Average p-value = 0.61729
- skewness: Average p-value = 0.54334
- kurtosis: Average p-value = 0.62806
- rms: Average p-value = 0.59975
- hjorth_mobility: Average p-value = 0.29200
- hjorth_complexity: Average p-value = 0.27386
- zero_crossings: Average p-value = 0.25533
- line_length: Average p-value = 0.38005
- app_entropy: Average p-value = 0.23917
- spect_entropy: Average p-value = 0.30797
- hurst_exp: Average p-value = 0.61133
- pow_freq_bands: Average p-value = 0.43633
- phase_lock_val: Average p-value = 0.46207
- time_corr: Average p-value = 0.49236
- spect_corr: Average p-value = 0.47147
Average p-values for features in ctbs:
- variance: Average p-value = 0.47755
- std: Average p-value = 0.48018
- ptp_amp: Average p-value = 0.50656
- skewness: Average p-value = 0.45774
- kurtosis: Average p-value = 0.55133
- rms: Average p-value = 0.49479
- hjorth_mobility: Av

## Conclusion

- The most important and recurring features for distinguishing between the recording sessions are correlation, spectral power, entropy, and the Hjorth parameters.
- Clear clusters are achieved, but also on sham procedures, which shouldn't result in clusters at all. This likely means that the brain state of the patients varies too much between recording sessions.
- Paired t-tests show that the average impact of the procedures on the patients is negligible.