In [None]:
# Install third-party packages into the kernel's environment (versions are not pinned).
# NOTE(review): neither package is imported directly in this notebook; presumably `utils` needs them — confirm.
%pip install mne colorlog

In [None]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn import metrics
import os
import re
import numpy as np
from scipy import stats
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import utils
from sklearn.preprocessing import StandardScaler
from sklearn.mixture import GaussianMixture
from mpl_toolkits.mplot3d import Axes3D

# Shared logger used by the cells below (created by the `utils` module, presumably project-local).
logger = utils.get_logger()

# Clustering

After feature extraction, there is a list of files, each containing one entry per epoch. Each file belongs to a paradigm defined by patient, procedure, and timing. Clustering is applied to each individual procedure, using all timings (pre & post). The clusters are generated once with all patients pooled and once per individual patient.

The desired outcome is two clusters per plot: one for the data before the procedure and one for the data after it. The active control procedure (sham) is the exception, since it should have no impact.


In [None]:
def cluster_df(df, n_clusters=2, plot_title="PCA of Clusters", algorithm='kmeans', random_state=42):
    """Cluster the rows of ``df`` and plot the clusters next to the ground truth.

    Steps: drop outlier rows, standardise the features, fit the clustering
    model, project the data onto three principal components, draw a 2D and a
    3D figure (predicted clusters on the left, ground-truth labels on the
    right) and print the five largest-magnitude loadings of each component.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per sample. Must contain a ``"label"`` column holding the
        ground truth; every other column is used as a feature.
    n_clusters : int, default 2
        Number of clusters (KMeans) or mixture components (GMM).
    plot_title : str, default "PCA of Clusters"
        Title of the predicted-cluster panels.
    algorithm : {'kmeans', 'gmm'}, default 'kmeans'
        Clustering model to fit.
    random_state : int or None, default 42
        Seed passed to the clustering model and to PCA so that repeated runs
        give the same result. Pass ``None`` for unseeded behaviour.

    Returns
    -------
    None
        Results are only plotted and printed. The function also returns
        ``None`` (after logging an error) when ``df`` has no ``"label"`` column.

    Raises
    ------
    ValueError
        If ``algorithm`` is neither ``'kmeans'`` nor ``'gmm'``.
    """
    if "label" not in df.columns:
        logger.error("No label column found in DataFrame")
        return

    features = df.drop(columns=["label"])
    feature_names = features.columns

    # Outlier removal.
    # A value is outlying when it lies more than 1.5 x (P90 - P10) outside the
    # [P10, P90] range of its column; a row is removed when more than
    # OUTLIER_THRESHOLD of its features are outlying.
    OUTLIER_THRESHOLD = 0.05
    lower_quantile = features.quantile(0.10)
    upper_quantile = features.quantile(0.90)
    spread = upper_quantile - lower_quantile
    lower_fence = lower_quantile - 1.5 * spread
    upper_fence = upper_quantile + 1.5 * spread

    outlier_counts = (
        features.lt(lower_fence, axis="columns")
        | features.gt(upper_fence, axis="columns")
    ).sum(axis=1)
    max_outlying = len(features.columns) * OUTLIER_THRESHOLD

    # Select rows with a positional boolean mask rather than by index label:
    # frames built with pd.concat may repeat index labels, and dropping by
    # label would then remove every row sharing a label with an outlier.
    keep = (outlier_counts <= max_outlying).to_numpy()
    features_kept = features.loc[keep]
    ground_truth = df.loc[keep]
    print(f"Original DataFrame shape: {features.shape}")
    print(f"Filtered DataFrame shape: {features_kept.shape}")

    # Integer colour code per ground-truth label, shared by the 2D and 3D plots.
    truth_codes = ground_truth["label"].astype("category").cat.codes

    # Normalization
    scaled = StandardScaler().fit_transform(features_kept)

    # Apply clustering algorithm
    if algorithm == 'kmeans':
        model = KMeans(n_clusters=n_clusters, n_init=10, random_state=random_state)
        algorithm_name = "KMeans"
    elif algorithm == 'gmm':
        model = GaussianMixture(n_components=n_clusters, n_init=10, random_state=random_state)
        algorithm_name = "GMM"
    else:
        raise ValueError("Unsupported algorithm. Use 'kmeans' or 'gmm'.")

    clusters = model.fit_predict(scaled)
    truth_title = f"{algorithm_name} Clustering Results - Ground Truth"

    # Apply PCA (used for visualisation and loadings only; clustering ran on all features)
    pca = PCA(n_components=3, random_state=random_state)
    projected = pca.fit_transform(scaled)

    # 2D Plot
    fig, (ax_pred, ax_truth) = plt.subplots(1, 2, figsize=(12, 6))
    pred_points = ax_pred.scatter(projected[:, 0], projected[:, 1], c=clusters, cmap="viridis")
    ax_pred.set_title(plot_title)
    ax_pred.set_xlabel("Component 1")
    ax_pred.set_ylabel("Component 2")
    fig.colorbar(pred_points, ax=ax_pred, label="Cluster Label")

    truth_points = ax_truth.scatter(projected[:, 0], projected[:, 1], c=truth_codes, cmap="viridis")
    ax_truth.set_title(truth_title)
    ax_truth.set_xlabel("Component 1")
    ax_truth.set_ylabel("Component 2")
    fig.colorbar(truth_points, ax=ax_truth, label="Ground Truth Label")

    plt.show()

    # 3D Plot
    fig = plt.figure(figsize=(12, 6))
    for position, colours, title in ((121, clusters, plot_title), (122, truth_codes, truth_title)):
        ax = fig.add_subplot(position, projection='3d')
        ax.scatter(projected[:, 0], projected[:, 1], projected[:, 2], c=colours, cmap="viridis")
        ax.set_title(title)
        ax.set_xlabel("Component 1")
        ax.set_ylabel("Component 2")
        ax.set_zlabel("Component 3")

    plt.show()

    # Attributions: the five features with the largest absolute loading per component
    for i, component in enumerate(pca.components_):
        sorted_loadings = sorted(
            zip(feature_names, component), key=lambda pair: abs(pair[1]), reverse=True
        )
        print(f"Principal Component {i+1}:")
        for feature, loading in sorted_loadings[:5]:
            print(f"{feature}: {loading}")
        print("\n")

In [None]:
# Metadata table describing the feature files. The cells below read its
# "filename", "procedure", "eeg_type", "pre_post" and "patient_id" columns.
# NOTE(review): how `utils.get_metadata_df` combines the feature directory and
# the randomisation list is not visible here — confirm in `utils`.
labels = utils.get_metadata_df(
    "features-4",
    "Randomisatielijst.csv",
)

In [None]:
# Show every feature filename, one per line. Printing is deliberate: a bare
# string as the cell's last expression is displayed as a repr with escaped "\n".
print(labels["filename"].to_string())

## Individual procedures

The code below applies the clustering function to the dataframes of each **individual procedure**. It is first applied to all patients together and then to each individual patient. We expect two separate clusters for `itbs` and `ctbs`: one before and one after the procedure, marking the impact of the procedure. Sham shouldn't have any impact.

In [None]:
# For every procedure, cluster the pre/post rsEEG epochs: first with all
# patients pooled, then once per patient. `cluster_df` also accepts
# algorithm="gmm" to use a Gaussian mixture instead of KMeans.
for procedure in ["itbs", "ctbs", "sham"]:
    logger.info(f"Clustering {procedure}")
    procedure_mask = (labels["procedure"] == procedure) & (labels["eeg_type"] == "rsEEG")

    # None stands for "all patients pooled" and runs first; then patients 02-18.
    for patientid in [None, *range(2, 19)]:
        if patientid is None:
            selection_mask = procedure_mask
            plot_title = procedure
        else:
            selection_mask = procedure_mask & (labels["patient_id"] == f"{patientid:02}")
            # Include the patient so the per-patient figures can be told apart.
            plot_title = f"{procedure} patient {patientid}"

        # Filename and its pre/post label come from the same metadata row.
        selection = labels.loc[selection_mask, ["filename", "pre_post"]]
        if selection.empty:
            continue

        # Load one dataframe per feature file (two header rows per CSV)
        dfs = [
            pd.read_csv(os.path.join("features-4", file), header=[0, 1])
            for file in selection["filename"]
        ]
        # Each CSV restarts its row index at 0; give the combined frame a
        # unique index so index-based row operations downstream cannot hit
        # several rows at once.
        df = pd.concat(dfs, ignore_index=True)

        # Add the ground-truth column: every epoch inherits its file's timing
        ground_truth = []
        for timing, df_part in zip(selection["pre_post"], dfs):
            ground_truth.extend([timing] * len(df_part))
        df["label"] = ground_truth

        cluster_df(df, plot_title=plot_title)

## Procedure differences

The code below applies the clustering function to the data recorded after the procedure (timing is **post**) for **all procedures** together. It is first applied to all patients together and then to each individual patient. We expect three separate clusters, one per procedure.


In [None]:
# Cluster the post-procedure rsEEG epochs with the procedure as ground truth:
# first with all patients pooled, then once per patient.
logger.info("Clustering all post procedure data")
post_mask = (labels["eeg_type"] == "rsEEG") & (labels["pre_post"] == "post")

# None stands for "all patients pooled" and runs first; then patients 02-18.
for patientid in [None, *range(2, 19)]:
    if patientid is None:
        selection_mask = post_mask
        plot_title = "Procedure clustering"
    else:
        selection_mask = post_mask & (labels["patient_id"] == f"{patientid:02}")
        plot_title = f"Procedure clustering patient {patientid}"

    # Filename and its procedure label come from the same metadata row.
    selection = labels.loc[selection_mask, ["filename", "procedure"]]
    if selection.empty:
        if patientid is None:
            # pd.concat raises on an empty list, so report the problem and skip instead.
            logger.error("No post-procedure rsEEG files found")
        continue

    # Load one dataframe per feature file (two header rows per CSV)
    dfs = [
        pd.read_csv(os.path.join("features-4", file), header=[0, 1])
        for file in selection["filename"]
    ]
    # Each CSV restarts its row index at 0; give the combined frame a unique
    # index so index-based row operations downstream cannot hit several rows
    # at once.
    df = pd.concat(dfs, ignore_index=True)

    # Add the ground-truth column: every epoch inherits its file's procedure
    ground_truth = []
    for procedure, df_part in zip(selection["procedure"], dfs):
        ground_truth.extend([procedure] * len(df_part))
    df["label"] = ground_truth

    cluster_df(df, n_clusters=3, plot_title=plot_title)

## Conclusion

- The most frequently recurring features are correlation, entropy, and Hjorth parameters.
- Clear clusters are obtained, but also for the sham procedure, which shouldn't result in clusters at all.