In [None]:
%pip install mne colorlog

In [None]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn import metrics
import os
import re
import numpy as np
from scipy import stats
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import utils
from sklearn.preprocessing import StandardScaler

# Notebook-wide logger, obtained from the `utils` module imported above
# (presumably a project-local helper module — TODO confirm).
logger = utils.get_logger()

# Clustering

After feature extraction there is a set of feature files, each containing one entry per epoch. Each file belongs to one recording, which is described by three attributes: patient, procedure and timing (pre or post). Clustering is applied to each procedure separately, using all timings (pre & post) together. The clusters are generated once with all patients combined and once per individual patient.

The desired outcome is two clusters per plot; one for data before the procedure and one for after the procedure (with the active control procedure being the exception since this should have no impact).


In [None]:
def cluster_df(df, n_clusters=2, plot_title="PCA of Clusters", random_state=42):
    """Cluster the rows of ``df`` with KMeans and plot them next to the ground truth.

    Expects one large DataFrame containing all the data to be clustered, plus one
    column ``"timings"`` holding the label of each row. The label column is never
    used for fitting; it only colours the second scatter plot and feeds the
    adjusted Rand index that is printed.

    Steps: drop outlier rows, standardise the features, fit KMeans, project the
    standardised features onto two principal components, draw the cluster labels
    and the ground-truth labels as two scatter plots, and print the five largest
    absolute loadings of each principal component.

    Args:
        df: DataFrame of numeric feature columns plus a ``"timings"`` column.
        n_clusters: Number of KMeans clusters.
        plot_title: Title of the left-hand (cluster) scatter plot.
        random_state: Seed passed to KMeans so repeated runs give the same
            clusters. Pass ``None`` for unseeded behaviour.

    Returns:
        None. An error is logged and nothing is plotted when the ``"timings"``
        column is missing or too few rows survive the outlier removal.
    """
    if "timings" not in df.columns:
        logger.error("No timings column found in DataFrame")
        return

    # Encode the labels on the full column so that the colour codes do not
    # shift if one label disappears entirely during outlier removal.
    truth_codes = df["timings"].astype("category").cat.codes.to_numpy()
    features = df.drop(columns=["timings"])
    # Captured now: after scaling the data is a NumPy array without columns.
    feature_names = features.columns

    # outlier removal
    OUTLIER_THRESHOLD = 0.05
    lower_quantile = features.quantile(0.10)
    upper_quantile = features.quantile(0.90)
    spread = upper_quantile - lower_quantile
    lower_bound = lower_quantile - 1.5 * spread
    upper_bound = upper_quantile + 1.5 * spread

    # Number of out-of-range feature values per row.
    outlier_counts = (
        features.lt(lower_bound, axis="columns")
        | features.gt(upper_bound, axis="columns")
    ).sum(axis=1)
    threshold = len(feature_names) * OUTLIER_THRESHOLD
    # A positional boolean mask (not index labels) is used so that duplicated
    # index labels cannot cause unrelated rows to be dropped, and so that the
    # ground-truth labels are filtered in lockstep with the features.
    keep = (outlier_counts <= threshold).to_numpy()

    print(f"Original DataFrame shape: {features.shape}")
    features = features.loc[keep]
    truth_codes = truth_codes[keep]
    print(f"Filtered DataFrame shape: {features.shape}")

    # KMeans needs at least n_clusters rows and the 2-component PCA at least 2.
    if len(features) < max(n_clusters, 2):
        logger.error(
            f"Only {len(features)} rows left after outlier removal; cannot cluster"
        )
        return

    # normalization
    scaled = StandardScaler().fit_transform(features)

    # Apply KMeans clustering
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
    clusters = kmeans.fit_predict(scaled)

    # Evaluate the clustering results against the ground-truth labels
    score = metrics.adjusted_rand_score(truth_codes, clusters)
    print(f"Adjusted Rand index vs. ground truth: {score:.3f}")

    pca = PCA(n_components=2)
    reduced = pca.fit_transform(scaled)

    plt.subplot(1, 2, 1)
    plt.scatter(reduced[:, 0], reduced[:, 1], c=clusters, cmap="viridis")
    plt.title(plot_title)
    plt.xlabel("Component 1")
    plt.ylabel("Component 2")
    plt.colorbar(label="Cluster Label")

    plt.subplot(1, 2, 2)
    plt.scatter(reduced[:, 0], reduced[:, 1], c=truth_codes, cmap="viridis")
    plt.title("KMeans Clustering Results - Ground Truth")
    plt.xlabel("Component 1")
    plt.ylabel("Component 2")
    plt.colorbar(label="Ground Truth Label")

    plt.show()

    # Attributions: the features that contribute most to each component
    for i, component in enumerate(pca.components_):
        sorted_loadings = sorted(
            zip(feature_names, component), key=lambda x: abs(x[1]), reverse=True
        )
        print(f"Principal Component {i+1}:")
        for feature, loading in sorted_loadings[:5]:
            print(f"{feature}: {loading}")
        print("\n")

In [None]:
# Load the recording metadata table via utils.get_metadata_df. Later cells read
# its "filename", "procedure", "eeg_type", "patient_id" and "pre_post" columns.
labels = utils.get_metadata_df("features", "Randomisatielijst.csv")

In [None]:
# Display the metadata table for a visual sanity check.
labels

In [None]:
def _load_labelled_features(label_rows, features_dir="features"):
    """Read the feature files listed in ``label_rows`` into one labelled DataFrame.

    Args:
        label_rows: Rows of the metadata table; the "filename" and "pre_post"
            columns are used. Must contain at least one row.
        features_dir: Directory the feature CSV files are read from.

    Returns:
        The concatenation of all files (two header rows each) with a fresh
        0..n-1 index and an extra "timings" column that repeats each file's
        "pre_post" label for every row of that file.
    """
    parts = []
    timings = []
    for filename, timing in zip(label_rows["filename"], label_rows["pre_post"]):
        part = pd.read_csv(os.path.join(features_dir, filename), header=[0, 1])
        parts.append(part)
        timings.extend([timing] * len(part))

    # ignore_index=True: every file starts counting at 0, so without it the
    # combined frame has duplicated index labels and any label-based row drop
    # downstream would remove rows from several files at once.
    combined = pd.concat(parts, ignore_index=True)
    combined["timings"] = timings
    return combined


for procedure in ["itbs", "ctbs", "sham"]:
    logger.info(f"Clustering {procedure}")

    # Resting-state recordings of this procedure, all patients
    procedure_rows = labels[
        (labels["procedure"] == procedure) & (labels["eeg_type"] == "rsEEG")
    ]
    if procedure_rows.empty:
        continue

    # Cluster all patients
    cluster_df(_load_labelled_features(procedure_rows), plot_title=procedure)

    # Cluster individual patients; patient ids are zero-padded strings ("02".."18")
    for patientid in range(2, 19):
        patient_rows = procedure_rows[
            procedure_rows["patient_id"] == f"{patientid:02}"
        ]
        if patient_rows.empty:
            continue

        # The patient id is part of the title so the per-patient plots can be
        # told apart from each other and from the all-patients plot.
        cluster_df(
            _load_labelled_features(patient_rows),
            plot_title=f"{procedure} - patient {patientid:02}",
        )

## Clustering: Procedure Impact

The first clustering is done pre/post procedure. The goal is to see if there is a pattern to the impact a procedure has on all patients.

- Patients: all
- Procedures: separate
- Timing: all
  => 2 or 3 clusters, depending on whether the active control is included


In [None]:
def cluster_df(df, num_clusters, random_state=42):
    """Cluster the rows of ``df`` with KMeans and plot them in a 2-D PCA projection.

    NOTE: an earlier cell of this notebook defines a different
    ``cluster_df(df, n_clusters=2, plot_title=...)``; executing this cell
    replaces that definition.

    Steps: drop outlier rows, standardise the features, fit KMeans with
    ``num_clusters`` clusters, project the standardised features onto two
    principal components for plotting, and print the 20 largest absolute
    loadings of each component.

    Args:
        df: DataFrame of numeric feature columns, one row per sample.
        num_clusters: Number of KMeans clusters.
        random_state: Seed passed to KMeans so repeated runs give the same
            clusters. Pass ``None`` for unseeded behaviour.

    Returns:
        None. An error is logged and nothing is plotted when ``df`` is ``None``
        or too few rows survive the outlier removal.
    """
    if df is None:
        logger.error("No DataFrame to cluster")
        return

    feature_names = df.columns

    # outlier removal
    OUTLIER_THRESHOLD = 0.0
    lower_quantile = df.quantile(0.02)
    upper_quantile = df.quantile(0.98)
    spread = upper_quantile - lower_quantile
    lower_bound = lower_quantile - 1.5 * spread
    upper_bound = upper_quantile + 1.5 * spread

    # Number of out-of-range feature values per row.
    outlier_counts = (
        df.lt(lower_bound, axis="columns") | df.gt(upper_bound, axis="columns")
    ).sum(axis=1)
    threshold = len(feature_names) * OUTLIER_THRESHOLD
    # Filter with a positional boolean mask: dropping by index label would also
    # remove every other row that happens to share the label, which happens
    # whenever the frame was built by concatenating several files.
    keep = (outlier_counts <= threshold).to_numpy()
    filtered = df.loc[keep]
    print(f"Original DataFrame shape: {df.shape}")
    print(f"Filtered DataFrame shape: {filtered.shape}")

    # KMeans needs at least num_clusters rows and the 2-component PCA at least 2.
    if len(filtered) < max(num_clusters, 2):
        logger.error(
            f"Only {len(filtered)} rows left after outlier removal; cannot cluster"
        )
        return

    # normalization
    scaled = StandardScaler().fit_transform(filtered)

    # Apply KMeans clustering with the requested number of clusters
    kmeans = KMeans(n_clusters=num_clusters, random_state=random_state)
    clusters = kmeans.fit_predict(scaled)

    # PCA: always two components, because the scatter plot below reads exactly
    # the first two columns regardless of how many clusters were requested.
    pca = PCA(n_components=2)
    reduced = pca.fit_transform(scaled)

    # plot clusters
    plt.scatter(reduced[:, 0], reduced[:, 1], c=clusters, cmap="viridis")
    plt.title("PCA of Clusters")
    plt.xlabel("Principal Component 1")
    plt.ylabel("Principal Component 2")
    plt.colorbar(label="Cluster Label")
    plt.show()

    # Attributions: the features that contribute most to each component
    for i, component in enumerate(pca.components_):
        sorted_loadings = sorted(
            zip(feature_names, component), key=lambda x: abs(x[1]), reverse=True
        )
        print(f"Principal Component {i+1}:")
        for feature, loading in sorted_loadings[:20]:
            print(f"{feature}: {loading}")
        print("\n")

In [None]:
def get_procedure_df(procedure_id):
    """Load and concatenate every feature file recorded under one procedure.

    Reads ``labels.csv`` from the working directory and walks the ``features``
    directory. A feature file is included when the row of ``labels.csv`` whose
    "filename" equals the file name without its extension has a "procedure"
    value equal to ``procedure_id``.

    Args:
        procedure_id: Value to match against the "procedure" column.

    Returns:
        One DataFrame with the rows of all matching files (two header rows
        each) and a fresh 0..n-1 index, or ``None`` when no file matches.
    """
    import warnings

    label_df = pd.read_csv("labels.csv")
    dfs = []
    # sorted(): os.listdir order is arbitrary, so sort for a reproducible row order.
    for entry in sorted(os.listdir("features")):
        # splitext keeps dots inside the name ("a.b.csv" -> "a.b"), unlike split(".")[0].
        stem, extension = os.path.splitext(entry)
        if extension.lower() != ".csv":
            # Sub-directories, checkpoints and other non-feature entries.
            continue

        # get row from label_df with matching filename
        row = label_df[label_df["filename"] == stem]
        if row.empty:
            # A CSV without metadata cannot be assigned to a procedure; skip it
            # visibly instead of failing on the empty lookup.
            warnings.warn(f"No entry for '{stem}' in labels.csv; skipping {entry}")
            continue

        if row["procedure"].values[0] == procedure_id:
            dfs.append(pd.read_csv(os.path.join("features", entry), header=[0, 1]))

    if len(dfs) == 0:
        return None
    # ignore_index=True avoids duplicated index labels across files.
    return pd.concat(dfs, ignore_index=True)

### cTBS


In [None]:
# Procedure id 2 is loaded under the "cTBS" heading — presumably the cTBS code
# used in labels.csv; TODO confirm. The result is None when no file matches.
ctbs_df = get_procedure_df(2)

In [None]:
# Cluster the cTBS rows; the second argument is cluster_df's num_clusters.
# Note that ctbs_df is None when get_procedure_df found no matching file.
cluster_df(ctbs_df, 2)

### iTBS


In [None]:
# Procedure id 1 is loaded under the "iTBS" heading — presumably the iTBS code
# used in labels.csv; TODO confirm. The result is None when no file matches.
df = get_procedure_df(1)