In [None]:
# Basic imports
import numpy as np
import pandas as pd

from dataframe.csv_utils import (
    load_data_from_csv,
    get_filtered_data,
)

# sklearn imports
from sklearn.decomposition import PCA  # Principal Component Analysis
from sklearn.manifold import TSNE  # T-Distributed Stochastic Neighbor Embedding

# plotly imports
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode

# Names of the ten experimental blocks, listed in alphabetical order.
# NOTE(review): presumably each entry lines up position-by-position with the
# ten-entry condition lists (FOUR_CONDITIONS etc.) defined further down — confirm.
SORTED_BLOCK_NAMES = [
    "a_hvha",
    "a_hvla",
    "a_nvha",
    "a_nvla",
    "b_hvha",
    "b_hvla",
    "b_nvha",
    "b_nvla",
    "medi",
    "wandering",
]

In [None]:
"""
    load features from csv
"""
# Name of the data directory handed to load_data_from_csv.
dir_name = "eeg_features2"
# `result` is the input every helper below filters and projects.
# NOTE(review): the helpers annotate it as pd.DataFrame — presumably that is
# what load_data_from_csv returns; confirm.
result = load_data_from_csv(dir_name)

In [None]:
# This is needed so we can display plotly plots properly inside the notebook.
# Per the plotly.offline documentation, connected=True loads plotly.js from a
# CDN instead of embedding the library in the notebook.
init_notebook_mode(connected=True)

In [None]:
from umap import UMAP

import plotly.express as px
from plotly.offline import plot
from plotly.subplots import make_subplots


def get_filter_pattern(feature: str = "ALL", channel: str = '') -> str:
    """Build the regular expression used to select feature names.

    Args:
        feature: name suffix to match (e.g. a band name). The special value
            ``"ALL"`` matches every name that does not end in ``"sdf"``.
        channel: optional name prefix; when non-empty the pattern is anchored
            with ``^{channel}``.

    Returns:
        A pattern of the form ``[^channel].*<tail>$``.
    """
    # Alternative tail kept for reference (everything that is not a band):
    #   ".*(?<!GAMMA|BETA2|BETA1|ALPHA|THETA|DELTA)$"
    anchor = f'^{channel}' if len(channel) > 0 else ''
    tail = "(?<!sdf)" if feature == "ALL" else feature
    return f"{anchor}.*{tail}$"

def get_processed_feature(feature: str, channel: str, result: pd.DataFrame, subjects: list=[]):
    """Filter ``result`` down to one feature/channel selection.

    Builds the name pattern with ``get_filter_pattern`` and forwards it,
    together with ``result`` and ``subjects``, to ``get_filtered_data``.

    Returns:
        The first two values returned by ``get_filtered_data`` (named here the
        normalized features and the filtered frame); the third value (labels)
        is discarded. The shape of the normalized features is printed.
    """
    features_normalized, features_filtered, _labels = get_filtered_data(
        result, subjects, get_filter_pattern(feature, channel)
    )
    print('filtered', features_normalized.shape)
    return features_normalized, features_filtered

def get_umap(feature: str, channel: str, result: pd.DataFrame, all_blocks: list, subjects: list = []):
    """Project the selected features to 2-D with UMAP and attach labels.

    Args:
        feature, channel: forwarded to ``get_filter_pattern``.
        result: input handed to ``get_filtered_data``.
        all_blocks: per-row condition labels for a single subject.
        subjects: subject identifiers handed to ``get_filtered_data``.

    Returns:
        The filtered frame (index reset) with an added ``"condition"`` column
        and the two UMAP coordinates ``"C1_2d"`` / ``"C2_2d"``.

    Note:
        ``"condition"`` is ``all_blocks`` repeated ``len(subjects)`` times, so
        the filtered frame must have exactly that many rows. With the default
        empty ``subjects`` the repeated list is empty.
    """
    features_normalized, features_filtered, _labels = get_filtered_data(
        result, subjects, get_filter_pattern(feature, channel)
    )
    print(features_normalized.shape)

    # Fixed seed and random init keep the embedding repeatable.
    reducer = UMAP(n_components=2, init="random", random_state=0)
    embedding = pd.DataFrame(reducer.fit_transform(features_normalized))
    embedding.columns = ["C1_2d", "C2_2d"]

    # Concatenate the UMAP points and the original data column-wise.
    annotated = features_filtered.reset_index()
    annotated["condition"] = all_blocks * len(subjects)
    return pd.concat([annotated, embedding], axis=1, join="inner")

def get_pca(feature: str, channel: str, result: pd.DataFrame, subjects: list=[]):
    """Project the selected features onto their first two principal components.

    Args:
        feature, channel, result, subjects: forwarded unchanged to
            ``get_processed_feature``.

    Returns:
        A frame with the ``"Subject"`` column of the filtered data (after an
        index reset) next to the PCA coordinates ``"C1_2d"`` and ``"C2_2d"``.
    """
    features_normalized, features_filtered = get_processed_feature(
        feature, channel, result, subjects
    )

    # Two-component PCA of the normalized features.
    components = pd.DataFrame(PCA(n_components=2).fit_transform(features_normalized))
    components.columns = ["C1_2d", "C2_2d"]

    # Concatenate the subject labels and the PCA points column-wise.
    subject_column = features_filtered.reset_index()['Subject']
    return pd.concat([subject_column, components], axis=1, join="inner")

def show_result(feature: str, subjects: list, plotX: pd.DataFrame, conditions: list, to_save: bool=False):
    """Draw one 2-D projection scatter per subject in a two-column grid.

    Args:
        feature: label used in the figure title and in the output file name.
        subjects: subject identifiers; each one selects the rows of ``plotX``
            whose ``"Subject"`` column equals it and gets its own subplot.
        plotX: frame with ``"Subject"``, ``"C1_2d"`` and ``"C2_2d"`` columns.
        conditions: passed straight to ``px.scatter(color=...)`` for every
            subject.
        to_save: when true, the figure is also written to
            ``results/valence/{feature}_{comma-joined subjects}.png``; the
            directory is created if it does not exist yet.
    """
    title = f"EEG {feature} (average spectral power per trial)"
    n_cols = 2

    figures = [
        px.scatter(
            plotX[plotX["Subject"] == subj],
            x="C1_2d",
            y="C2_2d",
            color=conditions,
            opacity=0.5,
        )
        for subj in subjects
    ]

    fig = make_subplots(
        # Integer ceiling division: enough rows to hold every subject.
        rows=(len(figures) + n_cols - 1) // n_cols,
        cols=n_cols,
        subplot_titles=[str(subj) for subj in subjects],
        horizontal_spacing=0.1,
        vertical_spacing=0.05,
    )

    for i, figure in enumerate(figures):
        row, col = divmod(i, n_cols)
        # Only the first subplot contributes legend entries, so each condition
        # is listed once instead of once per subject.
        showlegend = i == 0
        for trace in figure.data:
            trace.update(showlegend=showlegend)
            # add_trace(row=, col=) replaces the deprecated append_trace.
            fig.add_trace(trace, row=row + 1, col=col + 1)

    fig.update_layout(
        height=1000, width=1000, title_text=title, margin=dict(r=0, b=0, l=0)
    )

    if to_save:
        import os

        name = ",".join(str(s) for s in subjects)
        filename = f"results/valence/{feature}_{name}.png"
        # write_image does not create missing directories.
        os.makedirs(os.path.dirname(filename), exist_ok=True)
        fig.write_image(filename)

    fig.show()
    
def show_pca_matrix(
    band: str,
    channel: str,
    subject: int,
    result: pd.DataFrame,
    all_blocks: list,
    n_components: int,
    to_save: bool=False,
):
    """Show a scatter matrix of the first ``n_components`` PCs for one subject.

    Args:
        band, channel: feature and channel selection, forwarded to
            ``get_processed_feature``.
        subject: single subject identifier (wrapped in a one-element list for
            the filter).
        result: input handed to ``get_processed_feature``.
        all_blocks: per-row labels used to colour the points.
        n_components: number of principal components to compute and plot.
        to_save: when true, the figure is also written to
            ``results/{subject}_{band}_{channel}_channel_PCA_breakdown.png``;
            the directory is created if it does not exist yet.
    """
    normalized_eeg_features, _ = get_processed_feature(band, channel, result, [subject])

    pca = PCA(n_components=n_components)
    components = pd.DataFrame(pca.fit_transform(normalized_eeg_features))

    # Axis labels carry each component's share of the explained variance.
    labels = {
        str(i): f"PC {i+1} ({var:.1f}%)"
        for i, var in enumerate(pca.explained_variance_ratio_ * 100)
    }

    identifier = f'{subject}_{band}_{channel}_channel'
    total_var = pca.explained_variance_ratio_.sum() * 100
    fig = px.scatter_matrix(
        components,
        labels=labels,
        dimensions=range(len(labels)),
        color=all_blocks,
        opacity=0.5,
        title=f'{identifier} Total Explained Variance: {total_var:.2f}%',
    )
    # The diagonal would plot each component against itself.
    fig.update_traces(diagonal_visible=False)
    fig.update_layout(
        height=1000, width=1000, margin=dict(r=0, b=0, l=0)
    )

    if to_save:
        import os

        filename = f"results/{identifier}_PCA_breakdown.png"
        # write_image does not create missing directories.
        os.makedirs(os.path.dirname(filename), exist_ok=True)
        fig.write_image(filename)

    fig.show()

#     target = 4
#     proj_2d = pd.concat([proj_2d[0], proj_2d[target]], axis=1, join="inner")
#     proj_2d.columns = [0, 1]

#     target_str = str(target)
#     labels = { '0': labels['0'], target_str: labels[target_str]}
#     fig = px.scatter_matrix(
#         proj_2d,
#         labels=labels,
#         dimensions=range(len(labels)),
#         color=all_blocks * len(subjects),
#         opacity=0.5,
#         title=f'Total Explained Variance: {total_var:.2f}%',
#     )
#     fig.update_traces(diagonal_visible=False)
#     fig.show()

In [None]:
# Three alternative labelings of the same ten blocks (one label per block).
FOUR_CONDITIONS = ["audio"] * 4 + ["breath"] * 4 + ["meditation", "wandering"]
VALENCE_CONDITIONS = (
    ["h_valence"] * 2
    + ["n_valence"] * 2
    + ["h_valence"] * 2
    + ["n_valence"] * 2
    + ["meditation", "wandering"]
)

AROUSAL_CONDITIONS = (
    ["h_arousal", "l_arousal"] * 4
    + ["meditation", "wandering"]
)

# Subject identifiers grouped under a one-letter batch key.
name_to_batch = {
    "f": [2017, 2018, 2020, 2024, 2025, 2026],
    "s": [2028, 2029, 2031, 2032, 2033, 2035],
    "t": [2036, 2039, 2040, 2041, 2042, 2043, 2044, 2045],
}

# How many consecutive rows carry each block's label.
# NOTE(review): presumably the number of trials recorded per block — confirm.
ROWS_PER_BLOCK = 13

subjects = name_to_batch['s']
# Per-row labels for one subject: every block label repeated ROWS_PER_BLOCK
# times, in block order.
all_blocks = [label for label in FOUR_CONDITIONS for _ in range(ROWS_PER_BLOCK)]

In [None]:
 # https://plotly.com/python/pca-visualization/
# ['A', 'B', 'C', 'D']
for band in ['ALL', 'DELTA', 'THETA', 'ALPHA', 'BETA1', 'BETA2', 'GAMMA']:
    for ch in ['A', 'B', 'C', 'D']: 
        plotX = get_pca(band, ch ,result, subjects)
        feature = f'valence_{band}_{ch}_channel_PCA'
        show_result(feature, subjects, plotX, all_blocks, False)

In [None]:
# Number of principal components computed and shown in each scatter matrix.
n_components=6
# One THETA-band, channel "C" scatter matrix per subject. The final True is
# to_save, so every figure is also written to disk.
for s in subjects:
    show_pca_matrix('THETA', 'C', s, result, all_blocks, n_components, True)

In [None]:
# One per-subject UMAP scatter grid for every band.
for f in ['ALL', 'DELTA', 'THETA', 'ALPHA', 'BETA1', 'BETA2', 'GAMMA']:
    # get_umap takes (feature, channel, result, all_blocks, subjects). The
    # channel argument was missing, which shifted every positional argument.
    # An empty channel adds no channel prefix to the filter pattern.
    plotX = get_umap(f, '', result, all_blocks, subjects)
    feature = f'{f}_UMAP'
    # `condition` was never defined in this notebook; colour by the per-row
    # block labels, as the PCA cell does.
    show_result(feature, subjects, plotX, all_blocks)

In [None]:
# reference https://www.mikulskibartosz.name/pca-how-to-choose-the-number-of-components/
# `normalized_eeg_features` only ever existed as a local inside the helper
# functions, so this cell raised NameError on a fresh kernel. Build the input
# here so the cell is self-contained and can be re-run safely.
# NOTE(review): all features / all channels / the current `subjects` is an
# assumption about what this analysis was meant to use — confirm.
eeg_features_all, _filtered = get_processed_feature("ALL", "", result, subjects)

# A fractional n_components keeps as many components as are needed to explain
# that share (95%) of the variance.
reduced_data = PCA(n_components=0.95).fit_transform(eeg_features_all)
normalized_eeg_features = pd.DataFrame(reduced_data)
normalized_eeg_features.head()


In [None]:
# from sklearn.mixture import GaussianMixture
# import matplotlib.pyplot as plt

# gmm = GaussianMixture(n_components=2).fit(normalized_eeg_features)
# clusters = gmm.predict(normalized_eeg_features)


In [None]:
from sklearn.cluster import AgglomerativeClustering
import time

print("Compute unstructured hierarchical clustering...")
# The next cell stores the labels back into normalized_eeg_features as a
# "Cluster" column. Drop it if present so that re-running this cell never
# clusters on stale labels (a no-op on the first run).
clustering_input = normalized_eeg_features.drop(columns=["Cluster"], errors="ignore")
st = time.time()
ward = AgglomerativeClustering(n_clusters=4, linkage="ward").fit(
    clustering_input
)
elapsed_time = time.time() - st
# One integer cluster label per input row.
clusters = ward.labels_
print(f"Elapsed time: {elapsed_time:.2f}s")
print(f"Number of points: {clusters.size}")


In [None]:
from dataframe.visualization import pca_2d

# Add the cluster vector to our DataFrame, X
normalized_eeg_features["Cluster"] = clusters
# normalized_eeg_features['Block'] = all_blocks

# `title` was used below without ever being defined at notebook level (it is
# only a local inside show_result), which raised NameError on a fresh kernel.
# NOTE(review): the wording of the title is a placeholder — adjust as needed.
title = "EEG features - Ward hierarchical clustering (4 clusters)"

pca_2d(
    normalized_eeg_features,
    4,
    [
        "rgba(255, 128, 255, 0.8)",
        "rgba(255, 128, 2, 0.8)",
        "rgba(0, 255, 200, 0.8)",
        "rgba(0, 128, 200, 0.8)",
    ],
    title,
    False,
    mode="markers",
    textfont=dict(size=10),
)


In [None]:
# Clustering
# kmeans = KMeans(init="k-means++", n_clusters=4, n_init=2)
# kmeans.fit(normalized_eeg_features)
# #Find which cluster each data-point belongs to
# clusters = kmeans.predict(normalized_eeg_features)


# from sklearn.cluster import DBSCAN
# from sklearn import metrics

# db = DBSCAN(eps=7, min_samples=10).fit(normalized_eeg_features)
# clusters = db.labels_

# # Number of clusters in labels, ignoring noise if present.
# n_clusters_ = len(set(clusters)) - (1 if -1 in clusters else 0)
# n_noise_ = list(clusters).count(-1)

# print("Estimated number of clusters: %d" % n_clusters_)
# print("Estimated number of noise points: %d" % n_noise_)
# clusters


In [None]:
# visualization via t-SNE
# Set our perplexity
perplexity = 50

# t-SNE is stochastic: seed every model so the embeddings are reproducible
# across kernel restarts (0 matches the seed used for UMAP above).
# The cluster labels are not features, so they are dropped once up front.
tsne_input = normalized_eeg_features.drop(["Cluster"], axis=1)

# T-SNE with one dimension
tsne_1d = TSNE(n_components=1, perplexity=perplexity, random_state=0)

# T-SNE with two dimensions
tsne_2d = TSNE(n_components=2, perplexity=perplexity, random_state=0)

# T-SNE with three dimensions
tsne_3d = TSNE(n_components=3, perplexity=perplexity, random_state=0)

# This DataFrame holds a single dimension, built by T-SNE
s_1d = pd.DataFrame(tsne_1d.fit_transform(tsne_input))

# This DataFrame contains two dimensions, built by T-SNE
s_2d = pd.DataFrame(tsne_2d.fit_transform(tsne_input))

# And this DataFrame contains three dimensions, built by T-SNE
s_3d = pd.DataFrame(tsne_3d.fit_transform(tsne_input))

s_1d.columns = ["TC1_1d"]

# "TC1_2d" means: 'The first component of the components created for 2-D visualization, by T-SNE.'
# And "TC2_2d" means: 'The second component of the components created for 2-D visualization, by T-SNE.'
s_2d.columns = ["TC1_2d", "TC2_2d"]

s_3d.columns = ["TC1_3d", "TC2_3d", "TC3_3d"]
method = "t-SNE"
