In [None]:
#Basic imports
import numpy as np
import pandas as pd

from dataframe.csv_utils import (
    load_data_from_csv,
    get_labels_from_result,
    get_features_from_result,
)


#sklearn imports
from sklearn.decomposition import PCA #Principal Component Analysis
from sklearn.manifold import TSNE #T-Distributed Stochastic Neighbor Embedding
from sklearn.cluster import KMeans #K-Means Clustering
from sklearn.preprocessing import StandardScaler #used for 'Feature Scaling'

#plotly imports
import plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

# Subject paths under "../CleandDataV1" (presumably the subjects whose BETA
# recordings were cleaned -- TODO confirm). In this notebook the list is only
# referenced by the commented-out "Subject" mask in the feature-loading cell.
CLEANED_BETA_SUBJECT = [
    f"../CleandDataV1/{subject_id}"
    for subject_id in ("2017", "2018", "2020", "2024", "2025")
]

In [None]:
"""
    load features from csv
"""

# Load the data through the project helper.
dir_name = "extracted_features_v1"
result = load_data_from_csv(dir_name)

# Optional subject filter (disabled): uncomment both lines to keep only the
# rows whose "Subject" value is listed in CLEANED_BETA_SUBJECT.
# mask = result["Subject"].isin(CLEANED_BETA_SUBJECT)
# result = result[mask]

# Split the loaded data into labels and features. The meaning of the
# positional `False` flag is defined by get_features_from_result -- TODO confirm.
all_label_array, label_list = get_labels_from_result(result)
all_feature_array, feature_names = get_features_from_result(result, False)

# Remove the "index" column, then re-derive the feature names from what is left
# (this replaces the names returned by the helper above).
all_feature_array = all_feature_array.drop(columns="index")
feature_names = all_feature_array.columns

# Sanity check: feature-matrix shape, number of features, number of labels.
shape_summary = (all_feature_array.shape, len(feature_names), len(label_list))
print(*shape_summary)

In [None]:
# Filter features: keep only the columns whose name does NOT end in one of
# GAMMA, BETA2, BETA1, ALPHA, THETA or DELTA. The negative look-behind anchored
# at "$" rejects any column name carrying one of those suffixes.
NON_BAND_COLUMN_PATTERN = ".*(?<!GAMMA|BETA2|BETA1|ALPHA|THETA|DELTA)$"
eeg_features = all_feature_array.filter(regex=NON_BAND_COLUMN_PATTERN, axis="columns")
eeg_features.head()

In [None]:
# Feature scaling: standardise every selected column with StandardScaler.
scaler = StandardScaler()
# Fit the scaler and transform the features in a single pass.
scaled_values = scaler.fit_transform(eeg_features)
# Wrap the scaled values in a new DataFrame; no index or column labels are
# passed, so pandas assigns default integer labels.
normalized_eeg_features = pd.DataFrame(scaled_values)
normalized_eeg_features.head()

In [None]:
# Dimensionality reduction. A float n_components asks PCA to keep as many
# components as are needed to explain that fraction (here 95%) of the variance.
# reference https://www.mikulskibartosz.name/pca-how-to-choose-the-number-of-components/
variance_pca = PCA(n_components=0.95)
reduced_data = variance_pca.fit_transform(normalized_eeg_features)
# NOTE: this overwrites the scaled frame with the reduced one, so the cell is
# not idempotent -- re-run the scaling cell before re-running this one.
normalized_eeg_features = pd.DataFrame(reduced_data)
normalized_eeg_features.head()

In [None]:
# Clustering
# Fixed seed: k-means++ initialisation is random, so without it the cluster
# assignments (and every plot built from them) change on each run.
KMEANS_RANDOM_STATE = 42

# Fit on the feature columns only. This cell appends a "Cluster" column below;
# without this guard, re-running the cell would feed the previous labels back
# in as an extra feature. errors="ignore" keeps the first run (no "Cluster"
# column yet) working.
cluster_inputs = normalized_eeg_features.drop(columns="Cluster", errors="ignore")

kmeans = KMeans(
    init="k-means++",
    n_clusters=4,
    n_init=4,
    random_state=KMEANS_RANDOM_STATE,
)
# Fit the model and get the cluster index of every data point in one call.
clusters = kmeans.fit_predict(cluster_inputs)
# Add the cluster vector to the DataFrame.
normalized_eeg_features["Cluster"] = clusters

In [None]:
# visualization via t-SNE
# NOTE: the PCA visualization cell assigns the same s_1d / s_2d / s_3d / method
# names, so whichever of the two cells ran last drives the plots below.

# t-SNE is stochastic; a fixed seed makes the embeddings reproducible.
TSNE_RANDOM_STATE = 42
# Set our perplexity (scikit-learn requires it to be smaller than the number of samples).
perplexity = 50

# Embed the feature columns only -- the "Cluster" labels must not influence the
# layout. Computed once instead of once per embedding.
tsne_inputs = normalized_eeg_features.drop(columns="Cluster")

# T-SNE with one dimension
tsne_1d = TSNE(n_components=1, perplexity=perplexity, random_state=TSNE_RANDOM_STATE)

# T-SNE with two dimensions
tsne_2d = TSNE(n_components=2, perplexity=perplexity, random_state=TSNE_RANDOM_STATE)

# T-SNE with three dimensions
tsne_3d = TSNE(n_components=3, perplexity=perplexity, random_state=TSNE_RANDOM_STATE)

# "TC<i>_<n>d" means: the i-th component of the embedding created for the
# n-dimensional visualization by T-SNE.

# This DataFrame holds a single dimension, built by T-SNE
s_1d = pd.DataFrame(tsne_1d.fit_transform(tsne_inputs), columns=["TC1_1d"])

# This DataFrame contains two dimensions, built by T-SNE
s_2d = pd.DataFrame(tsne_2d.fit_transform(tsne_inputs), columns=["TC1_2d", "TC2_2d"])

# And this DataFrame contains three dimensions, built by T-SNE
s_3d = pd.DataFrame(
    tsne_3d.fit_transform(tsne_inputs),
    columns=["TC1_3d", "TC2_3d", "TC3_3d"],
)

# Interpolated into the plot titles.
method = 't-SNE'

In [None]:
# visualization via PCA
# reference: https://www.kaggle.com/code/minc33/visualizing-high-dimensional-clusters/notebook#Imports:

# Project the feature columns only; the "Cluster" labels are excluded.
pca_inputs = normalized_eeg_features.drop(columns="Cluster")

# One PCA model per target dimensionality (1, 2 and 3 principal components).
pca_1d, pca_2d, pca_3d = (PCA(n_components=n_dims) for n_dims in (1, 2, 3))

# "PC<i>_<n>d" means: the i-th principal component of the projection created
# for the n-dimensional visualization by PCA.

# Single principal component, used for the 1-D plot.
s_1d = pd.DataFrame(pca_1d.fit_transform(pca_inputs), columns=["PC1_1d"])

# Two principal components, used for the 2-D plot.
s_2d = pd.DataFrame(pca_2d.fit_transform(pca_inputs), columns=["PC1_2d", "PC2_2d"])

# Three principal components, used for the 3-D plot.
s_3d = pd.DataFrame(
    pca_3d.fit_transform(pca_inputs),
    columns=["PC1_3d", "PC2_3d", "PC3_3d"],
)

# Interpolated into the plot titles.
method = 'PCA'

In [None]:
# Put the clustered features and the 1-D / 2-D / 3-D projections side by side,
# keeping only the row labels present in every frame (inner join).
frames_to_join = [normalized_eeg_features, s_1d, s_2d, s_3d]
plotX = pd.concat(frames_to_join, axis=1, join='inner')

# Constant y coordinate used by the 1-D plot.
plotX["dummy"] = 0

# One sub-DataFrame of plotX per cluster label (0..3); these hold the values
# that the plotting cells draw.
cluster0, cluster1, cluster2, cluster3 = (
    plotX[plotX["Cluster"] == cluster_label] for cluster_label in range(4)
)

In [None]:
# Initialise plotly's offline notebook mode; this is needed so the iplot()
# figures in the cells below display properly inside the notebook.
# connected=True presumably loads plotly.js from the CDN instead of embedding
# it in the notebook -- TODO confirm against the plotly.offline documentation.
init_notebook_mode(connected=True)

In [None]:
# 1-D view: the single projected coordinate on x, the constant "dummy" on y.
col_name = s_1d.columns[0]

# (legend name, cluster frame, marker colour) for each of the four clusters.
cluster_styles = [
    ("Cluster 0", cluster0, 'rgba(255, 128, 255, 0.8)'),
    ("Cluster 1", cluster1, 'rgba(255, 128, 2, 0.8)'),
    ("Cluster 2", cluster2, 'rgba(0, 255, 200, 0.8)'),
    ("Cluster 3", cluster3, 'rgba(0, 128, 200, 0.8)'),
]

# One markers-only scatter trace per cluster.
data = [
    go.Scatter(
        x=frame[col_name],
        y=frame["dummy"],
        mode="markers",
        name=legend,
        marker={"color": colour},
        text=None,
    )
    for legend, frame, colour in cluster_styles
]

# traceN is the trace for 'Cluster N-1'; the individual names stay available.
trace1, trace2, trace3, trace4 = data

title = f"EEG(including BETA): Clusters in One Dimension Using {method}"

layout = {
    "title": title,
    "xaxis": {"title": col_name, "ticklen": 5, "zeroline": False},
    # The y axis only carries the dummy constant, so it gets no label.
    "yaxis": {"title": '', "ticklen": 5, "zeroline": False},
}

fig = {"data": data, "layout": layout}

iplot(fig)

In [None]:
# 2-D view: first projected component on x, second on y.
col_name0, col_name1 = s_2d.columns[0], s_2d.columns[1]


def _cluster_trace_2d(points, legend, colour):
    """Return a markers-only Scatter trace of one cluster in the 2-D projection."""
    return go.Scatter(
        x=points[col_name0],
        y=points[col_name1],
        mode="markers",
        name=legend,
        marker=dict(color=colour),
        text=None,
    )


# traceN is the trace for 'Cluster N-1'.
trace1 = _cluster_trace_2d(cluster0, "Cluster 0", 'rgba(255, 128, 255, 0.8)')
trace2 = _cluster_trace_2d(cluster1, "Cluster 1", 'rgba(255, 128, 2, 0.8)')
trace3 = _cluster_trace_2d(cluster2, "Cluster 2", 'rgba(0, 255, 200, 0.8)')
trace4 = _cluster_trace_2d(cluster3, "Cluster 3", 'rgba(0, 128, 200, 0.8)')

data = [trace1, trace2, trace3, trace4]

title = f"EEG(including BETA): Clusters in Two Dimensions Using {method}"

# Both axes share the same tick settings and are labelled with the component name.
axis_settings = {
    axis_key: dict(title=axis_label, ticklen=5, zeroline=False)
    for axis_key, axis_label in (("xaxis", col_name0), ("yaxis", col_name1))
}
layout = dict(title=title, **axis_settings)

fig = dict(data=data, layout=layout)

iplot(fig)

In [None]:
# 3-D view: the three projected components on x, y and z.
col_name0 = s_3d.columns[0]
col_name1 = s_3d.columns[1]
col_name2 = s_3d.columns[2]

# (legend name, cluster frame, marker colour) for each of the four clusters.
cluster_styles_3d = [
    ("Cluster 0", cluster0, 'rgba(255, 128, 255, 0.8)'),
    ("Cluster 1", cluster1, 'rgba(255, 128, 2, 0.8)'),
    ("Cluster 2", cluster2, 'rgba(0, 255, 200, 0.8)'),
    ("Cluster 3", cluster3, 'rgba(0, 128, 200, 0.8)'),
]

# One markers-only 3-D scatter trace per cluster.
data = [
    go.Scatter3d(
        x=frame[col_name0],
        y=frame[col_name1],
        z=frame[col_name2],
        mode="markers",
        name=legend,
        marker=dict(color=colour),
        text=None,
    )
    for legend, frame, colour in cluster_styles_3d
]

# traceN is the trace for 'Cluster N-1'; the individual names stay available.
trace1, trace2, trace3, trace4 = data

# The 1-D and 2-D plots of the same data are titled "including BETA" and the
# subject mask in the loading cell is commented out, so this title is aligned
# with them (it previously read "excluding BETA").
# NOTE(review): confirm "including" is the intended wording for all three plots.
title = f"EEG(including BETA): Clusters in Three Dimensions Using {method}"


def _scene_axis(axis_label):
    """Return the axis settings shared by the three axes of the 3-D scene."""
    return dict(title=axis_label, ticklen=5, zeroline=False)


# Axes of a 3-D trace are configured under layout.scene; top-level
# xaxis / yaxis entries only apply to 2-D cartesian plots. The z axis is now
# labelled as well.
layout = dict(
    title=title,
    scene=dict(
        xaxis=_scene_axis(col_name0),
        yaxis=_scene_axis(col_name1),
        zaxis=_scene_axis(col_name2),
    ),
)

fig = dict(data=data, layout=layout)

iplot(fig)