In [1]:
#Basic imports
import numpy as np
import pandas as pd

from dataframe.csv_utils import (
    load_data_from_csv,
    get_labels_from_result,
    get_features_from_result,
)


#sklearn imports
from sklearn.decomposition import PCA #Principal Component Analysis
from sklearn.manifold import TSNE #T-Distributed Stochastic Neighbor Embedding
from sklearn.cluster import KMeans #K-Means Clustering
from sklearn.preprocessing import StandardScaler #used for 'Feature Scaling'

#plotly imports
import plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

# "Subject" values kept by the optional "cleaned beta subjects" filter in the
# loading cell. Every entry shares the same relative prefix.
_CLEANED_DATA_ROOT = "../CleandDataV1"
CLEANED_BETA_SUBJECT = [
    f"{_CLEANED_DATA_ROOT}/{subject_id}"
    for subject_id in (2017, 2018, 2020, 2024, 2025)
]

In [51]:
"""
    load features from csv
"""

# Read the extracted-feature table from its CSV directory.
dir_name = "extracted_features_v1"
result = load_data_from_csv(dir_name)

# Optional filter: keep only the rows whose "Subject" is listed in
# CLEANED_BETA_SUBJECT (i.e. drop the beta subjects that were not cleaned).
# mask = result["Subject"].isin(CLEANED_BETA_SUBJECT)
# result = result[mask]

# Split the table into labels and features.
all_label_array, label_list = get_labels_from_result(result)
raw_feature_array, feature_names = get_features_from_result(result, False)

# The "index" column is not a feature: remove it and re-read the feature names
# from the remaining columns so both stay in sync.
all_feature_array = raw_feature_array.drop(columns=["index"])
feature_names = all_feature_array.columns

# Sanity check: (rows, features), number of feature names, number of labels.
print(all_feature_array.shape, len(feature_names), len(label_list))

(2210, 911) 911 2210


In [73]:
# Filter features: keep only the columns whose name starts with "B" and ends
# with "GAMMA".
# To select the remaining (non-band) features instead, use the pattern
#   ".*(?<!GAMMA|BETA2|BETA1|ALPHA|THETA|DELTA)$"
feature_pattern = "^B.*GAMMA$"
eeg_features = all_feature_array.filter(regex=feature_pattern, axis="columns")
eeg_features.head()

Unnamed: 0,B1_GAMMA,B2_GAMMA,B3_GAMMA,B4_GAMMA,B5_GAMMA,B6_GAMMA,B7_GAMMA,B8_GAMMA,B9_GAMMA,B10_GAMMA,...,B23_GAMMA,B24_GAMMA,B25_GAMMA,B26_GAMMA,B27_GAMMA,B28_GAMMA,B29_GAMMA,B30_GAMMA,B31_GAMMA,B32_GAMMA
0,437.203834,613.284869,604.388435,652.823552,610.826211,613.081102,607.700593,613.104916,623.925467,825.922083,...,1448.34759,949.035362,963.281746,980.685322,1742.436147,1025.559431,1244.79901,893.785659,879.442765,817.147208
1,453.049765,641.697332,653.500575,705.561394,678.014372,662.013592,661.45167,654.014884,646.723141,886.930563,...,1448.630001,1031.895037,1061.373869,1078.327947,1813.02998,1162.515949,1293.271469,906.722722,890.597366,834.869844
2,428.844315,630.988736,616.67303,669.773952,609.437071,586.322245,575.30085,574.709633,590.38518,728.069151,...,1425.1287,1070.183712,1105.073101,1129.648603,1783.653748,1119.776609,1182.480505,841.80518,820.865137,773.086392
3,857.240432,1284.710504,1235.939597,1285.954045,1237.375502,1300.970462,1311.689289,1316.152372,1681.305603,1537.888187,...,2447.992259,1974.898443,2047.809594,2099.049507,3027.960703,2338.466427,2271.572983,1849.554498,1830.510402,1716.867667
4,556.283989,804.267099,831.927285,932.012265,880.603217,850.858091,847.79901,842.818755,805.858495,1153.824809,...,1751.697997,1275.609721,1320.068572,1351.234945,2267.866724,1355.63139,1460.930657,1061.146147,1044.408155,988.181572


In [74]:
# Feature scaling: fit a StandardScaler on the selected EEG columns and scale
# each column with it.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(eeg_features)

# fit_transform returns a plain array, so the resulting frame has positional
# (0, 1, 2, ...) column labels instead of the original feature names.
normalized_eeg_features = pd.DataFrame(scaled_values)
normalized_eeg_features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,0.194584,1.16042,0.129217,0.652786,0.347008,0.186061,-0.145777,-0.187424,-0.304062,0.302291,...,0.796744,0.088296,-0.188177,-0.157048,0.899602,0.125704,0.353581,-0.109083,0.185264,0.570956
1,0.288471,1.313601,0.224768,0.881008,0.644869,0.364442,0.023517,-0.047236,-0.236186,0.468487,...,0.797139,0.229984,-0.059625,-0.017537,0.99018,0.329213,0.414993,-0.094515,0.203798,0.618398
2,0.145053,1.255867,0.153118,0.726139,0.34085,0.088513,-0.247823,-0.318995,-0.403922,0.035724,...,0.764306,0.295457,-0.002356,0.055789,0.952487,0.265705,0.274627,-0.167613,0.087938,0.45301
3,2.683313,4.780304,1.357939,3.392641,3.124647,2.693733,2.071499,2.221739,2.844104,2.241799,...,2.193326,1.842499,1.233125,1.440859,2.549045,2.076598,1.654443,0.967121,1.76545,2.979407
4,0.900137,2.19007,0.571908,1.860968,1.542992,1.052867,0.610434,0.599746,0.237612,1.19555,...,1.220549,0.646731,0.279401,0.372389,1.573776,0.616169,0.627407,0.079367,0.459352,1.028796


In [16]:
# Dimensionality reduction: keep as many principal components as are needed to
# explain 95% of the variance of the scaled features.
# reference https://www.mikulskibartosz.name/pca-how-to-choose-the-number-of-components/
#
# The PCA input is re-derived from `eeg_features` with the already fitted
# `scaler` rather than read back from `normalized_eeg_features`. This keeps the
# cell idempotent: re-running it, or running it after the clustering cell has
# appended a "Cluster" column, never feeds PCA output or cluster labels back
# into PCA.
scaled_eeg_features = scaler.transform(eeg_features)
reduced_data = PCA(n_components=0.95).fit_transform(scaled_eeg_features)
normalized_eeg_features = pd.DataFrame(reduced_data)
normalized_eeg_features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,-11.851594,-0.490025,-0.033344,0.081132,0.901694,0.076524,0.482326,-0.189483,-0.009669
1,-10.125805,0.227896,0.180924,-0.044774,0.951012,0.152764,0.714882,0.266726,-0.116437
2,-9.032871,-0.160388,0.334244,0.509675,0.872546,0.27571,0.465017,-0.158288,0.068108
3,-8.895699,-0.947722,0.196435,0.37581,0.779859,-0.760065,0.132133,-0.544422,-0.416483
4,-10.48131,-0.518308,0.06727,0.061677,0.845429,-0.253151,0.449278,-0.124238,-0.082353


In [75]:
# Clustering: partition the samples into 4 groups with k-means.

# Fit on the feature columns only. If this cell is re-run, the frame already
# carries the "Cluster" column appended below; it must not be used as an input
# feature, so it is dropped when present.
cluster_inputs = normalized_eeg_features.drop(columns=["Cluster"], errors="ignore")

# A fixed random_state makes the centroid initialisation, and therefore the
# cluster ids, reproducible across kernel restarts.
kmeans = KMeans(init="k-means++", n_clusters=4, n_init=4, random_state=42)
kmeans.fit(cluster_inputs)

# Find which cluster each data point belongs to
clusters = kmeans.predict(cluster_inputs)

# Add the cluster vector to the feature frame (read by the plotting cells)
normalized_eeg_features["Cluster"] = clusters

In [76]:
# visualization via PCA
# reference: https://www.kaggle.com/code/minc33/visualizing-high-dimensional-clusters/notebook#Imports:

# Feature matrix without the cluster labels; every projection below is fitted
# on exactly this input.
pca_input = normalized_eeg_features.drop(columns=["Cluster"])

# One PCA model per target dimensionality (1-D, 2-D and 3-D plots).
pca_1d = PCA(n_components=1)
pca_2d = PCA(n_components=2)
pca_3d = PCA(n_components=3)

# Column naming: "PC<i>_<k>d" is the i-th principal component of the
# projection created for the k-dimensional visualization.
s_1d = pd.DataFrame(pca_1d.fit_transform(pca_input), columns=["PC1_1d"])
s_2d = pd.DataFrame(pca_2d.fit_transform(pca_input), columns=["PC1_2d", "PC2_2d"])
s_3d = pd.DataFrame(
    pca_3d.fit_transform(pca_input),
    columns=["PC1_3d", "PC2_3d", "PC3_3d"],
)

# Name of the projection method, interpolated into the plot titles.
method = 'PCA'

In [20]:
# visualization via t-SNE
# NOTE: this cell reassigns s_1d / s_2d / s_3d and `method`, so once it has run
# the plotting cells show the t-SNE embedding instead of the PCA projection.

# Set our perplexity
perplexity = 50
# t-SNE is stochastic; a fixed seed makes the embeddings reproducible.
tsne_seed = 42

# Feature matrix without the cluster labels; every embedding is fitted on it.
tsne_input = normalized_eeg_features.drop(columns=["Cluster"])

# One t-SNE model per target dimensionality (1-D, 2-D and 3-D plots).
tsne_1d = TSNE(n_components=1, perplexity=perplexity, random_state=tsne_seed)
tsne_2d = TSNE(n_components=2, perplexity=perplexity, random_state=tsne_seed)
tsne_3d = TSNE(n_components=3, perplexity=perplexity, random_state=tsne_seed)

# Column naming: "TC<i>_<k>d" is the i-th component of the embedding created
# for the k-dimensional visualization.
s_1d = pd.DataFrame(tsne_1d.fit_transform(tsne_input))
s_1d.columns = ["TC1_1d"]

s_2d = pd.DataFrame(tsne_2d.fit_transform(tsne_input))
s_2d.columns = ["TC1_2d","TC2_2d"]

s_3d = pd.DataFrame(tsne_3d.fit_transform(tsne_input))
s_3d.columns = ["TC1_3d","TC2_3d","TC3_3d"]

# Name of the projection method, interpolated into the plot titles.
method = 't-SNE'

In [77]:
# Join the clustered features with the 1-D, 2-D and 3-D projections
# column-wise (rows matched on the index), and add a constant "dummy" column
# that serves as the y coordinate of the 1-D plot.
plotX = pd.concat(
    [normalized_eeg_features, s_1d, s_2d, s_3d],
    axis=1,
    join='inner',
).assign(dummy=0)

# One sub-DataFrame of plotX per cluster id; each is drawn as its own trace
# by the plotting cells.
cluster0, cluster1, cluster2, cluster3 = (
    plotX[plotX["Cluster"] == cluster_id] for cluster_id in range(4)
)

In [78]:
# Initialise plotly's offline notebook mode; this is needed so that the
# iplot() calls in the following cells display their figures properly.
# NOTE(review): connected=True presumably loads plotly.js from a CDN instead
# of embedding it in the notebook - confirm against the plotly documentation.
init_notebook_mode(connected=True)

In [79]:
# 1-D scatter plot of the clusters: x is the single projection component and
# y is the constant "dummy" column, so all points lie on one horizontal line.
col_name = s_1d.columns[0]

# (cluster frame, legend label, marker colour) for each of the four clusters
cluster_styles = [
    (cluster0, "Cluster 0", 'rgba(255, 128, 255, 0.8)'),
    (cluster1, "Cluster 1", 'rgba(255, 128, 2, 0.8)'),
    (cluster2, "Cluster 2", 'rgba(0, 255, 200, 0.8)'),
    (cluster3, "Cluster 3", 'rgba(0, 128, 200, 0.8)'),
]

# One marker trace per cluster, in cluster-id order.
data = [
    go.Scatter(
        x=cluster_frame[col_name],
        y=cluster_frame["dummy"],
        mode="markers",
        name=label,
        marker=dict(color=color),
        text=None,
    )
    for cluster_frame, label, color in cluster_styles
]

# The title names the feature set selected by the "^B.*GAMMA$" filter, in the
# same wording as the two-dimensional plot.
title = f"EEG mean GAMMA B: Clusters in One Dimension Using {method}"

layout = dict(
    title=title,
    xaxis=dict(title=col_name, ticklen=5, zeroline=False),
    yaxis=dict(title='', ticklen=5, zeroline=False),
)

fig = dict(data=data, layout=layout)

iplot(fig)

In [80]:
# 2-D scatter plot of the clusters on the two components of s_2d.
col_name0, col_name1 = s_2d.columns[0], s_2d.columns[1]

# (cluster frame, legend label, marker colour) for each of the four clusters
cluster_styles_2d = (
    (cluster0, "Cluster 0", 'rgba(255, 128, 255, 0.8)'),
    (cluster1, "Cluster 1", 'rgba(255, 128, 2, 0.8)'),
    (cluster2, "Cluster 2", 'rgba(0, 255, 200, 0.8)'),
    (cluster3, "Cluster 3", 'rgba(0, 128, 200, 0.8)'),
)

# One marker trace per cluster, in cluster-id order.
data = [
    go.Scatter(
        x=cluster_frame[col_name0],
        y=cluster_frame[col_name1],
        mode="markers",
        name=label,
        marker=dict(color=color),
        text=None,
    )
    for cluster_frame, label, color in cluster_styles_2d
]

# Keep the individual trace names available (trace<N> is 'Cluster <N-1>').
trace1, trace2, trace3, trace4 = data

title = f"EEG mean GAMMA B: Clusters in Two Dimensions Using {method}"

# Axis titles are the names of the two plotted components.
layout = dict(
    title=title,
    xaxis=dict(title=col_name0, ticklen=5, zeroline=False),
    yaxis=dict(title=col_name1, ticklen=5, zeroline=False),
)

fig = dict(data=data, layout=layout)

iplot(fig)

In [None]:
# 3-D scatter plot of the clusters on the three components of s_3d.
col_name0 = s_3d.columns[0]
col_name1 = s_3d.columns[1]
col_name2 = s_3d.columns[2]

# (cluster frame, legend label, marker colour) for each of the four clusters
cluster_styles_3d = [
    (cluster0, "Cluster 0", 'rgba(255, 128, 255, 0.8)'),
    (cluster1, "Cluster 1", 'rgba(255, 128, 2, 0.8)'),
    (cluster2, "Cluster 2", 'rgba(0, 255, 200, 0.8)'),
    (cluster3, "Cluster 3", 'rgba(0, 128, 200, 0.8)'),
]

# One marker trace per cluster, in cluster-id order.
data = [
    go.Scatter3d(
        x=cluster_frame[col_name0],
        y=cluster_frame[col_name1],
        z=cluster_frame[col_name2],
        mode="markers",
        name=label,
        marker=dict(color=color),
        text=None,
    )
    for cluster_frame, label, color in cluster_styles_3d
]

# The title names the feature set selected by the "^B.*GAMMA$" filter, in the
# same wording as the two-dimensional plot.
title = f"EEG mean GAMMA B: Clusters in Three Dimensions Using {method}"

# Scatter3d traces are drawn in a 3-D scene, so their axes are configured
# under layout.scene (x, y and z) rather than the 2-D layout.xaxis/yaxis.
layout = dict(
    title=title,
    scene=dict(
        xaxis=dict(title=col_name0, ticklen=5, zeroline=False),
        yaxis=dict(title=col_name1, ticklen=5, zeroline=False),
        zaxis=dict(title=col_name2, ticklen=5, zeroline=False),
    ),
)

fig = dict(data=data, layout=layout)

iplot(fig)