In [None]:
from collections import defaultdict

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# from importlib import reload
# import feature_extraction
# reload(feature_extraction)
from biomarkers import (
    EEG,
    EMG,
    BP,
    EOG,
    ECG,
    TREV,
    GSR,
    Resp,
    EGG,
    MARKER_TO_CHANNEL_NAMES,
)
from feature_extraction import (
    Feature,
    EEG_BANDS,
    STAT_FEATURES,
)
from data_utils import (
    extract_features,
    extract_labels,
    load_data_from_dir,
    get_all_behaviors_labels,
    get_all_features_by_marker,
    extract_features_by_channel,
    concatenate_all_data,
)
from calculate_correlation import (
    EEG_BANDS_LIST,
    get_all_behaviors_feature_to_pc_by_markers,
    get_all_trials_average_rp_values,
)

from labels import get_categorical_labels, print_label_count

from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
"""
    load features from csv
"""
from dataframe.csv_utils import (
    get_features_from_result,
    get_labels_from_result,
    load_data_from_csv,
)

# Load the previously exported feature table through the project helper.
dir_name = "extracted_features_v1"
result = load_data_from_csv(dir_name)

# Split the loaded result into its label part and its feature part.
all_label_array, label_list = get_labels_from_result(result)
all_feature_array, feature_names = get_features_from_result(result)

# Remove the "index" column and refresh the name list so that it matches the
# columns that are actually left in the feature table.
all_feature_array = all_feature_array.drop(columns=["index"])
feature_names = all_feature_array.columns

print(all_feature_array.shape, len(feature_names), len(label_list))

In [None]:
# Directories holding the raw recordings; the same strings are reused as the
# keys of dir_to_data.
ALL_DIRS = [
    "../CleandDataV1/2017",
    "../CleandDataV1/2018",
    "../CleandDataV1/2020",
    "../CleandDataV1/2024",
    "../CleandDataV1/2025",
]

# Directory path -> whatever load_data_from_dir returns for that directory.
dir_to_data = {data_dir: load_data_from_dir(data_dir) for data_dir in ALL_DIRS}

In [None]:
import os

from dataframe.extraction import (
    extract_features_by_markers,
)

"""
  extract features from physiological signals
"""
# (TODO) GSR, Resp, TREV  BP.__name__, EGG.__name__
markers = [EEG.__name__]

# DataFrame.to_csv does not create missing parent directories, so make sure the
# output folder exists before the first file is written.
os.makedirs("extracted_features_v1", exist_ok=True)

for s in ALL_DIRS:
    # extract features for this single directory only
    df = extract_features_by_markers(markers, dir_to_data, [s])
    #     print(s, df.shape)
    #     subject_features = result[result["Subject"] == s]
    #     subject_features = subject_features.reset_index()

    all_features = df  # pd.concat([df, subject_features], axis=1)
    subject_name = s.replace("../CleandDataV1/", "")

    # extract labels from behavior data
    # The labels are held in a loop-specific name on purpose: other cells use
    # `all_label_array` for the labels of *all* subjects, and rebinding it here
    # would leave only the last directory's labels behind after the loop.
    subject_labels = extract_labels(dir_to_data, all_dir=[s])
    all_features["Valence"] = subject_labels["valence"]
    all_features["Arousal"] = subject_labels["arousal"]
    all_features["Attention"] = subject_labels["attention"]
    # A scalar is broadcast to every row, so this no longer depends on the
    # frame having exactly 130 rows.
    all_features["Subject"] = s

    # save features to csv
    all_features.to_csv(
        f"extracted_features_v1/{subject_name}_features.csv", index=False
    )

In [None]:
"""
    calculate correlation
"""
from dataframe.correlation import (
    get_behavior_to_average_corr,
    get_feature_to_corr_by_behavior,
)

marker = EEG.__name__

# Behavior column name -> result of get_feature_to_corr_by_behavior for that
# column, computed on the loaded feature table with the "pearsonr" method.
behavior_to_rp = {
    behavior: get_feature_to_corr_by_behavior(
        result, behavior, feature_names, marker, "pearsonr"
    )
    for behavior in ["Valence", "Arousal", "Attention"]
}

avg_condition_to_features = get_behavior_to_average_corr(behavior_to_rp)

In [None]:
from plotting import (
    plot_correlation_table_by_channel,
    plot_k_chaneels_by_r_value,
    plot_eeg_topomap_all_blocks,
)

# Plotting scratch cell. The only live statements are the import above, the
# `channel_names` lookup below and the bare string literals that act as section
# titles; every plotting call is currently commented out.
# NOTE(review): the last bare string is the final expression of this cell, so a
# notebook will presumably echo it as the cell's output -- confirm.

""" plot the single channel correlation table
"""
# channel = 1
# features = STAT_FEATURES
# Channel names for whatever `marker` currently holds (set in another cell).
channel_names = MARKER_TO_CHANNEL_NAMES[marker]
# for condition, feature_to_pc in avg_condition_to_features.items():
#     label = f"{channel_names[channel]} {condition}"
#     plot_correlation_table_by_channel(
#         label,
#         feature_to_pc,
#         ["pearson r", "pearson p", "spearman r", "spearman p"],
#         features,
#         channel,
#         True,
#     )

""" plot the top channel correlation table given r values
"""
# for condition, feature_to_pc in avg_condition_to_features.items():
#     features = feature_to_pc.keys()
#     plot_k_chaneels_by_r_value(feature_to_pc, channel_names, features, condition, True, 10)
#     plot_k_chaneels_by_r_value(feature_to_pc, channel_names, features, condition, False, 10)

""" 
    plot the topography for eeg
"""
# NOTE(review): plot_eeg_topomap_one_block is not imported in this cell and
# all_block_names is only assigned in the commented-out section further down;
# both would have to be provided before this snippet is re-enabled.
# for condition, feature_to_pc in avg_condition_to_features.items():
#     plot_eeg_topomap_all_blocks(condition, feature_to_pc)
#     features = feature_to_pc.keys()
#     for f in features:
#         plot_eeg_topomap_one_block(condition, f, feature_to_pc, all_block_names)


""" plot the top channel correlation table with different blocks
"""
# NOTE(review): plot_eeg_pearson_correlation_table is not imported in this cell.
# all_block_names = list(all_data.keys())
# all_block_names.sort()
# for condition, feature_to_pc in avg_condition_to_features.items():
#     plot_eeg_pearson_correlation_table(condition, feature_to_pc, all_block_names, 1)

""" plot the series for the top k channels
"""
# NOTE(review): get_eeg_pearson_correlation_series_all_blocks and plot_series
# are not imported in this cell.
# define number of rows and columns for subplots
# nrow = 3
# ncol = 2
# for condition, feature_to_pc in avg_condition_to_features.items():
#     print(f"{condition}")
#     ser_list = get_eeg_pearson_correlation_series_all_blocks(feature_to_pc, channel_names, k=20)
#     plot_series(nrow, ncol, ser_list)


""" 
    plot the time series given the marker and block
"""
# NOTE(review): plot_time_series_by_epoch is not imported in this cell.
# plot_time_series_by_epoch(all_data['audio_hvla'], 'EMG', 'audio_hvla', 0)

""" 
    plot the scattor for physiological signals
"""
# NOTE(review): plot_pd_scatter_by_marker is not imported in this cell, and the
# directory names listed here do not appear in ALL_DIRS.
# plot_pd_scatter_by_marker("LEOG", result, ['../2007', '../2002', '../2006'])

In [None]:
""" 
    discard high correlated features
"""
# Features whose absolute correlation with an earlier feature reaches this value
# are treated as redundant.
CORR_THRESHOLD = 0.9

corr = all_feature_array.corr()

# Inspect only the strict upper triangle of the matrix. This way
#   * the diagonal (self-correlation) never counts, without relying on an exact
#     floating-point comparison against 1,
#   * every pair is looked at once, so the first feature of a correlated pair is
#     kept and only the later one is discarded (instead of losing both),
#   * NaN correlations (e.g. from a constant column) compare as False and do not
#     cause unrelated features to be dropped.
above_diagonal = np.triu(np.ones(corr.shape, dtype=bool), k=1)
pairwise_abs_corr = corr.abs().where(above_diagonal)
is_redundant = (pairwise_abs_corr >= CORR_THRESHOLD).any(axis=0).to_numpy()

# Select by the labels of the correlation matrix, as before, so the result only
# contains columns that took part in the correlation computation.
kept_features = list(corr.columns[~is_redundant])
all_feature_array = all_feature_array[kept_features]
feature_names = all_feature_array.columns

print(all_feature_array.shape, len(feature_names), len(label_list))

In [None]:
import seaborn as sns

"""
    plot the correlation heatmap
"""
# First and last label of the window that is shown in the heatmap.
first_label, last_label = "D7_ALPHA", "VEOG_VAR"

corr = all_feature_array.corr()

# Sort both axes by label, then cut rows and columns to the same label window.
sorted_corr = corr.sort_index(axis=0).sort_index(axis=1)
truncated_corr = sorted_corr.truncate(before=first_label, after=last_label, axis=0)
truncated_corr = truncated_corr.truncate(
    before=first_label, after=last_label, axis=1
)

plt.figure(figsize=(14, 12))
sns.heatmap(truncated_corr, vmin=-1, vmax=1, annot=False, cmap="RdBu_r")

In [None]:
from labels import binary_label, get_tranformed_labels, print_label_count

# Single cut-off shared by the categorical and the binary valence labels.
valence_threshold = 0.6

transformed = get_tranformed_labels(all_label_array)
label_list = get_categorical_labels(
    all_label_array, valence_threshold=valence_threshold
)
valence_lables = binary_label(all_label_array["valence"], valence_threshold)

print_label_count(label_list)

In [None]:
from sklearn.model_selection import GroupKFold
import shap

colors = ["c", "y", "m", "r"]

NUM_LABEL_PER_SUBJECT = 130
accuracy = []
label_array = np.array(label_list)

# One integer group id per sample: samples 0..129 -> 0, 130..259 -> 1, ...
# Floor division is essential here. True division (i / 130) yields a distinct
# fractional id for every sample, so GroupKFold would see each sample as its
# own group and trials of one subject would end up in both the training and
# the validation fold.
# assumes rows are ordered subject by subject with NUM_LABEL_PER_SUBJECT rows
# each -- TODO confirm against the loader
group_array = np.arange(len(label_list)) // NUM_LABEL_PER_SUBJECT

# GroupKFold cannot build more folds than there are groups (its default is 5).
num_groups = len(np.unique(group_array))
gkf = GroupKFold(n_splits=min(5, num_groups))

# normalize
# NOTE(review): the scaler is fitted on all rows before splitting, so the
# validation rows contribute to the scaling statistics.
scaler = StandardScaler()
normalized_all_feature_array = pd.DataFrame(
    scaler.fit_transform(all_feature_array), columns=all_feature_array.columns
)

list_shap_values = list()
list_test_sets = list()
for train_index, val_index in gkf.split(
    normalized_all_feature_array, label_array, groups=group_array
):
    train_features, train_labels = (
        normalized_all_feature_array.iloc[train_index],
        label_array[train_index],
    )
    val_features, val_labels = (
        normalized_all_feature_array.iloc[val_index],
        label_array[val_index],
    )

    # create model instance
    model = XGBClassifier(n_estimators=2)
    # fit model
    model.fit(train_features, train_labels)
    # Print accuracy.
    acc = model.score(val_features, val_labels)
    print("Accuracy: %.2f%%" % (acc * 100.0))
    accuracy.append(acc)

    # SHAP values of the held-out fold; for each iteration we save the
    # validation indices together with the matching SHAP values so that they
    # can be combined after the loop.
    shap_values = shap.TreeExplainer(model).shap_values(val_features)
    list_shap_values.append(shap_values)
    list_test_sets.append(val_index)
#     color_func = lambda i: colors[i % len(shap_values)]
#     shap.summary_plot(
#         shap_values,
#         all_feature_array.columns,
#         #class_names=["nvla", "nvha", "hvla", "hvha"],
#         #color=color_func,
#     )


In [None]:
# Per-fold accuracies on the first line, their mean on the second.
print(accuracy, np.mean(accuracy), sep="\n")


In [None]:
# Stack the results of all cross-validation iterations in fold order.
test_set = np.concatenate(list_test_sets, axis=0)
# for multi: concatenate along axis=1 instead
shap_values = np.concatenate(
    [np.array(fold_values) for fold_values in list_shap_values], axis=0
)  # for binary

# Bring back the variable names for the rows that were used for validation.
X_test = pd.DataFrame(
    normalized_all_feature_array.iloc[test_set],
    columns=all_feature_array.columns,
)

# Explanation plot for the whole experiment; the first dimension of shap_values
# indicates the class that is predicted (0=0, 1=1).
shap.summary_plot(shap_values, X_test)  # for binary
#shap.summary_plot(shap_values[1], X_test)  # for multi i = class_num

In [None]:
features = STAT_FEATURES
marker = EOG.__name__

# Read the channel names from the first loaded directory. dir_to_data is keyed
# by the entries of ALL_DIRS, so a hard-coded key such as "../2002" is never
# present and raises KeyError.
# assumes every directory exposes the same channels for this marker and
# contains an "audio_hvla" block -- TODO confirm
channel_names = dir_to_data[ALL_DIRS[0]]["audio_hvla"].get_chanlocs(marker)
channel_num = 0
""" 
    extract features from physiological signals
"""
# features_to_trials = extract_features_by_channel(marker, dir_to_data, features, channel_num, channel_names[channel_num])
# dir_name_to_labels = {}
# for dir_name, all_data in dir_to_data.items():
#     dir_name_to_labels[dir_name] = get_all_behaviors_labels(all_data)

In [None]:
""" 
    extract correlation from physiological signals
"""
num_channels = 4
num_blocks = 0

# Directory name -> correlation result for the current marker and feature set.
dir_name_to_ctf = {
    subject_dir: get_all_behaviors_feature_to_pc_by_markers(
        subject_data, marker, features, num_channels, num_blocks
    )
    for subject_dir, subject_data in dir_to_data.items()
}

avg_condition_to_features = get_all_trials_average_rp_values(
    dir_name_to_ctf, features, "pearson"
)
spearman_corr = get_all_trials_average_rp_values(
    dir_name_to_ctf, features, "spearman"
)

# Append the spearman values to the pearson values of every behavior/feature
# pair. Only existing keys are reassigned, so the dictionaries keep their size
# while being iterated.
for behavior, pearson_by_feature in avg_condition_to_features.items():
    for feature_name in pearson_by_feature:
        pearson_by_feature[feature_name] = np.hstack(
            (pearson_by_feature[feature_name], spearman_corr[behavior][feature_name])
        )


In [None]:
""" 
    concatenate raw signal (x)
"""
data_list = []
block_names = sorted(dir_to_data[ALL_DIRS[0]].keys())

for m in [EGG.__name__, EEG.__name__]:  # EOG.__name__, , EMG.__name__, EGG.__name__
    all_participants_data, condition_to_labels = concatenate_all_data(dir_to_data, m)
    # Swap the first and last axis:
    # (num_channels, num_data_points, num_epochs) => (num_epochs, num_data_points, num_channels)
    all_epoch_data = np.swapaxes(all_participants_data, 0, -1)
    data_list.append(all_epoch_data)

# Join the per-marker arrays along the third axis.
data_array = np.concatenate(data_list, axis=2)

In [None]:
from tensorflow.keras import utils as np_utils
from models import cnn_model, train_with_cnn

""" 
    prepare labels (y)
"""
# One-hot encode the labels into four classes: nvla, nvha, hvla, hvha
label_array = np_utils.to_categorical(label_list, num_classes=4)
# label_array = np.array(label_list['arousal'])

print_label_count(label_list)
print(data_array.shape, label_array.shape, group_array.shape)

# The third axis of data_array supplies the channel count for the model.
num_channel = data_array.shape[2]

model = cnn_model(12288, num_channel)
model.summary()

In [None]:
""" 
    with CNN
"""
# Run the CNN training helper, then print its result followed by the mean.
accuracy = train_with_cnn(
    12288, num_channel, data_array, label_array, group_array
)
print(accuracy, np.mean(accuracy), sep="\n")

In [None]:
""" 
    with logistic regression
"""
# Disabled experiment: everything below is commented out, so this cell runs no
# training code.
# NOTE(review): before re-enabling, check the names used here -- `all_features`
# is only assigned inside the per-directory export loop, and `get_feature_names`
# is not imported anywhere in the visible notebook.
# from models import train_with_logistic

# label_name = "attention"
# best_model = train_with_logistic(all_features, label_name, condition_to_labels, group_array)
# name_to_transformed = get_tranformed_labels(condition_to_labels)

# # assume bigger coefficents has more contribution to the model
# # but have to be sure that the features has THE SAME SCALE otherwise this assumption is not correct.
# importance = best_model["classifier"].coef_[0]

# feat_importances = pd.Series(importance, index=get_feature_names(importance))
# feat_importances.nlargest(10).plot(
#     kind="barh", title=f"{label_name} Feature Importance"
# )
