In [None]:
# Basic imports
import numpy as np
import pandas as pd
import os

from dataframe.csv_utils import (
    load_data_from_csv,
    get_labels_from_result,
    get_features_from_result,
)


# For machine learning modeling
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from constants import AUDIO_BLOCKS

# Number of consecutive label rows attributed to one subject; the XGBoost
# cross-validation cell divides the row index by this value to derive group ids.
NUM_LABEL_PER_SUBJECT = 130

# Constants

In [None]:
# Marker analysed in the CNN section; reused as the channel name and in the plot title.
marker = 'ECG'
# Root folder that is listed later on; every sub-directory is treated as one subject.
data_dir = "../CleandDataV2/"

# CNN

## Load data

In [None]:
import pickle

# Load the pre-computed ECG RR-peak features (one entry per subject).
# NOTE: pickle can execute arbitrary code on load -- only open trusted files.
with open('./data/ecg_rr_peaks.pkl', 'rb') as fp:
    ecg_features = pickle.load(fp)['ecg_rr_peaks']

channel_name = marker + ''
feature_name = 'ECG_HRV_RR' #Feature.ECG_LFHF.name

# Wrap every subject's array as {channel: {feature: data}}, the nested layout
# that prepare_dataset() iterates over.
marker_features = [
    {channel_name: {feature_name: subject_data}} for subject_data in ecg_features
]
marker_features[0][channel_name][feature_name].shape

In [None]:
# Load the behavioural labels pickle; each entry holds the sliced labels
# (several slices per trial) indexed by subject position.
# NOTE: pickle can execute arbitrary code on load -- only open trusted files.
with open("./data/behavioral_labels.pkl", "rb") as fp:
    behavioral_labels = pickle.load(fp)

# Only valence and arousal are used below; the attention labels are read but discarded.
sliced_valence_labels, sliced_arousal_labels, _ = (
    behavioral_labels["valence_labels"],
    behavioral_labels["arousal_labels"],
    behavioral_labels["attention_labels"],
)
# Sanity check: number of subjects and number of sliced labels of the first subject.
print(len(sliced_valence_labels), len(sliced_arousal_labels[0]))

# Reduce the sliced labels to one label per trial for every subject directory.
# NOTE(review): subjects are paired with label entries purely by their position
# in os.listdir() order -- confirm the pickle was produced with the same ordering.
num_slice_per_trial = 5
valence_labels = []
arousal_labels = []
label_thresholds = []
subject_list = []
si = 0
for d in os.listdir(data_dir):
    # Only sub-directories count as subjects; stray files are skipped.
    if not os.path.isdir(data_dir + d):
        continue

    vls = sliced_valence_labels[si]
    als = sliced_arousal_labels[si]
    si += 1

    # Keep the first slice of every trial (every num_slice_per_trial-th entry).
    valence_labels.append(np.array(vls)[::num_slice_per_trial])
    arousal_labels.append(np.array(als)[::num_slice_per_trial])
    # Per-subject binarisation thresholds: mean over all sliced labels.
    label_thresholds.append((np.mean(vls), np.mean(als)))
    subject_list.append(d)

len(subject_list)

## Prepare labels and dataset builder

In [None]:
from resample.resample import get_consecutive_validation_indexes
from model.dataset import DatasetBuilder

# Step between the start values handed to get_consecutive_validation_indexes;
# it is also passed through to that helper as its last argument.
n_step_trial = 3
# Overrides the earlier value of 5: the labels were already reduced to one
# entry per trial in the previous cell.
num_slice_per_trial = 1
# One validation index group per fold, for i = 1, 4, 7, 10.
# NOTE(review): the exact meaning of each helper argument lives in
# resample.resample -- confirm there before changing the range bounds.
val_indexes = [
    get_consecutive_validation_indexes(
        len(valence_labels[0]), len(AUDIO_BLOCKS), num_slice_per_trial, i, n_step_trial
    )
    for i in range(1, 13, n_step_trial)
]
print(len(val_indexes), val_indexes)


# The builder is sized from the first subject's label count and reused for all
# subjects -- assumes every subject has the same number of trials (TODO confirm).
dataset_builder = DatasetBuilder(len(valence_labels[0]), val_indexes_group=val_indexes)
len(valence_labels[0])

In [None]:
import tensorflow as tf

from tensorflow import keras
from keras import layers, initializers
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers.legacy import Adam

def get_label_category(labels, label_type, v_thred, a_thred):
    """Binarise continuous labels against the threshold of the given label type.

    Args:
        labels: iterable of numeric label values.
        label_type: "arousal" selects ``a_thred``; any other value selects ``v_thred``.
        v_thred: valence threshold.
        a_thred: arousal threshold.

    Returns:
        A list with 0 for every label strictly below the threshold and 1 otherwise.
    """
    if label_type == "arousal":
        threshold = a_thred
    else:
        threshold = v_thred
    # `not (p < threshold)` keeps the original semantics: anything that is not
    # strictly below the threshold maps to class 1.
    return [int(not (p < threshold)) for p in labels]

def prepare_dataset(
    data_dict,
    dataset_builder,
    labels,
):
    """Split every feature array of every channel into train/validation folds.

    Args:
        data_dict: mapping ``{channel: {feature: data}}``.
        dataset_builder: object exposing ``train_test_split(data, other, labels)``.
        labels: labels forwarded unchanged to ``train_test_split``.

    Returns:
        A mapping with the same channel/feature keys whose values are whatever
        ``dataset_builder.train_test_split(data, [], labels)`` returns.
    """
    # The second argument is always an empty list: no secondary modality is combined here.
    return {
        channel: {
            feature: dataset_builder.train_test_split(feature_data, [], labels)
            for feature, feature_data in features.items()
        }
        for channel, features in data_dict.items()
    }


def create_model(input_x: int, input_y: int=1, units=8, dropout=0.01):
    """Build and compile a small 1-D CNN classifier with a two-way softmax output.

    Args:
        input_x: first dimension of the per-sample input shape.
        input_y: second dimension of the per-sample input shape (default 1).
        units: number of Conv1D filters; the dense layer uses ``int(units/2)`` units.
        dropout: rate shared by the spatial dropout and the dense dropout.

    Returns:
        A compiled ``keras.Model`` (Adam, learning rate 3e-4, categorical
        cross-entropy loss, accuracy metric).
    """
    inputs = keras.Input(shape=(input_x, input_y))

    # Convolutional stage: causal Conv1D -> GELU -> spatial dropout.
    features = layers.Conv1D(
        filters=units,
        kernel_size=3,
        dilation_rate=1,
        groups=1,
        padding='causal',
        kernel_initializer=initializers.he_normal(seed=11),
    )(inputs)
    features = layers.Activation('gelu')(features)
    features = layers.SpatialDropout1D(rate=dropout)(features)

    # Dense head: flatten -> half-width ReLU layer -> dropout -> softmax.
    hidden = layers.Flatten()(features)
    hidden = layers.Dense(int(units/2), activation='relu')(hidden)
    hidden = layers.Dropout(dropout)(hidden)
    probabilities = layers.Dense(2, activation="softmax")(hidden)

    model = keras.Model(inputs=inputs, outputs=probabilities)
    model.compile(
        loss='categorical_crossentropy',
        metrics=["accuracy"],
        optimizer=Adam(learning_rate=3e-4),
    )
    return model

In [None]:
from keras.callbacks import EarlyStopping

# Per-subject CNN decoding: for every subject and label type, train one model
# per cross-validation fold and record the mean validation accuracy.
subject_accuracy_summary = {
    "subject": [],
    "channel": [],
    "feature": [],
    "label_type": [],
    "cv_mean_score": [],
}

for subject_idx, subj in enumerate(subject_list):
    print("decoding subject...", subj)

    v_thred, a_thred = label_thresholds[subject_idx]
    for lt in ["valence", "arousal"]:
        labels = (
            valence_labels[subject_idx]
            if lt == "valence"
            else arousal_labels[subject_idx]
        )

        dataset_dict = prepare_dataset(
            marker_features[subject_idx],
            dataset_builder,
            labels,
        )

        for channel, feature_to_data in dataset_dict.items():
            for f, dataset in feature_to_data.items():
                scores = []
                for train_data, train_labels, val_data, val_labels in dataset:
                    # One-hot encode the binarised labels for the softmax output.
                    normalized_train_labels = tf.keras.utils.to_categorical(
                        get_label_category(train_labels, lt, v_thred, a_thred),
                        num_classes=2,
                    )
                    normalized_val_labels = tf.keras.utils.to_categorical(
                        get_label_category(val_labels, lt, v_thred, a_thred),
                        num_classes=2,
                    )
                    # The first data is eeg and then ecg...
                    # The loop variable must NOT be called `idx`/`subject_idx`:
                    # reusing the subject index here made the following
                    # label-type pass read another subject's labels and features.
                    for input_idx, td in enumerate(train_data):
                        # Entry 1 is skipped; presumably it is the empty secondary
                        # input that prepare_dataset passes as [] (TODO confirm).
                        if input_idx == 1:
                            continue
                        standard_scaler = StandardScaler()

                        # Fit the scaler on the training fold only, then add the
                        # trailing channel axis expected by Conv1D.
                        X_train_standard = standard_scaler.fit_transform(td)
                        X_test_standard = standard_scaler.transform(val_data[input_idx])
                        X_train_standard = np.expand_dims(X_train_standard, axis=2)
                        X_test_standard = np.expand_dims(X_test_standard, axis=2)

                        best_model = create_model(
                            td.shape[-1], units=64, dropout=0.1
                        )  # 0.05
                        callbacks = [
                            EarlyStopping(
                                monitor="val_loss",
                                patience=35,
                                restore_best_weights=True,
                            )
                        ]
                        history = best_model.fit(
                            x=X_train_standard,
                            y=normalized_train_labels,
                            validation_data=(X_test_standard, normalized_val_labels),
                            callbacks=callbacks,
                            epochs=200,
                            verbose=0,
                            batch_size=20,
                        )
                        # NOTE(review): this is the accuracy of the last epoch run,
                        # not necessarily of the weights kept by
                        # restore_best_weights; the validation fold also drives
                        # early stopping, so the score is optimistic.
                        scores.append(history.history["val_accuracy"][-1])
                print(np.mean(scores), scores)
                subject_accuracy_summary["subject"].append(subj)
                subject_accuracy_summary["channel"].append(channel)
                subject_accuracy_summary["feature"].append(f)
                subject_accuracy_summary["cv_mean_score"].append(np.mean(scores))
                subject_accuracy_summary["label_type"].append(lt)

In [None]:
# Turn the collected records into a DataFrame; subject ids become integers.
subject_accuracy_summary = pd.DataFrame(subject_accuracy_summary)
subject_accuracy_summary["subject"] = subject_accuracy_summary["subject"].astype(int)
# Grand mean of the per-subject CV scores, valence first and then arousal.
for summary_label_type in ('valence', 'arousal'):
    rows_of_type = subject_accuracy_summary["label_type"] == summary_label_type
    print(subject_accuracy_summary.loc[rows_of_type, 'cv_mean_score'].mean())
subject_accuracy_summary.head()

In [None]:
import seaborn as sns 

# Plot the per-subject CV accuracy for each label type, with the group mean
# overlaid and a dashed line at 0.5.
data = subject_accuracy_summary
title = f"{marker} HRV time domain - CNN "  # title shown above the axes
# One point per row of the summary, split by label type.
g = sns.swarmplot(
    data=data,
    x="label_type",
    y="cv_mean_score",
    alpha=0.6,
    dodge=True,
    legend=False,
)
g.set_ylim((0.2, 1))
g.set_title(title)

# Mean score per (label type, channel), drawn as a wide horizontal marker.
df_means = (
    data.groupby(["label_type", "channel"])["cv_mean_score"].agg("mean").reset_index()
)

# No axes is passed, so this presumably draws on the current axes created by
# the swarmplot above (TODO confirm when moving this code).
pp = sns.pointplot(
    x="label_type",
    y="cv_mean_score",
    data=df_means,
    linestyles="",
    scale=2.5,
    markers="_",
    order=["valence", "arousal"],
)
sns.despine(bottom = True, left = True)
# Horizontal reference line at 0.5.
g.axhline(0.5, color="red", dashes=(2, 2))
#sns.move_legend(pp, "upper right", bbox_to_anchor=(1.4, 1))


# XGBoost

In [None]:
"""
    load features from csv
"""
import shap

from labels import (
    get_tranformed_labels,
    binary_label,
    print_label_count,
    get_categorical_labels,
)

from sklearn.model_selection import GroupKFold
import xgboost as xgb
from xgboost import XGBClassifier

# Folder name handed to load_data_from_csv (note: this rebinds `dir_name`,
# which the label-loading cell also used as a loop temporary).
dir_name = "eeg_features1"
result = load_data_from_csv(dir_name)

# Split the loaded result into the label table and the feature table.
# NOTE(review): the meaning of the trailing `False` flag is defined in
# dataframe.csv_utils -- confirm there.
all_label_array, label_list = get_labels_from_result(result)
all_feature_array, feature_names = get_features_from_result(
    result, ["Subject", "Valence", "Arousal", "Attention"], False
)
# all_feature_array = all_feature_array.drop(["index"], axis=1)

# Optional regex filters to keep only a subset of the feature columns:
# filter_pattern = ".*(?<!BETA2)$"
# only_specific_feature = ".*GAMMA$"
# all_feature_array = all_feature_array.filter(regex=only_specific_feature)
# The returned feature_names is replaced by the actual column index of the frame.
feature_names = all_feature_array.columns
print(all_feature_array.shape, len(feature_names), len(label_list))

In [None]:
from sklearn.decomposition import PCA  # Principal Component Analysis

# Standardise every feature column (zero mean, unit variance).
# NOTE(review): the scaler is fitted on ALL rows before any train/validation
# split, so the later cross-validation folds see statistics from their own
# validation data.
scaler = StandardScaler()
scaled_feature_values = scaler.fit_transform(all_feature_array)
normalized_all_feature_array = pd.DataFrame(
    scaled_feature_values,
    columns=all_feature_array.columns,
)

# Optional dimensionality reduction, currently disabled:
# reduced_data = PCA(n_components=0.95).fit_transform(normalized_all_feature_array)
# normalized_all_feature_array = pd.DataFrame(reduced_data)
normalized_all_feature_array.head()

In [None]:
# `transformed` is not used by any later cell in this notebook.
transformed = get_tranformed_labels(all_label_array)
# This categorical label_list is overwritten by the binary valence labels below.
label_list = get_categorical_labels(all_label_array, valence_threshold=0.6)
# Binary valence labels using threshold 0.65.
# NOTE(review): this differs from the 0.6 used on the previous line -- confirm
# which threshold is intended.
valence_lables = binary_label(all_label_array["valence"], 0.65)
# Selects the binary branch of the SHAP aggregation cell.
is_multi = False

# The XGBoost cells below train on the binary valence labels.
label_list = valence_lables
print_label_count(label_list)

In [None]:
# Wrap the scaled features and the binary labels for xgb.cv.
housing_dmatrix = xgb.DMatrix(
    data=normalized_all_feature_array,
    label=np.array(label_list),
)

# Base booster parameters; 'eta' is set for each sweep iteration below.
# NOTE(review): a squared-error regression objective scored by RMSE is applied
# to binary class labels here -- confirm this is intended.
params = {"objective":"reg:squarederror", "max_depth":3}

# Learning rates to try, and the final-round test RMSE obtained for each.
eta_vals = [0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5]
best_rmse = []

for curr_val in eta_vals:
    params['eta'] = curr_val

    # 3-fold cross-validation with at most 10 boosting rounds.
    cv_results = xgb.cv(
        dtrain=housing_dmatrix,
        params=params,
        nfold=3,
        num_boost_round=10,
        early_stopping_rounds=5,
        metrics='rmse',
        seed=123,
        as_pandas=True,
    )

    # The last row holds the metric of the final boosting round that was kept.
    best_rmse.append(cv_results['test-rmse-mean'].iloc[-1])

# One row per eta value with its final-round RMSE.
print(pd.DataFrame({'eta': eta_vals, 'best_rmse': best_rmse}))

In [None]:
from sklearn.model_selection import GridSearchCV

# Create the parameter grid: gbm_param_grid
# Hyper-parameter grid searched below: 3 column-subsample ratios x 2 tree depths.
gbm_param_grid = {
    'colsample_bytree': [0.3, 0.5, 0.7],
    'n_estimators': [50],
    'max_depth': [2, 5]
}
eval_metric = ["auc","error"]
# Instantiate the binary classifier (not a regressor, despite the variable names).
gbm = xgb.XGBClassifier(use_label_encoder=False, objective= 'binary:logistic',eval_metric=eval_metric)

# Grid search scored by accuracy with 5-fold CV.
# NOTE(review): no `groups` are passed, so the folds are not split by subject,
# unlike the GroupKFold cell further down.
grid_mse = GridSearchCV(param_grid=gbm_param_grid, estimator=gbm, 
                        scoring='accuracy', cv=5, verbose=2)

# Fit the grid search on the scaled features and binary labels.
grid_mse.fit(normalized_all_feature_array, np.array(label_list))

# Print the best parameters and the best mean cross-validated accuracy.
print("Best parameters found: ", grid_mse.best_params_)
#print("Lowest RMSE found: ", np.sqrt(np.abs(grid_mse.best_score_))) neg_mean_squared_error
print(grid_mse.best_score_)

In [None]:
# Subject-grouped cross-validation of an XGBoost classifier, collecting the
# SHAP values of every validation fold.
colors = ["c", "y", "m", "r"]


accuracy = []
gkf = GroupKFold()
label_array = np.array(label_list)
# Group id = subject index. Integer division is essential: true division
# (i / NUM_LABEL_PER_SUBJECT) yields a distinct float per row, which puts every
# sample in its own group and silently disables the per-subject split.
# Assumes rows are ordered subject by subject with NUM_LABEL_PER_SUBJECT rows
# each (TODO confirm against the CSV loader).
group_array = np.arange(len(label_array)) // NUM_LABEL_PER_SUBJECT

list_shap_values = list()
list_test_sets = list()
for train_index, val_index in gkf.split(
    normalized_all_feature_array, label_array, groups=group_array
):
    train_features, train_labels = (
        normalized_all_feature_array.iloc[train_index],
        label_array[train_index],
    )
    val_features, val_labels = (
        normalized_all_feature_array.iloc[val_index],
        label_array[val_index],
    )

    # create model instance
    model = XGBClassifier(n_estimators=2)
    # fit model
    model.fit(train_features, train_labels)
    # Print accuracy on the held-out subjects.
    acc = model.score(val_features, val_labels)
    print("Accuracy: %.2f%%" % (acc * 100.0))
    accuracy.append(acc)

    # SHAP values of the validation rows for this fold's model.
    shap_values = shap.TreeExplainer(model).shap_values(val_features)
    # for each iteration we save the test_set index and the shap_values
    list_shap_values.append(shap_values)
    list_test_sets.append(val_index)

In [None]:
# combining results from all iterations
test_set = list_test_sets[0]
shap_values = np.array(list_shap_values[0])
for i in range(1, len(list_test_sets)):
    test_set = np.concatenate((test_set, list_test_sets[i]), axis=0)
    shap_values = (
        np.concatenate((shap_values, np.array(list_shap_values[i])), axis=1)
        if is_multi
        else np.concatenate((shap_values, np.array(list_shap_values[i])), axis=0)
    )  # for binary

# bringing back variable names
X_test = pd.DataFrame(
    normalized_all_feature_array.iloc[test_set], columns=feature_names
)

# creating explanation plot for the whole experiment, the first dimension from shap_values indicate the class we are predicting (0=0, 1=1)
if is_multi:
    shap.summary_plot(shap_values[1], X_test)  # for multi i = class_num
else:
    shap.summary_plot(shap_values, X_test)  # for binary