In [1]:
# Basic imports
import numpy as np
import pandas as pd

from dataframe.csv_utils import (
    load_data_from_csv,
    get_labels_from_result,
    get_features_from_result,
)

from labels import (
    get_tranformed_labels,
    binary_label,
    print_label_count,
    get_categorical_labels,
)

# For machine learning modeling
from sklearn.model_selection import GroupKFold
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import shap

  from pandas import MultiIndex, Int64Index
2023-03-02 19:57:36.005261: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
"""
    load features from csv
"""

# Regex selectors applied to feature-column names.
only_specific_feature = ".*GAMMA$"  # names that end in GAMMA
filter_pattern = ".*(?<!BETA2)$"  # names that do not end in BETA2 (defined, not applied in this cell)

# Load the stored feature tables and split them into labels and features.
dir_name = "eeg_features"
result = load_data_from_csv(dir_name)
all_label_array, label_list = get_labels_from_result(result)
# NOTE(review): the column list and the trailing False are forwarded as-is to
# get_features_from_result; presumably they name the non-feature columns -
# confirm against dataframe.csv_utils.
all_feature_array, feature_names = get_features_from_result(
    result,
    ["Subject", "Valence", "Arousal", "Attention"],
    False,
)

# Keep only the columns matched by only_specific_feature and refresh the names.
all_feature_array = all_feature_array.filter(regex=only_specific_feature)
feature_names = all_feature_array.columns
print(all_feature_array.shape, len(feature_names), len(label_list))

(2600, 128) 128 2600


In [3]:
from sklearn.decomposition import PCA  # Principal Component Analysis

# Standardize every feature column and keep the original column labels.
# NOTE(review): the scaler is fitted on all rows before any train/validation
# split, so later folds share scaling statistics - confirm this is acceptable.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(all_feature_array)
normalized_all_feature_array = pd.DataFrame(
    data=scaled_values,
    columns=all_feature_array.columns,
)

# Optional PCA reduction, currently disabled:
# reduced_data = PCA(n_components=0.95).fit_transform(normalized_all_feature_array)
# normalized_all_feature_array = pd.DataFrame(reduced_data)
normalized_all_feature_array.head()

Unnamed: 0,A1 _GAMMA,A2_GAMMA,A3_GAMMA,A4_GAMMA,A5_GAMMA,A6_GAMMA,A7_GAMMA,A8_GAMMA,A9_GAMMA,A10_GAMMA,...,D23_GAMMA,D24_GAMMA,D25_GAMMA,D26_GAMMA,D27_GAMMA,D28_GAMMA,D29_GAMMA,D30_GAMMA,D31_GAMMA,D32_GAMMA
0,-0.47909,-0.311888,-0.492659,-0.347831,-0.116001,-0.291658,-0.126837,-0.478507,-0.53826,-0.564768,...,-0.239421,-0.304126,-0.158946,-0.131946,-0.000331,-0.150412,-0.145069,-0.367127,-0.29908,-0.237976
1,-0.505647,-0.2866,-0.428497,-0.29902,-0.128029,-0.296326,-0.117517,-0.429467,-0.54001,-0.558105,...,-0.221309,-0.292674,-0.144726,-0.114893,-0.060822,-0.191325,-0.128049,-0.33566,-0.252869,-0.202727
2,-0.401754,-0.221094,-0.426138,-0.322846,-0.102756,-0.291172,-0.085032,-0.326415,-0.354534,-0.41378,...,-0.224512,-0.292734,-0.149627,-0.123155,-0.116123,-0.226912,-0.096278,-0.293091,-0.196464,-0.144948
3,-0.357187,-0.182894,-0.378165,-0.267862,-0.058537,-0.21711,0.258177,-0.256747,-0.321044,-0.427033,...,-0.187362,-0.037455,0.389004,-0.092358,-0.045344,-0.211147,1.028505,0.160161,-0.174986,-0.006723
4,-0.377578,-0.165979,-0.386596,-0.297554,-0.095457,-0.276595,-0.067011,-0.327632,-0.315442,-0.360763,...,-0.210668,-0.281156,-0.131378,-0.101341,-0.107014,-0.218165,-0.073542,-0.277478,-0.1646,-0.100293


In [4]:
# Build the label encodings from the raw label table.
# NOTE(review): the helper calls below appear to print label counts as a side
# effect (see the cell output), so their order determines what is printed -
# confirm in labels.py.
# `transformed` is not referenced again in the visible cells.
transformed = get_tranformed_labels(all_label_array)
# This categorical assignment is overwritten by the binary valence labels below.
label_list = get_categorical_labels(all_label_array, valence_threshold=0.6)
# Binary valence labels; 0.65 is the second positional argument (presumably the
# split threshold - note it differs from the 0.6 used above; confirm intent).
valence_lables = binary_label(all_label_array["valence"], 0.65)
# Flag read by the SHAP aggregation cell to choose the multi-class or binary path.
is_multi = False

# The binary valence labels are the labels used by the following cells.
label_list = valence_lables
print_label_count(label_list)

0 label: 188, 1 label: 2412
0 label: 1370, 1 label: 1230
0 label: 185, 1 label: 2415
0 label: 1208, 1 label: 1392
{0: 1208, 1: 1392, 2: 0, 3: 0}


In [9]:
# Pack features and labels into a DMatrix; the later sweep cells reuse it.
housing_dmatrix = xgb.DMatrix(
    data=normalized_all_feature_array,
    label=np.array(label_list),
)

# Fixed booster settings; "eta" is filled in on every iteration below.
# NOTE(review): this is a regression objective scored with RMSE, while a later
# cell fits XGBClassifier on the same label_list - confirm this is intended.
params = {"objective": "reg:squarederror", "max_depth": 3}

# Candidate learning rates and the score collected for each of them.
eta_vals = [0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5]
best_rmse = []

# Systematically vary eta.
for curr_val in eta_vals:
    params["eta"] = curr_val

    # 3-fold cross-validation, at most 10 boosting rounds, early stopping after 5.
    cv_results = xgb.cv(
        params=params,
        dtrain=housing_dmatrix,
        num_boost_round=10,
        nfold=3,
        metrics="rmse",
        early_stopping_rounds=5,
        as_pandas=True,
        seed=123,
    )

    # Keep the mean test RMSE from the last row of the returned frame.
    best_rmse.append(cv_results["test-rmse-mean"].iloc[-1])

# Tabulate each eta against its score.
print(pd.DataFrame({"eta": eta_vals, "best_rmse": best_rmse}))

pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.


     eta  best_rmse
0  0.001   0.499544
1  0.010   0.495831
2  0.100   0.472759
3  0.200   0.461789
4  0.300   0.454020
5  0.400   0.454511
6  0.500   0.456266


In [13]:
# Fixed booster settings; "max_depth" is filled in on every iteration below.
params = {"objective": "reg:squarederror", "eta": 0.3}

# Candidate tree depths and the score collected for each of them.
max_depths = [2, 3, 5, 10, 20]
best_rmse = []

for curr_val in max_depths:
    params["max_depth"] = curr_val

    # 2-fold cross-validation, at most 10 boosting rounds, early stopping after 5.
    cv_results = xgb.cv(
        params=params,
        dtrain=housing_dmatrix,
        num_boost_round=10,
        nfold=2,
        metrics="rmse",
        early_stopping_rounds=5,
        as_pandas=True,
        seed=123,
    )

    # Keep the mean test RMSE from the last row of the returned frame.
    best_rmse.append(cv_results["test-rmse-mean"].iloc[-1])

# Tabulate each depth against its score.
print(pd.DataFrame({"max_depth": max_depths, "best_rmse": best_rmse}))

   max_depth  best_rmse
0          2   0.467203
1          3   0.458538
2          5   0.444916
3         10   0.448686
4         20   0.458807


In [17]:
# Fixed booster settings; "colsample_bytree" is filled in on every iteration below.
params = {"objective": "reg:squarederror", "max_depth": 5, "eta": 0.3}

# Candidate column-subsampling ratios and the score collected for each of them.
colsample_bytree_vals = [0.3, 0.5, 0.8, 1]
best_rmse = []

# Systematically vary the hyperparameter value.
for curr_val in colsample_bytree_vals:
    params["colsample_bytree"] = curr_val

    # 2-fold cross-validation, at most 10 boosting rounds, early stopping after 5.
    cv_results = xgb.cv(
        params=params,
        dtrain=housing_dmatrix,
        num_boost_round=10,
        nfold=2,
        metrics="rmse",
        early_stopping_rounds=5,
        as_pandas=True,
        seed=123,
    )

    # Keep the mean test RMSE from the last row of the returned frame.
    best_rmse.append(cv_results["test-rmse-mean"].iloc[-1])

# Tabulate each ratio against its score.
print(
    pd.DataFrame(
        {"colsample_bytree": colsample_bytree_vals, "best_rmse": best_rmse}
    )
)

   colsample_bytree  best_rmse
0               0.3   0.451638
1               0.5   0.444846
2               0.8   0.444688
3               1.0   0.444916


In [5]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid explored by the search.
gbm_param_grid = {
    "colsample_bytree": [0.3, 0.5, 0.7],
    "n_estimators": [50],
    "max_depth": [2, 5],
}
eval_metric = ["auc", "error"]

# Base estimator: an XGBoost classifier with a binary-logistic objective.
gbm = xgb.XGBClassifier(
    objective="binary:logistic",
    eval_metric=eval_metric,
    use_label_encoder=False,
)

# Exhaustive search scored by accuracy with cv=5.
# (The grid_mse name is historical; no MSE/RMSE is computed in this cell.)
grid_mse = GridSearchCV(
    estimator=gbm,
    param_grid=gbm_param_grid,
    scoring="accuracy",
    cv=5,
    verbose=2,
)

# Run the search on the normalized features and the current labels.
grid_mse.fit(normalized_all_feature_array, np.array(label_list))

# Report the best parameter combination and its cross-validated accuracy.
print("Best parameters found: ", grid_mse.best_params_)
print(grid_mse.best_score_)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] END .colsample_bytree=0.3, max_depth=2, n_estimators=50; total time=   0.1s


pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.
pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.


[CV] END .colsample_bytree=0.3, max_depth=2, n_estimators=50; total time=   0.1s
[CV] END .colsample_bytree=0.3, max_depth=2, n_estimators=50; total time=   0.1s
[CV] END .colsample_bytree=0.3, max_depth=2, n_estimators=50; total time=   0.1s


pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.
pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.
pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.


[CV] END .colsample_bytree=0.3, max_depth=2, n_estimators=50; total time=   0.1s
[CV] END .colsample_bytree=0.3, max_depth=5, n_estimators=50; total time=   0.2s


pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.
pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.


[CV] END .colsample_bytree=0.3, max_depth=5, n_estimators=50; total time=   0.2s
[CV] END .colsample_bytree=0.3, max_depth=5, n_estimators=50; total time=   0.2s


pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.
pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.


[CV] END .colsample_bytree=0.3, max_depth=5, n_estimators=50; total time=   0.2s
[CV] END .colsample_bytree=0.3, max_depth=5, n_estimators=50; total time=   0.2s


pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.
pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.


[CV] END .colsample_bytree=0.5, max_depth=2, n_estimators=50; total time=   0.1s
[CV] END .colsample_bytree=0.5, max_depth=2, n_estimators=50; total time=   0.1s


pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.
pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.


[CV] END .colsample_bytree=0.5, max_depth=2, n_estimators=50; total time=   0.1s
[CV] END .colsample_bytree=0.5, max_depth=2, n_estimators=50; total time=   0.1s


pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.
pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.


[CV] END .colsample_bytree=0.5, max_depth=2, n_estimators=50; total time=   0.1s


pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.


[CV] END .colsample_bytree=0.5, max_depth=5, n_estimators=50; total time=   0.2s


pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.


[CV] END .colsample_bytree=0.5, max_depth=5, n_estimators=50; total time=   0.2s


pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.


[CV] END .colsample_bytree=0.5, max_depth=5, n_estimators=50; total time=   0.2s


pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.


[CV] END .colsample_bytree=0.5, max_depth=5, n_estimators=50; total time=   0.2s


pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.


[CV] END .colsample_bytree=0.5, max_depth=5, n_estimators=50; total time=   0.2s
[CV] END .colsample_bytree=0.7, max_depth=2, n_estimators=50; total time=   0.1s


pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.
pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.


[CV] END .colsample_bytree=0.7, max_depth=2, n_estimators=50; total time=   0.1s
[CV] END .colsample_bytree=0.7, max_depth=2, n_estimators=50; total time=   0.1s


pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.
pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.


[CV] END .colsample_bytree=0.7, max_depth=2, n_estimators=50; total time=   0.1s
[CV] END .colsample_bytree=0.7, max_depth=2, n_estimators=50; total time=   0.2s


pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.
pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.


[CV] END .colsample_bytree=0.7, max_depth=5, n_estimators=50; total time=   0.3s


pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.


[CV] END .colsample_bytree=0.7, max_depth=5, n_estimators=50; total time=   0.3s


pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.


[CV] END .colsample_bytree=0.7, max_depth=5, n_estimators=50; total time=   0.4s


pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.


[CV] END .colsample_bytree=0.7, max_depth=5, n_estimators=50; total time=   0.3s


pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.


[CV] END .colsample_bytree=0.7, max_depth=5, n_estimators=50; total time=   0.3s
Best parameters found:  {'colsample_bytree': 0.5, 'max_depth': 2, 'n_estimators': 50}
0.4638461538461539


pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.


In [None]:
colors = ["c", "y", "m", "r"]

NUM_LABEL_PER_SUBJECT = 130
accuracy = []
gkf = GroupKFold()
label_array = np.array(label_list)

# One integer subject id per sample. Floor division is required: true division
# (i / NUM_LABEL_PER_SUBJECT) produces a distinct float for every row, which
# makes each sample its own group, so GroupKFold would not keep a subject's
# samples together in one fold.
# NOTE(review): assumes rows are ordered subject by subject, with exactly
# NUM_LABEL_PER_SUBJECT consecutive rows each - confirm against the CSV loader.
group_array = np.arange(len(label_array)) // NUM_LABEL_PER_SUBJECT

# Per-fold SHAP values and the validation row positions they belong to.
list_shap_values = []
list_test_sets = []
for train_index, val_index in gkf.split(
    normalized_all_feature_array, label_array, groups=group_array
):
    train_features = normalized_all_feature_array.iloc[train_index]
    train_labels = label_array[train_index]
    val_features = normalized_all_feature_array.iloc[val_index]
    val_labels = label_array[val_index]

    # Fit a fresh classifier on this fold's training rows.
    model = XGBClassifier(n_estimators=2)
    model.fit(train_features, train_labels)

    # Score on the held-out rows and record the fold accuracy.
    acc = model.score(val_features, val_labels)
    print("Accuracy: %.2f%%" % (acc * 100.0))
    accuracy.append(acc)

    # Keep this fold's SHAP values together with the rows they were computed on,
    # so they can be stitched back together after the loop.
    shap_values = shap.TreeExplainer(model).shap_values(val_features)
    list_shap_values.append(shap_values)
    list_test_sets.append(val_index)

In [None]:
# Show the recorded per-fold accuracies, then their mean.
print(accuracy)
print(np.mean(accuracy))


In [None]:
# Stitch the per-fold results back together.
# Multi-class SHAP output is concatenated along axis 1, binary output along axis 0.
concat_axis = 1 if is_multi else 0

test_set = list_test_sets[0]
shap_values = np.array(list_shap_values[0])
for fold_rows, fold_shap in zip(list_test_sets[1:], list_shap_values[1:]):
    test_set = np.concatenate((test_set, fold_rows), axis=0)
    shap_values = np.concatenate(
        (shap_values, np.array(fold_shap)), axis=concat_axis
    )

# Rebuild the evaluated rows, in fold order, with their feature names.
X_test = pd.DataFrame(
    normalized_all_feature_array.iloc[test_set],
    columns=feature_names,
)

# One summary plot for the whole experiment. In the multi-class case the first
# dimension of shap_values selects the class, and class index 1 is plotted.
shap.summary_plot(shap_values[1] if is_multi else shap_values, X_test)

In [None]:
from data_utils import (
    load_data_from_dir,
    concatenate_all_data,
)


def get_raw_signal(dir_names, markers):
    """Load recordings from several directories and stack them per marker.

    Args:
        dir_names: Non-empty sequence of directory paths; each one is passed
            to ``load_data_from_dir``.
        markers: Non-empty sequence of marker names (e.g. ``"EEG"``); each one
            is passed to ``concatenate_all_data``.

    Returns:
        A ``(data, condition_to_labels)`` tuple. ``data`` is the per-marker
        arrays, each with its first and last axes swapped, concatenated along
        axis 2. ``condition_to_labels`` is the value returned by
        ``concatenate_all_data`` for the last marker processed.

    Raises:
        ValueError: If ``dir_names`` or ``markers`` is empty.
    """
    # Fail fast with a clear message instead of an opaque error further down.
    if len(dir_names) == 0:
        raise ValueError("dir_names must contain at least one directory")
    if len(markers) == 0:
        raise ValueError("markers must contain at least one marker name")

    dir_to_data = {dir_name: load_data_from_dir(dir_name) for dir_name in dir_names}

    # Concatenate the raw signal (x), one array per marker.
    data_list = []
    condition_to_labels = None
    for m in markers:
        all_participants_data, condition_to_labels = concatenate_all_data(
            dir_to_data, m
        )
        # Original author's note on the layout change:
        # (num_channels, num_data_points, num_epochs)
        #   => (num_epochs, num_data_points, num_channels)
        data_list.append(np.swapaxes(all_participants_data, 0, -1))

    return np.concatenate(data_list, axis=2), condition_to_labels


# Relative paths of the data directories to load.
ALL_DIRS = ["../CleandDataV1/2017", "../CleandDataV1/2018"]

# Load only the "EEG" marker from every directory.
data_array, condition_to_labels = get_raw_signal(
    dir_names=ALL_DIRS,
    markers=["EEG"],
)

In [None]:
from tensorflow.keras import utils as np_utils
from models import train_with_cnn

""" 
    prepare labels (y)
"""
# One-hot alternative for a 4-class setup (nvla, nvha, hvla, hvha), disabled:
# label_array = np_utils.to_categorical(label_list, num_classes=4)
label_list = binary_label(condition_to_labels["valence"])
label_array = np.array(label_list)

# One integer subject id per sample. Floor division is required: true division
# (i / NUM_LABEL_PER_SUBJECT) gives every row a distinct float, so each sample
# would become its own group instead of sharing its subject's group.
# NOTE(review): assumes rows are ordered subject by subject, with exactly
# NUM_LABEL_PER_SUBJECT consecutive rows each - confirm against the data loader.
group_array = np.arange(len(label_list)) // NUM_LABEL_PER_SUBJECT

print(data_array.shape, label_array.shape, group_array.shape)

In [None]:
""" 
    with CNN
"""
# The channel count is read from the third axis of the raw-signal array.
num_channel = data_array.shape[2]
# NOTE(review): 12288 is passed as the first positional argument of
# train_with_cnn; presumably the number of data points per epoch - confirm
# against models.train_with_cnn.
accuracy = train_with_cnn(12288, num_channel, data_array, label_array, group_array)
# Show the returned accuracy value(s), then their mean.
print(accuracy)
print(np.mean(accuracy))


In [None]:
""" 
    with logistic regression
"""
# from models import train_with_logistic

# label_name = "attention"
# best_model = train_with_logistic(all_features, label_name, condition_to_labels, group_array)
# name_to_transformed = get_tranformed_labels(condition_to_labels)

# # assume bigger coefficients contribute more to the model,
# # but make sure that the features have THE SAME SCALE, otherwise this assumption is not correct.
# importance = best_model["classifier"].coef_[0]

# feat_importances = pd.Series(importance, index=get_feature_names(importance))
# feat_importances.nlargest(10).plot(
#     kind="barh", title=f"{label_name} Feature Importance"
# )