In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

from data_utils.data_utils import (
    load_data_from_dir,
    AUDIO_BLOCKS,
    get_block_raw_data_by_marker,
)

# Extract statistical features

In [None]:
### CHANGE ME
marker = 'EGG'
channeel_no = 0
data_dir = "../data/"  # Replace with your own data dir
###

# Walk every sub-directory of `data_dir` (one directory per subject) and keep
# the raw per-block data for the selected marker/channel.
# `data_raw[k]` always belongs to `subject_list[k]`.
# NOTE: os.listdir returns entries in arbitrary order; the behavioral labels
# used further down are matched to subjects by position, so they must have
# been produced with the same directory ordering.
data_raw, subject_list = [], []
subject_to_data = {}
for d in os.listdir(data_dir):
    # os.path.join works whether or not `data_dir` ends with a path separator.
    dir_name = os.path.join(data_dir, d)
    if not os.path.isdir(dir_name):
        # Ignore stray files that sit next to the subject directories.
        continue

    data = load_data_from_dir(dir_name)
    block_to_data = get_block_raw_data_by_marker(
        data, AUDIO_BLOCKS, marker, channeel_no,
    )
    data_raw.append(block_to_data)
    subject_list.append(d)

In [None]:
from features.stats_features import (
    FEATURE_TO_FUNC,
    StatsFeature,
)


def _extact_block_features(raw_data, feature_key):
    """Apply one feature function to every entry of every audio block.

    Args:
        raw_data: Mapping indexed by the block names in ``AUDIO_BLOCKS``;
            each value is an iterable of per-entry raw data
            (presumably one entry per trial - confirm against the loader).
        feature_key: Key into ``FEATURE_TO_FUNC`` selecting the feature to compute.

    Returns:
        ``np.ndarray`` with one feature value per entry, ordered block by
        block following ``AUDIO_BLOCKS``.
    """
    return np.array(
        [
            FEATURE_TO_FUNC[feature_key](block_entry)
            for block_name in AUDIO_BLOCKS
            for block_entry in raw_data[block_name]
        ]
    )


## CHANGE ME
feature_list = [StatsFeature.MAXIM, StatsFeature.MINIM]
###

# Build one feature matrix per subject. Each feature yields one array per
# subject; stacking them puts the feature on axis 0, so axes 0 and 1 are
# swapped to index by entry first and by feature second.
all_features = []
for subject_idx in range(len(subject_list)):
    subject_raw = data_raw[subject_idx]
    features = [
        _extact_block_features(subject_raw, feature_key)
        for feature_key in feature_list
    ]
    all_features.append(np.asarray(features).swapaxes(0, 1))

# Leading axis is the subject; displayed below as a quick sanity check.
all_features = np.array(all_features)
all_features.shape

# Load behavioral labels
See `eeg_features_analysis.ipynb` for how to extract the labels.

In [None]:
# Read the behavioral labels from the pickle file.
# NOTE: pickle.load can execute arbitrary code - only open files you trust.
with open("./data/replace_me.pkl", "rb") as label_file:
    behavioral_labels = pickle.load(label_file)

# Pull out the three entries used by the rest of the notebook.
valence_labels = behavioral_labels["valence_labels"]
arousal_labels = behavioral_labels["arousal_labels"]
label_thresholds = behavioral_labels["label_thresholds"]

# Optional: inspect feature correlation with user rating

In [None]:
from scipy import stats

# For each label type, draw one regression panel per subject showing the
# selected feature against the subject's ratings, annotated with Pearson r/p.
feature_key = StatsFeature.ABS_DIFF
for label_type, labels in {"valence": valence_labels, "arousal": arousal_labels}.items():
    n_row, n_col = (5, 8)
    fig, axes = plt.subplots(
        nrows=n_row,
        sharey=False,
        ncols=n_col,
        figsize=(n_col * 3, n_row * 3),
    )

    for i, ax in enumerate(axes.flat):
        if i >= len(subject_list):
            # Fewer subjects than grid cells: hide the unused panel instead
            # of indexing past the end of the subject list.
            ax.set_visible(False)
            continue

        s = str(subject_list[i])

        feature = _extact_block_features(data_raw[i], feature_key)
        r_v, p_v = stats.pearsonr(labels[i], feature)

        # Highlight panels whose correlation is significant at p < 0.05.
        color = "red" if p_v < 0.05 else "grey"
        result = pd.DataFrame({"user rating": labels[i], "feature": feature})
        g1 = sns.regplot(data=result, x="user rating", y="feature", ax=ax, color=color)
        g1.set(xlabel=None, ylabel=None)
        g1.set_title(f"{s} r:{r_v:2.4f}, p:{p_v:2.4f}", fontsize=12, color=color)

    fig.suptitle(f"{feature_key.name} vs user rating - {label_type}", y=1, size=24)
    fig.tight_layout(pad=1.8)

# Model training with PCA + KNN

In [None]:
from training_utils.dataset import get_consecutive_validation_indexes
from training_utils.dataset import DatasetBuilder

n_trial_per_block = 13
n_step_trial = 3
# Number of labels of the first subject; used as the dataset size below.
n_total_trials = len(valence_labels[0])

# One validation index group per start offset 1, 1 + n_step_trial, ...
# (argument semantics are defined by get_consecutive_validation_indexes).
val_indexes = []
for start_offset in range(1, n_trial_per_block, n_step_trial):
    val_indexes.append(
        get_consecutive_validation_indexes(
            n_total_trials, len(AUDIO_BLOCKS), 1, start_offset, n_step_trial
        )
    )
print(len(val_indexes), val_indexes)

dataset_builder = DatasetBuilder(n_total_trials, val_indexes_group=val_indexes)
n_total_trials

## Cross validation
Check out `eeg_features_analysis.ipynb` for further evaluation helpers.

In [None]:
from training_utils.training import decode_marker_data, get_metadata

# Column-wise accumulator; every column must grow by the same number of rows
# per (subject, label type) so it can be turned into a DataFrame at the end.
subject_accuracy_summary = {
    "subject": [],
    "channel": [],
    "feature": [],
    "label_type": [],
    "cv_scores": [],
    "cv_mean_score": [],
}

###CHANGE ME####
method = "PCA"
feature_name = ""
output_dim = 4
###############

subject_to_embedding = {s: {"valence": [], "arousal": []} for s in subject_list}

for idx in range(len(subject_list)):
    subj = subject_list[idx]
    print("decoding subject...", subj)

    v_thred, a_thred = label_thresholds[idx]
    for lt in ["valence", "arousal"]:
        labels = valence_labels[idx] if lt == "valence" else arousal_labels[idx]
        thred = v_thred if lt == "valence" else a_thred

        # Use this subject's own feature matrix. The bare name `features` is
        # only a leftover loop variable from the feature-extraction cell and
        # would feed the last subject's (un-swapped) data to every subject.
        dataset_dict = {
            marker: {
                feature_name: dataset_builder.train_test_split(
                    all_features[idx], labels
                )
            }
        }

        subject_to_embedding[subj][lt], accuracy = decode_marker_data(
            dataset_dict,
            lt,
            v_thred,
            a_thred,
            method,
            output_dim,
            thred,
        )

        all_channels, all_feature_name, cv_scores = get_metadata(accuracy)

        n_rows = len(all_feature_name)
        subject_accuracy_summary["subject"].extend([subj] * n_rows)
        subject_accuracy_summary["channel"].extend(all_channels)
        subject_accuracy_summary["feature"].extend(all_feature_name)
        subject_accuracy_summary["label_type"].extend([lt] * n_rows)
        subject_accuracy_summary["cv_scores"].extend(cv_scores)
        # One mean per cv_scores entry keeps this column the same length as
        # the others (assumes get_metadata returns one score entry per
        # feature name - confirm against training_utils.training).
        subject_accuracy_summary["cv_mean_score"].extend(
            [round(np.mean(scores), 2) for scores in cv_scores]
        )

subject_accuracy_summary = pd.DataFrame(subject_accuracy_summary)
# Subject ids come from directory names; this cast requires them to be numeric.
subject_accuracy_summary["subject"] = subject_accuracy_summary["subject"].astype(int)