# HRV-Stress_correlation

This part is optional; it is only used to find the best preprocessing from a correlation perspective.

## kendall correlation

In [1]:
from pprint import pprint
import pandas as pd
import os
import os.path as osp

In [2]:
# Directory the merged CSV files are read from.
path_merged = 'data/collective'

# One CSV per (method prefix, label variant) pair, listed method-major so the
# order is jenks -> kde -> kmeans, each with its four variants.
file_list = [
    f'{method}_reaction_time_{variant}_minmax.csv'
    for method in ('jenks', 'kde', 'kmeans')
    for variant in ('penalized_inliers', 'penalized', 'true_inliers', 'true')
]

In [3]:
# Based on literature, monotonic data with small group options is better
# analyzed using Kendall's correlation.
hrv_metrics = ('mrr', 'mhr', 'sdrr', 'sdhr', 'cvrr', 'rmsd', 'prr20', 'prr50')
var_60_names = [f'{metric}_60s' for metric in hrv_metrics]
var_45_names = [f'{metric}_45s' for metric in hrv_metrics]
var_30_names = [f'{metric}_30s' for metric in hrv_metrics]
var_15_names = [f'{metric}_15s' for metric in hrv_metrics]

# Window label -> HRV column names; insertion order matches the result keys.
window_variables = {
    '60s': var_60_names,
    '45s': var_45_names,
    '30s': var_30_names,
    '15s': var_15_names,
}

# Resulting layout:
#   dict_correlation[file][reaction_time column][window][HRV variable] = tau
dict_correlation = {}
for file_name in file_list:
    per_file = dict_correlation[file_name] = {}
    df = pd.read_csv(osp.join(path_merged, file_name))
    for col_name in df.columns:
        # Only the reaction-time columns are used as correlation targets.
        if 'reaction_time' not in col_name:
            continue
        target = df[col_name]
        per_file[col_name] = {
            window: {
                var_name: df[var_name].corr(target, method='kendall')
                for var_name in names
            }
            for window, names in window_variables.items()
        }

In [4]:
import pickle

# Cache the Kendall results on disk and read them straight back as a
# round-trip check. The pickle is produced by this notebook itself; do not
# point this path at a file from an untrusted source, since unpickling can
# execute arbitrary code.
pickle_path = osp.join(path_merged, 'correlation_kendall.pickle')

with open(pickle_path, 'wb') as handle:
    pickle.dump(dict_correlation, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open(pickle_path, 'rb') as handle:
    dict_correlation_load = pickle.load(handle)

print(dict_correlation == dict_correlation_load)

# Continue with the loaded copy when it matches, or when the comparison
# itself fails. `except Exception` replaces the former bare `except:` so that
# KeyboardInterrupt / SystemExit are no longer swallowed.
try:
    if dict_correlation == dict_correlation_load:
        dict_correlation = dict_correlation_load
except Exception:
    dict_correlation = dict_correlation_load

True


## kmeans vs jenks vs kde, 15s vs 30s vs 45s vs 60s

In [5]:
import numpy as np

In [6]:
# Compare the jenks / kde / kmeans files over every window size and keep the
# (file, window) pair with the highest mean sign-adjusted Kendall tau.
file_names = [
    'jenks_reaction_time_penalized_minmax.csv',
    'kde_reaction_time_penalized_minmax.csv',
    'kmeans_reaction_time_penalized_minmax.csv',
]

# Start from -inf (not 0) so a winner is recorded even when every mean is
# zero or negative; previously `best_correlation` stayed undefined in that
# case and the final pprint raised NameError. It remains None only if no
# mean is comparable (e.g. all NaN).
best_score = float('-inf')
best_correlation = None
dict_correlation_comparison = {}
for file_name in file_names:
    dict_correlation_comparison[file_name] = {}
    for window_size, value in dict_correlation[file_name]['reaction_time'].items():
        # mrr is expected to correlate positively and is kept as is; every
        # other variable is expected to correlate negatively, so its sign is
        # flipped to make a correlation in the expected direction count as
        # positive.
        score_temp = [
            score if 'mrr' in variable else -1 * score
            for variable, score in value.items()
        ]

        mean_correlation_of_all_variable = np.mean(score_temp)
        dict_correlation_comparison[file_name][window_size] = mean_correlation_of_all_variable
        if mean_correlation_of_all_variable > best_score:
            best_score = mean_correlation_of_all_variable
            best_correlation = {
                'file_name': file_name,
                'window_size': window_size,
                'score': mean_correlation_of_all_variable
            }
pprint(dict_correlation_comparison)
pprint(best_correlation)

{'jenks_reaction_time_penalized_minmax.csv': {'15s': 0.017252045395165862,
                                              '30s': 0.021419181414563646,
                                              '45s': 0.01640679774762485,
                                              '60s': 0.01778058594621585},
 'kde_reaction_time_penalized_minmax.csv': {'15s': 0.005950508289244217,
                                            '30s': 0.003499327749637632,
                                            '45s': 0.0032649558409788434,
                                            '60s': 0.0011638208516312068},
 'kmeans_reaction_time_penalized_minmax.csv': {'15s': 0.013459673126614703,
                                               '30s': 0.016463004517018674,
                                               '45s': 0.015584569216246447,
                                               '60s': 0.018580860045374778}}
{'file_name': 'jenks_reaction_time_penalized_minmax.csv',
 'score': 0.021419181414563646,
 'window_si

In [7]:
# Show the per-variable Kendall taus behind the winning file / window pair.
pprint(
    dict_correlation
    [best_correlation['file_name']]
    ['reaction_time']
    [best_correlation['window_size']]
)

{'cvrr_30s': -0.022200745235003233,
 'mhr_30s': -0.032943362513045256,
 'mrr_30s': 0.03215259805531322,
 'prr20_30s': -0.009568232243579772,
 'prr50_30s': -0.013542319558146434,
 'rmsd_30s': -0.03131453904844864,
 'sdhr_30s': -0.015497253245815619,
 'sdrr_30s': -0.014134401417156979}


## inlier only vs with outlier, correct answer only vs with wrong answer

In [8]:
# Compare the four jenks preprocessing variants (true / penalized, with and
# without the inlier filter) at the window size selected earlier.
file_names = [
    'jenks_reaction_time_true_minmax.csv',
    'jenks_reaction_time_penalized_minmax.csv',
    'jenks_reaction_time_true_inliers_minmax.csv',
    'jenks_reaction_time_penalized_inliers_minmax.csv',
]

window_size = best_correlation['window_size']

# Start from -inf (not 0) so a winner is recorded even when every mean is
# zero or negative; previously `best_preprocess` stayed undefined in that
# case and the final pprint raised NameError.
best_score = float('-inf')
best_preprocess = None
dict_preprocess_comparison = {}
for file_name in file_names:
    dict_preprocess_comparison[file_name] = {}
    # Look the selected window up directly instead of scanning every window.
    value = dict_correlation[file_name]['reaction_time'][window_size]
    # mrr is expected to correlate positively and is kept as is; every other
    # variable is expected to correlate negatively, so its sign is flipped to
    # make a correlation in the expected direction count as positive.
    score_temp = [
        score if 'mrr' in variable else -1 * score
        for variable, score in value.items()
    ]

    mean_correlation_of_all_variable = np.mean(score_temp)
    dict_preprocess_comparison[file_name][window_size] = mean_correlation_of_all_variable
    if mean_correlation_of_all_variable > best_score:
        best_score = mean_correlation_of_all_variable
        best_preprocess = {
            'file_name': file_name,
            'score': mean_correlation_of_all_variable
        }
pprint(dict_preprocess_comparison)
pprint(best_preprocess)

{'jenks_reaction_time_penalized_inliers_minmax.csv': {'30s': 0.00023682574902131933},
 'jenks_reaction_time_penalized_minmax.csv': {'30s': 0.021419181414563646},
 'jenks_reaction_time_true_inliers_minmax.csv': {'30s': 0.004632219987139257},
 'jenks_reaction_time_true_minmax.csv': {'30s': 0.014766132774288556}}
{'file_name': 'jenks_reaction_time_penalized_minmax.csv',
 'score': 0.021419181414563646}


## penalty score

In [9]:
# Compare every reaction-time column of the penalized jenks file at the
# window size selected earlier and keep the best-scoring column.
file_name = 'jenks_reaction_time_penalized_minmax.csv'
window_size = best_correlation['window_size']

# Start from -inf (not 0) and reset `best_preprocess`. With the old threshold
# of 0, a run in which no mean was positive silently kept whatever
# `best_preprocess` an earlier cell had left behind, which has no
# 'penalty_score' key.
best_score = float('-inf')
best_preprocess = None
dict_penalty_comparison = {}
dict_penalty_comparison[file_name] = {}
for penalty_score in dict_correlation[file_name]:
    dict_penalty_comparison[file_name][penalty_score] = {}
    # mrr is expected to correlate positively and is kept as is; every other
    # variable is expected to correlate negatively, so its sign is flipped to
    # make a correlation in the expected direction count as positive.
    score_temp = [
        score if 'mrr' in variable else -1 * score
        for variable, score in dict_correlation[file_name][penalty_score][window_size].items()
    ]

    mean_correlation_of_all_variable = np.mean(score_temp)
    dict_penalty_comparison[file_name][penalty_score][window_size] = mean_correlation_of_all_variable
    if mean_correlation_of_all_variable > best_score:
        best_score = mean_correlation_of_all_variable
        best_preprocess = {
            'file_name': file_name,
            'penalty_score': penalty_score,
            'score': mean_correlation_of_all_variable
        }
pprint(dict_penalty_comparison)
pprint(best_preprocess)

{'jenks_reaction_time_penalized_minmax.csv': {'reaction_time': {'30s': 0.021419181414563646},
                                              'reaction_time_penalized_1.1': {'30s': 0.021380951204400908},
                                              'reaction_time_penalized_1.2': {'30s': 0.018985092963250752},
                                              'reaction_time_penalized_1.3': {'30s': 0.019360995685245228},
                                              'reaction_time_penalized_1.4': {'30s': 0.018467759100209355},
                                              'reaction_time_penalized_1.5': {'30s': 0.017990574321107317},
                                              'reaction_time_penalized_1.6': {'30s': 0.01901317259252109},
                                              'reaction_time_penalized_1.7': {'30s': 0.01744853358383054},
                                              'reaction_time_penalized_1.8': {'30s': 0.015823268106674313},
                                            

## complete correlation

In [11]:
# Recompute Pearson, Spearman and Kendall coefficients for the selected file,
# label column and window size.
file_name = best_preprocess['file_name']

# Any window other than 15s / 30s / 45s falls back to the 60s variables.
window_suffix = best_correlation['window_size']
if window_suffix not in ('15s', '30s', '45s'):
    window_suffix = '60s'
var_names = [
    f'{metric}_{window_suffix}'
    for metric in ('mrr', 'mhr', 'sdrr', 'sdhr', 'cvrr', 'rmsd', 'prr20', 'prr50')
]
label_name = best_preprocess['penalty_score']

df = pd.read_csv(osp.join(path_merged, file_name))
label_column = df[label_name]

# NOTE: this rebinds `dict_correlation`; from here on it is keyed by
# correlation method (Pearson's r, Spearman's rho, Kendall's tau), then by
# variable name.
dict_correlation = {
    method: {
        var_name: df[var_name].corr(label_column, method=method)
        for var_name in var_names
    }
    for method in ('pearson', 'spearman', 'kendall')
}

pprint(dict_correlation)

{'kendall': {'cvrr_30s': -0.022200745235003233,
             'mhr_30s': -0.032943362513045256,
             'mrr_30s': 0.03215259805531322,
             'prr20_30s': -0.009568232243579772,
             'prr50_30s': -0.013542319558146434,
             'rmsd_30s': -0.03131453904844864,
             'sdhr_30s': -0.015497253245815619,
             'sdrr_30s': -0.014134401417156979},
 'pearson': {'cvrr_30s': -0.03424322355972337,
             'mhr_30s': -0.034839075517401,
             'mrr_30s': 0.02921929824430517,
             'prr20_30s': -0.015109365152852246,
             'prr50_30s': -0.022452736962043533,
             'rmsd_30s': -0.04421784971177618,
             'sdhr_30s': -0.024200335004843726,
             'sdrr_30s': -0.026716773923412818},
 'spearman': {'cvrr_30s': -0.02821781696499679,
              'mhr_30s': -0.04197000544662074,
              'mrr_30s': 0.041037742219652805,
              'prr20_30s': -0.012218333557801948,
              'prr50_30s': -0.017237023721110665