In [14]:
import dask.dataframe as dd
import glob
import re
import pandas as pd
from pathlib import Path
from dask_ml.preprocessing import StandardScaler
from dask_ml.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from dask.distributed import Client
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
import joblib
from sklearn.model_selection import TimeSeriesSplit

In [15]:
def zip_csv_files(folder_path_1, folder_path_2):
    files_1 = glob.glob(folder_path_1 + '/*.csv')
    files_2 = glob.glob(folder_path_2 + '/*.csv')

    # Create a dictionary with keys as (subject_num, video_num) and values as the file path
    files_dict_1 = {(int(s), int(v)): f for f in files_1 for s, v in re.findall(r'sub_(\d+)_vid_(\d+)', f)}
    files_dict_2 = {(int(s), int(v)): f for f in files_2 for s, v in re.findall(r'sub_(\d+)_vid_(\d+)', f)}

    # Create a list of tuples with corresponding CSV file paths in both folders
    zipped_files = [(files_dict_1[key], files_dict_2[key]) for key in files_dict_1 if key in files_dict_2]

    return zipped_files
    

def sliding_window_with_step(data, window_size, step):
    windows = []
    num_windows = (len(data) - window_size) // step + 1
    for i in range(num_windows):
        end_index = window_size + i * step
        start_index = end_index - window_size
        windows.append(data[start_index:end_index])
    return windows

def preprocess(df_physiology, df_annotations, window_size = 100, step = 5):
    ecg = df_physiology['ecg']
    eda = df_physiology['gsr']
    arousal = df_annotations['arousal']

    from scipy.signal import resample

    ecg_resampled = resample(ecg, len(ecg) // 10)
    eda_resampled = resample(eda, len(eda) // 10)
    
    aligned_ecg = sliding_window_with_step(ecg_resampled, window_size, step)
    aligned_eda = sliding_window_with_step(eda_resampled, window_size, step)
    
    X = np.column_stack((aligned_ecg, aligned_eda))
    y = arousal[20:].values
    
    return X, y

In [16]:
def time_series_cross_validation_with_hyperparameters(X, y, kernel, C, epsilon, n_splits=5):
    # This function is similar to the previous time_series_cross_validation function,
    # but takes hyperparameters as input
    tscv = TimeSeriesSplit(n_splits=n_splits)
    rmse_values = []

    for train_index, test_index in tscv.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)
        
        svm_regressor = SVR(kernel=kernel, C=C, epsilon=epsilon)
        svm_regressor.fit(X_train_scaled, y_train)
        
        # Make predictions and evaluate the performance of the model
        y_pred = svm_regressor.predict(X_test_scaled)
        rmse = mean_squared_error(y_test, y_pred, squared=False)
        rmse_values.append(rmse)

    # Calculate the average mean squared error across all splits
    average_rmse = np.mean(rmse_values)
    print("Average Root Mean Squared Error:", average_rmse)
    return average_rmse

In [17]:
kernel_options = ['linear', 'rbf']
C_options = [0.1, 1, 10]
epsilon_options = [0.1, 0.01]

hyperparameters = [(kernel, C, epsilon) for kernel in kernel_options for C in C_options for epsilon in epsilon_options]

In [18]:
annotations_folder = '../data/raw/scenario_1/train/annotations/'
physiology_folder = '../data/raw/scenario_1/train/physiology/'
zipped_files = zip_csv_files(annotations_folder, physiology_folder)
results = {}

for kernel, C, epsilon in hyperparameters:
    print(f"Testing hyperparameters: kernel={kernel}, C={C}, epsilon={epsilon}")
    
    rmse_values = []
    for annotation_file, physiology_file in zipped_files:
        df_annotations = pd.read_csv(annotation_file)
        df_physiology = pd.read_csv(physiology_file)
        X, y = preprocess(df_physiology, df_annotations)
        
        average_rmse = time_series_cross_validation_with_hyperparameters(X, y, kernel, C, epsilon)
        rmse_values.append(average_rmse)
    
    overall_average_rmse = np.mean(rmse_values)
    results[(kernel, C, epsilon)] = overall_average_rmse
    print(f"Overall average root mean squared error for hyperparameters: {overall_average_rmse}")


Testing hyperparameters: kernel=linear, C=0.1, epsilon=0.1
Average Root Mean Squared Error: 0.2702716384258762
Average Root Mean Squared Error: 0.5052056106584261
Average Root Mean Squared Error: 0.990096087127586
Average Root Mean Squared Error: 0.903830011491374
Average Root Mean Squared Error: 0.30243318615073544
Average Root Mean Squared Error: 0.3044765302807789
Average Root Mean Squared Error: 0.5494290520773577
Average Root Mean Squared Error: 0.08521210521222458
Average Root Mean Squared Error: 0.3142041068364454
Average Root Mean Squared Error: 0.8565143279291009
Average Root Mean Squared Error: 0.2396771837061164
Average Root Mean Squared Error: 0.3630030092708271
Average Root Mean Squared Error: 0.09050540193196717
Average Root Mean Squared Error: 0.4642052999936322
Average Root Mean Squared Error: 0.07068191087837024
Average Root Mean Squared Error: 0.1426442724621911
Average Root Mean Squared Error: 0.44348677950131216
Average Root Mean Squared Error: 0.26918368969388873
A