In [2]:
import glob
import re
import json
import itertools
import pandas as pd
from pathlib import Path

from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
import joblib
from sklearn.model_selection import TimeSeriesSplit

# all the functions from helpers.py
from helpers_scenario2 import *

In [None]:
scenario_folder = "../data/preprocessed/cleaned_and_prepro_improved/scenario_2/"
fold = 0

phys_folder, ann_folder = create_folder_structure(scenario_folder, fold)

annotations_folder = '../data/raw/scenario_2/fold_0/train/annotations/'
physiology_folder = "../data/preprocessed/cleaned_and_prepro_improved/scenario_3/fold_0/train/physiology/" 
zipped_files = zip_csv_files(annotations_folder, physiology_folder)

subjects, videos = get_subs_vids('../data/preprocessed/cleaned_and_prepro_improved/scenario_2/fold_0/train/physiology')

cat_dict = {1: [16, 20], 2: [0, 3], 3: [10, 22], 4: [4, 21]}

splits = splits = split_subjects_train_test(subjects, 3)

In [None]:
num_cpu_cores = multiprocessing.cpu_count()

# Define aggregate metric combinations
aggregate_combinations = [
    # ['enlarged'],
    # ['mean'],
    # ['std'],
    # ['max'],
    # ['min'],
    # ['mean', 'std'],
    # ['mean', 'max'],
    # ['mean', 'min'],
    # ['std', 'max'],
    # ['std', 'min'],
    # ['max', 'min'],
    ['mean', 'std', 'max', 'min']
]

# Define models and hyperparameters
models_hyperparameters = [
    # (LinearRegression, {}),
    # (SVR, {
    #     'kernel': ['linear', 'rbf'],
    #     'C': [0.1, 1, 10],
    #     'epsilon': [0.1, 1],
    #     'gamma': ['scale', 'auto'],  # Only used for 'rbf' kernel
    # }),
    (RandomForestRegressor, {
        'n_estimators': [100],#[50, 100],
        'max_depth': [None],#[10, None],
        'min_samples_split': [5],#[2, 5],
        'min_samples_leaf': [1],
    }),
    # (XGBRegressor, {
    #     'n_estimators': [50, 100],
    #     'max_depth': [6, 10],
    #     'learning_rate': [0.01, 0.1],
    #     'subsample': [0.5, 0.8],
        # 'colsample_bytree': [0.5, 0.8],
        # 'reg_alpha': [0, 0.1],
        # 'reg_lambda': [0.1, 1],
    # }),
]

windows = [[-10000, 10000], [-5000, 5000], [-2000,2000],]
partitions = [1, 2, 3, 5]

# Initialize an empty DataFrame for the best results and a dictionary for all results
best_results_df = pd.DataFrame()
all_results = {}


def process_files(annotation_file, physiology_file,):
    df_annotations = pd.read_csv(annotation_file)
    df_physiology = pd.read_csv(physiology_file)
    
    # print(physiology_file)
    X, y, numeric_column_indices = preprocess(df_physiology, df_annotations,  predictions_cols=['arousal','valence'], aggregate=['mean','min'], window=[-10000, 5000])
    # print(X.shape, y.shape)
    
    save_files(X, y, annotation_file, phys_folder, ann_folder)
    
    return None

def process_video(subject, models_hyperparameters, splits, phys_folder, ann_folder, window, partition_window, ):
    results = []

    for model, hyperparameters in models_hyperparameters:
        for hp_set in itertools.product(*hyperparameters.values()):
            hp_dict = dict(zip(hyperparameters.keys(), hp_set))

            model_name = model.__name__

            rmses = []
            for split in splits:

                X_train, X_test = load_and_concatenate_files(phys_folder, split, subject)
                y_train, y_test = load_and_concatenate_files(ann_folder, split, subject)

                rmse = time_series_cross_validation_with_hyperparameters(
                    X_train, X_test, y_train, y_test, model, hp_dict, n_jobs=-1,
                    numeric_column_indices=np.array(range(X_train.shape[1])))

                rmses.append(rmse)

            average_rmse = np.mean(rmses, axis=0)
            print(f"Average Root Mean Squared Error per output: {average_rmse}. ")

            if y_train.ndim > 1 and y_train.shape[1] > 1:
                # Unpack the average_rmse array into separate keys in the result dictionary
                result = {
                    'model': model_name,
                    'hyperparameters': hyperparameters,
                    'aggregate': ['mean','min'],
                    'average_rmse_arousal': average_rmse[0],
                    'average_rmse_valence': average_rmse[1],
                    'window': window,
                    'partition_window': partition_window
                    
                }
            else:
                result = {
                    'model': model_name,
                    'hyperparameters': hyperparameters,
                    'aggregate': ['mean','min'],
                    'average_rmse': average_rmse,
                    'window': window,
                    'partition_window': partition_window
                }
            results.append(result)
            
    # # Update the all_results dictionary
    # all_results[f"{subject}"] = results

    # # Save all_results as JSON
    # with open('../results/scenario_3/clean_all_results_shallow_models_window.json', 'w') as f:
    #     json.dump(all_results, f, default=str, indent=4)
    return results
        
 

In [1]:
with open('../results/scenario_2/clean_all_results_shallow_models_window.json', 'r') as f:
    all_results = json.load(f)
    
for window in tqdm(windows, desc="Windows"):
    for partition_window in tqdm(partitions[1:], desc="Partitions", leave=False):
        
        zipped_files = zip_csv_files(annotations_folder, physiology_folder)        
        
        # Process the files using the context manager
        with parallel_backend('multiprocessing', n_jobs=num_cpu_cores - 5):
            with tqdm_joblib(tqdm(total=len(zipped_files), desc="Files", leave=False)) as progress_bar:
                results = Parallel()(
                    (delayed(process_files)(ann_file, phys_file) for ann_file, phys_file in zipped_files)
                )

        total_videos = len(videos)
        with parallel_backend('multiprocessing', n_jobs= num_cpu_cores - 5):
            with tqdm_joblib(tqdm(total=total_videos, desc="Videos", leave=False)) as progress_bar:
                all_subject_results = Parallel()(
                    (delayed(process_video)(video, models_hyperparameters, splits, phys_folder, ann_folder, window, partition_window) 
                     for video in videos)
                )


        # Combine results for all subjects
        for subject_idx, subject in enumerate(subjects):
            subject_results = all_subject_results[subject_idx]
            all_results[f"{subject}"] = subject_results

            # Save all_results as JSON
            with open('../results/scenario_2/clean_all_results_shallow_models_window.json', 'w') as f:
                json.dump(all_results, f, default=str, indent=4)

                    
    results_df = pd.DataFrame(results)

    best_result_output_1 = results_df.loc[results_df['average_rmse_arousal'].idxmin()]
    best_result_output_2 = results_df.loc[results_df['average_rmse_valence'].idxmin()]

    # Concatenate the best results for each output variable to the best_results_df
    best_results_df =pd.concat([best_results_df, best_result_output_1.to_frame().T, best_result_output_2.to_frame().T], ignore_index=True)

    # Save best_results_df as CSV
    best_results_df.to_csv('../results/scenario_2/clean_shallow_models_best_result_window.csv', index=False)

print("\nThe best combination of features and hyperparameters for each file pair is:")
print(best_results_df)



FileNotFoundError: [Errno 2] No such file or directory: '../results/scenario_2/clean_all_results_shallow_models_window.json'

In [9]:
annotations_folder = '../data/raw/scenario_2/scenario_2/fold_0/train/annotations/'
# physiology_folder = "../data/preprocessed/cleaned/scenario_1/fold_0/train/physiology/" #'../data/raw/scenario_1/train/physiology/'
physiology_folder = "../data/preprocessed/cleaned_and_prepro_improved/scenario_2/fold_0/train/physiology/" #'../data/raw/scenario_1/train/physiology/'data\preprocessed\

df_physiology = load_read_and_append_csvs(physiology_folder)
df_annotations = load_read_and_append_csvs(annotations_folder)

videos = df_physiology.video.unique()
subjects = df_physiology.subject.unique()

splits = split_subjects_train_test(subjects, 3)

In [11]:
from concurrent.futures import ProcessPoolExecutor, as_completed, ThreadPoolExecutor
import threading
import multiprocessing
from tqdm import tqdm
import itertools


num_cpu_cores = multiprocessing.cpu_count()

# Define aggregate metric combinations
aggregate_combinations = [
    # ['enlarged'],
    # ['mean'],
    # ['std'],
    # ['max'],
    # ['min'],
    # ['mean', 'std'],
    # ['mean', 'max'],
    # ['mean', 'min'],
    # ['std', 'max'],
    # ['std', 'min'],
    # ['max', 'min'],
    ['mean', 'std', 'max', 'min']
]

# Define models and hyperparameters
models_hyperparameters = [
    # (LinearRegression, {}),
    # (SVR, {
    #     'kernel': ['linear', 'rbf'],
    #     'C': [0.1, 1, 10],
    #     'epsilon': [0.1, 1],
    #     'gamma': ['scale', 'auto'],  # Only used for 'rbf' kernel
    # }),
    # (RandomForestRegressor, {
    #     'n_estimators': [50, 100],
    #     'max_depth': [10, None],
    #     'min_samples_split': [2, 5],
        # 'min_samples_leaf': [1, 2],
        # 'max_features': ['auto', 'sqrt'],
    # }),
    (XGBRegressor, {
        'n_estimators':[50],#[50, 100],
        'max_depth':[6] ,#[6, 10],
        'learning_rate': [0.01, 0.1],
        'subsample': [0.5, 0.8],
        # 'colsample_bytree': [0.5, 0.8],
        # 'reg_alpha': [0, 0.1],
        # 'reg_lambda': [0.1, 1],
    }),
]

windows = [10000, 5000, 2000, 1000, 500]

# Define a function to process a single hyperparameter set
def process_hp_set(hp_set, model, hyperparameters, iter_aggregate, splits, windows, df_physiology_video, df_annotations_video):
    hp_dict = dict(zip(hyperparameters.keys(), hp_set))
    model_name = model.__name__

    print(f"Testing model: {model_name} with hyperparameters: {hp_dict} and aggregate: {iter_aggregate}")

    rmses = []
    
    for window in windows: 
        for split in splits:
            # print(split)

            X_train, X_test, y_train, y_test, numeric_column_indices, categorical_column_indices = preprocess(
                df_physiology_video.copy(), df_annotations_video.copy(), split=split, predictions_cols=['arousal', 'valence'], aggregate=iter_aggregate,
                window_duration=window, resample_rate=100)

            rmse = time_series_cross_validation_with_hyperparameters(
                X_train, X_test, y_train, y_test, model, hp_dict, n_jobs=1,
                numeric_column_indices=numeric_column_indices,
                categorical_column_indices=categorical_column_indices)

            rmses.append(rmse)

        average_rmse = np.mean(rmses, axis=0)

        if y_train.ndim > 1 and y_train.shape[1] > 1:
            # Unpack the average_rmse array into separate keys in the result dictionary
            result = {
                'model': model_name,
                'hyperparameters': hp_dict,
                'aggregate': iter_aggregate,
                'average_rmse_arousal': average_rmse[0],
                'average_rmse_valence': average_rmse[1],
                'window': window
            }
        else:
            result = {
                'model': model_name,
                'hyperparameters': hp_dict,
                'aggregate': iter_aggregate,
                'average_rmse': average_rmse,
                'window': window
            }

        return result

# Initialize an empty DataFrame for the best results and a dictionary for all results
best_results_df = pd.DataFrame()
all_results = {}

# Wrap the outer loop with tqdm
for video in tqdm(videos[1:], desc="Processing videos", unit="video", bar_format="{l_bar}%s{bar}%s{r_bar}" % ('\033[32m', '\033[0m')):
    print(f"Processing video: {video}")

    df_physiology_video = df_physiology.loc[df_physiology.video == video]
    df_annotations_video = df_annotations.loc[df_annotations.video == video]

    results = []

    for iter_aggregate in aggregate_combinations:
        # Preprocess data

        for model, hyperparameters in models_hyperparameters:

            # Use ProcessPoolExecutor to parallelize the loop
            with ThreadPoolExecutor(max_workers=num_cpu_cores-1) as executor:
                # Prepare the list of arguments for each task
                tasks = [
                    (hp_set, model, hyperparameters, iter_aggregate, splits, windows, df_physiology_video, df_annotations_video)
                    for hp_set in itertools.product(*hyperparameters.values())
                ]
                
                # Create a progress bar for ThreadPoolExecutor
                progress = tqdm(total=len(tasks), desc=f"Processing hyperparameters (threads: {threading.active_count()})", unit="hp", leave=False)

                def callback(future):
                    progress.update(1)
                    result = future.result()
                    results.append(result)

                    # Update the all_results dictionary
                    all_results[f"{video}"] = results

                    # Save all_results as JSON
                    with open('../results/scenario_2/clean_all_results_shallow_models_windows.json', 'w') as f:
                        json.dump(all_results, f, default=str, indent=4)


                # Submit the tasks to the executor and add the callback
                futures = [executor.submit(process_hp_set, *task_args) for task_args in tasks]
                for future in futures:
                    future.add_done_callback(callback)
                # Close the progress bar
                progress.close()
                    

    results_df = pd.DataFrame(results)

    # if y_train.ndim > 1 and y_train.shape[1] > 1:
    # Find the best result for each output variable
    best_result_output_1 = results_df.loc[results_df['average_rmse_arousal'].idxmin()]
    best_result_output_2 = results_df.loc[results_df['average_rmse_valence'].idxmin()]

    # Concatenate the best results for each output variable to the best_results_df
    best_results_df =pd.concat([best_results_df, best_result_output_1.to_frame().T, best_result_output_2.to_frame().T], ignore_index=True)
    # else:
    #     best_result = results_df.loc[results_df['average_rmse'].idxmin()]

    #     best_results_df = best_results_df.append(best_result, ignore_index=True)

    # Save best_results_df as CSV
    best_results_df.to_csv('../results/scenario_2/clean_shallow_models_best_result_windows.csv', index=False)

print("\nThe best combination of features and hyperparameters for each file pair is:")
print(best_results_df)



Processing videos:   0%|[32m          [0m| 0/7 [00:00<?, ?video/s]

Processing video: 10
