In [2]:
import glob
import re
import json
import itertools
import pandas as pd
from pathlib import Path

from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
import joblib
from sklearn.model_selection import TimeSeriesSplit

# all the functions from helpers.py
from helpers_scenario2 import *

In [9]:
# Input locations for scenario 2 / fold 0 (training portion).
# Annotations are read from the raw data tree; physiology is read from the
# "cleaned_and_prepro_improved" preprocessed tree.
annotations_folder = '../data/raw/scenario_2/scenario_2/fold_0/train/annotations/'
physiology_folder = "../data/preprocessed/cleaned_and_prepro_improved/scenario_2/fold_0/train/physiology/"

# Each helper call loads the CSVs found in a folder into a single DataFrame
# (helper comes from helpers_scenario2; exact concatenation rules live there).
df_physiology = load_read_and_append_csvs(physiology_folder)
df_annotations = load_read_and_append_csvs(annotations_folder)

# Distinct video and subject identifiers present in the physiology data.
videos = df_physiology["video"].unique()
subjects = df_physiology["subject"].unique()

# Partition the subjects into train/test splits; the second argument (3) is
# passed straight to the helper -- presumably the number of splits, TODO confirm.
splits = split_subjects_train_test(subjects, 3)

In [10]:
from concurrent.futures import ProcessPoolExecutor, as_completed, ThreadPoolExecutor
import threading
import multiprocessing
from tqdm import tqdm
import itertools


# CPU count as reported by multiprocessing; used below to size the worker pool.
num_cpu_cores = multiprocessing.cpu_count()

# Aggregate-feature combinations to evaluate.
# Disabled alternatives kept for reference:
#   ['enlarged'], ['mean'], ['std'], ['max'],
#   ['mean', 'std'], ['mean', 'max'], ['mean', 'min'],
#   ['std', 'max'], ['std', 'min'], ['max', 'min']
aggregate_combinations = [
    ['min'],
    ['mean', 'std', 'max', 'min'],
]

# (estimator class, hyperparameter grid) pairs. Each grid maps a constructor
# argument name to the list of values to try; the search below takes the
# Cartesian product of those lists.
# Disabled alternatives kept for reference:
#   LinearRegression with an empty grid
#   SVR: kernel ['linear', 'rbf'], C [0.1, 1, 10], epsilon [0.1, 1],
#        gamma ['scale', 'auto'] (only used for the 'rbf' kernel)
#   RandomForestRegressor extras: min_samples_leaf [1, 2], max_features ['auto', 'sqrt']
#   XGBRegressor extras: learning_rate [0.01, 0.1], subsample [0.5, 0.8],
#        colsample_bytree [0.5, 0.8], reg_alpha [0, 0.1], reg_lambda [0.1, 1]
models_hyperparameters = [
    (
        RandomForestRegressor,
        dict(
            n_estimators=[50, 100],
            max_depth=[10, None],
            min_samples_split=[2, 5],
        ),
    ),
    (
        XGBRegressor,
        dict(
            n_estimators=[50, 100],
            max_depth=[6, 10],
        ),
    ),
]

# Define a function to process a single hyperparameter set
def process_hp_set(hp_set, model, hyperparameters, iter_aggregate, splits, df_physiology_video, df_annotations_video):
    """Evaluate one hyperparameter combination on every subject split.

    Parameters
    ----------
    hp_set : sequence
        One value per key of ``hyperparameters``, in the same order.
    model : type
        Estimator class; only its ``__name__`` is used here, the class itself
        is forwarded to ``time_series_cross_validation_with_hyperparameters``.
    hyperparameters : dict
        Hyperparameter grid; only its keys are used, to label ``hp_set``.
    iter_aggregate : list
        Aggregate selection forwarded to ``preprocess`` as ``aggregate``.
    splits : iterable
        Subject splits; each element is forwarded to ``preprocess`` as ``split``.
    df_physiology_video, df_annotations_video : pandas.DataFrame
        Data for a single video. Copies are passed to ``preprocess`` so the
        caller's frames are never handed over directly.

    Returns
    -------
    dict
        ``model``, ``hyperparameters`` and ``aggregate`` plus either
        ``average_rmse_arousal`` / ``average_rmse_valence`` (when the targets
        have more than one column) or a single ``average_rmse``.

    Raises
    ------
    ValueError
        If ``splits`` yields no elements, since no score can be computed.
    """
    hp_dict = dict(zip(hyperparameters.keys(), hp_set))
    model_name = model.__name__

    print(f"Testing model: {model_name} with hyperparameters: {hp_dict} and aggregate: {iter_aggregate}")

    rmses = []
    y_train = None
    for split in splits:
        X_train, X_test, y_train, y_test, numeric_column_indices, categorical_column_indices = preprocess(
            df_physiology_video.copy(), df_annotations_video.copy(), split=split,
            predictions_cols=['arousal', 'valence'], aggregate=iter_aggregate,
            window_duration=10000, resample_rate=100)

        # n_jobs=1: parallelism is handled by the caller, one task per hyperparameter set.
        rmse = time_series_cross_validation_with_hyperparameters(
            X_train, X_test, y_train, y_test, model, hp_dict, n_jobs=1,
            numeric_column_indices=numeric_column_indices,
            categorical_column_indices=categorical_column_indices)

        rmses.append(rmse)

    # Without this guard an empty ``splits`` would average an empty list (NaN plus
    # a warning) and then fail below with an unrelated error on ``y_train``.
    if not rmses:
        raise ValueError("process_hp_set requires at least one split, got none")

    # Average per output column across splits.
    average_rmse = np.mean(rmses, axis=0)

    result = {
        'model': model_name,
        'hyperparameters': hp_dict,
        'aggregate': iter_aggregate,
    }

    # The target layout of the last split decides how the score is reported.
    if y_train.ndim > 1 and y_train.shape[1] > 1:
        # Unpack the per-target averages into separate keys.
        result['average_rmse_arousal'] = average_rmse[0]
        result['average_rmse_valence'] = average_rmse[1]
    else:
        result['average_rmse'] = average_rmse

    return result

# Grid search over videos x aggregate combinations x models x hyperparameter sets.
# Outputs: a JSON file with every evaluated configuration per video, and a CSV
# with the best configuration per video for arousal and for valence.

# Initialize an empty DataFrame for the best results and a dictionary for all results
best_results_df = pd.DataFrame()
all_results = {}

all_results_path = '../results/scenario_2/clean_all_results_shallow_models.json'
best_results_path = '../results/scenario_2/clean_shallow_models_best_result.csv'

# Leave one core free, but never ask for fewer than one worker:
# ThreadPoolExecutor raises ValueError when max_workers <= 0 (single-core hosts).
max_workers = max(1, num_cpu_cores - 1)

# Wrap the outer loop with tqdm
for video in tqdm(videos, desc="Processing videos", unit="video", bar_format="{l_bar}%s{bar}%s{r_bar}" % ('\033[32m', '\033[0m')):
    print(f"Processing video: {video}")

    df_physiology_video = df_physiology.loc[df_physiology.video == video]
    df_annotations_video = df_annotations.loc[df_annotations.video == video]

    results = []

    for iter_aggregate in aggregate_combinations:

        for model, hyperparameters in models_hyperparameters:

            # One task per element of the Cartesian product of the grid values.
            tasks = [
                (hp_set, model, hyperparameters, iter_aggregate, splits, df_physiology_video, df_annotations_video)
                for hp_set in itertools.product(*hyperparameters.values())
            ]

            # Run the hyperparameter sets on a thread pool. Both context managers
            # guarantee cleanup: the progress bar is closed only after every
            # future has been consumed, then the pool is shut down.
            with ThreadPoolExecutor(max_workers=max_workers) as executor, \
                    tqdm(total=len(tasks), desc=f"Processing hyperparameters (threads: {threading.active_count()})", unit="hp", leave=False) as progress:

                futures = [executor.submit(process_hp_set, *task_args) for task_args in tasks]

                # Consume results in the main thread as they finish. This keeps
                # the list append and the JSON rewrite serialized (done-callbacks
                # would run them concurrently on worker threads) and lets a task
                # failure propagate instead of being swallowed by the callback
                # machinery.
                for future in as_completed(futures):
                    progress.update(1)
                    results.append(future.result())

                    # Update the all_results dictionary
                    all_results[f"{video}"] = results

                    # Checkpoint: save all_results as JSON after every finished task
                    with open(all_results_path, 'w') as f:
                        json.dump(all_results, f, default=str, indent=4)

    results_df = pd.DataFrame(results)

    # Find the best result for each output variable.
    # NOTE: this expects the two-target result layout produced by process_hp_set
    # ('average_rmse_arousal' / 'average_rmse_valence'); a single-target run
    # would need the 'average_rmse' column instead.
    best_result_output_1 = results_df.loc[results_df['average_rmse_arousal'].idxmin()]
    best_result_output_2 = results_df.loc[results_df['average_rmse_valence'].idxmin()]

    # Concatenate the best results for each output variable to the best_results_df
    best_results_df = pd.concat([best_results_df, best_result_output_1.to_frame().T, best_result_output_2.to_frame().T], ignore_index=True)

    all_results[f"{video}"] = results_df.to_dict(orient='records')

    # Save all_results as JSON
    with open(all_results_path, 'w') as f:
        json.dump(all_results, f, default=str, indent=4)

    # Save best_results_df as CSV (rewritten after every video so partial runs are kept)
    best_results_df.to_csv(best_results_path, index=False)

print("\nThe best combination of features and hyperparameters for each file pair is:")
print(best_results_df)



Processing videos:   0%|[32m          [0m| 0/8 [00:00<?, ?video/s]

Processing video: 9




Testing model: RandomForestRegressor with hyperparameters: {'n_estimators': 50, 'max_depth': 10, 'min_samples_split': 2} and aggregate: ['min']Testing model: RandomForestRegressor with hyperparameters: {'n_estimators': 50, 'max_depth': 10, 'min_samples_split': 5} and aggregate: ['min']
Testing model: RandomForestRegressor with hyperparameters: {'n_estimators': 50, 'max_depth': None, 'min_samples_split': 2} and aggregate: ['min']

Testing model: RandomForestRegressor with hyperparameters: {'n_estimators': 50, 'max_depth': None, 'min_samples_split': 5} and aggregate: ['min']
Testing model: RandomForestRegressor with hyperparameters: {'n_estimators': 100, 'max_depth': 10, 'min_samples_split': 2} and aggregate: ['min']
Testing model: RandomForestRegressor with hyperparameters: {'n_estimators': 100, 'max_depth': 10, 'min_samples_split': 5} and aggregate: ['min']
Testing model: RandomForestRegressor with hyperparameters: {'n_estimators': 100, 'max_depth': None, 'min_samples_split': 2} and ag