In [2]:
import pandas as pd
from pathlib import Path


from concurrent.futures import ProcessPoolExecutor, as_completed, ThreadPoolExecutor
import joblib
from joblib import Parallel, delayed, parallel_backend
import multiprocessing
from multiprocessing import Value
from tqdm.auto import tqdm
import itertools


from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
import joblib
from sklearn.model_selection import TimeSeriesSplit

# all the functions from helpers.py
from helpers_scenario2 import *

In [6]:
# Root of the preprocessed scenario-2 data and the fold this notebook works on.
scenario_folder = "../data/preprocessed/cleaned_and_prepro_improved/scenario_2/"
fold = 0

# Folders that process_files later hands to save_files for the generated X / y files.
phys_folder, ann_folder = create_folder_structure(scenario_folder, fold)

# Input locations are derived from `fold` / `scenario_folder`, so switching the
# fold above is enough; with fold = 0 they equal the previously hard-coded paths.
annotations_folder = f'../data/raw/scenario_2/fold_{fold}/train/annotations/'
physiology_folder = f"{scenario_folder}fold_{fold}/train/physiology/"
zipped_files = zip_csv_files(annotations_folder, physiology_folder)

# This cell has always passed the physiology folder without its trailing slash.
subjects, videos = get_subs_vids(physiology_folder.rstrip('/'))

# NOTE(review): cat_dict is not referenced in the visible cells — confirm it is still needed.
cat_dict = {1: [16, 20], 2: [0, 3], 3: [10, 22], 4: [4, 21]}

# NOTE(review): the meaning of the second argument (3) is defined in helpers_scenario2.
splits = split_subjects_train_test(subjects, 3)

In [4]:
# CPU count reported by multiprocessing; later cells size their worker pools from it.
num_cpu_cores = multiprocessing.cpu_count()

# Aggregate-metric combinations. Only the full four-metric set is enabled; the
# disabled alternatives are kept for reference.
aggregate_combinations = [
    # ['enlarged'],
    # ['mean'], ['std'], ['max'], ['min'],
    # ['mean', 'std'], ['mean', 'max'], ['mean', 'min'],
    # ['std', 'max'], ['std', 'min'], ['max', 'min'],
    ['mean', 'std', 'max', 'min'],
]

# (estimator class, hyperparameter grid) pairs. Every grid maps a parameter name
# to the list of candidate values; process_video expands it with itertools.product.
models_hyperparameters = [
    # (LinearRegression, dict()),
    # (SVR, dict(
    #     kernel=['linear', 'rbf'],
    #     C=[0.1, 1, 10],
    #     epsilon=[0.1, 1],
    #     gamma=['scale', 'auto'],  # only used for the 'rbf' kernel
    # )),
    (RandomForestRegressor, dict(
        n_estimators=[100],      # alternative grid: [50, 100]
        max_depth=[None],        # alternative grid: [10, None]
        min_samples_split=[5],   # alternative grid: [2, 5]
        min_samples_leaf=[1],
    )),
    # (XGBRegressor, dict(
    #     n_estimators=[50, 100],
    #     max_depth=[6, 10],
    #     learning_rate=[0.01, 0.1],
    #     subsample=[0.5, 0.8],
    #     # colsample_bytree=[0.5, 0.8],
    #     # reg_alpha=[0, 0.1],
    #     # reg_lambda=[0.1, 1],
    # )),
]

# Symmetric [-w, +w] windows and the partition counts explored by the grid search.
windows = [[-half_width, half_width] for half_width in (10000, 5000, 2000)]
partitions = [1, 2, 3, 5]

# Accumulators filled by the grid-search cell: the best rows and every raw result.
best_results_df = pd.DataFrame()
all_results = dict()


def process_files(annotation_file, physiology_file, window=None, partition=None, aggregate=('mean', 'min')):
    """Preprocess one annotation/physiology CSV pair and save the resulting X / y.

    NOTE: this notebook defines ``process_files`` in more than one cell; the most
    recently executed definition is the one in effect.

    Parameters
    ----------
    annotation_file, physiology_file : path-like
        CSV files read with ``pd.read_csv``.
    window : list, optional
        Forwarded to ``preprocess`` as ``window``. Defaults to ``[-10000, 5000]``,
        the value that used to be hard-coded here.
    partition : optional
        Forwarded to ``preprocess`` as ``partition`` only when given, so a call
        without it issues exactly the same ``preprocess`` call as before.
    aggregate : sequence of str or None, optional
        Forwarded to ``preprocess`` as ``aggregate`` (as a list). Defaults to
        ``['mean', 'min']``, the value that used to be hard-coded here.

    Returns
    -------
    None
        The outputs are written by ``save_files`` into the module-level
        ``phys_folder`` / ``ann_folder``.
    """
    df_annotations = pd.read_csv(annotation_file)
    df_physiology = pd.read_csv(physiology_file)

    preprocess_kwargs = {
        'predictions_cols': ['arousal', 'valence'],
        'aggregate': list(aggregate) if aggregate is not None else None,
        'window': [-10000, 5000] if window is None else window,
    }
    if partition is not None:
        preprocess_kwargs['partition'] = partition

    # The third value returned by preprocess (numeric column indices) is not needed here.
    X, y, _ = preprocess(df_physiology, df_annotations, **preprocess_kwargs)

    save_files(X, y, annotation_file, phys_folder, ann_folder)

    return None

def process_video(subject, models_hyperparameters, splits, phys_folder, ann_folder, window, partition_window, ):
    """Evaluate every model / hyperparameter combination for one data key.

    For each ``(model, grid)`` pair the grid is expanded with
    ``itertools.product``; each combination is scored on every entry of
    ``splits`` and the per-split RMSEs are averaged.

    Parameters
    ----------
    subject
        Key forwarded to ``load_and_concatenate_files`` to select the files to
        load (the grid-search cell passes a video id here).
    models_hyperparameters : iterable of (class, dict)
        Estimator class and its grid (parameter name -> list of candidates).
    splits : iterable
        Train/test splits, each forwarded to ``load_and_concatenate_files``.
    phys_folder, ann_folder
        Folders holding the feature (X) and annotation (y) files.
    window, partition_window
        Only recorded in the result rows; they do not affect the computation here.

    Returns
    -------
    list of dict
        One row per hyperparameter combination. Each row stores the combination
        that was actually evaluated under ``'hyperparameters'``.

    Raises
    ------
    ValueError
        If ``splits`` yields no split, since there is nothing to average.
    """
    results = []

    for model, hyperparameters in models_hyperparameters:
        model_name = model.__name__
        param_names = list(hyperparameters.keys())

        for hp_set in itertools.product(*hyperparameters.values()):
            hp_dict = dict(zip(param_names, hp_set))

            rmses = []
            multi_output = False
            for split in splits:
                X_train, X_test = load_and_concatenate_files(phys_folder, split, subject)
                y_train, y_test = load_and_concatenate_files(ann_folder, split, subject)

                rmse = time_series_cross_validation_with_hyperparameters(
                    X_train, X_test, y_train, y_test, model, hp_dict, n_jobs=-1,
                    numeric_column_indices=np.arange(X_train.shape[1]))
                rmses.append(rmse)

                # As before, the last split decides whether there are several targets.
                multi_output = y_train.ndim > 1 and y_train.shape[1] > 1

            if not rmses:
                raise ValueError("process_video needs at least one train/test split")

            average_rmse = np.mean(rmses, axis=0)
            print(f"Average Root Mean Squared Error per output: {average_rmse}. ")

            result = {
                'model': model_name,
                # Record the evaluated combination, not the whole grid, so that
                # each row can be traced back to the settings that produced it.
                'hyperparameters': hp_dict,
                # NOTE(review): hard-coded label; it is not derived from the data loaded above.
                'aggregate': ['mean', 'min'],
            }
            if multi_output:
                # One RMSE per target column: arousal first, then valence.
                result['average_rmse_arousal'] = average_rmse[0]
                result['average_rmse_valence'] = average_rmse[1]
            else:
                result['average_rmse'] = average_rmse
            result['window'] = window
            result['partition_window'] = partition_window

            results.append(result)

    return results
        
 

In [None]:
# Root of the preprocessed scenario-2 data and the fold this cell works on.
scenario_folder = "../data/preprocessed/cleaned_and_prepro_improved/scenario_2/"
fold = 0

# Folders that process_files hands to save_files for the generated X / y files.
phys_folder, ann_folder = create_folder_structure(scenario_folder, fold)

# Input locations are derived from `fold` / `scenario_folder`, so switching the
# fold above is enough; with fold = 0 they equal the previously hard-coded paths.
annotations_folder = f'../data/raw/scenario_2/fold_{fold}/train/annotations/'
physiology_folder = f"{scenario_folder}fold_{fold}/train/physiology/"
zipped_files = zip_csv_files(annotations_folder, physiology_folder)

subjects, videos = get_subs_vids(physiology_folder)

# NOTE(review): the meaning of the second argument (3) is defined in helpers_scenario2.
splits = split_subjects_train_test(subjects, 3)

#####################

def process_files(annotation_file, physiology_file, window=None, partition=1, aggregate=None):
    """Preprocess one annotation/physiology CSV pair and save the resulting X / y.

    NOTE: this notebook defines ``process_files`` in more than one cell; the most
    recently executed definition is the one in effect.

    Parameters
    ----------
    annotation_file, physiology_file : path-like
        CSV files read with ``pd.read_csv``.
    window : list, optional
        Forwarded to ``preprocess`` as ``window``. Defaults to ``[-10000, 5000]``,
        the value that used to be hard-coded here.
    partition : optional
        Forwarded to ``preprocess`` as ``partition``. Defaults to ``1``, the
        value that used to be hard-coded here.
    aggregate : optional
        Forwarded unchanged to ``preprocess`` as ``aggregate``. Defaults to
        ``None``, the value that used to be hard-coded here.

    Returns
    -------
    None
        The outputs are written by ``save_files`` into the module-level
        ``phys_folder`` / ``ann_folder``.
    """
    df_annotations = pd.read_csv(annotation_file)
    df_physiology = pd.read_csv(physiology_file)

    # The third value returned by preprocess (numeric column indices) is not needed here.
    X, y, _ = preprocess(
        df_physiology,
        df_annotations,
        predictions_cols=['arousal', 'valence'],
        aggregate=aggregate,
        window=[-10000, 5000] if window is None else window,
        partition=partition,
    )

    save_files(X, y, annotation_file, phys_folder, ann_folder)

    return None
     

# Preprocess every (annotation, physiology) pair on half of the available cores.
# tqdm_joblib comes from the helpers import; presumably it ties the tqdm bar to
# joblib task completion — confirm in helpers_scenario2.
with parallel_backend('multiprocessing', n_jobs=num_cpu_cores //2), \
        tqdm_joblib(tqdm(total=len(zipped_files), desc="Files", leave=False)) as progress_bar:
    results = Parallel()(
        delayed(process_files)(annotation_path, physiology_path)
        for annotation_path, physiology_path in zipped_files
    )

################## For the model ################

# Load the train/test features and annotations of video '0' for each split.
# Every iteration overwrites the previous one, so only the last split's data
# remains bound once the loop has finished.
video = '0'
for split in splits:
    X_train, X_test = load_and_concatenate_files(phys_folder, split, video)
    y_train, y_test = load_and_concatenate_files(ann_folder, split, video)

In [7]:
# Start from a clean slate so that re-running this cell does not duplicate entries.
all_results = {}
best_frames = []

# Optional: resume from a previous run instead of starting empty.
# with open(r'../results/scenario_2/clean_all_results_shallow_models_window.json', 'r') as f:
#     all_results = json.load(f)

all_results_path = '../results/scenario_2/clean_all_results_shallow_models_window.json'
best_results_path = '../results/scenario_2/clean_shallow_models_best_result_window.csv'

# joblib rejects n_jobs=0, which the raw arithmetic yields on small machines
# (1 core for the files pool, 5 cores for the videos pool); never go below 1 worker.
file_workers = max(1, num_cpu_cores // 2)
video_workers = max(1, num_cpu_cores - 5)

for window in tqdm(windows, desc="Windows"):
    # Flat list of result rows for this window, across all partitions and videos.
    window_records = []

    # NOTE(review): partitions[1:] skips the first partition value — confirm this is intended.
    for partition_window in tqdm(partitions[1:], desc="Partitions", leave=False):

        zipped_files = zip_csv_files(annotations_folder, physiology_folder)

        # NOTE(review): process_files is called without `window` / `partition_window`,
        # so this step regenerates the same features on every iteration of the grid.
        # process_files returns None, so its output is not kept.
        with parallel_backend('multiprocessing', n_jobs=file_workers):
            with tqdm_joblib(tqdm(total=len(zipped_files), desc="Files", leave=False)) as progress_bar:
                Parallel()(
                    (delayed(process_files)(ann_file, phys_file) for ann_file, phys_file in zipped_files)
                )

        with parallel_backend('multiprocessing', n_jobs=video_workers):
            with tqdm_joblib(tqdm(total=len(videos), desc="Videos", leave=False)) as progress_bar:
                all_video_results = Parallel()(
                    (delayed(process_video)(video, models_hyperparameters, splits, phys_folder, ann_folder, window, partition_window)
                     for video in videos)
                )

        # Results were computed per video, so they are keyed by video (pairing them
        # with `subjects` mislabels them and can index past the end of the list).
        # Rows are appended rather than overwritten so that every
        # window / partition combination is kept.
        for video, video_results in zip(videos, all_video_results):
            all_results.setdefault(f"{video}", []).extend(video_results)
            window_records.extend({'video': video, **row} for row in video_results)

        # One write per combination is enough to checkpoint progress.
        with open(all_results_path, 'w') as f:
            json.dump(all_results, f, default=str, indent=4)

    if not window_records:
        # No video produced a result for this window; nothing to rank.
        continue

    # Rank the model results (process_video output) for this window.
    results_df = pd.DataFrame(window_records)
    best_arousal_idx = results_df['average_rmse_arousal'].idxmin()
    best_valence_idx = results_df['average_rmse_valence'].idxmin()
    best_frames.append(results_df.loc[[best_arousal_idx, best_valence_idx]])

    # Rebuild the summary from the collected rows and checkpoint it as CSV.
    best_results_df = pd.concat(best_frames, ignore_index=True)
    best_results_df.to_csv(best_results_path, index=False)

print("\nThe best combination of features and hyperparameters for each file pair is:")
print(best_results_df)



Windows:   0%|          | 0/3 [00:00<?, ?it/s]

Partitions:   0%|          | 0/3 [00:00<?, ?it/s]

Files:   0%|          | 0/192 [00:00<?, ?it/s]