# General Model Generation

General models are generated in this file. For $n$ subjects, the models use $n - 3$ of the subjects' data for training, and $3$ for testing.

The total number of combinations of the $n - 3$ training subjects and $3$ testing subjects is $n \choose 3$. For each combination of $n - 3$ training subjects and 3 testing subjects, 18 models are generated.

For each combination, the cross-validation results of the trained models are saved to `train_general.csv`, and the results of running those models on the 3 held-out test subjects are saved to `test_general.csv`. The feature importance data for each model is saved to `feature_importance_general.csv`. All three CSV files are written to the directory `outputs/csv/general/`.

In [1]:
import csv
import os
from itertools import combinations

import pandas as pd
from pycaret.datasets import get_data
from pycaret.regression import *

In [None]:
%%capture
data_dir = '../../data'
outputs_dir = '../../outputs/csv/general'
df = read(f'{data_dir}/initial_features')

COMBINATIONS = 3

combinations_list = list(combinations(df['subject'].unique(), COMBINATIONS))

## Cross Validation Results

In [None]:
# Column layouts of the three output tables. They are used to order the columns
# and to produce a correctly shaped (empty) table when there is nothing to stack.
TRAIN_COLUMNS = ["Model", "MAE", "MSE", "RMSE", "R2", "RMSLE", "MAPE", "TT (Sec)", "test_set"]
TEST_COLUMNS = ["Model", "MAE", "MSE", "RMSE", "R2", "RMSLE", "MAPE", "test_set"]
IMPORTANCE_COLUMNS = ["Feature", "Value", "Model", "test_set"]


def _feature_importance(model):
    """Return a Feature/Value table for ``model``, sorted by descending value.

    The absolute values of ``model.feature_importances_`` are used when that
    attribute is available, otherwise the absolute values of ``model.coef_``.
    If neither can be turned into a table, a single placeholder row
    (``Feature='error'``, ``Value=0``) is returned so the model still shows up
    in the output.
    """
    for attribute in ('feature_importances_', 'coef_'):
        try:
            return (
                pd.DataFrame({
                    'Feature': get_config('X_train').columns,
                    'Value': abs(getattr(model, attribute)),
                })
                .sort_values(by='Value', ascending=False)
                .reset_index(drop=True)
            )
        except Exception:
            # Best effort: try the next attribute. `Exception` (rather than a
            # bare `except`) lets KeyboardInterrupt stop this long-running cell.
            continue
    return pd.DataFrame({'Feature': ['error'], 'Value': [0]})


def _stack_results(frames, columns):
    """Concatenate ``frames`` once, placing ``columns`` first and any extra columns after them."""
    if not frames:
        return pd.DataFrame(columns=columns)
    stacked = pd.concat(frames, ignore_index=True)
    extra_columns = [name for name in stacked.columns if name not in columns]
    return stacked.reindex(columns=list(columns) + extra_columns)


# Fail fast: make sure the output directory exists before the long-running loop,
# instead of discovering a missing directory only when the results are written.
os.makedirs(outputs_dir, exist_ok=True)

# Per-combination results are collected in lists and stacked once after the loop.
# If the cell is interrupted, the partial results are still available in these lists.
train_frames = []
test_frames = []
importance_frames = []

# NOTE(review): `setup` is called without `session_id`, so every run uses a
# different random seed -- consider pinning one for reproducibility.
for sub in combinations_list:
    test_set = str(sub)

    # Split data into training and testing based on subject
    held_out = df['subject'].isin(sub)
    train = df[~held_out]
    test = df[held_out]

    reg = setup(data=train, target='rpe', ignore_features=['experimental_condition', 'subject', 'wrist_acc_time'])
    best = compare_models(sort='MAE', n_select=18)
    if not isinstance(best, list):
        # `compare_models` may hand back a bare estimator instead of a list.
        best = [best]

    # Results table of the models compared on the training subjects
    train_frames.append(pull().assign(test_set=test_set))

    for model in best:
        # Run the model on the held-out subjects; the metrics table is then
        # fetched with `pull()`, so the returned predictions are not kept.
        predict_model(model, data=test, verbose=False)
        test_frames.append(pull().assign(test_set=test_set))

        # Feature importance of this model
        importance = _feature_importance(model)
        importance['Model'] = str(model)
        importance['test_set'] = test_set
        importance_frames.append(importance)

train_general_df = _stack_results(train_frames, TRAIN_COLUMNS)
test_general_df = _stack_results(test_frames, TEST_COLUMNS)
feature_importance_general_df = _stack_results(importance_frames, IMPORTANCE_COLUMNS)

# Save all results to csv
train_general_df.to_csv(f'{outputs_dir}/train_general.csv', index=False)
test_general_df.to_csv(f'{outputs_dir}/test_general.csv', index=False)
feature_importance_general_df.to_csv(f'{outputs_dir}/feature_importance_general.csv', index=False)

Unnamed: 0,Description,Value
0,Session id,2718
1,Target,rpe
2,Target type,Regression
3,Original data shape,"(2109, 61)"
4,Transformed data shape,"(2109, 58)"
5,Transformed train set shape,"(1476, 58)"
6,Transformed test set shape,"(633, 58)"
7,Ignore features,3
8,Numeric features,57
9,Preprocess,True


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)


Processing:   0%|          | 0/102 [00:00<?, ?it/s]

KeyboardInterrupt: 