# Classification Model Generation

Classification models are generated in this file. The data used by these models is constructed in the "Classification Dataset" section of `data_construction.ipynb`.

For $n$ subjects, the models use $n - 3$ of the subjects' data for training, and $3$ for testing.

The total number of ways to split the subjects into $n - 3$ training subjects and $3$ testing subjects is $\binom{n}{3}$. For each such split, 18 models are generated.

The models are generated and their results are saved in CSV files: `train_classif.csv` for the training results and `test_classif.csv` for the testing results. The feature importance data is also saved for each model, in a CSV file called `feature_importance_classif.csv`. These files are in the directory `outputs/classification/`.

In [1]:
from pandas import read_csv as read
from pycaret.classification import *
from itertools import combinations
import csv
import pandas as pd

In [None]:
%%capture
# Where the input features are read from and where the result tables are written.
data_dir = '../../data'
outputs_dir = '../../outputs/classification'

# How many subjects are held out as testing data for each split.
# Should be set to 3 to match the data used in results.
COMBINATIONS = 3

# Feature table with one 'subject' column identifying who each row belongs to.
df = read(f'{data_dir}/initial_features_classification')

# Every possible group of COMBINATIONS held-out subjects, as a list of tuples.
combinations_list = [*combinations(df['subject'].unique(), COMBINATIONS)]

In [None]:
def _feature_importance(model, feature_names):
    """Return a table ranking ``feature_names`` by absolute importance.

    The importance values are taken from ``model.feature_importances_`` when
    available, otherwise from ``model.coef_``. The result has the columns
    'Feature' and 'Value', sorted by 'Value' in descending order.

    If neither attribute yields a usable table, a single placeholder row
    ``('error', 0)`` is returned so the surrounding run is never aborted.
    """
    for attribute in ('feature_importances_', 'coef_'):
        try:
            values = abs(getattr(model, attribute))
            # Classifiers may expose coefficients as a 2-D array with one row
            # per class. A DataFrame column must be 1-D, so collapse the class
            # axis by averaging the absolute coefficients of each feature.
            if getattr(values, 'ndim', 1) > 1:
                values = values.mean(axis=0)
            table = pd.DataFrame({'Feature': feature_names, 'Value': values})
        except Exception:
            # Deliberately best-effort: a model without this attribute, or with
            # values that do not line up with the features, falls through to
            # the next source. Unlike a bare ``except``, this still lets
            # KeyboardInterrupt / SystemExit stop a long run.
            continue
        return table.sort_values(by='Value', ascending=False).reset_index(drop=True)
    return pd.DataFrame({'Feature': ['error'], 'Value': [0]})


def _stack(frames):
    """Concatenate ``frames`` row-wise; return an empty DataFrame for no frames."""
    return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()


# Per-split result tables are collected in lists and concatenated once at the
# end, instead of re-copying an ever-growing DataFrame on every iteration.
train_frames = []
test_frames = []
importance_frames = []

for sub in combinations_list:  # EX: [(2, 3, 6)]
    # Split data into training and testing based on subject
    held_out = df['subject'].isin(sub)
    train = df[~held_out]
    test = df[held_out]

    clf = setup(data=train, target='rpe', ignore_features=['experimental_condition', 'subject', 'wrist_acc_time'], verbose=False)
    best = compare_models(sort='Accuracy', n_select=18)

    # Score grid of the model comparison that was just run
    train_metrics = pull()
    train_metrics['test_set'] = str(sub)
    train_frames.append(train_metrics)

    # The training feature columns are the same for every model of this split
    feature_names = get_config('X_train').columns

    for model in best:
        # Run models on test data; the metrics are retrieved through pull()
        predict_model(model, data=test, verbose=False)
        test_metrics = pull()
        test_metrics['Model'] = str(model).split('(')[0]
        test_metrics['test_set'] = str(sub)
        test_frames.append(test_metrics)

        # Add feature importance of model, labelled with the full model repr
        importance = _feature_importance(model, feature_names)
        importance['Model'] = str(model)
        importance['test_subjects'] = str(sub)
        importance_frames.append(importance)

train_general_df = _stack(train_frames)
test_general_df = _stack(test_frames)
feature_importance_general_df = _stack(importance_frames)

# Save dataframes to csv
train_general_df.to_csv(f'{outputs_dir}/train_classif.csv', index=False)
test_general_df.to_csv(f'{outputs_dir}/test_classif.csv', index=False)
feature_importance_general_df.to_csv(f'{outputs_dir}/feature_importance_classif.csv', index=False)