# Evaluating the CPAC_Insole_Errors datasets

- **Model**: Gradient Boosted Trees (histogram-based)
- **Target(s)**: `TO_Pelvis_Moment_X_Nm`, `TO_Pelvis_Moment_Y_Nm`, `TO_Pelvis_Moment_Z_Nm`
- **Features**: all, Single IMU (T8)
- **Results**: 
  - $R^2$ scores (one per held-out subject, by cross-validation)
  - feature importances (permutation-based, using the full dataset for training)
  - predictions (merged, by cross-validation)
- **Evaluation strategy**: cross-validation (leave one subject out)

In [1]:
# Reload imported modules automatically before each cell runs, so edits to
# local code such as `utils` take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Libraries

In [2]:
# Standard library
import warnings
import os


# Third party
import numpy as np
import pandas as pd
import sklearn
assert sklearn.__version__ >= "0.21", "Use the conda_python3_latest kernel!"
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn import (ensemble, metrics, preprocessing, 
                     pipeline, inspection, model_selection)

from IPython.display import display, Markdown


# Local
import utils

## Load Dataset

In [3]:
# Known datasets (name -> file extension). Keeping the pairs in one table
# means the file type can never get out of sync with the selected dataset.
DATASET_TYPES = {
    "Compiled_Yinsole_and_T8": "xlsx",
    "Compiled_SW_and_T8": "csv",
    "Unadjusted_InsoleY_compiled": "csv",
    "JustLifts_Compiled_SW_and_T8": "csv",
    "JustLifts_Compiled_Yinsole_and_T8": "xlsx",
    "JustLifts_Unadjusted_InsoleY_compiled": "csv",
}

# Dataset evaluated by this run; change only this line to switch datasets.
DATASET = "JustLifts_Unadjusted_InsoleY_compiled"
DATASET_TYPE = DATASET_TYPES[DATASET]

# Source location of the dataset and the directory the results are written to.
DATASET_S3 = f"s3://cpac/ORIG/CPAC_Insole_Errors/{DATASET}.{DATASET_TYPE}"
RESULTS_DIR = f"results/{DATASET}"


df_orig = utils.load_dataset(DATASET_S3)
df_orig.describe()

Unnamed: 0,M_Mass,M_Trial_Name,M_Mass_to_L5S1,M_sub_task_indices,M_sub_task_num,M_include_overall,M_Index,M_Sub,M_sub_task_num_overall,M_Index_overall,...,RWRO_03_04_00_00_INSOLE_LX_ML_mm,RWRO_03_04_00_00_INSOLE_LY_AP_mm,RWRO_01_02_00_00_INSOLE_RX_ML_mm,RWRO_01_02_00_00_INSOLE_RY_AP_mm,RWRF_03_00_00_00_INSOLE_LFORCE_BW,RWRF_01_00_00_00_INSOLE_RFORCE_BW,RWRF_03_04_00_00_INSOLE_LX_ML_BH,RWRF_03_04_00_00_INSOLE_LY_AP_BH,RWRF_01_02_00_00_INSOLE_RX_ML_BH,RWRF_01_02_00_00_INSOLE_RY_AP_BH
count,754140.0,754140.0,0.0,0.0,0.0,754140.0,754140.0,754140.0,754140.0,754140.0,...,754140.0,754140.0,754140.0,754140.0,754140.0,754140.0,754140.0,754140.0,754140.0,754140.0
mean,79.464447,9.852873,,,,0.882417,3550.709208,5.761006,0.0,126216.734765,...,51.508807,119.60916,46.925616,138.942242,0.490595,0.517201,0.02773,0.064395,0.025241,0.074794
std,16.967378,5.134464,,,,0.322114,2584.089654,2.865929,0.0,54239.72577,...,10.399163,55.220508,12.52775,56.781453,0.321761,0.33513,0.005661,0.029957,0.006773,0.031036
min,38.0,5.0,,,,0.0,1.0,2.0,0.0,17329.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,79.0,5.0,,,,1.0,1451.0,4.0,0.0,92837.0,...,47.36,72.55,42.97,97.12,0.195368,0.224205,0.025349,0.039056,0.022911,0.052021
50%,85.0,10.0,,,,1.0,3028.0,5.0,0.0,130544.0,...,52.99,113.16,50.33,140.61,0.475343,0.500101,0.028589,0.060573,0.026979,0.075169
75%,91.0,15.0,,,,1.0,5189.0,8.0,0.0,168251.0,...,57.87,164.17,54.8,182.55,0.761614,0.779576,0.031151,0.088073,0.029594,0.097547
max,96.0,23.0,,,,1.0,13069.0,10.0,0.0,236897.0,...,80.31,270.0,85.0,270.0,1.422158,1.439392,0.045118,0.151685,0.047191,0.151685


## Associate column names

In [4]:
def _get_columns_with_prefix(df, prefix):
    columns = []
    for column in df.columns:
        if column.startswith(prefix):
            columns.append(column)
    return columns
    
def get_target_names(df):
    """Return the names of the columns of ``df`` that start with ``"T_"``.

    NOTE(review): the targets evaluated later in this notebook start with
    ``"TO_"``, which this prefix does not match -- confirm the intended prefix.
    """
    target_prefix = "T_"
    return _get_columns_with_prefix(df, target_prefix)

def get_meta_names(df):
    """Return the names of the columns of ``df`` that start with ``"M_"``."""
    meta_prefix = "M_"
    return _get_columns_with_prefix(df, meta_prefix)

## Clean-up dataset

- Keep only the samples whose `M_include_overall` value is greater than 0

In [5]:
# Keep only the rows whose `M_include_overall` value is positive.
include_mask = df_orig["M_include_overall"] > 0
df = df_orig.loc[include_mask]

# Optional: restrict the analysis to a hand-picked list of subjects.
#df = df[df["M_Sub"].isin([2,4,5,6,7,8,9])]
#RESULTS_DIR += "_nowonky"

# Report how much data survives the clean-up.
n_trials, n_trials_orig = (
    len(frame["M_Trial_Name"].unique()) for frame in (df, df_orig)
)
n_subjects = len(df["M_Sub"].unique())

print(f"Number of samples: {len(df):,d} (before clean-up: {len(df_orig):,d})")
print(f"Number of trials: {n_trials} (before clean-up: {n_trials_orig})")
print(f"Number of subjects: {n_subjects}")

Number of samples: 665,466 (before clean-up: 754,140)
Number of trials: 4 (before clean-up: 4)
Number of subjects: 5


## Predictor configurations (recipes)

In [6]:
def predictor_short_name(predictor):
    """Drop the first 17 characters of a predictor name and return the rest.

    Seventeen characters is the length of a prefix such as
    ``"RWRF_12_00_00_00_"``, so ``"RWRF_12_00_00_00_T8_orientation_q1"``
    becomes ``"T8_orientation_q1"``.
    """
    prefix_length = len("RWRF_12_00_00_00_")
    return predictor[prefix_length:]

def predictor_sensor_number(predictor):
    """Return characters 5 and 6 (zero-based) of a predictor name.

    For ``"RWRF_12_00_00_00_T8_orientation_q1"`` this is ``"12"``. The value
    is returned as a string; it is not converted to ``int``.
    """
    first, stop = 5, 7
    return predictor[first:stop]

def filter_predictors(all_predictors, patterns):
    """Select the predictors whose name contains at least one pattern.

    ``patterns`` may be a single string or an iterable of strings; matching
    is a plain substring test. The result is a list that keeps the order of
    ``all_predictors`` and lists each predictor at most once, even when
    several patterns match it.
    """
    wanted = (patterns,) if isinstance(patterns, str) else patterns
    return [
        name
        for name in all_predictors
        if any(fragment in name for fragment in wanted)
    ]


# def build_feature_sets(df):
#     readme_xls = utils.download_dataset(DATASET_README)
#     readme = pd.read_excel(readme_xls, sheet_name="Recipe_FINAL")
#    
#     feature_sets = {}
#    
#     recipes = readme.iteritems()
#     next(recipes)   # first column is bogus
#     for recipe_num, recipe in recipes:
#         recipe_desc = recipe[3]
#         recipe_filter_1 = [filter for filter in (recipe[7], recipe[9]) if isinstance(filter, str)]
#         recipe_filter_2 = [filter for filter in recipe[11:] if isinstance(filter, str)]
#         recipe_name = f"Recipe {recipe_num}: {recipe_desc}"
#         feature_sets[recipe_name] = filter_predictors(filter_predictors(df.columns, recipe_filter_1), recipe_filter_2)
#    
#     return feature_sets

#feature_sets = build_feature_sets(df)

# Each feature set is a contiguous, label-based slice of the data frame's columns.
t8_first_column = "RWRF_12_00_00_00_T8_orientation_q1"
t8_last_column = "RWRF_12_00_00_00_T8_acceleration_Z_ver"

feature_sets = {
    # Everything from the first T8 column to the last column of the frame.
    "All": df.loc[:, t8_first_column:].columns,
    # Only the T8 columns (both end points are included in the slice).
    "Single IMU": df.loc[:, t8_first_column:t8_last_column].columns,
}

# Summarise each feature set: number of predictors and of distinct sensor codes.
for feature_set_name, predictors in feature_sets.items():
    sensors = {predictor_sensor_number(predictor) for predictor in predictors}
    print(f"{feature_set_name}\n\tPredictors: {len(predictors)}, Sensors: {len(sensors)}\n")

All
	Predictors: 25, Sensors: 3

Single IMU
	Predictors: 13, Sensors: 1



## Train and evaluate boosted tree models

In [7]:
def evaluate(target_name, feature_names, random_state=0):
    """Cross-validate a boosted-tree regressor for one target, one subject held out per fold.

    The samples are read from the notebook-level data frame ``df``; the
    ``M_Sub`` column defines the groups for leave-one-group-out splitting.

    Parameters
    ----------
    target_name : str
        Column of ``df`` to predict.
    feature_names : list-like of str
        Columns of ``df`` used as predictors.
    random_state : int or None, default=0
        Seed passed to the gradient-boosting model and to the permutation
        importance so that repeated runs give the same numbers. Pass ``None``
        to get the previous, unseeded behaviour.

    Returns
    -------
    r2_score : pandas.Series
        R^2 of the out-of-fold predictions for each held-out subject,
        indexed by the subject's ``M_Sub`` value.
    importance : pandas.Series
        Mean permutation importance per feature, sorted in descending order.
        It is computed with a model fitted on, and scored against, all samples.
    prediction : pandas.Series
        Out-of-fold predictions, indexed like ``df``.
    y : pandas.Series
        The target column the predictions are compared with.
    """
    X, y, groups = df[feature_names], df[target_name], df["M_Sub"]

    model = pipeline.Pipeline([
        ('scaler', preprocessing.StandardScaler()),
        ('gboost', ensemble.HistGradientBoostingRegressor(random_state=random_state)),
    ])

    logo = model_selection.LeaveOneGroupOut()

    prediction = model_selection.cross_val_predict(
        model, X, y, cv=logo, groups=groups, n_jobs=-1)

    # Score every fold separately. Each test fold holds exactly one subject,
    # so its first sample identifies the subject. The id is read from `groups`
    # rather than from a full data-frame row, which may upcast it to the
    # row's common dtype.
    r2_by_subject = {}
    for _, idx_test in logo.split(X, groups=groups):
        subject = groups.iloc[idx_test[0]]
        r2_by_subject[subject] = metrics.r2_score(y.iloc[idx_test], prediction[idx_test])

    r2_score = pd.Series(r2_by_subject)
    prediction = pd.Series(prediction, index=y.index)

    # Feature importances on the full training set
    model.fit(X, y)
    perm_imp = inspection.permutation_importance(
        model, X, y, n_repeats=5, n_jobs=10, random_state=random_state)
    importance = pd.Series(perm_imp.importances_mean, index=X.columns)
    importance = importance.sort_values(ascending=False)

    return r2_score, importance, prediction, y

## Run experiments, save data

In [8]:
os.makedirs(RESULTS_DIR, exist_ok=True)


# Targets to predict.
target_name_x = "TO_Pelvis_Moment_X_Nm"
target_name_y = "TO_Pelvis_Moment_Y_Nm"
target_name_z = "TO_Pelvis_Moment_Z_Nm"


# Mean R^2 over the held-out subjects, keyed by feature-set name.
r2_mean_scores_x = {}
r2_mean_scores_y = {}
r2_mean_scores_z = {}


for feature_set_name, feature_names in feature_sets.items():
    r2_score_x, importance_x, prediction_x, target_x = evaluate(target_name_x, feature_names)
    r2_mean_scores_x[feature_set_name] = r2_score_x.mean()
    r2_score_y, importance_y, prediction_y, target_y = evaluate(target_name_y, feature_names)
    r2_mean_scores_y[feature_set_name] = r2_score_y.mean()
    r2_score_z, importance_z, prediction_z, target_z = evaluate(target_name_z, feature_names)
    r2_mean_scores_z[feature_set_name] = r2_score_z.mean()

    # Every score line ends with two spaces before the newline (a Markdown
    # hard line break); without them the three scores render on one line.
    display(
        Markdown(
            "---\n"
            f"**Features**: {feature_set_name}  \n"
            f"**$R^2$ ({target_name_x}) = {r2_mean_scores_x[feature_set_name]:.3f}**  \n"
            f"**$R^2$ ({target_name_y}) = {r2_mean_scores_y[feature_set_name]:.3f}**  \n"
            f"**$R^2$ ({target_name_z}) = {r2_mean_scores_z[feature_set_name]:.3f}**\n"
        )
    )

    # Summary workbook: rewritten on every iteration with the mean scores
    # collected so far (one row per feature set).
    with pd.ExcelWriter(f"{RESULTS_DIR}/R2_scores.xlsx") as writer:
        df_results = pd.DataFrame({f"R2 - {target_name_x}": r2_mean_scores_x, 
                                   f"R2 - {target_name_y}": r2_mean_scores_y,
                                   f"R2 - {target_name_z}": r2_mean_scores_z,})
        df_results.to_excel(writer, sheet_name="R2 Scores")


    # Per-feature-set workbook: file name is the feature-set name up to the
    # first ":" with spaces replaced by underscores.
    short_name = feature_set_name.split(":")[0].replace(" ", "_")
    with pd.ExcelWriter(f"{RESULTS_DIR}/{short_name}_results.xlsx") as writer:

        # Sheet 1: R^2 for each held-out subject.
        df_results = pd.DataFrame({f"R2 - {target_name_x}": r2_score_x,
                                   f"R2 - {target_name_y}": r2_score_y,
                                   f"R2 - {target_name_z}": r2_score_z})
        df_results.to_excel(writer, index_label="Test Subject", sheet_name="R2 Scores")


        # Sheet 2: permutation importance of every feature, one column per target.
        df_results = pd.DataFrame(
            {
                #"Short name": map(predictor_short_name, importance_x.index),
                f"Importance - {target_name_x}": importance_x,
                f"Importance - {target_name_y}": importance_y,
                f"Importance - {target_name_z}": importance_z,
            }
        )
        df_results.to_excel(writer, sheet_name="Importance")
        
    # Out-of-fold predictions next to the true target values, one row per sample.
    df_results = pd.DataFrame(
        {
            f"Predictions - {target_name_x}": prediction_x,
            f"Target - {target_name_x}": target_x,
            f"Predictions - {target_name_y}": prediction_y,
            f"Target - {target_name_y}": target_y,
            f"Predictions - {target_name_z}": prediction_z,
            f"Target - {target_name_z}": target_z
        }
    )
    df_results.to_csv(f"{RESULTS_DIR}/{short_name}_predictions.csv")



---
**Features**: All  
**$R^2$ (TO_Pelvis_Moment_X_Nm) = 0.714**
**$R^2$ (TO_Pelvis_Moment_Y_Nm) = 0.701**
**$R^2$ (TO_Pelvis_Moment_Z_Nm) = -0.502**


---
**Features**: Single IMU  
**$R^2$ (TO_Pelvis_Moment_X_Nm) = 0.752**
**$R^2$ (TO_Pelvis_Moment_Y_Nm) = 0.475**
**$R^2$ (TO_Pelvis_Moment_Z_Nm) = -0.243**
