# Initial Experiments with the CPAC10S_S02_S03 dataset

## Libraries

In [1]:
import re
import sklearn

# Compare the (major, minor) version numerically. A plain string comparison is
# lexicographic, so e.g. "0.9" >= "0.21" would wrongly pass the check.
# NOTE(review): inspection.permutation_importance is called later in this notebook and
# was, I believe, only added in scikit-learn 0.22 -- consider raising this bound (TODO confirm).
_sklearn_version = re.match(r"(\d+)\.(\d+)", sklearn.__version__)
assert _sklearn_version is not None and tuple(map(int, _sklearn_version.groups())) >= (0, 21), \
    "Use the conda_python3_latest kernel!"

# Standard library
import os
import re
import warnings

# Third party
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn import (dummy, linear_model, ensemble, 
                     metrics, preprocessing, pipeline, inspection, 
                     model_selection)

# Local
import utils

# matplotlib hackery
%matplotlib inline

## Load Dataset

In [2]:
# S3 location of the raw dataset CSV.
DATASET_URI = "s3://cpac/ORIG/CPAC_S02_S03_05_26_20/CPAC10S_S02_S03_05_26_20.csv"

# Load the unfiltered dataset; the clean-up cell derives `df` from it.
df_orig = utils.load_dataset(DATASET_URI)

# Last expression of the cell, so the summary statistics are rendered as the output.
df_orig.describe()

Unnamed: 0,M_Trial_Num,M_Mass,M_Mass_to_L5S1,M_sub_task_indices,M_sub_task_num,M_include_overall,M_Index,M_Sub,M_sub_task_num_overall,M_Index_overall,...,RWEO_03_04_00_00_INSOLE_LY_AP_threshF50_mm,RWEO_01_00_00_00_INSOLE_RFORCE_threshF50_N,RWEO_01_02_00_00_INSOLE_RX_ML_threshF50_mm,RWEO_01_02_00_00_INSOLE_RY_AP_threshF50_mm,RWEF_03_00_00_00_INSOLE_LFORCE_threshF50_BW,RWEF_03_04_00_00_INSOLE_LX_ML_threshF50_BH,RWEF_03_04_00_00_INSOLE_LY_AP_threshF50_BH,RWEF_01_00_00_00_INSOLE_RFORCE_threshF50_BW,RWEF_01_02_00_00_INSOLE_RX_ML_threshF50_BH,RWEF_01_02_00_00_INSOLE_RY_AP_threshF50_BH
count,363815.0,363815.0,288738.0,363815.0,363815.0,363815.0,363815.0,363815.0,363815.0,363815.0,...,330242.0,363794.0,326972.0,326972.0,363794.0,330242.0,330242.0,363794.0,326972.0,326972.0
mean,68.079216,10.486291,0.307655,265.108517,5.624807,0.846394,2713.826898,2.473021,223.563355,50517.296391,...,117.304437,445.347758,49.239987,134.553623,0.537672,0.030179,0.065063,0.552295,0.027297,0.074479
std,22.686262,5.744458,0.16256,197.355745,4.801182,0.360571,2506.57876,0.499272,116.207719,62489.322519,...,46.379116,301.023338,7.994325,46.350097,0.333517,0.004993,0.025914,0.359465,0.004361,0.025628
min,1.0,0.0,0.083662,1.0,1.0,0.0,1.0,2.0,1.0,-0.195016,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,51.0,5.0,0.155201,116.0,1.0,1.0,583.0,2.0,127.0,0.0,...,81.41,181.055,45.62,103.77,0.244256,0.028717,0.044869,0.233063,0.025235,0.05717
50%,77.0,10.0,0.269868,232.0,4.0,1.0,1909.0,2.0,233.0,9816.0,...,112.07,449.021,51.03,133.12,0.57121,0.031225,0.06152,0.574611,0.02821,0.0731
75%,85.0,15.0,0.453899,370.0,9.0,1.0,4304.0,3.0,319.0,100769.5,...,153.22,649.83725,54.65,172.78,0.795275,0.033164,0.084676,0.8008,0.030183,0.095087
max,96.0,23.0,0.701759,1589.0,18.0,1.0,11317.0,3.0,429.0,191723.0,...,242.32,1558.065,71.26,247.56,1.795448,0.039795,0.140069,1.796131,0.041191,0.143098


## Associate column names

In [3]:
def _get_columns_with_prefix(df, prefix):
    columns = []
    for column in df.columns:
        if column.startswith(prefix):
            columns.append(column)
    return columns
    
def get_target_names(df):
    """List the columns of ``df`` whose names start with ``"T_"``."""
    target_prefix = "T_"
    return _get_columns_with_prefix(df, target_prefix)

def get_meta_names(df):
    """List the columns of ``df`` whose names start with ``"M_"``."""
    meta_prefix = "M_"
    return _get_columns_with_prefix(df, meta_prefix)

## Clean-up dataset

- Remove samples whose `M_include_overall` flag is not set (keep only rows where `M_include_overall > 0`)

In [4]:
# Keep only the rows flagged for inclusion (M_include_overall > 0).
df = df_orig.loc[df_orig["M_include_overall"] > 0]

# Report how many samples and distinct trial names survive the filter.
n_trials_kept = len(df["M_Trial_Name"].unique())
n_trials_before = len(df_orig["M_Trial_Name"].unique())
print(f"Number of samples: {df.shape[0]} (before clean-up: {df_orig.shape[0]})")
print(f"Number of trials: {n_trials_kept} (before clean-up: {n_trials_before})")

Number of samples: 307931 (before clean-up: 363815)
Number of trials: 127 (before clean-up: 129)


## Predictor configurations (recipes)

In [5]:
def predictor_short_name(predictor):
    """Return ``predictor`` with its first 17 characters removed.

    The slice drops a fixed-width leading segment of the column name
    (e.g. ``"RWEF_03_00_00_00_"``), leaving a shorter label for plots.
    """
    prefix_width = 17
    return predictor[prefix_width:]

def predictor_sensor_number(predictor):
    """Parse the two characters at zero-based positions 5 and 6 as an integer.

    For a name such as ``"RWEF_03_00_00_00_..."`` this yields ``3``.
    Raises ``ValueError`` when that slice is not a valid integer literal.
    """
    sensor_field = predictor[5:7]
    return int(sensor_field)

def filter_predictors(all_predictors, patterns):
    """Select the predictors whose name contains at least one of ``patterns``.

    ``patterns`` may be a single string or an iterable of strings; matching is
    a plain substring test. The input order of ``all_predictors`` is preserved
    and each predictor appears at most once per occurrence in the input.
    """
    # A bare string is treated as a single pattern, not as an iterable of characters.
    if isinstance(patterns, str):
        patterns = (patterns,)

    return [
        name
        for name in all_predictors
        if any(fragment in name for fragment in patterns)
    ]


# Sensor-ID substrings shared by several recipes, defined once so that the
# recipes cannot drift apart through copy-paste edits.
# NOTE(review): the group names are inferred from the recipe titles that use
# each tuple -- confirm against the dataset's sensor numbering scheme.
_INSOLE_PATTERNS = ("03_00_00_00", "01_00_00_00", "03_04_00_00", "01_02_00_00", "01_03_00_00")
_TRUNK_PATTERNS = ("12_00_00_00",)
_INSOLE_TRUNK_PATTERNS = ("01_03_12_00",)
_HIP_PATTERNS = ("05_06_00_00", "05_09_00_00", "06_00_00_00", "09_00_00_00",
                 "05_12_00_00", "05_06_01_03", "05_09_01_03")

# Every real-wearable predictor (regular + exploratory). Recipes 5-8 are
# subsets of this list, so it is computed once and filtered further below.
_real_wearable_predictors = filter_predictors(df.columns, ("RWRF", "RWEF"))

# Maps a human-readable recipe name to the list of predictor columns it uses.
feature_sets = {
    "Recipe 1: Simulated Wearable - All Regular":
        filter_predictors(df.columns, "SWRF"),

    "Recipe 2: Simulated Wearable - All Regular & Exploratory":
        filter_predictors(df.columns, ("SWRF", "SWEF")),

    "Recipe 3: Real Wearable - All Regular":
        filter_predictors(df.columns, "RWRF"),

    "Recipe 4: Real Wearable - All Regular & Exploratory":
        list(_real_wearable_predictors),

    "Recipe 5: Real Wearable - Insole Only":
        filter_predictors(_real_wearable_predictors, _INSOLE_PATTERNS),

    # Label typo fixed ("Orientatoin" -> "Orientation"); it is shown in plot
    # titles and written to the results spreadsheet.
    "Recipe 6: Real Wearable - Trunk Orientation Only":
        filter_predictors(_real_wearable_predictors, _TRUNK_PATTERNS),

    "Recipe 7: Real Wearable - Insole & Trunk Orientation":
        filter_predictors(
            _real_wearable_predictors,
            _INSOLE_PATTERNS + _TRUNK_PATTERNS + _INSOLE_TRUNK_PATTERNS
        ),

    "Recipe 8: Real Wearable - Insole & Hip Angles & Trunk Orientation":
        filter_predictors(
            _real_wearable_predictors,
            _INSOLE_PATTERNS + _TRUNK_PATTERNS + _INSOLE_TRUNK_PATTERNS + _HIP_PATTERNS
        ),
}

# Summarise every recipe: number of predictor columns and number of distinct
# sensor numbers (as parsed from each predictor name).
for feature_set_name, predictors in feature_sets.items():
    sensors = {predictor_sensor_number(name) for name in predictors}
    print(f"{feature_set_name}\n\tPredictors: {len(predictors)}, Sensors: {len(sensors)}\n")

Recipe 1: Simulated Wearable - All Regular
	Predictors: 75, Sensors: 10

Recipe 2: Simulated Wearable - All Regular & Exploratory
	Predictors: 104, Sensors: 10

Recipe 3: Real Wearable - All Regular
	Predictors: 166, Sensors: 10

Recipe 4: Real Wearable - All Regular & Exploratory
	Predictors: 201, Sensors: 10

Recipe 5: Real Wearable - Insole Only
	Predictors: 17, Sensors: 2

Recipe 6: Real Wearable - Trunk Orientatoin Only
	Predictors: 56, Sensors: 1

Recipe 7: Real Wearable - Insole & Trunk Orientation
	Predictors: 77, Sensors: 3

Recipe 8: Real Wearable - Insole & Hip Angles & Trunk Orientation
	Predictors: 133, Sensors: 6



## Train/Test configurations


In [None]:
# Column that every model in this notebook is trained to predict.
target_name = "TF_Pelvis_Moment_X_BWBH"

# Each entry is a boolean mask over `df` marking the TEST rows; the remaining
# rows form the corresponding training set.
test_selectors = {
    "Subject 2 -> Subject 3": df["M_Sub"] == 3,
    "Subject 3 -> Subject 2": df["M_Sub"] == 2,
    "Subject 2 & 3 -> 5kg Test": df["M_Trial_Name"].str.match("S0[0-9]_5kg")
}

# Maps a split name to its (train frame, test frame) pair.
train_test_sets = {}
for test_selector_name, test_selector in test_selectors.items():
    # Split and drop incomplete rows once per selector: this does not depend on
    # target_limit, so repeating it inside the inner loop was wasted work.
    # NOTE(review): dropna() without `subset` discards a row if ANY column is
    # missing, including columns no recipe uses -- confirm this is intended.
    df_train_complete = df[~test_selector].dropna()
    df_test_complete = df[test_selector].dropna()

    for target_limit in -np.inf, -0.20:
        df_train, df_test = df_train_complete, df_test_complete
        train_test_set_name = test_selector_name
        if np.isfinite(target_limit):
            # Variant of the split restricted to samples whose target exceeds the limit.
            train_test_set_name += f", target > {target_limit}"
            df_train = df_train[df_train[target_name] > target_limit]
            df_test = df_test[df_test[target_name] > target_limit]
        train_test_sets[train_test_set_name] = (df_train, df_test)
        # Sizes are reported as absolute counts and as a share of the cleaned frame `df`.
        print(f"{train_test_set_name}: {len(df_train)}/{len(df_test)} "
              f"({len(df_train) / len(df):.1%}/{len(df_test) / len(df):.1%})")

Subject 2 -> Subject 3: 106652/67581 (34.6%/21.9%)
Subject 2 -> Subject 3, target > -0.2: 106366/67581 (34.5%/21.9%)
Subject 3 -> Subject 2: 67581/106652 (21.9%/34.6%)
Subject 3 -> Subject 2, target > -0.2: 67581/106366 (21.9%/34.5%)
Subject 2 & 3 -> 5kg Test: 159901/14332 (51.9%/4.7%)


## Train and evaluate boosted tree models

In [None]:
def train_model(X_train, y_train, random_state=0):
    """Fit a standardise-then-boost regression pipeline on the training data.

    Parameters
    ----------
    X_train : feature table passed to ``Pipeline.fit``.
    y_train : target values passed to ``Pipeline.fit``.
    random_state : seed forwarded to ``HistGradientBoostingRegressor`` so that
        repeated runs give the same model. Pass ``None`` to get the previous
        unseeded behaviour.

    Returns
    -------
    The fitted ``Pipeline`` with the steps ``'scaler'`` and ``'gboost'``.
    """
    # NOTE(review): in recent scikit-learn releases the regressor enables early
    # stopping on large training sets, which holds out a random validation split;
    # without a seed the fitted model then differs between runs -- TODO confirm
    # for the installed version.
    steps = [
        ('scaler', preprocessing.StandardScaler()),
        ('gboost', ensemble.HistGradientBoostingRegressor(random_state=random_state)),
    ]
    model = pipeline.Pipeline(steps)
    model.fit(X_train, y_train)
    return model
    
def evaluate_model(title, model, X_test, y_test, show_plot=False, show_importance=False,
                   random_state=0):
    """Score ``model`` on a held-out set and optionally plot diagnostics.

    Parameters
    ----------
    title : text used as the title of the generated figures.
    model : fitted estimator exposing ``predict``.
    X_test : test features; must expose ``.columns`` when ``show_importance`` is set.
    y_test : true target values for ``X_test``.
    show_plot : when true, draw an actual-vs-predicted joint plot annotated with
        RMSE and R^2.
    show_importance : when true, compute permutation importances on the test set
        and draw a box plot of the six features with the highest mean importance.
    random_state : seed for the permutation shuffles, so the importance plot is
        reproducible between runs.

    Returns
    -------
    The R^2 score of the predictions against ``y_test``.
    """
    y_test_pred = model.predict(X_test)

    # Quantitative results. scikit-learn metrics take (y_true, y_pred); R^2 is
    # not symmetric, so swapping the arguments reports a different (wrong) value.
    rmse = np.sqrt(metrics.mean_squared_error(y_test, y_test_pred))
    r2 = metrics.r2_score(y_test, y_test_pred)

    if show_plot:
        # x/y are passed by keyword: newer seaborn releases reject positional data vectors.
        grid = sns.jointplot(x=y_test, y=y_test_pred, alpha=0.1, s=1.0,
                             color="steelblue", height=8)
        grid.set_axis_labels("Actual Pelvis Moment", 'Predicted Pelvis Moment', fontsize=14)
        grid.ax_joint.grid()
        grid.ax_marg_x.set_title(title, fontsize=14)

        summary = '\n'.join((
            r"$\mathrm{RMSE}=%.2f$" % (rmse,),
            r"$\mathrm{R}^2=%.3f$" % (r2,)
        ))

        # Place the metric summary in the top-left corner of the joint axes.
        props = dict(boxstyle='round', facecolor='wheat', alpha=0.5)
        grid.ax_joint.text(0.05, 0.95, summary, transform=grid.ax_joint.transAxes,
                           fontsize=14, verticalalignment='top', bbox=props)
        plt.show()

    if show_importance:
        feature_importances = inspection.permutation_importance(
            model, X_test, y_test,
            n_repeats=10, n_jobs=-1, random_state=random_state)
        # Indices of the six features with the largest mean importance, best first.
        top_idxs = feature_importances.importances_mean.argsort()[::-1][:6]
        _, ax = plt.subplots()
        ax.boxplot(feature_importances.importances[top_idxs].T,
                   vert=False,
                   labels=[predictor_short_name(X_test.columns[idx]) for idx in top_idxs])
        ax.set_title("Feature Importances " + title)
        plt.show()

    return r2

# R^2 per split and recipe: results[train_test_set_name][feature_set_name] -> r2.
results = {}

# Silence warnings only for the duration of the sweep. The context manager
# restores the previous warning filters on exit -- also when a fit or plot
# raises -- instead of leaving them on 'ignore' or forcing them to 'default'.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    for train_test_set_name, (df_train, df_test) in train_test_sets.items():
        for feature_set_name, feature_names in feature_sets.items():
            title = f'{target_name} from "{feature_set_name}" on "{train_test_set_name}"'
            model = train_model(df_train[feature_names], df_train[target_name])
            r2 = evaluate_model(title, model, df_test[feature_names], df_test[target_name],
                                show_plot=True, show_importance=True)
            print(f"{title}\n\tR2={r2:.3f}")
            results.setdefault(train_test_set_name, {})[feature_set_name] = r2


In [None]:
# Write the R^2 table (one column per train/test split, one row per recipe)
# to a single-sheet Excel workbook in the working directory.
with pd.ExcelWriter('CPAC10S_S02_S03 - Results.xlsx') as excel_writer:
    results_table = pd.DataFrame(results)
    results_table.to_excel(excel_writer, sheet_name='Recipes')