# Evaluating multiple feature recipes on the Simulation_Rerun_12_01_21 dataset

- **Model**: Gradient Boosted Trees (histogram-based)
- **Target(s)**: _coming from the recipes_
- **Features**: various (71 alternative _recipes_)
- **Results**: 
  - $r^2$ scores (by cross-validation)
  - feature importances (permutation-based; the model is fitted on the full dataset and the permutations are scored on that same data)
  - predictions (merged, by cross-validation)
- **Evaluation strategy**: cross-validation (leave one subject out)

## Libraries

In [1]:
# Standard library
import warnings
import os
from pathlib import Path


# Third party
import numpy as np
import pandas as pd
import sklearn
assert sklearn.__version__ >= "0.21", "Use the conda_python3_latest kernel!"
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn import (ensemble, metrics, preprocessing, 
                     pipeline, inspection, model_selection)

from IPython.display import display, Markdown

## Load Dataset

In [2]:
# Location of the dataset folder and of the two files read from it.
DATASET = Path("../datasets/Simulation_Rerun_12_01_21")
DATASET_CSV = DATASET / "Simulation_table.csv"
DATASET_README = DATASET / "Recipes and Thresholds.xlsx"
# Exactly one VARIANT_NAME must be active. It names the results sub-folder
# below and is also used later in the notebook as the sheet name when the
# recipes workbook (DATASET_README) is read.
#VARIANT_NAME = "Test Moment Threshold"
#VARIANT_NAME = "Damage"
VARIANT_NAME = "Compression"

# Output folder: results/<dataset folder name>/<variant name>
RESULTS_DIR =  Path("results") / DATASET.name / VARIANT_NAME

print(f"Reading {DATASET_CSV}...")
# Unfiltered table; later cells derive the working frame `df` from it.
df_orig = pd.read_csv(DATASET_CSV)
# Last expression of the cell: summary statistics shown as the cell output.
df_orig.describe()

Reading ../datasets/Simulation_Rerun_12_01_21/Simulation_table.csv...


Unnamed: 0,M_Trial_Num,M_Mass,M_Mass_to_L5S1,M_sub_task_indices,M_sub_task_num,M_include_overall,M_Index,M_Sub,M_sub_task_num_overall,M_Index_overall,...,RWEO_01_02_00_00_INSOLE_RX_ML_threshF50_mm,RWEO_01_02_00_00_INSOLE_RY_AP_threshF50_mm,RWEF_03_00_00_00_INSOLE_LFORCE_threshF50_BW,RWEF_03_04_00_00_INSOLE_LX_ML_threshF50_BH,RWEF_03_04_00_00_INSOLE_LY_AP_threshF50_BH,RWEF_01_00_00_00_INSOLE_RFORCE_threshF50_BW,RWEF_01_02_00_00_INSOLE_RX_ML_threshF50_BH,RWEF_01_02_00_00_INSOLE_RY_AP_threshF50_BH,TO_Compression,TO_Damage
count,1444342.0,1444342.0,0.0,0.0,0.0,1444342.0,1444342.0,1444342.0,1444342.0,1444342.0,...,1207234.0,1207234.0,1442756.0,1193679.0,1193679.0,1442756.0,1207234.0,1207234.0,1424272.0,1424272.0
mean,78.9892,9.926241,,,,0.8743102,3502.123,5.401267,0.0,112182.3,...,48.81444,132.3979,0.4309802,0.0278902,0.06610251,0.4421784,0.0268693,0.07280565,1263.874,1.766263
std,16.60174,5.168718,,,,0.3314995,2566.335,2.93113,0.0,61951.9,...,9.433342,54.18441,0.3308654,0.005220985,0.03077229,0.3416963,0.005268231,0.03001277,910.217,968.0966
min,35.0,5.0,,,,0.0,1.0,1.0,0.0,0.0,...,11.49,24.32,0.0,0.00632687,0.01569274,0.0,0.006227642,0.01318157,281.0005,1.526047e-06
25%,78.0,5.0,,,,1.0,1423.0,3.0,0.0,77175.25,...,44.74,86.73,0.1332044,0.02578125,0.03892602,0.1437663,0.02426415,0.04760981,478.1508,2.540399e-06
50%,84.0,10.0,,,,1.0,2973.0,5.0,0.0,119690.0,...,50.85,131.47,0.4021949,0.02881461,0.06135938,0.4001025,0.02809375,0.07169231,979.5552,9.285635e-06
75%,90.0,15.0,,,,1.0,5127.0,8.0,0.0,159811.0,...,54.84,179.03,0.6882456,0.03123035,0.09174005,0.6990125,0.03025281,0.09754377,1878.973,9.496359e-05
max,96.0,23.0,,,,1.0,14117.0,10.0,0.0,236897.0,...,78.81,267.78,1.696594,0.04425281,0.1506798,1.675757,0.04326744,0.1504382,10766.18,901419.1


## Associate column names

In [3]:
def _get_columns_with_prefix(df, prefix):
    """Return the names in ``df.columns`` that start with ``prefix``.

    The result is a new list that keeps the original column order; it is
    empty when no column matches.
    """
    return [name for name in df.columns if name.startswith(prefix)]
    
def get_target_names(df):
    """Return the columns of ``df`` whose name starts with ``"T_"``.

    Note: the match is on the literal prefix ``"T_"``, so a column named
    like ``TO_Compression`` is not returned by this helper.
    """
    target_prefix = "T_"
    return _get_columns_with_prefix(df, target_prefix)

def get_meta_names(df):
    """Return the columns of ``df`` whose name starts with ``"M_"``."""
    meta_prefix = "M_"
    return _get_columns_with_prefix(df, meta_prefix)

## Clean-up dataset

- Keep only the samples whose `M_include_overall` value is greater than 0; all other samples are removed

In [4]:
# Working frame: only rows whose M_include_overall value is greater than 0.
# df_orig itself is left untouched so the "before clean-up" counts below
# can still be reported.
df = df_orig[df_orig["M_include_overall"] > 0]

# Weed out wonky subjects
# NOTE(review): RESULTS_DIR is a pathlib.Path, so the `+=` with a str below
# would raise a TypeError if these lines are re-enabled as written.
#df = df[df["M_Sub"].isin([2,4,5,6,7,8,9])]
#RESULTS_DIR += "_nowonky"

# Report how much data survived the filter (samples, distinct trial names,
# distinct subject ids).
print(f"Number of samples: {df.shape[0]:,d} (before clean-up: {df_orig.shape[0]:,d})")
print(f"Number of trials: {len(df['M_Trial_Name'].unique())} (before clean-up: {len(df_orig['M_Trial_Name'].unique())})")
print(f"Number of subjects: {len(df['M_Sub'].unique())}")

Number of samples: 1,262,803 (before clean-up: 1,444,342)
Number of trials: 57 (before clean-up: 57)
Number of subjects: 10


## Predictor configurations (recipes)

In [5]:
def predictor_short_name(predictor):
    """Return ``predictor`` without its first 17 characters.

    For a column name such as ``RWEO_01_02_00_00_INSOLE_RX_ML_threshF50_mm``
    this drops the leading ``RWEO_01_02_00_00_`` code and keeps the rest.
    """
    prefix_width = 17
    return predictor[prefix_width:]

def predictor_sensor_number(predictor):
    """Return the two characters at positions 5 and 6 of ``predictor``.

    The value is returned as a string (e.g. ``"01"``), not converted to an
    integer. Presumably this field identifies the sensor the predictor
    comes from -- confirm against the dataset's naming scheme.
    """
    sensor_field = slice(5, 7)
    return predictor[sensor_field]

def filter_predictors(all_predictors, patterns):
    """Keep the predictors that contain at least one of ``patterns``.

    Parameters
    ----------
    all_predictors : iterable of str
        Candidate predictor (column) names.
    patterns : str or collection of str
        Substring(s) to look for. A single string is treated as a
        one-element collection.

    Returns
    -------
    list of str
        The matching predictors, in their original order. Each predictor
        appears at most once per occurrence in ``all_predictors``, even if
        several patterns match it. An empty ``patterns`` yields an empty list.
    """
    needles = (patterns,) if isinstance(patterns, str) else patterns
    return [
        candidate
        for candidate in all_predictors
        if any(needle in candidate for needle in needles)
    ]


def build_feature_sets(df, readme):
    """Build the predictor list of every recipe described in ``readme``.

    Each column of ``readme`` except the first one is read as one recipe.
    Within a recipe column, cells are addressed by row position:

    - position 3: recipe description (used in the recipe name)
    - positions 6 and 7: first group of substring filters
    - positions 10 to 15: second group of substring filters

    Only string cells are used as filters; non-string cells (such as empty
    cells) are skipped. A column of ``df`` is selected for a recipe when it
    contains at least one filter of the first group AND at least one filter
    of the second group. A recipe whose filter group is empty therefore
    selects no predictors.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset whose column names are the candidate predictors.
    readme : pandas.DataFrame
        Recipe sheet, one recipe per column.

    Returns
    -------
    dict
        Maps ``"Recipe <column label>: <description>"`` to the list of
        selected predictor names, in recipe (column) order.
    """
    feature_sets = {}

    # DataFrame.items() replaces iteritems(), which was removed in pandas 2.0.
    recipes = readme.items()
    # The first column is bogus; the default avoids a StopIteration when the
    # sheet has no columns at all.
    next(recipes, None)
    for recipe_num, recipe in recipes:
        # .iloc makes the positional row lookup explicit, independent of the
        # sheet's index labels.
        recipe_desc = recipe.iloc[3]
        recipe_filter_1 = [
            cell for cell in (recipe.iloc[6], recipe.iloc[7]) if isinstance(cell, str)
        ]
        recipe_filter_2 = [
            cell for cell in recipe.iloc[10:16] if isinstance(cell, str)
        ]
        recipe_name = f"Recipe {recipe_num}: {recipe_desc}"
        # Two-stage filter: both filter groups must match a column name.
        first_pass = filter_predictors(df.columns, recipe_filter_1)
        feature_sets[recipe_name] = filter_predictors(first_pass, recipe_filter_2)

    return feature_sets

# Recipe workbook: the sheet to read is selected by the active VARIANT_NAME.
readme = pd.read_excel(DATASET_README, sheet_name=VARIANT_NAME)
# Target column name and target threshold are taken from fixed cell
# positions of the sheet (row 4 / row 17 of the second column, 0-based).
target_name = readme.iloc[4, 1]
target_threshold = readme.iloc[17, 1]

# Threshold filtering for everything
# Rows whose target is NaN also fail this comparison and are dropped.
df = df[df[target_name] > target_threshold]

feature_sets = build_feature_sets(df, readme)

# Summary per recipe: number of predictors and number of distinct values
# returned by predictor_sensor_number() among them.
for feature_set_name, predictors in feature_sets.items():
    sensors = set(map(predictor_sensor_number, predictors))
    print(f"{feature_set_name}\n\tPredictors: {len(predictors)}, Sensors: {len(sensors)}\n")

Recipe 1: insole,trunk
	Predictors: 15, Sensors: 3

Recipe 2: trunk
	Predictors: 3, Sensors: 1



In [6]:
def evaluate(target_name, feature_names, random_state=0):
    """Evaluate one feature set with leave-one-subject-out cross-validation.

    Reads the notebook-level globals ``df`` (the filtered dataset) and
    ``target_threshold``.

    Parameters
    ----------
    target_name : str
        Name of the target column in ``df``.
    feature_names : list of str
        Names of the predictor columns in ``df``.
    random_state : int or None, default 0
        Seed passed to the gradient boosting model and to the permutation
        importance computation so that repeated runs give the same results.
        Pass ``None`` for unseeded behavior.

    Returns
    -------
    r2_score : pandas.Series
        R^2 score per held-out subject (index: ``M_Sub`` value).
    importance : pandas.Series
        Mean permutation importance per feature, sorted in descending order.
        Computed with a model fitted on all rows and scored on those same rows.
    prediction : pandas.Series
        Cross-validated predictions, indexed like the target.
    y : pandas.Series
        The target values.
    """
    X, y, groups = df[feature_names], df[target_name], df["M_Sub"]

    model = pipeline.Pipeline([
        ('scaler', preprocessing.StandardScaler()),
        ('gboost', ensemble.HistGradientBoostingRegressor(random_state=random_state)),
    ])

    # One fold per distinct value of M_Sub.
    logo = model_selection.LeaveOneGroupOut()

    prediction = model_selection.cross_val_predict(
        model, X, y, cv=logo, groups=groups, n_jobs=-1)

    r2_by_subject = {}
    for _, idx_test in logo.split(X, y, groups=groups):
        # All rows of a test fold share one subject; read it from the group
        # labels instead of materialising a whole (wide) dataframe row.
        subject = groups.iloc[idx_test[0]]
        test_target = y.iloc[idx_test]
        test_pred = prediction[idx_test]

        # Threshold filtering for testing, only
        threshold_mask = test_target > target_threshold
        test_target = test_target[threshold_mask]
        test_pred = test_pred[threshold_mask]

        r2_by_subject[subject] = metrics.r2_score(test_target, test_pred)

    r2_score = pd.Series(r2_by_subject)
    prediction = pd.Series(prediction, index=y.index)

    # Feature importances on the full training set
    model.fit(X, y)
    perm_imp = inspection.permutation_importance(
        model, X, y, n_repeats=5, n_jobs=10, random_state=random_state)
    importance = pd.Series(perm_imp.importances_mean, index=X.columns)
    importance = importance.sort_values(ascending=False)

    return r2_score, importance, prediction, y

## Run experiments, save data

In [7]:
# Make sure the output folder exists before anything is written to it.
os.makedirs(RESULTS_DIR, exist_ok=True)


# Mean (over test subjects) R^2 score per feature set, filled as the loop runs.
r2_mean_scores = {}

for feature_set_name, feature_names in feature_sets.items():
    r2_score, importance, prediction, target = evaluate(target_name, feature_names)
    # Unweighted mean of the per-subject scores.
    r2_mean_scores[feature_set_name] = r2_score.mean()
    display(
        Markdown(
            "---\n"
            f"**Features**: {feature_set_name}  \n"
            f"**$R^2$ ({target_name}) = {r2_mean_scores[feature_set_name]:.3f}**\n"
        )
    )

    # The summary workbook is rewritten on every iteration with all scores
    # collected so far, so partial results are on disk if the loop is
    # interrupted.
    with pd.ExcelWriter(f"{RESULTS_DIR}/R2_scores.xlsx") as writer:
        df_results = pd.DataFrame({f"R2 - {target_name}": r2_mean_scores,})
        df_results.to_excel(writer, sheet_name="R2 Scores")


    # File-name stem: the part of the feature set name before the first ":",
    # with spaces replaced by underscores (e.g. "Recipe 1: ..." -> "Recipe_1").
    short_name = feature_set_name.split(":")[0].replace(" ", "_")
    # Per-recipe workbook with two sheets: per-subject R^2 and importances.
    with pd.ExcelWriter(f"{RESULTS_DIR}/{short_name}_results.xlsx") as writer:

        df_results = pd.DataFrame({f"R2 - {target_name}": r2_score,})
        df_results.to_excel(writer, index_label="Test Subject", sheet_name="R2 Scores")


        df_results = pd.DataFrame(
            {
                #"Short name": map(predictor_short_name, importance_x.index),
                f"Importance - {target_name}": importance,
            }
        )
        df_results.to_excel(writer, sheet_name="Importance")
        
    # Per-recipe CSV with the cross-validated predictions next to the targets.
    df_results = pd.DataFrame(
        {
            f"Predictions - {target_name}": prediction,
            f"Target - {target_name}": target,
        }
    )
    df_results.to_csv(f"{RESULTS_DIR}/{short_name}_predictions.csv")



---
**Features**: Recipe 1: insole,trunk  
**$R^2$ (TO_Compression) = 0.359**


---
**Features**: Recipe 2: trunk  
**$R^2$ (TO_Compression) = -0.817**
