# Evaluating multiple feature recipes on the CPAC_N10_10_25_20 dataset

- **Model**: Gradient Boosted Trees (histogram-based)
- **Target(s)**: `TF_Pelvis_Moment_X_BWBH`
- **Features**: various (approx. 63 alternative _recipes_)
- **Results**: 
  - $r^2$ scores (by cross-validation)
  - feature importances (permutation-based; the model is fitted on the full dataset and the importances are computed on that same data)
  - predictions (merged, by cross-validation)
- **Evaluation strategy**: cross-validation (leave one subject out)

## Libraries

In [1]:
# Standard library
import warnings
import os


# Third party
import numpy as np
import pandas as pd
import sklearn
assert sklearn.__version__ >= "0.21", "Use the conda_python3_latest kernel!"
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn import (ensemble, metrics, preprocessing, 
                     pipeline, inspection, model_selection)

from IPython.display import display, Markdown


# Local
import utils

## Load Dataset

In [None]:
# Dataset identifier; it is part of both the S3 source paths and the local
# results directory below.
DATASET = "CPAC_N10_10_25_20"
DATASET_CSV = f"s3://cpac/ORIG/{DATASET}/CPAC10S_N10_10_25_20.csv"
DATASET_README = f"s3://cpac/ORIG/{DATASET}/READ_ME.xlsx"
RESULTS_DIR = f"results/{DATASET}"


# Load through DATASET_CSV (it expands to the same URL that used to be repeated
# here as a literal) so the dataset location is defined in exactly one place.
df_orig = utils.load_dataset(DATASET_CSV)
df_orig.describe()

## Associate column names

In [None]:
def _get_columns_with_prefix(df, prefix):
    """Return the column names of `df` that start with `prefix`.

    The names are returned as a list, in the order they appear in `df.columns`.
    """
    return [name for name in df.columns if name.startswith(prefix)]
    
def get_target_names(df):
    """Return the names of the columns of `df` that start with "T_"."""
    target_prefix = "T_"
    return _get_columns_with_prefix(df, target_prefix)

def get_meta_names(df):
    """Return the names of the columns of `df` that start with "M_"."""
    meta_prefix = "M_"
    return _get_columns_with_prefix(df, meta_prefix)

## Clean-up dataset

- Keep only the samples where `M_include_overall` is greater than 0; all other samples are removed

In [None]:
# Keep only the rows flagged for inclusion (M_include_overall > 0).
include_mask = df_orig["M_include_overall"] > 0
df = df_orig[include_mask]

# Optional: weed out wonky subjects (disabled). Membership has to be tested
# element-wise with .isin(); `in` does not work on a pandas Series.
#df = df[df["M_Sub"].isin((2, 4, 5, 6, 7, 8, 9))]
#RESULTS_DIR += "_nowonky"

# Report how much data survived the clean-up.
n_samples, n_samples_orig = df.shape[0], df_orig.shape[0]
print(f"Number of samples: {n_samples:,d} (before clean-up: {n_samples_orig:,d})")

n_trials = len(df["M_Trial_Name"].unique())
n_trials_orig = len(df_orig["M_Trial_Name"].unique())
print(f"Number of trials: {n_trials} (before clean-up: {n_trials_orig})")

n_subjects = len(df["M_Sub"].unique())
print(f"Number of subjects: {n_subjects}")

## Predictor configurations (recipes)

In [None]:
def predictor_short_name(predictor):
    """Return `predictor` with its first 17 characters removed.

    NOTE(review): presumably the first 17 characters are a fixed-width prefix
    of the predictor column names; confirm against the dataset README.
    """
    prefix_length = 17
    return predictor[prefix_length:]

def predictor_sensor_number(predictor):
    """Return the integer encoded by characters 5 and 6 (0-based) of `predictor`.

    Raises ValueError if that two-character slice is not a valid integer.
    NOTE(review): presumably this position holds a two-digit sensor id in the
    predictor column names; confirm against the dataset README.
    """
    sensor_digits = predictor[5:7]
    return int(sensor_digits)

def filter_predictors(all_predictors, patterns):
    """Select the predictors that contain at least one of `patterns`.

    Parameters
    ----------
    all_predictors : iterable of str
        Candidate predictor names.
    patterns : str or iterable of str
        Substring(s) to look for; a single string counts as one pattern.

    Returns
    -------
    list of str
        The matching predictors in their original order, each listed once
        even if it matches several patterns.
    """
    # A lone string is one pattern, not a sequence of single characters.
    wanted = (patterns,) if isinstance(patterns, str) else patterns
    return [
        name
        for name in all_predictors
        if any(fragment in name for fragment in wanted)
    ]


def build_feature_sets(df):
    """Build the feature sets ("recipes") listed in the dataset README.

    Downloads the README workbook from ``DATASET_README`` and reads its
    "Recipe_FINAL" sheet. Every column after the first is treated as one
    recipe, and columns whose label is below 65 are skipped. Within a recipe
    column (0-based positions, excluding the header row):

    - position 3 holds the recipe description,
    - positions 6 and 8 hold the first group of name patterns,
    - positions 10 onward hold the second group of name patterns.

    Only string cells count as patterns. A column of `df` is part of the
    recipe when its name contains at least one pattern of the first group AND
    at least one pattern of the second group.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset whose column names are matched against the recipe patterns.

    Returns
    -------
    dict
        Maps "Recipe <label>: <description>" to the list of selected column
        names, in the order the recipes appear in the sheet.
    """
    readme_xls = utils.download_dataset(DATASET_README)
    readme = pd.read_excel(readme_xls, sheet_name="Recipe_FINAL")

    feature_sets = {}

    # DataFrame.iteritems() was removed in pandas 2.0; items() yields the same
    # (column label, column Series) pairs.
    recipes = readme.items()
    next(recipes)   # first column is bogus
    for recipe_num, recipe in recipes:
        # NOTE(review): assumes the column labels are numeric recipe ids.
        if recipe_num < 65:
            continue
        # .iloc makes the row lookups explicitly positional; with the default
        # integer index produced by read_excel this selects the same cells as
        # plain [] indexing.
        recipe_desc = recipe.iloc[3]
        # Non-string cells (e.g. empty cells) are not patterns.
        recipe_filter_1 = [
            cell for cell in (recipe.iloc[6], recipe.iloc[8]) if isinstance(cell, str)
        ]
        recipe_filter_2 = [cell for cell in recipe.iloc[10:] if isinstance(cell, str)]
        recipe_name = f"Recipe {recipe_num}: {recipe_desc}"
        first_pass = filter_predictors(df.columns, recipe_filter_1)
        feature_sets[recipe_name] = filter_predictors(first_pass, recipe_filter_2)

    return feature_sets

feature_sets = build_feature_sets(df)

# Summarise every recipe: number of predictors and number of distinct sensor ids.
for feature_set_name, predictors in feature_sets.items():
    sensors = {predictor_sensor_number(predictor) for predictor in predictors}
    n_predictors, n_sensors = len(predictors), len(sensors)
    print(f"{feature_set_name}\n\tPredictors: {n_predictors}, Sensors: {n_sensors}\n")

## Train and evaluate boosted tree models

In [None]:
def evaluate(target_name, feature_names):
    """Evaluate a boosted-tree model for one target and one feature set.

    Uses the module-level ``df``. The model is a pipeline of a standard scaler
    followed by a histogram-based gradient boosting regressor, evaluated by
    leave-one-group-out cross-validation with the "M_Sub" column as group.

    Parameters
    ----------
    target_name : str
        Column of ``df`` to predict.
    feature_names : list of str
        Columns of ``df`` used as features.

    Returns
    -------
    r2_score : pandas.Series
        R^2 of the held-out predictions, one entry per cross-validation fold,
        keyed by the "M_Sub" value of the fold's first test row.
    importance : pandas.Series
        Mean permutation importance per feature (5 repeats), sorted in
        descending order, from a model fitted and scored on the full data.
    prediction : pandas.Series
        Cross-validated predictions, indexed like the target.
    y : pandas.Series
        The target values.
    """
    X = df[feature_names]
    y = df[target_name]
    groups = df["M_Sub"]

    steps = [
        ('scaler', preprocessing.StandardScaler()),
        ('gboost', ensemble.HistGradientBoostingRegressor()),
    ]
    model = pipeline.Pipeline(steps)

    logo = model_selection.LeaveOneGroupOut()

    # Out-of-fold predictions: every sample is predicted by a model that was
    # trained without any sample from the same group.
    cv_prediction = model_selection.cross_val_predict(
        model, X, y, cv=logo, groups=groups, n_jobs=-1)

    # One R^2 score per fold, keyed by the held-out subject.
    scores_by_subject = {
        df.iloc[test_rows[0]]["M_Sub"]: metrics.r2_score(
            y.iloc[test_rows], cv_prediction[test_rows])
        for _, test_rows in logo.split(df, groups=groups)
    }

    r2_score = pd.Series(scores_by_subject)
    prediction = pd.Series(cv_prediction, index=y.index)

    # Feature importances on the full training set
    model.fit(X, y)
    perm_imp = inspection.permutation_importance(model, X, y, n_repeats=5, n_jobs=-1)
    importance = pd.Series(
        perm_imp.importances_mean, index=X.columns
    ).sort_values(ascending=False)

    return r2_score, importance, prediction, y

## Run experiments, save data

In [None]:
os.makedirs(RESULTS_DIR, exist_ok=True)


target_name = "TF_Pelvis_Moment_X_BWBH"

# Mean cross-validated R^2 per recipe, filled in as the recipes are evaluated.
r2_mean_scores = {}

for feature_set_name, feature_names in feature_sets.items():
    r2_score, importance, prediction, target = evaluate(target_name, feature_names)
    mean_r2 = r2_score.mean()
    r2_mean_scores[feature_set_name] = mean_r2

    summary_md = (
        "---\n"
        f"**Target**: {target_name}  \n"
        f"**Features**: {feature_set_name}  \n"
        f"**$R^2$ = {mean_r2:.3f}**"
    )
    display(Markdown(summary_md))

    # The overall summary workbook is rewritten on every iteration with all
    # mean scores collected so far.
    with pd.ExcelWriter(f"{RESULTS_DIR}/R2_scores.xlsx") as writer:
        mean_scores_frame = pd.DataFrame({f"R2 - {target_name}": r2_mean_scores})
        mean_scores_frame.to_excel(writer, sheet_name="R2 Scores")

    # "Recipe 65: description" -> "Recipe_65", used as the per-recipe file prefix.
    short_name = feature_set_name.partition(":")[0].replace(" ", "_")

    # Per-recipe workbook: one sheet with the per-subject scores, one with
    # the feature importances.
    with pd.ExcelWriter(f"{RESULTS_DIR}/{short_name}_results.xlsx") as writer:
        subject_scores_frame = pd.DataFrame({f"R2 - {target_name}": r2_score})
        subject_scores_frame.to_excel(
            writer, index_label="Test Subject", sheet_name="R2 Scores")

        importance_frame = pd.DataFrame(
            {
                "Short name": [predictor_short_name(name) for name in importance.index],
                f"Importance - {target_name}": importance,
            }
        )
        importance_frame.to_excel(writer, sheet_name="Importance")

    # Cross-validated predictions next to the true target values.
    df_results = pd.DataFrame(
        {
            f"Predictions - {target_name}": prediction,
            f"Target - {target_name}": target,
        }
    )
    df_results.to_csv(f"{RESULTS_DIR}/{short_name}_predictions.csv")

