# Linear Regression on CPAC_N10_11_10_20 dataset

- **Model**: Linear Regression
- **Target(s)**: `TF_Pelvis_Moment_X_BWBH`
- **Features**: sagittal trunk angle (`SWRF_05_12_00_00_TRUNK_ANGLE_VL_X_sag`)
- **Results**: slope and intercept of the fitted line, plus the in-sample R² score


## Libraries

In [2]:
# Standard library
import warnings
import os


# Third party
import numpy as np
import pandas as pd
from sklearn import linear_model

from IPython.display import display, Markdown


# Local
import utils

## Load Dataset

In [3]:
# Dataset identifier; every path below is derived from it.
DATASET = "CPAC_N10_11_10_20"

# Local output directory for this dataset's results.
RESULTS_DIR = f"results/{DATASET}"

# Remote locations of the dataset's README spreadsheet and its CSV.
DATASET_README = f"s3://cpac/ORIG/{DATASET}/READ_ME.xlsx"
DATASET_CSV = f"s3://cpac/ORIG/{DATASET}/CPAC10S_N10_11_10_20.csv"

# Load the unfiltered dataset and show summary statistics as the cell output.
df_orig = utils.load_dataset(DATASET_CSV)
df_orig.describe()

Unnamed: 0,M_Trial_Num,M_Mass,M_Mass_to_L5S1,M_sub_task_indices,M_sub_task_num,M_include_overall,M_Index,M_Sub,M_sub_task_num_overall,M_Index_overall,...,RWEO_01_02_00_00_INSOLE_RX_ML_threshF50_mm,RWEO_01_02_00_00_INSOLE_RY_AP_threshF50_mm,RWEF_03_00_00_00_INSOLE_LFORCE_threshF50_BW,RWEF_03_04_00_00_INSOLE_LX_ML_threshF50_BH,RWEF_03_04_00_00_INSOLE_LY_AP_threshF50_BH,RWEF_01_00_00_00_INSOLE_RFORCE_threshF50_BW,RWEF_01_02_00_00_INSOLE_RX_ML_threshF50_BH,RWEF_01_02_00_00_INSOLE_RY_AP_threshF50_BH,RWSF_SCALED_RINSOLE_BW,RWSF_SCALED_LINSOLE_BW
count,1971017.0,1971017.0,567.0,0.0,0.0,1971017.0,1971017.0,1971017.0,1971017.0,1971017.0,...,1683991.0,1683991.0,1967609.0,1661377.0,1661377.0,1967609.0,1683991.0,1683991.0,1967609.0,1967609.0
mean,69.42408,10.53204,0.0,,,0.8154633,2716.049,5.396072,0.0,92362.36,...,48.99182,131.988,0.4390082,0.02785011,0.06597227,0.4570948,0.02698083,0.07262811,0.4262551,0.3744147
std,23.67938,5.845028,0.0,,,0.3879214,2566.75,2.934289,0.0,63885.43,...,9.183171,53.35901,0.3258924,0.005077019,0.03046704,0.337089,0.005108799,0.02956883,0.3576745,0.3189683
min,1.0,0.0,0.0,,,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.05407405,-0.05624677
25%,49.0,5.0,0.0,,,1.0,598.0,3.0,0.0,35680.0,...,45.14,88.28,0.1586822,0.02579404,0.03957558,0.1723314,0.0245814,0.04846361,0.1368566,0.1042724
50%,79.0,10.0,0.0,,,1.0,1875.0,5.0,0.0,90430.0,...,50.76,130.21,0.4173078,0.02872237,0.06118229,0.4304055,0.02806366,0.07106925,0.3676134,0.3228916
75%,87.0,15.0,0.0,,,1.0,4215.0,8.0,0.0,145181.0,...,54.73,177.78,0.6735348,0.031,0.09114589,0.695824,0.03011671,0.09697091,0.6673267,0.6007689
max,96.0,23.0,0.0,,,1.0,14117.0,10.0,0.0,236897.0,...,79.11,267.78,2.068044,0.04425281,0.1506798,2.041016,0.04326744,0.1504382,1.98141,1.950301


## Clean-up dataset

- Keep only the samples whose `M_include_overall` flag is greater than 0; all other samples are dropped.

In [5]:
# Keep only the rows flagged for inclusion (M_include_overall > 0).
include_mask = df_orig["M_include_overall"] > 0
df = df_orig[include_mask]

# Optional subject filter to weed out wonky subjects (currently disabled):
#df = df[df["M_Sub"].isin([2,4,5,6,7,8,9])]
#RESULTS_DIR += "_nowonky"

# Count what survives the filter. dropna=False makes nunique() count a
# missing value as its own category, matching len(series.unique()).
n_samples, n_samples_orig = df.shape[0], df_orig.shape[0]
n_trials = df["M_Trial_Name"].nunique(dropna=False)
n_trials_orig = df_orig["M_Trial_Name"].nunique(dropna=False)
n_subjects = df["M_Sub"].nunique(dropna=False)

print(f"Number of samples: {n_samples:,d} (before clean-up: {n_samples_orig:,d})")
print(f"Number of trials: {n_trials} (before clean-up: {n_trials_orig})")
print(f"Number of subjects: {n_subjects}")

Number of samples: 1,607,292 (before clean-up: 1,971,017)
Number of trials: 174 (before clean-up: 174)
Number of subjects: 10


## Linear Regression

In [32]:
# Column names of the regression target and its single predictor.
target_col = "TF_Pelvis_Moment_X_BWBH"
feature_col = "SWRF_05_12_00_00_TRUNK_ANGLE_VL_X_sag"

# Drop every row where either column is missing so X and y stay aligned.
df_linreg = df[[target_col, feature_col]].dropna()

# The feature is reshaped into a single-column 2-D matrix; the target
# stays a 1-D array.
y = df_linreg[target_col].values
X = df_linreg[feature_col].values.reshape(-1, 1)

# Ordinary least-squares fit; the fitted estimator is the cell's output.
estimator = linear_model.LinearRegression()
estimator.fit(X, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [33]:
# Fitted slope: one coefficient per feature column (X has a single column).
estimator.coef_

array([0.00121295])

In [34]:
# Fitted intercept: the model's prediction when the feature value is 0.
estimator.intercept_

-0.014837700062068568

In [38]:
# Spot-check one sample: the model's prediction next to the observed target.
# The one-element slice keeps the input 2-D, as predict() requires.
sample_idx = 100
estimator.predict(X[sample_idx:sample_idx + 1]), y[sample_idx]

(array([-0.1027784]), -0.19516205451610896)

In [43]:
# Cross-check the scikit-learn fit by solving the least-squares problem
# directly. The design matrix is [x, 1], so the solution vector is
# (slope, intercept). The returned tuple is
# (solution, residual sum of squares, matrix rank, singular values).
#
# rcond=None is passed explicitly: omitting rcond raises a FutureWarning on
# NumPy 1.14 through 1.x, and None selects the machine-precision-based
# cutoff for small singular values.
np.linalg.lstsq(np.hstack((X, np.ones_like(X))), y, rcond=None)

(array([ 0.00121295, -0.0148377 ]),
 array([883.82060204]),
 2,
 array([55680.12152307,   864.24368316]))

In [44]:
# Coefficient of determination (R^2). It is computed on the same X and y
# that were used for fitting, so this is an in-sample score.
estimator.score(X, y)

0.7065912033269665