In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.svm import SVR
from sklearn.inspection import permutation_importance

In [None]:
# Path to the "physiological_cycles.csv" file from the whoop export.
# Defaults to that file name in the current working directory so the notebook
# runs on a fresh kernel; edit this assignment if the file lives elsewhere.
whoop_data_filepath = "physiological_cycles.csv"
df = pd.read_csv(whoop_data_filepath)

In [None]:
# Preview the first rows of the loaded data to check the columns and values.
df.head()

In [None]:
# Define some functions to normalize either by mean or min-max
def normalize_by_mean(df, column):
    """Mean-normalize one column of ``df`` in place.

    Each value becomes ``(value - mean) / (max - min)``, so the column is
    centred on zero and spans a range of width 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column; it is modified in place.
    column : str
        Label of the numeric column to normalize.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenient reassignment.
    """
    values = df[column]
    centered = values - values.mean()
    value_range = values.max() - values.min()
    # A constant column has a zero range; dividing by it would turn every
    # entry into NaN (0/0). Treat the range as 1 so the column becomes zeros.
    if value_range == 0:
        value_range = 1
    df[column] = centered / value_range
    return df

def normalize_min_max(df, column):
    """Min-max scale one column of ``df`` in place to the interval [-1, 1].

    Each value becomes ``2 * (value - min) / (max - min) - 1``, so the column
    minimum maps to -1 and the maximum to 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column; it is modified in place.
    column : str
        Label of the numeric column to scale.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenient reassignment.
    """
    values = df[column]
    lowest = values.min()
    value_range = values.max() - lowest
    # A constant column has a zero range; dividing by it would turn every
    # entry into NaN (0/0). Treat the range as 1 so the column maps to -1.
    if value_range == 0:
        value_range = 1
    df[column] = 2 * (values - lowest) / value_range - 1
    return df

In [None]:
# Features that are unlikely to impact recovery (cycle/sleep timing and timezone columns)
cols_to_drop = [
    'Cycle start time',
    'Cycle end time',
    'Cycle timezone',
    'Sleep onset',
    'Wake onset',
]
df = df.drop(columns=cols_to_drop)

In [None]:
# Discard every row that has a missing value, then pull the recovery score
# out as the prediction target and remove it from the feature frame.
df = df.dropna()
recovery = df['Recovery score %']
df = df.drop(columns=['Recovery score %'])

In [None]:
# Columns that might make sense to normalize by mean; every other column is
# collected in other_cols.
cols_to_norm_mean = [
    'Resting heart rate (bpm)',
    'Heart rate variability (ms)',
    'Energy burned (cal)',
    'Max HR (bpm)',
    'Respiratory rate (rpm)',
]
other_cols = [name for name in df.columns if name not in cols_to_norm_mean]

In [None]:
# Apply mean normalization to the selected columns first, then min-max
# scaling to the rest. NOTE(review): the statistics are computed on the full
# frame, i.e. before the train/test split below.
for normalize, columns in (
    (normalize_by_mean, cols_to_norm_mean),
    (normalize_min_max, other_cols),
):
    for col in columns:
        df = normalize(df, col)

In [None]:
# (rows, columns) of the feature frame after cleaning and normalization.
df.shape

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    df,
    recovery,
    test_size=0.3,
    random_state=15,
)

In [None]:
# Linear model with an L1 penalty (Lasso), fitted on the training split
linear_model = Lasso(alpha=0.5).fit(X_train, y_train)

# Score the fitted model on the held-out rows
y_pred_linear = linear_model.predict(X_test)
mse_linear = mean_squared_error(y_test, y_pred_linear)
print(f"Linear Regression Mean Squared Error: {mse_linear}")

In [None]:
# Table of actual vs. predicted values, with the signed error (predicted - actual)
results_linear = pd.DataFrame(
    {
        'Actual': y_test,
        'Predicted': y_pred_linear,
        'Difference': y_pred_linear - y_test,
    }
)

In [None]:
# Display the actual-vs-predicted table for the linear model.
results_linear

In [None]:
# Inspect the fitted linear model: its intercept and one coefficient per feature
intercept = linear_model.intercept_
coefficients = linear_model.coef_

print("Intercept:", intercept)
print("Coefficients:")
# Feature column names and coefficients are paired by position
for feature, coef in zip(df.columns, coefficients):
    print(feature, ':', coef)

In [None]:
# Same fit / predict / score steps using a gradient boosting model.
# random_state is fixed (same seed as the train/test split) so the reported
# error and the feature importances are reproducible across runs.
gbm_model = GradientBoostingRegressor(random_state=15)

gbm_model.fit(X_train, y_train)

# Predict on the held-out rows and score against the true values
y_pred_gbm = gbm_model.predict(X_test)

mse_gbm = mean_squared_error(y_test, y_pred_gbm)
print(f"Gradient Boosting Machine Mean Squared Error: {mse_gbm}")

In [None]:
# Table of actual vs. GBM-predicted values, with the signed error (predicted - actual)
results_gbm = pd.DataFrame(
    {
        'Actual': y_test,
        'Predicted': y_pred_gbm,
        'Difference': y_pred_gbm - y_test,
    }
)

In [None]:
# Display the actual-vs-predicted table for the GBM model.
results_gbm

In [None]:
# Rank the training features by the fitted GBM's feature_importances_, largest first
feature_importance = pd.DataFrame(
    {'Feature': X_train.columns, 'Importance': gbm_model.feature_importances_}
).sort_values(by='Importance', ascending=False)

print("Feature Importance:")
print(feature_importance)

In [None]:
# SVR model (RBF kernel), fitted on the training split
svr_model = SVR(kernel='rbf', C=1.0, epsilon=0.1)

svr_model.fit(X_train, y_train)

# Predict with the SVR itself; this previously called gbm_model.predict, so
# the "SVR" error and results table were really the GBM's.
y_pred_svr = svr_model.predict(X_test)

mse_svr = mean_squared_error(y_test, y_pred_svr)
print(f"SVR Mean Squared Error: {mse_svr}")

In [None]:
# Table of actual vs. predicted values for the SVR cell, with the signed error (predicted - actual)
results_svr = pd.DataFrame(
    {
        'Actual': y_test,
        'Predicted': y_pred_svr,
        'Difference': y_pred_svr - y_test,
    }
)

In [None]:
# Display the actual-vs-predicted table built from y_pred_svr.
results_svr

In [None]:
# Permutation importance on the held-out split. gbm_model is used here, but
# any of the three fitted models can be passed instead.
result = permutation_importance(gbm_model, X_test, y_test, random_state=3)
importance = result.importances_mean

# Tabulate the mean importance per feature, largest first
feature_importance = pd.DataFrame(
    {'Feature': X_test.columns, 'Importance': importance}
).sort_values(by='Importance', ascending=False)

# Express each feature's absolute importance as a percentage of the total
total = feature_importance['Importance'].abs().sum()
feature_importance['% Importance'] = feature_importance['Importance'].abs() / total * 100
feature_importance.sort_values(by='% Importance', ascending=False)