## Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Lasso, LassoCV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RepeatedKFold
from sklearn.datasets import make_regression
from sklearn.preprocessing import StandardScaler

## Define the files to be loaded

In [2]:
# Participant number whose data should be loaded
participant_number = 2

# The version of the questionnaires to load
quest_version = 2

## Loading the pre-processed data

In [3]:
df = pd.read_csv('data/preprocessed/preprocessed_data_' + str(participant_number) + '_v' + str(quest_version) + '.csv')
df = df.drop(labels=['actual_day', 'actual_day.1'], axis=1)

df_oura = pd.read_csv('data/preprocessed/preprocessed_sleep_' + str(participant_number) + '_v' + str(quest_version) + '.csv')
df_oura = df_oura.drop(labels=['actual_day'], axis=1)

df_quest = pd.read_csv('data/preprocessed/preprocessed_questionnaires_' + str(participant_number) + '_v' + str(quest_version) + '.csv')
df_quest = df_quest.drop(labels=['actual_day'], axis=1)

In [4]:
# Remove these to try and fix the issues raised by LazyRegressor for them 
# example: 
# AdaBoostRegressor model failed to execute
# Found unknown categories ['[0.0, 51.0, 53.0, 53.0, 54.0, 53.0, ...
df = df.drop(labels=['hrv.items', 'heart_rate.items', 'sleep_phase_5_min', 'movement_30_sec', 'segment_state', 'type'], axis=1)
df_oura = df_oura.drop(labels=['hrv.items', 'heart_rate.items', 'sleep_phase_5_min', 'movement_30_sec', 'segment_state', 'type'], axis=1)

In [5]:
df.fillna(999, inplace=True)
df_oura.fillna(999, inplace=True)
df_quest.fillna(999, inplace=True)

In [6]:
for i in range(len(df.columns)):
    if isinstance(df.iloc[1,i], str):
        print('column ' + df.columns[i])
        print('     ' + df.iloc[1,i])

## Apply Lasso

In [7]:
# Separate the indenpendent variables from the dependent variables
y = df['score']
X = df.drop(labels=['score', 'subjective_sleep_score'], axis=1)
X_oura = df_oura.drop(labels=['score'], axis=1)
X_quest = df_quest

In [8]:
def lasso_model(X, y, random_state):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = random_state)

    reg = Lasso(alpha=0.1)
    reg.fit(X_train, y_train)

    print("Number of coefficients", len(reg.coef_))
    print("Non-zero coeffcients", np.count_nonzero(reg.coef_))
    print("Coefficients", reg.coef_)
    return reg, X_train, X_test, y_train, y_test 

### Using as independent variables all variables (Oura + questionnaire data)

In [9]:
reg, X_train, X_test, y_train, y_test = lasso_model(X, y, random_state=np.random.randint(50000))

Number of coefficients 78
Non-zero coeffcients 28
Coefficients [ 0.00000000e+00 -0.00000000e+00  0.00000000e+00  0.00000000e+00
 -0.00000000e+00 -7.47665293e-05  6.00569637e-05  1.42928156e-21
 -8.97052432e-05 -3.83501729e-08  0.00000000e+00 -0.00000000e+00
  3.56094344e-03  3.66727841e-03  5.35246042e-03 -0.00000000e+00
  8.30908070e-03  3.81266317e-05  0.00000000e+00  0.00000000e+00
  1.39679734e-04  2.18025872e-05  0.00000000e+00 -4.85756174e-07
  0.00000000e+00  5.37496756e-04  5.10823232e-04 -5.81800057e-04
  1.43006165e-04  2.09096086e-02 -1.31171692e-04 -3.17687876e-04
 -0.00000000e+00 -0.00000000e+00  0.00000000e+00 -0.00000000e+00
  4.45941058e-05 -0.00000000e+00 -0.00000000e+00 -9.20579440e-05
 -1.27867417e-05 -0.00000000e+00  9.79635239e-06 -0.00000000e+00
  0.00000000e+00 -0.00000000e+00  0.00000000e+00 -0.00000000e+00
 -0.00000000e+00 -0.00000000e+00 -0.00000000e+00 -0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
 -0.00000000e+00  0.0000000

  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive


In [10]:
print('R squared training set', round(reg.score(X_train, y_train)*100, 2))
print('R squared test set', round(reg.score(X_test, y_test)*100, 2))

R squared training set 99.41
R squared test set 97.85


In [11]:
# Training data
pred_train = reg.predict(X_train)
mse_train = mean_squared_error(y_train, pred_train)
print('MSE training set', round(mse_train, 2))

# Test data
pred_test = reg.predict(X_test)
mse_test =mean_squared_error(y_test, pred_test)
print('MSE test set', round(mse_test, 2))

MSE training set 0.0
MSE test set 0.01


## Apply LassoCV

In [12]:
reg_cv = LassoCV(alphas=np.arange(0.01, 5, 0.01), cv=5, max_iter=10000)
reg_cv.fit(X_train, y_train)

print('R squared training set', round(reg_cv.score(X_train, y_train)*100, 2))
print('R squared test set', round(reg_cv.score(X_test, y_test)*100, 2))

# Training data
pred_train_cv = reg_cv.predict(X_train)
mse_train_cv = mean_squared_error(y_train, pred_train_cv)
print('MSE training set', round(mse_train_cv, 2))

# Test data
pred_test_cv = reg_cv.predict(X_test)
mse_test_cv = mean_squared_error(y_test, pred_test_cv)
print('MSE test set', round(mse_test_cv, 2))

  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive


R squared training set 99.89
R squared test set 99.62
MSE training set 0.0
MSE test set 0.0


  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive


In [13]:
print('alpha: %f' % reg_cv.alpha_)

alpha: 0.010000


In [14]:
scaler = StandardScaler()
scaler.fit(X_train)
sel_ = SelectFromModel(Lasso(alpha=0.001, random_state=10))
sel_.fit(scaler.transform(X_train), y_train)

SelectFromModel(estimator=Lasso(alpha=0.001, random_state=10))

In [15]:
sel_.get_support()

array([False,  True,  True,  True, False,  True,  True, False, False,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
       False, False,  True, False, False, False, False, False, False,
       False,  True,  True,  True, False, False, False, False,  True,
        True, False,  True, False, False,  True, False,  True, False,
        True,  True, False,  True,  True,  True, False, False, False,
       False, False, False, False,  True,  True, False, False, False,
        True,  True, False,  True, False, False,  True, False, False,
       False, False,  True, False,  True, False])

In [16]:
print(sel_.get_feature_names_out())
print(len(sel_.get_feature_names_out()))

['x1' 'x2' 'x3' 'x5' 'x6' 'x9' 'x10' 'x11' 'x12' 'x13' 'x14' 'x15' 'x16'
 'x17' 'x20' 'x28' 'x29' 'x30' 'x35' 'x36' 'x38' 'x41' 'x43' 'x45' 'x46'
 'x48' 'x49' 'x50' 'x58' 'x59' 'x63' 'x64' 'x66' 'x69' 'x74' 'x76']
36


In [17]:
selected_features = []
for i in range(len(sel_.get_support())):
    if sel_.get_support()[i] == True:
        selected_features.append(X_train.columns[i])

selected_features

['average_breath',
 'average_breath_variation',
 'average_heart_rate',
 'awake_time',
 'bedtime_end_delta',
 'bedtime_start_seconds',
 'contributors.deep_sleep',
 'contributors.efficiency',
 'contributors.latency',
 'contributors.rem_sleep',
 'contributors.restfulness',
 'contributors.timing',
 'contributors.total_sleep',
 'deep_sleep_duration',
 'latency',
 'readiness.contributors.previous_day_activity',
 'readiness.contributors.previous_night',
 'readiness.contributors.recovery_index',
 'readiness.temperature_trend_deviation',
 'rem_sleep_duration',
 'restless_periods',
 'timezone',
 'wake_ups',
 'bed_time',
 'Comfort',
 'I walked a bit',
 'I went to the toilet',
 'Naturally',
 'prev_subjective_score',
 'Mentally active',
 'Stressful',
 'alcohol_amount',
 'coffee_amount',
 'intervention',
 'stress_levels',
 'workout_intensity']

### Using as independent variables only the Oura data

In [18]:
reg_oura, X_train_oura, X_test_oura, y_train_oura, y_test_oura = lasso_model(X_oura, y, random_state=995)

Number of coefficients 47
Non-zero coeffcients 22
Coefficients [-8.64271361e-04  0.00000000e+00 -0.00000000e+00  0.00000000e+00
  0.00000000e+00 -0.00000000e+00 -7.13347442e-05  5.47466805e-05
  6.79890259e-18 -9.34642929e-05  5.98758452e-08  0.00000000e+00
  0.00000000e+00  0.00000000e+00  4.68314830e-03  8.19895173e-03
  0.00000000e+00  1.00729466e-02  3.05899517e-05  0.00000000e+00
  0.00000000e+00  3.02672166e-04  1.09255701e-05  0.00000000e+00
 -2.30471108e-06  0.00000000e+00  0.00000000e+00  0.00000000e+00
 -6.63404699e-06  0.00000000e+00  1.61907087e-02 -1.29909462e-04
 -5.68955130e-04 -0.00000000e+00 -0.00000000e+00 -0.00000000e+00
 -0.00000000e+00  3.20101327e-05 -0.00000000e+00 -0.00000000e+00
 -7.21189611e-05 -3.31376231e-06 -0.00000000e+00  1.08074263e-05
 -0.00000000e+00 -8.85537854e-04 -0.00000000e+00]


  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive


In [19]:
print('R squared training set', round(reg_oura.score(X_train_oura, y_train_oura)*100, 2))
print('R squared test set', round(reg_oura.score(X_test_oura, y_test_oura)*100, 2))

R squared training set 99.28
R squared test set 94.12


## Applying the same model on the questionnaire data only

In [20]:
reg_quest, X_train_quest, X_test_quest, y_train_quest, y_test_quest = lasso_model(X_quest, y, random_state=995)

Number of coefficients 34
Non-zero coeffcients 9
Coefficients [-3.74646738e-03 -0.00000000e+00 -0.00000000e+00 -0.00000000e+00
 -0.00000000e+00 -0.00000000e+00  0.00000000e+00 -0.00000000e+00
  0.00000000e+00  0.00000000e+00 -0.00000000e+00 -0.00000000e+00
 -0.00000000e+00  1.91832384e-01  2.98162620e-04 -0.00000000e+00
 -0.00000000e+00  0.00000000e+00 -0.00000000e+00 -1.97572877e-02
 -1.01430656e-01 -4.28180564e-05 -0.00000000e+00  1.16778449e-04
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00 -0.00000000e+00  0.00000000e+00  0.00000000e+00
 -2.62180584e-04  1.32713321e-04]


  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive


In [21]:
print('R squared training set', round(reg_quest.score(X_train_quest, y_train_quest)*100, 2))
print('R squared test set', round(reg_quest.score(X_test_quest, y_test_quest)*100, 2))

R squared training set 33.07
R squared test set 43.83
