## Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Lasso, LassoCV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RepeatedKFold
from sklearn.datasets import make_regression
from sklearn.preprocessing import StandardScaler

## Define the files to be loaded

In [2]:
# Participant number whose data should be loaded; it is interpolated into the
# names of the preprocessed CSV files read in the loading cell.
participant_number = 1

# The version of the questionnaires to load; it forms the `_v<version>` part
# of the same file names.
quest_version = 1

## Loading the pre-processed data

In [3]:
# Combined table (Oura + questionnaire data). The two day columns are dropped
# right after reading.
df = pd.read_csv(
    f'data/preprocessed/preprocessed_data_{participant_number}_v{quest_version}.csv'
).drop(columns=['actual_day', 'actual_day.1'])

# Sleep table on its own.
df_oura = pd.read_csv(
    f'data/preprocessed/preprocessed_sleep_{participant_number}_v{quest_version}.csv'
).drop(columns=['actual_day'])

# Questionnaire table on its own.
df_quest = pd.read_csv(
    f'data/preprocessed/preprocessed_questionnaires_{participant_number}_v{quest_version}.csv'
).drop(columns=['actual_day'])

In [4]:
# These columns are removed to try and fix the issues LazyRegressor raised for
# them, for example:
#   AdaBoostRegressor model failed to execute
#   Found unknown categories ['[0.0, 51.0, 53.0, 53.0, 54.0, 53.0, ...
unsupported_columns = [
    'hrv.items',
    'heart_rate.items',
    'sleep_phase_5_min',
    'movement_30_sec',
    'segment_state',
    'type',
]
df = df.drop(columns=unsupported_columns)
df_oura = df_oura.drop(columns=unsupported_columns)

In [5]:
# Replace every missing value with the sentinel 999 in all three tables.
# NOTE(review): the filled frames are later passed to Lasso, which treats 999
# as an ordinary numeric value - confirm that this sentinel is intended.
df = df.fillna(999)
df_oura = df_oura.fillna(999)
df_quest = df_quest.fillna(999)

In [6]:
# List every column whose value in the row at position 1 is a string,
# together with that value.
for position, column_name in enumerate(df.columns):
    cell = df.iloc[1, position]
    if not isinstance(cell, str):
        continue
    print('column ' + column_name)
    print('     ' + cell)

## Apply LassoCV

In [7]:
# Separate the independent variables from the dependent variable.
# Target: the 'score' column of the combined table.
y = df['score']
# Full feature set: everything in the combined table except the two score columns.
X = df.drop(columns=['score', 'subjective_sleep_score'])
# Oura-only feature set: the sleep table without its 'score' column.
X_oura = df_oura.drop(columns=['score'])

### Using all variables (Oura + questionnaire data) as independent variables

In [8]:
def lasso_model(X, y, random_state, alpha=1, test_size=0.2):
    """Split the data and fit a Lasso regression on the training part.

    Parameters
    ----------
    X : feature table accepted by ``train_test_split`` and ``Lasso.fit``.
    y : target values aligned with the rows of ``X``.
    random_state : seed forwarded to ``train_test_split`` so that the split
        is reproducible.
    alpha : L1 regularisation strength passed to ``Lasso``. Defaults to 1,
        the value that used to be hard-coded.
    test_size : share of the rows held out for testing. Defaults to 0.2,
        the value that used to be hard-coded.

    Returns
    -------
    tuple
        ``(reg, X_train, X_test, y_train, y_test)`` where ``reg`` is the
        Lasso model fitted on ``X_train`` / ``y_train``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # The model only ever sees the training part; the test part is returned
    # untouched so the caller can evaluate on it.
    reg = Lasso(alpha=alpha)
    reg.fit(X_train, y_train)

    return reg, X_train, X_test, y_train, y_test

In [9]:
# Fit the Lasso model on the full feature set X; random_state=995 is forwarded
# to the train/test split inside lasso_model so the split is reproducible.
reg, X_train, X_test, y_train, y_test = lasso_model(X, y, random_state=995)

In [10]:
# Coefficient of determination of the fitted model, as a percentage rounded
# to two decimals, on both parts of the split.
r2_train_pct = round(reg.score(X_train, y_train) * 100, 2)
r2_test_pct = round(reg.score(X_test, y_test) * 100, 2)

print('R squared training set', r2_train_pct)
print('R squared test set', r2_test_pct)

R squared training set 82.42
R squared test set 91.73


In [11]:
# Predict on both parts of the split first ...
pred_train = reg.predict(X_train)
pred = reg.predict(X_test)

# ... then compute the mean squared error of each set of predictions ...
mse_train = mean_squared_error(y_train, pred_train)
mse_test = mean_squared_error(y_test, pred)

# ... and report both, rounded to two decimals.
print('MSE training set', round(mse_train, 2))
print('MSE test set', round(mse_test, 2))

MSE training set 0.02
MSE test set 0.01


In [12]:
# Synthetic regression problem; in this cell it is only referenced by the
# commented-out alternative fit below.
X_reg, y_reg = make_regression(noise=4, random_state=0)

# 10-fold cross-validation repeated 3 times with a fixed seed.
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

# define model
# The alpha grid covers 0.01 .. 0.99 in steps of 0.01. It deliberately does
# not contain 0: alpha=0 turns Lasso into unregularised least squares, which
# scikit-learn advises against fitting with coordinate descent.
# max_iter matches the other LassoCV cell of this notebook.
model = LassoCV(alphas=np.arange(1, 100) / 100, cv=cv, max_iter=10000, n_jobs=-1)

# fit model
model.fit(X, y)
# model.fit(X_reg, y_reg)

# summarize chosen configuration
print('alpha: %f' % model.alpha_)

  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rn

alpha: 0.010000


  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive


In [21]:
# LassoCV with 5-fold cross-validation, fitted on the training split only.
# (Alternative kept for reference: model.fit(X_reg, y_reg))
model = LassoCV(cv=5, random_state=0, max_iter=10000).fit(X_train, y_train)

# Score and predictions on the training split.
train_r2 = model.score(X_train, y_train)
train_predictions = model.predict(X_train)
print(train_r2)
print(train_predictions)

# Test-split counterparts, currently disabled:
# print(model.score(X_test, y_test))
# print(model.predict(X_test))

0.8867931639081338
[9.13732241 8.94940834 8.65317872 9.184987   9.113976   8.75849268
 9.04011704 9.37807262 9.4163629  8.92478778 9.62957434 9.50611132
 9.18330363 8.66998378 8.98331897 8.90872347 8.64523056 9.58246386
 9.13902626 9.21181021 9.33111642 8.49423517 9.20602881 8.89271898
 9.11243392 9.70844159 9.44882449 9.03467275 8.81116874 9.47382263
 9.50257097 8.97524176 8.93038377 9.14763152 9.31922603 8.47074958
 8.51189262 9.38314429 8.8701181  9.18046607 9.12587192 9.64208046
 9.30900986 9.40189766]


In [14]:
# Fit the scaler on the training features only.
scaler = StandardScaler().fit(X_train)

# Lasso-based feature selector, fitted on the standardised training features.
sel_ = SelectFromModel(Lasso(alpha=0.001, random_state=10))
sel_.fit(scaler.transform(X_train), y_train)

SelectFromModel(estimator=Lasso(alpha=0.001, random_state=10))

In [15]:
# Boolean mask with one entry per input feature: True where the fitted
# selector kept the feature.
sel_.get_support()

array([ True,  True, False, False, False,  True, False, False, False,
       False, False,  True, False, False, False, False, False, False,
       False,  True,  True, False, False,  True,  True,  True,  True,
        True,  True, False,  True, False,  True, False,  True, False,
       False, False, False, False,  True, False, False,  True, False,
       False, False, False,  True,  True, False,  True, False,  True,
       False, False,  True,  True, False, False,  True, False, False,
       False, False,  True, False, False,  True,  True,  True,  True,
       False,  True,  True, False, False, False, False,  True,  True,
       False])

In [16]:
# Names of the kept features as reported by the selector, followed by how
# many there are (each on its own line).
kept_names = sel_.get_feature_names_out()
print(kept_names, len(kept_names), sep='\n')

['x0' 'x1' 'x5' 'x11' 'x19' 'x20' 'x23' 'x24' 'x25' 'x26' 'x27' 'x28'
 'x30' 'x32' 'x34' 'x40' 'x43' 'x48' 'x49' 'x51' 'x53' 'x56' 'x57' 'x60'
 'x65' 'x68' 'x69' 'x70' 'x71' 'x73' 'x74' 'x79' 'x80']
33


In [17]:
# Map the selector's boolean mask back onto the column names of X_train.
selected_features = X_train.columns[sel_.get_support()].tolist()

selected_features

['average_breath',
 'average_breath_variation',
 'deep_sleep_duration',
 'lowest_heart_rate_time_offset',
 'restless',
 'timezone',
 'contributors.total_sleep',
 'contributors.deep_sleep',
 'contributors.rem_sleep',
 'contributors.efficiency',
 'contributors.latency',
 'contributors.restfulness',
 'readiness.contributors.activity_balance',
 'readiness.contributors.hrv_balance',
 'readiness.contributors.previous_night',
 'readiness.temperature_trend_deviation',
 'prev_oura_score',
 'I stayed in bed trying to sleep',
 'I used a smart device (phone',
 'Knowing the alarm was set early',
 'Naturally',
 'Partner',
 'Partner ',
 'With an alarm',
 'work Stress',
 'Physically active',
 'Relaxed',
 'Sleepy',
 'Socially active',
 'Visitors made me go to bed later',
 'alcohol_time',
 'stress_levels',
 'stress_relief_time']

### Using only the Oura data as independent variables

In [18]:
# Same Lasso fit and the same split seed, but restricted to the Oura-only
# feature set X_oura.
# NOTE(review): y is taken from df while X_oura is derived from df_oura; this
# presumes both tables hold the same rows in the same order - confirm.
reg_oura, X_train_oura, X_test_oura, y_train_oura, y_test_oura = lasso_model(X_oura, y, random_state=995)

In [19]:
# Coefficient of determination of the Oura-only model, as a percentage
# rounded to two decimals, on both parts of the split.
r2_train_oura_pct = round(reg_oura.score(X_train_oura, y_train_oura) * 100, 2)
r2_test_oura_pct = round(reg_oura.score(X_test_oura, y_test_oura) * 100, 2)

print('R squared training set', r2_train_oura_pct)
print('R squared test set', r2_test_oura_pct)

R squared training set 82.91
R squared test set 91.71
