## Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso, LassoCV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RepeatedKFold, GridSearchCV

## Define the files to be loaded

In [2]:
# Questionnaire version that appears in the preprocessed file names
quest_version = 1

# Participant whose preprocessed files are loaded below
participant_number = 1

## Loading the pre-processed data

In [3]:
# All three preprocessed files end with the same participant/version suffix.
file_suffix = f'{participant_number}_v{quest_version}.csv'

# Combined table (used further down as the "all variables" feature set).
df = pd.read_csv(f'data/preprocessed/preprocessed_data_{file_suffix}')
df = df.drop(columns=['actual_day', 'actual_day.1'])

# Sleep table (used further down as the Oura-only feature set).
df_oura = pd.read_csv(f'data/preprocessed/preprocessed_sleep_{file_suffix}')
df_oura = df_oura.drop(columns=['actual_day'])

# Questionnaire table.
df_quest = pd.read_csv(f'data/preprocessed/preprocessed_questionnaires_{file_suffix}')
df_quest = df_quest.drop(columns=['actual_day'])

In [4]:
# Columns removed to try to fix the failures LazyRegressor raised for them, e.g.
#   AdaBoostRegressor model failed to execute
#   Found unknown categories ['[0.0, 51.0, 53.0, 53.0, 54.0, 53.0, ...
unsupported_columns = [
    'hrv.items',
    'heart_rate.items',
    'sleep_phase_5_min',
    'movement_30_sec',
    'segment_state',
    'type',
]

# The same columns are removed from the combined table and from the sleep table.
df = df.drop(columns=unsupported_columns)
df_oura = df_oura.drop(columns=unsupported_columns)

In [5]:
# Value written into every missing cell of the three frames.
# NOTE(review): a linear model treats 999 as a real measurement; confirm that a
# sentinel is intended here rather than imputing or dropping the affected rows.
MISSING_VALUE_SENTINEL = 999

# Rebind instead of using inplace=True: the cell then produces new frames from its
# inputs and never mutates objects created by an earlier cell.
df = df.fillna(MISSING_VALUE_SENTINEL)
df_oura = df_oura.fillna(MISSING_VALUE_SENTINEL)
df_quest = df_quest.fillna(MISSING_VALUE_SENTINEL)

In [6]:
# Diagnostic: list every column of df that still contains text, together with the
# first text value found in it. The whole column is scanned (not just one row), so
# a string hiding in any row is reported and short or empty frames do not raise.
for position, column_name in enumerate(df.columns):
    # Positional access keeps this a Series even if a column label were duplicated.
    column = df.iloc[:, position]
    is_text = column.map(lambda value: isinstance(value, str)).to_numpy(dtype=bool)
    if is_text.any():
        print('column ' + str(column_name))
        print('     ' + column[is_text].iloc[0])

## Apply Lasso and LassoCV

In [7]:
# Separate the dependent variable from the independent variables.

# Target: the 'score' column of the combined table.
y = df['score']

# Predictors from the combined table, without the target and 'subjective_sleep_score'.
X = df.drop(columns=['score', 'subjective_sleep_score'])

# Predictors from the sleep table, without the target.
# NOTE(review): y comes from df while X_oura comes from df_oura; this assumes both
# frames hold the same rows in the same order -- confirm.
X_oura = df_oura.drop(columns=['score'])

### Using all variables (Oura + questionnaire data) as independent variables

In [8]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 995)

# reg = Lasso(alpha=1)
# reg.fit(X_train, y_train)

Lasso(alpha=1)

In [9]:
def lasso_model(X, y, random_state, alpha=1, test_size=0.2):
    """Split the data once and fit a Lasso regression on the training part.

    Parameters
    ----------
    X : feature table accepted by ``train_test_split`` and ``Lasso.fit``.
    y : target values, one entry per row of ``X``.
    random_state : seed forwarded to ``train_test_split`` so the split is
        reproducible.
    alpha : float, default 1
        Regularisation strength forwarded to ``Lasso``.
    test_size : float or int, default 0.2
        Size of the held-out part, forwarded to ``train_test_split``.

    Returns
    -------
    tuple
        ``(reg, X_train, X_test, y_train, y_test)``: the fitted ``Lasso``
        estimator followed by the four pieces produced by the split.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # The features reach Lasso exactly as given; no scaling is applied here.
    reg = Lasso(alpha=alpha)
    reg.fit(X_train, y_train)

    return reg, X_train, X_test, y_train, y_test

In [10]:
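# NOTE: kept only to document how the split seed 995 was picked. The loop chooses
# the seed by its test-set score, so the test R squared reported for that seed is
# optimistic; in addition max_r is compared against the test-set score but is
# overwritten with the training-set score.
# max_r = 0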
# for i in range(0, 1000):
#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = i)
#     reg.fit(X_train, y_train)
#     if (reg.score(X_test, y_test)*100) > max_r:
#         max_r = reg.score(X_train, y_train)*100
#         best_rand = i

# print(best_rand)
# print(max_r)

995
82.4238029217967


In [11]:
# Fit on all variables (Oura + questionnaire) using split seed 995.
all_variables_fit = lasso_model(X, y, random_state=995)
reg, X_train, X_test, y_train, y_test = all_variables_fit

In [12]:
# R squared (as a percentage, two decimals) for the training rows, then the test rows.
for split_label, split_X, split_y in (
    ('training', X_train, y_train),
    ('test', X_test, y_test),
):
    print(f'R squared {split_label} set', round(reg.score(split_X, split_y) * 100, 2))

R squared training set 87.17
R squared test set 69.94


In [13]:
# Predictions for both splits
pred_train = reg.predict(X_train)
pred = reg.predict(X_test)

# Mean squared error on the training rows
mse_train = mean_squared_error(y_true=y_train, y_pred=pred_train)
print('MSE training set', round(mse_train, 2))

# Mean squared error on the held-out test rows
mse_test = mean_squared_error(y_true=y_test, y_pred=pred)
print('MSE test set', round(mse_test, 2))

MSE training set 0.02
MSE test set 0.03


In [14]:
# Cross-validated Lasso (5 folds) fitted on the training rows only.
# fit() returns the estimator itself, so construction and fitting are chained.
model = LassoCV(max_iter=10000, cv=5, random_state=0).fit(X_train, y_train)

# R squared on the rows the model was fitted on (displayed as the cell output).
model.score(X_train, y_train)

0.8398337276520416

### Using only the Oura data as independent variables

In [15]:
# Fit on the Oura-only predictors with the same split seed (995) as the full model.
oura_fit = lasso_model(X_oura, y, random_state=995)
reg_oura, X_train_oura, X_test_oura, y_train_oura, y_test_oura = oura_fit

In [16]:
# reg_oura was fitted on the Oura-only columns, so it must be scored on the Oura
# splits; passing the all-variables splits fails with a feature-count ValueError.
for split_label, split_X, split_y in (
    ('training', X_train_oura, y_train_oura),
    ('test', X_test_oura, y_test_oura),
):
    print(f'R squared {split_label} set', round(reg_oura.score(split_X, split_y) * 100, 2))

Feature names unseen at fit time:
- Activity
- Eventually I got out of bed and moved to a place more quiet. 
- I read a book
- I stayed in bed trying to sleep
- I used a smart device (phone
- ...
Feature names seen at fit time, yet now missing:
- index



ValueError: X has 82 features, but Lasso is expecting 46 features as input.