## Imports

In [1]:
import pandas as pd
import numpy as np
import json
from datetime import *
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer

### Define the files to be loaded

In [2]:
# Participant number whose data should be loaded. It is interpolated into the names of
# the Oura json file, the questionnaire csv files and the exported csv file.
participant_number = 1

# The version of the questionnaires to load (the '_v<version>' part of the csv file names)
quest_version = 1

## Loading the data

In [3]:
# Loading sleep data from Oura json file available on https://cloud.ouraring.com/profile
# The path is relative to the working directory, exactly like the questionnaire csv paths
# ('data/questionnaire_data/...'), so the notebook does not depend on one user's home folder.
with open(f'data/oura_json/oura_{participant_number}.json', encoding='utf-8') as f:
    data = json.load(f)

# Flatten the nested json file (nested keys become dotted column names, at most two levels deep)
sleep_df = pd.json_normalize(data['sleep'], max_level=2)
print('sleep_df', sleep_df.shape)

sleep_df (1279, 56)


In [4]:
# Both questionnaires are Google Form csv exports whose file names end with the same
# '<participant>_v<version>.csv' suffix
file_suffix = str(participant_number) + '_v' + str(quest_version) + '.csv'

# Raw wake up questionnaire
wakeup_df = pd.read_csv('data/questionnaire_data/Wakeup_' + file_suffix)
print('wakeup_df', wakeup_df.shape)

# Raw bedtime questionnaire. Its 'Timestamp' column is renamed so that it can be told
# apart from the 'Timestamp' column of the wakeup dataframe
bedtime_df = pd.read_csv('data/questionnaire_data/Bedtime_' + file_suffix)
bedtime_df = bedtime_df.rename(columns={'Timestamp': 'Timestamp_bedtime'})
print('bedtime_df', bedtime_df.shape)

wakeup_df (57, 9)
bedtime_df (57, 10)


## Initial pre-processing of datasets separately

In [5]:
# Print a single Oura record (the second row) next to its column names, to read off the units of each variable
print(sleep_df.iloc[1])

average_breath                                                      13.25
average_breath_variation                                              4.5
average_heart_rate                                                    NaN
average_hrv                                                           NaN
awake_time                                                          120.0
bedtime_end                                     2019-08-17T20:55:29+02:00
bedtime_start                                   2019-08-17T20:53:29+02:00
day                                                            2019-08-18
deep_sleep_duration                                                   0.0
efficiency                                                            0.0
got_ups                                                               0.0
latency                                                               0.0
light_sleep_duration                                                  0.0
lowest_heart_rate                     

In [6]:
# Print a single wakeup questionnaire record (the second row) to decide how each answer should be pre-processed
print(wakeup_df.iloc[1])

Timestamp                                                                                                                                                                                9/27/2022 7:22:16
Did you wake up refreshed after your night of sleep?                                                                                                                                                     7
Do you recall being awake for more than 10 minutes during the night?                                                                                                                                   Yes
If you answered 'Yes' to the previous question, what did you do during this time?                                                                        I stayed in bed trying to sleep, I went to the...
Did anything in particular prevent you from sleeping earlier last night?                                                                                                                    

In [7]:
# Print a single bedtime questionnaire record (the second row) to decide how each answer should be pre-processed
print(bedtime_df.iloc[1])

Timestamp_bedtime                                                                                               9/26/2022 21:02:37
If you had coffee today, at what time did you take your last cup of coffee?                                             2:30:00 PM
If you had alcohol today, at what time did you have your last drink?                                                           NaN
If you did a workout today, at what time did your last workout end?                                                            NaN
If you did a stress-relieving activity today (meditation, yoga, etc), at what time did you do your last one?                   NaN
Did you take medication that might affect your sleep?                                                                   Melatonin 
How much did you eat in the last 3 hours before going to bed?                                                                    1
How would you characterize the activities you did in the last 3 hours before going 

In [8]:
# Convert the ISO 8601 notation of some variables in the oura ring data to seconds
def convert_iso_seconds(df, column_name, new_column_name):
    """Parse an ISO 8601 column and add its time of day in seconds.

    The dataframe is modified in place: every string in `column_name`
    (format '%Y-%m-%dT%H:%M:%S%z') is replaced by the parsed timezone-aware
    datetime, and `new_column_name` receives the number of seconds elapsed
    since local midnight of that same day.

    Parameters:
        df: dataframe holding the column to convert.
        column_name: name of the column with the ISO 8601 strings.
        new_column_name: name of the column that receives the seconds.

    Returns:
        The same dataframe object, for convenient reassignment.
    """
    df[new_column_name] = np.nan
    # Snapshot the (label, value) pairs first because the column is rewritten inside the loop.
    # Values are read from `df` itself and addressed by label, so the function works for
    # any dataframe and any index, not only for the global sleep dataframe.
    for index, iso_string in list(df[column_name].items()):
        parsed = datetime.strptime(iso_string, '%Y-%m-%dT%H:%M:%S%z')
        midnight = parsed.replace(hour=0, minute=0, second=0, microsecond=0)
        df.at[index, column_name] = parsed
        df.at[index, new_column_name] = (parsed - midnight).total_seconds()
    return df

# Parse both bedtime boundaries and add their '<column>_seconds' companions
for iso_column in ('bedtime_end', 'bedtime_start'):
    sleep_df = convert_iso_seconds(sleep_df, iso_column, iso_column + '_seconds')

In [9]:
# Change how the date of the oura data is stored
def convert_sleep_date(df):
    """Replace the 'YYYY-MM-DD' strings of the 'day' column by `date` objects.

    The dataframe is modified in place.

    Parameters:
        df: dataframe with a 'day' column of dash-separated date strings.

    Returns:
        The dataframe that was passed in (not a global), so the function can be
        used on any dataframe.
    """
    # Snapshot the column because it is rewritten while looping; rows are addressed by label
    for index, day_string in list(df['day'].items()):
        year, month, day_of_month = (int(part) for part in day_string.split('-'))
        df.at[index, 'day'] = date(year, month, day_of_month)
    return df

# Turn the 'day' strings of the Oura dataframe into date objects
sleep_df = convert_sleep_date(sleep_df)

In [10]:
# Rename the column names of the bedtime questionnaire to make it easier to pre-process.
# Two wordings of the workout question are listed; both map to 'workout_time'.
bedtime_column_names = {
    'If you had coffee today, at what time did you take your last cup of coffee?': 'coffee_time',
    'If you had alcohol today, at what time did you have your last drink?': 'alcohol_time',
    'If you did a workout today, at what time did you do your last workout?': 'workout_time',
    'If you did a workout today, at what time did your last workout end?': 'workout_time',
    'If you did a stress-relieving activity today (meditation, yoga, etc), at what time did you do your last one?': 'stress_relief_time',
    'Did you take medication that might affect your sleep?': 'medication',
    'How much did you eat in the last 3 hours before going to bed?': 'eat_bedtime',
    'How would you characterize the activities you did in the last 3 hours before going to bed?': 'activities_bedtime',
    'How much stress and/or anxiety do you feel now?': 'stress_levels',
    'Is there any particular reason why you think you might or might not sleep well tonight?': 'special_circumstances',
}
bedtime_df.columns = [bedtime_column_names.get(name, name) for name in bedtime_df.columns]

# Rename the column names of the wakeup questionnaire to make it easier to pre-process
wakeup_column_names = {
    'Did you wake up refreshed after your night of sleep?': 'subjective_sleep_score',
    'Do you recall being awake for more than 10 minutes during the night?': 'recall_awake',
    "If you answered 'Yes' to the previous question, what did you do during this time?": 'awake_activities',
    'Did anything in particular prevent you from sleeping earlier last night?': 'prevent_early_sleep',
    'Did anything in particular prevent you from sleeping better?': 'prevent_better_sleep',
    'How did you wake up?': 'method_woken_up',
    'How many minutes before bedtime did you stop exposing yourself to blue light? (bright phone, TV, etc)': 'blue_light',
    'Is there anything important about or related to your night of sleep that was not covered in any of the questions in this questionnaire? If yes, what?': 'special_circumstances_wakeup',
}
wakeup_df.columns = [wakeup_column_names.get(name, name) for name in wakeup_df.columns]

In [11]:
# Check the renamed columns of both questionnaires (bedtime first, then wakeup)
for questionnaire_df in (bedtime_df, wakeup_df):
    print(list(questionnaire_df.columns))

['Timestamp_bedtime', 'coffee_time', 'alcohol_time', 'workout_time', 'stress_relief_time', 'medication', 'eat_bedtime', 'activities_bedtime', 'stress_levels', 'special_circumstances']
['Timestamp', 'subjective_sleep_score', 'recall_awake', 'awake_activities', 'prevent_early_sleep', 'prevent_better_sleep', 'method_woken_up', 'blue_light', 'special_circumstances_wakeup']


In [12]:
# The sleep score of the Oura ring is on a scale from 0 to 100, but the subjective sleep score is between 0 and 10.
# Divide the Oura sleep scores by 10 for an easier comparison.
# NOTE: the column is rescaled in place, so running this cell a second time divides the scores by 10 again.
sleep_df.loc[:,'score'] /= 10

In [13]:
# Map the 'Yes/No' questions in the questionnaire to '1/0'
def yes_no_questions(df, column_name):
    """Encode a Yes/No answer column as 1/0, in place.

    Only the exact string 'Yes' becomes 1; every other value (including 'No',
    missing answers and any other text) becomes 0.

    Parameters:
        df: questionnaire dataframe.
        column_name: name of the column to encode.

    Returns:
        The same dataframe object, with `column_name` holding integers.
    """
    # Vectorized comparison: works for any index (the row loop addressed rows by
    # position with index labels, which is only valid for a default RangeIndex)
    df[column_name] = df[column_name].eq('Yes').astype(int)
    return df

# NOTE(review): the bedtime record printed earlier in this notebook shows a free-text
# 'medication' answer ('Melatonin '). yes_no_questions only maps the exact string 'Yes'
# to 1, so such an answer becomes 0 - confirm that this is the intended encoding.
bedtime_df = yes_no_questions(bedtime_df, 'medication')
wakeup_df = yes_no_questions(wakeup_df, 'recall_awake')
wakeup_df = yes_no_questions(wakeup_df, 'prevent_early_sleep')

In [14]:
# Map the special circumstances questions to 1 if there is a special circumstance, and 0 otherwise
def special_circumstances(df, column_name):
    """Encode a free-text column as a presence flag, in place.

    A missing answer becomes 0, any entered answer becomes 1.

    Parameters:
        df: questionnaire dataframe.
        column_name: name of the free-text column to encode.

    Returns:
        The same dataframe object, with `column_name` holding integers.
    """
    # Vectorized and label-safe replacement of the row-by-row loop
    df[column_name] = df[column_name].notna().astype(int)
    return df

# Flag the nights for which the participant reported something special, in either questionnaire
bedtime_df = special_circumstances(bedtime_df, 'special_circumstances')
wakeup_df = special_circumstances(wakeup_df, 'special_circumstances_wakeup')

In [15]:
# Perform the integer encoding of the 'blue_light' variable
def integer_encoding_blue_light(df):
    """Encode the ordered 'blue_light' answers as integers 0..3, in place.

    'Less than 15 minutes' -> 0, '15 to 30 minutes' -> 1, '30 to 45 minutes' -> 2,
    and every other value (including missing answers) -> 3.

    The codes are assigned from a fixed table instead of a LabelEncoder fitted on
    the data: a fitted encoder numbers only the categories that are present, so
    the same answer would get a different code whenever some category never
    occurs in a participant's data.

    Parameters:
        df: wakeup questionnaire dataframe with a 'blue_light' column.

    Returns:
        The same dataframe object, with 'blue_light' holding integers.
    """
    category_codes = {
        'Less than 15 minutes': 0,
        '15 to 30 minutes': 1,
        '30 to 45 minutes': 2,
    }
    # Answers outside the table fall into the last category
    df['blue_light'] = df['blue_light'].map(category_codes).fillna(3).astype(int)
    return df

# Replace the blue light answers of the wakeup questionnaire by their integer codes
wakeup_df = integer_encoding_blue_light(wakeup_df)

In [16]:
# One-hot encoding a variable in a given dataframe
def onehot_encoding(df, column_name):
    """Return a copy of `df` in which `column_name` is replaced by indicator columns.

    One indicator column is created per distinct value of `column_name` and
    appended after the remaining columns. The input dataframe is left untouched.
    """
    indicator_columns = pd.get_dummies(df[column_name])
    remaining_columns = df.drop(columns=[column_name])
    return remaining_columns.join(indicator_columns)

# Replace 'method_woken_up' by one indicator column per answer
wakeup_df = onehot_encoding(wakeup_df, 'method_woken_up')

In [17]:
# One-hot encoding for a variable that can have multiple answers selected in a given dataframe
def multi_label_onehot_encoder(df, column_name):
    """Replace a multiple-choice column by one indicator column per answer.

    Each cell is a string of selected answers separated by ", "; a missing cell
    is treated as the single answer 'None'. The result has one 0/1 column per
    distinct answer, appended after the remaining columns.

    Parameters:
        df: questionnaire dataframe (not modified).
        column_name: name of the multiple-choice column.

    Returns:
        A new dataframe without `column_name` and with the indicator columns.
    """
    def _selected_answers(answer):
        # Turn one cell into the list of selected answers
        if pd.isna(answer):
            return ['None']
        answers = answer.split(", ")
        if column_name == 'awake_activities':
            # The split cue also appears inside one of the answers of this column.
            # Drop the fragment 'laptop...)' so that this answer is counted once.
            # A new list is built instead of removing items from the list being iterated.
            answers = [item for item in answers if item != 'laptop...)']
        return answers

    label_lists = [_selected_answers(answer) for answer in df[column_name]]

    # Multi-label one-hot encoding
    mlb = MultiLabelBinarizer()
    indicator_columns = pd.DataFrame(mlb.fit_transform(label_lists), columns=mlb.classes_, index=df.index)
    return df.drop(columns=[column_name]).join(indicator_columns)

# NOTE(review): every encoded column that has a missing cell yields an indicator column
# named 'None'; joining a second 'None' column onto the same dataframe would fail on the
# overlapping name - confirm that at most one of the wakeup columns has missing cells.
wakeup_df = multi_label_onehot_encoder(wakeup_df, 'awake_activities')
wakeup_df = multi_label_onehot_encoder(wakeup_df, 'prevent_better_sleep')

bedtime_df = multi_label_onehot_encoder(bedtime_df, 'activities_bedtime')

In [18]:
# Only consider the date of the questionnaire datasets and ignore the exact time
def remove_time(string_date):
    """Return the part of `string_date` that precedes its first space."""
    date_part, _, _ = string_date.partition(" ")
    return date_part

# Change how the date of the questionnaire data is stored
def convert_questionnaire_date(quest_date):
    """Turn a 'month/day/year[ time]' questionnaire timestamp into a `date` object."""
    month, day_of_month, year = (int(part) for part in remove_time(quest_date).split('/'))
    return date(year, month, day_of_month)

In [19]:
# Add a 'day' column in the questionnaire dataframes to compare with oura data
def add_day_col(df, column_name):
    """Add a 'day' column holding the date of each questionnaire timestamp, in place.

    Parameters:
        df: questionnaire dataframe.
        column_name: name of the column with 'month/day/year time' timestamps.

    Returns:
        The same dataframe object, with the new 'day' column of `date` objects.
    """
    # Build the whole column at once: this avoids writing date objects into a float
    # column pre-filled with NaN, and does not depend on the index being 0..n-1
    df['day'] = [convert_questionnaire_date(timestamp) for timestamp in df[column_name]]
    return df

wakeup_df = add_day_col(wakeup_df, 'Timestamp')
bedtime_df = add_day_col(bedtime_df, 'Timestamp_bedtime')

In [20]:
# Bedtime data entered on day n should be matched to the wakeup and sleep data of day n+1 if the questionnaire data is entered before midnight.
# 16:00 is arbitrarily chosen as a delimiter: late enough that the person should have woken up, and early enough that people did not go to sleep yet.
# With this implementation, we assume that people follow a sleep schedule such that they sleep at night and wake up during the day.
# NOTE: the 'day' column is shifted in place, so running this cell twice shifts the affected rows twice.
for index, filled_in_at in bedtime_df['Timestamp_bedtime'].items():
    questionnaire_time = datetime.strptime(filled_in_at, '%m/%d/%Y %H:%M:%S')
    four_pm = questionnaire_time.replace(hour=16, minute=0, second=0, microsecond=0)

    # Filled in after 16:00: the answers describe the evening before the upcoming night,
    # so the row is moved to the next calendar day. Rows time-stamped at or before 16:00
    # (e.g. filled in after midnight) keep the date of their timestamp.
    if questionnaire_time > four_pm:
        bedtime_df.at[index, 'day'] = bedtime_df.at[index, 'day'] + timedelta(days=1)

## Combining the three dataframes

In [21]:
# Find the start and end dates of a dataframe
def find_start_end_dates(df):
    """Return the 'day' values of the first and of the last row of `df`, as a tuple.

    No sorting is done: the rows are taken in the order in which they are stored.
    """
    day_column = df['day']
    return day_column.iloc[0], day_column.iloc[-1]

# First and last 'day' of each of the three datasets
sleep_start_date, sleep_end_date = find_start_end_dates(sleep_df)
wakeup_start_date, wakeup_end_date = find_start_end_dates(wakeup_df)
bedtime_start_date, bedtime_end_date = find_start_end_dates(bedtime_df)

# Compare two dates and only return the earliest or the latest, depending on what is required
def compare_two_dates(date1, date2, earliest):
    """Return the earlier of the two dates when `earliest` is truthy, else the later one."""
    select = min if earliest else max
    return select(date1, date2)

# Compare three dates and only return the earliest or the latest, depending on what is required
def compare_three_dates(sleep_date, wakeup_date, bedtime_date, earliest):
    """Return the earliest of the three dates when `earliest` is truthy, else the latest one."""
    sleep_or_wakeup = compare_two_dates(sleep_date, wakeup_date, earliest)
    return compare_two_dates(sleep_or_wakeup, bedtime_date, earliest)

# Determine the time range for which all three data sets simultaneously have data:
# it starts at the latest of the three start dates and ends at the earliest of the three end dates
latest_start = compare_three_dates(sleep_start_date, wakeup_start_date, bedtime_start_date, earliest = False)
earliest_end = compare_three_dates(sleep_end_date, wakeup_end_date, bedtime_end_date, earliest = True)

print('latest start date', latest_start, "\nearliest end date", earliest_end)


latest start date 2022-09-26 
earliest end date 2022-11-23


In [22]:
# Cut the data sets so that they only cover the period [latest_start, earliest_end]
def cut_df(df, start, end):
    """Return the rows of `df` whose 'day' lies in the period [start, end].

    The rows are scanned in their stored order (assumed chronological):
    rows before `start` are skipped, and the scan stops at the first row dated
    after `end` or right after the first row dated exactly `end` (further rows
    carrying the end date are therefore not included).

    The rows are collected in a list and concatenated once, because
    `DataFrame.append` no longer exists in pandas 2 and appending row by row
    copies the whole result on every iteration.

    Parameters:
        df: dataframe with a 'day' column comparable to `start` and `end`.
        start: first day to keep.
        end: last day to keep.

    Returns:
        A new dataframe with a fresh 0..n-1 index; an empty dataframe when no
        row falls inside the period.
    """
    kept_rows = []
    for _, row in df.iterrows():
        row_day = row['day']
        # Only add the data starting from the start date
        if row_day < start:
            continue

        # End now if the end date is missing from the dataset
        if row_day > end:
            break

        kept_rows.append(pd.DataFrame(row).transpose())

        # End if the end date is reached
        if row_day == end:
            break

    if not kept_rows:
        return pd.DataFrame()
    return pd.concat(kept_rows, ignore_index=True)

# Restrict the three datasets to the period they have in common
truncated_sleep_df = cut_df(sleep_df, latest_start, earliest_end)
truncated_wakeup_df = cut_df(wakeup_df, latest_start, earliest_end)
truncated_bedtime_df = cut_df(bedtime_df, latest_start, earliest_end)

print('truncated_sleep_df', truncated_sleep_df.shape, '\ntruncated_wakeup_df', truncated_wakeup_df.shape, '\ntruncated_bedtime_df', truncated_bedtime_df.shape)

truncated_sleep_df (59, 58) 
truncated_wakeup_df (57, 25) 
truncated_bedtime_df (57, 17)


In [23]:
# Determine the days with missing data
def find_missing_days(df, start_date, end_date, column_name):
    """Return the days of [start_date, end_date] that do not occur in `df[column_name]`.

    The result is a DatetimeIndex; `df` itself is not modified.
    """
    # Dates that are present, converted to a DatetimeIndex
    observed_days = pd.to_datetime(df.set_index(column_name).index)

    # Every calendar day of the period
    expected_days = pd.date_range(start_date, end_date)

    # Dates which are not in the sequence are returned
    return expected_days.difference(observed_days)

In [24]:
# Create a dataframe that goes from latest_start to earliest_end with no jumps in the dates, except that days without data are empty rows.
def complete_df(df, latest_start, earliest_end):
    """Return a copy of `df` with a placeholder row for every missing day.

    The rows are scanned in their stored order (assumed chronological):
    - a placeholder row is inserted for each missing day that precedes the
      current row. Placeholders only carry a value (None) in a column named 0,
      so all the original columns are empty on those rows;
    - when a row has the same 'day' as the row right before it, the earlier
      entry for that day is discarded. Rows dated exactly `latest_start` are
      exempt from this check, as in the original implementation.

    Days that are missing after the last row of `df` get no placeholder, because
    placeholders are only inserted in front of an existing row.

    Parameters:
        df: dataframe with a 'day' column of dates.
        latest_start: first day of the period.
        earliest_end: last day of the period.

    Returns:
        A new dataframe with a fresh 0..n-1 index.
    """
    missing_days = find_missing_days(df, latest_start, earliest_end, 'day')
    # (day, one-row dataframe) pairs in output order; day is None for placeholders
    pieces = []
    # Initialised up front so the duplicate check below cannot hit an unassigned
    # variable when the first row is dated after latest_start
    previous_day = None
    next_missing = 0
    for _, row in df.iterrows():
        current_day = row['day']

        # Prevents adding data twice for a single day: drop the earlier entry
        if current_day > latest_start and current_day == previous_day:
            for position, (piece_day, _piece) in enumerate(pieces):
                if piece_day == previous_day:
                    del pieces[position]
                    break
        previous_day = current_day

        # If data is missing before the considered date, add an empty row.
        # The date is converted to a Timestamp because recent pandas versions
        # refuse to order a Timestamp against a plain date object.
        while next_missing < len(missing_days) and pd.Timestamp(current_day) > missing_days[next_missing]:
            pieces.append((None, pd.DataFrame([None])))
            next_missing += 1

        pieces.append((current_day, pd.DataFrame(row).transpose()))

    if not pieces:
        return pd.DataFrame()
    # One concatenation at the end instead of DataFrame.append inside the loop
    return pd.concat([piece for _day, piece in pieces], ignore_index=True)

In [25]:
# Fill the missing days of the sleep df with empty rows, so that it has one row per day of the common period
complete_sleep_df = complete_df(truncated_sleep_df, latest_start, earliest_end)

missing days are DatetimeIndex([], dtype='datetime64[ns]', freq=None)


In [26]:
# Create a list of all the dates between latest_start and earliest_end (both included)
number_of_days = (earliest_end - latest_start).days + 1
all_dates = [latest_start + timedelta(days=offset) for offset in range(number_of_days)]
# One date per row: this assignment requires complete_sleep_df to have exactly one row per day
complete_sleep_df['actual_day'] = all_dates

In [27]:
# Convert the way in which time is stored in the questionnaire to a continuous scale
def temporal_conversion(df, column_name):
    """Convert 12-hour clock strings ('2:30:00 PM') to fractional hours, in place.

    - PM times other than 12 PM are shifted by 12 hours (2:30 PM -> 14.5).
    - 12:xx AM is read as hour 0.
    - For the other times, the Oura night that shares the row's 'day' is looked
      up in the global `complete_sleep_df`; when that night started after
      midnight and the activity took place on that date before the bedtime,
      24 hours are added so that the time logically follows the previous
      evening (e.g. 0:30 -> 24.5).
    - Seconds are ignored. Missing cells and cells that no longer hold a string
      (e.g. already converted by an earlier run of the cell) are left untouched.

    Parameters:
        df: bedtime questionnaire dataframe; rows that need the Oura lookup also
            need their 'day' and 'Timestamp_bedtime' columns.
        column_name: name of the column with the clock strings.

    Returns:
        The same dataframe object.
    """
    for index, row in df.iterrows():
        raw_time = row[column_name]
        # Nothing to convert: no time entered, or the value was converted already
        if pd.isna(raw_time) or not isinstance(raw_time, str):
            continue

        # Split the 12-hour time from AM/PM
        parts = raw_time.split(" ")
        meridiem = parts[-1]
        # [hour, minutes, seconds] of the activity time (plain float: np.float_ no longer exists in NumPy 2)
        clock = [float(part) for part in parts[0].split(":")]
        hour, minute = clock[0], clock[1]

        if meridiem == 'PM' and hour != 12:
            # Convert to a 24-hour time
            hour += 12
        else:
            if meridiem == 'AM' and hour == 12:
                # 12:xx AM is the first hour of the day, not noon
                hour = 0
            # The Oura data is only needed on this branch, so it is only looked up here
            oura_labels = complete_sleep_df[complete_sleep_df['day'] == row['day']].index.values
            if len(oura_labels) > 0:
                oura_night = complete_sleep_df.loc[oura_labels[-1]]
                bedtime_start = oura_night['bedtime_start']
                bedtime_end = oura_night['bedtime_end']
                filled_in_at = datetime.strptime(row['Timestamp_bedtime'], '%m/%d/%Y %H:%M:%S')
                # The activity is assumed to have happened on the date the questionnaire was filled in
                activity_time = datetime(filled_in_at.year, filled_in_at.month, filled_in_at.day,
                                         hour=int(hour), minute=int(minute))
                activity_time = activity_time.replace(tzinfo=bedtime_start.tzinfo)
                # If the entered time is after midnight but before bedtime, convert it to a more-than 24-hour time so that time logically follows
                if (bedtime_start.day == bedtime_end.day) and (activity_time.day == bedtime_start.day) and bedtime_start > activity_time:
                    hour += 24

        # Convert the time to a numerical value
        df.at[index, column_name] = hour + minute / 60
    return df

# Convert every activity time of the bedtime questionnaire to the continuous scale
for time_column in ('coffee_time', 'alcohol_time', 'workout_time', 'stress_relief_time'):
    truncated_bedtime_df = temporal_conversion(truncated_bedtime_df, time_column)

In [28]:
# Match the bedtime questionnaire data to the correct date. This function thus modifies the column 'day'
def convert_dates_bedtime(df):
    """Adjust, in place, the 'day' of bedtime questionnaire rows that share a day.

    For every row, the Oura night with the same 'day' is looked up in the global
    `complete_sleep_df`; rows without such a night are skipped entirely. For the
    other rows, the questionnaire timestamp is compared with the wake-up time of
    that night and with the bedtime of the following Oura row. Finally, when a
    row has the same 'day' as the previously processed row, that previous row
    (addressed as label `index-1`) is moved one day back.

    Returns the same dataframe object.
    """
    # 'day' of the last row that had a matching Oura night (skipped rows do not update it)
    last_date = None
    # Iterate over all the bedtime questionnaire instances 
    for index, row in df.iterrows():
        utc_date_questionnaire = datetime.strptime(df.iloc[index]['Timestamp_bedtime'], '%m/%d/%Y %H:%M:%S')
        
        # Retrieve the index, in the oura dataframe, of the day for which the questionnaire was filled 
        oura_index = complete_sleep_df[complete_sleep_df['day'] == df.iloc[index]['day']].index.values

        if len(oura_index) == 0:
            # The oura ring has no data recorded for this day. The row will be deleted later in the pre-processing. We can continue with the next questionnaire instance.
            continue
        # When several Oura rows share the day, the last one is used
        oura_index = oura_index[-1]

        utc_date_wakeup = complete_sleep_df.iloc[oura_index]['bedtime_end']
        # The questionnaire timestamp has no timezone; give it the one of the Oura wake-up time so both can be compared
        utc_date_questionnaire = utc_date_questionnaire.replace(tzinfo=utc_date_wakeup.tzinfo) 

        # If we have reached the end of the oura dataset and there are no data for day n+1
        if (len(complete_sleep_df) - 1) == oura_index:
            utc_date_bedtime_nextday = None
        else:
            utc_date_bedtime_nextday = complete_sleep_df.iloc[oura_index+1]['bedtime_start']

        # Add the timezone when the oura ring has data for the night n+1. It is needed for the comparison of dates
        # (pd.isna is also True for None, so the end-of-dataset case is covered by the same test)
        if (not pd.isna(utc_date_bedtime_nextday)):
            utc_date_bedtime_nextday = utc_date_bedtime_nextday.replace(tzinfo=utc_date_wakeup.tzinfo)

        if (not pd.isna(utc_date_bedtime_nextday)) and (utc_date_questionnaire - utc_date_wakeup) < (utc_date_bedtime_nextday - utc_date_questionnaire):
            # The questionnaire for day n is filled on day n+1, but closer to their wake-up time than to the bedtime of day n+1
            # NOTE(review): the Oura row was selected above because its 'day' equals this row's 'day',
            # so this assignment appears to write back the value that is already stored - confirm the intent.
            df.at[index, 'day'] = complete_sleep_df.iloc[oura_index]['day']
        
        # If the questionnaire for day n was filled closer to bedtime of day n+1 than to wake up time of day n
        # (index-1 assumes that the rows are labelled with consecutive integers)
        if (last_date == df.iloc[index]['day']):
            df.at[index-1, 'day'] = last_date + timedelta(days=-1)
        last_date = df.iloc[index]['day']
        
    return df

truncated_bedtime_df = convert_dates_bedtime(truncated_bedtime_df)

In [29]:
# Match the wakeup questionnaire data to the correct date. This function thus modifies the column 'day'
def convert_dates_wakeup(df):
    """Move, in place, a wakeup questionnaire one day back when two share a day.

    When two consecutive rows carry the same 'day' and no row exists for the
    day before, the first of the two was probably filled in a day late: its
    'day' is decreased by one. Rows are expected to be labelled 0..n-1.

    Returns the same dataframe object.
    """
    previous_day = None
    # Walk through the wakeup questionnaire instances in order
    for index, _ in df.iterrows():
        current_day = df['day'].iloc[index]
        # Two wakeup questionnaires were filled on the same day
        if previous_day == current_day:
            day_before = previous_day + timedelta(days=-1)
            # Without a questionnaire for the previous day, the first of the two
            # most likely belongs to that previous day
            if not (df['day'] == day_before).any():
                df.at[index-1, 'day'] = day_before
        previous_day = current_day

    return df

# Re-date the wakeup questionnaires that were filled in a day late
truncated_wakeup_df = convert_dates_wakeup(truncated_wakeup_df)

In [30]:
# Create three datasets that cover the same time period with no date gap in their dataframes
# (the sleep dataframe was completed earlier; the questionnaires are completed here, after their dates were corrected)
complete_wakeup_df = complete_df(truncated_wakeup_df, latest_start, earliest_end)
complete_bedtime_df = complete_df(truncated_bedtime_df, latest_start, earliest_end)

print('complete_sleep_df', complete_sleep_df.shape, '\ncomplete_wakeup_df', complete_wakeup_df.shape, '\ncomplete_bedtime_df', complete_bedtime_df.shape)

missing days are DatetimeIndex(['2022-10-17', '2022-11-22'], dtype='datetime64[ns]', freq=None)
missing days are DatetimeIndex(['2022-10-20', '2022-11-22'], dtype='datetime64[ns]', freq=None)
complete_sleep_df (59, 59) 
complete_wakeup_df (59, 26) 
complete_bedtime_df (59, 18)


In [31]:
# Add a column in which the bedtime is a continuous scale (e.g. 16:00 is represented as 16 and 1:00 is represented as 25)
def make_bedtime_start_continuous(df):
    """Add a 'bed_time' column with the bedtime expressed in fractional hours, in place.

    The value is hour + minute/60 + second/3600 of 'bedtime_start'. When the
    bedtime falls on the same day of the month as 'bedtime_end' (the person went
    to bed after midnight), 24 hours are added so that the time logically
    follows the previous evening. Rows without a bedtime keep NaN.

    Parameters:
        df: Oura dataframe with 'bedtime_start' and 'bedtime_end' datetimes.

    Returns:
        The same dataframe object.
    """
    df['bed_time'] = np.nan
    # The times are read from `df` itself (not from a global dataframe) and written by label
    for index, row in df.iterrows():
        bedtime_start = row['bedtime_start']
        if pd.isna(bedtime_start):
            continue
        bedtime_end = row['bedtime_end']
        # There are 3600 seconds in an hour
        hours = bedtime_start.hour + bedtime_start.minute / 60 + bedtime_start.second / 3600
        # If the bedtime is after midnight, convert it to a more-than 24-hour time so that time logically follows
        if bedtime_start.day == bedtime_end.day:
            hours += 24
        df.at[index, 'bed_time'] = hours
    return df

# Add the continuous 'bed_time' column to the completed Oura dataframe
complete_sleep_df = make_bedtime_start_continuous(complete_sleep_df)

In [32]:
# A night without an Oura sleep score counts as missing: blank its 'day' so that the row
# is recognised as incomplete when the rows with missing data are dropped
complete_sleep_df.loc[complete_sleep_df['score'].isna(), 'day'] = np.nan

In [33]:
# Give the 'day' column of each dataframe its own name so that they cannot be mixed up
complete_sleep_df = complete_sleep_df.rename(columns={'day': 'day_oura'})
complete_wakeup_df = complete_wakeup_df.rename(columns={'day': 'day_wakeup'})
complete_bedtime_df = complete_bedtime_df.rename(columns={'day': 'day_bedtime'})

# Concatenate the 3 dataframes side by side, so that row i of each dataframe ends up on the same row.
# reset_index() keeps the old index of every dataframe as an extra 'index' column.
df = pd.concat(
    [part.reset_index() for part in (complete_sleep_df, complete_wakeup_df, complete_bedtime_df)],
    axis=1,
)

## Pre-process the dataset 

In [34]:
# Identify all the rows with missing data: a row is incomplete when any of the three datasets has no 'day' for it
day_columns = ['day_oura', 'day_wakeup', 'day_bedtime']
is_incomplete = df[day_columns].isna().any(axis=1)
missing_dates = df.loc[is_incomplete, 'actual_day'].tolist()

# Drop the rows with missing data from dataframe and renumber the remaining rows
df = df.drop(index=df.index[is_incomplete]).reset_index()
df

Unnamed: 0,level_0,index,average_breath,average_breath_variation,average_heart_rate,average_hrv,awake_time,bedtime_end,bedtime_start,day_oura,...,Visitors made me go to bed later,alcohol_time,coffee_time,day_bedtime,eat_bedtime,medication,special_circumstances,stress_levels,stress_relief_time,workout_time
0,0,0,12.625,4.625,54.01,58.0,6900.0,2022-09-26 07:42:56+02:00,2022-09-25 21:23:56+02:00,2022-09-26,...,0,,14.0,2022-09-26,2,0,1,2,,
1,1,1,12.25,4.25,53.45,50.0,4710.0,2022-09-27 07:20:09+02:00,2022-09-26 21:08:09+02:00,2022-09-27,...,0,,14.5,2022-09-27,1,0,0,2,,
2,2,2,12.625,4.375,56.42,43.0,8100.0,2022-09-28 07:01:02+02:00,2022-09-27 21:14:02+02:00,2022-09-28,...,0,,14.5,2022-09-28,1,0,0,1,,
3,3,3,12.875,4.5,55.15,42.0,5010.0,2022-09-29 06:46:30+02:00,2022-09-28 21:06:30+02:00,2022-09-29,...,0,,14.25,2022-09-29,2,0,0,1,,
4,4,4,12.375,4.5,57.92,40.0,6510.0,2022-09-30 07:37:41+02:00,2022-09-29 21:10:41+02:00,2022-09-30,...,0,18.0,14.5,2022-09-30,1,0,0,2,,
5,5,5,12.625,4.75,54.23,50.0,5070.0,2022-10-01 08:17:51+02:00,2022-09-30 21:26:51+02:00,2022-10-01,...,0,18.0,13.0,2022-10-01,2,0,0,1,,
6,6,6,12.25,4.75,52.93,56.0,5520.0,2022-10-02 07:37:20+02:00,2022-10-01 22:02:20+02:00,2022-10-02,...,0,17.0,16.0,2022-10-02,2,0,1,2,,
7,7,7,12.25,4.375,59.19,43.0,4530.0,2022-10-03 07:49:35+02:00,2022-10-02 21:09:35+02:00,2022-10-03,...,0,17.0,14.0,2022-10-03,2,0,0,1,,
8,8,8,12.125,4.5,55.72,36.0,5700.0,2022-10-04 07:04:00+02:00,2022-10-03 21:59:00+02:00,2022-10-04,...,0,18.0,15.0,2022-10-04,2,0,0,1,,
9,9,9,12.125,4.25,57.74,36.0,3210.0,2022-10-05 07:40:13+02:00,2022-10-04 22:19:13+02:00,2022-10-05,...,0,,14.0,2022-10-05,2,0,1,1,,


In [35]:
# List every column name, to decide visually which columns can be dropped
print(list(df.columns))

['level_0', 'index', 'average_breath', 'average_breath_variation', 'average_heart_rate', 'average_hrv', 'awake_time', 'bedtime_end', 'bedtime_start', 'day_oura', 'deep_sleep_duration', 'efficiency', 'got_ups', 'latency', 'light_sleep_duration', 'lowest_heart_rate', 'lowest_heart_rate_time_offset', 'period', 'rem_sleep_duration', 'restless_periods', 'score', 'segment_state', 'sleep_midpoint', 'time_in_bed', 'total_sleep_duration', 'type', 'wake_ups', 'sleep_phase_5_min', 'restless', 'timezone', 'bedtime_start_delta', 'bedtime_end_delta', 'midpoint_at_delta', 'contributors.total_sleep', 'contributors.deep_sleep', 'contributors.rem_sleep', 'contributors.efficiency', 'contributors.latency', 'contributors.restfulness', 'contributors.timing', 'heart_rate.interval', 'heart_rate.items', 'heart_rate.timestamp', 'hrv.interval', 'hrv.items', 'hrv.timestamp', 'readiness.contributors.activity_balance', 'readiness.contributors.body_temperature', 'readiness.contributors.hrv_balance', 'readiness.contr

In [36]:
# Drop irrelevant columns. These were determined by looking at the above list of columns.
irrelevant_columns = [
    'level_0', 'index', 0,
    'bedtime_end', 'bedtime_start',
    'day_oura', 'day_bedtime', 'day_wakeup',
    'None',
    'Timestamp', 'Timestamp_bedtime',
    'heart_rate.timestamp', 'hrv.timestamp', 'midpoint_at_delta',
    'hrv.interval', 'heart_rate.interval',
]
df = df.drop(columns=irrelevant_columns)
df

Unnamed: 0,average_breath,average_breath_variation,average_heart_rate,average_hrv,awake_time,deep_sleep_duration,efficiency,got_ups,latency,light_sleep_duration,...,Stressful,Visitors made me go to bed later,alcohol_time,coffee_time,eat_bedtime,medication,special_circumstances,stress_levels,stress_relief_time,workout_time
0,12.625,4.625,54.01,58.0,6900.0,6330.0,81.0,0.0,1770.0,15030.0,...,0,0,,14.0,2,0,1,2,,
1,12.25,4.25,53.45,50.0,4710.0,5460.0,87.0,0.0,1530.0,18990.0,...,0,0,,14.5,1,0,0,2,,
2,12.625,4.375,56.42,43.0,8100.0,6120.0,77.0,0.0,1230.0,14700.0,...,0,0,,14.5,1,0,0,1,,
3,12.875,4.5,55.15,42.0,5010.0,4710.0,86.0,0.0,930.0,17640.0,...,0,0,,14.25,2,0,0,1,,
4,12.375,4.5,57.92,40.0,6510.0,4680.0,83.0,0.0,1380.0,19560.0,...,0,0,18.0,14.5,1,0,0,2,,
5,12.625,4.75,54.23,50.0,5070.0,6480.0,87.0,1.0,180.0,19650.0,...,0,0,18.0,13.0,2,0,0,1,,
6,12.25,4.75,52.93,56.0,5520.0,5940.0,84.0,0.0,720.0,15870.0,...,0,0,17.0,16.0,2,0,1,2,,
7,12.25,4.375,59.19,43.0,4530.0,8310.0,88.0,0.0,1620.0,15180.0,...,0,0,17.0,14.0,2,0,0,1,,
8,12.125,4.5,55.72,36.0,5700.0,2970.0,83.0,0.0,570.0,15720.0,...,0,0,18.0,15.0,2,0,0,1,,
9,12.125,4.25,57.74,36.0,3210.0,6030.0,90.0,0.0,660.0,15840.0,...,0,0,,14.0,2,0,1,1,,


In [37]:
# These columns are removed to try and fix the issues raised in model.ipynb
sequence_columns = ['hrv.items', 'heart_rate.items', 'sleep_phase_5_min', 'movement_30_sec']
df = df.drop(columns=sequence_columns)
df

Unnamed: 0,average_breath,average_breath_variation,average_heart_rate,average_hrv,awake_time,deep_sleep_duration,efficiency,got_ups,latency,light_sleep_duration,...,Stressful,Visitors made me go to bed later,alcohol_time,coffee_time,eat_bedtime,medication,special_circumstances,stress_levels,stress_relief_time,workout_time
0,12.625,4.625,54.01,58.0,6900.0,6330.0,81.0,0.0,1770.0,15030.0,...,0,0,,14.0,2,0,1,2,,
1,12.25,4.25,53.45,50.0,4710.0,5460.0,87.0,0.0,1530.0,18990.0,...,0,0,,14.5,1,0,0,2,,
2,12.625,4.375,56.42,43.0,8100.0,6120.0,77.0,0.0,1230.0,14700.0,...,0,0,,14.5,1,0,0,1,,
3,12.875,4.5,55.15,42.0,5010.0,4710.0,86.0,0.0,930.0,17640.0,...,0,0,,14.25,2,0,0,1,,
4,12.375,4.5,57.92,40.0,6510.0,4680.0,83.0,0.0,1380.0,19560.0,...,0,0,18.0,14.5,1,0,0,2,,
5,12.625,4.75,54.23,50.0,5070.0,6480.0,87.0,1.0,180.0,19650.0,...,0,0,18.0,13.0,2,0,0,1,,
6,12.25,4.75,52.93,56.0,5520.0,5940.0,84.0,0.0,720.0,15870.0,...,0,0,17.0,16.0,2,0,1,2,,
7,12.25,4.375,59.19,43.0,4530.0,8310.0,88.0,0.0,1620.0,15180.0,...,0,0,17.0,14.0,2,0,0,1,,
8,12.125,4.5,55.72,36.0,5700.0,2970.0,83.0,0.0,570.0,15720.0,...,0,0,18.0,15.0,2,0,0,1,,
9,12.125,4.25,57.74,36.0,3210.0,6030.0,90.0,0.0,660.0,15840.0,...,0,0,,14.0,2,0,1,1,,


In [38]:
# Export the pre-processed dataset of this participant.
# NOTE(review): reset_index() turns the current row index into a regular 'index' column, which is
# then written to the csv (index=False only omits the new row index) - confirm that the readers of
# this file expect that column.
df = df.reset_index()
df.to_csv('data/preprocessed_data_' + str(participant_number) + '.csv', index=False)