## Imports

In [1]:
import pandas as pd
import numpy as np
import json
from datetime import *
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer

## Define the files to be loaded

In [2]:
# Participant number whose data should be loaded.
# It is interpolated into the Oura json file name and the questionnaire csv file names below.
participant_number = 1

# The version of the questionnaires to load.
# The column-renaming cells treat 1 as its own case and every other value as version 2;
# the cells guarded by `quest_version == 2` only run for exactly 2.
quest_version = 2

## Loading the data

In [3]:
# Read the participant's Oura export (json file available on https://cloud.ouraring.com/profile)
# NOTE(review): this is an absolute, machine-specific path while the questionnaires use relative paths
oura_json_path = f'/Users/user/Desktop/Msc AI RU/Internship/Code/data/oura_json/oura_{participant_number}.json'
with open(oura_json_path) as f:
    data = json.load(f)

# Expand the nested 'sleep' records into flat columns, at most two levels deep
sleep_df = pd.json_normalize(data['sleep'], max_level=2)
print('sleep_df', sleep_df.shape)

sleep_df (260, 56)


In [4]:
# Raw wake-up questionnaire answers (Google Form csv export)
wakeup_df = pd.read_csv(f'data/questionnaire_data/Wakeup_{participant_number}_v{quest_version}.csv')
print('wakeup_df', wakeup_df.shape)

# Raw bedtime questionnaire answers (Google Form csv export)
bedtime_df = pd.read_csv(f'data/questionnaire_data/Bedtime_{participant_number}_v{quest_version}.csv')
# Both exports carry a 'Timestamp' column; give the bedtime one its own name so the two stay distinguishable
bedtime_df = bedtime_df.rename(columns={'Timestamp': 'Timestamp_bedtime'})
print('bedtime_df', bedtime_df.shape)

wakeup_df (101, 10)
bedtime_df (99, 19)


## Initial pre-processing of datasets separately

In [5]:
# Used for visual inspection of the Oura ring dataset in order to determine the units of each variable.
# Prints every column of the record at row position 1.
print(sleep_df.iloc[1,:])

bedtime_end                                                             2022-06-10T06:55:22+02:00
bedtime_start                                                           2022-06-09T22:46:22+02:00
day                                                                                    2022-06-10
period                                                                                          0
segment_state                                                                              active
time_in_bed                                                                                 29340
type                                                                                   long_sleep
restless                                                                                      6.0
timezone                                                                                      120
bedtime_start_delta                                                                         -4418
bedtime_end_delta   

In [6]:
# Used for visual inspection of the wakeup dataset in order to decide on its pre-processing.
# Prints every column of the record at row position 1.
print(wakeup_df.iloc[1,:])

Timestamp                                                                                                                          11/21/2022 13:13:49
How refreshed did you wake up after your night of sleep?                                                                                             7
Is there anything important about or related to your night of sleep that might have affected your sleep quality? If yes, what?                     NaN
Do you recall being awake for more than 10 minutes during the night?                                                                                No
If you answered 'Yes' to the previous question, what did you do during this time?                                                                  NaN
Did anything in particular prevent you from sleeping earlier last night?                                                                            No
If you answered ‘Yes’ to the previous question, what prevented you from sleeping earlier?     

In [7]:
# Used for visual inspection of the bedtime dataset in order to decide on its pre-processing.
# Prints every column of the record at row position 1.
print(bedtime_df.iloc[1,:])

Timestamp_bedtime                                                                                           11/21/2022 22:00:33
Did you deliberately change your behavior today in an attempt to see an effect on your sleep quality?                        No
If you answered 'Yes' to the previous question, what behavior change did you do?                                            NaN
How many cups of coffee did you drink today?                                                                                  0
If you had coffee today, at what time did you take your last cup of coffee?                                                 NaN
How many alcoholic drinks did you have today?                                                                                 0
If you had alcohol today, at what time did you take your last drink?                                                        NaN
If you did a workout today, how intense was the workout?                                                

In [8]:
# Convert the ISO 8601 notation of some variables in the oura ring data to seconds
def convert_iso_seconds(df, column_name, new_column_name):
    """Parse an ISO 8601 timestamp column and add its time of day in seconds.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding `column_name`. It is modified in place and also returned.
    column_name : str
        Column of strings such as '2022-06-10T06:55:22+02:00'. Every value is
        replaced by the parsed, timezone-aware `datetime`.
    new_column_name : str
        Column that receives the seconds elapsed since midnight (same UTC
        offset) of the parsed timestamp, as a float.

    Returns
    -------
    pandas.DataFrame
        The same `df` object.

    Raises
    ------
    ValueError
        If a value does not match the '%Y-%m-%dT%H:%M:%S%z' format.
    """
    # Make sure the column can hold datetime objects next to the not-yet-parsed strings
    df[column_name] = df[column_name].astype(object)
    df[new_column_name] = np.nan
    # Read from `df` itself (not from a notebook-level frame) and address rows by
    # label, so the function works for any frame and any index.
    for index in df.index:
        parsed = datetime.strptime(df.at[index, column_name], '%Y-%m-%dT%H:%M:%S%z')
        midnight = parsed.replace(hour=0, minute=0, second=0, microsecond=0)
        df.at[index, column_name] = parsed
        df.at[index, new_column_name] = (parsed - midnight).total_seconds()
    return df

# Parse both bedtime columns of the Oura data and add their time of day in seconds as new columns
sleep_df = convert_iso_seconds(sleep_df, 'bedtime_end', 'bedtime_end_seconds')
sleep_df = convert_iso_seconds(sleep_df, 'bedtime_start', 'bedtime_start_seconds')

In [9]:
# Change how the date of the oura data is stored
def convert_sleep_date(df):
    """Replace the 'YYYY-MM-DD' strings of the 'day' column by `datetime.date` objects.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'day' column of dash-separated year-month-day strings.
        It is modified in place and also returned.

    Returns
    -------
    pandas.DataFrame
        The frame that was passed in (previously a notebook-level frame was
        returned regardless of the argument).
    """
    def _to_date(day_string):
        year, month, day = (int(part) for part in day_string.split('-'))
        return date(year, month, day)

    # Object dtype keeps plain `date` objects instead of letting pandas infer another type
    df['day'] = pd.Series([_to_date(day_string) for day_string in df['day']], index=df.index, dtype=object)
    return df

# Store the 'day' of every Oura record as a date object instead of a string
sleep_df = convert_sleep_date(sleep_df)

In [10]:
# Rename the column names of the bedtime questionnaire to make it easier to pre-process.
# Questions whose wording is used by both questionnaire versions (the two workout phrasings map to the same name):
bedtime_renames = {
    'If you had coffee today, at what time did you take your last cup of coffee?': 'coffee_time',
    'If you did a workout today, at what time did you do your last workout?': 'workout_time',
    'If you did a workout today, at what time did your last workout end?': 'workout_time',
    'Did you take medication that might affect your sleep?': 'medication',
    'How much did you eat in the last 3 hours before going to bed?': 'eat_bedtime',
    'How would you characterize the activities you did in the last 3 hours before going to bed?': 'activities_bedtime',
    'Is there any particular reason why you think you might or might not sleep well tonight?': 'special_circumstances',
}

if quest_version == 1:
    bedtime_renames.update({
        'If you had alcohol today, at what time did you have your last drink?': 'alcohol_time',
        'If you did a stress-relieving activity today (meditation, yoga, etc), at what time did you do your last one?': 'stress_relief_time',
        'How much stress and/or anxiety do you feel now?': 'stress_levels',
    })
else:
    bedtime_renames.update({
        # Questions that were phrased differently
        'If you had alcohol today, at what time did you take your last drink?': 'alcohol_time',
        'If you did a stress-relieving activity today (meditation, yoga, etc), at what time did the last one end?': 'stress_relief_time',
        'How much stress do you currently feel?': 'stress_levels',
        # Newly added questions for questionnaire v2
        'Did you deliberately change your behavior today in an attempt to see an effect on your sleep quality?': 'intervention',
        'How many cups of coffee did you drink today?': 'coffee_amount',
        'How many alcoholic drinks did you have today?': 'alcohol_amount',
        'If you did a workout today, how intense was the workout?': 'workout_intensity',
        'If you took medication that might affect your sleep, how long before bedtime did you take it?': 'medication_time',
        'If you took medication that might affect your sleep, how much medication did you take?': 'medication_dosage',
    })

# Questions that are not present in the loaded export are simply left out of the renaming
bedtime_df = bedtime_df.rename(columns=bedtime_renames)

if quest_version != 1:
    # These free-text questions are dropped for now because they require manual processing or NLP
    bedtime_df = bedtime_df.drop(columns=[
        "Why do you currently feel stressed?",
        "If you answered 'Yes' to the previous question, what behavior change did you do?",
        "If you answered ‘Yes’ to the previous question, please indicate the taken medication.",
    ])


In [11]:
# Rename the column names of the wakeup questionnaire to make it easier to pre-process.
# Questions whose wording is used by both questionnaire versions:
wakeup_renames = {
    'Do you recall being awake for more than 10 minutes during the night?': 'recall_awake',
    "If you answered 'Yes' to the previous question, what did you do during this time?": 'awake_activities',
    'Did anything in particular prevent you from sleeping earlier last night?': 'prevent_early_sleep',
    'How did you wake up?': 'method_woken_up',
    'How many minutes before bedtime did you stop exposing yourself to blue light? (bright phone, TV, etc)': 'blue_light',
}

# Questions whose wording depends on the questionnaire version
if quest_version == 1:
    wakeup_renames.update({
        'Did you wake up refreshed after your night of sleep?': 'subjective_sleep_score',
        'Did anything in particular prevent you from sleeping better?': 'prevent_better_sleep',
        'Is there anything important about or related to your night of sleep that was not covered in any of the questions in this questionnaire? If yes, what?': 'special_circumstances_wakeup',
    })
else:
    wakeup_renames.update({
        'How refreshed did you wake up after your night of sleep?': 'subjective_sleep_score',
        'Did any external factors prevent you from sleeping better?': 'prevent_better_sleep',
        'Is there anything important about or related to your night of sleep that might have affected your sleep quality? If yes, what?': 'special_circumstances_wakeup',
    })

# Questions that are not present in the loaded export are simply left out of the renaming
wakeup_df = wakeup_df.rename(columns=wakeup_renames)

if quest_version != 1:
    # This free-text question is dropped for now because it requires manual processing or NLP
    wakeup_df = wakeup_df.drop(columns=['If you answered ‘Yes’ to the previous question, what prevented you from sleeping earlier?'])

In [12]:
# Sanity check: list the column names of both questionnaires after renaming and dropping
print(bedtime_df.columns.to_list())
print(wakeup_df.columns.to_list())

['Timestamp_bedtime', 'intervention', 'coffee_amount', 'coffee_time', 'alcohol_amount', 'alcohol_time', 'workout_intensity', 'workout_time', 'stress_relief_time', 'activities_bedtime', 'eat_bedtime', 'stress_levels', 'medication', 'medication_time', 'medication_dosage', 'special_circumstances']
['Timestamp', 'subjective_sleep_score', 'special_circumstances_wakeup', 'recall_awake', 'awake_activities', 'prevent_early_sleep', 'prevent_better_sleep', 'method_woken_up', 'blue_light']


In [13]:
# The sleep score of the Oura ring is on a scale from 0 to 100, but the subjective sleep score is between 0 and 10. 
# Divide the Oura sleep scores by 10 for an easier comparison
# NOTE(review): the division happens in place, so running this cell a second time divides the scores by 10 again
sleep_df.loc[:,'score'] /= 10

In [14]:
# Map the 'Yes/No' questions in the questionnaire to '1/0'
def yes_no_questions(df, column_name):
    """Encode a Yes/No answer column as 1/0.

    Parameters
    ----------
    df : pandas.DataFrame
        Questionnaire frame. It is modified in place and also returned.
    column_name : str
        Column to encode. The answer 'No' becomes 0; every other value,
        including a missing answer, becomes 1.

    Returns
    -------
    pandas.DataFrame
        The same `df` object.
    """
    # Build the whole column at once: works for any index (the row-wise version
    # looked rows up by position but wrote them by label) and avoids per-row frame access.
    df[column_name] = pd.Series(
        [0 if answer == 'No' else 1 for answer in df[column_name]],
        index=df.index,
        dtype=object,
    )
    return df

# Encode the Yes/No questions of both questionnaires as 1/0
bedtime_df = yes_no_questions(bedtime_df, 'medication')
wakeup_df = yes_no_questions(wakeup_df, 'recall_awake')
wakeup_df = yes_no_questions(wakeup_df, 'prevent_early_sleep')
# The 'intervention' column is only created by the version-2 renaming of the bedtime questionnaire
if (quest_version == 2):
    bedtime_df = yes_no_questions(bedtime_df, 'intervention')

In [15]:
# Map the special circumstances questions to 1 if there is a special circumstance, and 0 otherwise
def special_circumstances(df, column_name):
    """Encode a free-text column as a flag: 0 when empty, 1 when something was entered.

    Parameters
    ----------
    df : pandas.DataFrame
        Questionnaire frame. It is modified in place and also returned.
    column_name : str
        Column to encode. Missing values (as judged by `pd.isna`) become 0,
        any other value becomes 1.

    Returns
    -------
    pandas.DataFrame
        The same `df` object.
    """
    # Build the whole column at once: works for any index (the row-wise version
    # looked rows up by position but wrote them by label) and avoids per-row frame access.
    df[column_name] = pd.Series(
        [0 if pd.isna(answer) else 1 for answer in df[column_name]],
        index=df.index,
        dtype=object,
    )
    return df

# Turn the free-text "special circumstances" answers of both questionnaires into 0/1 flags
bedtime_df = special_circumstances(bedtime_df, 'special_circumstances')
wakeup_df = special_circumstances(wakeup_df, 'special_circumstances_wakeup')

In [16]:
# Perform the integer encoding of an ordered "minutes" answer column (e.g. 'blue_light')
def integer_encoding(df, column_name):
    """Integer-encode a column of ordered duration categories.

    The known answers are first renamed to letters whose alphabetical order
    matches their natural order, because the label encoder numbers its classes
    in sorted order. Values that are not one of the known answers are passed to
    the encoder unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Questionnaire frame. It is modified in place and also returned.
    column_name : str
        Column holding the answers.

    Returns
    -------
    pandas.DataFrame
        The same `df` object, with `column_name` replaced by integer codes.

    Notes
    -----
    NOTE(review): the codes are assigned over the labels that actually occur in
    the column, so the same answer may get a different code in another data set
    when some categories are absent - confirm this is acceptable.
    """
    # 'More than 45 minutes' and '45 to 60 minutes' deliberately share the letter 'D', as before
    ordered_labels = {
        'Less than 15 minutes': 'A',
        '15 to 30 minutes': 'B',
        '30 to 45 minutes': 'C',
        'More than 45 minutes': 'D',
        '45 to 60 minutes': 'D',
        'More than 60 minutes': 'E',
    }
    # Label-based, whole-column replacement (the row loop used positions as labels,
    # which only worked for a default 0..n-1 index)
    df[column_name] = df[column_name].map(lambda answer: ordered_labels.get(answer, answer))
    df[column_name] = LabelEncoder().fit_transform(df[column_name])
    return df

# Integer-encode the ordered duration answers
wakeup_df = integer_encoding(wakeup_df, 'blue_light')
# The 'medication_time' column is only created by the version-2 renaming of the bedtime questionnaire
if (quest_version == 2):
    bedtime_df = integer_encoding(bedtime_df, 'medication_time')

In [17]:
# One-hot encoding a variable in a given dataframe
def onehot_encoding(df, column_name):
    """Return a copy of `df` in which `column_name` is replaced by indicator columns.

    One indicator column is added per distinct value of `column_name`, named
    after that value; the original column is removed from the result. The
    frame that was passed in is left unchanged.
    """
    indicators = pd.get_dummies(df[column_name])
    return df.join(indicators).drop(columns=[column_name])

# Replace the single-choice 'method_woken_up' answer by one indicator column per answer
wakeup_df = onehot_encoding(wakeup_df, 'method_woken_up')

In [18]:
# One-hot encoding for a variable that can have multiple answers selected in a given dataframe
def multi_label_onehot_encoder(df, column_name):
    """Replace a multiple-choice column by one indicator column per answer.

    Parameters
    ----------
    df : pandas.DataFrame
        Questionnaire frame. `column_name` is removed from it in place.
    column_name : str
        Column whose cells hold the selected answers as one string separated
        by ", ". A missing cell counts as the single pseudo-answer 'None'.

    Returns
    -------
    pandas.DataFrame
        `df` without `column_name`, joined with the indicator columns produced
        by `MultiLabelBinarizer` (one per distinct answer).
    """
    def _selected_answers(cell):
        if pd.isna(cell):
            return ['None']
        answers = cell.split(", ")
        if column_name == 'awake_activities':
            # One 'awake_activities' answer contains ", " itself, so splitting cuts it in
            # two; discard the trailing fragment. Filtering into a new list avoids
            # removing items from a list while iterating over it.
            answers = [answer for answer in answers if answer != 'laptop...)']
        return answers

    # Iterate over the values directly, so the result does not depend on the kind of index
    selections = [_selected_answers(cell) for cell in df.pop(column_name)]

    # Multi-label one-hot encoding
    mlb = MultiLabelBinarizer()
    encoded = pd.DataFrame(mlb.fit_transform(selections), columns=mlb.classes_, index=df.index)
    return df.join(encoded)

# One-hot encode the multiple-choice questions of the wakeup questionnaire
wakeup_df = multi_label_onehot_encoder(wakeup_df, 'awake_activities')
wakeup_df = multi_label_onehot_encoder(wakeup_df, 'prevent_better_sleep')

# One-hot encode the multiple-choice question of the bedtime questionnaire
bedtime_df = multi_label_onehot_encoder(bedtime_df, 'activities_bedtime')

In [19]:
# Remove the unit in an answer such as "1 mg"
def remove_unit(df, column_name):
    """Convert answers such as "1 mg" or "0,5mg" to the float in front of "mg".

    Parameters
    ----------
    df : pandas.DataFrame
        Questionnaire frame. It is modified in place and also returned.
    column_name : str
        Column holding the dosage answers. Cells that are not strings (missing
        or already numeric) and strings without "mg" are left untouched. A
        decimal comma is accepted.

    Returns
    -------
    pandas.DataFrame
        The same `df` object.

    Raises
    ------
    ValueError
        If the text in front of "mg" is not a number.
    """
    def _strip_unit(answer):
        # Missing answers and values that are already numeric have nothing to strip
        if not isinstance(answer, str):
            return answer
        value, unit, _ = answer.partition("mg")
        if not unit:
            return answer
        return float(value.replace(",", "."))

    # Rebuild the column by value, so the result does not depend on the kind of index
    df[column_name] = pd.Series([_strip_unit(answer) for answer in df[column_name]], index=df.index, dtype=object)
    return df

# The 'medication_dosage' column is only created by the version-2 renaming of the bedtime questionnaire
if (quest_version == 2):
    bedtime_df = remove_unit(bedtime_df, 'medication_dosage')

In [20]:
# Only consider the date of the questionnaire datasets and ignore the exact time
def remove_time(string_date):
    """Return the part of `string_date` in front of its first space."""
    date_part, _, _ = string_date.partition(" ")
    return date_part

# Change how the date of the questionnaire data is stored
def convert_questionnaire_date(quest_date):
    """Turn a 'month/day/year hh:mm:ss' questionnaire timestamp into a `date`."""
    month, day, year = map(int, remove_time(quest_date).split('/'))
    return date(year, month, day)

In [21]:
# Add a 'day' column in the questionnaire dataframes to compare with oura data
def add_day_col(df, column_name):
    """Add a 'day' column holding the calendar date of each questionnaire timestamp.

    Parameters
    ----------
    df : pandas.DataFrame
        Questionnaire frame. It is modified in place and also returned.
    column_name : str
        Column with the timestamp strings understood by
        `convert_questionnaire_date`.

    Returns
    -------
    pandas.DataFrame
        The same `df` object with the extra (or overwritten) 'day' column.
    """
    # Create the column directly with object dtype: writing date objects one by one into a
    # float NaN column is an incompatible-dtype assignment that newer pandas versions reject.
    df['day'] = pd.Series(
        [convert_questionnaire_date(timestamp) for timestamp in df[column_name]],
        index=df.index,
        dtype=object,
    )
    return df

# Derive the 'day' of every questionnaire entry from the timestamp at which it was submitted
wakeup_df = add_day_col(wakeup_df, 'Timestamp')
bedtime_df = add_day_col(bedtime_df, 'Timestamp_bedtime')

In [22]:
# Bedtime data entered on day n should be matched to the wakeup and sleep data of day n+1 if the questionnaire data is entered before midnight
for index, row in bedtime_df.iterrows():
    filled_at = datetime.strptime(row['Timestamp_bedtime'], '%m/%d/%Y %H:%M:%S')
    # 16:00 is arbitrarily chosen as a delimiter: late enough that the person should have woken up
    # and early enough that they did not go to sleep yet.
    # This assumes a schedule in which people sleep at night and are awake during the day.
    cutoff = filled_at.replace(hour=16, minute=0, second=0, microsecond=0)

    # Filled in after 16:00, i.e. in the evening before midnight: the night it describes ends on the next calendar day
    if filled_at > cutoff:
        bedtime_df.at[index, 'day'] = row['day'] + timedelta(days=1)
    # Otherwise (filled in between midnight and 16:00) the 'day' column already holds the date of that night's wake-up

## Combining the three dataframes

In [23]:
# Find the start and end dates of a dataframe
def find_start_end_dates(df):
    """Return the 'day' values of the first and of the last row of `df`, in that order.

    No sorting is done: the frame is taken to be in chronological order.
    """
    days = df['day']
    return days.iloc[0], days.iloc[-1]

# First and last recorded day of each of the three datasets
sleep_start_date, sleep_end_date = find_start_end_dates(sleep_df)
wakeup_start_date, wakeup_end_date = find_start_end_dates(wakeup_df)
bedtime_start_date, bedtime_end_date = find_start_end_dates(bedtime_df)

# Compare two dates and only return the earliest or the latest, depending on what is required
def compare_two_dates(date1, date2, earliest):
    """Return the earlier of the two dates when `earliest` is truthy, otherwise the later one.

    When both dates are equal, `date1` is returned.
    """
    same_day = date1 == date2
    first_is_later = (not same_day) and date1 > date2
    if earliest:
        return date2 if first_is_later else date1
    return date1 if (same_day or first_is_later) else date2

# Compare three dates and only return the earliest or the latest, depending on what is required
def compare_three_dates(sleep_date, wakeup_date, bedtime_date, earliest):
    """Return the earliest (if `earliest` is truthy) or the latest of the three dates."""
    # Settle the sleep/wakeup pair first, then let the bedtime date challenge the winner
    pair_winner = compare_two_dates(sleep_date, wakeup_date, earliest)
    if earliest:
        return bedtime_date if pair_winner > bedtime_date else pair_winner
    return bedtime_date if pair_winner < bedtime_date else pair_winner

# Determine the time range for which all three data sets simultaneously have data:
# it starts at the latest of the three start dates and ends at the earliest of the three end dates
latest_start = compare_three_dates(sleep_start_date, wakeup_start_date, bedtime_start_date, earliest = False)
earliest_end = compare_three_dates(sleep_end_date, wakeup_end_date, bedtime_end_date, earliest = True)

print('latest start date', latest_start, "\nearliest end date", earliest_end)


latest start date 2022-11-21 
earliest end date 2023-03-01


In [24]:
# Cut the data sets so that they only cover the period [latest_start, earliest_end]
def cut_df(df, start, end):
    """Return the rows of `df` whose 'day' lies in the period [start, end].

    Rows are visited in their stored order. Rows before `start` are skipped and
    the scan stops at the first row after `end`, or right after the first row
    whose 'day' equals `end` (so a second entry on the end date is not kept).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'day' column comparable to `start` and `end`.
    start, end : datetime.date
        Inclusive bounds of the period.

    Returns
    -------
    pandas.DataFrame
        New frame with a fresh 0..n-1 index holding the kept rows, or an empty
        `DataFrame()` when no row falls in the period.
    """
    kept_rows = []
    for _, row in df.iterrows():
        sleep_date = row['day']
        # Only add the data starting at `start`
        if sleep_date < start:
            continue

        # End now if `end` itself is missing from the dataset
        if sleep_date > end:
            break

        kept_rows.append(row.to_frame().transpose())

        # End if `end` is reached
        if sleep_date == end:
            break

    if not kept_rows:
        return pd.DataFrame()
    # Collect first and concatenate once: DataFrame.append no longer exists in pandas 2.x
    # and growing a frame row by row is quadratic.
    return pd.concat(kept_rows, ignore_index=True)

# Restrict all three datasets to the period in which they overlap
truncated_sleep_df = cut_df(sleep_df, latest_start, earliest_end)
truncated_wakeup_df = cut_df(wakeup_df, latest_start, earliest_end)
truncated_bedtime_df = cut_df(bedtime_df, latest_start, earliest_end)

print('truncated_sleep_df', truncated_sleep_df.shape, '\ntruncated_wakeup_df', truncated_wakeup_df.shape, '\ntruncated_bedtime_df', truncated_bedtime_df.shape)

truncated_sleep_df (100, 58) 
truncated_wakeup_df (100, 16) 
truncated_bedtime_df (99, 21)


In [25]:
# Determine the days with missing data
def find_missing_days(df, start_date, end_date, column_name):
    """Return the days between `start_date` and `end_date` (inclusive) that do not occur in `df[column_name]`.

    The result is a `DatetimeIndex`; `df` itself is not modified.
    """
    # Days that are present, converted to pandas timestamps so they compare with the date range
    recorded_days = pd.to_datetime(pd.Index(df[column_name]))
    # Every calendar day of the requested period
    expected_days = pd.date_range(start_date, end_date)
    return expected_days.difference(recorded_days)

In [26]:
# Create a dataframe that goes from latest_start to earliest_end with no jumps in the dates, except that days without data are empty rows.
def complete_df(df, latest_start, earliest_end):
    """Return a copy of `df` with one filler row inserted for every missing day.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'day' column of `datetime.date` objects, in chronological order.
    latest_start, earliest_end : datetime.date
        Inclusive bounds of the period that has to be covered.

    Returns
    -------
    pandas.DataFrame
        New frame with a fresh 0..n-1 index. When two consecutive rows share a
        'day' later than `latest_start`, only the last one is kept.

    Notes
    -----
    A filler row is produced by concatenating `pd.DataFrame([None])`, which also
    introduces a column labelled 0. NOTE(review): this is kept as in the
    original code because later cells may rely on that column - confirm.
    """
    missing_days = find_missing_days(df, latest_start, earliest_end, 'day')
    no_jumps_df = pd.DataFrame()
    counter = 0
    # Day of the previously handled row. Initialised so that the duplicate check below can
    # also run when the very first row lies after latest_start (it used to be unbound then).
    latest_date = None

    def _with_rows(frame, rows):
        # pd.concat replaces DataFrame.append, which no longer exists in pandas 2.x
        return pd.concat([frame, rows], ignore_index=True)

    for index, row in df.iterrows():
        sleep_date = row['day']

        # Prevents adding data twice for a single day: drop the row stored earlier for that day
        if sleep_date > latest_start and sleep_date == latest_date:
            no_jumps_df = no_jumps_df.drop(no_jumps_df[no_jumps_df['day'] == latest_date].index.values[0])
        latest_date = sleep_date

        # If data is missing before the considered date, add an empty row per missing day.
        # The missing days are pandas Timestamps; compare on plain dates, since pandas 2.x
        # refuses to order a Timestamp against a datetime.date.
        while counter < len(missing_days) and sleep_date > missing_days[counter].date():
            no_jumps_df = _with_rows(no_jumps_df, pd.DataFrame([None]))
            counter += 1

        no_jumps_df = _with_rows(no_jumps_df, pd.DataFrame(row).transpose())

    # Missing days after the last available row were never reached by the loop above;
    # add them so that the frame really runs up to earliest_end.
    while counter < len(missing_days):
        no_jumps_df = _with_rows(no_jumps_df, pd.DataFrame([None]))
        counter += 1

    return no_jumps_df

In [27]:
# Fill the missing days of the sleep df with empty rows, so that it has one row per day of the overlap period
complete_sleep_df = complete_df(truncated_sleep_df, latest_start, earliest_end)

In [28]:
# Create a list of all the dates between latest_start and earliest_end (both included)
number_of_days = (earliest_end - latest_start).days + 1
all_dates = [latest_start + timedelta(days=offset) for offset in range(number_of_days)]
# Requires complete_sleep_df to have exactly one row per listed date
complete_sleep_df['actual_day'] = all_dates

In [29]:
# Convert the way in which time is stored in the questionnaire to a continuous scale
def temporal_conversion(df, column_name):
    """Convert 12-hour clock answers ('hh:mm:ss AM/PM') to hours on a continuous scale.

    Afternoon and evening times become 13..23.x. A time after midnight that lies
    before the recorded bedtime of the matching Oura night is expressed as 24.x
    and above, so that it sorts after the evening times of the same day.

    Parameters
    ----------
    df : pandas.DataFrame
        Bedtime questionnaire frame with the columns `column_name`, 'day' and
        'Timestamp_bedtime'. It is modified in place and also returned.
    column_name : str
        Column with the time answers. Cells that are not strings (missing or
        already converted) are left untouched, so the cell can be re-run.

    Returns
    -------
    pandas.DataFrame
        The same `df` object.

    Notes
    -----
    Reads the notebook-level `complete_sleep_df` to look up the bedtime and
    wake-up time of the night that belongs to each questionnaire entry.
    """
    for index in df.index:
        answer = df.at[index, column_name]
        # If there is no time entered (or it has been converted before), skip the row
        if not isinstance(answer, str):
            continue

        # Split the 12-hour time from AM/PM
        split_string = answer.split(" ")
        meridiem = split_string[-1]
        # [hour, minutes, seconds] of the activity as plain floats (np.float_ was removed in NumPy 2.0)
        h_m_s = [float(part) for part in split_string[0].split(":")]
        matching_nights = complete_sleep_df[complete_sleep_df['day'] == df.at[index, 'day']].index.values

        if meridiem == 'PM' and h_m_s[0] != 12:
            # Convert to a 24-hour time
            h_m_s[0] += 12
        else:
            if meridiem == 'AM' and h_m_s[0] == 12:
                # 12:xx AM is the first hour after midnight, not noon
                h_m_s[0] = 0.0
            if len(matching_nights) > 0:
                oura_label = matching_nights[-1]
                utc_date_bedtime = complete_sleep_df.loc[oura_label, 'bedtime_start']
                utc_date_wakeup = complete_sleep_df.loc[oura_label, 'bedtime_end']
                utc_date_quest = datetime.strptime(df.at[index, 'Timestamp_bedtime'], '%m/%d/%Y %H:%M:%S')
                utc_activity_time = datetime(utc_date_quest.year, utc_date_quest.month, utc_date_quest.day, hour=int(h_m_s[0]), minute=int(h_m_s[1]))
                # The bedtime is timezone-aware; give the activity time the same timezone so both can be compared
                utc_activity_time = utc_activity_time.replace(tzinfo=utc_date_bedtime.tzinfo)
                # If the entered time is after midnight but before bedtime, convert it to a more-than 24-hour time so that time logically follows
                if (utc_date_bedtime.day == utc_date_wakeup.day) and (utc_activity_time.day == utc_date_bedtime.day) and utc_date_bedtime > utc_activity_time:
                    h_m_s[0] += 24

        # Convert the time to a numerical value (seconds are ignored)
        df.at[index, column_name] = h_m_s[0] + h_m_s[1]/60
    return df

# Put every "time of the last ..." answer of the bedtime questionnaire on the continuous hour scale
for time_column in ('coffee_time', 'alcohol_time', 'workout_time', 'stress_relief_time'):
    truncated_bedtime_df = temporal_conversion(truncated_bedtime_df, time_column)

In [30]:
# Match the bedtime questionnaire data to the correct date. This function thus modifies the column 'day'
def convert_dates_bedtime(df):
    """Adjust the 'day' of bedtime questionnaire entries using the Oura sleep times.

    Reads the notebook-level `complete_sleep_df` (columns 'day', 'bedtime_start',
    'bedtime_end') and the columns 'Timestamp_bedtime' and 'day' of `df`.
    `df` is modified in place and also returned.

    Rows are read with `df.iloc[index]` while `index` is the label yielded by
    `iterrows`, and the previous row is addressed as label `index-1`.
    NOTE(review): this presumably relies on `df` having the default 0..n-1 index - confirm.
    """
    # 'day' of the previously processed entry that had Oura data (entries skipped below do not update it)
    last_date = None
    # Iterate over all the bedtime questionnaire instances 
    for index, row in df.iterrows():
        utc_date_questionnaire = datetime.strptime(df.iloc[index]['Timestamp_bedtime'], '%m/%d/%Y %H:%M:%S')
        
        # Retrieve the index, in the oura dataframe, of the day for which the questionnaire was filled 
        oura_index = complete_sleep_df[complete_sleep_df['day'] == df.iloc[index]['day']].index.values

        if len(oura_index) == 0:
            # The oura ring has no data recorded for this day. The row will be deleted later in the pre-processing. We can continue with the next questionnaire instance.
            continue
        # When several Oura rows share this day, use the last one
        oura_index = oura_index[-1]

        utc_date_wakeup = complete_sleep_df.iloc[oura_index]['bedtime_end']
        # The questionnaire timestamp carries no timezone; give it the one of the wake-up time so both can be subtracted
        utc_date_questionnaire = utc_date_questionnaire.replace(tzinfo=utc_date_wakeup.tzinfo) 

        # If we have reached the end of the oura dataset and there are no data for day n+1
        if (len(complete_sleep_df) - 1) == oura_index:
            utc_date_bedtime_nextday = None
        else:
            utc_date_bedtime_nextday = complete_sleep_df.iloc[oura_index+1]['bedtime_start']

        # Add the timezone when the oura ring has data for the night n+1. It is needed for the comparison of dates
        if (not pd.isna(utc_date_bedtime_nextday)):
            utc_date_bedtime_nextday = utc_date_bedtime_nextday.replace(tzinfo=utc_date_wakeup.tzinfo)

        if (not pd.isna(utc_date_bedtime_nextday)) and (utc_date_questionnaire - utc_date_wakeup) < (utc_date_bedtime_nextday - utc_date_questionnaire):
            # The questionnaire for day n is filled on day n+1, but closer to their wake-up time than to the bedtime of day n+1
            # NOTE(review): the Oura row was selected because its 'day' equals this entry's 'day',
            # so this assignment looks like it writes back the value that is already there - confirm the intent
            df.at[index, 'day'] = complete_sleep_df.iloc[oura_index]['day']
        
        # If this entry ends up on the same 'day' as the previous processed entry, move the previous entry one day back
        if (last_date == df.iloc[index]['day']):
            df.at[index-1, 'day'] = last_date + timedelta(days=-1)
        last_date = df.iloc[index]['day']
        
    return df

# Re-assign bedtime questionnaire entries to the day they belong to
truncated_bedtime_df = convert_dates_bedtime(truncated_bedtime_df)

In [31]:
# Match the wakeup questionnaire data to the correct date. This function thus modifies the column 'day'
def convert_dates_wakeup(df):
    """Move a wakeup entry one day back when two consecutive entries share a 'day'.

    When an entry has the same 'day' as the entry before it and no entry exists
    for the day before that, the earlier of the two is taken to have been filled
    in a day late and its 'day' is set to the previous day. `df` is modified in
    place and also returned.
    """
    one_day = timedelta(days=1)
    previous_day = None
    # Iterate over all the wakeup questionnaire instances
    for index, _ in df.iterrows():
        current_day = df['day'].iloc[index]
        # Two wakeup questionnaires in a row carry the same day
        if previous_day == current_day:
            day_before = previous_day - one_day
            # Only shift when the day before has no wakeup questionnaire of its own
            if not (df['day'] == day_before).any():
                df.at[index - 1, 'day'] = day_before
        previous_day = current_day

    return df

# Re-assign wakeup questionnaire entries that were filled in a day late
truncated_wakeup_df = convert_dates_wakeup(truncated_wakeup_df)

In [32]:
# Add the previous day's score as a variable
def add_previous_night_score(df, score_column, new_column):
    """Add `new_column` holding, for every row, the `score_column` value of the row before it.

    The first row has no predecessor and gets NaN. `df` is modified in place
    and also returned.
    """
    # Scores moved down by one row, with NaN in front for the first row
    previous_scores = [np.nan, *df[score_column].tolist()[:-1]]
    df[new_column] = np.nan
    for index, previous_score in zip(df.index, previous_scores):
        df.at[index, new_column] = previous_score
    return df

In [33]:
# Create three datasets that cover the same time period with no date gap in their dataframes
complete_wakeup_df = complete_df(truncated_wakeup_df, latest_start, earliest_end)
complete_bedtime_df = complete_df(truncated_bedtime_df, latest_start, earliest_end)
# Requires complete_bedtime_df to have exactly one row per date in all_dates
complete_bedtime_df['actual_day'] = all_dates
# Add the score of the preceding row (the previous night) as an extra variable to the wakeup and sleep data
complete_wakeup_df = add_previous_night_score(complete_wakeup_df, 'subjective_sleep_score', 'prev_subjective_score')
complete_sleep_df = add_previous_night_score(complete_sleep_df, 'score', 'prev_oura_score')

print('complete_sleep_df', complete_sleep_df.shape, '\ncomplete_wakeup_df', complete_wakeup_df.shape, '\ncomplete_bedtime_df', complete_bedtime_df.shape)

complete_sleep_df (101, 61) 
complete_wakeup_df (101, 18) 
complete_bedtime_df (101, 23)


In [34]:
# Add a column in which the bedtime is a continuous scale (e.g. 16:00 is represented as 16 and 1:00 is represented as 25)
def make_bedtime_start_continuous(df):
    """Add a 'bed_time' column with the bedtime in hours on a continuous scale.

    A bedtime after midnight (bedtime and wake-up fall on the same day of the
    month) is expressed as 24 and above, and 'bedtime_start_seconds' of that
    row is increased by one day's worth of seconds to match.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns 'bedtime_start' and 'bedtime_end' (datetimes, or
        a missing value on days without data) and 'bedtime_start_seconds'. It
        is modified in place and also returned.

    Returns
    -------
    pandas.DataFrame
        The same `df` object. Rows without a bedtime keep NaN in 'bed_time'.

    Notes
    -----
    'bedtime_start_seconds' is incremented in place, so calling the function
    twice on the same frame adds the extra day twice.
    """
    df['bed_time'] = np.nan
    # Read the times from `df` itself rather than from a notebook-level frame
    for index in df.index:
        bedtime_start = df.at[index, 'bedtime_start']
        if pd.isna(bedtime_start):
            continue
        bedtime_end = df.at[index, 'bedtime_end']

        # One hour has 3600 seconds (the seconds used to be divided by 360)
        hours = bedtime_start.hour + bedtime_start.minute/60 + bedtime_start.second/3600
        # If the bedtime is after midnight, convert it to a more-than 24-hour time so that time logically follows
        if bedtime_start.day == bedtime_end.day:
            hours += 24
            df.at[index, 'bedtime_start_seconds'] += 24*60*60
        df.at[index, 'bed_time'] = hours
    return df

# Add the continuous 'bed_time' column to the Oura sleep dataframe
complete_sleep_df = make_bedtime_start_continuous(complete_sleep_df)

In [35]:
# Blank out the Oura 'day' on every row that has no sleep score, so that a
# missing 'day' marks a night without Oura data
complete_sleep_df.loc[complete_sleep_df['score'].isna(), 'day'] = np.nan

In [36]:
# Rename the day columns not to mix them up
for frame, day_label in (
    (complete_sleep_df, 'day_oura'),
    (complete_wakeup_df, 'day_wakeup'),
    (complete_bedtime_df, 'day_bedtime'),
):
    frame.columns = [day_label if column == 'day' else column for column in frame.columns]

# The Oura data is saved as is; the two questionnaires are placed side by side
save_sleep_df = complete_sleep_df
questionnaire_parts = [frame.reset_index() for frame in (complete_wakeup_df, complete_bedtime_df)]
save_questionnaire_df = pd.concat(questionnaire_parts, axis=1)

# Print the column names for visual inspection
print(save_sleep_df.columns.tolist())
print(save_questionnaire_df.columns.tolist())

[0, 'average_breath', 'average_breath_variation', 'average_heart_rate', 'average_hrv', 'awake_time', 'bedtime_end', 'bedtime_end_delta', 'bedtime_end_seconds', 'bedtime_start', 'bedtime_start_delta', 'bedtime_start_seconds', 'contributors.deep_sleep', 'contributors.efficiency', 'contributors.latency', 'contributors.rem_sleep', 'contributors.restfulness', 'contributors.timing', 'contributors.total_sleep', 'day_oura', 'deep_sleep_duration', 'efficiency', 'got_ups', 'heart_rate.interval', 'heart_rate.items', 'heart_rate.timestamp', 'hrv.interval', 'hrv.items', 'hrv.timestamp', 'latency', 'light_sleep_duration', 'lowest_heart_rate', 'lowest_heart_rate_time_offset', 'midpoint_at_delta', 'movement_30_sec', 'period', 'readiness.contributors.activity_balance', 'readiness.contributors.body_temperature', 'readiness.contributors.hrv_balance', 'readiness.contributors.previous_day_activity', 'readiness.contributors.previous_night', 'readiness.contributors.recovery_index', 'readiness.contributors.re

## Save the pre-processed Oura dataset and questionnaire dataset separately

In [37]:
# Drop irrelevant columns. These were determined by looking at the above list of columns.
irrelevant_sleep_columns = [
    'bedtime_end',
    'bedtime_start',
    'heart_rate.timestamp',
    'hrv.timestamp',
    'midpoint_at_delta',
    'hrv.interval',
    'heart_rate.interval',
]
irrelevant_questionnaire_columns = ['index', 0, 'None', 'Timestamp', 'Timestamp_bedtime']

save_sleep_df = save_sleep_df.drop(columns=irrelevant_sleep_columns)
save_questionnaire_df = save_questionnaire_df.drop(columns=irrelevant_questionnaire_columns)

In [38]:
# Identify all the rows with missing data: a row counts as missing when the Oura day,
# the wakeup day or the bedtime day is absent. The two dataframes are compared row by
# row on position (hence to_numpy()), so the index labels do not need to match.
sleep_missing = save_sleep_df['day_oura'].isna().to_numpy()
wakeup_missing = save_questionnaire_df['day_wakeup'].isna().to_numpy()
bedtime_missing = save_questionnaire_df['day_bedtime'].isna().to_numpy()
missing_dates = save_sleep_df.loc[sleep_missing | wakeup_missing | bedtime_missing, 'actual_day'].tolist()

# Drop the rows with missing data from both dataframes, matching them on 'actual_day'
save_sleep_df = save_sleep_df[~save_sleep_df['actual_day'].isin(missing_dates)]
save_questionnaire_df = save_questionnaire_df[~save_questionnaire_df['actual_day'].isin(missing_dates)]

# reset_index() keeps the old labels in a helper 'index' column
save_sleep_df = save_sleep_df.reset_index()
save_questionnaire_df = save_questionnaire_df.reset_index()

In [39]:
# Drop irrelevant columns: the per-source day columns and the helper 'index' column
sleep_helper_columns = ['day_oura', 'index']
questionnaire_helper_columns = ['day_bedtime', 'day_wakeup', 'index']

save_sleep_df = save_sleep_df.drop(columns=sleep_helper_columns)
save_questionnaire_df = save_questionnaire_df.drop(columns=questionnaire_helper_columns)

In [40]:
# Save the dataframes to a csv file
# Both file names end with the participant number and the questionnaire version
separate_file_suffix = str(participant_number) + '_v' + str(quest_version) + '.csv'
save_sleep_df.to_csv(
    'data/preprocessed/preprocessed_sleep_' + separate_file_suffix,
    index=False,
)
save_questionnaire_df.to_csv(
    'data/preprocessed/preprocessed_questionnaires_' + separate_file_suffix,
    index=False,
)

## Pre-process all of the data 

In [41]:
# Concatenate the two dataframes. Merge the different columns on to the same rows
# (each part gets a helper 'index' column from reset_index())
frames_to_join = [save_sleep_df.reset_index(), save_questionnaire_df.reset_index()]
df = pd.concat(frames_to_join, axis=1)
df

Unnamed: 0,index,average_breath,average_breath_variation,average_heart_rate,average_hrv,awake_time,bedtime_end_delta,bedtime_end_seconds,bedtime_start_delta,bedtime_start_seconds,...,intervention,medication,medication_dosage,medication_time,special_circumstances,stress_levels,stress_relief_time,workout_intensity,workout_time,actual_day
0,0,14.0,2.375,51.26,115.0,6900.0,28523,28523.0,-6097,80303.0,...,0,0,,0,0,0,,,,2022-11-21
1,1,15.0,2.75,51.71,93.0,2790.0,29793,29793.0,-2427,83973.0,...,0,0,,0,0,0,,,,2022-11-22
2,2,15.0,2.5,53.12,80.0,2220.0,29589,29589.0,-2871,83529.0,...,0,0,,0,0,1,,,,2022-11-23
3,3,14.75,2.375,57.63,73.0,3600.0,29334,29334.0,-2466,83934.0,...,0,0,,0,0,1,,3.0,20.0,2022-11-24
4,4,14.875,2.75,53.19,94.0,1920.0,29510,29510.0,-2410,83990.0,...,0,0,,0,0,0,,,,2022-11-25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92,92,14.875,2.75,57.32,67.0,2940.0,27708,27708.0,-3252,83148.0,...,0,0,,0,0,1,,,,2023-02-25
93,93,14.625,2.625,65.17,49.0,1500.0,31483,31483.0,7063,93463.0,...,0,0,,0,0,1,,4.0,12.0,2023-02-26
94,94,15.125,2.5,59.26,62.0,1740.0,26770,26770.0,430,86830.0,...,0,0,,0,0,1,,,,2023-02-27
95,95,14.875,2.875,59.32,59.0,1860.0,27700,27700.0,1120,87520.0,...,0,0,,0,0,2,,,,2023-02-28


In [42]:
# Visual analysis of the column names to see which ones can be dropped
# (prints every column label of the combined dataframe as a plain list)
print(df.columns.tolist())

['index', 'average_breath', 'average_breath_variation', 'average_heart_rate', 'average_hrv', 'awake_time', 'bedtime_end_delta', 'bedtime_end_seconds', 'bedtime_start_delta', 'bedtime_start_seconds', 'contributors.deep_sleep', 'contributors.efficiency', 'contributors.latency', 'contributors.rem_sleep', 'contributors.restfulness', 'contributors.timing', 'contributors.total_sleep', 'deep_sleep_duration', 'efficiency', 'got_ups', 'heart_rate.items', 'hrv.items', 'latency', 'light_sleep_duration', 'lowest_heart_rate', 'lowest_heart_rate_time_offset', 'movement_30_sec', 'period', 'readiness.contributors.activity_balance', 'readiness.contributors.body_temperature', 'readiness.contributors.hrv_balance', 'readiness.contributors.previous_day_activity', 'readiness.contributors.previous_night', 'readiness.contributors.recovery_index', 'readiness.contributors.resting_heart_rate', 'readiness.contributors.sleep_balance', 'readiness.score', 'readiness.temperature_deviation', 'readiness.temperature_tre

In [43]:
# Remove the helper 'index' column(s) left over from reset_index()
df = df.drop(columns=['index'])

In [44]:
# Save the complete dataframe to a csv file
# The file name ends with the participant number and the questionnaire version
complete_data_path = 'data/preprocessed/preprocessed_data_' + str(participant_number) + '_v' + str(quest_version) + '.csv'
df.to_csv(complete_data_path, index=False)