## Imports

In [1]:
import pandas as pd
import numpy as np
import json
from datetime import *

## Loading the data

In [2]:
# Loading sleep data from Oura json file available on https://cloud.ouraring.com/profile 
with open('/Users/user/Desktop/Msc AI RU/Internship/Code/oura_json/oura_sleep_2022-12-06T10-07-33.json') as f:
    data1 = json.load(f)

# Flatten the nested json file
sleep_df = pd.json_normalize(data1['sleep'], max_level=2)
print('sleep_df', sleep_df.shape)

sleep_df (176, 56)


In [3]:
# Loading raw wake up data from Google Form csv file
wakeup_df = pd.read_csv ('questionnaire_data/raw/Wakeup_3_v1_raw.csv')
print('wakeup_df', wakeup_df.shape)

# Loading raw bedtime data from Google Form csv file 
bedtime_df = pd.read_csv ('questionnaire_data/raw/Bedtime_3_v1_raw.csv')
# Rename the 'Timestamp' column of the bedtime dataframe in order to differentiate it from the one in the wakeup dataframe
bedtime_df.columns = ['Timestamp_bedtime' if x=='Timestamp' else x for x in bedtime_df.columns]
print('bedtime_df', bedtime_df.shape)

wakeup_df (151, 9)
bedtime_df (149, 10)


## Initial pre-processing of datasets separately

In [4]:
# Used for visual inspection of the Oura ring dataset in order to determine the units of each variable
print(sleep_df.iloc[1,:])

bedtime_end                                                             2022-06-10T06:55:22+02:00
bedtime_start                                                           2022-06-09T22:46:22+02:00
day                                                                                    2022-06-10
period                                                                                          0
segment_state                                                                              active
time_in_bed                                                                                 29340
type                                                                                   long_sleep
sleep_phase_5_min                               4244222111111111112211111112233333222211112112...
restless                                                                                      6.0
timezone                                                                                      120
bedtime_start_delta 

In [5]:
# Convert the ISO 8601 notation of some variables in the oura ring data to seconds
def convert_iso_seconds(df, column_name, new_column_name):
    df[new_column_name] = np.nan
    for index, row in df.iterrows():
        utc_date = datetime.strptime(sleep_df.iloc[index][column_name], '%Y-%m-%dT%H:%M:%S%z')
        df.at[index, column_name] = utc_date
        df.at[index, new_column_name] = (utc_date - datetime(utc_date.year, utc_date.month, utc_date.day, tzinfo=utc_date.tzinfo)).total_seconds()
    return df

sleep_df = convert_iso_seconds(sleep_df, 'bedtime_end', 'bedtime_end_seconds')
sleep_df = convert_iso_seconds(sleep_df, 'bedtime_start', 'bedtime_start_seconds')

In [6]:
# Change how the date of the oura data is stored
def convert_sleep_date(df):
    for index, row in df.iterrows():
        sleep_date = df.iloc[index]['oura_day']
        y, m, d = [int(x) for x in sleep_date.split('-')]
        df.at[index, 'day'] = date(y, m, d) 
    return sleep_df
    
sleep_df.columns = ['oura_day' if x=='day' else x for x in sleep_df.columns]
sleep_df['day'] = np.nan
sleep_df = convert_sleep_date(sleep_df)

In [7]:
# Only consider the date of the questionnaire datasets and ignore the exact time
def remove_time(string_date):
    split_string = string_date.split(" ")
    return split_string[0]

# Change how the date of the questionnaire data is stored
def convert_questionnaire_date(quest_date):
    m, d, y = [int(x) for x in remove_time(quest_date).split('/')]
    return date(y, m, d)

In [8]:
# Add a 'day' column in the questionnaire dataframes to compare with oura data
def add_day_col(df, column_name):
    df['day'] = np.nan
    for index, row in df.iterrows():
        df.at[index, 'day'] = convert_questionnaire_date(df.iloc[index][column_name])
    return df

wakeup_df = add_day_col(wakeup_df, 'Timestamp')
bedtime_df = add_day_col(bedtime_df, 'Timestamp_bedtime')

In [9]:
# Bedtime data entered on day n should be matched to the wakeup and sleep data of day n+1
for index, row in bedtime_df.iterrows():
    bedtime_df.at[index, 'day'] = bedtime_df.iloc[index]['day'] + timedelta(days=1)

## Combining the three dataframes

In [10]:
# Find start and end date of all dataframes
sleep_start_date = sleep_df.iloc[0]['day']
sleep_end_date = sleep_df.iloc[-1]['day']

wakeup_start_date = wakeup_df.iloc[0]['day']
wakeup_end_date = wakeup_df.iloc[-1]['day']

bedtime_start_date = bedtime_df.iloc[0]['day']
bedtime_end_date = bedtime_df.iloc[-1]['day']

# Compare two dates and only return the earliest or the latest, depending on what is required
def compare_two_dates(date1, date2, earliest):
    if date1 == date2:
        earliest_date = latest_date = date1
    elif date1 > date2:
        earliest_date = date2
        latest_date = date1  
    else:
        earliest_date = date1
        latest_date = date2
    if earliest:
        return earliest_date
    else:
        return latest_date

# Compare three dates and only return the earliest or the latest, depending on what is required
def compare_three_dates(sleep_date, wakeup_date, bedtime_date, earliest):
    if earliest:
        earliest_date = compare_two_dates(sleep_date, wakeup_date, earliest)
        if earliest_date > bedtime_date:
            earliest_date = bedtime_date
        return earliest_date
    else:
        latest_date = compare_two_dates(sleep_date, wakeup_date, earliest)
        if latest_date < bedtime_date:
            latest_date = bedtime_date
        return latest_date

# Determine the time range for which all three data sets simultaneously have data
latest_start = compare_three_dates(sleep_start_date, wakeup_start_date, bedtime_start_date, earliest = False)
earliest_end = compare_three_dates(sleep_end_date, wakeup_end_date, bedtime_end_date, earliest = True)
print('latest start date', latest_start, "\nearliest end date", earliest_end)

latest start date 2022-06-19 
earliest end date 2022-11-20


In [11]:
# Cut the data sets so that they only cover the period [latest_start, earliest_end]
def cut_df(df, start, end):
    truncated_df = pd.DataFrame()
    for index, row in df.iterrows():
        sleep_date = row['day']
        # Only add the data starting the latest_start
        if sleep_date < start:
            continue

        # End  if the earliest_end is reached
        if (sleep_date >= end):
            break
        
        truncated_df = truncated_df.append(pd.DataFrame(row).transpose(), ignore_index=True)

    return truncated_df 

truncated_sleep_df = cut_df(sleep_df, latest_start, earliest_end)
truncated_wakeup_df = cut_df(wakeup_df, latest_start, earliest_end)
truncated_bedtime_df = cut_df(bedtime_df, latest_start, earliest_end)

print('truncated_sleep_df', truncated_sleep_df.shape, '\ntruncated_wakeup_df', truncated_wakeup_df.shape, '\ntruncated_bedtime_df', truncated_bedtime_df.shape)

truncated_sleep_df (149, 59) 
truncated_wakeup_df (150, 10) 
truncated_bedtime_df (148, 11)


In [12]:
bedtime_df

Unnamed: 0,Timestamp_bedtime,"If you had coffee today, at what time did you take your last cup of coffee?","If you had alcohol today, at what time did you have your last drink?","If you did a workout today, at what time did you do your last workout?","If you did a stress-relieving activity today (meditation, yoga, etc), at what time did you do your last one?",Did you take medication that might affect your sleep?,How much did you eat in the last 3 hours before going to bed?,How would you characterize the activities you did in the last 3 hours before going to bed?,How much stress and/or anxiety do you feel now?,Is there any particular reason why you think you might or might not sleep well tonight?,day
0,6/18/2022 23:23:07,,,,,No,0,"Stressful, Mentally active",4,Stress due to important deadlines coming up,2022-06-19
1,6/19/2022 23:47:36,,,,,No,0,"Stressful, Mentally active",4,,2022-06-20
2,6/20/2022 22:57:13,,,8:30:00 AM,,No,0,Relaxed,3,,2022-06-21
3,6/21/2022 22:56:24,,,,,No,0,"Relaxed, Mentally active",3,,2022-06-22
4,6/22/2022 23:17:19,11:00:00 AM,,8:00:00 AM,,No,3,"Relaxed, Socially active",3,,2022-06-23
...,...,...,...,...,...,...,...,...,...,...,...
144,11/14/2022 23:14:11,,,,,No,0,Relaxed,1,,2022-11-15
145,11/15/2022 22:55:39,,,10:30:00 AM,,No,0,Relaxed,0,,2022-11-16
146,11/17/2022 2:10:08,,1:00:00 AM,8:00:00 PM,,No,0,"Stressful, Physically active, Socially active",2,,2022-11-18
147,11/17/2022 22:30:21,,,,,No,0,Relaxed,1,,2022-11-18


In [13]:
# Determine the days with missing data
def find_missing_days(df, start_date, end_date, column_name):
    # Set the date values as index
    df = df.set_index(column_name)

    # Convert string format of date to a DateTime object
    df.index = pd.to_datetime(df.index)

    # Dates which are not in the sequence are returned
    return pd.date_range(start_date, end_date).difference(df.index)

In [15]:
# Create a dataframe that goes from latest_start to earliest_end with no jumps in the dates, except that days without data are empty rows.
def complete_df(df, latest_start, earliest_end):
    missing_days = find_missing_days(df, latest_start, earliest_end, 'day')
    no_jumps_df = pd.DataFrame()
    counter = 0 
    for index, row in df.iterrows():
        sleep_date = row['day']
        
        # Prevents to add data twice for a single day
        ### Might go away if I match data to date before
        if sleep_date > latest_start and sleep_date == latest_date:
            continue
        latest_date = sleep_date

        # If data is missing from the considered date, add an empty row
        while counter < (len(missing_days)) and sleep_date > missing_days[counter]:
            no_jumps_df = no_jumps_df.append([None], ignore_index=True)
            counter += 1

        no_jumps_df = no_jumps_df.append(pd.DataFrame(row).transpose(), ignore_index=True)
               
    return no_jumps_df

In [16]:
# Fill the missing days of the sleep df with empty rows
complete_sleep_df = complete_df(truncated_sleep_df, latest_start, earliest_end)

In [17]:
# Create a list of all the dates between latest_start and earliest_end
all_dates = [latest_start+timedelta(days=x) for x in range((earliest_end-latest_start).days)]

In [18]:
print(complete_sleep_df.iloc[10]['day'])

nan


In [19]:
# Add a column 'day' in the questionnaire data to compare dates with oura data
def convert_dates(df, column_name):
    ### MAKE IT GO OVER ALL ROWS EXCEPT THE LAST ONE
    for index, row in df.iterrows():
        print("new row", df.iloc[index]['Timestamp_bedtime'])
        utc_date_questionnaire = datetime.strptime(df.iloc[index][column_name], '%m/%d/%Y %H:%M:%S')
        four_pm = datetime(utc_date_questionnaire.year,  utc_date_questionnaire.month,  utc_date_questionnaire.day, hour=16, minute=0)
        if (utc_date_questionnaire < four_pm):
            actual_date = df.iloc[index]['day'] + timedelta(days=-1)
        else:
            actual_date = df.iloc[index]['day']
        print('actual day is', actual_date)
        oura_index = complete_sleep_df[complete_sleep_df['day'] == actual_date].index.values
        if len(oura_index) == 0:
            print('no oura data on actual day')
            ## Do more stuff
            continue
        else:
            oura_index = oura_index[0]
        utc_date_bedtime = complete_sleep_df.iloc[oura_index]['bedtime_start']
        utc_date_wakeup = complete_sleep_df.iloc[oura_index]['bedtime_end']
        utc_date_questionnaire = utc_date_questionnaire.replace(tzinfo=utc_date_wakeup.tzinfo) 
        # if () ### SHOULD I CHECK IF THERE IS NO OTHER NIGHT NEXT (end of dataset)
        utc_date_bedtime_nextday = complete_sleep_df.iloc[oura_index+1]['bedtime_start']
        if (not pd.isna(utc_date_bedtime_nextday)):
            utc_date_bedtime_nextday = utc_date_bedtime_nextday.replace(tzinfo=utc_date_wakeup.tzinfo)
        print((utc_date_bedtime > utc_date_questionnaire))
        # If questionnaire is filled before bedtime
        if utc_date_bedtime > utc_date_questionnaire:
            midnight = datetime(utc_date_wakeup.year,  utc_date_wakeup.month,  utc_date_wakeup.day, hour=0, minute=0, tzinfo=utc_date_wakeup.tzinfo)
            if utc_date_questionnaire > midnight:
                df.at[index, 'day'] = df.iloc[index]['day'] + timedelta(days=-1)
            print('so this is for day', df.iloc[index]['day'])
        elif (not pd.isna(utc_date_bedtime_nextday)) and (utc_date_questionnaire - utc_date_wakeup) < (utc_date_bedtime_nextday - utc_date_questionnaire):
            df.at[index, 'day'] = complete_sleep_df.iloc[oura_index]['day']
            print('so this is for day', df.iloc[index]['day'])
        elif (pd.isna(utc_date_bedtime_nextday)):
            ## filled in after midnight and it should be for day n but no oura data for day n+1
            print('well see later')
        else:
            print('situation not covered')
    return df

# truncated_wakeup_df = convert_dates(truncated_wakeup_df, 'Timestamp')
truncated_bedtime_df = convert_dates(truncated_bedtime_df, 'Timestamp_bedtime')

new row 6/18/2022 23:23:07
actual day is 2022-06-19
True
so this is for day 2022-06-19
new row 6/19/2022 23:47:36
actual day is 2022-06-20
True
so this is for day 2022-06-20
new row 6/20/2022 22:57:13
actual day is 2022-06-21
True
so this is for day 2022-06-21
new row 6/21/2022 22:56:24
actual day is 2022-06-22
True
so this is for day 2022-06-22
new row 6/22/2022 23:17:19
actual day is 2022-06-23
True
so this is for day 2022-06-23
new row 6/23/2022 21:40:43
actual day is 2022-06-24
True
so this is for day 2022-06-24
new row 6/24/2022 23:27:23
actual day is 2022-06-25
True
so this is for day 2022-06-25
new row 6/25/2022 22:43:14
actual day is 2022-06-26
True
so this is for day 2022-06-26
new row 6/27/2022 10:09:21
actual day is 2022-06-27
False
so this is for day 2022-06-27
new row 6/28/2022 3:20:14
actual day is 2022-06-28
False
well see later
new row 6/29/2022 4:03:08
actual day is 2022-06-29
no oura data on actual day
new row 6/30/2022 3:42:47
actual day is 2022-06-30
False
well see 

In [21]:
# Create three datasets that cover the same time period with no date gap in their dataframes
# complete_sleep_df = complete_df(sleep_df, latest_start, earliest_end, oura=True)
complete_wakeup_df = complete_df(truncated_wakeup_df, latest_start, earliest_end)
complete_bedtime_df = complete_df(truncated_bedtime_df, latest_start, earliest_end)

print('complete_sleep_df', complete_sleep_df.shape, '\ncomplete_wakeup_df', complete_wakeup_df.shape, '\ncomplete_bedtime_df', complete_bedtime_df.shape)

complete_sleep_df (154, 60) 
complete_wakeup_df (154, 11) 
complete_bedtime_df (153, 12)


In [22]:
# Concatenate the 3 dataframes. Merge the different columns on to the same rows 
df = pd.concat([complete_sleep_df.reset_index(), complete_wakeup_df.reset_index(), complete_bedtime_df.reset_index()], axis=1)

## Pre-process the dataset 

In [23]:
# Identify all the rows with missing data
missing_date_idx = []
for index, row in df.iterrows():
    if pd.isna(df.iloc[index]['oura_day']) or pd.isna(df.iloc[index]['Timestamp']) or pd.isna(df.iloc[index]['Timestamp_bedtime']):
        missing_date_idx.append(index)

# Drop the rows with missing data from dataframe
for i in range(len(missing_date_idx)):
    # print(missing_date_idx[i])
    df = df.drop(missing_date_idx[i])

df = df.reset_index()
df

Unnamed: 0,level_0,index,0,average_breath,average_breath_variation,average_heart_rate,average_hrv,awake_time,bedtime_end,bedtime_end_delta,...,How much did you eat in the last 3 hours before going to bed?,How much stress and/or anxiety do you feel now?,How would you characterize the activities you did in the last 3 hours before going to bed?,"If you did a stress-relieving activity today (meditation, yoga, etc), at what time did you do your last one?","If you did a workout today, at what time did you do your last workout?","If you had alcohol today, at what time did you have your last drink?","If you had coffee today, at what time did you take your last cup of coffee?",Is there any particular reason why you think you might or might not sleep well tonight?,Timestamp_bedtime,day
0,0,0,,16.5,2.625,56.29,78.0,2730.0,2022-06-19 07:41:27+02:00,27687,...,0,4,"Stressful, Mentally active",,,,,Stress due to important deadlines coming up,6/18/2022 23:23:07,2022-06-19
1,1,1,,,,,,,2022-06-20 07:50:44+02:00,28244,...,0,4,"Stressful, Mentally active",,,,,,6/19/2022 23:47:36,2022-06-20
2,2,2,,16.5,2.625,56.05,79.0,2490.0,2022-06-21 08:23:15+02:00,30195,...,0,3,Relaxed,,8:30:00 AM,,,,6/20/2022 22:57:13,2022-06-21
3,3,3,,16.75,2.625,56.07,72.0,2400.0,2022-06-22 07:21:15+02:00,26475,...,0,3,"Relaxed, Mentally active",,,,,,6/21/2022 22:56:24,2022-06-22
4,4,4,,16.5,2.625,57.22,78.0,2130.0,2022-06-23 07:27:37+02:00,26857,...,3,3,"Relaxed, Socially active",,8:00:00 AM,,11:00:00 AM,,6/22/2022 23:17:19,2022-06-23
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
131,148,148,,15.0,2.5,57.29,72.0,3510.0,2022-11-14 08:40:31+01:00,31231,...,0,1,"Stressful, Socially active",,,2:45:00 AM,,,11/13/2022 3:47:54,2022-11-14
132,149,149,,15.75,2.625,55.67,69.0,3090.0,2022-11-15 08:06:49+01:00,29209,...,0,1,Relaxed,,,,,,11/14/2022 23:14:11,2022-11-15
133,150,150,,15.25,2.25,57.62,64.0,3480.0,2022-11-16 08:21:18+01:00,30078,...,0,0,Relaxed,,10:30:00 AM,,,,11/15/2022 22:55:39,2022-11-16
134,151,151,,15.75,2.5,63.72,48.0,2040.0,2022-11-17 08:45:52+01:00,31552,...,0,2,"Stressful, Physically active, Socially active",,8:00:00 PM,1:00:00 AM,,,11/17/2022 2:10:08,2022-11-17


In [24]:
print(df.columns.tolist())

['level_0', 'index', 0, 'average_breath', 'average_breath_variation', 'average_heart_rate', 'average_hrv', 'awake_time', 'bedtime_end', 'bedtime_end_delta', 'bedtime_end_seconds', 'bedtime_start', 'bedtime_start_delta', 'bedtime_start_seconds', 'contributors.deep_sleep', 'contributors.efficiency', 'contributors.latency', 'contributors.rem_sleep', 'contributors.restfulness', 'contributors.timing', 'contributors.total_sleep', 'day', 'deep_sleep_duration', 'efficiency', 'got_ups', 'heart_rate.interval', 'heart_rate.items', 'heart_rate.timestamp', 'hrv.interval', 'hrv.items', 'hrv.timestamp', 'latency', 'light_sleep_duration', 'lowest_heart_rate', 'lowest_heart_rate_time_offset', 'midpoint_at_delta', 'movement_30_sec', 'oura_day', 'period', 'readiness.contributors.activity_balance', 'readiness.contributors.body_temperature', 'readiness.contributors.hrv_balance', 'readiness.contributors.previous_day_activity', 'readiness.contributors.previous_night', 'readiness.contributors.recovery_index',

In [25]:
# Drop irrelevant columns. These were determined by looking at the above list of columns. 
df = df.drop(labels=['level_0', 'index', 0], axis=1)
df

Unnamed: 0,average_breath,average_breath_variation,average_heart_rate,average_hrv,awake_time,bedtime_end,bedtime_end_delta,bedtime_end_seconds,bedtime_start,bedtime_start_delta,...,How much did you eat in the last 3 hours before going to bed?,How much stress and/or anxiety do you feel now?,How would you characterize the activities you did in the last 3 hours before going to bed?,"If you did a stress-relieving activity today (meditation, yoga, etc), at what time did you do your last one?","If you did a workout today, at what time did you do your last workout?","If you had alcohol today, at what time did you have your last drink?","If you had coffee today, at what time did you take your last cup of coffee?",Is there any particular reason why you think you might or might not sleep well tonight?,Timestamp_bedtime,day
0,16.5,2.625,56.29,78.0,2730.0,2022-06-19 07:41:27+02:00,27687,27687.0,2022-06-18 23:51:27+02:00,-513,...,0,4,"Stressful, Mentally active",,,,,Stress due to important deadlines coming up,6/18/2022 23:23:07,2022-06-19
1,,,,,,2022-06-20 07:50:44+02:00,28244,28244.0,2022-06-20 00:00:44+02:00,44,...,0,4,"Stressful, Mentally active",,,,,,6/19/2022 23:47:36,2022-06-20
2,16.5,2.625,56.05,79.0,2490.0,2022-06-21 08:23:15+02:00,30195,30195.0,2022-06-20 23:00:15+02:00,-3585,...,0,3,Relaxed,,8:30:00 AM,,,,6/20/2022 22:57:13,2022-06-21
3,16.75,2.625,56.07,72.0,2400.0,2022-06-22 07:21:15+02:00,26475,26475.0,2022-06-21 23:17:15+02:00,-2565,...,0,3,"Relaxed, Mentally active",,,,,,6/21/2022 22:56:24,2022-06-22
4,16.5,2.625,57.22,78.0,2130.0,2022-06-23 07:27:37+02:00,26857,26857.0,2022-06-22 23:33:37+02:00,-1583,...,3,3,"Relaxed, Socially active",,8:00:00 AM,,11:00:00 AM,,6/22/2022 23:17:19,2022-06-23
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
131,15.0,2.5,57.29,72.0,3510.0,2022-11-14 08:40:31+01:00,31231,31231.0,2022-11-14 01:13:31+01:00,4411,...,0,1,"Stressful, Socially active",,,2:45:00 AM,,,11/13/2022 3:47:54,2022-11-14
132,15.75,2.625,55.67,69.0,3090.0,2022-11-15 08:06:49+01:00,29209,29209.0,2022-11-14 23:39:49+01:00,-1211,...,0,1,Relaxed,,,,,,11/14/2022 23:14:11,2022-11-15
133,15.25,2.25,57.62,64.0,3480.0,2022-11-16 08:21:18+01:00,30078,30078.0,2022-11-15 23:12:18+01:00,-2862,...,0,0,Relaxed,,10:30:00 AM,,,,11/15/2022 22:55:39,2022-11-16
134,15.75,2.5,63.72,48.0,2040.0,2022-11-17 08:45:52+01:00,31552,31552.0,2022-11-17 02:28:52+01:00,8932,...,0,2,"Stressful, Physically active, Socially active",,8:00:00 PM,1:00:00 AM,,,11/17/2022 2:10:08,2022-11-17
