## Imports

In [1]:
import pandas as pd
import json
from datetime import *

## Loading the data

In [2]:
# Read the Oura sleep export (JSON file downloadable from https://cloud.ouraring.com/profile)
# NOTE(review): this is an absolute, machine-specific path — consider a configurable data directory
oura_export_path = '/Users/user/Desktop/Msc AI RU/Internship/Code/oura_json/oura_sleep_2022-12-06T10-07-33.json'
with open(oura_export_path) as f:
    data1 = json.load(f)

# Flatten the records stored under the 'sleep' key, expanding at most two levels of nesting into columns
sleep_df = pd.json_normalize(data1['sleep'], max_level=2)
print('sleep_df', sleep_df.shape)

sleep_df (176, 56)


In [3]:
# Raw wake-up questionnaire answers (Google Form csv export)
wakeup_df = pd.read_csv('questionnaire_data/raw/Wakeup_3_v1_raw.csv')
print('wakeup_df', wakeup_df.shape)

# Raw bedtime questionnaire answers (Google Form csv export).
# Its 'Timestamp' column is renamed to 'Timestamp_bedtime'; the wake-up frame keeps the name 'Timestamp'.
bedtime_df = pd.read_csv('questionnaire_data/raw/Bedtime_3_v1_raw.csv').rename(
    columns={'Timestamp': 'Timestamp_bedtime'}
)
print('bedtime_df', bedtime_df.shape)

wakeup_df (151, 9)
bedtime_df (149, 10)


## Combining the three dataframes

In [4]:
# First and last recorded date of each data source (taken from the first and last row of each frame)
sleep_days = sleep_df['day']
sleep_start_date, sleep_end_date = sleep_days.iloc[0], sleep_days.iloc[-1]

wakeup_stamps = wakeup_df['Timestamp']
wakeup_start_date, wakeup_end_date = wakeup_stamps.iloc[0], wakeup_stamps.iloc[-1]

bedtime_stamps = bedtime_df['Timestamp_bedtime']
bedtime_start_date, bedtime_end_date = bedtime_stamps.iloc[0], bedtime_stamps.iloc[-1]

# Keep only the date part of a questionnaire timestamp and ignore the exact time
def remove_hour(string_date):
    """Return the text of ``string_date`` that precedes its first space.

    If the string contains no space it is returned unchanged.
    """
    date_part, _, _ = string_date.partition(" ")
    return date_part

# Turn the textual date of the Oura data into a date object
def convert_sleep_date(sleep_date):
    """Parse a 'year-month-day' string (fields separated by '-') into a ``date``."""
    year, month, day = (int(field) for field in sleep_date.split('-'))
    return date(year, month, day)

# Turn the textual timestamp of the questionnaire data into a date object
def convert_questionnaire_date(quest_date):
    """Parse a 'month/day/year <time>' string into a ``date``.

    Everything after the first space is dropped by ``remove_hour`` before the
    remaining text is split on '/'.
    """
    month, day, year = (int(field) for field in remove_hour(quest_date).split('/'))
    return date(year, month, day)

# Pick the earliest or the latest of two dates, depending on what is required
def compare_two_dates(date1, date2, earliest):
    """Return the earlier of the two dates when ``earliest`` is truthy, else the later one.

    When both dates are equal, ``date1`` is returned.
    """
    if date1 == date2:
        return date1

    # Order the pair once, then pick the requested end of it
    lower, upper = (date2, date1) if date1 > date2 else (date1, date2)
    return lower if earliest else upper

# Pick the earliest or the latest of three dates, depending on what is required
def compare_three_dates(sleep_date, wakeup_date, bedtime_date, earliest):
    """Return the earliest (``earliest`` truthy) or latest of the three given dates.

    ``sleep_date`` is parsed with ``convert_sleep_date``; ``wakeup_date`` and
    ``bedtime_date`` are parsed with ``convert_questionnaire_date``. The result
    is a ``date`` object.
    """
    oura_day = convert_sleep_date(sleep_date)
    wakeup_day = convert_questionnaire_date(wakeup_date)
    bedtime_day = convert_questionnaire_date(bedtime_date)

    # Settle the first two, then let the third one replace the winner if it beats it
    candidate = compare_two_dates(oura_day, wakeup_day, earliest)
    if earliest:
        return bedtime_day if candidate > bedtime_day else candidate
    return bedtime_day if candidate < bedtime_day else candidate

# Time range for which all three data sets simultaneously have data:
# it starts at the latest of the three start dates and ends at the earliest of the three end dates
start_dates = (sleep_start_date, wakeup_start_date, bedtime_start_date)
end_dates = (sleep_end_date, wakeup_end_date, bedtime_end_date)

latest_start = compare_three_dates(*start_dates, earliest=False)
earliest_end = compare_three_dates(*end_dates, earliest=True)
print('latest date', latest_start, "\nearliest date", earliest_end)


latest date 2022-06-19 
earliest date 2022-11-19


In [5]:
# Determine the days with missing data
def find_missing_days(df, start_date, end_date, column_name, oura):
    """Return the days between ``start_date`` and ``end_date`` that have no row in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``column_name`` column holds one date per row.
    start_date, end_date
        Inclusive bounds of the range, passed to ``pd.date_range``.
    column_name : str
        Name of the date column.
    oura : bool
        When falsy, the column holds questionnaire timestamps: every string
        value is converted with ``convert_questionnaire_date`` and the column
        of the *caller's* frame is overwritten with the converted values.
        When truthy, the column is left untouched.

    Returns
    -------
    pandas index of the days of the range that do not occur in the column.
    """
    if not oura:
        # One pass over the column instead of one full-column ``replace`` per value.
        # Values that are no longer strings (e.g. already converted by an earlier
        # call on the same frame) are kept as they are, so the call can be repeated.
        df[column_name] = df[column_name].map(
            lambda value: convert_questionnaire_date(value) if isinstance(value, str) else value
        )

    # Days that are present in the data, as pandas datetimes
    observed_days = pd.to_datetime(pd.Index(df[column_name]))

    # Dates of the full range which are not in the data are returned
    return pd.date_range(start_date, end_date).difference(observed_days)

In [6]:
# Create a dataframe that goes from latest_start to earliest_end with no jumps in the dates; days without data become empty rows.
def complete_df(df, column_name, oura):
    """Restrict ``df`` to the shared time window and pad the days that have no data.

    The window is read from the notebook-level ``latest_start`` and
    ``earliest_end`` dates. Only the first row of a run of rows sharing the
    same date is kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame. Assumes its rows are in chronological order — the
        padding logic relies on it (TODO confirm for new exports).
    column_name : str
        Name of the column holding the date of each row.
    oura : bool
        Truthy for the Oura frame ('year-month-day' strings), falsy for the
        questionnaire frames (forwarded to ``find_missing_days``).

    Returns
    -------
    pandas.DataFrame
        The kept rows (each with its original index label) interleaved with one
        placeholder row per missing day. A placeholder is ``pd.DataFrame([None])``,
        so it has index label 0 and contributes a column labelled 0 to the result.
        An empty frame is returned when there is nothing to keep or pad.
    """
    # Local import so the check below does not depend on what the name `datetime` is bound to in the notebook
    import datetime as _dt

    missing_days = find_missing_days(df, latest_start, earliest_end, column_name, oura)
    # Compare plain dates with plain dates: pandas >= 2.0 raises when a Timestamp
    # is ordered against a datetime.date.
    missing_dates = [stamp.date() for stamp in missing_days]

    pieces = []           # one-row frames, concatenated once at the end
    counter = 0           # position of the next missing day that still needs a placeholder
    previous_date = None  # date of the last row that was kept

    for _, row in df.iterrows():
        raw_value = row[column_name]
        if isinstance(raw_value, str):
            row_date = convert_sleep_date(raw_value) if oura else convert_questionnaire_date(raw_value)
        elif isinstance(raw_value, _dt.datetime):
            # datetime (and pandas Timestamp, which subclasses it) is reduced to its calendar date
            row_date = raw_value.date()
        else:
            # already a date object, e.g. converted in place by find_missing_days
            row_date = raw_value

        # Only add the data starting at latest_start
        if row_date < latest_start:
            continue

        # Stop as soon as the window is left, even if earliest_end itself has no row
        if row_date > earliest_end:
            break

        # Prevents adding data twice for a single day
        if row_date == previous_date:
            continue
        previous_date = row_date

        # If data is missing before the considered date, add an empty row per missing day
        while counter < len(missing_dates) and row_date > missing_dates[counter]:
            pieces.append(pd.DataFrame([None]))
            counter += 1

        pieces.append(pd.DataFrame(row).transpose())

    # Missing days after the last kept row still belong to the window
    pieces.extend(pd.DataFrame([None]) for _ in missing_dates[counter:])

    return pd.concat(pieces) if pieces else pd.DataFrame()

In [7]:
# Bring the three data sets onto the same time period
complete_sleep_df = complete_df(df=sleep_df, column_name='day', oura=True)
complete_wakeup_df = complete_df(df=wakeup_df, column_name='Timestamp', oura=False)
complete_bedtime_df = complete_df(df=bedtime_df, column_name='Timestamp_bedtime', oura=False)

# The three row counts are expected to match before the frames are put side by side
print(
    'complete_sleep_df', complete_sleep_df.shape,
    '\ncomplete_wakeup_df', complete_wakeup_df.shape,
    '\ncomplete_bedtime_df', complete_bedtime_df.shape,
)

complete_sleep_df (154, 57) 
complete_wakeup_df (154, 10) 
complete_bedtime_df (154, 11)


In [8]:
# Put the 3 data sets side by side. After the index reset, rows are matched by position,
# so the columns of all three sources end up on the same rows.
aligned_frames = (complete_sleep_df, complete_wakeup_df, complete_bedtime_df)
df = pd.concat([frame.reset_index() for frame in aligned_frames], axis=1)
df

Unnamed: 0,index,0,average_breath,average_breath_variation,average_heart_rate,average_hrv,awake_time,bedtime_end,bedtime_end_delta,bedtime_start,...,Did you take medication that might affect your sleep?,How much did you eat in the last 3 hours before going to bed?,How much stress and/or anxiety do you feel now?,How would you characterize the activities you did in the last 3 hours before going to bed?,"If you did a stress-relieving activity today (meditation, yoga, etc), at what time did you do your last one?","If you did a workout today, at what time did you do your last workout?","If you had alcohol today, at what time did you have your last drink?","If you had coffee today, at what time did you take your last cup of coffee?",Is there any particular reason why you think you might or might not sleep well tonight?,Timestamp_bedtime
0,10,,16.5,2.625,56.29,78.0,2730.0,2022-06-19T07:41:27+02:00,27687,2022-06-18T23:51:27+02:00,...,No,0,4,"Stressful, Mentally active",,,,,,2022-06-19
1,11,,,,,,,2022-06-20T07:50:44+02:00,28244,2022-06-20T00:00:44+02:00,...,No,0,3,Relaxed,,8:30:00 AM,,,,2022-06-20
2,12,,16.5,2.625,56.05,79.0,2490.0,2022-06-21T08:23:15+02:00,30195,2022-06-20T23:00:15+02:00,...,No,0,3,"Relaxed, Mentally active",,,,,,2022-06-21
3,13,,16.75,2.625,56.07,72.0,2400.0,2022-06-22T07:21:15+02:00,26475,2022-06-21T23:17:15+02:00,...,No,3,3,"Relaxed, Socially active",,8:00:00 AM,,11:00:00 AM,,2022-06-22
4,14,,16.5,2.625,57.22,78.0,2130.0,2022-06-23T07:27:37+02:00,26857,2022-06-22T23:33:37+02:00,...,No,0,3,"Stressful, Mentally active",,4:00:00 PM,,,,2022-06-23
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149,154,,15.75,2.625,55.67,69.0,3090.0,2022-11-15T08:06:49+01:00,29209,2022-11-14T23:39:49+01:00,...,No,0,0,Relaxed,,10:30:00 AM,,,,2022-11-15
150,155,,15.25,2.25,57.62,64.0,3480.0,2022-11-16T08:21:18+01:00,30078,2022-11-15T23:12:18+01:00,...,,,,,,,,,,
151,156,,15.75,2.5,63.72,48.0,2040.0,2022-11-17T08:45:52+01:00,31552,2022-11-17T02:28:52+01:00,...,No,0,2,"Stressful, Physically active, Socially active",,8:00:00 PM,1:00:00 AM,,,2022-11-17
152,157,,15.875,2.75,55.93,72.0,2610.0,2022-11-18T09:01:40+01:00,32500,2022-11-17T23:43:40+01:00,...,,,,,,,,,,
