## Imports

In [1]:
import pandas as pd
import json
from datetime import *

## Loading the data

In [2]:
# Read the Oura sleep export (a JSON file downloadable from
# https://cloud.ouraring.com/profile) and flatten its 'sleep' records into a table.
oura_sleep_path = '/Users/user/Desktop/Msc AI RU/Internship/Code/oura_json/oura_sleep_2022-12-06T10-07-33.json'
with open(oura_sleep_path) as f:
    data1 = json.load(f)

# Nested objects are expanded into columns down to two levels (max_level=2).
sleep_records = data1['sleep']
sleep_df = pd.json_normalize(sleep_records, max_level=2)
print('sleep_df', sleep_df.shape)

sleep_df (176, 56)


In [3]:
# Read the raw wake-up questionnaire answers (Google Form csv export)
wakeup_df = pd.read_csv('questionnaire_data/raw/Wakeup_3_v1_raw.csv')
print('wakeup_df', wakeup_df.shape)

# Read the raw bedtime questionnaire answers (Google Form csv export).
# Its 'Timestamp' column becomes 'Timestamp_bedtime'; the wake-up frame keeps
# the plain 'Timestamp' name.
bedtime_df = pd.read_csv('questionnaire_data/raw/Bedtime_3_v1_raw.csv').rename(
    columns={'Timestamp': 'Timestamp_bedtime'}
)
print('bedtime_df', bedtime_df.shape)

wakeup_df (151, 9)
bedtime_df (149, 10)


## Combining the three dataframes

In [4]:
# First and last entry (by row position) of the date column of each dataframe
sleep_start_date, sleep_end_date = sleep_df['day'].iloc[[0, -1]]

wakeup_start_date, wakeup_end_date = wakeup_df['Timestamp'].iloc[[0, -1]]

bedtime_start_date, bedtime_end_date = bedtime_df['Timestamp_bedtime'].iloc[[0, -1]]

# Keep only the date part of a questionnaire timestamp and drop the time of day
def remove_hour(string_date):
    """Return the text of ``string_date`` that precedes its first space."""
    date_part, _, _ = string_date.partition(" ")
    return date_part

# Turn an Oura 'year-month-day' string into a date object
def convert_sleep_date(sleep_date):
    """Parse a 'Y-M-D' string (three integers separated by '-') into a ``date``."""
    year, month, day = map(int, sleep_date.split('-'))
    return date(year, month, day)

# Turn a questionnaire 'month/day/year time' string into a date object
def convert_questionnaire_date(quest_date):
    """Parse the date part of ``quest_date`` (month/day/year order) into a ``date``.

    The time of day, if present after a space, is discarded by ``remove_hour``.
    """
    day_text = remove_hour(quest_date)
    month, day, year = map(int, day_text.split('/'))
    return date(year, month, day)

# Pick the earlier or the later of two dates, depending on the `earliest` flag
def compare_two_dates(date1, date2, earliest):
    """Return the earlier of the two dates if ``earliest`` is truthy, else the later.

    When both dates are equal, ``date1`` is returned.
    """
    # min() and max() both return their first argument on a tie, i.e. date1.
    pick = min if earliest else max
    return pick(date1, date2)

# Pick the earliest or the latest of one Oura date and two questionnaire dates
def compare_three_dates(sleep_date, wakeup_date, bedtime_date, earliest):
    """Return the earliest (``earliest`` truthy) or latest of the three dates.

    ``sleep_date`` is parsed with ``convert_sleep_date``; ``wakeup_date`` and
    ``bedtime_date`` are parsed with ``convert_questionnaire_date``. The result
    is a ``date`` object.
    """
    sleep_day = convert_sleep_date(sleep_date)
    wakeup_day = convert_questionnaire_date(wakeup_date)
    bedtime_day = convert_questionnaire_date(bedtime_date)

    # Reduce pairwise: settle the first two, then settle that winner against the third.
    first_two = compare_two_dates(sleep_day, wakeup_day, earliest)
    return compare_two_dates(first_two, bedtime_day, earliest)

# Window in which all three data sets have data at the same time: it opens on
# the latest of the three start dates and closes on the earliest of the three end dates
latest_start = compare_three_dates(
    sleep_start_date, wakeup_start_date, bedtime_start_date, earliest=False
)
earliest_end = compare_three_dates(
    sleep_end_date, wakeup_end_date, bedtime_end_date, earliest=True
)
print('latest date', latest_start, "\nearliest date", earliest_end)


latest date 2022-06-19 
earliest date 2022-11-19


In [5]:
# Determine the days with missing data
def find_missing_days(df, start_date, end_date, column_name, oura):
    """Return the days between ``start_date`` and ``end_date`` that have no row in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``column_name`` column holds the day of each row.
    start_date, end_date : date-like
        Inclusive bounds of the period to check (handed to ``pd.date_range``).
    column_name : str
        Name of the column that holds the day of each row.
    oura : bool
        True when the column can be parsed by ``pd.to_datetime`` as is (the Oura
        'day' column). False for questionnaire frames, whose timestamp strings
        are first converted with ``convert_questionnaire_date``.

    Returns
    -------
    pandas.DatetimeIndex
        Every day of the period that does not occur in ``df[column_name]``.

    Notes
    -----
    For questionnaire frames (``oura`` false) the converted dates are written
    back into ``df[column_name]``, so the caller's frame is modified in place.
    ``complete_df`` reads those converted values, which is why the write-back
    is kept.
    """
    if not oura:
        # Convert the whole column in a single pass. Values that are not strings
        # (e.g. dates written back by an earlier call) are left untouched, so the
        # function can safely be run more than once on the same frame.
        df[column_name] = df[column_name].map(
            lambda value: convert_questionnaire_date(value) if isinstance(value, str) else value
        )

    # Days that actually occur in the frame, as pandas datetimes
    observed_days = pd.DatetimeIndex(pd.to_datetime(df[column_name]))

    # Days of the full range that are not among the observed ones
    return pd.date_range(start_date, end_date).difference(observed_days)

# Days without data inside the shared window, one list per data set
missing_sleep_days = find_missing_days(
    df=sleep_df, start_date=latest_start, end_date=earliest_end,
    column_name='day', oura=True,
)
missing_wakeup_days = find_missing_days(
    df=wakeup_df, start_date=latest_start, end_date=earliest_end,
    column_name='Timestamp', oura=False,
)
missing_bedtime_days = find_missing_days(
    df=bedtime_df, start_date=latest_start, end_date=earliest_end,
    column_name='Timestamp_bedtime', oura=False,
)

In [6]:
# Show the missing sleep days, followed by how many there are
print(missing_sleep_days, len(missing_sleep_days), sep='\n')

DatetimeIndex(['2022-06-29', '2022-07-01', '2022-07-13', '2022-07-21',
               '2022-08-25', '2022-11-11', '2022-11-12', '2022-11-13'],
              dtype='datetime64[ns]', freq=None)
8


In [9]:
def complete_df(df, column_name, oura, missing_days=None, start_date=None, end_date=None):
    """Return the rows of ``df`` inside the shared window, with a blank row per missing day.

    Rows are expected to be ordered by day. Rows dated before the window start
    or after the window end are dropped. For every day of the window that has
    no row, an all-NaN row is inserted at the position where that day would be,
    including days after the last available row.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is not modified.
    column_name : str
        Column that holds the day of each row.
    oura : bool
        True when the column holds Oura 'Y-M-D' strings (parsed with
        ``convert_sleep_date``). Otherwise the values are handed to
        ``pd.Timestamp`` directly (date objects or timestamp strings).
    missing_days : iterable of date-like, optional
        Sorted days that need a blank row. By default they are derived from
        ``df`` itself, so each frame is completed with its own gaps.
    start_date, end_date : date-like, optional
        Inclusive window bounds. Default to the notebook-level ``latest_start``
        and ``earliest_end``.

    Returns
    -------
    pandas.DataFrame
        Frame with the columns of ``df`` and a fresh 0..n-1 index.
    """
    window_start = pd.Timestamp(latest_start if start_date is None else start_date)
    window_end = pd.Timestamp(earliest_end if end_date is None else end_date)

    # Keep the rows that fall inside the window, together with their day. Every
    # day is turned into a midnight Timestamp so that all later comparisons are
    # Timestamp against Timestamp (newer pandas releases no longer allow
    # ordering comparisons between a Timestamp and a plain datetime.date).
    kept = []
    for _, row in df.iterrows():
        raw_day = row[column_name]
        if oura:
            raw_day = convert_sleep_date(raw_day)
        row_day = pd.Timestamp(raw_day).normalize()
        if window_start <= row_day <= window_end:
            kept.append((row_day, row))

    if missing_days is None:
        # Gaps of this very frame: window days for which no row was kept.
        observed = pd.DatetimeIndex([day for day, _ in kept])
        missing_days = pd.date_range(window_start, window_end).difference(observed)
    gaps = [pd.Timestamp(day) for day in missing_days]

    # Merge rows and gaps in day order. An empty dict becomes an all-NaN row.
    # Collecting records and building the frame once replaces the row-by-row
    # DataFrame.append, which was removed in pandas 2.0.
    records = []
    next_gap = 0
    for row_day, row in kept:
        while next_gap < len(gaps) and row_day > gaps[next_gap]:
            records.append({})
            next_gap += 1
        records.append(row.to_dict())

    # Missing days that lie after the last available row
    records.extend({} for _ in gaps[next_gap:])

    return pd.DataFrame(records, columns=df.columns)

In [10]:
# Gap-free version of each of the three dataframes
complete_sleep_df = complete_df(df=sleep_df, column_name='day', oura=True)
complete_wakeup_df = complete_df(df=wakeup_df, column_name='Timestamp', oura=False)
complete_bedtime_df = complete_df(df=bedtime_df, column_name='Timestamp_bedtime', oura=False)

  del sys.path[0]


In [None]:
# Gap-free copy of the Oura sleep data: rows before latest_start are skipped, a
# blank (all-NaN) row is inserted for every missing sleep day that precedes the
# current row, and the walk stops after the row dated earliest_end.
# Rows are collected in a list and turned into a frame once, because
# DataFrame.append was removed in pandas 2.0.
sleep_rows = []
counter = 0
for index, row in sleep_df.iterrows():
    sleep_day = convert_sleep_date(row['day'])
    if sleep_day < latest_start:
        continue

    # missing_sleep_days holds pandas Timestamps, so compare Timestamp against
    # Timestamp rather than a plain date against a Timestamp.
    while counter < len(missing_sleep_days) and pd.Timestamp(sleep_day) > missing_sleep_days[counter]:
        sleep_rows.append({})
        counter += 1

    sleep_rows.append(row.to_dict())
    latest_date = sleep_day
    if sleep_day == earliest_end:
        break

complete_sleep_df = pd.DataFrame(sleep_rows, columns=sleep_df.columns)
print(complete_sleep_df)


In [None]:
# Gap-free copy of the wake-up questionnaire data: rows before latest_start are
# skipped, a blank (all-NaN) row is inserted for every missing wake-up day that
# precedes the current row, and the walk stops after the row dated earliest_end.
# Rows are collected in a list and turned into a frame once, because
# DataFrame.append was removed in pandas 2.0.
wakeup_rows = []
counter = 0
wakeup_window_start = pd.Timestamp(latest_start)
wakeup_window_end = pd.Timestamp(earliest_end)
for index, row in wakeup_df.iterrows():
    wakeup_value = row['Timestamp']
    # The column holds raw 'month/day/year time' strings on a fresh load, but
    # date objects once find_missing_days has processed this frame; accept both.
    if isinstance(wakeup_value, str):
        wakeup_value = convert_questionnaire_date(wakeup_value)
    wakeup_day = pd.Timestamp(wakeup_value).normalize()
    if wakeup_day < wakeup_window_start:
        continue

    # Blank rows for the missing days that come before this row
    while counter < len(missing_wakeup_days) and wakeup_day > missing_wakeup_days[counter]:
        wakeup_rows.append({})
        counter += 1

    wakeup_rows.append(row.to_dict())
    latest_date = wakeup_day.date()
    if wakeup_day == wakeup_window_end:
        break

complete_wakeup_df = pd.DataFrame(wakeup_rows, columns=wakeup_df.columns)
print(complete_wakeup_df)

In [None]:
# Gap-free copy of the bedtime questionnaire data: rows before latest_start are
# skipped, a blank (all-NaN) row is inserted for every missing bedtime day that
# precedes the current row, and the walk stops after the row dated earliest_end.
# Rows are collected in a list and turned into a frame once, because
# DataFrame.append was removed in pandas 2.0.
bedtime_rows = []
counter = 0
bedtime_window_start = pd.Timestamp(latest_start)
bedtime_window_end = pd.Timestamp(earliest_end)
for index, row in bedtime_df.iterrows():
    bedtime_value = row['Timestamp_bedtime']
    # The column holds raw 'month/day/year time' strings on a fresh load, but
    # date objects once find_missing_days has processed this frame; accept both.
    if isinstance(bedtime_value, str):
        bedtime_value = convert_questionnaire_date(bedtime_value)
    bedtime_day = pd.Timestamp(bedtime_value).normalize()
    if bedtime_day < bedtime_window_start:
        continue

    # Blank rows for the missing days that come before this row
    while counter < len(missing_bedtime_days) and bedtime_day > missing_bedtime_days[counter]:
        bedtime_rows.append({})
        counter += 1

    bedtime_rows.append(row.to_dict())
    latest_date = bedtime_day.date()
    if bedtime_day == bedtime_window_end:
        break

complete_bedtime_df = pd.DataFrame(bedtime_rows, columns=bedtime_df.columns)
print(complete_bedtime_df)

In [None]:
# Stack the three dataframes on top of each other (row-wise). Columns that a
# frame does not have are filled with NaN for its rows.
frames_to_stack = [sleep_df, wakeup_df, bedtime_df]
df = pd.concat(frames_to_stack)

df

Unnamed: 0,bedtime_end,bedtime_start,day,period,segment_state,time_in_bed,type,sleep_phase_5_min,restless,timezone,...,Timestamp_bedtime,"If you had coffee today, at what time did you take your last cup of coffee?","If you had alcohol today, at what time did you have your last drink?","If you did a workout today, at what time did you do your last workout?","If you did a stress-relieving activity today (meditation, yoga, etc), at what time did you do your last one?",Did you take medication that might affect your sleep?,How much did you eat in the last 3 hours before going to bed?,How would you characterize the activities you did in the last 3 hours before going to bed?,How much stress and/or anxiety do you feel now?,Is there any particular reason why you think you might or might not sleep well tonight?
1970-01-01 00:00:00,2022-06-09T07:03:09+02:00,2022-06-08T23:01:09+02:00,2022-06-09,0.0,active,28920.0,rest,,,120.0,...,,,,,,,,,,
1970-01-01 00:00:00.000000001,2022-06-10T06:55:22+02:00,2022-06-09T22:46:22+02:00,2022-06-10,0.0,active,29340.0,long_sleep,4244222111111111112211111112233333222211112112...,6.0,120.0,...,,,,,,,,,,
1970-01-01 00:00:00.000000002,2022-06-11T06:50:26+02:00,2022-06-10T22:43:26+02:00,2022-06-11,0.0,active,29220.0,long_sleep,4422211111211111111111111111333322111222222222...,6.0,120.0,...,,,,,,,,,,
1970-01-01 00:00:00.000000003,2022-06-12T06:51:55+02:00,2022-06-11T23:14:55+02:00,2022-06-12,0.0,active,27420.0,long_sleep,4222221111111111121221112222333332222221222221...,7.0,120.0,...,,,,,,,,,,
1970-01-01 00:00:00.000000004,2022-06-13T07:21:57+02:00,2022-06-12T23:48:57+02:00,2022-06-13,0.0,active,27180.0,long_sleep,4422222122221111111122222111111123332222221112...,7.0,120.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
144,,,,,,,,,,,...,11/14/2022 23:14:11,,,,,No,0.0,Relaxed,1.0,
145,,,,,,,,,,,...,11/15/2022 22:55:39,,,10:30:00 AM,,No,0.0,Relaxed,0.0,
146,,,,,,,,,,,...,11/17/2022 2:10:08,,1:00:00 AM,8:00:00 PM,,No,0.0,"Stressful, Physically active, Socially active",2.0,
147,,,,,,,,,,,...,11/17/2022 22:30:21,,,,,No,0.0,Relaxed,1.0,
