## Imports

In [1]:
import pandas as pd
import json
from datetime import *

## Loading the data

In [2]:
# Loading sleep data from Oura json file available on https://cloud.ouraring.com/profile 
with open('/Users/user/Desktop/Msc AI RU/Internship/Code/oura_json/oura_sleep_2022-12-06T10-07-33.json') as f:
    data1 = json.load(f)

sleep_df = pd.json_normalize(data1['sleep'], max_level=2)
print('sleep_df', sleep_df.shape)

sleep_df (176, 56)


In [3]:
# Loading raw wake up data from Google Form csv
wakeup_df = pd.read_csv ('questionnaire_data/raw/Wakeup_3_v1_raw.csv')
print('wakeup_df', wakeup_df.shape)

# Loading raw bedtime data from Google Form csv
bedtime_df = pd.read_csv ('questionnaire_data/raw/Bedtime_3_v1_raw.csv')
bedtime_df.columns = ['Timestamp_bedtime' if x=='Timestamp' else x for x in bedtime_df.columns]
print('bedtime_df', bedtime_df.shape)

wakeup_df (151, 9)
bedtime_df (149, 10)


## Combining the three dataframes

In [4]:
# Finding start and end date of all dataframes
sleep_start_date = sleep_df.iloc[0]['day']
sleep_end_date = sleep_df.iloc[-1]['day']

wakeup_start_date = wakeup_df.iloc[0]['Timestamp']
wakeup_end_date = wakeup_df.iloc[-1]['Timestamp']

bedtime_start_date = bedtime_df.iloc[0]['Timestamp_bedtime']
bedtime_end_date = bedtime_df.iloc[-1]['Timestamp_bedtime']

# Only consider the date of the questionnaire data and ignores the exact time
def remove_hour(string_date):
    split_string = string_date.split(" ")
    return split_string[0]

# Change how the date of the oura data is stored
def convert_sleep_date(sleep_date):
    y, m, d = [int(x) for x in sleep_date.split('-')]
    return date(y, m, d)

# Change how the date of the questionnaire data is stored
def convert_questionnaire_date(quest_date):
    m, d, y = [int(x) for x in remove_hour(quest_date).split('/')]
    return date(y, m, d)

# Compare two dates and only return the earliest or the latest, depending on what is required
def compare_two_dates(date1, date2, earliest):
    if date1 == date2:
        earliest_date = latest_date = date1
    elif date1 > date2:
        earliest_date = date2
        latest_date = date1  
    else:
        earliest_date = date1
        latest_date = date2
    if earliest:
        return earliest_date
    else:
        return latest_date

# Compare three dates and only return the earliest or the latest, depending on what is required
def compare_three_dates(sleep_date, wakeup_date, bedtime_date, earliest):
    date1 = convert_sleep_date(sleep_date)
    date2 = convert_questionnaire_date(wakeup_date)
    date3 = convert_questionnaire_date(bedtime_date)
    
    if earliest:
        earliest_date = compare_two_dates(date1, date2, earliest)
        if earliest_date > date3:
            earliest_date = date3
        return earliest_date
    else:
        latest_date = compare_two_dates(date1, date2, earliest)
        if latest_date < date3:
            latest_date = date3
        return latest_date

# Determine the time range for which all three data sets simultaneously have data
latest_start = compare_three_dates(sleep_start_date, wakeup_start_date, bedtime_start_date, earliest = False)
earliest_end = compare_three_dates(sleep_end_date, wakeup_end_date, bedtime_end_date, earliest = True)
print('latest date', latest_start, "\nearliest date", earliest_end)


latest date 2022-06-19 
earliest date 2022-11-19


In [5]:
# Determine the days with missing data
def find_missing_days(df, start_date, end_date, column_name, oura):
    if oura == False:
        for i in df[column_name]:
            df[column_name] = df[column_name].replace([i], convert_questionnaire_date(i))

    # Set the date values as index
    df = df.set_index(column_name)

    # Convert string format of date to a DateTime object
    df.index = pd.to_datetime(df.index)

    # dates which are not in the sequence
    # are returned
    return pd.date_range(start_date, end_date).difference(df.index)

In [6]:
# missing_sleep_days = find_missing_days(sleep_df, latest_start, earliest_end, 'day', oura=True)
# missing_wakeup_days = find_missing_days(wakeup_df, latest_start, earliest_end, 'Timestamp', oura=False)
# missing_bedtime_days = find_missing_days(bedtime_df, latest_start, earliest_end, 'Timestamp_bedtime', oura=False)
# print(missing_sleep_days)
# print(len(missing_sleep_days))

In [7]:
no_jumps_df = pd.DataFrame()
no_jumps_df = no_jumps_df.append(sleep_df.iloc[0])
no_jumps_df = no_jumps_df.append([None])

In [8]:
no_jumps_df

Unnamed: 0,average_breath,average_breath_variation,average_heart_rate,average_hrv,awake_time,bedtime_end,bedtime_end_delta,bedtime_start,bedtime_start_delta,contributors.deep_sleep,...,score,segment_state,sleep_midpoint,sleep_phase_5_min,time_in_bed,timezone,total_sleep_duration,type,wake_ups,0
0,,,,,,2022-06-09T07:03:09+02:00,25389.0,2022-06-08T23:01:09+02:00,-3531.0,,...,,active,,,28920.0,120.0,,rest,,
0,,,,,,,,,,,...,,,,,,,,,,


In [9]:
# Create a dataframe that goes from latest_start to earliest_end with no jumps in the dates, except that days without data are empty rows.
def complete_df(df, column_name, oura):

    missing_days = find_missing_days(df, latest_start, earliest_end, column_name, oura)
    no_jumps_df = pd.DataFrame()
    counter = 0 
    for index, row in df.iterrows():
        if oura == True:
            sleep_date = convert_sleep_date(row[column_name])
        else:
            sleep_date = row[column_name]
        if counter == 0 and sleep_date < latest_start:
            continue
        
        if sleep_date > latest_start and sleep_date == latest_date:
            continue
        latest_date = sleep_date

        while counter < (len(missing_days)) and sleep_date > missing_days[counter]:
            print("this day is missing", missing_days[counter])
            print("the data frame was\n", no_jumps_df[column_name])
            no_jumps_df = no_jumps_df.append([None])
            print("now it is\n", no_jumps_df[column_name])
            counter += 1

        no_jumps_df = no_jumps_df.append(pd.DataFrame(row).transpose())
        print("added", row[column_name])
        
        if (sleep_date == earliest_end):
            print("reached max date")
            break
    no_jumps_df = no_jumps_df.reset_index()        
    return no_jumps_df

In [10]:
complete_sleep_df = complete_df(sleep_df, 'day', oura=True)
complete_wakeup_df = complete_df(wakeup_df, 'Timestamp', oura=False)
complete_bedtime_df = complete_df(bedtime_df, 'Timestamp_bedtime', oura=False)

added 2022-06-19
added 2022-06-20
added 2022-06-21
added 2022-06-22
added 2022-06-23
added 2022-06-24
added 2022-06-25
added 2022-06-26
added 2022-06-27
added 2022-06-28
this day is missing 2022-06-29 00:00:00
the data frame was
 10    2022-06-19
11    2022-06-20
12    2022-06-21
13    2022-06-22
14    2022-06-23
15    2022-06-24
16    2022-06-25
17    2022-06-26
18    2022-06-27
19    2022-06-28
Name: day, dtype: object
now it is
 10    2022-06-19
11    2022-06-20
12    2022-06-21
13    2022-06-22
14    2022-06-23
15    2022-06-24
16    2022-06-25
17    2022-06-26
18    2022-06-27
19    2022-06-28
0            NaN
Name: day, dtype: object
added 2022-06-30
this day is missing 2022-07-01 00:00:00
the data frame was
 10    2022-06-19
11    2022-06-20
12    2022-06-21
13    2022-06-22
14    2022-06-23
15    2022-06-24
16    2022-06-25
17    2022-06-26
18    2022-06-27
19    2022-06-28
0            NaN
20    2022-06-30
Name: day, dtype: object
now it is
 10    2022-06-19
11    2022-06-20
1

In [11]:
print(complete_sleep_df['day'])

0      2022-06-19
1      2022-06-20
2      2022-06-21
3      2022-06-22
4      2022-06-23
          ...    
149    2022-11-15
150    2022-11-16
151    2022-11-17
152    2022-11-18
153    2022-11-19
Name: day, Length: 154, dtype: object


In [17]:
print(complete_bedtime_df['Timestamp_bedtime'])

0      2022-06-19
1      2022-06-20
2      2022-06-21
3      2022-06-22
4      2022-06-23
          ...    
149    2022-11-15
150           NaN
151    2022-11-17
152           NaN
153    2022-11-19
Name: Timestamp_bedtime, Length: 154, dtype: object


In [13]:
for index, row in complete_wakeup_df.iterrows():
    print(complete_wakeup_df.iloc[index]['Timestamp'])

2022-06-19
2022-06-20
2022-06-21
2022-06-22
2022-06-23
2022-06-24
2022-06-25
2022-06-26
2022-06-27
2022-06-28
2022-06-29
2022-06-30
2022-07-01
2022-07-02
2022-07-03
2022-07-04
2022-07-05
2022-07-06
2022-07-07
2022-07-08
2022-07-09
2022-07-10
2022-07-11
2022-07-12
2022-07-13
2022-07-14
2022-07-15
2022-07-16
2022-07-17
2022-07-18
2022-07-19
2022-07-20
2022-07-21
2022-07-22
2022-07-23
2022-07-24
2022-07-25
2022-07-26
nan
2022-07-28
2022-07-29
2022-07-30
2022-07-31
2022-08-01
2022-08-02
2022-08-03
2022-08-04
2022-08-05
2022-08-06
2022-08-07
2022-08-08
2022-08-09
2022-08-10
2022-08-11
2022-08-12
2022-08-13
2022-08-14
2022-08-15
2022-08-16
2022-08-17
2022-08-18
2022-08-19
2022-08-20
2022-08-21
2022-08-22
2022-08-23
2022-08-24
nan
2022-08-26
nan
2022-08-28
2022-08-29
2022-08-30
2022-08-31
2022-09-01
2022-09-02
2022-09-03
2022-09-04
2022-09-05
2022-09-06
2022-09-07
2022-09-08
2022-09-09
2022-09-10
2022-09-11
2022-09-12
2022-09-13
2022-09-14
2022-09-15
2022-09-16
2022-09-17
2022-09-18
2022-09-1

In [14]:
print(complete_wakeup_df)

     index    0 Did anything in particular prevent you from sleeping better?  \
0        0  NaN                                            Nothing             
1        1  NaN                                            Nothing             
2        2  NaN                                            Nothing             
3        3  NaN                                            Nothing             
4        4  NaN                                            Nothing             
..     ...  ...                                                ...             
149    145  NaN                                            Nothing             
150    146  NaN                                            Nothing             
151    147  NaN                                            Nothing             
152    148  NaN                                            Nothing             
153    149  NaN                                            Nothing             

    Did anything in particular prevent 

In [19]:
df = pd.concat([complete_sleep_df.reset_index(), complete_wakeup_df.reset_index(), complete_bedtime_df.reset_index()], axis=1)
df

Unnamed: 0,level_0,index,0,average_breath,average_breath_variation,average_heart_rate,average_hrv,awake_time,bedtime_end,bedtime_end_delta,...,Did you take medication that might affect your sleep?,How much did you eat in the last 3 hours before going to bed?,How much stress and/or anxiety do you feel now?,How would you characterize the activities you did in the last 3 hours before going to bed?,"If you did a stress-relieving activity today (meditation, yoga, etc), at what time did you do your last one?","If you did a workout today, at what time did you do your last workout?","If you had alcohol today, at what time did you have your last drink?","If you had coffee today, at what time did you take your last cup of coffee?",Is there any particular reason why you think you might or might not sleep well tonight?,Timestamp_bedtime
0,0,10,,16.5,2.625,56.29,78.0,2730.0,2022-06-19T07:41:27+02:00,27687,...,No,0,4,"Stressful, Mentally active",,,,,,2022-06-19
1,1,11,,,,,,,2022-06-20T07:50:44+02:00,28244,...,No,0,3,Relaxed,,8:30:00 AM,,,,2022-06-20
2,2,12,,16.5,2.625,56.05,79.0,2490.0,2022-06-21T08:23:15+02:00,30195,...,No,0,3,"Relaxed, Mentally active",,,,,,2022-06-21
3,3,13,,16.75,2.625,56.07,72.0,2400.0,2022-06-22T07:21:15+02:00,26475,...,No,3,3,"Relaxed, Socially active",,8:00:00 AM,,11:00:00 AM,,2022-06-22
4,4,14,,16.5,2.625,57.22,78.0,2130.0,2022-06-23T07:27:37+02:00,26857,...,No,0,3,"Stressful, Mentally active",,4:00:00 PM,,,,2022-06-23
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149,149,154,,15.75,2.625,55.67,69.0,3090.0,2022-11-15T08:06:49+01:00,29209,...,No,0,0,Relaxed,,10:30:00 AM,,,,2022-11-15
150,150,155,,15.25,2.25,57.62,64.0,3480.0,2022-11-16T08:21:18+01:00,30078,...,,,,,,,,,,
151,151,156,,15.75,2.5,63.72,48.0,2040.0,2022-11-17T08:45:52+01:00,31552,...,No,0,2,"Stressful, Physically active, Socially active",,8:00:00 PM,1:00:00 AM,,,2022-11-17
152,152,157,,15.875,2.75,55.93,72.0,2610.0,2022-11-18T09:01:40+01:00,32500,...,,,,,,,,,,
