## Imports

In [1]:
import pandas as pd
import json
from datetime import *

## Loading the data

In [2]:
# Load the sleep export downloaded from https://cloud.ouraring.com/profile and flatten it.
# NOTE(review): the path below is absolute and machine-specific; the notebook will only run
# where this exact file exists. Consider a configurable data directory.
with open('/Users/user/Desktop/Msc AI RU/Internship/Code/oura_json/oura_sleep_2022-12-06T10-07-33.json') as f:
    data1 = json.load(f)

# One row per entry of the 'sleep' list; nested dicts are flattened at most two levels deep
# into dotted column names.
sleep_df = pd.json_normalize(data1['sleep'], max_level=2)
print('sleep_df', sleep_df.shape)

sleep_df (176, 56)


In [3]:
# Label of the final row of the Oura sleep frame; printed twice, exactly as before.
last_row = sleep_df.index[-1]
print(last_row)
print(last_row)

175
175


In [4]:
# Raw wake-up questionnaire answers exported from the Google Form (csv).
wakeup_df = pd.read_csv('questionnaire_data/raw/Wakeup_3_v1_raw.csv')
print('wakeup_df', wakeup_df.shape)

# Raw bedtime questionnaire answers exported from the Google Form (csv).
# Its 'Timestamp' column is renamed so it cannot clash with the wake-up 'Timestamp' column.
bedtime_df = pd.read_csv('questionnaire_data/raw/Bedtime_3_v1_raw.csv').rename(
    columns={'Timestamp': 'Timestamp_bedtime'}
)
print('bedtime_df', bedtime_df.shape)

wakeup_df (151, 9)
bedtime_df (149, 10)


## Initial pre-processing of datasets separately

In [5]:
# Show every field of the second Oura record to eyeball the unit of each variable
print(sleep_df.iloc[1])

bedtime_end                                                             2022-06-10T06:55:22+02:00
bedtime_start                                                           2022-06-09T22:46:22+02:00
day                                                                                    2022-06-10
period                                                                                          0
segment_state                                                                              active
time_in_bed                                                                                 29340
type                                                                                   long_sleep
sleep_phase_5_min                               4244222111111111112211111112233333222211112112...
restless                                                                                      6.0
timezone                                                                                      120
bedtime_start_delta 

In [6]:
# Convert the ISO 8601 notation of the oura ring data to seconds since midnight
def convert_iso_seconds(df, column_name):
    """Replace ISO 8601 timestamps in ``df[column_name]`` by seconds since midnight.

    Each value such as ``'2022-06-10T06:55:22+02:00'`` is parsed together with
    its UTC offset and replaced by the number of seconds elapsed since midnight
    of that same calendar day in that same offset (here ``24922.0``).

    The values are read from ``df`` itself; the previous implementation read
    them from the notebook-global ``sleep_df`` and looked rows up by position
    using index labels, so it only worked for that one frame with a default
    index.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the timestamp column. It is modified in place.
    column_name : str
        Name of the column with ISO 8601 strings.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    def _seconds_since_midnight(iso_string):
        moment = datetime.strptime(iso_string, '%Y-%m-%dT%H:%M:%S%z')
        # Midnight of the same day, keeping the timestamp's own UTC offset.
        midnight = moment.replace(hour=0, minute=0, second=0, microsecond=0)
        return (moment - midnight).total_seconds()

    # Missing entries are passed through untouched instead of being parsed.
    df[column_name] = df[column_name].map(_seconds_since_midnight, na_action='ignore')
    return df

# Turn both bedtime timestamps of the Oura data into seconds since midnight.
# NOTE(review): the converted values overwrite the original strings, so re-running this cell
# would presumably hand already-converted numbers to the string parser — confirm before re-running.
sleep_df = convert_iso_seconds(sleep_df, 'bedtime_end')
sleep_df = convert_iso_seconds(sleep_df, 'bedtime_start')

In [7]:
# Keep only the date part of a questionnaire timestamp and ignore the exact time
def remove_hour(string_date):
    """Return the text of ``string_date`` that precedes its first space."""
    date_part, _, _ = string_date.partition(" ")
    return date_part

# Turn an Oura 'YYYY-MM-DD' string into a date object
def convert_sleep_date(sleep_date):
    """Parse a dash-separated year-month-day string into a ``date``."""
    year_text, month_text, day_text = sleep_date.split('-')
    return date(int(year_text), int(month_text), int(day_text))

# Turn a questionnaire 'M/D/YYYY <time>' string into a date object
def convert_questionnaire_date(quest_date):
    """Parse the slash-separated month/day/year before the first space into a ``date``."""
    month_text, day_text, year_text = remove_hour(quest_date).split('/')
    return date(int(year_text), int(month_text), int(day_text))

In [8]:
# Convert all dates into a uniform format
def convert_dates(df, column_name, oura=False):
    """Replace the date strings in ``df[column_name]`` by ``date`` objects.

    The whole column is converted in one pass. The previous row loop looked
    values up with positional ``.iloc`` while passing index *labels* from
    ``iterrows``, which is only correct for a default 0..n-1 index.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the date column. It is modified in place.
    column_name : str
        Name of the column to convert.
    oura : bool, default False
        When True the values are parsed with ``convert_sleep_date``
        (dash-separated year-month-day); otherwise with
        ``convert_questionnaire_date`` (slash-separated month/day/year,
        optionally followed by a time).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    parse = convert_sleep_date if oura else convert_questionnaire_date
    df[column_name] = df[column_name].map(parse)
    return df

# Bring the date column of all three frames to the same representation (date objects).
# NOTE(review): each call overwrites the string column it parses, so this cell is presumably
# not safe to run a second time on the already-converted frames — confirm.
sleep_df = convert_dates(sleep_df, 'day', oura=True)
wakeup_df = convert_dates(wakeup_df, 'Timestamp')
bedtime_df = convert_dates(bedtime_df, 'Timestamp_bedtime')

In [9]:
# Bedtime data entered on day n should be matched to the wakeup and sleep data of day n+1,
# so every bedtime date is moved one day forward.
# The shift is applied to the whole column at once; the earlier row loop read values with
# positional .iloc while using index labels, which is only correct for a default 0..n-1 index.
# NOTE: running this cell more than once shifts the dates by one more day each time.
bedtime_df['Timestamp_bedtime'] = bedtime_df['Timestamp_bedtime'].map(
    lambda entry_day: entry_day + timedelta(days=1)
)

## Combining the three dataframes

In [10]:
# First and last recorded date of each of the three dataframes (taken by row position)
sleep_start_date = sleep_df['day'].iloc[0]
sleep_end_date = sleep_df['day'].iloc[-1]

wakeup_start_date = wakeup_df['Timestamp'].iloc[0]
wakeup_end_date = wakeup_df['Timestamp'].iloc[-1]

bedtime_start_date = bedtime_df['Timestamp_bedtime'].iloc[0]
bedtime_end_date = bedtime_df['Timestamp_bedtime'].iloc[-1]

# Pick either the earlier or the later of two dates, depending on what is required
def compare_two_dates(date1, date2, earliest):
    """Return the earlier of the two dates when ``earliest`` is truthy, else the later.

    When both dates are equal, ``date1`` is returned in either mode.
    """
    if earliest:
        return date2 if date1 > date2 else date1
    return date1 if date1 >= date2 else date2

# Pick either the earliest or the latest of three dates, depending on what is required
def compare_three_dates(sleep_date, wakeup_date, bedtime_date, earliest):
    """Return the earliest of the three dates when ``earliest`` is truthy, else the latest.

    The sleep and wake-up dates are reduced first with ``compare_two_dates``;
    the bedtime date replaces that result only when it is strictly earlier
    (or strictly later, in latest mode).
    """
    candidate = compare_two_dates(sleep_date, wakeup_date, earliest)
    if earliest:
        return bedtime_date if candidate > bedtime_date else candidate
    return bedtime_date if candidate < bedtime_date else candidate

# Time range for which all three data sets simultaneously have data:
# one day after the latest of the three start dates ...
latest_start = compare_three_dates(sleep_start_date, wakeup_start_date, bedtime_start_date, earliest=False)
latest_start = latest_start + timedelta(days=1)
# ... up to one day before the earliest of the three end dates.
earliest_end = compare_three_dates(sleep_end_date, wakeup_end_date, bedtime_end_date, earliest=True)
earliest_end = earliest_end - timedelta(days=1)
print('latest date', latest_start, "\nearliest date", earliest_end)


latest date 2022-06-20 
earliest date 2022-11-19


In [11]:
# List the calendar days of a range for which a dataframe has no entry
def find_missing_days(df, start_date, end_date, column_name):
    """Return the days between ``start_date`` and ``end_date`` (inclusive) absent from ``df``.

    The dates found in ``df[column_name]`` are converted to pandas datetimes
    and removed from the full daily range; what remains is returned as a
    ``DatetimeIndex``. ``df`` itself is left unchanged.
    """
    # Dates present in the data, as a datetime index
    observed_days = pd.to_datetime(df.set_index(column_name).index)

    # Every day of the requested range, minus the ones that were observed
    expected_days = pd.date_range(start_date, end_date)
    return expected_days.difference(observed_days)

In [39]:
# Create a dataframe that goes from latest_start to earliest_end with no jumps in the dates, except that days without data are empty rows.
def complete_df(df, latest_start, earliest_end, column_name, oura=False):
    """Copy the rows of ``df`` into a new frame, inserting a placeholder row per missing day.

    Rows are visited in the order they appear in ``df``. Rows dated before
    ``latest_start`` are skipped (only while no missing day has been handled
    yet), and the loop stops after the row dated ``earliest_end``. Before a
    row is copied, one placeholder row is appended for every day reported by
    ``find_missing_days`` that lies before that row's date. When a row has
    the same date as the previously handled row, an earlier row is removed
    first; how depends on ``oura``.

    Progress is printed for every row.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; its dates are read from ``column_name``.
    latest_start, earliest_end :
        First and last date of the range to cover; compared directly with the
        values in ``column_name``.
    column_name : str
        Name of the date column of ``df``.
    oura : bool, default False
        Selects the duplicate-date handling: a single ``drop`` when True,
        the row-shifting branch when False.

    Returns
    -------
    pandas.DataFrame
        The newly built frame.

    NOTE(review): a recorded run of this function in the notebook ended in
    ``KeyError: 146``. The ``append`` calls do not pass ``ignore_index``, so
    the row labels of ``no_jumps_df`` presumably are not the running counter
    ``counter_added_rows`` that the label-based ``drop``/``.loc``/``.at``
    calls below assume — confirm and fix before relying on the output.
    NOTE(review): ``DataFrame.append`` was deprecated in pandas 1.4 and
    removed in pandas 2.0; this function needs an older pandas as written.
    """
    print('i am in a new data frame')
    # Days of the target range for which df has no row at all
    missing_days = find_missing_days(df, latest_start, earliest_end, column_name)
    no_jumps_df = pd.DataFrame()
    # Counter value at which the most recent placeholder row was added
    last_empty_row = 0
    # How many entries of missing_days have been turned into placeholder rows so far
    counter_missing_days = 0 
    # How many rows the loop believes no_jumps_df currently holds
    counter_added_rows = 0
    for idx, row in df.iterrows():
        sleep_date = row[column_name]
        # Only add the data starting the latest_start
        if counter_missing_days == 0 and sleep_date < latest_start:
            continue
        
        print('at index', counter_added_rows, ', day', row[column_name])

        # Prevents to add data twice for a single day
        # NOTE(review): latest_date is first assigned further down in the loop body; if the
        # first row that reaches this line is dated after latest_start, it is read before
        # it has been assigned.
        if sleep_date > latest_start and sleep_date == latest_date:
            print('twice the same date')
            print('the last index of the df is', counter_added_rows)
            if oura:
                print("i want to remove the last row")
                # NOTE(review): counter_added_rows is one past the number of the last added
                # row at this point — check that this is the label meant to be dropped.
                no_jumps_df = no_jumps_df.drop(counter_added_rows)
                counter_added_rows -= 1
                print('I REMOved the last row, now the last index is', counter_added_rows)
            else:
                # Starting at the last placeholder row, overwrite each row with its successor,
                # then drop the row where the shifting stopped.
                edit_row = last_empty_row
                print("i want to remove the last row, with edit_row=", edit_row, 'and counter_added_rows=', counter_added_rows-1)
                print(no_jumps_df.shape)
                print(no_jumps_df.iloc[counter_added_rows-1][column_name])
                while edit_row < counter_added_rows-1:
                    print('i want to change values of', edit_row)
                    print(no_jumps_df.loc[edit_row])
                    # NOTE(review): .at is given a row key only here — verify this assigns
                    # the whole row as intended.
                    no_jumps_df.at[edit_row] = no_jumps_df.loc[edit_row + 1]
                    edit_row += 1
                    print('done, now I want to change', edit_row)
                no_jumps_df = no_jumps_df.drop(edit_row)
                counter_added_rows -= 1
        # Remember this row's date for the duplicate check of the next row
        latest_date = sleep_date

        # If data is missing from the considered date, add an empty row
        while counter_missing_days < (len(missing_days)) and sleep_date > missing_days[counter_missing_days]:
            # The placeholder is built from [None]; the notebook's later column listing shows
            # an extra column named 0 that a later cell drops — presumably created here.
            no_jumps_df = no_jumps_df.append([None])
            last_empty_row = counter_added_rows
            print('last empty row is updated to', last_empty_row)
            counter_missing_days += 1
            counter_added_rows += 1

        # Copy the current source row as a one-row frame
        no_jumps_df = no_jumps_df.append(pd.DataFrame(row).transpose())
        print('day', row[column_name], 'added at', counter_added_rows)
        counter_added_rows += 1
        
        # End when the earliest_end is reached
        if (sleep_date == earliest_end):
            break
               
    return no_jumps_df

In [40]:
# Create three datasets that cover the same time period (latest_start .. earliest_end).
# Only the Oura frame uses the oura=True duplicate-date handling of complete_df.
complete_sleep_df = complete_df(sleep_df, latest_start, earliest_end, 'day', oura=True)
# complete_sleep_df = complete_df(sleep_df, latest_start, earliest_end, 'day')
complete_wakeup_df = complete_df(wakeup_df, latest_start, earliest_end, 'Timestamp')
complete_bedtime_df = complete_df(bedtime_df, latest_start, earliest_end, 'Timestamp_bedtime')

# The three shapes are printed side by side so their row counts can be compared.
# NOTE(review): the recorded output of this cell ends in "KeyError: 146", i.e. the stored run
# did not reach this print.
print('complete_sleep_df', complete_sleep_df.shape, '\ncomplete_wakeup_df', complete_wakeup_df.shape, '\ncomplete_bedtime_df', complete_bedtime_df.shape)

i am in a new data frame
at index 0 , day 2022-06-20
day 2022-06-20 added at 0
at index 1 , day 2022-06-21
day 2022-06-21 added at 1
at index 2 , day 2022-06-22
day 2022-06-22 added at 2
at index 3 , day 2022-06-23
day 2022-06-23 added at 3
at index 4 , day 2022-06-24
day 2022-06-24 added at 4
at index 5 , day 2022-06-25
day 2022-06-25 added at 5
at index 6 , day 2022-06-26
day 2022-06-26 added at 6
at index 7 , day 2022-06-27
day 2022-06-27 added at 7
at index 8 , day 2022-06-28
day 2022-06-28 added at 8
at index 9 , day 2022-06-30
last empty row is updated to 9
day 2022-06-30 added at 10
at index 11 , day 2022-07-02
last empty row is updated to 11
day 2022-07-02 added at 12
at index 13 , day 2022-07-03
day 2022-07-03 added at 13
at index 14 , day 2022-07-04
day 2022-07-04 added at 14
at index 15 , day 2022-07-05
day 2022-07-05 added at 15
at index 16 , day 2022-07-06
day 2022-07-06 added at 16
at index 17 , day 2022-07-07
day 2022-07-07 added at 17
at index 18 , day 2022-07-08
day 20

KeyError: 146

In [None]:
# NOTE(review): reset_index() returns a new frame and its result is not assigned here, so
# complete_sleep_df is left unchanged by this line. If the intent was to renumber the rows
# before the lookup below, the result has to be stored — confirm the intent.
complete_sleep_df.reset_index()
# Show the row(s) whose index label is 22 (label-based lookup, not positional).
print(complete_sleep_df.loc[[22]])

      0 average_breath average_breath_variation average_heart_rate  \
22  NaN            NaN                      NaN                NaN   

   average_hrv awake_time bedtime_end bedtime_end_delta bedtime_start  \
22         NaN        NaN     22085.0             22085       84545.0   

   bedtime_start_delta  ... restless_periods score segment_state  \
22               -1855  ...              NaN   NaN        active   

   sleep_midpoint sleep_phase_5_min time_in_bed timezone total_sleep_duration  \
22            NaN              None       23940      120                  NaN   

    type wake_ups  
22  rest      NaN  

[1 rows x 57 columns]


In [None]:
# Put the three data sets side by side: after renumbering the rows of each frame,
# the columns of all three end up on the same rows.
df = pd.concat(
    [frame.reset_index() for frame in (complete_sleep_df, complete_wakeup_df, complete_bedtime_df)],
    axis=1,
)

## Pre-process the dataset 

In [None]:
# Identify all the rows with missing data: a row counts as incomplete when the date of at
# least one of the three sources (Oura, wake-up form, bedtime form) is missing.
# The check is done on whole columns at once instead of one positional row lookup per row.
missing_date_idx = df.index[
    df[['day', 'Timestamp', 'Timestamp_bedtime']].isna().any(axis=1)
].tolist()

# Drop the rows with missing data from dataframe in a single call, then renumber the rows.
# reset_index() keeps the old row labels as an extra column; a later cell removes it.
df = df.drop(missing_date_idx).reset_index()

7
9
11
14
18
23
26
30
31
32
35
37
43
47
49
52
56
60
64
66
68
71
77
86
92
96
111
118
139
144
145
146
150
152
153


In [None]:
# List every column name of the combined frame
print(list(df.columns))

['level_0', 'index', 0, 'average_breath', 'average_breath_variation', 'average_heart_rate', 'average_hrv', 'awake_time', 'bedtime_end', 'bedtime_end_delta', 'bedtime_start', 'bedtime_start_delta', 'contributors.deep_sleep', 'contributors.efficiency', 'contributors.latency', 'contributors.rem_sleep', 'contributors.restfulness', 'contributors.timing', 'contributors.total_sleep', 'day', 'deep_sleep_duration', 'efficiency', 'got_ups', 'heart_rate.interval', 'heart_rate.items', 'heart_rate.timestamp', 'hrv.interval', 'hrv.items', 'hrv.timestamp', 'latency', 'light_sleep_duration', 'lowest_heart_rate', 'lowest_heart_rate_time_offset', 'midpoint_at_delta', 'movement_30_sec', 'period', 'readiness.contributors.activity_balance', 'readiness.contributors.body_temperature', 'readiness.contributors.hrv_balance', 'readiness.contributors.previous_day_activity', 'readiness.contributors.previous_night', 'readiness.contributors.recovery_index', 'readiness.contributors.resting_heart_rate', 'readiness.con

In [None]:
# Remove the bookkeeping columns spotted in the column listing above
# (left-over row labels and the placeholder column named 0).
df = df.drop(columns=['level_0', 'index', 0])
df

Unnamed: 0,average_breath,average_breath_variation,average_heart_rate,average_hrv,awake_time,bedtime_end,bedtime_end_delta,bedtime_start,bedtime_start_delta,contributors.deep_sleep,...,Did you take medication that might affect your sleep?,How much did you eat in the last 3 hours before going to bed?,How much stress and/or anxiety do you feel now?,How would you characterize the activities you did in the last 3 hours before going to bed?,"If you did a stress-relieving activity today (meditation, yoga, etc), at what time did you do your last one?","If you did a workout today, at what time did you do your last workout?","If you had alcohol today, at what time did you have your last drink?","If you had coffee today, at what time did you take your last cup of coffee?",Is there any particular reason why you think you might or might not sleep well tonight?,Timestamp_bedtime
0,,,,,,28244.0,28244,44.0,44,,...,No,0,4,"Stressful, Mentally active",,,,,,2022-06-20
1,16.5,2.625,56.05,79.0,2490.0,30195.0,30195,82815.0,-3585,99.0,...,No,0,3,Relaxed,,8:30:00 AM,,,,2022-06-21
2,16.75,2.625,56.07,72.0,2400.0,26475.0,26475,83835.0,-2565,98.0,...,No,0,3,"Relaxed, Mentally active",,,,,,2022-06-22
3,16.5,2.625,57.22,78.0,2130.0,26857.0,26857,84817.0,-1583,94.0,...,No,3,3,"Relaxed, Socially active",,8:00:00 AM,,11:00:00 AM,,2022-06-23
4,,,,,,25475.0,25475,81275.0,-5125,,...,No,0,3,"Stressful, Mentally active",,4:00:00 PM,,,,2022-06-24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114,15.25,2.375,62.64,51.0,3210.0,27293.0,27293,81113.0,-5287,96.0,...,No,1,1,"Physically active, Socially active",,9:15:00 PM,,,,2022-11-10
115,15.0,2.5,57.29,72.0,3510.0,31231.0,31231,4411.0,4411,100.0,...,No,0,1,"Stressful, Socially active",,,2:45:00 AM,,,2022-11-14
116,15.75,2.625,55.67,69.0,3090.0,29209.0,29209,85189.0,-1211,97.0,...,No,0,1,Physically active,,,,,,2022-11-15
117,15.25,2.25,57.62,64.0,3480.0,30078.0,30078,83538.0,-2862,100.0,...,No,0,0,Relaxed,,10:30:00 AM,,,,2022-11-16


In [None]:
# Show all fields of the final row of the combined frame
print(df.iloc[-1, :])

average_breath                                                                                 15.875
average_breath_variation                                                                         2.75
average_heart_rate                                                                              55.93
average_hrv                                                                                      72.0
awake_time                                                                                     2610.0
                                                                                              ...    
If you did a workout today, at what time did you do your last workout?                     8:00:00 PM
If you had alcohol today, at what time did you have your last drink?                       1:00:00 AM
If you had coffee today, at what time did you take your last cup of coffee?                       NaN
Is there any particular reason why you think you might or might not sleep well ton