In [1]:
import numpy as np
import pandas as pd
import os
import scipy as sp
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
from dateutil.relativedelta import relativedelta
import matplotlib.dates as mdates
import sys
import os
import warnings
warnings.filterwarnings('ignore')

In [2]:
%matplotlib inline

In [3]:
def process_user_scanwatch_activity(watch_u):
    """Aggregate a user's ScanWatch exports to one row per calendar day.

    Parameters:
        watch_u (list of DataFrames): One DataFrame per watch export. Each
            frame must have a column whose name contains "Timestamp".

    Returns:
        DataFrame: The per-frame daily aggregates concatenated side by side.
            A frame with a heart-rate column (name containing "Heart Rate"
            or "HR") contributes "Heart Rate_mean", "Heart Rate_min" and
            "Heart Rate_max"; any other frame contributes the daily sum of
            all its remaining columns. An empty input list yields an empty
            DataFrame.

    Raises:
        ValueError: If a frame has no column containing "Timestamp".
    """
    sub_daily = []
    for watch_u_sub in watch_u:
        # Work on a copy so the caller's raw frame keeps its original timestamps.
        frame = watch_u_sub.copy()
        # Reset per frame: a frame without a timestamp must not silently reuse
        # the date column found in the previous frame.
        date_col = None
        hr_cols = []
        for c in frame.columns:
            if "Timestamp" in c:
                # Timestamps are parsed as UTC and reduced to the UTC calendar date.
                frame[c] = pd.to_datetime(frame[c], utc=True).dt.date
                date_col = c
            if "Heart Rate" in c or "HR" in c:
                hr_cols.append(c)
        if date_col is None:
            raise ValueError(
                "ScanWatch frame has no 'Timestamp' column; columns: "
                + str(list(frame.columns))
            )
        if hr_cols:
            # Aggregate the column that was actually detected (previously the
            # literal "Heart Rate" was always used, which failed for "HR").
            hr_col = "Heart Rate" if "Heart Rate" in hr_cols else hr_cols[0]
            daily = frame.groupby(date_col)[hr_col].agg(['mean', 'min', 'max'])
            # Keep one stable output schema regardless of the source column name.
            daily.columns = ["Heart Rate_" + stat for stat in daily.columns]
        else:
            daily = frame.groupby(date_col).sum()
        sub_daily.append(daily)
    if not sub_daily:
        return pd.DataFrame()
    return pd.concat(sub_daily, axis=1)
    

In [4]:

def process_user_sleep_state(sleep_u):
    """Total the minutes spent in each sleep state per day.

    Parameters:
        sleep_u (DataFrame): Sleep-state segments with 'Start time',
            'End time' and 'Sleep state' columns. The frame is not modified.

    Returns:
        DataFrame: Indexed by 'date', one column per distinct 'Sleep state'
            value, holding the summed segment durations in minutes.
    """
    # Both ends are moved back 12 hours before the date is taken, so a segment
    # that starts before noon UTC is counted on the previous calendar day.
    # The shift cancels out in the duration itself.
    shift = pd.to_timedelta(12, unit='h')
    start = pd.to_datetime(sleep_u['Start time'], utc=True) - shift
    end = pd.to_datetime(sleep_u['End time'], utc=True) - shift

    # total_seconds() behaves the same across pandas versions; a missing
    # timestamp gives a NaN duration instead of failing in an integer cast.
    duration_min = (end - start).dt.total_seconds() / 60

    # One indicator column per state, scaled row by row by the segment duration.
    one_hot = pd.get_dummies(sleep_u['Sleep state'], dtype=float)
    one_hot = one_hot.mul(duration_min.to_numpy(), axis=0)

    sleep_daily = pd.concat([start.dt.date.rename('date'), one_hot], axis=1)
    return sleep_daily.groupby('date').sum()

In [5]:
def process_user_sleep_physio(sleep_u):
    """Summarise a user's sleep physiology readings per calendar day.

    Parameters:
        sleep_u (DataFrame): Readings with 'Timestamp', 'Heart Rate',
            'Respiration Rate', 'Snoring' and 'SDNN_1' columns. The frame is
            not modified.

    Returns:
        DataFrame: Indexed by 'date' (UTC calendar date of 'Timestamp') with
            mean/min/max of 'Heart Rate', 'Respiration Rate' and 'SDNN_1' and
            the sum of 'Snoring'; columns are named '<feature>_<statistic>'.
    """
    # NOTE(review): process_user_sleep_state shifts times back 12 h before
    # taking the date, while no shift is applied here, so after-midnight
    # readings land on a different day than the matching sleep states --
    # confirm this is intended.
    date = pd.to_datetime(sleep_u['Timestamp'], utc=True).dt.date.rename('date')

    daily = sleep_u.groupby(date).agg({
        "Heart Rate": ['mean', 'min', 'max'],
        "Respiration Rate": ['mean', 'min', 'max'],
        "Snoring": 'sum',
        "SDNN_1": ['mean', 'min', 'max'],
    })
    # Flatten the (feature, statistic) column pairs into single names.
    daily.columns = ['_'.join(col).strip() for col in daily.columns.values]
    return daily

## Read and aggregate individual data from devices on a daily basis

In [6]:
## read raw watch and sleep data of each participant and aggregate it per day
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
path = "/Users/ncespede/Desktop/Resilient-Dataset-main/Resilient-Internal-main/notebooks/Sleepmat_Watch_Data"
files = os.listdir(path)
# Collect the per-user frames and concatenate once at the end instead of
# growing a DataFrame inside the loop.
users_daily = []
for uid in files:
    print('user id: ',uid)

    fpath = os.path.join(path,uid)
    if not os.path.isdir(fpath):
        print('not dir: ',fpath)
        continue
    try:
        watch_u_hr = pd.read_csv(os.path.join(fpath,"ScanWatch_HR.csv"))
        watch_u_stp = pd.read_csv(os.path.join(fpath,"ScanWatch_Steps.csv"))
        watch_u = [watch_u_hr,watch_u_stp]

        sleep_u_state = pd.read_csv(os.path.join(fpath,"Sleep_state.csv"))
        sleep_u_state = sleep_u_state.drop_duplicates()
        sleep_u_physio = pd.read_csv(os.path.join(fpath,"Sleep_physio.csv"))
        sleep_u_physio = sleep_u_physio.drop_duplicates()
    except FileNotFoundError as err:
        # Skip this user: falling through would reuse the frames loaded for
        # the previous user and file them under this user's id.
        print(f"{fpath}: missing input file ({err.filename}), skipping user")
        continue

    watch_u_daily = process_user_scanwatch_activity(watch_u)
    # Prefix the watch features so they cannot clash with the sleep-mat
    # heart-rate columns.
    watch_u_daily = watch_u_daily.rename(columns={c:'Watch_'+c for c in watch_u_daily.columns})

    sleep_u_state_daily = process_user_sleep_state(sleep_u_state)
    sleep_u_phsio_daily = process_user_sleep_physio(sleep_u_physio)

    # Align the three daily tables on their date index.
    user_daily = pd.concat([watch_u_daily,sleep_u_state_daily,sleep_u_phsio_daily],axis=1)
    user_daily["user_id"] = uid
    users_daily.append(user_daily)

all_users_daily = pd.concat(users_daily,axis=0) if users_daily else pd.DataFrame()
all_users_daily = all_users_daily.reset_index()

user id:  7a61b537
user id:  31489056
user id:  f276ebe4
user id:  f5ca38f7
user id:  4ec9599f
user id:  9400f1b2
user id:  2c624232
user id:  811786ad
user id:  2858dcd1
user id:  bf6dbcba
user id:  .DS_Store
not dir:  /Users/ncespede/Desktop/Resilient-Dataset-main/Resilient-Internal-main/notebooks/Sleepmat_Watch_Data/.DS_Store
user id:  b7a56873
user id:  eb1e33e8
user id:  3d914f93
user id:  6b51d431
user id:  8527a891
user id:  c6f3ac57
user id:  c837649c
user id:  ddd9cd98
user id:  73475cb4
user id:  4a44dc15
user id:  44cb730c
user id:  4b227777
user id:  1a656259
user id:  5e86cb4b
user id:  d59eced1
user id:  25fc0e70
user id:  d65ca5bf
user id:  ef2d127d
user id:  4e074085
user id:  4523540f
user id:  3ff1feb9
user id:  76a50887
user id:  44d88da8
user id:  2fca346d
user id:  02d20bbd
user id:  6208ef0f
user id:  535fa30d
user id:  6b86b273
user id:  86e50149
user id:  41cfc0d1
user id:  c2356069
user id:  0e17daca
user id:  35135aaa
user id:  e629fa65
user id:  71ee45a3
user

In [7]:
# reset_index() left the day in a column called "index"; give it a proper name.
all_users_daily = all_users_daily.rename(columns=dict(index="date"))
all_users_daily.head()

Unnamed: 0,date,Watch_Heart Rate_mean,Watch_Heart Rate_min,Watch_Heart Rate_max,Watch_Steps,REM,deep,light,wakeup,Heart Rate_mean,Heart Rate_min,Heart Rate_max,Respiration Rate_mean,Respiration Rate_min,Respiration Rate_max,Snoring_sum,SDNN_1_mean,SDNN_1_min,SDNN_1_max,user_id
0,2024-10-08,70.0,66.0,78.0,,69.0,196.0,179.0,95.0,68.748428,64.0,78.0,14.440252,11.0,17.0,0.0,35.037736,0.0,49.0,7a61b537
1,2024-10-09,73.970803,55.0,102.0,3014.0,80.0,174.0,220.0,71.0,69.080882,62.0,80.0,14.018382,10.0,26.0,1300.0,37.542279,0.0,49.0,7a61b537
2,2024-10-10,74.423077,42.0,101.0,3108.0,46.0,190.0,218.0,55.0,70.961382,62.0,81.0,13.896341,9.0,22.0,1800.0,38.764228,0.0,42.0,7a61b537
3,2024-10-11,72.550388,60.0,188.0,3036.0,60.0,136.0,274.0,102.0,67.516981,48.0,77.0,13.716981,9.0,20.0,400.0,38.296226,0.0,49.0,7a61b537
4,2024-10-12,69.744,58.0,104.0,3207.0,31.0,159.0,280.0,87.0,66.51751,60.0,72.0,14.178988,10.0,24.0,200.0,38.937743,0.0,45.0,7a61b537


In [8]:
## drop sleep data that are invalid
# A day is dropped when no light sleep was recorded and either deep sleep is
# non-zero, REM is non-zero, or REM and deep are both zero as well.
no_light = all_users_daily.light == 0
deep_without_light = no_light & (all_users_daily.deep != 0)
rem_without_light = no_light & (all_users_daily.REM != 0)
nothing_recorded = no_light & (all_users_daily.REM == 0) & (all_users_daily.deep == 0)
invalid_days = all_users_daily.index[deep_without_light | rem_without_light | nothing_recorded]
all_users_daily.drop(index=invalid_days, inplace=True)

In [9]:
# Total sleep is the sum of the three sleep stages (wakeup is excluded).
# min_count=1 leaves the total as NaN when all three stages are missing.
stage_cols = ['light', 'deep', 'REM']
all_users_daily['sleep_duration'] = all_users_daily[stage_cols].sum(axis=1, min_count=1)
# Discard days whose total exceeds 18 hours (values are in minutes).
over_limit = all_users_daily.index[all_users_daily.sleep_duration > 18*60]
all_users_daily.drop(index=over_limit, inplace=True)
all_users_daily.columns

Index(['date', 'Watch_Heart Rate_mean', 'Watch_Heart Rate_min',
       'Watch_Heart Rate_max', 'Watch_Steps', 'REM', 'deep', 'light', 'wakeup',
       'Heart Rate_mean', 'Heart Rate_min', 'Heart Rate_max',
       'Respiration Rate_mean', 'Respiration Rate_min', 'Respiration Rate_max',
       'Snoring_sum', 'SDNN_1_mean', 'SDNN_1_min', 'SDNN_1_max', 'user_id',
       'sleep_duration'],
      dtype='object')

## Load demographic information

In [10]:
# One row per participant: sex, age group, diagnoses and questionnaire items.
# NOTE(review): absolute, machine-specific path.
demographic = pd.read_csv(
    '/Users/ncespede/Desktop/Resilient-Dataset/finallll_ace.csv'
)
demographic.head()

Unnamed: 0,user_id,Sex,Age group,Essential hypertension,Osteoarthritis,phq_date,phq1,phq2,phq3,phq4,...,ace24g_6months,ace24h_6months,ace24i_6months,ace24j_6months,ace_total_6months,attention_subscale_6months,memory_subscale_6months,fluency_subscale_6months,language_subscale_6months,visuospatial_subscale_6months
0,6b86b273,Female,"[76, 87]",True,False,25/08/2023,0,1,1,1.0,...,,1.0,,,89.0,12.0,22.0,13.0,26.0,16.0
1,4e074085,Female,"[76, 87]",True,True,17/10/2023,0,0,3,0.0,...,,1.0,1.0,,88.0,18.0,19.0,13.0,26.0,12.0
2,4b227777,Male,"[76, 87]",True,True,17/10/2023,0,0,1,1.0,...,,,,,,,,,,
3,ef2d127d,Male,"[76, 87]",True,True,26/10/2023,0,0,0,0.0,...,1.0,1.0,1.0,1.0,73.0,13.0,15.0,10.0,25.0,10.0
4,93e3d8c5,Female,"[76, 87]",,,26/10/2023,1,1,0,1.0,...,,,,,,,,,,


In [11]:
# Print the full column list; the default Index repr would truncate it.
print(demographic.columns.tolist())

['user_id', 'Sex', 'Age group', 'Essential hypertension', 'Osteoarthritis', 'phq_date', 'phq1', 'phq2', 'phq3', 'phq4', 'phq5', 'phq6', 'phq7', 'phq8', 'phq9', 'phq_extraq', 'phq_total', 'gad_date', 'gad1', 'gad2', 'gad3', 'gad4', 'gad5', 'gad6', 'gad7', 'gad_7_additional_question', 'gad_total', 'gds_date', 'gds1', 'gds2', 'gds3', 'gds4', 'gds5', 'gds6', 'gds7', 'gds8', 'gds9', 'gds10', 'gds11', 'gds12', 'gds13', 'gds14', 'gds15', 'gds_total', 'ace_date', 'ace1a', 'ace1b', 'ace1c', 'ace1d', 'ace1e', 'ace2a', 'ace2b', 'ace2c', 'ace2d', 'ace2e', 'ace3a', 'ace3b', 'ace3c', 'ace3d', 'ace4', 'ace5a', 'ace5b', 'ace5c', 'ace6', 'ace7', 'ace8a', 'ace8b', 'ace8c', 'ace8d', 'ace8e', 'ace8f', 'ace8g', 'ace9a', 'ace9b', 'ace9c', 'ace9d', 'ace10a', 'ace10b', 'ace10c', 'ace10d', 'ace11', 'ace12', 'ace13', 'ace14', 'ace15a', 'ace15b', 'ace15c', 'ace15d', 'ace15e', 'ace15f', 'ace15g', 'ace15h', 'ace15i', 'ace15j', 'ace15k', 'ace15l', 'ace16a', 'ace16b', 'ace16c', 'ace16d', 'ace17', 'ace18', 'ace19', '

In [12]:

# Count participants for every True/False combination of the two diagnoses.
# Rows where either diagnosis is missing match no combination.
hypertension = demographic['Essential hypertension']
osteoarthritis = demographic['Osteoarthritis']
for e, o in [(True, True), (True, False), (False, True), (False, False)]:
    count = int(((hypertension == e) & (osteoarthritis == o)).sum())
    print(f'Hypertension: {e}, Osteoarthritis: {o}, Count: {count}')

Hypertension: True, Osteoarthritis: True, Count: 24
Hypertension: True, Osteoarthritis: False, Count: 19
Hypertension: False, Osteoarthritis: True, Count: 4
Hypertension: False, Osteoarthritis: False, Count: 8


In [13]:
# Number of participants with a missing value for each diagnosis.
tuple(
    demographic[diagnosis].isna().sum()
    for diagnosis in ('Essential hypertension', 'Osteoarthritis')
)

(np.int64(18), np.int64(18))

In [14]:
# (missing, male, female) participant counts.
sex = demographic['Sex']
(sex.isna().sum(), (sex == 'Male').sum(), (sex == 'Female').sum())

(np.int64(1), np.int64(30), np.int64(42))

In [15]:
# Participants per age bracket; value_counts() leaves out missing values by default.
age_group_counts = demographic['Age group'].value_counts()
print("Counts for each age group:", age_group_counts, sep="\n")

Counts for each age group:
Age group
[76, 87]    45
[88, 99]    14
[72, 75]    11
Name: count, dtype: int64


In [16]:
## add demographic information to aggregated daily data
# Build a one-row-per-user lookup once (first occurrence wins, as with a
# first-match search) and map it onto the daily rows, instead of scanning
# the demographic table again for every daily row.
demo_by_user = demographic.drop_duplicates(subset='user_id').set_index('user_id')
for demo_col in ['Sex', 'Age group']:
    all_users_daily[demo_col] = all_users_daily['user_id'].map(demo_by_user[demo_col])

# Users absent from the demographic table get NaN (previously an IndexError);
# report them so the gap is visible.
unmatched_users = all_users_daily.loc[
    ~all_users_daily['user_id'].isin(demo_by_user.index), 'user_id'
].unique()
if len(unmatched_users):
    print('user ids without demographic information:', list(unmatched_users))
print(all_users_daily.columns)

Index(['date', 'Watch_Heart Rate_mean', 'Watch_Heart Rate_min',
       'Watch_Heart Rate_max', 'Watch_Steps', 'REM', 'deep', 'light', 'wakeup',
       'Heart Rate_mean', 'Heart Rate_min', 'Heart Rate_max',
       'Respiration Rate_mean', 'Respiration Rate_min', 'Respiration Rate_max',
       'Snoring_sum', 'SDNN_1_mean', 'SDNN_1_min', 'SDNN_1_max', 'user_id',
       'sleep_duration', 'Sex', 'Age group'],
      dtype='object')


In [17]:
import pickle

# Bundle the two tables that are persisted together:
#   "df"           - daily processed data for all users
#   "demographics" - demographic information of users
data_to_save = dict(df=all_users_daily, demographics=demographic)

# Write the bundle to "preprocessed_data.pkl" (binary mode, overwriting any existing file).
with open("preprocessed_data.pkl", mode="wb") as f:
    pickle.dump(data_to_save, f)