In [None]:
import numpy as np
import pandas as pd
import os
import scipy as sp
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
from dateutil.relativedelta import relativedelta
import matplotlib.dates as mdates
import sys
import os
import warnings
warnings.filterwarnings('ignore')

In [None]:
# IPython magic: render matplotlib figures inline in the notebook's output cells.
%matplotlib inline

In [None]:
def process_user_scanwatch_activity(watch_u):
    """Aggregate a user's ScanWatch exports into one row per day.

    Parameters
    ----------
    watch_u : list of pandas.DataFrame
        One frame per export. Every frame must have a column whose name
        contains "Timestamp". A frame that also has a column whose name
        contains "Heart Rate" or "HR" is treated as heart-rate data.

    Returns
    -------
    pandas.DataFrame
        Indexed by the UTC calendar date of the timestamps. A heart-rate frame
        contributes `<column>_mean`, `<column>_min` and `<column>_max`; any
        other frame contributes the daily sum of its columns. The per-frame
        results are joined side by side on the date index.

    Raises
    ------
    ValueError
        If a frame has no column containing "Timestamp", or (raised by
        `pd.concat`) if `watch_u` is empty.
    """
    sub_daily = []
    for watch_u_sub in watch_u:
        # Work on a copy so the caller's frame keeps its raw timestamps.
        frame = watch_u_sub.copy()
        # Reset per frame: a date column found in a previous frame must never leak into this one.
        date_col = None
        hr_cols = []
        for c in frame.columns:
            if "Timestamp" in c:
                frame[c] = pd.to_datetime(frame[c], utc=True).dt.date
                date_col = c
            elif "Heart Rate" in c or "HR" in c:
                hr_cols.append(c)
        if date_col is None:
            raise ValueError(
                "ScanWatch frame has no 'Timestamp' column; columns found: "
                + ", ".join(map(str, frame.columns))
            )

        if hr_cols:
            # Aggregate the column that was actually detected; prefer the exact
            # name "Heart Rate" so existing exports keep their output column names.
            hr_col = "Heart Rate" if "Heart Rate" in hr_cols else hr_cols[0]
            daily = frame.groupby(date_col).agg({hr_col: ['mean', 'min', 'max']})
            # Flatten the (column, statistic) MultiIndex into "column_statistic".
            daily.columns = ['_'.join(col).strip() for col in daily.columns.values]
        else:
            daily = frame.groupby(date_col).sum()
        sub_daily.append(daily)
    return pd.concat(sub_daily, axis=1)
    

In [None]:

def process_user_sleep_state(sleep_u):
    """Total the minutes spent in each sleep state per night.

    Parameters
    ----------
    sleep_u : pandas.DataFrame
        One row per sleep-state segment with the columns 'Start time',
        'End time' and 'Sleep state'. Every column whose name contains "time"
        is parsed as a UTC timestamp. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'date', one column per distinct value of 'Sleep state',
        holding the summed segment durations in minutes.

    Notes
    -----
    A segment is assigned to the date of its start time shifted back by
    12 hours, so everything that starts between noon (UTC) of day D and noon of
    day D+1 is counted under day D.
    """
    # Work on a copy: shifting the caller's timestamps in place would make a
    # second call on the same frame shift them twice.
    segments = sleep_u.copy()
    for c in segments.columns:
        if "time" in c:
            segments[c] = pd.to_datetime(segments[c], utc=True)

    # The 12 h shift only affects which date a segment is booked under; the
    # duration End - Start is unchanged by shifting both ends.
    night_date = (segments['Start time'] - pd.to_timedelta(12, unit='h')).dt.date
    # total_seconds() behaves the same across pandas versions and yields NaN
    # (instead of raising) when a timestamp is missing.
    duration_min = (segments['End time'] - segments['Start time']).dt.total_seconds() / 60

    # One column per sleep state; each row carries its duration in the column
    # of its own state and 0 elsewhere.
    one_hot = pd.get_dummies(segments['Sleep state'], dtype=float)
    one_hot = one_hot.mul(duration_min.to_numpy(), axis=0)

    sleep_daily = pd.concat([night_date.rename('date'), one_hot], axis=1)
    sleep_daily = sleep_daily.groupby('date').sum()
    return sleep_daily

In [None]:
def process_user_sleep_physio(sleep_u):
    """Aggregate a user's sleep physiology samples into one row per day.

    Parameters
    ----------
    sleep_u : pandas.DataFrame
        Samples with the columns 'Timestamp', 'Heart Rate', 'Respiration Rate',
        'Snoring' and 'SDNN_1'. Every column whose name contains "Timestamp" is
        parsed as a UTC timestamp. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'date' (UTC calendar date of 'Timestamp') with the columns
        '<feature>_mean', '<feature>_min', '<feature>_max' for 'Heart Rate',
        'Respiration Rate' and 'SDNN_1', plus 'Snoring_sum'.
    """
    # Work on a copy so the caller's frame is neither re-typed nor given an extra 'date' column.
    physio = sleep_u.copy()
    for c in physio.columns:
        if "Timestamp" in c:
            physio[c] = pd.to_datetime(physio[c], utc=True)

    ## the timestamps of physio features are the same
    # NOTE(review): this uses the unshifted UTC date, whereas
    # process_user_sleep_state books segments under the date of
    # (start - 12 h). Samples taken after midnight therefore land on a
    # different date than the sleep-state totals of the same night - confirm
    # this is intended.
    physio['date'] = physio['Timestamp'].dt.date

    summary_stats = ['mean', 'min', 'max']
    daily = physio.groupby('date').agg({
        "Heart Rate": summary_stats,
        "Respiration Rate": summary_stats,
        "Snoring": 'sum',
        "SDNN_1": summary_stats,
    })
    # Flatten the (feature, statistic) MultiIndex into "feature_statistic".
    daily.columns = ['_'.join(col).strip() for col in daily.columns.values]
    return daily

## Read and aggregate individual data from devices on a daily basis

In [None]:
## read raw watch and sleep data of each participant and aggregate it per day
# Expected layout: <path>/<user_id>/{ScanWatch_HR,ScanWatch_Steps,Sleep_state,Sleep_physio}.csv
path = "./Sleepmat_Watch_Data"
# Sorted so the row order of the result does not depend on the file system's listing order.
files = sorted(os.listdir(path))
# Collect the per-user frames and concatenate once at the end; concatenating
# inside the loop re-copies all previously loaded rows on every iteration.
user_daily_frames = []
for uid in files:
    print('user id: ',uid)

    fpath = os.path.join(path,uid)
    if not os.path.isdir(fpath):
        print('not dir: ',fpath)
        continue
    try:
        watch_u_hr = pd.read_csv(os.path.join(fpath,"ScanWatch_HR.csv"))
        watch_u_stp = pd.read_csv(os.path.join(fpath,"ScanWatch_Steps.csv"))
        watch_u = [watch_u_hr,watch_u_stp]

        sleep_u_state = pd.read_csv(os.path.join(fpath,"Sleep_state.csv"))
        sleep_u_state = sleep_u_state.drop_duplicates()
        sleep_u_physio = pd.read_csv(os.path.join(fpath,"Sleep_physio.csv"))
        sleep_u_physio = sleep_u_physio.drop_duplicates()
    except FileNotFoundError as err:
        # Skip this participant. Without the `continue` the code below would
        # re-process the previous participant's frames under this user id
        # (or fail with a NameError for the very first participant).
        print(f'{fpath}: skipped, missing file {err.filename}')
        continue

    watch_u_daily = process_user_scanwatch_activity(watch_u)
    # Prefix the watch columns so they cannot collide with the sleep-mat columns of the same name.
    watch_u_daily = watch_u_daily.rename(columns={c:'Watch_'+c for c in watch_u_daily.columns})

    sleep_u_state_daily = process_user_sleep_state(sleep_u_state)
    sleep_u_phsio_daily = process_user_sleep_physio(sleep_u_physio)

    # Join the three daily tables side by side on their date index.
    user_daily = pd.concat([watch_u_daily,sleep_u_state_daily,sleep_u_phsio_daily],axis=1)
    user_daily["user_id"] = uid

    user_daily_frames.append(user_daily)

# pd.concat raises on an empty list, so fall back to an empty frame when no participant was loaded.
all_users_daily = pd.concat(user_daily_frames,axis=0) if user_daily_frames else pd.DataFrame()
all_users_daily = all_users_daily.reset_index()

In [None]:
# The preceding `reset_index()` moved the per-day index into a regular column;
# pandas calls that column "index" when the index has no name, so label it "date".
all_users_daily = all_users_daily.rename({"index": "date"}, axis="columns")
all_users_daily.head(5)

In [None]:
## drop sleep data that are invalid
# A row is invalid when no light sleep was recorded and either
#   * deep sleep is non-zero (or missing),
#   * REM sleep is non-zero (or missing), or
#   * deep and REM are both zero as well.
# Missing values compare as "!= 0", so taken together these rules remove every
# row whose `light` value is exactly 0; rows where `light` is NaN are kept.
no_light = all_users_daily["light"] == 0
deep_nonzero = all_users_daily["deep"] != 0
rem_nonzero = all_users_daily["REM"] != 0
all_zero = (all_users_daily["REM"] == 0) & (all_users_daily["deep"] == 0)

invalid_rows = no_light & (deep_nonzero | rem_nonzero | all_zero)
all_users_daily.drop(index=all_users_daily[invalid_rows].index, inplace=True)

In [None]:
# Total sleep per row in minutes = light + deep + REM. `min_count=1` leaves the
# total as NaN when all three states are missing, while a partially missing row
# still sums the states that are present.
stage_minutes = all_users_daily[['light', 'deep', 'REM']]
all_users_daily['sleep_duration'] = stage_minutes.sum(axis=1, min_count=1)

# Discard rows reporting more than 18 hours of sleep.
too_long = all_users_daily['sleep_duration'] > 18*60
all_users_daily.drop(index=all_users_daily[too_long].index, inplace=True)
all_users_daily.columns

## Load demographic information

In [None]:
# Participant-level table; later cells read its 'user_id', 'Sex', 'Age group',
# 'Essential hypertension' and 'Osteoarthritis' columns.
demographic = pd.read_csv('./Demographics.csv')
demographic.head(5)

In [None]:
# List every column available in the demographics table.
print(demographic.columns.tolist())

In [None]:

# Count participants for each True/False combination of the two diagnosis flags.
# Rows where either flag is missing match no combination and are not counted.
for e in (True, False):
    hypertension_matches = demographic['Essential hypertension'] == e
    for o in (True, False):
        osteoarthritis_matches = demographic['Osteoarthritis'] == o
        count = len(demographic[hypertension_matches & osteoarthritis_matches])
        print(f'Hypertension: {e}, Osteoarthritis: {o}, Count: {count}')

In [None]:
# Number of participants with a missing value for each diagnosis flag.
tuple(demographic[flag].isna().sum() for flag in ('Essential hypertension', 'Osteoarthritis'))

In [None]:
# (missing, male, female) participant counts.
sex = demographic['Sex']
sex.isna().sum(), sex.eq('Male').sum(), sex.eq('Female').sum()

In [None]:
# Participants per age group; missing age groups are excluded (value_counts drops NaN by default).
age_group_counts = demographic['Age group'].value_counts()
print("Counts for each age group:", age_group_counts, sep="\n")

In [None]:
## add demographic information to aggregated daily data
# Build one lookup row per participant. The first occurrence wins, which
# matches the previous `.values[0]` lookup when a user_id is listed twice.
demographic_by_user = demographic.drop_duplicates(subset='user_id').set_index('user_id')

# A single vectorised map per column replaces a full scan of `demographic`
# for every daily row.
for demo_col in ['Sex', 'Age group']:
    all_users_daily[demo_col] = all_users_daily['user_id'].map(demographic_by_user[demo_col])

# Participants without a demographics entry get NaN instead of aborting the
# cell with an IndexError; report them so the gap is visible.
# NOTE(review): user ids in `all_users_daily` come from directory names
# (strings) - confirm `demographic.user_id` has the same dtype.
unmatched_users = all_users_daily.loc[
    ~all_users_daily['user_id'].isin(demographic_by_user.index), 'user_id'
].unique()
if len(unmatched_users) > 0:
    print('no demographic record for user ids: ', list(unmatched_users))
print(all_users_daily.columns)

In [None]:
import pickle

# Bundle everything the downstream analysis needs into a single object.
data_to_save = {
    "df": all_users_daily,        # daily aggregated data for all users
    "demographics": demographic,  # demographic information of the users
}

# Serialise the bundle to "preprocessed_data.pkl" in the working directory,
# overwriting any existing file.
with open("preprocessed_data.pkl", "wb") as pickle_file:
    pickle.dump(data_to_save, pickle_file)