In [2]:
# imports

from pathlib import Path
from collections import namedtuple
from zoneinfo import ZoneInfo
from datetime import datetime
import os
import glob
import json
import pandas as pd
import numpy as np

In [4]:
# find min/max heart rate for every day

def timestamp_hr(csv_file):
    list_min_hr = []
    list_max_hr = []
    list_day = []

    df = pd.read_csv(csv_file)
    
    if df.empty or 'timestamp' not in df.columns or 'heartrate' not in df.columns:
        return [],[],[]

    unix_val = df['timestamp'].loc[0] # unix time in first row
    prev_day = datetime.fromtimestamp(unix_val,tz=ZoneInfo("America/New_York")).day
    min = df['heartrate'].loc[0]
    max = df['heartrate'].loc[0]

    for i in range(1,len(df)): # iterates through each row in the dataframe
        timestamp = df['timestamp'].loc[i]
        curr_day = datetime.fromtimestamp(timestamp,tz=ZoneInfo("America/New_York")).day
        curr_hr = df['heartrate'].loc[i]
        if prev_day == curr_day:
            if (curr_hr < min):
                min = curr_hr
            elif (curr_hr > max):
                max = curr_hr
        elif curr_day != prev_day:
            list_min_hr.append(min)
            list_max_hr.append(max)
            list_day.append(unix_val)
            
            min = curr_hr
            max = curr_hr
            unix_val = timestamp
            prev_day = curr_day

    unix_val = df['timestamp'].loc[len(df.index)-1]
    list_min_hr.append(min)
    list_max_hr.append(max)
    list_day.append(unix_val)

    return list_min_hr, list_max_hr, list_day

In [None]:
# ISSUE FIX -- fixed

minimum, maximum, day = timestamp_hr(r"C:\Users\VictoriaAgain\Downloads\download\80929938f623f44614a029108a740d00\combined_heartrate.csv")

print(len(day))

datelist = []
for i in range(len(day)):
    date = datetime.fromtimestamp(day[i],tz=ZoneInfo("America/New_York")).date()
    print(date)
    # datelist.append(date)
# print(datelist)
# convert_day = []
# for i in range(len(day)):
#     convert_day.append(datetime.fromtimestamp(day[i]).day)
# print(convert_day)

In [8]:
# total steps by day for one patient given csv file path

def timestamp_total_step(csv_file):
    list_total_step = []
    list_day_step = []
    
    df = pd.read_csv(csv_file)

    if df.empty or 'stop' not in df.columns or 'steps' not in df.columns:
        return [],[]
    
    unix_prev = df['stop'].loc[0]
    temp_day_prev = datetime.fromtimestamp(df['stop'].loc[0]).day
    temp_total_step = 0
    
    for i in df.index:
        val = df['stop'].loc[i]
        temp_day_curr = datetime.fromtimestamp(val).day
        if temp_day_prev == temp_day_curr:
            temp_total_step += df['steps'].loc[i]
        elif temp_day_curr != temp_day_prev:
            list_day_step.append(unix_prev)
            list_total_step.append(temp_total_step)
            unix_prev = df['stop'].loc[i]
            temp_total_step = 0
            temp_day_prev = temp_day_curr

    # for last day otherwise not included
    unix_prev = df['stop'].loc[len(df.index)-1]
    list_day_step.append(unix_prev)
    list_total_step.append(temp_total_step)
    
    return list_total_step, list_day_step

In [None]:
# testing
step,day = timestamp_total_step(r"C:\Users\VictoriaAgain\Downloads\download\80929938f623f44614a029108a740d00\combined_steps.csv")
step
# datelist = []
# for i in range(len(day)):
#     date = datetime.fromtimestamp(day[i],tz=ZoneInfo("America/New_York")).date()
#     print(date)
# df = pd.read_csv(r"C:\Users\VictoriaAgain\Downloads\download\90b8c2e2c843ffd77f8621c7d6ed044d\combined_steps.csv")

In [26]:
# normalizing heartrate and steps for each patient before merging lists
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

def norm_data(min, max, step):
    min = np.array(min)
    max = np.array(max)
    step = np.array(step)

    standard_min, standard_max, standard_step = [], [], []

    if (min.size > 0 and max.size > 0 and step.size > 0):
        # sklearn normalization
        # norm_hr = preprocessing.normalize(hr.reshape(-1,1))
        # norm_step = preprocessing.normalize(step.reshape(-1,1))
    
        # using minmaxscaler
        # scaler_hr = preprocessing.MinMaxScaler(feature_range=(0,10))
        # scaler_step = preprocessing.MinMaxScaler(feature_range=(0,10))
        # minmax_hr = scaler_hr.fit_transform(hr.reshape(-1,1))
        # minmax_step = scaler_step.fit_transform(step.reshape(-1,1))
    
        # using zscore norm/standardization
        z_scaler_min = StandardScaler()
        z_scaler_max = StandardScaler()
        z_scaler_step = StandardScaler()
        standard_min = z_scaler_min.fit_transform(min.reshape(-1,1))
        standard_max = z_scaler_max.fit_transform(max.reshape(-1,1))
        standard_step = z_scaler_step.fit_transform(step.reshape(-1,1))
    
    return(standard_min, standard_max, standard_step)

In [None]:
# testing normalization
step_arr, step_day = timestamp_total_step(r"C:\Users\VictoriaAgain\Downloads\download\9b7f67d9f8d32605492cfa1c7282d35a\combined_steps.csv")
min,max, hr_day = timestamp_hr(r"C:\Users\VictoriaAgain\Downloads\download\9b7f67d9f8d32605492cfa1c7282d35a\combined_heartrate.csv")
standard_m, standard_ma, standard_step = norm_data(min,max,step_arr)
standard_step

In [30]:
# ISSUE FIX -- repeats min/max 6 times for each day
def compile_step_hr(hr_file,step_file):
    step_arr, step_day = timestamp_total_step(step_file)
    min, max, hr_day = timestamp_hr(hr_file)

    min, max, step_arr = norm_data(min, max, step_arr)
    combined_arr = []

    if (len(min) > 0 and len(max) > 0 and len(step_arr) > 0):
        min = min.flatten()
        max = max.flatten()
        step_arr = step_arr.flatten()
        
        list_day = []
        list_max = []
        list_min = []
        list_step = []
    
        # smaller functions work -- same dates -- something here is WRONG
        for i in range(np.minimum(len(hr_day),len(step_day))):  # whichever is smaller
            hr_date = datetime.fromtimestamp(hr_day[i],tz=ZoneInfo("America/New_York")).date()
            step_date = datetime.fromtimestamp(step_day[i],tz=ZoneInfo("America/New_York")).date()
            # hr_month = datetime.fromtimestamp(hr_day[i],tz=ZoneInfo("America/New_York")).month
            # hr_datetime_day = datetime.fromtimestamp(hr_day[i],tz=ZoneInfo("America/New_York")).day
            # step_month = datetime.fromtimestamp(step_day[i],tz=ZoneInfo("America/New_York")).month
            # step_datetime_day = datetime.fromtimestamp(step_day[i],tz=ZoneInfo("America/New_York")).day
            
            # need double for loop in case we skip a day!
            # print(hr_month, ",", hr_datetime_day)
            # print(step_month, ",", step_datetime_day)
            # if (hr_month == step_month and hr_datetime_day == step_datetime_day): # checking if date is the same
            if hr_date == step_date:
                list_day.append(hr_day[i])
                list_max.append(max[i])
                list_min.append(min[i])
                list_step.append(step_arr[i])
            # elif (hr_datetime_day > step_datetime_day): # heartrate skips, go through rest of step days
            elif hr_date > step_date:
                for j in range(i,len(step_day)):
                    # if (hr_month == step_month and hr_datetime_day == datetime.fromtimestamp(step_day[j],tz=ZoneInfo("America/New_York")).day):
                    if datetime.fromtimestamp(step_day[j], tz=ZoneInfo("America/New_York")).date() == hr_date:
                        list_day.append(hr_day[i])
                        list_max.append(max[i])
                        list_min.append(min[i])
                        list_step.append(step_arr[j])
            # elif (step_datetime_day > hr_datetime_day): # step day skips, go through rest of heart rate days
            elif step_date > hr_date:
                for k in range(i,len(hr_day)):
                    # if (hr_month == step_month and step_datetime_day == datetime.fromtimestamp(hr_day[k],tz=ZoneInfo("America/New_York")).day):
                    if datetime.fromtimestamp(hr_day[k], tz=ZoneInfo("America/New_York")).date() == step_date:
                        list_day.append(hr_day[k])
                        list_max.append(max[k])
                        list_min.append(min[k])
                        list_step.append(step_arr[i])
        
        list_patient_id = [str(hr_file)[42:74]]*len(list_day)
        combined_arr = np.column_stack((list_patient_id,list_day,list_min,list_max,list_step))
    return combined_arr

In [34]:
# ISSUE - some files have discrepancy when compiling -- find out if issue is heartrate or steps
# this file compiler ends with more files than the average heartrate

# STEPS -- test run the other file on each component of the file


arr = compile_step_hr(r"C:\Users\VictoriaAgain\Downloads\download\80929938f623f44614a029108a740d00\combined_heartrate.csv",r"C:\Users\VictoriaAgain\Downloads\download\80929938f623f44614a029108a740d00\combined_steps.csv")
# arr = compile_step_hr(r"C:\Users\VictoriaAgain\Downloads\download\90b8c2e2c843ffd77f8621c7d6ed044d\combined_heartrate.csv",r"C:\Users\VictoriaAgain\Downloads\download\90b8c2e2c843ffd77f8621c7d6ed044d\combined_steps.csv")
df = pd.DataFrame(arr)

# id day min max step

df
# for i in df.index:
#     print(datetime.fromtimestamp(df[1].loc[i],tz=ZoneInfo("America/New_York")))

Unnamed: 0,0,1,2,3,4
0,80929938f623f44614a029108a740d00,1637412569.2489471,-1.4553926608184518,-0.016469907684625546,-0.41310323057178133
1,80929938f623f44614a029108a740d00,1637470817.517382,-1.798480656930095,-0.9592301406666326,-0.1666306530356376
2,80929938f623f44614a029108a740d00,1637592203.2239146,-0.2545846744276997,-0.7706780940702311,0.5541016538161866
3,80929938f623f44614a029108a740d00,1637704280.3434532,1.1177673100188739,-2.373370490139643,-1.0136447842260339
4,80929938f623f44614a029108a740d00,1637758959.4615457,-1.28384866276263,-0.29929797757922766,1.0142396586140379
...,...,...,...,...,...
161,80929938f623f44614a029108a740d00,1652544886.0262475,0.4315913177955871,-0.9120921290175322,-1.0897285544237034
162,80929938f623f44614a029108a740d00,1652635746.3499784,-0.42612867248352143,-0.6764020707720305,-0.8674816416057218
163,80929938f623f44614a029108a740d00,1652706240.2805843,0.7746793139072305,-1.147782187263034,-0.3009882427001445
164,80929938f623f44614a029108a740d00,1652794607.8277097,1.2893113080746956,-0.20502195428102696,1.9295615740188388


In [38]:
p = Path("C:/Users/VictoriaAgain/Downloads/download")
raw_files = list(p.rglob('*.csv*')) # returns list of all csv files within the download folder
files = []
all_files = []

for i in range(len(raw_files)): # list of machine readable file names in order
    files.append(str(raw_files[i]).replace('\\','/'))
for j in range(0,len(files)-1,2):
    combined_file = compile_step_hr(files[j],files[j+1])
    if (len(combined_file) > 0):
        all_files.append(combined_file)

final_df = pd.DataFrame(np.vstack(all_files),columns=['patient_id','timestamp','min','max','steps'])
# final_df
final_df.to_csv('combined_minmax_NORM.csv',index=False,header=True)