In [4]:
# imports
from pathlib import Path
from collections import namedtuple
from datetime import datetime
import os
import glob
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [6]:
# taking the average heart rate of each day for a patient
# make an array that has patient id, average heart rate, day
# given an input of the total patient heart rate csv file

def timestamp_avg_hr(csv_file):
    list_avg_hr = []
    list_day_hr = []
    
    df = pd.read_csv(csv_file) # csv into dataframe

    if df.empty or 'timestamp' not in df.columns or 'heartrate' not in df.columns:
        return [],[]
        
    unix_prev = df['timestamp'].loc[0]
    temp_day_prev = datetime.fromtimestamp(df['timestamp'].loc[0]).day # first day
    temp_hr_sum = 0
    counter = 0

    for i in df.index: # iterates through each row
        val = df['timestamp'].loc[i] # the timestamp value at that row
        temp_day_curr = datetime.fromtimestamp(val).day # converting to # date
        if temp_day_prev == temp_day_curr:
            counter += 1
            temp_hr_sum += df['heartrate'].loc[i]
        elif temp_day_curr != temp_day_prev:
            if (counter != 0):
                list_day_hr.append(unix_prev) # add day to list
                avg_hr = temp_hr_sum/counter
                list_avg_hr.append(avg_hr)
                temp_hr_sum = 0
                counter = 0
                unix_prev = df['timestamp'].loc[i]
                temp_day_prev = temp_day_curr

    # for last day otherwise not included
    if (counter != 0):
        unix_prev = df['timestamp'].loc[len(df.index)-1]
        list_day_hr.append(unix_prev)
        avg_hr = temp_hr_sum/counter
        list_avg_hr.append(avg_hr)
            
    return list_avg_hr, list_day_hr


In [290]:
# testing
hr,day = timestamp_avg_hr(r"C:\Users\VictoriaAgain\Downloads\download\02737ec6ea3d15c132c6d82ccc0fa7c0\combined_heartrate.csv")

convert_day = []
for i in range(len(day)):
    convert_day.append(datetime.fromtimestamp(day[i]).day)
# print(convert_day)
print(convert_day)

[11, 12, 13, 14, 15, 16, 17, 18, 19, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 1, 4, 5, 8, 9, 10, 11, 12, 14, 15, 16, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 11, 12, 13, 14, 15, 16, 24, 25, 26, 27, 28, 1]


In [9]:
# total steps by day for one patient given csv file path

def timestamp_total_step(csv_file):
    list_total_step = []
    list_day_step = []
    
    df = pd.read_csv(csv_file)

    if df.empty or 'stop' not in df.columns or 'steps' not in df.columns:
        return [],[]
    
    unix_prev = df['stop'].loc[0]
    temp_day_prev = datetime.fromtimestamp(df['stop'].loc[0]).day
    temp_total_step = 0
    
    for i in df.index:
        val = df['stop'].loc[i]
        temp_day_curr = datetime.fromtimestamp(val).day
        if temp_day_prev == temp_day_curr:
            temp_total_step += df['steps'].loc[i]
        elif temp_day_curr != temp_day_prev:
            list_day_step.append(unix_prev)
            list_total_step.append(temp_total_step)
            unix_prev = df['stop'].loc[i]
            temp_total_step = 0
            temp_day_prev = temp_day_curr

    # for last day otherwise not included
    unix_prev = df['stop'].loc[len(df.index)-1]
    list_day_step.append(unix_prev)
    list_total_step.append(temp_total_step)
    
    return list_total_step, list_day_step
    

In [286]:
# testing
step,day = timestamp_total_step(r"C:\Users\VictoriaAgain\Downloads\download\02737ec6ea3d15c132c6d82ccc0fa7c0\combined_steps.csv")
convert_day = []
for i in range(len(day)):
    convert_day.append(datetime.fromtimestamp(day[i]).day)
print(convert_day)
# df = pd.read_csv(r"C:\Users\VictoriaAgain\Downloads\download\90b8c2e2c843ffd77f8621c7d6ed044d\combined_steps.csv")

[11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 1, 2, 3, 4, 5, 6, 7, 8, 9]


In [67]:
# combining heart rate and step files for each patient
# given the day is the same -- check month and day are the same, create new 2d array with same date and corresponding heart rate + steps
# also return patient id via substring

def compile_step_hr(hr_file,step_file):
    step_arr, step_day = timestamp_total_step(step_file)
    hr_arr, hr_day = timestamp_avg_hr(hr_file)
    
    list_day = []
    list_hr = []
    list_step = []

    # smaller functions work -- same dates -- something here is WRONG
    for i in range(np.minimum(len(hr_day),len(step_day))):  # whichever is smaller
        hr_month = datetime.fromtimestamp(hr_day[i]).month
        hr_datetime_day = datetime.fromtimestamp(hr_day[i]).day
        step_month = datetime.fromtimestamp(step_day[i]).month
        step_datetime_day = datetime.fromtimestamp(step_day[i]).day
        
        # need double for loop in case we skip a day!
        # print(hr_month, ",", hr_datetime_day)
        # print(step_month, ",", step_datetime_day)
        if (hr_month == step_month and hr_datetime_day == step_datetime_day): # checking if date is the same
            # print(hr_arr[i], step_arr[i])
            list_day.append(hr_day[i])
            list_hr.append(hr_arr[i])
            list_step.append(step_arr[i])
        elif (hr_datetime_day > step_datetime_day): # heartrate skips, go through rest of step days
            for j in range(i,len(step_day)):
                if (hr_month == step_month and hr_datetime_day == datetime.fromtimestamp(step_day[j]).day):
                    list_day.append(hr_day[i])
                    list_hr.append(hr_arr[i])
                    list_step.append(step_arr[j])
        elif (step_datetime_day > hr_datetime_day): # step day skips, go through rest of heart rate days
            for k in range(i,len(hr_day)):
                if (hr_month == step_month and step_datetime_day == datetime.fromtimestamp(hr_day[k]).day):
                    list_day.append(hr_day[k])
                    list_hr.append(hr_arr[k])
                    list_step.append(step_arr[i])
    
    # testing
    # list_day is correct, only shows same days in both lists
    for i in range(len(list_day)):
        convert_day.append(datetime.fromtimestamp(list_day[i]).day)
    # print(len(list_day),len(list_hr),len(list_step))
    list_patient_id = [str(hr_file)[42:74]]*len(list_day)
    # print(len(list_patient_id))
    # print(step_arr, datetime.fromtimestamp(step_day).day, hr_arr, datetime.fromtimestamp(hr_day).day)
    combined_arr = np.column_stack((list_patient_id,list_day,list_hr,list_step))
    # print(combined_arr)
    return combined_arr

In [308]:
# testing

# taking array and turning into csv file
arr = compile_step_hr(r"C:\Users\VictoriaAgain\Downloads\download\80929938f623f44614a029108a740d00\combined_heartrate.csv",r"C:\Users\VictoriaAgain\Downloads\download\80929938f623f44614a029108a740d00\combined_steps.csv")
arr
# df = pd.DataFrame(arr)
# df.to_csv('temp_output.csv',index=False,header=False)
# patient_id time hr step

array([['80929938f623f44614a029108a740d00', '1637412569.2489471',
        '69.32836877807112', '4106.000000000001'],
       ['80929938f623f44614a029108a740d00', '1637412569.2489471',
        '69.32836877807112', '9930.58474614884'],
       ['80929938f623f44614a029108a740d00', '1637412569.2489471',
        '69.32836877807112', '5663.000000000003'],
       ['80929938f623f44614a029108a740d00', '1637412569.2489471',
        '69.32836877807112', '2448.044744116518'],
       ['80929938f623f44614a029108a740d00', '1637412569.2489471',
        '69.32836877807112', '4789.999999999996'],
       ['80929938f623f44614a029108a740d00', '1637412569.2489471',
        '69.32836877807112', '6616.503479757015'],
       ['80929938f623f44614a029108a740d00', '1637470817.517382',
        '65.38250787169845', '4864.040551797268'],
       ['80929938f623f44614a029108a740d00', '1637470817.517382',
        '65.38250787169845', '5695.129824384994'],
       ['80929938f623f44614a029108a740d00', '1637470817.517382',
  

In [322]:
# given a folder go through each of the folders and create an array based on the combined heartrate and steps

p = Path("C:/Users/VictoriaAgain/Downloads/download")
raw_files = list(p.rglob('*.csv*'))
files = []
all_files = []

for i in range(len(raw_files)):
    files.append(str(raw_files[i]).replace('\\','/'))
for j in range(len(files)-1):
    all_files.append(compile_step_hr(files[j],files[j+1]))

final_df = pd.DataFrame(np.vstack(all_files),columns=['patient_id','timestamp','average_hr','steps'])
final_df.to_csv('combined_files_avg_hr.csv',index=False,header=True)

In [57]:
# look at different classification models!!
# sk learn package -- regression tracking, classification -- try and use classifier!

# new function -- go through all of the folders and json files, create a csv file with all of the daily PROMIS responses, days, and patient_id
# if json file has daily faigue 1 item, put patient_id, promis score, and date in big csv file
# go through both hr_step and promis_file, combined based on dates and patient_id

def compile_fatigue(folder_path):
    p = Path(folder_path)
    raw_promis_files = list(p.rglob('*.json*')) # all json files
    daily_promis_files = []
    
    for i in range(len(raw_promis_files)):
        readable_promis_file = ((str(raw_promis_files[i])).replace('\\','/')) # converts all survey files into inputtable format
        df = pd.read_json(readable_promis_file)
        temp_daily_arr = []
        if 'DailyFatigue_1Item' in df['answers'] and 'key' in df.columns:
            temp_daily_arr.append(str(readable_promis_file)[72:104])
            temp_daily_arr.append(int(str(df['key'])[27:29])) # month
            temp_daily_arr.append(int(str(df['key'])[30:32])) # day
            temp_daily_arr.append(df['answers']['DailyFatigue_1Item'])
            daily_promis_files.append(temp_daily_arr)
            temp_daily_arr = []
    return daily_promis_files

In [63]:
# testing

df = pd.DataFrame(np.vstack(compile_fatigue(r"C:\Users\VictoriaAgain\Downloads\surveys-20250613T020413Z-1-001\surveys")),columns=['patient_id','month','day','promis_score'])
df.to_csv('combined_promis_scores.csv',index=False,header=True)

In [142]:
# comparing both average heartrate and promis score files
# given both files, find where dates (unix time to month + date) match up and patient_id matches
# add all information to new combined csv file

df_hr = pd.read_csv('combined_files_avg_hr.csv')
df_promis = pd.read_csv('combined_promis_scores.csv')

# convert timestamp to datetime and extract month + day
df_hr['datetime'] = pd.to_datetime(df_hr['timestamp'], unit='s')
df_hr['month'] = df_hr['datetime'].dt.month
df_hr['day'] = df_hr['datetime'].dt.day

# filter cols
df_hr_filtered = df_hr[['patient_id', 'month', 'day', 'average_hr', 'steps']]

# merge dfs on patient_id, month, and day
df_merged = pd.merge(df_promis, df_hr_filtered,
                     on = ['patient_id', 'month', 'day'],
                     how = 'inner')

df_final = df_merged[['patient_id', 'month', 'day', 'average_hr', 'steps', 'promis_score']]
df_final.columns = ['patient id', 'month', 'day', 'heartrate', 'steps', 'promis score']

df_final.to_csv('final_avg_hr_data_updated.csv', index=False)

In [15]:
# converting promis scores to numerical values

def compile_fatigue_numbers(folder_path):
    p = Path(folder_path)
    raw_promis_files = list(p.rglob('*.json*')) # all json files
    daily_promis_numbers = []
    
    for i in range(len(raw_promis_files)):
        readable_promis_file = ((str(raw_promis_files[i])).replace('\\','/')) # converts all survey files into inputtable format
        df = pd.read_json(readable_promis_file)
        temp_daily_arr = []
        if 'DailyFatigue_1Item' in df['answers'] and 'key' in df.columns:
            temp_daily_arr.append(str(readable_promis_file)[72:104])
            temp_daily_arr.append(int(str(df['key'])[27:29])) # month
            temp_daily_arr.append(int(str(df['key'])[30:32])) # day
            promis_score = df['answers']['DailyFatigue_1Item']
            if promis_score == 'Not fatigued at all':
                temp_daily_arr.append(0)
            elif promis_score == 'A little bit fatigued':
                temp_daily_arr.append(1)
            elif promis_score == 'Somewhat fatigued':
                temp_daily_arr.append(2)
            elif promis_score == 'Very fatigued':
                temp_daily_arr.append(3)
            daily_promis_numbers.append(temp_daily_arr)
            temp_daily_arr = []
    return daily_promis_numbers

In [17]:
df = pd.DataFrame(np.vstack(compile_fatigue_numbers(r"C:\Users\VictoriaAgain\Downloads\surveys-20250613T020413Z-1-001\surveys")),columns=['patient_id','month','day','promis_score'])
df.to_csv('combined_promis_nums.csv',index=False,header=True)

In [6]:
df_hr = pd.read_csv('combined_files_avg_hr.csv')
df_promis = pd.read_csv('combined_promis_nums.csv')

# convert timestamp to datetime and extract month + day
df_hr['datetime'] = pd.to_datetime(df_hr['timestamp'], unit='s')
df_hr['month'] = df_hr['datetime'].dt.month
df_hr['day'] = df_hr['datetime'].dt.day

# filter cols
df_hr_filtered = df_hr[['patient_id', 'month', 'day', 'average_hr', 'steps']]

# merge dfs on patient_id, month, and day
df_merged = pd.merge(df_promis, df_hr_filtered,
                     on = ['patient_id', 'month', 'day'],
                     how = 'inner')

df_final = df_merged[['patient_id', 'month', 'day', 'average_hr', 'steps', 'promis_score']]
df_final.columns = ['patient id', 'month', 'day', 'heartrate', 'steps', 'promis score']

df_final.to_csv('final_avg_hr_nums_updated.csv', index=False)

UsageError: Line magic function `%git` not found.
