# Apple Health POC

[End to End ETL Pipeline](https://medium.com/@ericfflynn/a-cloud-based-etl-pipeline-apple-health-data-to-mysql-48391576ce8e)

[Health Auto Export Docs](https://github.com/Lybron/health-auto-export)

[Streamlit Visual Example](https://github.com/ericfflynn/health-app/blob/main/Home.py)

[Elasticsearch Visual Example](https://github.com/markwk/qs_ledger/tree/master/apple_health)

[Python Apple Health Package](https://github.com/fedecalendino/apple-health)

This PoC will outline collecting data from Apple Health and preprocessing it for visualization. The current process will handle processing the `export.xml` file from Apple Health (retrieved from IPhone). Once we can confirm that the data is extracted, transformed and loaded again properly we will move on to the next step. I want to do some analysis into the Apple Health Kit to see all of the potential XML tags that can be extracted. I want to build my own data pipeline to process the data.


1. Config

2. Extract data

3. Transform/Preprocess Data

4. Load data (as parquet)

5. Build weekly calendar workout

## 1. Config

In [1]:
ROOT_DIR = '../../data/workout/'
import datetime
import pandas as pd
from health import HealthData
from consts import *
import xmltodict
import pyarrow as pa
import re
from typing import List
import json
import os

pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 200)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

def _parse_float(value, default=None):
    if value is None:
        return default

    if value in HK_CONSTANTS:
        return HK_CONSTANTS[value]

    try:
        return float(value)
    except (ValueError, TypeError):
        return default


def _parse_date(value):
    if value is None:
        return None
    if type(value) == datetime.datetime:
        return pd.Timestamp(value)
    return pd.Timestamp(value)


def _clean_string(s):
    if isinstance(s, str):
        return re.sub("[\W_]+", '', s).upper().replace('Â', '')
    else:
        return s


def _parse_source_id(source_name):
    source_id = _clean_string(source_name)
    return source_id


def _parse_device_string(device_string):
    # Split on commas
    components = device_string.split(',')

    # Initialize the dictionary to store key-value pairs
    device_info = {}

    # Initialize variables to hold the key and value
    current_key = None
    current_value = []

    # Process each component
    for component in components:
        # Split on colon to separate key and value
        if ':' in component:
            if current_key is not None:
                # Join the current value parts and store in the dictionary
                device_info[current_key.strip()] = '.'.join(current_value).strip()
            # Split into key and value parts
            key, value = component.split(':', 1)
            current_key = key.replace('&lt;', '').replace('>', '').replace('<', '')
            current_value = [value]
        else:
            # Append the current component to the value list
            current_value.append(component)

    # Store the last key-value pair
    if current_key is not None:
        device_info[current_key.strip()] = ':'.join(current_value).strip()

    return device_info

def get_dataframe(path: str, columns: List = None):
    """
    Read a DataFrame from a parquet file.

    Args:
        path (str): Path to the parquet file.
        columns (List): List of columns to select (default is None).

    Returns:
        pd.DataFrame: Read DataFrame.
    """
    try:
        return pd.read_parquet(path, engine='pyarrow', dtype_backend='numpy_nullable', columns=columns)
    except Exception as e:
        print(e)
        return pd.DataFrame()

def put_dataframe(df: pd.DataFrame, path: str):
    """
    Write a DataFrame to a parquet file.

    Args:
        df (pd.DataFrame): DataFrame to write.
        path (str): Path to the parquet file.
        schema (dict): Schema dictionary.

    Returns:
        None
    """
    key, file_name = path.rsplit('/', 1)
    if file_name.split('.')[1] != 'parquet':
        raise Exception("Invalid Filetype for Storage (Supported: 'parquet')")
    os.makedirs(key, exist_ok=True)
    df.to_parquet(f"{key}/{file_name}",engine='pyarrow', schema=pa.Schema.from_pandas(df))

def put_json(obj, path: str):
    with open(path, 'w') as f:
        json.dump(obj, f, indent=4, default=str)
    f.close()

TYPE = "@type"
SOURCE_NAME = "@sourceName"
SOURCE_VERSION = "@sourceVersion"
DEVICE = "@device"
CREATION_DATE = "@creationDate"
START_DATE = "@startDate"
END_DATE = "@endDate"
DATE = '@date'
UNIT = "@unit"
BPM = "@bpm"
TIME = "@time"

KEY = "@key"
VALUE = "@value"
MIN='@minimum'
MAX='@maximum'
AVG='@average'
SUM='@sum'



## 2. Extract data

**Record**


**Activity Summary**


**Workout**


**Config**

In [21]:
from enum import Enum, IntEnum

class AppleStandHour(IntEnum):
    HKCategoryValueAppleStandHourStood = 0
    HKCategoryValueAppleStandHourIdle = 1
    
    def __str__(self) -> str:
        return self.name

print(AppleStandHour.HKCategoryValueAppleStandHourStood.value)

0


In [2]:
file = ROOT_DIR + 'export.xml'

class HealthKitData:
    '''
    XML HealthKit data. All data is sorted by date low-to-high
    '''

    def __init__(self, file):
        if type(file) == dict:
            self.data = file
        else:
            self.file = file
            self.config = {}
            self.data = None
            print('Reading export file...')
            with open(file) as file:
                xml = xmltodict.parse(file.read())
                self.data = xml["HealthData"]
        print('Processing Data...')
        self.config = self._get_config()
        self.activity_summaries = self._get_activity_summaries()
        self.workouts = self._get_workouts()
        self.records = self._get_records()

    def set_sources(self, sources):
        self.config['sources'] = sources

    def save(self, save_path):
        data = {
            'activity_summaries.json': self.activity_summaries,
            'config.json': self.config,
            'workouts.parquet': self.workouts,
            'records.parquet': self.records,

        }
        for key, obj in data.items():
            print(f"Saving: {key}")
            if '.json' in key:
                put_json(obj, path=f'{save_path}{key}')
            elif key == 'workouts.parquet':
                df = pd.DataFrame(obj)
                df['last_updated'] = datetime.datetime.now()
                put_dataframe(df, path=f'{save_path}{key}')
            elif key == 'records.parquet':
                df = pd.DataFrame(obj)
                df['last_updated'] = datetime.datetime.now()
                put_dataframe(df, path=f'{save_path}{key}')
            else:
                raise Exception(f'Invalid Key: {key} to save')




    def _get_records(self):
        """
        Parse Records from Payload and update config sources with record device sources
        Return a dict
        """
        sources = {}

        records = self.data['Record']#self.data.pop('Record', [])
        out = []
        for r in records:
            rec = Record(**r)
            sources.update(self._parse_source(r))
            out.append(rec.__dict__)

        self.set_sources(sources)
        return out

    def _parse_source(self, data):
        """
        Parse Sources from Payload
        Return a dict
        """
        source_name = data.get(SOURCE_NAME)

        source_type = "UNKNOWN"
        if 'WATCH' in source_name.upper():
            source_type = 'WATCH'
        elif 'RENPHO' in source_name.upper():
            source_type = 'SCALE'
        elif 'PHONE' in source_name.upper():
            source_type = 'PHONE'
        elif 'MYFITNESSPAL' in source_name.upper():
            source_type = 'APP'
        elif 'HEALTH' in source_name.upper():
            source_type = 'APP'  # manually entered value into health app
        elif 'STRONG' in source_name.upper():
            source_type = 'APP'
        elif 'SLEEP' in source_name.upper():
            source_type = 'APP'

        source_version = data.get(SOURCE_VERSION, None)
        source_id = _parse_source_id(source_name)
        device = data.get(DEVICE, None)
        source = {
            "source_name": source_name,
            "source_version": source_version,
            "source_type": source_type,
        }
        if device is not None:
            source = {
                **source,
                **_parse_device_string(device)
            }

        return {source_id: source}

    def _get_config(self):
        """
        Parse Config from Payload from Me, ExportDate and Unique Devices found in Records
        Return a dict
        """
        me = self.data['Me']#self.data.pop('Me', {})
        export_date = self.data['ExportDate']#self.data.pop('ExportDate', {})
        dob = _parse_date(me.get(HK_ME_DATE_OF_BIRTH))

        return {
            'date_of_birth': dob,
            'age': (datetime.datetime.now() - dob.to_pydatetime()).days // 365, # For comparison convert both to datetime.datetime to get datetime.timedelta
            'biological_sex': me.get(HK_ME_BIOLOGICAL_SEX),
            'blood_type': me.get(HK_ME_BLOOD_TYPE),
            'skin_type': me.get(HK_ME_SKIN_TYPE),
            'wheelchair_use': me.get(HK_ME_WHEELCHAIR_USE),
            'sources': {},
            'last_updated': pd.Timestamp(export_date['@value']),
        }

    def _get_activity_summaries(self):
        """
        Parse ActivitySummaries from Payload
        Return a dict of {date: {key: value}}
        """
        activity_summaries = self.data['ActivitySummary']  # self.data.pop('ActivitySummary', [])

        parsed = {}
        for activity_summary in activity_summaries:
            date = activity_summary.get(DATE_COMPONENTS)
            parsed[date] = {
                'active_energy_burned': _parse_float(activity_summary.get(ACTIVE_ENERGY_BURNED, None)),
                'active_energy_burned_goal': _parse_float(activity_summary.get(ACTIVE_ENERGY_BURNED_GOAL, None)),
                'active_energy_burned_unit': activity_summary.get(ACTIVE_ENERGY_BURNED_UNIT, "Cal"),
                'exercise_time': _parse_float(activity_summary.get(APPLE_EXERCISE_TIME, None)),
                'exercise_time_goal': _parse_float(activity_summary.get(APPLE_EXERCISE_TIME_GOAL, None)),
                'stand_hours': _parse_float(activity_summary.get(APPLE_STAND_HOURS, None)),
                'stand_hours_goal': _parse_float(activity_summary.get(APPLE_STAND_HOURS_GOAL, None)),
            }
        return parsed

    def _get_workouts(self):
        workouts = self.data['Workout']#self.data.pop('Workout', [])
        return [Workout(**w).__dict__ for w in workouts]


class Record:
    NAME_KEY = TYPE

    def __init__(self, **data):
        """
        Parse a Record from the Payload, handle metadata creation and source_id
        """
        self.name: str = data[self.NAME_KEY]
        self.source_id = _parse_source_id(data.get(SOURCE_NAME))
        self.created_at: pd.Timestamp = _parse_date(data.get(CREATION_DATE, None))
        self.start: pd.Timestamp = _parse_date(data.get(START_DATE))
        self.start_date_str: str = self.start.strftime("%Y-%m-%d")
        self.end: pd.Timestamp = _parse_date(data.get(END_DATE))
        self.unit: str = data.get(UNIT, None)
        self.value: float = _parse_float(data.get(VALUE, None))
        self.heartrate_variability = []

        metadata = data.get("MetadataEntry", None)
        if metadata is None:
            self.metadata = []
        elif isinstance(metadata, dict):
            self.metadata = [self._parse_metadata(metadata)]
        elif isinstance(metadata, list):
            self.metadata = list(map(lambda m: self._parse_metadata(m), metadata))

        heartrate_variability = data.get("HeartRateVariabilityMetadataList", None)
        if heartrate_variability is not None:
            heartrate_variability = heartrate_variability.get("InstantaneousBeatsPerMinute", None)
            self.heartrate_variability = list(map(lambda m: self._parse_heartrate_variability(m), heartrate_variability))

    def _parse_metadata(self, data):
        return {'key': data.get(KEY), 'value': data.get(VALUE)}

    def _parse_heartrate_variability(self, data):
        return {'bpm': data.get(BPM), 'time': _parse_date(data.get(TIME))}


class Workout(Record):
    NAME_KEY = WORKOUT_ACTIVITY_TYPE

    def __init__(self, **data):
        super().__init__(**data)
        self.duration: float = _parse_float(data.get(DURATION))
        self.duration_unit: str = data.get(DURATION_UNIT)

        self.distance: float = _parse_float(data.get(TOTAL_DISTANCE, None))
        self.distance_unit: str = data.get(TOTAL_DISTANCE_UNIT, 'mi')

        self.energy_burned: float = _parse_float(data.get(TOTAL_ENERGY_BURNED, None))
        self.energy_burned_unit: str = data.get(TOTAL_ENERGY_BURNED_UNIT, 'Cal')

        self.flights_climbed: float = _parse_float(data.get(TOTAL_FLIGHTS_CLIMBED))
        self.swimming_strokes: float = _parse_float(data.get(TOTAL_SWIMMING_STROKE_COUNT))
        self.events = []
        self.statistics = []
        self.route = None

        workout_events = data.get("WorkoutEvent", None)
        if workout_events is not None:
            if isinstance(workout_events, dict):
                self.events = [self._parse_workout_event(workout_events)]
            else:
                self.events = list(map(lambda m: self._parse_workout_event(m), workout_events))

        workout_statistics = data.get("WorkoutStatistics", None)
        if workout_statistics is not None:
            if isinstance(workout_statistics, dict):
                self.statistics = [self._parse_workout_statistic(workout_statistics)]
            else:
                self.statistics = list(map(lambda m: self._parse_workout_statistic(m), workout_statistics))

        workout_route = data.get("WorkoutRoute", None)
        if workout_route is not None:
            self.route = self._parse_workout_route(workout_route)

        if self.distance is None or self.distance_unit is None:
            statistic = self.get_statistic(HK_RECORD_DISTANCE_WALKING_RUNNING)
            if statistic is not None:
                self.distance = statistic['sum'] if statistic['sum'] is not None else None
                self.distance_unit = statistic['unit'] if statistic['unit'] is not None else 'mi'

        if self.energy_burned is None or self.energy_burned_unit is None:
            statistic = self.get_statistic(HK_RECORD_ACTIVE_ENERGY_BURNED)
            if statistic is not None:
                self.energy_burned = statistic['sum'] if statistic['sum'] is not None else None
                self.energy_burned_unit = statistic['unit'] if statistic['unit'] is not None else 'Cal'

    def get_statistic(self, statistic_type):
        for s in self.statistics:
            if s['name'] == statistic_type:
                return s
        return None

    def get_event(self, event_type):
        for s in self.events:
            if s['name'] == event_type:
                return s
        return None

    def _parse_workout_event(self, data):
        return {
            'name': data.get(TYPE),
            'date': _parse_date(data.get(DATE)),
            'duration': _parse_float(data.get(DURATION)),
            'duration_unit': data.get(DURATION_UNIT),
        }

    def _parse_workout_statistic(self, data):
        statistic_type = "SUM" if data.get(SUM, None) is not None else "AVG"
        return {
            'name': data.get(TYPE),
            'type': statistic_type,
            'start_date': _parse_date(data.get(START_DATE, None)),
            'end_date': _parse_date(data.get(END_DATE, None)),
            'sum': _parse_float(data.get(SUM, None)),
            'average': _parse_float(data.get(AVG, None)),
            'min': _parse_float(data.get(MIN, None)),
            'max': _parse_float(data.get(MAX, None)),
            'unit': _parse_float(data.get(UNIT, None)),
        }

    def _parse_workout_route(self, data):
        file_reference = data.get("FileReference", None)
        file_path = file_reference.get("@path", None)
        metadata = data.get("MetadataEntry", None)
        if metadata is None:
            metadata = []
        elif isinstance(metadata, dict):
            metadata = [self._parse_metadata(metadata)]
        elif isinstance(metadata, list):
            metadata = list(map(lambda m: self._parse_metadata(m), metadata))

        return {
            'file_path': file_path,
            'file_name': file_path.split("/")[-1],
            'metadata': metadata,
        }
        
save_path = f"{ROOT_DIR}processed/"
if not os.path.exists(save_path):
    os.makedirs(save_path)

#hkd = HealthKitData(file)
#hkd.save(save_path)
#hkd.workouts[-6]

In [31]:
df = pd.read_parquet(save_path + 'records.parquet')


In [32]:
sleep = [
    HK_RECORD_SLEEP_ANALYSIS,
    HK_RECORD_SLEEP_ANALYSIS_ASLEEP,
    HK_RECORD_SLEEP_ANALYSIS_AWAKE,
    HK_RECORD_SLEEP_ANALYSIS_IN_BED,
    HK_RECORD_SLEEP_DURATION_GOAL,
    "HKCategoryTypeIdentifierSleepAnalysisAsleepCore",
    "HKCategoryTypeIdentifierSleepAnalysisAsleepDeep",
    "HKCategoryTypeIdentifierSleepAnalysisAsleepREM"
]

scale = [
    HK_RECORD_BODY_MASS_INDEX,
    HK_RECORD_HEIGHT,
    HK_RECORD_BODY_MASS,
    HK_RECORD_BODY_FAT_PERCENTAGE,
    HK_RECORD_LEAN_BODY_MASS,
]

general = [
    HK_RECORD_MINDFUL_SESSION,
    HK_RECORD_AUDIO_EXPOSURE_EVENT,
    HK_RECORD_HEADPHONE_AUDIO_EXPOSURE,
    HK_RECORD_ENVIRONMENTAL_AUDIO_EXPOSURE,
    HK_RECORD_AUDIO_EXPOSURE_EVENT_LOUD_ENVIRONMENT,
    HK_RECORD_INHALER_USAGE,
    HK_RECORD_INSULIN_DELIVERY,
    HK_RECORD_NUMBER_OF_TIMES_FALLEN,
    HK_RECORD_NOT_APPLICABLE,
    HK_RECORD_TOOTHBRUSHING_EVENT,
    HK_RECORD_HANDWASHING_EVENT,
    HK_RECORD_ELECTRODERMAL_ACTIVITY,
]

reproductive = [
    HK_RECORD_CERVICAL_MUCUS_QUALITY_CREAMY,
    HK_RECORD_CERVICAL_MUCUS_QUALITY_DRY,
    HK_RECORD_CERVICAL_MUCUS_QUALITY_EGG_WHITE,
    HK_RECORD_CERVICAL_MUCUS_QUALITY_STICKY,
    HK_RECORD_CERVICAL_MUCUS_QUALITY_WATERY,
    HK_RECORD_OVULATION_TEST_RESULT_ESTROGEN_SURGE,
    HK_RECORD_OVULATION_TEST_RESULT_INDETERMINATE,
    HK_RECORD_OVULATION_TEST_RESULT_LUTEINIZING_HORMONE_SURGE,
    HK_RECORD_OVULATION_TEST_RESULT_NEGATIVE
]

exercise = [
    HK_RECORD_HEART_RATE,
    HK_RECORD_RESPIRATORY_RATE,
    HK_RECORD_STEP_COUNT,
    HK_RECORD_DISTANCE_WALKING_RUNNING,
    HK_RECORD_BASAL_ENERGY_BURNED,
    HK_RECORD_ACTIVE_ENERGY_BURNED,
    HK_RECORD_FLIGHTS_CLIMBED,
    HK_RECORD_APPLE_EXERCISE_TIME,
    HK_RECORD_DISTANCE_CYCLING,
    HK_RECORD_RESTING_HEART_RATE,
    HK_RECORD_V_O2_MAX,
    HK_RECORD_WALKING_HEART_RATE_AVERAGE,
    HK_RECORD_WALKING_DOUBLE_SUPPORT_PERCENTAGE,
    HK_RECORD_SIX_MINUTE_WALK_TEST_DISTANCE,
    HK_RECORD_APPLE_STAND_TIME,
    HK_RECORD_WALKING_SPEED,
    HK_RECORD_WALKING_STEP_LENGTH,
    HK_RECORD_WALKING_ASYMMETRY_PERCENTAGE,
    HK_RECORD_STAIR_ASCENT_SPEED,
    HK_RECORD_STAIR_DESCENT_SPEED,
    HK_RECORD_APPLE_WALKING_STEADINESS,
    HK_RECORD_RUNNING_STRIDE_LENGTH,
    HK_RECORD_RUNNING_VERTICAL_OSCILLATION,
    HK_RECORD_RUNNING_GROUND_CONTACT_TIME,
    HK_RECORD_RUNNING_HEART_RATE_RECOVERY_ONE_MINUTE,
    HK_RECORD_RUNNING_POWER,
    HK_RECORD_RUNNING_SPEED,
    HK_RECORD_APPLE_STAND_HOUR,
    HK_RECORD_HEART_RATE_VARIABILITY_S_D_N_N
]

[i for i in df.name.unique() if 'Dietary' not in i and i not in sleep and i not in scale and i not in general and i not in reproductive and i not in exercise]




[]

In [38]:
sleep_df = df[df.name.isin(sleep)].copy()

In [39]:

sleep_df.value.value_counts()

value
3.0    4624
0.0    2324
5.0    1863
2.0    1607
4.0    1305
8.0       1
Name: count, dtype: int64

In [46]:
sleep_df['duration'] = (sleep_df['end'] - sleep_df['start']).dt.seconds
sleep_df['unit'] = 'seconds'
a = sleep_df[sleep_df['start_date_str'] == '2023-05-27'].groupby('value')['duration'].sum().reset_index()
a['duration'] = a['duration'] / 3600
a

Unnamed: 0,value,duration
0,2.0,0.741667
1,3.0,5.55
2,4.0,0.575
3,5.0,1.591667


In [47]:
b = sleep_df[sleep_df['start_date_str'] == '2023-05-28'].groupby('value')['duration'].sum().reset_index()
b['duration'] = b['duration'] / 3600
b

Unnamed: 0,value,duration
0,0.0,6.655833
1,2.0,0.491667
2,3.0,4.233333
3,4.0,0.741667
4,5.0,1.491667


In [53]:
# Define meal split records and daily total records
meal_split_records = [
    HK_RECORD_DIETARY_CARBOHYDRATES,
    HK_RECORD_DIETARY_CHOLESTEROL,
    HK_RECORD_DIETARY_ENERGY_CONSUMED,
    HK_RECORD_DIETARY_FAT_MONOUNSATURATED,
    HK_RECORD_DIETARY_FAT_POLYUNSATURATED,
    HK_RECORD_DIETARY_FAT_SATURATED,
    HK_RECORD_DIETARY_FAT_TOTAL,
    HK_RECORD_DIETARY_FIBER,
    HK_RECORD_DIETARY_PROTEIN,
    HK_RECORD_DIETARY_SODIUM,
    HK_RECORD_DIETARY_WATER,
]

daily_total_records = [
    HK_RECORD_DIETARY_BIOTIN,
    HK_RECORD_DIETARY_CAFFEINE,
    HK_RECORD_DIETARY_CALCIUM,
    HK_RECORD_DIETARY_CHLORIDE,
    HK_RECORD_DIETARY_CHROMIUM,
    HK_RECORD_DIETARY_COPPER,
    HK_RECORD_DIETARY_FOLATE,
    HK_RECORD_DIETARY_IODINE,
    HK_RECORD_DIETARY_IRON,
    HK_RECORD_DIETARY_MAGNESIUM,
    HK_RECORD_DIETARY_MANGANESE,
    HK_RECORD_DIETARY_MOLYBDENUM,
    HK_RECORD_DIETARY_NIACIN,
    HK_RECORD_DIETARY_PANTOTHENIC_ACID,
    HK_RECORD_DIETARY_PHOSPHORUS,
    HK_RECORD_DIETARY_POTASSIUM,
    HK_RECORD_DIETARY_RIBOFLAVIN,
    HK_RECORD_DIETARY_SELENIUM,
    HK_RECORD_DIETARY_SUGAR,
    HK_RECORD_DIETARY_THIAMIN,
    HK_RECORD_DIETARY_VITAMIN_A,
    HK_RECORD_DIETARY_VITAMIN_B12,
    HK_RECORD_DIETARY_VITAMIN_B6,
    HK_RECORD_DIETARY_VITAMIN_C,
    HK_RECORD_DIETARY_VITAMIN_D,
    HK_RECORD_DIETARY_VITAMIN_E,
    HK_RECORD_DIETARY_VITAMIN_K,
    HK_RECORD_DIETARY_ZINC,
]

# Generate the dictionary structure for a given day
def generate_daily_nutrition_view(date_str):
    daily_nutrition_view = {
        "date": date_str
    }
    
    # Add meal split records
    for record in meal_split_records:
        record_key = record.split("HKQuantityTypeIdentifierDietary")[1].lower()
        daily_nutrition_view[f"breakfast_{record_key}"] = 0.0
        daily_nutrition_view[f"lunch_{record_key}"] = 0.0
        daily_nutrition_view[f"dinner_{record_key}"] = 0.0
        daily_nutrition_view[f"snacks_{record_key}"] = 0.0
        daily_nutrition_view[f"total_{record_key}"] = 0.0
    
    # Add daily total records
    for record in daily_total_records:
        record_key = record.split("HKQuantityTypeIdentifierDietary")[1].lower()
        daily_nutrition_view[f"total_{record_key}"] = 0.0
    
    return daily_nutrition_view

# Update the daily nutrition view with a record
def update_daily_nutrition_view(daily_nutrition_view, record):
    record_type = record['name']
    record_key = record_type.split("HKQuantityTypeIdentifierDietary")[1].lower()
    value = record['value']
    
    # Check if the record is a meal split record
    if record_type in meal_split_records:
        meal_type = next((item['value'].lower() for item in record['metadata'] if item['key'].lower() == 'meal'), None)
        if meal_type:
            daily_nutrition_view[f"{meal_type}_{record_key}"] += value
            daily_nutrition_view[f"total_{record_key}"] += value
    elif record_type in daily_total_records:
        daily_nutrition_view[f"total_{record_key}"] += value

# Example records
records = hkd.records

# Dictionary to store daily nutrition views
daily_nutrition_views = {}

# Iterate through the records and update the daily nutrition views
for record in records:
    if 'dietary' not in record['name'].lower():
        continue
    date_str = record['start_date_str']
    if date_str not in daily_nutrition_views:
        daily_nutrition_views[date_str] = generate_daily_nutrition_view(date_str)
    update_daily_nutrition_view(daily_nutrition_views[date_str], record)

# Print the updated daily nutrition views


In [60]:
a = []
import numpy as np
for k, v in daily_nutrition_views.items():
    a.append(v)

nutrition_df = pd.DataFrame(a)
nutrition_df = nutrition_df.replace(0, np.nan)

nutrition_df

Unnamed: 0,date,breakfast_carbohydrates,lunch_carbohydrates,dinner_carbohydrates,snacks_carbohydrates,total_carbohydrates,breakfast_cholesterol,lunch_cholesterol,dinner_cholesterol,snacks_cholesterol,total_cholesterol,breakfast_energyconsumed,lunch_energyconsumed,dinner_energyconsumed,snacks_energyconsumed,total_energyconsumed,breakfast_fatmonounsaturated,lunch_fatmonounsaturated,dinner_fatmonounsaturated,snacks_fatmonounsaturated,total_fatmonounsaturated,breakfast_fatpolyunsaturated,lunch_fatpolyunsaturated,dinner_fatpolyunsaturated,snacks_fatpolyunsaturated,total_fatpolyunsaturated,breakfast_fatsaturated,lunch_fatsaturated,dinner_fatsaturated,snacks_fatsaturated,total_fatsaturated,breakfast_fattotal,lunch_fattotal,dinner_fattotal,snacks_fattotal,total_fattotal,breakfast_fiber,lunch_fiber,dinner_fiber,snacks_fiber,total_fiber,breakfast_protein,lunch_protein,dinner_protein,snacks_protein,total_protein,breakfast_sodium,lunch_sodium,dinner_sodium,snacks_sodium,total_sodium,breakfast_water,lunch_water,dinner_water,snacks_water,total_water,total_biotin,total_caffeine,total_calcium,total_chloride,total_chromium,total_copper,total_folate,total_iodine,total_iron,total_magnesium,total_manganese,total_molybdenum,total_niacin,total_pantothenicacid,total_phosphorus,total_potassium,total_riboflavin,total_selenium,total_sugar,total_thiamin,total_vitamina,total_vitaminb12,total_vitaminb6,total_vitaminc,total_vitamind,total_vitamine,total_vitamink,total_zinc
0,2023-05-14,26.9512,81.0,131.954,,239.9052,,,264.0,,264.0,105.02,574.0,1371.01,,2050.03,0.03776,,3.29068,,3.32844,0.08614,,2.91766,,3.0038,0.13216,10.5,18.2519,,28.88406,0.3894,24.0,39.5518,,63.9412,3.068,7.0,10.2464,,20.3144,1.2862,9.0,138.967,,149.2532,1.18,181.5,1908.21,,2090.89,,,,,,,,475.16,,,,,,5.170259,,,,,,,2391.87,,,108.2608,,,,,88.7622,,,,
1,2023-05-15,112.406,,,15.4,127.806,352.8,,,,352.8,1525.82,,150.0,585.8,2261.62,17.5938,,,,17.5938,3.84324,,,,3.84324,20.0324,,,3.6,23.6324,57.4794,,,5.4,62.8794,14.4305,,,2.2,16.6305,109.222,,,1.4,110.622,884.33,,,6.8,891.13,,,,,,,,240.0,,,,,,8.8538,,,,,,,3041.84,,,30.5214,,,,,85.806,,,,
2,2023-05-16,,19.2816,90.0,,109.2816,,183.9,,,183.9,300.0,787.829,1090.0,,2177.829,,23.1295,,,23.1295,,6.3314,,,6.3314,,20.6391,6.5,,27.1391,,63.7093,62.0,,125.7093,,6.6916,,,6.6916,,46.3646,43.0,,89.3646,,302.067,2830.0,,3132.067,,,,,,,,93.2,,,,,,5.1375,,,,,,,1067.0,,,1.5318,,,,,22.5,,,,
3,2023-05-13,26.9512,,,,26.9512,,,,,,105.02,,,,105.02,0.03776,,,,0.03776,0.08614,,,,0.08614,0.13216,,,,0.13216,0.3894,,,,0.3894,3.068,,,,3.068,1.2862,,,,1.2862,1.18,,,,1.18,,,,,,,,5.9,,,,,,0.306799,,,,,,,422.44,,,14.4314,,,,,10.266,,,,
4,2023-05-17,,12.3975,56.668,112.0,181.0655,,180.3,28.0,320.8,529.1,,521.8,376.67,1146.8,2045.27,,10.4902,,3.141,13.6312,,3.8496,,2.4876,6.3372,,4.1697,1.5,16.0752,21.7449,,26.305,13.3334,32.56,72.1984,,6.025,8.33345,,14.35845,,56.816,25.5335,83.316,165.6655,,765.65,743.335,735.4,2244.385,,,,,,,,484.2,,,,,,8.41652,,,,,,,2247.62,,,123.995,,,,,22.5,,,,
5,2023-05-18,,83.0,,,83.0,,,,,,,520.0,,,520.0,,,,,,,,,,,,10.0,,,10.0,,19.0,,,19.0,,5.0,,,5.0,,7.0,,,7.0,,50.0,,,50.0,,,,,,,,,,,,,,,,,,,,,,,,43.0,,,,,,,,,
6,2023-05-19,,,106.0,118.0,224.0,,,60.0,18.0,78.0,130.0,460.0,960.0,995.0,2545.0,,,,6.0,6.0,,,,12.0,12.0,,,16.0,6.2,22.2,,,38.0,25.0,63.0,,,8.0,3.0,11.0,,,52.0,14.0,66.0,,,2260.0,736.0,2996.0,,,,,,,,1180.0,,,,,,9.9,,,,,,,248.0,,,22.0,,,,,19.8,,,,
7,2023-05-20,,19.0,42.52,44.0,105.52,,,127.2,35.0,162.2,,900.0,581.2,520.0,2001.2,,0.5,2.094,,2.594,,0.5,1.6584,,2.1584,,,1.2168,8.0,9.2168,,1.0,22.56,13.0,36.56,,0.5,2.4,4.0,6.9,,3.0,47.964,58.0,108.964,,90.0,633.6,300.0,1023.6,,,,,,,,618.4,,,,,,3.588,,,,,,,1172.4,,,40.24,,,,,,,,,
8,2023-05-21,,,41.76,,41.76,,,121.2,,121.2,110.0,420.0,501.2,100.0,1131.2,,,2.094,,2.094,,,1.6584,,1.6584,,,1.2168,,1.2168,,,15.05,,15.05,,,2.2,,2.2,,,45.254,,45.254,,,528.6,,528.6,,,,,,,,8.4,,,,,,2.328,,,,,,,452.4,,,2.12,,,,,,,,,
9,2023-05-22,,,38.1508,,38.1508,,,,,,460.0,410.0,954.163,600.0,2424.163,,,0.005,,0.005,,,0.0565,,0.0565,,,0.025,,0.025,,,32.51,,32.51,,,4.0166,,4.0166,,,92.8149,,92.8149,,,3.0,,3.0,,,,,,,,45.166,,,,,,1.95496,,,,,,,932.146,,,2.9633,,,,,7.69995,,,,


In [31]:
w = Workout(**data['Workout'][-6])
w.get_statistic('HKQuantityTypeIdentifierActiveEnergyBurned')
w.__dict__

{'name': 'HKQuantityTypeIdentifierDistanceWalkingRunning', 'type': 'SUM', 'start_date': datetime.datetime(2024, 7, 17, 12, 37, 53, tzinfo=datetime.timezone(datetime.timedelta(days=-1, seconds=72000))), 'end_date': datetime.datetime(2024, 7, 17, 13, 30, 59, tzinfo=datetime.timezone(datetime.timedelta(days=-1, seconds=72000))), 'sum': 3.4201, 'average': None, 'min': None, 'max': None, 'unit': None}
{'name': 'HKQuantityTypeIdentifierActiveEnergyBurned', 'type': 'SUM', 'start_date': datetime.datetime(2024, 7, 17, 12, 37, 53, tzinfo=datetime.timezone(datetime.timedelta(days=-1, seconds=72000))), 'end_date': datetime.datetime(2024, 7, 17, 13, 30, 59, tzinfo=datetime.timezone(datetime.timedelta(days=-1, seconds=72000))), 'sum': 436.867, 'average': None, 'min': None, 'max': None, 'unit': None}
{'name': 'HKQuantityTypeIdentifierActiveEnergyBurned', 'type': 'SUM', 'start_date': datetime.datetime(2024, 7, 17, 12, 37, 53, tzinfo=datetime.timezone(datetime.timedelta(days=-1, seconds=72000))), 'end_

{'name': 'HKWorkoutActivityTypeRunning',
 'source_id': 'CHARLIESAPPLEWATCH9_6_3',
 'created_at': datetime.datetime(2024, 7, 17, 13, 31, 7, tzinfo=datetime.timezone(datetime.timedelta(days=-1, seconds=72000))),
 'start': datetime.datetime(2024, 7, 17, 12, 37, 53, tzinfo=datetime.timezone(datetime.timedelta(days=-1, seconds=72000))),
 'start_date_str': '2024-07-17',
 'end': datetime.datetime(2024, 7, 17, 13, 30, 59, tzinfo=datetime.timezone(datetime.timedelta(days=-1, seconds=72000))),
 'unit': None,
 'value': None,
 'heartrate_variability': [],
 'metadata': [{'key': 'HKIndoorWorkout', 'value': '0'},
  {'key': 'HKElevationAscended', 'value': '207 cm'},
  {'key': 'HKWeatherHumidity', 'value': '6600 %'},
  {'key': 'HKTimeZone', 'value': 'America/New_York'},
  {'key': 'HKWeatherTemperature', 'value': '89.456 degF'},
  {'key': 'HKAverageMETs', 'value': '6.99379 kcal/hrÂ·kg'},
  {'key': 'HKIndoorWorkout', 'value': '0'},
  {'key': 'HKElevationAscended', 'value': '207 cm'},
  {'key': 'HKWeather

In [12]:
file = ROOT_DIR + 'export.xml'

data = HealthData.read(
    file,
    include_me=True,
    include_activity_summaries=True,
    include_correlations=False,
    include_records=True,
    include_workouts=True,
)

print(data.me.biological_sex)
print(f"{len(data.activity_summaries)} activity records")
print(f"{len(data.correlations)} correlations")
print(f"{len(data.records)} records")
print(f"{len(data.workouts)} workouts")

HKBiologicalSexMale
539 activity records
0 correlations
1732268 records
545 workouts


In [64]:
day = datetime.datetime.date(datetime.datetime(2024,7,17))

config = data.me.__dict__
day_activity_summary = [summary for summary in data.activity_summaries if summary.date.date() == day]
day_workouts = [workout for workout in data.workouts if workout.start.date() == day]
day_records = [record for record in data.records if record.start.date() == day]

In [65]:
config

{'date_of_birth': datetime.datetime(1998, 4, 11, 0, 0),
 'biological_sex': 'HKBiologicalSexMale',
 'blood_type': 'HKBloodTypeNotSet',
 'skin_type': 'HKFitzpatrickSkinTypeNotSet',
 'wheelchair_use': None}

In [58]:
day_activity_summary[0].__dict__

{'date': datetime.datetime(2024, 7, 17, 0, 0),
 'active_energy_burned': 3227.75,
 'active_energy_burned_goal': 500.0,
 'active_energy_burned_unit': 'Cal',
 'exercise_time': 240.0,
 'exercise_time_goal': 45.0,
 'stand_hours': 13.0,
 'stand_hours_goal': 8.0}

In [62]:
workout = day_workouts[1].__dict__

workout_start_time = workout['start']
workout_end_time = workout['end']
print(workout)
workout_records = [record for record in data.records if record.start >= workout_start_time and record.end <= workout_end_time]
workout_records_df = pd.DataFrame([record.__dict__ for record in workout_records])
workout_records_df

{'name': 'HKWorkoutActivityTypeRunning', 'source': 'Charlieâ€™s AppleÂ\xa0Watch', 'created_at': datetime.datetime(2024, 7, 17, 13, 31, 7, tzinfo=tzoffset(None, -14400)), 'start': datetime.datetime(2024, 7, 17, 12, 37, 53, tzinfo=tzoffset(None, -14400)), 'end': datetime.datetime(2024, 7, 17, 13, 30, 59, tzinfo=tzoffset(None, -14400)), 'metadata': [HKIndoorWorkout: 0, HKElevationAscended: 207 cm, HKWeatherHumidity: 6600 %, HKTimeZone: America/New_York, HKWeatherTemperature: 89.456 degF, HKAverageMETs: 6.99379 kcal/hrÂ·kg, HKIndoorWorkout: 0, HKElevationAscended: 207 cm, HKWeatherHumidity: 6600 %, HKTimeZone: America/New_York, HKWeatherTemperature: 89.456 degF, HKAverageMETs: 6.99379 kcal/hrÂ·kg], 'duration': 53.09000549912453, 'duration_unit': 'min', 'distance': 0.0, 'distance_unit': None, 'energy_burned': 0.0, 'energy_burned_unit': None, 'flights_climbed': 0.0, 'swimming_strokes': 0.0}


Unnamed: 0,name,source,created_at,start,end,metadata,unit,value,heart_rate
0,HKQuantityTypeIdentifierHeartRate,Charlieâ€™s AppleÂ Watch,2024-07-17 12:37:55-04:00,2024-07-17 12:37:53-04:00,2024-07-17 12:37:53-04:00,[HKMetadataKeyHeartRateMotionContext: 2],count/min,132.00000,[]
1,HKQuantityTypeIdentifierHeartRate,Charlieâ€™s AppleÂ Watch,2024-07-17 12:38:00-04:00,2024-07-17 12:37:55-04:00,2024-07-17 12:37:55-04:00,[HKMetadataKeyHeartRateMotionContext: 2],count/min,132.00000,[]
2,HKQuantityTypeIdentifierHeartRate,Charlieâ€™s AppleÂ Watch,2024-07-17 12:38:05-04:00,2024-07-17 12:38:01-04:00,2024-07-17 12:38:01-04:00,[HKMetadataKeyHeartRateMotionContext: 2],count/min,131.00000,[]
3,HKQuantityTypeIdentifierHeartRate,Charlieâ€™s AppleÂ Watch,2024-07-17 12:38:10-04:00,2024-07-17 12:38:04-04:00,2024-07-17 12:38:04-04:00,[HKMetadataKeyHeartRateMotionContext: 2],count/min,129.00000,[]
4,HKQuantityTypeIdentifierHeartRate,Charlieâ€™s AppleÂ Watch,2024-07-17 12:38:15-04:00,2024-07-17 12:38:10-04:00,2024-07-17 12:38:10-04:00,[HKMetadataKeyHeartRateMotionContext: 2],count/min,122.00000,[]
...,...,...,...,...,...,...,...,...,...
9052,HKQuantityTypeIdentifierRunningSpeed,Charlieâ€™s AppleÂ Watch,2024-07-17 13:30:48-04:00,2024-07-17 13:30:45-04:00,2024-07-17 13:30:45-04:00,[],mi/hr,6.87578,[]
9053,HKQuantityTypeIdentifierRunningSpeed,Charlieâ€™s AppleÂ Watch,2024-07-17 13:30:51-04:00,2024-07-17 13:30:48-04:00,2024-07-17 13:30:48-04:00,[],mi/hr,6.91889,[]
9054,HKQuantityTypeIdentifierRunningSpeed,Charlieâ€™s AppleÂ Watch,2024-07-17 13:30:52-04:00,2024-07-17 13:30:50-04:00,2024-07-17 13:30:50-04:00,[],mi/hr,7.05495,[]
9055,HKQuantityTypeIdentifierRunningSpeed,Charlieâ€™s AppleÂ Watch,2024-07-17 13:30:56-04:00,2024-07-17 13:30:53-04:00,2024-07-17 13:30:53-04:00,[],mi/hr,7.12622,[]


In [63]:
workout_records_df.name.value_counts()

name
HKQuantityTypeIdentifierBasalEnergyBurned                 1249
HKQuantityTypeIdentifierActiveEnergyBurned                1249
HKQuantityTypeIdentifierDistanceWalkingRunning            1234
HKQuantityTypeIdentifierStepCount                         1233
HKQuantityTypeIdentifierRunningSpeed                      1229
HKQuantityTypeIdentifierRunningPower                      1227
HKQuantityTypeIdentifierHeartRate                          637
HKQuantityTypeIdentifierRunningStrideLength                310
HKQuantityTypeIdentifierRunningVerticalOscillation         305
HKQuantityTypeIdentifierRunningGroundContactTime           295
HKQuantityTypeIdentifierAppleExerciseTime                   52
HKQuantityTypeIdentifierAppleStandTime                      10
HKQuantityTypeIdentifierWalkingSpeed                        10
HKQuantityTypeIdentifierWalkingStepLength                   10
HKQuantityTypeIdentifierWalkingDoubleSupportPercentage       5
HKQuantityTypeIdentifierEnvironmentalAudioExposure

## 3. Transform/Preprocess Data


In [32]:
import datetime 

## I want to create a config file and a data file that is either a parquet or a csv of all of the data that I have.

class UserConfig:
    def __init__(self, config):
        self.date_of_birth = config['date_of_birth']
        self.biological_sex = config['biological_sex']
        self.blood_type = config['blood_type']
        self.skin_type = config['skin_type']
        self.wheelchair_use = config['wheelchair_use']

    def __repr__(self):
        return f"UserConfig(date_of_birth={self.date_of_birth}, biological_sex={self.biological_sex}, blood_type={self.blood_type}, skin_type={self.skin_type}, wheelchair_use={self.wheelchair_use})"
    


In [68]:
day_records[-20].__dict__

{'name': 'HKCategoryTypeIdentifierAppleStandHour',
 'source': 'Charlieâ€™s AppleÂ\xa0Watch',
 'created_at': datetime.datetime(2024, 7, 17, 13, 0, 32, tzinfo=tzoffset(None, -14400)),
 'start': datetime.datetime(2024, 7, 17, 13, 0, tzinfo=tzoffset(None, -14400)),
 'end': datetime.datetime(2024, 7, 17, 14, 0, tzinfo=tzoffset(None, -14400)),
 'metadata': [],
 'unit': None,
 'value': 0,
 'heart_rate': []}

## 4. Load data (as parquet)


## 5. Build weekly calendar workout