In [66]:
import pandas as pd
import xml.etree.ElementTree as ET
import datetime as dt
import numpy as np

In [None]:
# Passing the path of the
# xml document to enable the
# parsing process
tree = ET.parse("wearable_data/apple_health_export/export.xml")

# getting the parent tag of
# the xml document
root = tree.getroot()

# printing the root (parent) tag
# of the xml document, along with
# its memory location
print(root)

# printing the attributes of the
# first tag from the parent 
print(root[0].attrib)

<Element 'HealthData' at 0x1112f06d0>
{'value': '2025-10-27 12:40:27 +0200'}


In [5]:
record_list = [x.attrib for x in root.iter('Record')]

In [9]:
record_data = pd.DataFrame(record_list)

# proper type to dates
for col in ['creationDate', 'startDate', 'endDate']:
    record_data[col] = pd.to_datetime(record_data[col])

# value is numeric, NaN if fails
record_data['value'] = pd.to_numeric(record_data['value'], errors='coerce')

# some records do not measure anything, just count occurences
# filling with 1.0 (= one time) makes it easier to aggregate
record_data['value'] = record_data['value'].fillna(1.0)

# shorter observation names
record_data['type'] = record_data['type'].str.replace('HKQuantityTypeIdentifier', '')
record_data['type'] = record_data['type'].str.replace('HKCategoryTypeIdentifier', '')
record_data.tail()

Unnamed: 0,type,sourceName,sourceVersion,unit,creationDate,startDate,endDate,value,device
1848429,HeartRateVariabilitySDNN,Laine’s Apple Watch,11.6,ms,2025-10-26 19:43:25+02:00,2025-10-26 19:42:24+02:00,2025-10-26 19:43:23+02:00,66.0651,"<<HKDevice: 0xd14618be0>, name:Apple Watch, ma..."
1848430,HeartRateVariabilitySDNN,Laine’s Apple Watch,11.6,ms,2025-10-26 21:19:27+02:00,2025-10-26 21:18:27+02:00,2025-10-26 21:19:26+02:00,48.4193,"<<HKDevice: 0xd14618be0>, name:Apple Watch, ma..."
1848431,HeartRateVariabilitySDNN,Laine’s Apple Watch,11.6,ms,2025-10-27 07:31:33+02:00,2025-10-27 07:30:32+02:00,2025-10-27 07:31:31+02:00,36.77,"<<HKDevice: 0xd14618be0>, name:Apple Watch, ma..."
1848432,HeartRateVariabilitySDNN,Laine’s Apple Watch,11.6,ms,2025-10-27 10:00:32+02:00,2025-10-27 09:59:32+02:00,2025-10-27 10:00:31+02:00,17.6583,"<<HKDevice: 0xd14618be0>, name:Apple Watch, ma..."
1848433,HeartRateVariabilitySDNN,Laine’s Apple Watch,11.6,ms,2025-10-27 11:19:28+02:00,2025-10-27 11:18:27+02:00,2025-10-27 11:19:12+02:00,31.8419,"<<HKDevice: 0xd14618be0>, name:Apple Watch, ma..."


In [10]:
#Unique types of records
record_data['type'].unique()

array(['Height', 'BodyMass', 'HeartRate', 'OxygenSaturation',
       'BloodPressureSystolic', 'BloodPressureDiastolic',
       'RespiratoryRate', 'StepCount', 'DistanceWalkingRunning',
       'BasalEnergyBurned', 'ActiveEnergyBurned', 'FlightsClimbed',
       'AppleExerciseTime', 'DistanceCycling', 'RestingHeartRate',
       'VO2Max', 'WalkingHeartRateAverage', 'EnvironmentalAudioExposure',
       'HeadphoneAudioExposure', 'WalkingDoubleSupportPercentage',
       'SixMinuteWalkTestDistance', 'AppleStandTime', 'WalkingSpeed',
       'WalkingStepLength', 'WalkingAsymmetryPercentage',
       'StairAscentSpeed', 'StairDescentSpeed',
       'HKDataTypeSleepDurationGoal', 'AppleWalkingSteadiness',
       'RunningStrideLength', 'RunningVerticalOscillation',
       'RunningGroundContactTime', 'HeartRateRecoveryOneMinute',
       'RunningPower', 'RunningSpeed', 'TimeInDaylight', 'PhysicalEffort',
       'SleepAnalysis', 'AppleStandHour', 'AudioExposureEvent',
       'HeadphoneAudioExposureEvent

In [43]:
record_data[record_data['type'] == 'SleepAnalysis'].head(n=100)

Unnamed: 0,type,sourceName,sourceVersion,unit,creationDate,startDate,endDate,value,device,metric,value_canonical,date
1822382,SleepAnalysis,Laine’s iPhone,14.3,,2021-01-10 06:30:09+02:00,2021-01-10 00:27:40+02:00,2021-01-10 04:18:26+02:00,1.0,,sleep_analysis,13846.0,2021-01-10
1822383,SleepAnalysis,Laine’s iPhone,14.3,,2021-01-10 06:30:09+02:00,2021-01-10 04:18:30+02:00,2021-01-10 05:38:52+02:00,1.0,,sleep_analysis,4822.0,2021-01-10
1822384,SleepAnalysis,Laine’s iPhone,14.3,,2021-01-10 06:30:09+02:00,2021-01-10 05:41:06+02:00,2021-01-10 06:30:09+02:00,1.0,,sleep_analysis,2943.0,2021-01-10
1822385,SleepAnalysis,Laine’s iPhone,14.3,,2021-01-11 06:30:06+02:00,2021-01-10 23:57:44+02:00,2021-01-11 06:30:06+02:00,1.0,,sleep_analysis,23542.0,2021-01-10
1822386,SleepAnalysis,Laine’s iPhone,14.3,,2021-01-12 07:31:29+02:00,2021-01-12 06:48:08+02:00,2021-01-12 06:58:19+02:00,1.0,,sleep_analysis,611.0,2021-01-12
...,...,...,...,...,...,...,...,...,...,...,...,...
1822477,SleepAnalysis,Laine’s iPhone,14.6,,2021-06-10 05:39:54+02:00,2021-06-10 00:59:51+02:00,2021-06-10 01:32:22+02:00,1.0,,sleep_analysis,1951.0,2021-06-10
1822478,SleepAnalysis,Laine’s iPhone,14.6,,2021-06-10 05:39:54+02:00,2021-06-10 01:32:29+02:00,2021-06-10 05:30:30+02:00,1.0,,sleep_analysis,14281.0,2021-06-10
1822479,SleepAnalysis,Laine’s iPhone,14.6,,2021-06-10 05:39:54+02:00,2021-06-10 05:30:37+02:00,2021-06-10 05:39:53+02:00,1.0,,sleep_analysis,556.0,2021-06-10
1822480,SleepAnalysis,Laine’s iPhone,14.6,,2021-06-13 05:30:21+02:00,2021-06-12 22:49:50+02:00,2021-06-13 05:30:21+02:00,1.0,,sleep_analysis,24031.0,2021-06-12


In [17]:
workout_list = [x.attrib for x in root.iter('Workout')]

# create DataFrame
workout_data = pd.DataFrame(workout_list)
workout_data['workoutActivityType'] = workout_data['workoutActivityType'].str.replace('HKWorkoutActivityType', '')
workout_data = workout_data.rename({"workoutActivityType": "Type"}, axis=1)

# proper type to dates
for col in ['creationDate', 'startDate', 'endDate']:
    workout_data[col] = pd.to_datetime(workout_data[col])

# convert string to numeric   
workout_data['duration'] = pd.to_numeric(workout_data['duration'])
workout_data.tail(n=10)

Unnamed: 0,Type,duration,durationUnit,sourceName,sourceVersion,device,creationDate,startDate,endDate
192,Walking,99.818815,min,Laine’s Apple Watch,11.5,"<<HKDevice: 0xd146054a0>, name:Apple Watch, ma...",2025-09-25 19:15:27+02:00,2025-09-25 17:35:30+02:00,2025-09-25 19:15:19+02:00
193,Walking,86.779552,min,Laine’s Apple Watch,11.5,"<<HKDevice: 0xd146054a0>, name:Apple Watch, ma...",2025-09-26 19:00:46+02:00,2025-09-26 17:33:54+02:00,2025-09-26 19:00:41+02:00
194,Walking,25.732054,min,Laine’s Apple Watch,11.5,"<<HKDevice: 0xd146054a0>, name:Apple Watch, ma...",2025-09-26 19:50:53+02:00,2025-09-26 19:25:07+02:00,2025-09-26 19:50:51+02:00
195,Walking,63.465625,min,Laine’s Apple Watch,11.5,"<<HKDevice: 0xd146054a0>, name:Apple Watch, ma...",2025-09-26 21:39:55+02:00,2025-09-26 20:36:21+02:00,2025-09-26 21:39:49+02:00
196,Walking,394.521379,min,Laine’s Apple Watch,11.5,"<<HKDevice: 0xd146054a0>, name:Apple Watch, ma...",2025-09-27 16:25:02+02:00,2025-09-27 09:50:07+02:00,2025-09-27 16:24:38+02:00
197,Walking,28.430048,min,Laine’s Apple Watch,11.5,"<<HKDevice: 0xd146054a0>, name:Apple Watch, ma...",2025-09-28 17:54:29+02:00,2025-09-28 17:25:59+02:00,2025-09-28 17:54:25+02:00
198,Elliptical,18.39625,min,Laine’s Apple Watch,11.6,"<<HKDevice: 0xd14619ea0>, name:Apple Watch, ma...",2025-10-19 18:00:26+02:00,2025-10-19 17:42:01+02:00,2025-10-19 18:00:25+02:00
199,Walking,28.97849,min,Laine’s Apple Watch,11.6,"<<HKDevice: 0xd14619ea0>, name:Apple Watch, ma...",2025-10-19 19:15:26+02:00,2025-10-19 18:46:26+02:00,2025-10-19 19:15:24+02:00
200,Elliptical,31.125672,min,Laine’s Apple Watch,11.6,"<<HKDevice: 0xd1461a170>, name:Apple Watch, ma...",2025-10-22 16:12:38+02:00,2025-10-22 15:41:28+02:00,2025-10-22 16:12:35+02:00
201,Walking,34.781556,min,Laine’s Apple Watch,11.6,"<<HKDevice: 0xd1461a170>, name:Apple Watch, ma...",2025-10-26 15:14:52+02:00,2025-10-26 14:40:02+02:00,2025-10-26 15:14:49+02:00


In [None]:
def get_workouts_from_to(df, start, end):
    start = pd.to_datetime(start, utc=True)
    end = pd.to_datetime(end, utc=True)
    workouts = df[df["creationDate"] >= start]
    workouts = workouts[workouts["creationDate"] <= end]
    return workouts

lower_time = dt.date(2021, 1, 1)
upper_time = dt.date(2022, 1, 1)
workouts = get_workouts_from_to(workout_data, lower_time, upper_time)

# or relative to the current day
today = dt.date.today()
xdaysago = today - dt.timedelta(days=7)
# first_of_month = today - dt.timedelta(days=today.day - 1)
workouts = get_workouts_from_to(workout_data, xdaysago, today)

# === Step 2: Map Apple types to canonical metric names ===

In [20]:
# mapping dictionary
type_mapping = {
    'BloodPressureSystolic': 'bp_systolic',
    'BloodPressureDiastolic': 'bp_diastolic',
    'ActiveEnergyBurned': 'active_energy',
    'StepCount': 'steps',
    'HeartRate': 'heart_rate',
    'HeartRateVariabilitySDNN': 'hrv_sdnn',
    'BasalEnergyBurned': 'basal_energy',
    'RestingHeartRate': 'resting_hr',
    'WalkingHeartRateAverage': 'walking_hr',
    'DistanceWalkingRunning': 'distance_walk_run',
    'SleepAnalysis': 'sleep_analysis',
    'AppleExerciseTime': 'physical_effort',   # Apple’s exercise minutes
    'Workout': 'physical_effort',             # optional if present
}

# apply mapping, keep unmapped types as-is for exploration
record_data['metric'] = record_data['type'].map(type_mapping).fillna(record_data['type'])



# === Step 3: Basic unit normalization (convert to canonical units) ===

In [44]:
def convert_units(row):
    """Convert Apple Health values to canonical units"""
    metric = row['metric']
    unit = row['unit']
    value = row['value']
    
    # Blood pressure: mmHg (convert from kPa if needed)
    if metric == 'bp_systolic' and unit == 'kPa':
        return value * 7.50062
    if metric == 'bp_diastolic' and unit == 'kPa':
        return value * 7.50062
    
    # Distance: km (convert from m)
    if metric == 'distance_walk_run' and unit == 'm':
        return value / 1000.0
    
    # Energy: kcal (convert from J)
    if metric in ['active_energy', 'basal_energy'] and unit == 'J':
        return value / 4184.0
    
    # Time: minutes (for physical effort)
    if metric == 'physical_effort' and unit == 's':
        return value / 60.0
    
    # Sleep: use duration in seconds from start-end
    if metric == 'sleep_analysis':
        duration = (row['endDate'] - row['startDate']).total_seconds()
        return duration
    
    # default: keep raw value
    return value

record_data['value_canonical'] = record_data.apply(convert_units, axis=1)
record_data.tail()

Unnamed: 0,type,sourceName,sourceVersion,unit,creationDate,startDate,endDate,value,device,metric,value_canonical,date
1848429,HeartRateVariabilitySDNN,Laine’s Apple Watch,11.6,ms,2025-10-26 19:43:25+02:00,2025-10-26 19:42:24+02:00,2025-10-26 19:43:23+02:00,66.0651,"<<HKDevice: 0xd14618be0>, name:Apple Watch, ma...",hrv_sdnn,66.0651,2025-10-26
1848430,HeartRateVariabilitySDNN,Laine’s Apple Watch,11.6,ms,2025-10-26 21:19:27+02:00,2025-10-26 21:18:27+02:00,2025-10-26 21:19:26+02:00,48.4193,"<<HKDevice: 0xd14618be0>, name:Apple Watch, ma...",hrv_sdnn,48.4193,2025-10-26
1848431,HeartRateVariabilitySDNN,Laine’s Apple Watch,11.6,ms,2025-10-27 07:31:33+02:00,2025-10-27 07:30:32+02:00,2025-10-27 07:31:31+02:00,36.77,"<<HKDevice: 0xd14618be0>, name:Apple Watch, ma...",hrv_sdnn,36.77,2025-10-27
1848432,HeartRateVariabilitySDNN,Laine’s Apple Watch,11.6,ms,2025-10-27 10:00:32+02:00,2025-10-27 09:59:32+02:00,2025-10-27 10:00:31+02:00,17.6583,"<<HKDevice: 0xd14618be0>, name:Apple Watch, ma...",hrv_sdnn,17.6583,2025-10-27
1848433,HeartRateVariabilitySDNN,Laine’s Apple Watch,11.6,ms,2025-10-27 11:19:28+02:00,2025-10-27 11:18:27+02:00,2025-10-27 11:19:12+02:00,31.8419,"<<HKDevice: 0xd14618be0>, name:Apple Watch, ma...",hrv_sdnn,31.8419,2025-10-27


# === Step 4: Derive date field (local calendar day) ===

In [45]:
record_data['date'] = record_data['startDate'].dt.date

In [42]:
len(record_data)

1848434

## Filter only this year

In [51]:
# Get the current year
current_year = dt.date.today().year

# Filter rows where startDate is in the current year
record_data = record_data[record_data['startDate'].dt.year == current_year]

In [52]:
filtered_counts = record_data[record_data['value_canonical'].notna() & (record_data['value_canonical'] != 0)] \
    .groupby('type')['value_canonical'].count()

# Display the result
print(filtered_counts)

type
ActiveEnergyBurned                150949
AppleExerciseTime                  13280
AppleStandHour                      5056
AppleStandTime                     19866
AppleWalkingSteadiness                41
AudioExposureEvent                   199
BasalEnergyBurned                 131053
DistanceCycling                     2703
DistanceWalkingRunning             87837
EnvironmentalAudioExposure          5885
FlightsClimbed                      2829
HeadphoneAudioExposure              3156
HeadphoneAudioExposureEvent           19
HeartRate                         131423
HeartRateRecoveryOneMinute            56
HeartRateVariabilitySDNN            1792
PhysicalEffort                    164000
RespiratoryRate                     2297
RestingHeartRate                     291
RunningGroundContactTime             372
RunningPower                        1039
RunningSpeed                        1051
RunningStrideLength                  371
RunningVerticalOscillation           379
SixMinuteWa

In [53]:
# Group by 'type' and calculate the mean of 'value_canonical'
average_values = record_data.groupby('type')['value_canonical'].mean()

# Filter out rows where the average is not equal to 1
filtered_values = average_values[average_values != 1]

# Display the result
print(filtered_values)

type
ActiveEnergyBurned                   1.084699
AppleStandTime                       2.316168
AppleWalkingSteadiness               0.973277
BasalEnergyBurned                    3.851476
DistanceCycling                      0.008049
DistanceWalkingRunning               0.039020
EnvironmentalAudioExposure          68.984757
FlightsClimbed                       1.467656
HeadphoneAudioExposure              72.267286
HeartRate                           97.892723
HeartRateRecoveryOneMinute          28.932641
HeartRateVariabilitySDNN            45.156980
PhysicalEffort                       3.718269
RespiratoryRate                     16.338920
RestingHeartRate                    61.171821
RunningGroundContactTime           272.231183
RunningPower                       178.296439
RunningSpeed                         8.352166
RunningStrideLength                  0.914043
RunningVerticalOscillation           8.860950
SixMinuteWalkTestDistance          499.487805
SleepAnalysis                

## === Step 5: Aggregate to daily level for the focus metrics ===

In [None]:
focus_metrics = ['heart_rate', 'physical_effort', 'sleep_analysis', 'basal_energy']

daily_summary = []

for day, group in record_data.groupby('date'):
    row = {'date': day}
    
    # --- Heart Rate (bpm) ---
    hr = group[group['metric'] == 'heart_rate']
    if not hr.empty:
        row['hr_mean'] = hr['value_canonical'].mean()
        row['hr_max'] = hr['value_canonical'].max()
        row['hr_min'] = hr['value_canonical'].min()
        row['hr_count'] = hr.shape[0]
    else:
        row['hr_mean'] = row['hr_max'] = row['hr_min'] = None
        row['hr_count'] = 0
    
    # --- Physical Effort (minutes or counts) ---
    pe = group[group['metric'] == 'physical_effort']
    if not pe.empty:
        row['effort_total'] = pe['value_canonical'].sum()
        row['effort_count'] = pe.shape[0]
        row['had_workout'] = True
    else:
        row['effort_total'] = 0
        row['effort_count'] = 0
        row['had_workout'] = False
    
    # --- Sleep Analysis (seconds → hours) ---
    sl = group[group['metric'] == 'sleep_analysis']
    if not sl.empty:
        total_sleep = sl['value_canonical'].sum()
        row['sleep_seconds'] = total_sleep
        row['sleep_hours'] = total_sleep / 3600.0
        row['sleep_periods'] = sl.shape[0]
    else:
        row['sleep_seconds'] = 0
        row['sleep_hours'] = 0
        row['sleep_periods'] = 0
    
    # --- Basal Energy (kcal) ---
    be = group[group['metric'] == 'basal_energy']
    if not be.empty:
        row['basal_energy_total'] = be['value_canonical'].sum()
        row['basal_energy_mean'] = be['value_canonical'].mean()
        row['basal_energy_count'] = be.shape[0]
    else:
        row['basal_energy_total'] = 0
        row['basal_energy_mean'] = 0
        row['basal_energy_count'] = 0
    
    # append row to list
    daily_summary.append(row)

# convert to DataFrame
daily_summary = (
    pd.DataFrame(daily_summary)
    .sort_values('date')
    .reset_index(drop=True)
)


           date     hr_mean  hr_max   hr_min  hr_count  effort_total  \
290  2025-10-18  122.571545   180.0  52.0000       320          40.0   
291  2025-10-19  107.338821   159.0  57.0000       898          70.0   
292  2025-10-20   77.287461   145.0  53.0000       206          19.0   
293  2025-10-21   76.531818   128.0  56.0000       207          18.0   
294  2025-10-22  107.705338   156.0  50.0000       629          49.0   
295  2025-10-23   85.843489   169.0  55.6577       294          27.0   
296  2025-10-24   90.334542   147.0  53.0000       291          14.0   
297  2025-10-25  100.653465   134.0  57.0000       499          27.0   
298  2025-10-26  119.226684   150.0  46.0000       673          51.0   
299  2025-10-27   65.224392   119.0  53.0000        92           3.0   

     effort_count  had_workout  sleep_seconds  sleep_hours  ...  \
290            40         True            0.0          0.0  ...   
291            70         True            0.0          0.0  ...   
292   

  daily_summary[f'{col}_pct_change'] = daily_summary[col].pct_change() * 100


In [60]:
# preview
print(daily_summary.head(10))

         date    hr_mean  hr_max  hr_min  hr_count  effort_total  \
0  2025-01-01  94.754027   145.0    53.0       152           9.0   
1  2025-01-02  67.925285   119.0    48.0       199          15.0   
2  2025-01-03  68.088741   130.0    53.0       187          25.0   
3  2025-01-04  81.559653   166.0    49.0       335          40.0   
4  2025-01-05  72.939575   105.0    61.0       110           3.0   
5  2025-01-06  82.321848   130.0    57.0       264          23.0   
6  2025-01-07  81.948835   137.0    59.0       249          21.0   
7  2025-01-08  82.391835   115.0    56.0       174          21.0   
8  2025-01-09  83.281527   124.0    63.0       220          15.0   
9  2025-01-10  87.844940   150.0    56.0       242          24.0   

   effort_count  had_workout  sleep_seconds  sleep_hours  ...  \
0             9         True        60010.0    16.669444  ...   
1            15         True        13074.0     3.631667  ...   
2            25         True            0.0     0.000000

## === Step 6: Compute day-to-day changes ===

In [61]:
# === Step: Compute day-to-day deltas (for main metrics) ===
for col in ['hr_mean', 'effort_total', 'sleep_hours', 'basal_energy_total']:
    daily_summary[f'{col}_delta'] = daily_summary[col].diff()
    daily_summary[f'{col}_pct_change'] = daily_summary[col].pct_change() * 100

  daily_summary[f'{col}_pct_change'] = daily_summary[col].pct_change() * 100


In [64]:
print(daily_summary.head(100))

          date    hr_mean   hr_max  hr_min  hr_count  effort_total  \
0   2025-01-01  94.754027  145.000    53.0       152           9.0   
1   2025-01-02  67.925285  119.000    48.0       199          15.0   
2   2025-01-03  68.088741  130.000    53.0       187          25.0   
3   2025-01-04  81.559653  166.000    49.0       335          40.0   
4   2025-01-05  72.939575  105.000    61.0       110           3.0   
..         ...        ...      ...     ...       ...           ...   
95  2025-04-06  92.519176  164.295    50.0       262          56.0   
96  2025-04-07  80.428611  167.128    47.0       254          41.0   
97  2025-04-08  86.505096  151.000    54.0       370          47.0   
98  2025-04-09  81.460324  137.000    56.0       265          41.0   
99  2025-04-10  80.375612  173.000    55.0       244          34.0   

    effort_count  had_workout  sleep_seconds  sleep_hours  ...  \
0              9         True        60010.0    16.669444  ...   
1             15         Tr

## == Adding z-score

In [67]:
# window size for baseline (number of previous days)
BASELINE_WINDOW = 7   # you can set to 14 if you prefer

metrics_for_z = ['hr_mean', 'effort_total', 'sleep_hours', 'basal_energy_total']

# sort by date just in case
daily_summary = daily_summary.sort_values('date').reset_index(drop=True)

for col in metrics_for_z:
    # compute rolling mean and std using only previous N days (exclude current)
    daily_summary[f'{col}_baseline_mean'] = (
        daily_summary[col]
        .shift(1)  # exclude today
        .rolling(window=BASELINE_WINDOW, min_periods=3)  # need at least 3 days to start
        .mean()
    )
    daily_summary[f'{col}_baseline_std'] = (
        daily_summary[col]
        .shift(1)
        .rolling(window=BASELINE_WINDOW, min_periods=3)
        .std()
    )
    
    # compute z-score relative to baseline
    # z = (today_value - baseline_mean) / baseline_std
    daily_summary[f'{col}_zscore'] = (
        (daily_summary[col] - daily_summary[f'{col}_baseline_mean']) /
        daily_summary[f'{col}_baseline_std']
    )

# Optionally: replace very large or NaN z-scores for stability
for col in metrics_for_z:
    z_col = f'{col}_zscore'
    daily_summary[z_col] = daily_summary[z_col].replace([np.inf, -np.inf], np.nan)
    # clip to [-5, 5] if you want to limit extreme outliers
    daily_summary[z_col] = daily_summary[z_col].clip(-5, 5)

In [69]:
print(daily_summary[['date'] + [f'{m}_zscore' for m in metrics_for_z]].tail(100))

           date  hr_mean_zscore  effort_total_zscore  sleep_hours_zscore  \
200  2025-07-20        0.318702             2.060714            0.985176   
201  2025-07-21        3.997612            -0.053812            0.448056   
202  2025-07-22       -0.005402             0.216348           -1.076612   
203  2025-07-23       -1.698156            -0.567912           -1.210694   
204  2025-07-24       -0.497965            -0.321545           -0.909374   
..          ...             ...                  ...                 ...   
295  2025-10-23       -0.637202            -0.466936                 NaN   
296  2025-10-24       -0.252004            -1.090756                 NaN   
297  2025-10-25        0.302070            -0.336817                 NaN   
298  2025-10-26        2.037117             0.936063                 NaN   
299  2025-10-27       -1.796578            -1.760598                 NaN   

     basal_energy_total_zscore  
200                   0.953002  
201                  

### Rule engine: mapping signals → suggestions

In [80]:
rules_df = pd.read_csv('RuleBank_v3.csv')

In [81]:
rules_df['symptom']

0                     sleep
1                     sleep
2                     sleep
3                    stress
4                    energy
5                 vasomotor
6                     sleep
7                     sleep
8                    stress
9                    energy
10              environment
11              food_timing
12                hydration
13    journaling/behavioral
14                   safety
15                   energy
Name: symptom, dtype: object

In [93]:
import re

In [94]:
def parse_condition(expr):
    """
    Example: '[sleep_efficiency@last_night < baseline − 1.0SD]'
    returns {'metric': 'sleep_efficiency', 'operator': '<', 'threshold': -1.0, 'window': 'last_night'}
    """
    pattern = r'\[(\w+)@(\w+)\s*([<>]=?|==)\s*baseline\s*[\-+]\s*([0-9.]+)SD\]'
    m = re.search(pattern, expr.replace('−', '-'))  # normalize minus symbol
    if m:
        metric, window, op, sd = m.groups()
        sign = -1 if '-' in expr else 1
        return {'metric': metric, 'window': window, 'operator': op, 'sd_offset': sign * float(sd)}
    return None

rules_df['parsed_conditions'] = rules_df['IF'].apply(parse_condition)

In [95]:
def parse_if_clause(expr: str):
    """
    Parse clauses like:
    '[sleep_efficiency@last_night < baseline − 1.0SD] AND [data_confidence ≥ 70]'
    into structured dict(s).
    """
    expr = str(expr).replace('−', '-')  # normalize minus sign
    
    metric_pattern = r'(\w+)@(\w+)\s*([<>]=?|==)\s*baseline\s*([+-])\s*([\d.]+)SD'
    conf_pattern = r'data_confidence\s*([<>]=?|==)\s*(\d+)'
    
    metric_match = re.search(metric_pattern, expr)
    conf_match = re.search(conf_pattern, expr)
    
    parsed = {}
    if metric_match:
        metric, window, op, sign, sd = metric_match.groups()
        parsed.update({
            'metric': metric,
            'window': window,
            'operator': op,
            'sd_offset': float(sd) * (-1 if sign == '-' else 1)
        })
    if conf_match:
        conf_op, conf_val = conf_match.groups()
        parsed.update({
            'data_confidence_operator': conf_op,
            'data_confidence_threshold': float(conf_val)
        })
    
    return parsed or None

rules_df['parsed'] = rules_df['IF'].apply(parse_if_clause)

In [96]:
rules_df.head()

Unnamed: 0,rule_id,symptom,IF,data_confidence_threshold,UNLESS,THEN,BECAUSE,REFER,SOURCE,NOTES,evidence_tier,windows_baseline_days,windows_trend_days,windows_event,cooldown_days,priority,unit_tests,parsed,parsed_conditions
0,R601,sleep,[sleep_efficiency@last_night < baseline − 1.0S...,70,Alcohol last 24h; Illness or fever present; Me...,Keep the same wake-up time every day.,Regular timing stabilizes sleep pressure and r...,No,"ALSWH submission, 2024; Lancet empowerment mod...",Data confidence = min(sensor_quality_score_wei...,B,30,3,last_night,2,90,"Fires: efficiency −1.2SD, confidence 82. || Su...","{'metric': 'sleep_efficiency', 'window': 'last...","{'metric': 'sleep_efficiency', 'window': 'last..."
1,R602,sleep,[sleep_latency@last_night > baseline + 1.0SD] ...,70,Late caffeine; Late heavy meal; Night/rotating...,Avoid caffeine after lunch.,Later caffeine can delay sleep onset.,No,"Lancet empowerment model, 2024",Sleep signals ignore GPX; sensor_quality falls...,B,30,3,last_night,2,90,"Fires: latency +1.1SD, confidence 78. || Suppr...","{'metric': 'sleep_latency', 'window': 'last_ni...","{'metric': 'sleep_latency', 'window': 'last_ni..."
2,R603,sleep,[sleep_awakenings@last_night > baseline + 1.0S...,70,Very hot night; Severe night sweats logged; Al...,Wear breathable layers you can remove quickly.,Cooling options reduce warm wake-ups.,No,"ALSWH submission, 2024","If temp missing, lower priority; GPX affects o...",B,30,3,last_night,2,90,Fires: awakenings +1.0SD & temp +0.6SD. || Sup...,"{'metric': 'sleep_awakenings', 'window': 'last...",
3,R604,stress,[hrv (night)@last_night < baseline − 1.0SD AND...,70,Acute stress event; Illness or fever present; ...,Do 10 minutes of mindful breathing today.,Brief relaxation can help on low-HRV days.,No,"Lancet mental health review, 2024",Require night window ≥80%; HRV σ_floor=5 ms; G...,B,30,3,last_night,1,80,Fires: HRV −1.2SD & stress tag. || Suppressed:...,,
4,R605,energy,[resting_hr (day)@last_24h > baseline + 1.0SD ...,70,Illness or fever present; Intense exercise day...,Take a brisk 10-minute walk mid-morning.,Light movement safely lifts daytime energy.,No,"ALSWH submission, 2024",activity_intensity from ActivityIntensity_ByDa...,B,30,3,last_24h,1,70,Fires: RHR +1.2SD & activity_intensity low. ||...,"{'metric': 'activity_intensity', 'window': 'la...",


In [97]:
def to_json_row(row):
    return {
        'rule_id': row['rule_id'],
        'symptom': row['symptom'],
        'condition': row['parsed'],
        'unless': [u.strip() for u in str(row['UNLESS']).split(';') if u.strip() and u.strip() != '—'],
        'then': row['THEN'],
        'because': row['BECAUSE'],
        'reference': row.get('REFER'),
        'source': row.get('SOURCE'),
        'evidence_tier': row.get('evidence_tier'),
        'window_params': {
            'baseline_days': row.get('windows_baseline_days'),
            'trend_days': row.get('windows_trend_days'),
            'event': row.get('windows_event'),
            'cooldown_days': row.get('cooldown_days'),
        },
        'priority': row.get('priority'),
    }

rules_json = [to_json_row(rules_df.iloc[i]) for i in range(len(rules_df))]

In [98]:
rules_json

[{'rule_id': 'R601',
  'symptom': 'sleep',
  'condition': {'metric': 'sleep_efficiency',
   'window': 'last_night',
   'operator': '<',
   'sd_offset': -1.0},
  'unless': ['Alcohol last 24h',
   'Illness or fever present',
   'Menstruation day 1–2',
   'Night/rotating shift',
   'Travel/time-zone shift',
   'Noisy/bright sleep environment',
   'Device off-wrist/loose fit'],
  'then': 'Keep the same wake-up time every day.',
  'because': 'Regular timing stabilizes sleep pressure and routine.',
  'reference': 'No',
  'source': 'ALSWH submission, 2024; Lancet empowerment model, 2024',
  'evidence_tier': 'B',
  'window_params': {'baseline_days': np.int64(30),
   'trend_days': np.int64(3),
   'event': 'last_night',
   'cooldown_days': np.int64(2)},
  'priority': np.int64(90)},
 {'rule_id': 'R602',
  'symptom': 'sleep',
  'condition': {'metric': 'sleep_latency',
   'window': 'last_night',
   'operator': '>',
   'sd_offset': 1.0},
  'unless': ['Late caffeine',
   'Late heavy meal',
   'Night/

In [84]:
import json

with open("rules_bank.json", "w", encoding="utf-8") as f:
    json.dump(rules_json, f, indent=2, ensure_ascii=False)

print(f"Saved {len(rules_json)} rules to rules_bank.json")

TypeError: Object of type int64 is not JSON serializable

### Fire rules

In [129]:
def evaluate_rule(rule, row):
    """Evaluate a single rule on a given day's data"""
    cond = rule["condition"]
    metric = cond["metric"]
    z_col = f"{metric}_zscore"

    if z_col not in row: 
        return False, None

    z_value = row[z_col]

    # zscore condition
    operator = cond["operator"]
    sd_offset = cond["sd_offset"]
    if operator == ">" and not (z_value > sd_offset):
        return False, None
    if operator == "<" and not (z_value < sd_offset):
        return False, None

    # confidence condition
    # conf_op = cond.get("data_confidence_operator", ">=")
    # conf_thresh = cond.get("data_confidence_threshold", 70)
    # if conf_op == ">=" and conf_value < conf_thresh:
    #     return False, None
    # if conf_op == "<=" and conf_value > conf_thresh:
    #     return False, None

    return True, z_value


In [136]:

def apply_rules(df, rules):
    """Apply all rules to the entire daily summary DataFrame"""
    results = []
    for date, row in df.iterrows():
        for rule in rules:
            fires, z_value = evaluate_rule(rule, row)
            if fires:
                results.append({
                    "date": row['date'],
                    "rule_id": rule["rule_id"],
                    "symptom": rule["symptom"],
                    "metric": rule["condition"]["metric"],
                    "z_value": round(z_value, 2),
                    "suggestion": rule["then"],
                    "because": rule["because"],
                    "priority": rule["priority"],
                    "source": rule["source"]
                })
    return pd.DataFrame(results)

In [131]:
with open("rules_bank.json", "r") as f:
    rules = json.load(f)

In [127]:
daily_summary.columns

Index(['date', 'hr_mean', 'hr_max', 'hr_min', 'hr_count', 'effort_total',
       'effort_count', 'had_workout', 'sleep_seconds', 'sleep_hours',
       'sleep_periods', 'basal_energy_total', 'basal_energy_mean',
       'basal_energy_count', 'hr_mean_delta', 'hr_mean_pct_change',
       'effort_total_delta', 'effort_total_pct_change', 'sleep_hours_delta',
       'sleep_hours_pct_change', 'basal_energy_total_delta',
       'basal_energy_total_pct_change', 'hr_mean_baseline_mean',
       'hr_mean_baseline_std', 'hr_mean_zscore', 'effort_total_baseline_mean',
       'effort_total_baseline_std', 'effort_total_zscore',
       'sleep_hours_baseline_mean', 'sleep_hours_baseline_std',
       'sleep_hours_zscore', 'basal_energy_total_baseline_mean',
       'basal_energy_total_baseline_std', 'basal_energy_total_zscore'],
      dtype='object')

In [137]:
results = apply_rules(daily_summary, rules)
print(results)

           date rule_id           symptom              metric  z_value  \
0    2025-01-05   R1002      low_activity        effort_total    -1.42   
1    2025-01-10   R1001       cardio_load             hr_mean     1.50   
2    2025-01-10   R1003     sleep_deficit         sleep_hours    -1.28   
3    2025-01-11   R1001       cardio_load             hr_mean     5.00   
4    2025-01-11   R1003     sleep_deficit         sleep_hours    -1.11   
..          ...     ...               ...                 ...      ...   
162  2025-10-16   R1004  metabolic_energy  basal_energy_total     1.03   
163  2025-10-18   R1001       cardio_load             hr_mean     3.33   
164  2025-10-24   R1002      low_activity        effort_total    -1.09   
165  2025-10-26   R1001       cardio_load             hr_mean     2.04   
166  2025-10-27   R1002      low_activity        effort_total    -1.76   

                                            suggestion  \
0    Stand and stretch for one minute each hour or ..

In [139]:
results.tail(10)

Unnamed: 0,date,rule_id,symptom,metric,z_value,suggestion,because,priority,source
157,2025-10-09,R1001,cardio_load,hr_mean,1.19,Drink more water and take a light recovery wal...,Elevated average heart rate can indicate dehyd...,90,Derived from Apple Watch heart rate trends
158,2025-10-11,R1004,metabolic_energy,basal_energy_total,1.26,Ensure adequate hydration and balanced meals.,Higher basal energy expenditure may reflect st...,70,Apple Watch basal energy trend
159,2025-10-13,R1001,cardio_load,hr_mean,1.72,Drink more water and take a light recovery wal...,Elevated average heart rate can indicate dehyd...,90,Derived from Apple Watch heart rate trends
160,2025-10-14,R1002,low_activity,effort_total,-1.15,Stand and stretch for one minute each hour or ...,Low daily activity can lower energy and sleep ...,80,Apple Watch activity rings
161,2025-10-16,R1001,cardio_load,hr_mean,1.32,Drink more water and take a light recovery wal...,Elevated average heart rate can indicate dehyd...,90,Derived from Apple Watch heart rate trends
162,2025-10-16,R1004,metabolic_energy,basal_energy_total,1.03,Ensure adequate hydration and balanced meals.,Higher basal energy expenditure may reflect st...,70,Apple Watch basal energy trend
163,2025-10-18,R1001,cardio_load,hr_mean,3.33,Drink more water and take a light recovery wal...,Elevated average heart rate can indicate dehyd...,90,Derived from Apple Watch heart rate trends
164,2025-10-24,R1002,low_activity,effort_total,-1.09,Stand and stretch for one minute each hour or ...,Low daily activity can lower energy and sleep ...,80,Apple Watch activity rings
165,2025-10-26,R1001,cardio_load,hr_mean,2.04,Drink more water and take a light recovery wal...,Elevated average heart rate can indicate dehyd...,90,Derived from Apple Watch heart rate trends
166,2025-10-27,R1002,low_activity,effort_total,-1.76,Stand and stretch for one minute each hour or ...,Low daily activity can lower energy and sleep ...,80,Apple Watch activity rings
