In [1]:
import pandas as pd 
import xml.etree.ElementTree as ET

In [2]:
xml_path = "export.xml"
tree = ET.parse(xml_path)
root = tree.getroot()

# Create a set to store unique @type values.
types_set = []

# Iterate through the XML elements and extract @type attribute.
for record in root.findall('.//Record'):
    type_attribute = record.get('type')
    if type_attribute:
        types_set.append(type_attribute)

In [7]:
types_sets =pd.Series(types_set).unique()

In [12]:
a = pd.DataFrame(types_sets, columns=['type'])
a[a['type'].str.contains('Running')]

Unnamed: 0,type
5,HKQuantityTypeIdentifierDistanceWalkingRunning
27,HKQuantityTypeIdentifierRunningStrideLength
28,HKQuantityTypeIdentifierRunningVerticalOscilla...
29,HKQuantityTypeIdentifierRunningGroundContactTime
31,HKQuantityTypeIdentifierRunningPower
33,HKQuantityTypeIdentifierRunningSpeed


In [4]:
REQUIRED_RECORD_TYPES = {
    'HKQuantityTypeIdentifierHeartRateVariabilitySDNN',
    'HKCategoryTypeIdentifierSleepAnalysis',
    'HKQuantityTypeIdentifierActiveEnergyBurned',
    'HKQuantityTypeIdentifierVO2Max',
    'HKQuantityTypeIdentifierHeartRate',
    'HKQuantityTypeIdentifierDistanceWalkingRunning'
}
def parse_apple_health_xml(uploaded_file):
    record_list = []
    workout_list = []
    try:
        for event, elem in ET.iterparse(uploaded_file, events=('end',)):
            if event == 'end':
                if elem.tag == 'Workout':
                    workout_list.append(elem.attrib)
                elif elem.tag == 'Record':
                    record_list.append(elem.attrib)
                elem.clear()

        workouts_df = pd.DataFrame(workout_list)
        records_df = pd.DataFrame(record_list)
        
        # Convert date columns for both dataframes
        for df in [workouts_df, records_df]:
            if not df.empty:
                for col in ['creationDate', 'startDate', 'endDate']:
                    if col in df.columns:
                        df[col] = pd.to_datetime(df[col], errors='coerce')
        
        # Convert numeric columns for workouts
        if not workouts_df.empty:
            for col in ['duration', 'totalDistance', 'totalEnergyBurned']:
                if col in workouts_df.columns:
                    workouts_df[col] = pd.to_numeric(workouts_df[col], errors='coerce')
        
        # For records, convert value ONLY for non-sleep-analysis records.
        if not records_df.empty:
            # This is safer: we leave 'value' as an object and convert it in each
            # metric function as needed. The only truly essential numeric is duration_seconds for sleep.
            sleep_mask = records_df['type'] == 'HKCategoryTypeIdentifierSleepAnalysis'
            if sleep_mask.any():
                records_df.loc[sleep_mask, 'duration_seconds'] = (records_df.loc[sleep_mask, 'endDate'] - records_df.loc[sleep_mask, 'startDate']).dt.total_seconds()

        return {'workouts': workouts_df, 'records': records_df}
    except Exception as e:
        print(f"An error occurred during parsing: {e}")
        return None


In [5]:
workout_list = []
for event, elem in ET.iterparse('export.xml', events=('end',)):
    if event == 'end':
        if elem.tag == 'Workout':
            workout_list.append(elem)
    

In [14]:
workout_list.sort(key=lambda x: x.get('startDate', ''), reverse=True)

In [15]:
workout_list[0].get('')

In [16]:
data = parse_apple_health_xml('export.xml')

In [17]:

data['workouts'].sort_values(by='startDate',ascending=False, inplace=True)

In [18]:
data['workouts'].info()

<class 'pandas.core.frame.DataFrame'>
Index: 308 entries, 307 to 0
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype                    
---  ------               --------------  -----                    
 0   workoutActivityType  308 non-null    object                   
 1   duration             308 non-null    float64                  
 2   durationUnit         308 non-null    object                   
 3   sourceName           308 non-null    object                   
 4   sourceVersion        308 non-null    object                   
 5   creationDate         308 non-null    datetime64[ns, UTC+02:00]
 6   startDate            308 non-null    datetime64[ns, UTC+02:00]
 7   endDate              308 non-null    datetime64[ns, UTC+02:00]
 8   device               258 non-null    object                   
dtypes: datetime64[ns, UTC+02:00](3), float64(1), object(5)
memory usage: 24.1+ KB


In [93]:
data['records'].sort_values(by='startDate', ascending=False, inplace=True)

In [94]:
data['records'][data['records']['type']=='HKCategoryTypeIdentifierSleepAnalysis'].head()

Unnamed: 0,type,sourceName,sourceVersion,unit,creationDate,startDate,endDate,value,device,duration_seconds
2715478,HKCategoryTypeIdentifierSleepAnalysis,Thibaut’s Apple Watch,11.6,,2025-08-17 10:16:01+02:00,2025-08-17 09:30:58+02:00,2025-08-17 09:53:58+02:00,HKCategoryValueSleepAnalysisAsleepREM,,1380.0
2715477,HKCategoryTypeIdentifierSleepAnalysis,Thibaut’s Apple Watch,11.6,,2025-08-17 10:16:01+02:00,2025-08-17 08:48:28+02:00,2025-08-17 09:30:58+02:00,HKCategoryValueSleepAnalysisAsleepCore,,2550.0
2715476,HKCategoryTypeIdentifierSleepAnalysis,Thibaut’s Apple Watch,11.6,,2025-08-17 10:16:01+02:00,2025-08-17 08:44:28+02:00,2025-08-17 08:48:28+02:00,HKCategoryValueSleepAnalysisAwake,,240.0
2715475,HKCategoryTypeIdentifierSleepAnalysis,Thibaut’s Apple Watch,11.6,,2025-08-17 10:16:01+02:00,2025-08-17 08:21:28+02:00,2025-08-17 08:44:28+02:00,HKCategoryValueSleepAnalysisAsleepCore,,1380.0
2715474,HKCategoryTypeIdentifierSleepAnalysis,Thibaut’s Apple Watch,11.6,,2025-08-17 10:16:01+02:00,2025-08-17 08:20:58+02:00,2025-08-17 08:21:28+02:00,HKCategoryValueSleepAnalysisAwake,,30.0


### Sleep Analysis

##### Sleep Score
|Component | Max Points | How it's Calculated | Reason|
|:--- |:---:|:---|:---|
|Total Sleep Duration |	30	| Scored linearly up to a target of 8 hours. More than 8 hours provides no extra points. | Total sleep is a foundational pillar of health.
|Deep Sleep Percentage	| 30|	Max points are awarded for hitting the target of 18%. The score decreases the further you are from this target.	|The most physically restorative stage. Quality over pure quantity.|
|REM Sleep Percentage	|20|	Max points are awarded for hitting the target of 22%. The score decreases the further you are from this target.	|Crucial for memory consolidation and mental recovery.|
|Sleep Continuity	| 20 |	Based on total time awake. Less than 30 mins of awake time gets a perfect score.	|Frequent awakenings disrupt the sleep cycle and reduce overall quality.|

Total	100		

In [95]:
df = data['records']
sleep_df = df[df['type'] == 'HKCategoryTypeIdentifierSleepAnalysis'].copy()
sleep_df.head()

Unnamed: 0,type,sourceName,sourceVersion,unit,creationDate,startDate,endDate,value,device,duration_seconds
2715478,HKCategoryTypeIdentifierSleepAnalysis,Thibaut’s Apple Watch,11.6,,2025-08-17 10:16:01+02:00,2025-08-17 09:30:58+02:00,2025-08-17 09:53:58+02:00,HKCategoryValueSleepAnalysisAsleepREM,,1380.0
2715477,HKCategoryTypeIdentifierSleepAnalysis,Thibaut’s Apple Watch,11.6,,2025-08-17 10:16:01+02:00,2025-08-17 08:48:28+02:00,2025-08-17 09:30:58+02:00,HKCategoryValueSleepAnalysisAsleepCore,,2550.0
2715476,HKCategoryTypeIdentifierSleepAnalysis,Thibaut’s Apple Watch,11.6,,2025-08-17 10:16:01+02:00,2025-08-17 08:44:28+02:00,2025-08-17 08:48:28+02:00,HKCategoryValueSleepAnalysisAwake,,240.0
2715475,HKCategoryTypeIdentifierSleepAnalysis,Thibaut’s Apple Watch,11.6,,2025-08-17 10:16:01+02:00,2025-08-17 08:21:28+02:00,2025-08-17 08:44:28+02:00,HKCategoryValueSleepAnalysisAsleepCore,,1380.0
2715474,HKCategoryTypeIdentifierSleepAnalysis,Thibaut’s Apple Watch,11.6,,2025-08-17 10:16:01+02:00,2025-08-17 08:20:58+02:00,2025-08-17 08:21:28+02:00,HKCategoryValueSleepAnalysisAwake,,30.0


In [96]:

# Find the last night's sleep session
df_timezone = sleep_df['startDate'].dt.tz
today_cheat = sleep_df['startDate'].max().normalize() 
yesterday_noon = today_cheat - pd.DateOffset(hours=12) if df_timezone else pd.Timestamp.now().normalize() - pd.DateOffset(hours=12)
last_night_sleep = sleep_df[sleep_df['startDate'] >= yesterday_noon].copy()
# --- Calculate Core Sleep Metrics ---
stage_durations = last_night_sleep.groupby('value')['duration_seconds'].sum()
time_in_deep_s = stage_durations.get('HKCategoryValueSleepAnalysisAsleepDeep', 0)
time_in_rem_s = stage_durations.get('HKCategoryValueSleepAnalysisAsleepREM', 0)
time_in_light_s = stage_durations.get('HKCategoryValueSleepAnalysisAsleepCore', 0)
time_awake_s = stage_durations.get('HKCategoryValueSleepAnalysisAwake', 0)

total_asleep_s = time_in_deep_s + time_in_rem_s + time_in_light_s

total_asleep_h = total_asleep_s / 3600


# --- Component Scoring (out of 100) ---

# 1. Total Sleep Duration Score (30 points)
duration_score = min(1, total_asleep_h / 8.0) * 30

# 2. Deep Sleep Score (30 points)
deep_percentage = time_in_deep_s / total_asleep_s
deep_target = 0.18 # Target 18%
# Score is 1 at target, 0 if it's twice the target distance away
deep_score = max(0, 1 - abs(deep_percentage - deep_target) / deep_target) * 30
# 3. REM Sleep Score (20 points)
rem_percentage = time_in_rem_s / total_asleep_s
rem_target = 0.22 # Target 22%
rem_score = max(0, 1 - abs(rem_percentage - rem_target) / rem_target) * 20

# 4. Sleep Continuity Score (20 points)
# Score based on total time awake during the sleep session
continuity_score = max(0, 1 - (time_awake_s / 3600)) * 20 # Perfect score for <1 min awake, 0 for >60min

# --- Final Score ---
total_score = int(duration_score + deep_score + rem_score + continuity_score)

help_text = (f"Score Breakdown:\n"
                f"- Duration: {duration_score:.0f}/30 ({total_asleep_h:.1f}h)\n"
                f"- Deep Sleep: {deep_score:.0f}/30 ({deep_percentage:.1%})\n"
                f"- REM Sleep: {rem_score:.0f}/20 ({rem_percentage:.1%})\n"
                f"- Continuity: {continuity_score:.0f}/20 ({time_awake_s/60:.0f}m awake)")

print(f"Total Sleep Score: {total_score}/100")
print(help_text)

Total Sleep Score: 71/100
Score Breakdown:
- Duration: 30/30 (9.1h)
- Deep Sleep: 8/30 (4.8%)
- REM Sleep: 16/20 (26.7%)
- Continuity: 18/20 (6m awake)


In [97]:
df = data['records']
hrv_df = df[df['type'] == 'HKQuantityTypeIdentifierHeartRateVariabilitySDNN'].copy()
sleep_df = df[df['type'] == 'HKCategoryTypeIdentifierSleepAnalysis'].copy()
active_energy_df = df[df['type'] == 'HKQuantityTypeIdentifierActiveEnergyBurned'].copy()

In [98]:
hrv_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5410 entries, 2733909 to 2728500
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype                    
---  ------            --------------  -----                    
 0   type              5410 non-null   object                   
 1   sourceName        5410 non-null   object                   
 2   sourceVersion     5410 non-null   object                   
 3   unit              5410 non-null   object                   
 4   creationDate      5410 non-null   datetime64[ns, UTC+02:00]
 5   startDate         5410 non-null   datetime64[ns, UTC+02:00]
 6   endDate           5410 non-null   datetime64[ns, UTC+02:00]
 7   value             5410 non-null   object                   
 8   device            5410 non-null   object                   
 9   duration_seconds  0 non-null      float64                  
dtypes: datetime64[ns, UTC+02:00](3), float64(1), object(6)
memory usage: 464.9+ KB


In [99]:

df = data['records']
hrv_df = df[df['type'] == 'HKQuantityTypeIdentifierHeartRateVariabilitySDNN'].copy()
sleep_df = df[df['type'] == 'HKCategoryTypeIdentifierSleepAnalysis'].copy()
active_energy_df = df[df['type'] == 'HKQuantityTypeIdentifierActiveEnergyBurned'].copy()


# --- Model Constants ---
MAX_DAILY_ACTIVITY_BURN = 2000; MAX_ACTIVITY_DEPLETION_POINTS = 50; PASSIVE_DRAIN_PER_HOUR = 3
RECHARGE_POINTS = {'Deep': 20, 'REM': 12, 'Light': 6, 'Awake': -15} # Points per hour

# --- Time Setup ---
df_timezone = df['endDate'].dt.tz if 'endDate' in df else None
today = pd.Timestamp.now(tz=df_timezone).normalize() if pd.Timestamp.now(tz=df_timezone).normalize() == df['endDate'].max().normalize() else df['endDate'].max().normalize()
print(f"Today: {today} ")
last_night_start = today - pd.DateOffset(hours=12); last_night_end = today + pd.DateOffset(hours=12)

# --- Step 1: The Recharge Engine ---
last_night_sleep = sleep_df[(sleep_df['startDate'] >= last_night_start) & (sleep_df['endDate'] < last_night_end)]


stage_durations_s = last_night_sleep.groupby('value')['duration_seconds'].sum()
deep_h = stage_durations_s.get('HKCategoryValueSleepAnalysisAsleepDeep', 0) / 3600
rem_h = stage_durations_s.get('HKCategoryValueSleepAnalysisAsleepREM', 0) / 3600
light_h = stage_durations_s.get('HKCategoryValueSleepAnalysisAsleepCore', 0) / 3600
awake_h = stage_durations_s.get('HKCategoryValueSleepAnalysisAwake', 0) / 3600

# Calculate raw score from sleep stages
raw_recharge_score = ((deep_h * RECHARGE_POINTS['Deep']) + 
                        (rem_h * RECHARGE_POINTS['REM']) + 
                        (light_h * RECHARGE_POINTS['Light']) + 
                        (awake_h * RECHARGE_POINTS['Awake']))
print(f"Raw Recharge Score: {raw_recharge_score:.0f} pts (Deep: {deep_h:.1f}h, REM: {rem_h:.1f}h, Light: {light_h:.1f}h, Awake: {awake_h:.1f}h)")
# Apply HRV as a quality multiplier
hrv_last_night = hrv_df[(hrv_df['endDate'] >= last_night_start) & (hrv_df['endDate'] < last_night_end)]
if not hrv_last_night.empty:
    avg_hrv_raw = hrv_last_night['value'].astype(float).mean()
    unit = hrv_last_night['unit'].iloc[0] if 'unit' in hrv_last_night.columns and not hrv_last_night['unit'].empty else None
    if unit == 'ms': avg_hrv = avg_hrv_raw
    elif avg_hrv_raw < 1.0: avg_hrv = avg_hrv_raw * 1000
    else: avg_hrv = avg_hrv_raw
else: avg_hrv = 45 # Use baseline if no HRV data
print(f"Average HRV Last Night: {avg_hrv:.1f} ms")

hrv_multiplier = 1 + max(-0.25, min(0.25, (avg_hrv - 45) / 45)) # +/- 25% effect
morning_battery_level = max(20, min(100, raw_recharge_score * hrv_multiplier))
print(f"Morning Body Battery Level: {morning_battery_level:.0f} pts (Recharge Score: {raw_recharge_score:.0f}, HRV Multiplier: {hrv_multiplier:.2f}x)")
# --- Step 2: The Depletion Engine ---
active_energy_today = active_energy_df[active_energy_df['endDate'] >= today]
calories_burned_today = active_energy_today['value'].astype(float).sum()
activity_depletion = (calories_burned_today / MAX_DAILY_ACTIVITY_BURN) * MAX_ACTIVITY_DEPLETION_POINTS
print(f"Calories Burned Today: {calories_burned_today:.0f} kcal, Activity Depletion: {activity_depletion:.0f} pts")
wake_up_time = last_night_sleep['endDate'].max()
now_corrected = pd.Timestamp.now(tz=df_timezone) if pd.Timestamp.now(tz=df_timezone) == df['endDate'].max() else df['endDate'].max()
print(f"Wake Up Time: {wake_up_time}, Now Corrected: {now_corrected}")
hours_awake = max(0, (now_corrected - wake_up_time).total_seconds() / 3600)
passive_drain = hours_awake * PASSIVE_DRAIN_PER_HOUR
total_depletion_today = activity_depletion + passive_drain

# --- Step 3: Final Score ---
current_body_battery = max(0, min(100, morning_battery_level - total_depletion_today)) 

help_text = (f"Started day at: {morning_battery_level:.0f} pts (Recharge Score: {raw_recharge_score:.0f}, HRV Multiplier: {hrv_multiplier:.2f}x). "
                f"Depletion today: {total_depletion_today:.0f} pts.")

print(f"Current Body Battery: {current_body_battery:.0f} pts")

Today: 2025-08-17 00:00:00+02:00 
Raw Recharge Score: 74 pts (Deep: 0.4h, REM: 2.4h, Light: 6.2h, Awake: 0.1h)
Average HRV Last Night: 61.7 ms
Morning Body Battery Level: 92 pts (Recharge Score: 74, HRV Multiplier: 1.25x)
Calories Burned Today: 159 kcal, Activity Depletion: 4 pts
Wake Up Time: 2025-08-17 09:53:58+02:00, Now Corrected: 2025-08-17 16:34:23+02:00
Current Body Battery: 68 pts


In [100]:
health_data = data
df = health_data['records']
hrv_df = df[df['type'] == 'HKQuantityTypeIdentifierHeartRateVariabilitySDNN'].copy()
sleep_df = df[df['type'] == 'HKCategoryTypeIdentifierSleepAnalysis'].copy()
active_energy_df = df[df['type'] == 'HKQuantityTypeIdentifierActiveEnergyBurned'].copy()



In [101]:
print("HRV DataFrame:")
sleep_df.head(20)

HRV DataFrame:


Unnamed: 0,type,sourceName,sourceVersion,unit,creationDate,startDate,endDate,value,device,duration_seconds
2715478,HKCategoryTypeIdentifierSleepAnalysis,Thibaut’s Apple Watch,11.6,,2025-08-17 10:16:01+02:00,2025-08-17 09:30:58+02:00,2025-08-17 09:53:58+02:00,HKCategoryValueSleepAnalysisAsleepREM,,1380.0
2715477,HKCategoryTypeIdentifierSleepAnalysis,Thibaut’s Apple Watch,11.6,,2025-08-17 10:16:01+02:00,2025-08-17 08:48:28+02:00,2025-08-17 09:30:58+02:00,HKCategoryValueSleepAnalysisAsleepCore,,2550.0
2715476,HKCategoryTypeIdentifierSleepAnalysis,Thibaut’s Apple Watch,11.6,,2025-08-17 10:16:01+02:00,2025-08-17 08:44:28+02:00,2025-08-17 08:48:28+02:00,HKCategoryValueSleepAnalysisAwake,,240.0
2715475,HKCategoryTypeIdentifierSleepAnalysis,Thibaut’s Apple Watch,11.6,,2025-08-17 10:16:01+02:00,2025-08-17 08:21:28+02:00,2025-08-17 08:44:28+02:00,HKCategoryValueSleepAnalysisAsleepCore,,1380.0
2715474,HKCategoryTypeIdentifierSleepAnalysis,Thibaut’s Apple Watch,11.6,,2025-08-17 10:16:01+02:00,2025-08-17 08:20:58+02:00,2025-08-17 08:21:28+02:00,HKCategoryValueSleepAnalysisAwake,,30.0
2715473,HKCategoryTypeIdentifierSleepAnalysis,Thibaut’s Apple Watch,11.6,,2025-08-17 10:16:01+02:00,2025-08-17 07:38:58+02:00,2025-08-17 08:20:58+02:00,HKCategoryValueSleepAnalysisAsleepCore,,2520.0
2715472,HKCategoryTypeIdentifierSleepAnalysis,Thibaut’s Apple Watch,11.6,,2025-08-17 10:16:01+02:00,2025-08-17 06:56:28+02:00,2025-08-17 07:38:58+02:00,HKCategoryValueSleepAnalysisAsleepREM,,2550.0
2715471,HKCategoryTypeIdentifierSleepAnalysis,Thibaut’s Apple Watch,11.6,,2025-08-17 10:16:01+02:00,2025-08-17 05:51:28+02:00,2025-08-17 06:56:28+02:00,HKCategoryValueSleepAnalysisAsleepCore,,3900.0
2715470,HKCategoryTypeIdentifierSleepAnalysis,Thibaut’s Apple Watch,11.6,,2025-08-17 10:16:01+02:00,2025-08-17 05:19:28+02:00,2025-08-17 05:51:28+02:00,HKCategoryValueSleepAnalysisAsleepREM,,1920.0
2715469,HKCategoryTypeIdentifierSleepAnalysis,Thibaut’s Apple Watch,11.6,,2025-08-17 10:16:01+02:00,2025-08-17 04:37:28+02:00,2025-08-17 05:19:28+02:00,HKCategoryValueSleepAnalysisAsleepCore,,2520.0


In [102]:
hrv_df

Unnamed: 0,type,sourceName,sourceVersion,unit,creationDate,startDate,endDate,value,device,duration_seconds
2733909,HKQuantityTypeIdentifierHeartRateVariabilitySDNN,Thibaut’s Apple Watch,11.6,ms,2025-08-17 14:19:04+02:00,2025-08-17 14:18:03+02:00,2025-08-17 14:19:03+02:00,85.0976,"<<HKDevice: 0x77006a6a0>, name:Apple Watch, ma...",
2733908,HKQuantityTypeIdentifierHeartRateVariabilitySDNN,Thibaut’s Apple Watch,11.6,ms,2025-08-17 11:50:15+02:00,2025-08-17 11:49:14+02:00,2025-08-17 11:50:14+02:00,37.5832,"<<HKDevice: 0x77006a6a0>, name:Apple Watch, ma...",
2733907,HKQuantityTypeIdentifierHeartRateVariabilitySDNN,Thibaut’s Apple Watch,11.6,ms,2025-08-17 09:21:02+02:00,2025-08-17 09:20:01+02:00,2025-08-17 09:21:01+02:00,46.6296,"<<HKDevice: 0x77006a6a0>, name:Apple Watch, ma...",
2733906,HKQuantityTypeIdentifierHeartRateVariabilitySDNN,Thibaut’s Apple Watch,11.6,ms,2025-08-17 07:20:58+02:00,2025-08-17 07:19:58+02:00,2025-08-17 07:20:56+02:00,102.381,"<<HKDevice: 0x77006a6a0>, name:Apple Watch, ma...",
2733905,HKQuantityTypeIdentifierHeartRateVariabilitySDNN,Thibaut’s Apple Watch,11.6,ms,2025-08-17 05:20:56+02:00,2025-08-17 05:19:55+02:00,2025-08-17 05:20:54+02:00,53.2142,"<<HKDevice: 0x77006a6a0>, name:Apple Watch, ma...",
...,...,...,...,...,...,...,...,...,...,...
2728504,HKQuantityTypeIdentifierHeartRateVariabilitySDNN,Thibaut’s Apple Watch,10.0,ms,2023-11-17 10:04:18+02:00,2023-11-17 10:03:17+02:00,2023-11-17 10:04:13+02:00,23.4777,"<<HKDevice: 0x76f011920>, name:Apple Watch, ma...",
2728503,HKQuantityTypeIdentifierHeartRateVariabilitySDNN,Thibaut’s Apple Watch,10.0,ms,2023-11-17 06:00:00+02:00,2023-11-17 05:58:59+02:00,2023-11-17 05:59:58+02:00,144.416,"<<HKDevice: 0x76f011920>, name:Apple Watch, ma...",
2728502,HKQuantityTypeIdentifierHeartRateVariabilitySDNN,Thibaut’s Apple Watch,10.0,ms,2023-11-17 02:02:11+02:00,2023-11-17 02:01:10+02:00,2023-11-17 02:02:09+02:00,55.9046,"<<HKDevice: 0x76f011920>, name:Apple Watch, ma...",
2728501,HKQuantityTypeIdentifierHeartRateVariabilitySDNN,Thibaut’s Apple Watch,10.0,ms,2023-11-17 00:48:47+02:00,2023-11-17 00:47:46+02:00,2023-11-17 00:48:37+02:00,67.6049,"<<HKDevice: 0x76f011920>, name:Apple Watch, ma...",


In [19]:
calculate_body_battery(data)

NameError: name 'calculate_body_battery' is not defined

In [20]:
def calculate_sleep_coach(health_data):
    if 'records' not in health_data or health_data['records'].empty: print( 0, "Not enough data"
    sleep_df = health_data['records'][health_data['records']['type'] == 'HKCategoryTypeIdentifierSleepAnalysis'].copy()
    if sleep_df.empty: print( 0, "No sleep data"

    df_timezone = sleep_df['startDate'].dt.tz
    yesterday_noon = pd.Timestamp.now(tz=df_timezone).normalize() - pd.DateOffset(hours=12) if df_timezone else pd.Timestamp.now().normalize() - pd.DateOffset(hours=12)
    last_night_sleep = sleep_df[sleep_df['startDate'] >= yesterday_noon]
    if last_night_sleep.empty: print( 0, "No sleep data for last night"

    asleep_df = last_night_sleep[last_night_sleep['value'] == 'HKCategoryValueSleepAnalysisAsleep']
    time_asleep_hours = asleep_df['duration_seconds'].sum() / 3600 if not asleep_df.empty else 0
    time_in_bed_hours = (last_night_sleep['endDate'].max() - last_night_sleep['startDate'].min()).total_seconds() / 3600
    
    duration_score = min(100, (time_asleep_hours / 8.0) * 100)
    efficiency_score = (time_asleep_hours / time_in_bed_hours) * 100 if time_in_bed_hours > 0 else 0
    total_score = int(0.6 * duration_score + 0.4 * efficiency_score)
    print( total_score, f"Asleep: {time_asleep_hours:.1f} hrs, Efficiency: {efficiency_score:.0f}%"

SyntaxError: invalid syntax. Perhaps you forgot a comma? (3141000653.py, line 2)

In [21]:
vo2_max_df = data['records'][data['records']['type'] == 'HKQuantityTypeIdentifierVO2Max'].copy().sort_values('creationDate', ascending=False)

In [22]:
vo2_max_df['value'] = vo2_max_df['value'].astype(float)

In [23]:
# For robustness, ensure timezone consistency
df_timezone = hrv_df['endDate'].dt.tz
today = pd.Timestamp.now(tz=df_timezone).normalize() if pd.Timestamp.now(tz=df_timezone).normalize() == hrv_df['endDate'].max().normalize() else hrv_df['endDate'].max().normalize()

# Define time windows
seven_days_ago = today - pd.Timedelta(days=7)
sixty_days_ago = today - pd.Timedelta(days=60)

# Filter for morning HRV (e.g., before 8 AM) for better accuracy
morning_hrv = hrv_df[hrv_df['endDate'].dt.hour < 8]
if len(morning_hrv) < 5: morning_hrv = hrv_df # Fallback if not enough morning readings

# Calculate averages
recent_avg = morning_hrv[morning_hrv['endDate'] >= seven_days_ago]['value'].astype(float).mean()
baseline_avg = morning_hrv[(morning_hrv['endDate'] >= sixty_days_ago) & (morning_hrv['endDate'] < seven_days_ago)]['value'].astype(float).mean()

# Check for NaN or None before any math/comparison
if pd.isna(recent_avg) or pd.isna(baseline_avg):
    print("No Status", 0, 0)
else:
    # Determine status based on deviation from baseline
    if recent_avg < (baseline_avg * 0.9):  # More than 10% below baseline
        print("Strained", recent_avg, baseline_avg)
    elif recent_avg > baseline_avg * 1.1:  # More than 10% above might be good recovery or a sign of fatigue
        print("Unbalanced", recent_avg, baseline_avg)
    else:
        print("Balanced", recent_avg, baseline_avg)

NameError: name 'hrv_df' is not defined

In [24]:
health_data = data

In [25]:
workouts_df = health_data['workouts'].copy()
vo2_max_df = health_data['records'][health_data['records']['type'] == 'HKQuantityTypeIdentifierVO2Max'].copy().sort_values('endDate', ascending=False)
hrv_df = health_data['records'][health_data['records']['type'] == 'HKQuantityTypeIdentifierHeartRateVariabilitySDNN'].copy()
hrv_df['value'] = hrv_df['value'].astype(float)
vo2_max_df['value'] = vo2_max_df['value'].astype(float)
# --- Pillar 1: VO2 Max Trend ---
vo2_max_trend = "Stable"
if len(vo2_max_df) < 2: print( "No Status", "A VO2 Max reading is required.")
latest_vo2_max = vo2_max_df.iloc[0]['value']
four_weeks_ago = vo2_max_df.iloc[0]['endDate'] - pd.Timedelta(days=28)
past_vo2_max_readings = vo2_max_df[vo2_max_df['endDate'] < four_weeks_ago]
if not past_vo2_max_readings.empty:
    past_vo2_max = past_vo2_max_readings.iloc[0]['value']
    if latest_vo2_max > past_vo2_max + 0.5: vo2_max_trend = "Increasing"
    elif latest_vo2_max < past_vo2_max - 0.5: vo2_max_trend = "Decreasing"
print('vo2max:', vo2_max_trend, latest_vo2_max, past_vo2_max)

# --- Pillar 2: Training Load (ACWR) ---
if 'totalEnergyBurned' in workouts_df.columns:
    energy_used = workouts_df['totalEnergyBurned'].fillna(workouts_df['duration'] / 60 * 5)
else: energy_used = workouts_df['duration'] / 60 * 5
workouts_df['trainingLoad'] = energy_used

daily_load = workouts_df.set_index('endDate')['trainingLoad'].resample('D').sum().sort_index()
if len(daily_load) < 14: print( "No Status", "At least two weeks of training data is needed.")

short_term_avg = daily_load.rolling(window=7).mean().iloc[-1]
long_term_avg = daily_load.rolling(window=28).mean().iloc[-1]
if pd.isna(long_term_avg) or long_term_avg == 0: print( "No Status", "Not enough long-term training data.")

load_ratio = short_term_avg / long_term_avg
load_status = "Optimal"
if load_ratio > 1.5: load_status = "High"
elif load_ratio < 0.8: load_status = "Low"
print('load:',load_status, short_term_avg, long_term_avg)
# --- Pillar 3: HRV Status ---
hrv_status, _, _ = 'Balanced',',',''

# --- The Decision Tree ---
has_trained_last_week = daily_load.iloc[-7:].sum() > 0

if not has_trained_last_week and vo2_max_trend == "Decreasing":
        out= "Detraining", "You have stopped training and your fitness is decreasing."
    
elif load_status == "High":
    if vo2_max_trend == "Decreasing":
        out= "Strained", "Your training load is very high, causing your fitness to decrease. This is a strong sign of overreaching."
    else: # Covers "Stable" and "Increasing" VO2 Max
        out= "Strained", "Your training load is very high and likely unsustainable. Prioritize recovery to avoid burnout, even if fitness is currently stable or increasing."

elif load_status == "Low":
    if vo2_max_trend in ["Increasing", "Stable"]:
            out= "Peaking", "Your load is reduced, allowing your body to recover for optimal performance. Ideal for a race."
    else: # Covers "Decreasing" VO2 Max
            out= "Recovery", "Your light training load is allowing your body to recover, but your fitness may be slightly declining."

elif vo2_max_trend == "Increasing": # This now only runs if load_status is "Optimal"
    if hrv_status == "Strained":
        out= "Productive", "Your fitness is improving, but ensure you are getting enough recovery (HRV is strained)."
    out= "Productive", "Your fitness is improving! Your current training load is effective."

elif vo2_max_trend == "Decreasing": # This now only runs if load_status is "Optimal"
    if hrv_status == "Strained":
        out= "Unproductive", "You are training, but your fitness is decreasing. Your body is struggling to recover (HRV is strained)."
    out= "Unproductive", "You are training, but your fitness is decreasing. Consider your recovery, sleep, and nutrition."

else: # Default case: Optimal load and Stable VO2 Max
    out= "Maintaining", "Your current training load is enough to maintain your fitness level."

print(out)

vo2max: Increasing 54.31 53.75
load: Low 1.8338627810752581 6.728849316660375
('Peaking', 'Your load is reduced, allowing your body to recover for optimal performance. Ideal for a race.')


In [26]:
daily_load.iloc[-7:]

endDate
2025-08-10 00:00:00+02:00    0.000000
2025-08-11 00:00:00+02:00    0.000000
2025-08-12 00:00:00+02:00    3.769536
2025-08-13 00:00:00+02:00    0.000000
2025-08-14 00:00:00+02:00    0.000000
2025-08-15 00:00:00+02:00    0.000000
2025-08-16 00:00:00+02:00    9.067503
Freq: D, Name: trainingLoad, dtype: float64

In [27]:
daily_load.last('7D')

  daily_load.last('7D')


endDate
2025-08-10 00:00:00+02:00    0.000000
2025-08-11 00:00:00+02:00    0.000000
2025-08-12 00:00:00+02:00    3.769536
2025-08-13 00:00:00+02:00    0.000000
2025-08-14 00:00:00+02:00    0.000000
2025-08-15 00:00:00+02:00    0.000000
2025-08-16 00:00:00+02:00    9.067503
Freq: D, Name: trainingLoad, dtype: float64

In [28]:
HKQuantityTypeIdentifierDistanceWalkingRunning

NameError: name 'HKQuantityTypeIdentifierDistanceWalkingRunning' is not defined

In [48]:
records_df = data['records'].copy()

In [None]:

running_workouts = workouts_df[workouts_df['workoutActivityType'].str.contains("Running", na=False)].copy()
distance_records = records_df[records_df['type'] == 'HKQuantityTypeIdentifierDistanceWalkingRunning'].copy()
#distance_records = distance_records[distance_records['sourceName'].str.contains("Apple Watch", na=False)].copy()
distance_records['value'] = distance_records['value'].astype(float)

In [53]:
distance_records[distance_records['sourceName'].str.contains("Watch")]

Unnamed: 0,type,sourceName,sourceVersion,unit,creationDate,startDate,endDate,value,device,duration_seconds
529024,HKQuantityTypeIdentifierDistanceWalkingRunning,Thibaut’s Apple Watch,11.0,km,2024-09-30 07:07:10+02:00,2024-09-30 06:54:07+02:00,2024-09-30 06:54:20+02:00,0.012937,"<<HKDevice: 0x7703a21c0>, name:Apple Watch, ma...",
529025,HKQuantityTypeIdentifierDistanceWalkingRunning,Thibaut’s Apple Watch,11.0,km,2024-09-30 08:46:37+02:00,2024-09-30 08:29:56+02:00,2024-09-30 08:30:53+02:00,0.007187,"<<HKDevice: 0x7703a21c0>, name:Apple Watch, ma...",
529026,HKQuantityTypeIdentifierDistanceWalkingRunning,Thibaut’s Apple Watch,11.0,km,2024-09-30 10:27:40+02:00,2024-09-30 10:17:03+02:00,2024-09-30 10:21:26+02:00,0.012218,"<<HKDevice: 0x7703a21c0>, name:Apple Watch, ma...",
529027,HKQuantityTypeIdentifierDistanceWalkingRunning,Thibaut’s Apple Watch,11.0,km,2024-09-30 10:49:46+02:00,2024-09-30 10:39:13+02:00,2024-09-30 10:39:24+02:00,0.011506,"<<HKDevice: 0x7703a21c0>, name:Apple Watch, ma...",
529028,HKQuantityTypeIdentifierDistanceWalkingRunning,Thibaut’s Apple Watch,11.0,km,2024-09-30 11:25:35+02:00,2024-09-30 11:14:07+02:00,2024-09-30 11:14:25+02:00,0.015198,"<<HKDevice: 0x7703a21c0>, name:Apple Watch, ma...",
...,...,...,...,...,...,...,...,...,...,...
818905,HKQuantityTypeIdentifierDistanceWalkingRunning,Thibaut’s Apple Watch,11.0,km,2025-08-17 16:02:02+02:00,2024-10-16 18:06:42+02:00,2024-10-16 18:13:14+02:00,0.751590,"<<HKDevice: 0x7703a2340>, name:Apple Watch, ma...",
818906,HKQuantityTypeIdentifierDistanceWalkingRunning,Thibaut’s Apple Watch,11.0,km,2025-08-17 16:02:02+02:00,2024-10-16 18:13:14+02:00,2024-10-16 18:18:18+02:00,0.838533,"<<HKDevice: 0x7703a2340>, name:Apple Watch, ma...",
818907,HKQuantityTypeIdentifierDistanceWalkingRunning,Thibaut’s Apple Watch,11.0,km,2025-08-17 16:02:02+02:00,2024-10-16 18:18:18+02:00,2024-10-16 18:23:23+02:00,0.871862,"<<HKDevice: 0x7703a2340>, name:Apple Watch, ma...",
818908,HKQuantityTypeIdentifierDistanceWalkingRunning,Thibaut’s Apple Watch,11.0,km,2025-08-17 16:02:02+02:00,2024-10-16 18:23:23+02:00,2024-10-16 18:28:27+02:00,0.866050,"<<HKDevice: 0x7703a2340>, name:Apple Watch, ma...",


In [38]:

# This will hold the calculated distances
calculated_distances = {}
count  = 0
startDate = running_workouts['startDate'].max().normalize()  # Ensure we start from the most recent workout date
print(f"Processing running workouts starting from: {startDate}")
for index, workout in running_workouts.iterrows():
    if count <3:
        workout_start = workout['startDate']
        workout_end = workout['endDate']
        print(workout_start, workout_end)
        if workout_start.normalize() == startDate:
            # Create a boolean mask to find all distance records within this workout's timeframe
            mask = (distance_records['startDate'] >= workout_start) & (distance_records['endDate'] <= workout_end)
        
            # Sum the 'value' (distance) of all matching records
            print(distance_records.loc[mask, 'value'])
            workout_distance = distance_records.loc[mask, 'value'].astype(float).sum()
            if workout_distance > 0:
                calculated_distances[index] = workout_distance
                count += 1
        else:
            pass   
        
        
    else:
        print("Reached limit of 3 workouts processed.")
        break

# Update the original workouts dataframe with the new, accurate distances
print(f"Calculated distances for {len(calculated_distances)} workouts.")
print(calculated_distances)
if calculated_distances:
    # Create a Series from the dictionary to update the DataFrame
    #distance_series = pd.Series(calculated_distances, name='totalDistance')
    #workouts_df = workouts_df.join(distance_series, how='left')
    pass

Processing running workouts starting from: 2025-08-12 00:00:00+02:00
2025-08-12 19:09:51+02:00 2025-08-12 20:00:38+02:00
Series([], Name: value, dtype: float64)
2025-08-05 19:03:27+02:00 2025-08-05 19:58:49+02:00
2025-08-03 10:53:48+02:00 2025-08-03 12:03:44+02:00
2025-07-29 09:29:27+02:00 2025-07-29 11:13:10+02:00
2025-07-29 09:29:20+02:00 2025-07-29 11:13:04+02:00
2025-07-26 08:02:15+02:00 2025-07-26 08:52:56+02:00
2025-07-15 17:11:26+02:00 2025-07-15 18:09:32+02:00
2025-07-11 18:08:52+02:00 2025-07-11 19:12:35+02:00
2025-07-02 19:32:17+02:00 2025-07-02 20:31:20+02:00
2025-06-22 09:00:43+02:00 2025-06-22 10:35:03+02:00
2025-06-07 09:00:14+02:00 2025-06-07 16:02:05+02:00
2025-06-07 08:59:34+02:00 2025-06-07 16:02:04+02:00
2025-06-03 18:08:12+02:00 2025-06-03 19:15:43+02:00
2025-05-25 10:28:59+02:00 2025-05-25 12:36:31+02:00
2025-05-25 10:28:59+02:00 2025-05-25 12:36:30+02:00
2025-05-22 16:51:51+02:00 2025-05-22 17:27:24+02:00
2025-05-18 08:40:34+02:00 2025-05-18 12:32:03+02:00
2025-05

In [57]:
def enrich_workouts_with_distance(workouts_df, records_df):
    """
    Matches running workouts with their corresponding distance records to calculate total distance.
    """
    if workouts_df.empty or records_df.empty:
        print("No workouts or records data available.")
        return workouts_df

    # Filter for only the necessary data to speed up processing
    running_workouts = workouts_df[workouts_df['workoutActivityType'].str.contains("Running", na=False)].copy()
    distance_records = records_df[records_df['type'] == 'HKQuantityTypeIdentifierDistanceWalkingRunning'].copy()
    distance_records = distance_records[distance_records['sourceName'].str.contains("Watch", na=False)].copy()
    distance_records['value'] = distance_records['value'].astype(float)
    print(f"Found {len(running_workouts)} running workouts and {len(distance_records)} distance records.")
    if running_workouts.empty or distance_records.empty:
        print("No running workouts or no distance records available.")
        return workouts_df # No running workouts or no distance data to match

    # This will hold the calculated distances
    calculated_distances = {}
    count  = 0
    
    for index, workout in running_workouts.iterrows():
        
        workout_start = workout['startDate']
        workout_end = workout['endDate']
        
        # Create a boolean mask to find all distance records within this workout's timeframe
        mask = (distance_records['startDate'] >= workout_start) & (distance_records['endDate'] <= workout_end)
        
        # Sum the 'value' (distance) of all matching records
        workout_distance = distance_records.loc[mask, 'value'].astype(float).sum()
        
        if workout_distance > 0:
            calculated_distances[index] = workout_distance
            

    # Update the original workouts dataframe with the new, accurate distances
    print(f"Calculated distances for {len(calculated_distances)} workouts.")
    print(calculated_distances)
    if calculated_distances:
        # Create a Series from the dictionary to update the DataFrame
        if 'totalDistance' in workouts_df.columns:
            workouts_df.drop(columns=['totalDistance'], inplace=True, errors='ignore')
        distance_series = pd.Series(calculated_distances, name='totalDistance')
        workouts_df = workouts_df.join(distance_series, how='left')
        
    return workouts_df

In [62]:
workouts_df = enrich_workouts_with_distance(workouts_df, data['records'])

Found 204 running workouts and 221435 distance records.
Calculated distances for 187 workouts.
{306: np.float64(8.928408774), 300: np.float64(7.7648241662999995), 299: np.float64(10.117475172499999), 298: np.float64(12.162609318), 297: np.float64(12.168415858), 294: np.float64(6.2987446039999995), 285: np.float64(10.107049568999999), 283: np.float64(10.203714197), 280: np.float64(10.04569461), 276: np.float64(13.665087155999998), 272: np.float64(44.96516363854), 271: np.float64(44.96516363854), 267: np.float64(10.156169720000001), 266: np.float64(17.785754134999998), 265: np.float64(17.785754134999998), 263: np.float64(6.285029736), 261: np.float64(32.8000237684), 262: np.float64(32.8000237684), 258: np.float64(10.41147610989419), 252: np.float64(2.1459547900000002), 251: np.float64(5.03982243), 250: np.float64(2.40169926), 249: np.float64(12.098519096), 248: np.float64(17.263523195), 247: np.float64(10.868438824485), 241: np.float64(2.431440824), 240: np.float64(7.80498476198), 239: n

In [63]:
workouts_df2[workouts_df2['workoutActivityType'].str.contains("Running", na=False)].head(10)

Unnamed: 0,workoutActivityType,duration,durationUnit,sourceName,sourceVersion,creationDate,startDate,endDate,device,trainingLoad,totalDistance
306,HKWorkoutActivityTypeRunning,45.234438,min,Thibaut’s Apple Watch,11.6,2025-08-12 20:02:43+02:00,2025-08-12 19:09:51+02:00,2025-08-12 20:00:38+02:00,"<<HKDevice: 0x770de6fa0>, name:Apple Watch, ma...",3.769536,8.928409
300,HKWorkoutActivityTypeRunning,31.98537,min,Thibaut’s Apple Watch,11.5,2025-08-05 20:05:09+02:00,2025-08-05 19:03:27+02:00,2025-08-05 19:58:49+02:00,"<<HKDevice: 0x770e20900>, name:Apple Watch, ma...",2.665447,7.764824
299,HKWorkoutActivityTypeRunning,64.285692,min,Thibaut’s Apple Watch,11.5,2025-08-03 12:04:07+02:00,2025-08-03 10:53:48+02:00,2025-08-03 12:03:44+02:00,"<<HKDevice: 0x770e20900>, name:Apple Watch, ma...",5.357141,10.117475
298,HKWorkoutActivityTypeRunning,103.716667,min,Strava,49012.0,2025-08-03 12:44:49+02:00,2025-07-29 09:29:27+02:00,2025-07-29 11:13:10+02:00,,8.643056,12.162609
297,HKWorkoutActivityTypeRunning,103.446218,min,WorkOutDoors,4.0,2025-07-29 11:13:34+02:00,2025-07-29 09:29:20+02:00,2025-07-29 11:13:04+02:00,"<<HKDevice: 0x770e20900>, name:Apple Watch, ma...",8.620518,12.168416
294,HKWorkoutActivityTypeRunning,43.77543,min,Thibaut’s Apple Watch,11.5,2025-07-26 09:11:34+02:00,2025-07-26 08:02:15+02:00,2025-07-26 08:52:56+02:00,"<<HKDevice: 0x770e20900>, name:Apple Watch, ma...",3.647953,6.298745
285,HKWorkoutActivityTypeRunning,54.03853,min,Thibaut’s Apple Watch,11.5,2025-07-15 18:09:42+02:00,2025-07-15 17:11:26+02:00,2025-07-15 18:09:32+02:00,"<<HKDevice: 0x770e20900>, name:Apple Watch, ma...",4.503211,10.10705
283,HKWorkoutActivityTypeRunning,60.190774,min,Thibaut’s Apple Watch,11.5,2025-07-11 19:12:45+02:00,2025-07-11 18:08:52+02:00,2025-07-11 19:12:35+02:00,"<<HKDevice: 0x770e20900>, name:Apple Watch, ma...",5.015898,10.203714
280,HKWorkoutActivityTypeRunning,57.414008,min,Thibaut’s Apple Watch,11.5,2025-07-02 20:31:33+02:00,2025-07-02 19:32:17+02:00,2025-07-02 20:31:20+02:00,"<<HKDevice: 0x770e20900>, name:Apple Watch, ma...",4.784501,10.045695
276,HKWorkoutActivityTypeRunning,80.916463,min,Thibaut’s Apple Watch,11.5,2025-06-22 10:35:16+02:00,2025-06-22 09:00:43+02:00,2025-06-22 10:35:03+02:00,"<<HKDevice: 0x770e20900>, name:Apple Watch, ma...",6.743039,13.665087


In [70]:
def find_best(workouts_df):
    races = {"5K": 5, "10K": 10, "Half-Marathon": 21.0975, "Marathon": 42.195, "50K": 50, "100K": 100}
    personal_bests = {}
    tolerance = 0.99
    for name, distance_km in races.items():
        race_attempts = workouts_df[
            (workouts_df['workoutActivityType'].str.contains("Running")) &
            (workouts_df['totalDistance'] >= distance_km * tolerance)
        ].copy()
        if not race_attempts.empty:
            race_attempts['pace_s_per_km'] = (race_attempts['duration'] * 60) / race_attempts['totalDistance']
            best_attempt = race_attempts.loc[race_attempts['pace_s_per_km'].idxmin()]
            best_time_seconds = best_attempt['pace_s_per_km'] * distance_km
            personal_bests[name] = best_time_seconds
    return personal_bests

In [65]:
personal_bests

{'5K': np.float64(1235.7795476423894),
 '10K': np.float64(2906.1801300692896),
 'Half-Marathon': np.float64(8223.97859418525),
 'Marathon': np.float64(22278.189591964154)}

In [104]:
def format_time_hms(seconds):
    if pd.isna(seconds): return None
    seconds = int(seconds)
    hours, remainder = divmod(seconds, 3600); minutes, sec = divmod(remainder, 60)
    return f"{hours:02d}:{minutes:02d}:{sec:02d}" if hours > 0 else f"{minutes:02d}:{sec:02d}"

def format_improvement(seconds):
    if pd.isna(seconds): return None
    seconds = int(seconds); sign = "-" if seconds < 0 else "+"; seconds = abs(seconds)
    minutes, sec = divmod(seconds, 60)
    return f"{sign}{minutes:02d}:{sec:02d}"

def calculate_race_predictor(health_data):
    if 'records' not in health_data or health_data['records'].empty: return pd.DataFrame(), "Not enough data"
    vo2_max_df = health_data['records'][health_data['records']['type'] == 'HKQuantityTypeIdentifierVO2Max'].copy().sort_values('endDate', ascending=False)
    vo2_max_df['value'] = vo2_max_df['value'].astype(float)
    if vo2_max_df.empty: return pd.DataFrame(), "No VO₂ Max data found."

    latest_vo2_max = vo2_max_df.iloc[0]['value']
    personal_bests = find_best(workouts_df)
    
    try:
        base_5k_seconds = (13.3 - (0.28 * latest_vo2_max)) * 60
        print(base_5k_seconds)
    except Exception:
        return pd.DataFrame(), "Could not calculate race times from VO₂ Max."

    races = {"5K": 5, "10K": 10, "Half-Marathon": 21.0975, "Marathon": 42.195, "50K": 50, "100K": 100}
    predictions = []
    for name, distance_km in races.items():
        predicted_improvement_sec = base_5k_seconds * (distance_km / 5)**1.06
        print(f"Predicted {name} time: {predicted_improvement_sec:.2f} seconds")
        pb_seconds = personal_bests.get(name)
        predicted_time_sec = pb_seconds + predicted_improvement_sec if pb_seconds else None
        predictions.append({
            "Distance": name,
            "Personal Best": format_time_hms(pb_seconds),
            "Predicted Improvement": format_improvement(predicted_improvement_sec),
            "Predicted Time": format_time_hms(predicted_time_sec)
        })

    prediction_df = pd.DataFrame(predictions)
    prediction_df.dropna(subset=['Personal Best'], inplace=True)  # Remove rows with no pb time
    info = f"Predictions based on a current VO₂ Max of {latest_vo2_max:.1f}."
    return prediction_df, info

In [105]:
calculate_race_predictor(data)

-114.40800000000003
Predicted 5K time: -114.41 seconds
Predicted 10K time: -238.53 seconds
Predicted Half-Marathon time: -526.30 seconds
Predicted Marathon time: -1097.30 seconds
Predicted 50K time: -1313.58 seconds
Predicted 100K time: -2738.72 seconds


(        Distance Personal Best Predicted Improvement Predicted Time
 0             5K         20:35                -01:54          18:41
 1            10K         48:26                -03:58          44:27
 2  Half-Marathon      02:17:03                -08:46       02:08:17
 3       Marathon      06:11:18                -18:17       05:53:00,
 'Predictions based on a current VO₂ Max of 54.3.')