Copyright ©2022. Stephen Rigden.
This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
You should have received a copy of the GNU General Public License along with this program.  If not, see <http://www.gnu.org/licenses/>.

# Export iPhone Health Data

See `README.md` in the project folder for instructions.

In [1]:
# lxml is the default parser used by pandas.read_xml, so import it here to fail early if it is not installed.
# noinspection PyUnresolvedReferences
import lxml
from pathlib import Path
import pandas

In [2]:
# All locations are derived from the current working directory, which is
# taken to sit two levels below the project root.
project_path = Path.cwd().parent.parent
# Raw export that this notebook reads.
iphone_file = project_path / 'data' / 'raw' / 'export.xml'
# Destination of the preprocessed heart data that this notebook writes.
heart_rate_pickle = project_path / 'data' / 'processed' / 'heart_preprocessed.pickle'

In [6]:
# Parse the iPhone export file into a single DataFrame, then show its (rows, columns) shape.
hf = pandas.read_xml(iphone_file)  # ~1.25 million records; took about 2 minutes on a 4-year-old MacBook Pro.
hf.shape

(1247324, 41)

In [7]:
# List every column that read_xml produced from the export.
hf.columns

Index(['value', 'HKCharacteristicTypeIdentifierDateOfBirth',
       'HKCharacteristicTypeIdentifierBiologicalSex',
       'HKCharacteristicTypeIdentifierBloodType',
       'HKCharacteristicTypeIdentifierFitzpatrickSkinType',
       'HKCharacteristicTypeIdentifierCardioFitnessMedicationsUse', 'type',
       'sourceName', 'sourceVersion', 'unit', 'creationDate', 'startDate',
       'endDate', 'device', 'MetadataEntry', 'Record', 'SensitivityPoint',
       'workoutActivityType', 'duration', 'durationUnit', 'totalDistance',
       'totalDistanceUnit', 'totalEnergyBurned', 'totalEnergyBurnedUnit',
       'WorkoutEvent', 'dateComponents', 'activeEnergyBurned',
       'activeEnergyBurnedGoal', 'activeEnergyBurnedUnit', 'appleMoveTime',
       'appleMoveTimeGoal', 'appleExerciseTime', 'appleExerciseTimeGoal',
       'appleStandHours', 'appleStandHoursGoal',
       'HeartRateVariabilityMetadataList', 'identifier', 'sourceURL',
       'fhirVersion', 'receivedDate', 'resourceFilePath'],
      dty

In [8]:
# Show dtype and non-null count per column of the raw frame.
hf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1247324 entries, 0 to 1247323
Data columns (total 41 columns):
 #   Column                                                     Non-Null Count    Dtype  
---  ------                                                     --------------    -----  
 0   value                                                      1244178 non-null  object 
 1   HKCharacteristicTypeIdentifierDateOfBirth                  1 non-null        object 
 2   HKCharacteristicTypeIdentifierBiologicalSex                1 non-null        object 
 3   HKCharacteristicTypeIdentifierBloodType                    1 non-null        object 
 4   HKCharacteristicTypeIdentifierFitzpatrickSkinType          1 non-null        object 
 5   HKCharacteristicTypeIdentifierCardioFitnessMedicationsUse  1 non-null        object 
 6   type                                                       1245993 non-null  object 
 7   sourceName                                                 1246676 non-n

# Refine Dataset

Extract columns and rows with useful information.

#### Refine columns (pass 1)

In [9]:
# Columns kept from the raw export; everything else is dropped in this first pass.
record_columns = [
    'value',
    'type',
    'sourceName',
    'sourceVersion',
    'unit',
    'creationDate',
    'startDate',
    'endDate',
    'device',
]
health_file = hf.loc[:, record_columns]
health_file.shape

(1247324, 9)

In [10]:
# Show dtype and non-null count for the nine retained columns.
health_file.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1247324 entries, 0 to 1247323
Data columns (total 9 columns):
 #   Column         Non-Null Count    Dtype 
---  ------         --------------    ----- 
 0   value          1244178 non-null  object
 1   type           1245993 non-null  object
 2   sourceName     1246676 non-null  object
 3   sourceVersion  1235895 non-null  object
 4   unit           1232546 non-null  object
 5   creationDate   1245994 non-null  object
 6   startDate      1245994 non-null  object
 7   endDate        1245994 non-null  object
 8   device         1197500 non-null  object
dtypes: object(9)
memory usage: 85.6+ MB


#### Refine Rows

In [11]:
# Peek at the first seven rows to see what the leading records look like.
health_file.head(7)

Unnamed: 0,value,type,sourceName,sourceVersion,unit,creationDate,startDate,endDate,device
0,2022-01-01 07:00:25 -0500,,,,,,,,
1,,,,,,,,,
2,5.75,HKQuantityTypeIdentifierHeight,iPhone,13.4,ft,2020-03-30 15:01:19 -0500,2020-03-30 15:01:19 -0500,2020-03-30 15:01:19 -0500,
3,5.83333,HKQuantityTypeIdentifierHeight,Stephen’s iPhone 11,15.0,ft,2021-09-21 12:20:50 -0500,2021-09-21 12:20:50 -0500,2021-09-21 12:20:50 -0500,
4,170,HKQuantityTypeIdentifierBodyMass,iPhone,13.4,lb,2020-03-30 15:01:19 -0500,2020-03-30 15:01:19 -0500,2020-03-30 15:01:19 -0500,
5,67,HKQuantityTypeIdentifierHeartRate,Stephen’s Apple Watch,6.1.3,count/min,2020-03-30 15:13:44 -0500,2020-03-30 15:11:49 -0500,2020-03-30 15:11:49 -0500,"<<HKDevice: 0x280d2fac0>, name:Apple Watch, ma..."
6,67,HKQuantityTypeIdentifierHeartRate,Stephen’s Apple Watch,6.1.3,count/min,2020-03-30 15:18:49 -0500,2020-03-30 15:16:17 -0500,2020-03-30 15:16:17 -0500,"<<HKDevice: 0x280d2fac0>, name:Apple Watch, ma..."


In [12]:
# Peek at the last rows of the frame.
health_file.tail()

Unnamed: 0,value,type,sourceName,sourceVersion,unit,creationDate,startDate,endDate,device
1247319,,Observation,OhioHealth,,,,,,
1247320,,Observation,OhioHealth,,,,,,
1247321,,Observation,OhioHealth,,,,,,
1247322,,Observation,OhioHealth,,,,,,
1247323,,Patient,OhioHealth,,,,,,


In [13]:
# Count rows per 'type' value to see which record types are present and how many of each.
health_file['type'].value_counts()

HKQuantityTypeIdentifierActiveEnergyBurned                583387
HKQuantityTypeIdentifierBasalEnergyBurned                 208037
HKQuantityTypeIdentifierHeartRate                         180408
HKQuantityTypeIdentifierDistanceWalkingRunning             72826
HKQuantityTypeIdentifierStepCount                          63963
HKQuantityTypeIdentifierAppleStandTime                     36398
HKQuantityTypeIdentifierAppleExerciseTime                  29979
HKQuantityTypeIdentifierEnvironmentalAudioExposure         20954
HKQuantityTypeIdentifierFlightsClimbed                     12228
HKCategoryTypeIdentifierAppleStandHour                     10710
HKQuantityTypeIdentifierStairDescentSpeed                   5384
HKQuantityTypeIdentifierWalkingStepLength                   3889
HKQuantityTypeIdentifierWalkingSpeed                        3889
HKQuantityTypeIdentifierHeartRateVariabilitySDNN            3258
HKQuantityTypeIdentifierWalkingDoubleSupportPercentage      2395
HKQuantityTypeIdentifierS

#### Select rows with heart types. Refine columns (pass 2)

In [14]:
# Boolean row masks, one per record type of interest.
heart_rate = health_file['type'] == 'HKQuantityTypeIdentifierHeartRate'
bp_diastolic = health_file['type'] == 'HKQuantityTypeIdentifierBloodPressureDiastolic'
bp_systolic = health_file['type'] == 'HKQuantityTypeIdentifierBloodPressureSystolic'

# Build the result as a single chain in which every step returns a new frame.
# Writing converted values back with `ds.loc[:, col] = ...` is avoided on
# purpose: on a selection of `health_file` it can raise SettingWithCopyWarning,
# and newer pandas versions set the values in place and keep the column's
# original object dtype, which silently discards the conversion.
ds = (
    health_file
    .loc[heart_rate | bp_diastolic | bp_systolic, ['value', 'type', 'startDate']]
    .rename(columns={'startDate': 'date'})
    .assign(
        # Readings arrive as strings; astype raises if any value is not numeric.
        value=lambda frame: frame['value'].astype('float'),
        # Timestamps look like '2020-03-30 15:11:49 -0500'. Parse the offset
        # explicitly, normalise to UTC, then drop the timezone so the stored
        # column is naive UTC with nanosecond resolution.
        date=lambda frame: (
            pandas.to_datetime(frame['date'], format='%Y-%m-%d %H:%M:%S %z', utc=True)
            .dt.tz_convert(None)
            .astype('datetime64[ns]')
        ),
    )
)
ds.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 181400 entries, 5 to 181404
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype         
---  ------  --------------   -----         
 0   value   181400 non-null  float64       
 1   type    181400 non-null  object        
 2   date    181400 non-null  datetime64[ns]
dtypes: datetime64[ns](1), float64(1), object(1)
memory usage: 5.5+ MB


In [15]:
# Earliest timestamp among the selected heart records.
ds['date'].min()

Timestamp('2020-03-30 20:11:49')

In [16]:
# Latest timestamp among the selected heart records.
ds['date'].max()

Timestamp('2022-01-01 11:56:36')

In [17]:
# Make sure the output folder exists before writing; to_pickle does not create it.
# `parents` is left at its default so only the final folder is ever created.
heart_rate_pickle.parent.mkdir(exist_ok=True)
# Persist the preprocessed heart records for later use.
ds.to_pickle(heart_rate_pickle)