Copyright ©2021-2022. Stephen Rigden.
This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
You should have received a copy of the GNU General Public License along with this program.  If not, see <http://www.gnu.org/licenses/>.

### Health File Preparation

These instructions are correct as of 10/27/2021 with iOS 15.0.2.

#### Phone Export

See the README file for instructions on exporting the iPhone health file.

### Prepare for import

On your desktop, unzip the attached archive.
Rename the zip file to export_YYYY_MM_DD. Rename the extracted export.xml file to export_YYYY_MM_DD.xml and move it to the project's data/raw directory. Discard the unzipped folder and its remaining contents.

In this notebook change the import_file_date below to YYYY_MM_DD.

In [1]:
# Change import_file_date to match the date in the name of the current import file.
# The value is interpolated into the raw file name export_<import_file_date>.xml,
# so it must match the renamed export exactly (format YYYY_MM_DD).
import_file_date = '2021_12_01'

In [2]:
# lxml is never referenced directly; it is imported because pandas.read_xml needs it (its default parser).
# noinspection PyUnresolvedReferences
import lxml
from pathlib import Path
import pandas

In [3]:
# All locations hang off the project root, which is taken to be the parent of
# the current working directory.
project_path = Path.cwd().parent
# Raw iPhone export selected by import_file_date.
iphone_file = project_path.joinpath('data', 'raw', f"export_{import_file_date}.xml")
# Destination for the refined heart-rate / blood-pressure rows.
heart_rate_pickle = project_path.joinpath('data', 'processed', 'heart_preprocessed.pickle')

In [4]:
# Parse the whole export file into one DataFrame. In the run recorded below
# this produced 1,201,967 rows by 41 columns.
hf = pandas.read_xml(iphone_file)
# Display (rows, columns) as a quick sanity check on the load.
hf.shape

(1201967, 41)

In [5]:
# List every column produced by the parse, as input to choosing which to keep.
hf.columns

Index(['value', 'HKCharacteristicTypeIdentifierDateOfBirth',
       'HKCharacteristicTypeIdentifierBiologicalSex',
       'HKCharacteristicTypeIdentifierBloodType',
       'HKCharacteristicTypeIdentifierFitzpatrickSkinType',
       'HKCharacteristicTypeIdentifierCardioFitnessMedicationsUse', 'type',
       'sourceName', 'sourceVersion', 'unit', 'creationDate', 'startDate',
       'endDate', 'device', 'MetadataEntry', 'Record', 'SensitivityPoint',
       'workoutActivityType', 'duration', 'durationUnit', 'totalDistance',
       'totalDistanceUnit', 'totalEnergyBurned', 'totalEnergyBurnedUnit',
       'WorkoutEvent', 'dateComponents', 'activeEnergyBurned',
       'activeEnergyBurnedGoal', 'activeEnergyBurnedUnit', 'appleMoveTime',
       'appleMoveTimeGoal', 'appleExerciseTime', 'appleExerciseTimeGoal',
       'appleStandHours', 'appleStandHoursGoal',
       'HeartRateVariabilityMetadataList', 'identifier', 'sourceURL',
       'fhirVersion', 'receivedDate', 'resourceFilePath'],
      dty

In [6]:
# Per-column non-null counts and dtypes for the full, unrefined frame.
hf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1201967 entries, 0 to 1201966
Data columns (total 41 columns):
 #   Column                                                     Non-Null Count    Dtype  
---  ------                                                     --------------    -----  
 0   value                                                      1199095 non-null  object 
 1   HKCharacteristicTypeIdentifierDateOfBirth                  1 non-null        object 
 2   HKCharacteristicTypeIdentifierBiologicalSex                1 non-null        object 
 3   HKCharacteristicTypeIdentifierBloodType                    1 non-null        object 
 4   HKCharacteristicTypeIdentifierFitzpatrickSkinType          1 non-null        object 
 5   HKCharacteristicTypeIdentifierCardioFitnessMedicationsUse  1 non-null        object 
 6   type                                                       1200695 non-null  object 
 7   sourceName                                                 1201350 non-n

# Refine Dataset

Extract columns and rows with useful information.

#### Refine columns (pass 1)

In [7]:
# First pass over the columns: retain only these nine and drop the rest.
first_pass_columns = [
    'value',
    'type',
    'sourceName',
    'sourceVersion',
    'unit',
    'creationDate',
    'startDate',
    'endDate',
    'device',
]
health_file = hf.loc[:, first_pass_columns]
# Row count is unchanged; only the column count should have dropped.
health_file.shape

(1201967, 9)

In [8]:
# Non-null counts and dtypes for the nine retained columns.
health_file.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1201967 entries, 0 to 1201966
Data columns (total 9 columns):
 #   Column         Non-Null Count    Dtype 
---  ------         --------------    ----- 
 0   value          1199095 non-null  object
 1   type           1200695 non-null  object
 2   sourceName     1201350 non-null  object
 3   sourceVersion  1190641 non-null  object
 4   unit           1188020 non-null  object
 5   creationDate   1200740 non-null  object
 6   startDate      1200740 non-null  object
 7   endDate        1200740 non-null  object
 8   device         1155167 non-null  object
dtypes: object(9)
memory usage: 82.5+ MB


#### Refine Rows

In [9]:
# Inspect the first seven rows; in the recorded output rows 0 and 1 have no
# record type, and measurement rows start at row 2.
health_file.head(7)

Unnamed: 0,value,type,sourceName,sourceVersion,unit,creationDate,startDate,endDate,device
0,2021-12-01 13:27:48 -0500,,,,,,,,
1,,,,,,,,,
2,5.75,HKQuantityTypeIdentifierHeight,iPhone,13.4,ft,2020-03-30 15:01:19 -0500,2020-03-30 15:01:19 -0500,2020-03-30 15:01:19 -0500,
3,5.83333,HKQuantityTypeIdentifierHeight,Stephen’s iPhone 11,15.0,ft,2021-09-21 12:20:50 -0500,2021-09-21 12:20:50 -0500,2021-09-21 12:20:50 -0500,
4,170,HKQuantityTypeIdentifierBodyMass,iPhone,13.4,lb,2020-03-30 15:01:19 -0500,2020-03-30 15:01:19 -0500,2020-03-30 15:01:19 -0500,
5,67,HKQuantityTypeIdentifierHeartRate,Stephen’s Apple Watch,6.1.3,count/min,2020-03-30 15:13:44 -0500,2020-03-30 15:11:49 -0500,2020-03-30 15:11:49 -0500,"<<HKDevice: 0x282471630>, name:Apple Watch, ma..."
6,67,HKQuantityTypeIdentifierHeartRate,Stephen’s Apple Watch,6.1.3,count/min,2020-03-30 15:18:49 -0500,2020-03-30 15:16:17 -0500,2020-03-30 15:16:17 -0500,"<<HKDevice: 0x282471630>, name:Apple Watch, ma..."


In [10]:
# Inspect the last rows; in the recorded output these are DiagnosticReport and
# Patient rows with no value, unit or dates.
health_file.tail()

Unnamed: 0,value,type,sourceName,sourceVersion,unit,creationDate,startDate,endDate,device
1201962,,DiagnosticReport,OhioHealth,,,,,,
1201963,,DiagnosticReport,OhioHealth,,,,,,
1201964,,DiagnosticReport,OhioHealth,,,,,,
1201965,,DiagnosticReport,OhioHealth,,,,,,
1201966,,Patient,OhioHealth,,,,,,


In [11]:
# Count rows per record type; this is where the identifiers used to select the
# heart-related rows can be read off.
health_file['type'].value_counts()

HKQuantityTypeIdentifierActiveEnergyBurned                564993
HKQuantityTypeIdentifierBasalEnergyBurned                 198655
HKQuantityTypeIdentifierHeartRate                         176143
HKQuantityTypeIdentifierDistanceWalkingRunning             69982
HKQuantityTypeIdentifierStepCount                          61378
HKQuantityTypeIdentifierAppleStandTime                     34610
HKQuantityTypeIdentifierAppleExerciseTime                  28358
HKQuantityTypeIdentifierEnvironmentalAudioExposure         19934
HKQuantityTypeIdentifierFlightsClimbed                     11526
HKCategoryTypeIdentifierAppleStandHour                     10195
HKQuantityTypeIdentifierStairDescentSpeed                   4919
HKQuantityTypeIdentifierWalkingSpeed                        3569
HKQuantityTypeIdentifierWalkingStepLength                   3569
HKQuantityTypeIdentifierHeartRateVariabilitySDNN            3121
HKQuantityTypeIdentifierWalkingDoubleSupportPercentage      2237
HKQuantityTypeIdentifierS

#### Select rows with heart types. Refine columns (pass 2)

In [12]:
# Boolean row masks, one per heart-related record type.
heart_rate = health_file['type'] == 'HKQuantityTypeIdentifierHeartRate'
bp_diastolic = health_file['type'] == 'HKQuantityTypeIdentifierBloodPressureDiastolic'
bp_systolic = health_file['type'] == 'HKQuantityTypeIdentifierBloodPressureSystolic'

# Second pass: keep the heart rows and three columns, then fix the dtypes.
# The conversions go through .assign() rather than `ds.loc[:, col] = ...`:
# from pandas 2.0 that form writes into the existing object column, so the
# dtype would silently stay `object`; it also assigns into a slice of
# health_file, which invites SettingWithCopyWarning.
ds = (
    health_file
    .loc[heart_rate | bp_diastolic | bp_systolic, ['value', 'type', 'startDate']]
    .rename(columns={'startDate': 'date'})
    .assign(
        value=lambda frame: frame['value'].astype('float'),
        # The export writes timestamps like '2020-03-30 15:11:49 -0500'.
        # Parse the offset explicitly, normalise to UTC, then drop the timezone
        # so the column is naive datetime64[ns] holding UTC wall-clock times.
        date=lambda frame: pandas.to_datetime(
            frame['date'], format='%Y-%m-%d %H:%M:%S %z', utc=True
        ).dt.tz_convert(None),
    )
)
ds.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 176911 entries, 5 to 176915
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype         
---  ------  --------------   -----         
 0   value   176911 non-null  float64       
 1   type    176911 non-null  object        
 2   date    176911 non-null  datetime64[ns]
dtypes: datetime64[ns](1), float64(1), object(1)
memory usage: 5.4+ MB


In [13]:
# Earliest timestamp among the selected heart rows.
ds['date'].min()

Timestamp('2020-03-30 20:11:49')

In [14]:
# Latest timestamp among the selected heart rows.
ds['date'].max()

Timestamp('2021-12-01 18:19:16')

In [15]:
# Create the output directory if it is missing, so the write does not fail
# on a fresh checkout where data/processed has not been created yet.
heart_rate_pickle.parent.mkdir(parents=True, exist_ok=True)
# Persist the refined heart-rate / blood-pressure rows.
ds.to_pickle(heart_rate_pickle)