Copyright ©2022. Stephen Rigden.
This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
You should have received a copy of the GNU General Public License along with this program.  If not, see <http://www.gnu.org/licenses/>.

## Export iPhone Health Data
See `README.md` in the project folder for instructions.

## Load and Refine the XML Data File

In [1]:
# lxml is needed by pandas.read_xml so…
# noinspection PyUnresolvedReferences
import lxml
from pathlib import Path
import pandas

In [2]:
# Paths are resolved relative to the project root, taken to be two levels
# above the directory the notebook is run from.
project_path = Path.cwd().parent.parent
# Raw export read by this notebook.
iphone_file = project_path / 'data' / 'raw' / 'export.xml'
# Destination for the refined dataset written by the final cell.
heart_rate_pickle = project_path / 'data' / 'processed' / 'heart_preprocessed.pickle'

In [3]:
# Parse the exported XML file into a DataFrame.
# Author's timing note: 1.25m records took 2 min on a 4 y.o. MacBook Pro.
hf = pandas.read_xml(iphone_file)
# Show (rows, columns) as a quick check that the load worked.
hf.shape

(21504, 9)

In [4]:
# Column labels that read_xml produced from the export.
hf.columns

Index(['startDate', 'type', 'value', 'creationDate', 'endDate', 'sourceName',
       'sourceVersion', 'unit', 'device'],
      dtype='object')

In [5]:
# Dtype and non-null count per column, to spot sparsely populated fields.
hf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21504 entries, 0 to 21503
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   startDate      21504 non-null  object
 1   type           21504 non-null  object
 2   value          21504 non-null  int64 
 3   creationDate   1344 non-null   object
 4   endDate        1344 non-null   object
 5   sourceName     21504 non-null  object
 6   sourceVersion  21504 non-null  object
 7   unit           21504 non-null  object
 8   device         21504 non-null  object
dtypes: int64(1), object(8)
memory usage: 1.5+ MB


## Refine Dataset

Extract columns and rows with useful information.

#### Refine columns (pass 1)

In [6]:
# Put the columns in a fixed order, starting with 'value' and 'type'.
column_order = [
    'value', 'type', 'sourceName', 'sourceVersion', 'unit',
    'creationDate', 'startDate', 'endDate', 'device',
]
health_file = hf.loc[:, column_order]
health_file.shape

(21504, 9)

In [7]:
# Dtype and non-null count per column after the column selection.
health_file.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21504 entries, 0 to 21503
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   value          21504 non-null  int64 
 1   type           21504 non-null  object
 2   sourceName     21504 non-null  object
 3   sourceVersion  21504 non-null  object
 4   unit           21504 non-null  object
 5   creationDate   1344 non-null   object
 6   startDate      21504 non-null  object
 7   endDate        1344 non-null   object
 8   device         21504 non-null  object
dtypes: int64(1), object(8)
memory usage: 1.5+ MB


#### Refine Rows

In [8]:
# Inspect the first seven rows of the frame.
health_file.head(7)

Unnamed: 0,value,type,sourceName,sourceVersion,unit,creationDate,startDate,endDate,device
0,122,HKQuantityTypeIdentifierBloodPressureSystolic,Mock Data Generator,1e-googolplex,mm Hg,1875-06-01 00:00:00,1875-06-01 00:00:00,1875-06-01 00:00:00,Holy Grail
1,87,HKQuantityTypeIdentifierBloodPressureDiastolic,Mock Data Generator,1e-googolplex,mm Hg,1875-06-01 00:00:00,1875-06-01 00:00:00,1875-06-01 00:00:00,Holy Grail
2,155,HKQuantityTypeIdentifierBloodPressureSystolic,Mock Data Generator,1e-googolplex,mm Hg,1875-06-01 06:00:00,1875-06-01 06:00:00,1875-06-01 06:00:00,Holy Grail
3,91,HKQuantityTypeIdentifierBloodPressureDiastolic,Mock Data Generator,1e-googolplex,mm Hg,1875-06-01 06:00:00,1875-06-01 06:00:00,1875-06-01 06:00:00,Holy Grail
4,125,HKQuantityTypeIdentifierBloodPressureSystolic,Mock Data Generator,1e-googolplex,mm Hg,1875-06-01 12:00:00,1875-06-01 12:00:00,1875-06-01 12:00:00,Holy Grail
5,88,HKQuantityTypeIdentifierBloodPressureDiastolic,Mock Data Generator,1e-googolplex,mm Hg,1875-06-01 12:00:00,1875-06-01 12:00:00,1875-06-01 12:00:00,Holy Grail
6,127,HKQuantityTypeIdentifierBloodPressureSystolic,Mock Data Generator,1e-googolplex,mm Hg,1875-06-01 18:00:00,1875-06-01 18:00:00,1875-06-01 18:00:00,Holy Grail


In [9]:
# Inspect the last rows of the frame.
health_file.tail()

Unnamed: 0,value,type,sourceName,sourceVersion,unit,creationDate,startDate,endDate,device
21499,81,HKQuantityTypeIdentifierHeartRate,Mock Data Generator,1e-googolplex,mm Hg,,1875-11-15 23:00:00,,Holy Grail
21500,66,HKQuantityTypeIdentifierHeartRate,Mock Data Generator,1e-googolplex,mm Hg,,1875-11-15 23:12:00,,Holy Grail
21501,65,HKQuantityTypeIdentifierHeartRate,Mock Data Generator,1e-googolplex,mm Hg,,1875-11-15 23:24:00,,Holy Grail
21502,58,HKQuantityTypeIdentifierHeartRate,Mock Data Generator,1e-googolplex,mm Hg,,1875-11-15 23:36:00,,Holy Grail
21503,91,HKQuantityTypeIdentifierHeartRate,Mock Data Generator,1e-googolplex,mm Hg,,1875-11-15 23:48:00,,Holy Grail


In [10]:
# Count the records of each 'type' to see which measurements are present.
health_file['type'].value_counts()

HKQuantityTypeIdentifierHeartRate                 20160
HKQuantityTypeIdentifierBloodPressureSystolic       672
HKQuantityTypeIdentifierBloodPressureDiastolic      672
Name: type, dtype: int64

#### Select rows with heart types. Refine columns (pass 2)

In [11]:
# Boolean masks, one per record type kept in the refined dataset.
heart_rate = health_file['type'] == 'HKQuantityTypeIdentifierHeartRate'
bp_diastolic = health_file['type'] == 'HKQuantityTypeIdentifierBloodPressureDiastolic'
bp_systolic = health_file['type'] == 'HKQuantityTypeIdentifierBloodPressureSystolic'

# Build the refined frame in one chain so every step returns a new object
# and nothing is assigned into a slice of health_file.
ds = (
    health_file
    .loc[heart_rate | bp_diastolic | bp_systolic, ['value', 'type', 'startDate']]
    .rename(columns={'startDate': 'date'})
    # Convert dtypes with astype rather than `ds.loc[:, col] = ...`:
    # since pandas 2.0 that assignment writes into the existing column and
    # keeps its dtype, so 'value' would stay int64 and 'date' would stay
    # object.
    .astype({'value': 'float', 'date': 'datetime64[ns]'})
)
ds.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21504 entries, 0 to 21503
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   value   21504 non-null  float64       
 1   type    21504 non-null  object        
 2   date    21504 non-null  datetime64[ns]
dtypes: datetime64[ns](1), float64(1), object(1)
memory usage: 672.0+ KB


In [12]:
# Earliest timestamp in the refined dataset.
ds.date.min()

Timestamp('1875-06-01 00:00:00')

In [13]:
# Latest timestamp in the refined dataset.
ds.date.max()

Timestamp('1875-11-15 23:48:00')

In [14]:
# Make sure the output folder exists first: to_pickle does not create
# missing parent directories and would otherwise raise.
heart_rate_pickle.parent.mkdir(parents=True, exist_ok=True)
# Persist the refined dataset for later use.
ds.to_pickle(heart_rate_pickle)