Copyright ©2022. Stephen Rigden. This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see https://www.gnu.org/licenses/.

In [34]:
import pandas
from pathlib import Path

from heart_health import utilities

In [35]:
# Pickle locations, all under <project_path>/data/processed.
# project_path is two directory levels above the current working directory,
# so these paths depend on where the notebook kernel was started.
project_path = Path.cwd().parent.parent
extra_data_pickle = project_path.joinpath('data', 'processed', 'extra_data_preprocessed.pickle')
heart_data_pickle = project_path.joinpath('data', 'processed', 'heart_preprocessed.pickle')
heart_and_externals_pickle = project_path.joinpath('data', 'processed', 'heart_and_externals.pickle')

### Load and view the data

In [36]:
# Load the two preprocessed inputs: heart measurements (hds) and the extra
# daily data (eds).
# NOTE(review): unpickling can execute arbitrary code, so these files should
# only ever be ones written by this project's own preprocessing step.
hds, eds = (
    pandas.read_pickle(pickle_path)
    for pickle_path in (heart_data_pickle, extra_data_pickle)
)

In [37]:
# Summarise the heart dataset: row count, column dtypes and non-null counts.
hds.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21504 entries, 0 to 21503
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   value   21504 non-null  float64       
 1   type    21504 non-null  object        
 2   date    21504 non-null  datetime64[ns]
dtypes: datetime64[ns](1), float64(1), object(1)
memory usage: 672.0+ KB


In [38]:
# Preview the first rows. The frame is in long format: one row per measurement,
# with the quantity identifier in `type` and the reading in `value`.
hds.head()

Unnamed: 0,value,type,date
0,122.0,HKQuantityTypeIdentifierBloodPressureSystolic,1875-06-01 00:00:00
1,87.0,HKQuantityTypeIdentifierBloodPressureDiastolic,1875-06-01 00:00:00
2,155.0,HKQuantityTypeIdentifierBloodPressureSystolic,1875-06-01 06:00:00
3,91.0,HKQuantityTypeIdentifierBloodPressureDiastolic,1875-06-01 06:00:00
4,125.0,HKQuantityTypeIdentifierBloodPressureSystolic,1875-06-01 12:00:00


In [39]:
# Preview the last rows to see where the series ends and which measurement
# types appear there.
hds.tail()

Unnamed: 0,value,type,date
21499,81.0,HKQuantityTypeIdentifierHeartRate,1875-11-15 23:00:00
21500,66.0,HKQuantityTypeIdentifierHeartRate,1875-11-15 23:12:00
21501,65.0,HKQuantityTypeIdentifierHeartRate,1875-11-15 23:24:00
21502,58.0,HKQuantityTypeIdentifierHeartRate,1875-11-15 23:36:00
21503,91.0,HKQuantityTypeIdentifierHeartRate,1875-11-15 23:48:00


In [40]:
# Summarise the extra data: row count, column dtypes and non-null counts.
eds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84 entries, 0 to 83
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   date           84 non-null     datetime64[ns]
 1   heart_tablets  84 non-null     Int64         
 2   alcohol        84 non-null     Int64         
 3   notes          84 non-null     object        
dtypes: Int64(2), datetime64[ns](1), object(1)
memory usage: 2.9+ KB


In [41]:
# Preview the first rows of the extra data (the displayed rows have one row per date).
eds.head()

Unnamed: 0,date,heart_tablets,alcohol,notes
0,1875-06-01,2,0,A note
1,1875-06-02,2,177,
2,1875-06-03,2,0,
3,1875-06-04,2,0,
4,1875-06-05,2,30,


### Extract blood pressure data and merge with extra data file

In [42]:
# Build the blood pressure dataset from the long-format heart data using the
# project helper.
# NOTE(review): the helper's source is not shown here; judging by the summary
# it appears to return one row per reading time with systolic, diastolic and
# pulse pressure columns -- confirm against heart_health.utilities.
bpds = utilities.create_blood_pressure_dataset(hds)
bpds.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 672 entries, 0 to 671
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   date            672 non-null    datetime64[ns]
 1   systolic        672 non-null    float64       
 2   diastolic       672 non-null    float64       
 3   pulse pressure  672 non-null    float64       
dtypes: datetime64[ns](1), float64(3)
memory usage: 26.2 KB


In [43]:
bpds.head()

Unnamed: 0,date,systolic,diastolic,pulse pressure
0,1875-06-01 00:00:00,122.0,87.0,35.0
1,1875-06-01 06:00:00,155.0,91.0,64.0
2,1875-06-01 12:00:00,125.0,88.0,37.0
3,1875-06-01 18:00:00,127.0,81.0,46.0
4,1875-06-02 00:00:00,109.0,78.0,31.0


In [44]:
# Attach each day's extra data to every blood pressure reading taken that day.
#
# dt.normalize() truncates each timestamp to midnight while staying in
# datetime64, which avoids a round trip through Python `date` objects.
day = bpds['date'].dt.normalize()

# Both frames have a 'date' column, so the result carries 'date_x' (from bpds),
# 'date_y' (from eds) and a 'date' join-key column.
#   how='inner'            - the default, spelled out: readings on days with no
#                            extra data are dropped.
#   validate='many_to_one' - raise MergeError if eds has more than one row for a
#                            day, rather than silently duplicating readings.
bpds = bpds.merge(
    eds,
    left_on=[day],
    right_on=['date'],
    how='inner',
    validate='many_to_one',
)
bpds.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 336 entries, 0 to 335
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   date            336 non-null    datetime64[ns]
 1   date_x          336 non-null    datetime64[ns]
 2   systolic        336 non-null    float64       
 3   diastolic       336 non-null    float64       
 4   pulse pressure  336 non-null    float64       
 5   date_y          336 non-null    datetime64[ns]
 6   heart_tablets   336 non-null    Int64         
 7   alcohol         336 non-null    Int64         
 8   notes           336 non-null    object        
dtypes: Int64(2), datetime64[ns](3), float64(3), object(1)
memory usage: 26.9+ KB


In [45]:
# Keep the reading timestamp, the three blood pressure columns and the two
# numeric extras; every other merged column is dropped. 'date_x' is then
# renamed back to 'date'.
bpds = (
    bpds[['date_x', 'systolic', 'diastolic', 'pulse pressure', 'heart_tablets', 'alcohol']]
    .rename(columns={'date_x': 'date'})
)
bpds.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 336 entries, 0 to 335
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   date            336 non-null    datetime64[ns]
 1   systolic        336 non-null    float64       
 2   diastolic       336 non-null    float64       
 3   pulse pressure  336 non-null    float64       
 4   heart_tablets   336 non-null    Int64         
 5   alcohol         336 non-null    Int64         
dtypes: Int64(2), datetime64[ns](1), float64(3)
memory usage: 19.0 KB


In [46]:
# Preview the first rows of the merged frame to check that the daily extras
# repeat across that day's readings.
bpds.head()

Unnamed: 0,date,systolic,diastolic,pulse pressure,heart_tablets,alcohol
0,1875-06-01 00:00:00,122.0,87.0,35.0,2,0
1,1875-06-01 06:00:00,155.0,91.0,64.0,2,0
2,1875-06-01 12:00:00,125.0,88.0,37.0,2,0
3,1875-06-01 18:00:00,127.0,81.0,46.0,2,0
4,1875-06-02 00:00:00,109.0,78.0,31.0,2,177


In [47]:
# Preview the last rows to see the final date retained after the merge.
bpds.tail()

Unnamed: 0,date,systolic,diastolic,pulse pressure,heart_tablets,alcohol
331,1875-08-22 18:00:00,129.0,87.0,42.0,2,0
332,1875-08-23 00:00:00,113.0,78.0,35.0,2,0
333,1875-08-23 06:00:00,128.0,82.0,46.0,2,0
334,1875-08-23 12:00:00,102.0,78.0,24.0,2,0
335,1875-08-23 18:00:00,125.0,89.0,36.0,2,0


In [48]:
# Save the merged blood pressure + extras frame to heart_and_externals_pickle.
bpds.to_pickle(heart_and_externals_pickle)