Development of the data pipeline for the raw measurements from Project Breathe

In [1]:
import pandas as pd
import src.data.helpers as dh
import src.data.breathe_data as breathe_data

In [2]:
path = (
    dh.get_path_to_main()
    + "DataFiles/BR/MeasurementData/Breathe_Spirometer_20231113.csv"
)
df = pd.read_csv(path)

In [171]:
print(df.columns)
print(df.shape)

Index(['UserId', 'EntityId', 'Timestamp', 'ClientTimestamp', 'IsDeleted',
       'CaptureType', 'CaptureDeviceMake', 'CaptureDeviceModel',
       'ThirdPartyId', 'PEF', 'PEFTime', 'FEV6', 'FEV1', 'FEV1PersonalBest',
       'FEV075', 'FEV1DivFEV6', 'FEV1DivFVC', 'FEF2575', 'FEV1Percent', 'EVol',
       'FVC', 'InvalidEntry', 'VolumeTimeCurve', 'FlowVolumeCurve',
       'IsCapturePrimary', 'CaptureSessionId'],
      dtype='object')
(104324, 26)


In [172]:
df.IsDeleted.unique()

array([False,  True])

In [173]:
tmp_size = df.shape[0]
df = df[~df.IsDeleted]
print(f"Removed {tmp_size - df.shape[0]} entries manually deleted on the app")

Removed 2565 entries manually deleted on the app


In [174]:
df.describe()

Unnamed: 0,EntityId,PEF,PEFTime,FEV6,FEV1,FEV1PersonalBest,FEV075,FEV1DivFEV6,FEV1DivFVC,FEF2575,FEV1Percent,EVol,FVC
count,101759.0,87122.0,4549.0,100680.0,101759.0,96319.0,96319.0,96319.0,4163.0,98277.0,101176.0,4549.0,7634.0
mean,1023662.0,279.11671,0.0,3.029517,2.130394,3.262218,1.896338,0.681364,30.227024,1.631205,60.820797,0.0,1.11686
std,39042.65,199.826929,0.0,1.068609,0.880368,0.979416,0.857896,0.176934,33.537712,1.169019,25.287741,0.0,1.790167
min,1000001.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1002048.0,147.0,0.0,2.34,1.56,2.9,1.42,0.62,0.0,0.9,45.0,0.0,0.0
50%,1008223.0,264.0,0.0,2.77,1.91,3.5,1.71,0.69,0.0,1.19,59.0,0.0,0.0
75%,1029546.0,435.0,0.0,3.65,2.62,3.9,2.35,0.8,61.700001,2.13,77.0,0.0,1.98
max,1336051.0,966.0,0.0,9.83,6.03,5.3,5.78,1.0,100.0,8.68,700.0,0.0,7.13


In [175]:
def count_na(series):
    return series.isna().sum()


df.agg(count_na)

UserId                     0
EntityId                   0
Timestamp                  0
ClientTimestamp            0
IsDeleted                  0
CaptureType                0
CaptureDeviceMake       5946
CaptureDeviceModel      5977
ThirdPartyId          101151
PEF                    14637
PEFTime                97210
FEV6                    1079
FEV1                       0
FEV1PersonalBest        5440
FEV075                  5440
FEV1DivFEV6             5440
FEV1DivFVC             97596
FEF2575                 3482
FEV1Percent              583
EVol                   97210
FVC                    94125
InvalidEntry               0
VolumeTimeCurve        99802
FlowVolumeCurve        99801
IsCapturePrimary           0
CaptureSessionId           0
dtype: int64

In [176]:
df["Datetime recorded"] = pd.to_datetime(df.Timestamp, utc=False)

In [177]:
cols_to_keep = [
    "UserId",
    "Datetime recorded",
    "FEV1",
    "FEF2575",
    "PEF",
    "InvalidEntry",
]
df1 = df[cols_to_keep]

In [178]:
df1.head()

Unnamed: 0,UserId,Datetime recorded,FEV1,FEF2575,PEF,InvalidEntry
0,60d96f45-ec56-470b-89ae-0085c9f073a7,2022-08-04 17:15:31.378609200,2.73,2.53,326.0,False
1,60d96f45-ec56-470b-89ae-0085c9f073a7,2022-08-04 17:15:31.378609200,2.68,2.49,353.0,False
2,60d96f45-ec56-470b-89ae-0085c9f073a7,2022-08-04 17:15:31.378609200,2.7,2.34,351.0,False
3,60d96f45-ec56-470b-89ae-0085c9f073a7,2022-08-04 17:15:31.379609100,2.88,,,False
4,60d96f45-ec56-470b-89ae-0085c9f073a7,2022-08-02 15:25:26.475628300,2.57,2.52,333.0,False


In [179]:
def get_BR_ID_to_partition_key_mapping():
    df = pd.read_excel(
        dh.get_path_to_main() + "DataFiles/BR/PredModInputData.xlsx",
        sheet_name="brPatient",
        usecols="A, AD",
    )
    # Set ID as string
    df.ID = df.ID.astype(str)
    return df
df_key_map = get_BR_ID_to_partition_key_mapping()

In [180]:
# Is mapping complete?
keys_intersect = set(df1.UserId).intersection(set(df_key_map.PartitionKey))
print(f"{len(keys_intersect)} intersecting keys")
keys_missing_in_map = set(df1.UserId) - set(df_key_map.PartitionKey)
print(f"{len(keys_missing_in_map)} keys from physdata are missing in mapping table")
keys_missing_in_physdata = set(df_key_map.PartitionKey) - set(df1.UserId)
print(f"{len(keys_missing_in_physdata)} keys from mapping table are missing in phys data")

239 intersecting keys
215 keys from physdata are missing in mapping table
18 keys from mapping table are missing in phys data


In [181]:
df2 = df1.merge(df_key_map, left_on='UserId', right_on='PartitionKey', how='inner')
df2.ID.nunique()

240

In [182]:
# Why 1 off error??

In [183]:
df3 = df2.drop(columns=['UserId', 'PartitionKey'])

In [184]:
df3

Unnamed: 0,Datetime recorded,FEV1,FEF2575,PEF,InvalidEntry,ID
0,2022-08-04 17:15:02.545056800,2.44,1.18,0.0,False,183
1,2022-08-04 17:15:02.545056800,2.52,1.35,0.0,False,183
2,2022-08-04 17:15:02.546055300,2.52,1.35,0.0,False,183
3,2022-08-04 17:15:02.546055300,2.84,0.00,0.0,False,183
4,2022-08-04 17:15:02.546055300,2.82,1.53,0.0,False,183
...,...,...,...,...,...,...
76714,2022-08-10 05:30:16.027219400,3.41,3.04,483.0,True,273
76715,2022-08-10 05:30:16.027219400,3.27,2.97,492.0,False,273
76716,2022-08-10 05:31:57.706153700,3.29,3.15,471.0,True,273
76717,2022-08-10 05:32:50.942244600,3.31,3.09,472.0,False,273


In [185]:
# Understand invalid entry

df3.ID.value_counts()
df3["Date recorded"] = df3["Datetime recorded"].dt.date
df_for_ID = df3[df3.ID == '101'].reset_index()
df_for_ID.InvalidEntry.value_counts()

InvalidEntry
True     12275
False     3900
Name: count, dtype: int64

In [186]:
# Has invalid entry
df_for_ID[df_for_ID["Date recorded"] == df_for_ID["Date recorded"][0]]

Unnamed: 0,index,Datetime recorded,FEV1,FEF2575,PEF,InvalidEntry,ID,Date recorded
0,49269,2022-08-04 17:16:15.691332000,1.31,0.54,0.0,False,101,2022-08-04
1,49270,2022-08-04 17:16:15.692332100,1.31,0.57,0.0,False,101,2022-08-04
2,49271,2022-08-04 17:16:15.692332100,1.31,0.67,0.0,False,101,2022-08-04
3,49272,2022-08-04 17:16:15.692332100,1.30,0.69,0.0,False,101,2022-08-04
4,49273,2022-08-04 17:16:15.692332100,1.28,0.60,0.0,False,101,2022-08-04
...,...,...,...,...,...,...,...,...
5720,54989,2022-08-04 08:32:02.173501500,1.64,0.88,176.0,True,101,2022-08-04
5721,54990,2022-08-04 08:32:02.173501500,1.54,0.91,168.0,False,101,2022-08-04
5722,54991,2022-08-04 08:32:02.173501500,1.67,0.97,172.0,True,101,2022-08-04
5723,54992,2022-08-04 08:32:02.173501500,1.63,0.94,170.0,True,101,2022-08-04


# Data post processing

In [2]:
df = breathe_data.build_O2_FEV1_FEF2575_df()


*** Building O2Sat, FEV1, FEF2575 dataframe ***

*** Loading patients data ***
The 4 NaN values belong to IDs ('322', '338', '344', '348') whose height are missing.
However, we don't correct for them as we don't have any measurement corresponding to those IDs for now.
Loaded 258 individuals

*** Loading measurements data ***
Dropping 1 entries with FEV1 = 6.0 for ID 330
* Checking for same day measurements *
* Checking for same day measurements *
* Checking for same day measurements *
Number of IDs:  243
Number of rows:  36341
Number of FEV1 recordings: 7773
Number of FEF2575 recordings: 7127
Number of O2 Saturation recordings: 35105
Dropped 30341 entries with at least one NaN in subset ['O2 Saturation', 'FEV1', 'FEF2575']
36341/36341 entries remain
This includes dropping 1236 entries with NaN O2 Saturation
This includes dropping 28568 entries with NaN FEV1
This includes dropping 29214 entries with NaN FEF2575
Built data structure with 104 IDs and 6000 entries


In [4]:
df.head()

Unnamed: 0,ID,Date Recorded,FEV1,O2 Saturation,FEF2575,ecFEV1,StudyNumber,Age,Sex,Height,Predicted FEV1,Healthy O2 Saturation,ecFEV1 % Predicted,FEV1 % Predicted,O2 Saturation % Healthy
0,101,2022-08-03,1.71,98.0,1.03,1.78,projectb861@gmail.com,53,Male,173.0,3.610061,97.150104,49.306644,47.367618,100.874827
1,101,2022-08-04,1.78,98.0,1.32,1.78,projectb861@gmail.com,53,Male,173.0,3.610061,97.150104,49.306644,49.306644,100.874827
2,101,2022-08-05,1.7,98.0,1.0,1.78,projectb861@gmail.com,53,Male,173.0,3.610061,97.150104,49.306644,47.090615,100.874827
3,101,2022-08-06,1.71,98.0,1.03,1.71,projectb861@gmail.com,53,Male,173.0,3.610061,97.150104,47.367618,47.367618,100.874827
4,101,2022-08-07,1.65,98.0,1.0,1.75,projectb861@gmail.com,53,Male,173.0,3.610061,97.150104,48.475633,45.705597,100.874827


In [3]:
df_old = breathe_data.build_O2_FEV1_FEF2575_df(meas_file=1)


*** Building O2Sat, FEV1, FEF2575 dataframe ***

*** Loading patients data ***
The 4 NaN values belong to IDs ('322', '338', '344', '348') whose height are missing.
However, we don't correct for them as we don't have any measurement corresponding to those IDs for now.
Loaded 258 individuals

*** Loading measurements data ***
Dropping 1 entries with FEV1 = 6.0 for ID 330
* Checking for same day measurements *
* Checking for same day measurements *
* Checking for same day measurements *
Number of IDs:  233
Number of rows:  26812
Number of FEV1 recordings: 23778
Number of FEF2575 recordings: 20564
Number of O2 Saturation recordings: 23431
Dropped 9156 entries with at least one NaN in subset ['O2 Saturation', 'FEV1', 'FEF2575']
26812/26812 entries remain
This includes dropping 3381 entries with NaN O2 Saturation
This includes dropping 3034 entries with NaN FEV1
This includes dropping 6248 entries with NaN FEF2575
Built data structure with 209 IDs and 17656 entries


In [5]:
df_old.head()

Unnamed: 0,ID,Date Recorded,FEV1,O2 Saturation,FEF2575,ecFEV1,StudyNumber,Age,Sex,Height,Predicted FEV1,Healthy O2 Saturation,ecFEV1 % Predicted,FEV1 % Predicted,O2 Saturation % Healthy
0,101,2019-02-20,1.31,97.0,0.53,1.32,projectb861@gmail.com,53,Male,173.0,3.610061,97.150104,36.564477,36.287474,99.845492
1,101,2019-02-21,1.29,96.0,0.56,1.32,projectb861@gmail.com,53,Male,173.0,3.610061,97.150104,36.564477,35.733466,98.816157
2,101,2019-02-22,1.32,96.0,0.63,1.32,projectb861@gmail.com,53,Male,173.0,3.610061,97.150104,36.564477,36.564477,98.816157
3,101,2019-02-23,1.28,97.0,0.52,1.33,projectb861@gmail.com,53,Male,173.0,3.610061,97.150104,36.841481,35.456463,99.845492
4,101,2019-02-24,1.33,98.0,0.59,1.36,projectb861@gmail.com,53,Male,173.0,3.610061,97.150104,37.672492,36.841481,100.874827
