In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

# Import biology module
import sys

sys.path.append("../data")
import biology as bio
import breathe_data
import sanity_checks


plotsdir = "../../../../PlotsSmartcare/O2_FEV1/"

In [4]:
print("\n*** Loading measurements data ***")
    df = pd.read_excel(
        "../../../../DataFiles/BR/PredModInputData.xlsx",
        sheet_name="BRphysdata",
        usecols="A, E, G, H , J",
    )


*** Loading patients data ***
Number of IDs:  258


In [2]:
df_meas = breathe_data.load_measurements()


*** Loading measurements data ***


IntCastingNaNError: Cannot convert non-finite values (NA or inf) to integer: Error while type casting for column 'O2 Saturation'

In [None]:
df.apply(lambda x: sanity_checks.fev1(x["FEV1"], x.ID), axis=1)
# df.apply(lambda x: sanity_checks.o2_saturation(x["O2 Saturation"], x.ID), axis=1)

In [22]:
extract_measure(df_meas, "O2 Saturation")

KeyError: "['Date recorded'] not in index"

In [21]:
def extract_measure(measurements_in, label, with_patient_id=False):
    # Could also filter by Recording Type
    if with_patient_id:
        measurements_out = measurements_in[measurements_in[label].notnull()][
            ["User ID", "Date recorded", label]
        ]
    else:
        measurements_out = measurements_in[measurements_in[label].notnull()][
            ["ID", "Date recorded", label]
        ]
    print("{} has {} measurements".format(label, measurements_out.shape[0]))
    return measurements_out

In [2]:
df = breathe_data.build_O2_FEV1_df()


*** Building O2 Saturation and FEV1 dataframe ***

*** Loading patients data ***
Number of IDs:  258

*** Loading measurements data ***
FEV1 and SpO2 NaN together:  0
FEV1 or SpO2 is NaN:  6414
Number of rows:  26812
Dropping NaN rows
Number of rows:  20398
Number of IDs, datapoints after merging patient and measurement data:  214 20398


In [12]:
# 0% airway resistance: FEV1 = Predicted FEV1
# 25% airway resistance: FEV1 = 0.75 * Predicted FEV1
# Negative airway resistance: FEV1 > Predicted FEV1
# Airway resistance = 1 - FEV1/Predicted FEV1
df["Airway resistance"] = (1 - df.FEV1/df["Predicted FEV1"])*100

# 
df["Relative O2 drop (%)"] = (df["O2 Saturation"]/df["Healthy O2 Saturation"]) * 100

df

Unnamed: 0,FEV1,O2 Saturation,ID,Age,Sex,Height,Predicted FEV1,FEV1 % Predicted,FEV1 % Predicted_avg,ID (avg FEV1 % Predicted),FEV1_avg,ID (avg FEV1),ID (Predicted FEV1),Healthy O2 Saturation,Airway resistance,O2 drop,Relative O2 drop
0,1.31,97.0,101,53,Male,173.0,3.610061,36.287474,41.090567,101 (41.1%),1.483395,101 (1.5L),101 (3.6L),96.008280,63.712526,-0.991720,101.032952
1,1.29,96.0,101,53,Male,173.0,3.610061,35.733466,41.090567,101 (41.1%),1.483395,101 (1.5L),101 (3.6L),96.008280,64.266534,0.008280,99.991376
2,1.32,96.0,101,53,Male,173.0,3.610061,36.564477,41.090567,101 (41.1%),1.483395,101 (1.5L),101 (3.6L),96.008280,63.435523,0.008280,99.991376
3,1.28,97.0,101,53,Male,173.0,3.610061,35.456463,41.090567,101 (41.1%),1.483395,101 (1.5L),101 (3.6L),96.008280,64.543537,-0.991720,101.032952
4,1.33,98.0,101,53,Male,173.0,3.610061,36.841481,41.090567,101 (41.1%),1.483395,101 (1.5L),101 (3.6L),96.008280,63.158519,-1.991720,102.074529
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20393,4.33,99.0,358,18,Male,177.0,4.505342,96.108129,95.724745,358 (95.7%),4.312727,358 (4.3L),358 (4.5L),95.974812,3.891871,-3.025188,103.152064
20394,4.35,99.0,358,18,Male,177.0,4.505342,96.552046,95.724745,358 (95.7%),4.312727,358 (4.3L),358 (4.5L),95.974812,3.447954,-3.025188,103.152064
20395,4.30,98.0,358,18,Male,177.0,4.505342,95.442252,95.724745,358 (95.7%),4.312727,358 (4.3L),358 (4.5L),95.974812,4.557748,-2.025188,102.110124
20396,4.30,97.0,358,18,Male,177.0,4.505342,95.442252,95.724745,358 (95.7%),4.312727,358 (4.3L),358 (4.5L),95.974812,4.557748,-1.025188,101.068184


In [13]:
# Plot Airway resistance vs O2 drop
fig = px.scatter(df, x="Airway resistance", y="Relative O2 drop (%)")
fig.show()

In [17]:
# Filter airway resistance below 40%
df[df["Airway resistance"] < -40]

Unnamed: 0,FEV1,O2 Saturation,ID,Age,Sex,Height,Predicted FEV1,FEV1 % Predicted,FEV1 % Predicted_avg,ID (avg FEV1 % Predicted),FEV1_avg,ID (avg FEV1),ID (Predicted FEV1),Healthy O2 Saturation,Airway resistance,O2 drop,Relative O2 drop
13687,3.26,96.0,202,64,Female,157.0,2.213968,147.246961,99.589163,202 (99.6%),2.204872,202 (2.2L),202 (2.2L),96.735868,-47.246961,0.735868,99.239302
13692,3.31,98.0,202,64,Female,157.0,2.213968,149.50535,99.589163,202 (99.6%),2.204872,202 (2.2L),202 (2.2L),96.735868,-49.50535,-1.264132,101.306787
20061,6.0,99.0,330,24,Female,153.5,2.918146,205.610003,205.610003,330 (205.6%),6.0,330 (6.0L),330 (2.9L),96.765153,-105.610003,-2.234847,102.309558


In [22]:
# Plot FEV1 values for patient 202
fig = px.scatter(df[df.ID == "202"], x="FEV1", y="FEV1")
fig.show()

In [23]:
dfmeas = breathe_data.load_measurements()


*** Loading measurements data ***
FEV1 and SpO2 NaN together:  0
FEV1 or SpO2 is NaN:  6414
Number of rows:  26812
Dropping NaN rows
Number of rows:  20398


In [33]:
dfmeas.reset_index(inplace=True)

In [26]:
df_patients = breathe_data.load_patients()


*** Loading patients data ***
Number of IDs:  258


In [27]:
df_patients.head()

Unnamed: 0,ID,Age,Sex,Height
0,101,53,Male,173.0
1,102,45,Male,176.0
2,103,39,Female,161.0
3,104,25,Female,143.0
4,105,26,Female,165.0


In [34]:
dfmeas.merge(df_patients, right_on="ID", left_on="SmartCareID", how="left")

Unnamed: 0,SmartCareID,DateRecorded,FEV1,O2 Saturation,ID,Age,Sex,Height
0,101,2019-02-20,1.31,97.0,101,53,Male,173.0
1,101,2019-02-21,1.29,96.0,101,53,Male,173.0
2,101,2019-02-22,1.32,96.0,101,53,Male,173.0
3,101,2019-02-23,1.28,97.0,101,53,Male,173.0
4,101,2019-02-24,1.33,98.0,101,53,Male,173.0
...,...,...,...,...,...,...,...,...
20393,358,2021-06-05,4.33,99.0,358,18,Male,177.0
20394,358,2021-06-09,4.35,99.0,358,18,Male,177.0
20395,358,2021-06-10,4.30,98.0,358,18,Male,177.0
20396,358,2021-06-12,4.30,97.0,358,18,Male,177.0


In [30]:
print("\n*** Building O2 Saturation and FEV1 dataframe ***")

# Merge rows with same SmartCareId and DateRecorded, taking the non NaN value
## Define custom aggregation function
def custom_aggregation(series):
    non_nan_values = series.dropna()
    if len(non_nan_values) > 1:
        raise ValueError(
            f"More than 1 non NaN value in group: {non_nan_values.tolist()}"
        )
    if len(non_nan_values) == 0:
        return np.nan
    print(f"non_nan_values: {non_nan_values}")
    return non_nan_values.iloc[0]

df_meas = df_meas.groupby(["SmartCareID", "DateRecorded"])[
    ["FEV1", "O2 Saturation"]
].agg(custom_aggregation)

# Count rows where FEV and O2 Saturation are NaN together
print("FEV1 and O2 Saturation NaN together: ", df_meas.isna().all(axis=1).sum())
# Count and print rows where either FEV or O2 Saturation is NaN
print("FEV1 or O2 Saturation is NaN: ", df_meas.isna().any(axis=1).sum())
# Print number of rows
print("Number of rows: ", len(df_meas))
# Drop rows with nan values
df_meas = df_meas.dropna()
print("Dropping NaN rows")
# Count number of rows
print("Number of rows: ", len(df_meas))



*** Building O2 Saturation and FEV1 dataframe ***
non_nan_values: SmartCareID  DateRecorded
101          2019-02-20      1.31
Name: FEV1, dtype: float64
non_nan_values: SmartCareID  DateRecorded
101          2019-02-21      1.29
Name: FEV1, dtype: float64
non_nan_values: SmartCareID  DateRecorded
101          2019-02-22      1.32
Name: FEV1, dtype: float64
non_nan_values: SmartCareID  DateRecorded
101          2019-02-23      1.28
Name: FEV1, dtype: float64
non_nan_values: SmartCareID  DateRecorded
101          2019-02-24      1.33
Name: FEV1, dtype: float64
non_nan_values: SmartCareID  DateRecorded
101          2019-02-25      1.36
Name: FEV1, dtype: float64
non_nan_values: SmartCareID  DateRecorded
101          2019-02-26      1.32
Name: FEV1, dtype: float64
non_nan_values: SmartCareID  DateRecorded
101          2019-02-27      1.33
Name: FEV1, dtype: float64
non_nan_values: SmartCareID  DateRecorded
101          2019-02-28      1.28
Name: FEV1, dtype: float64
non_nan_values: SmartC

In [None]:

# Merge patient and measurement dataframes on SmartCareID and ID
df = df_meas.merge(df_patients, right_on="ID", left_on="SmartCareID", how="left")

# Print number of IDs
print(
    "Number of IDs, datapoints after merging patient and measurement data: ",
    df.ID.nunique(),
    len(df),
)

# Compute predicted FEV1 using calc_predicted FEV1 in the biology module
df["Predicted FEV1"] = df.apply(
    lambda x: bio.calc_LMS_predicted_FEV1(
        bio.load_LMS_spline_vals(x.Age, x.Sex),
        bio.load_LMS_coeffs(x.Sex),
        x.Height,
        x.Age,
        x.Sex,
    )["mean"],
    axis=1,
)
# Compute FEV1 % Predicted
df["FEV1 % Predicted"] = df["FEV1"] / df["Predicted FEV1"] * 100

# Compute avg FEV1 % Predicted per individual
def compute_avg(df, col_name, unit):
    tmp = df.groupby("ID")[col_name].mean()
    # Add tmp to a new column per Id
    df = df.join(tmp, on="ID", rsuffix="_avg")

    df[f"ID (avg {col_name})"] = df.apply(
        lambda x: f"{x.ID} ({str(round(x[f'{col_name}_avg'],1))}{unit})",
        axis=1,
    )
    return df

df = compute_avg(df, "FEV1 % Predicted", "%")
df = compute_avg(df, "FEV1", "L")

df[f"ID (Predicted FEV1)"] = df.apply(
    lambda x: f"{x.ID} ({str(round(x['Predicted FEV1'],1))}L)",
    axis=1,
)

df["Healthy O2 Saturation"] = df.apply(
    lambda x: bio.calc_healthy_O2_saturation(x["O2 Saturation"], x.Sex, x.Height)[
        "mean"
    ],
    axis=1,
)


In [2]:
df_patients = breathe_data.load_patients()


*** Loading patients data ***


AssertionError: 

In [3]:
# Get python version
import sys
print(sys.version)

3.10.13 (main, Sep 11 2023, 08:39:02) [Clang 14.0.6 ]


In [33]:
s = pd.Series(["hello"])
s.dtype == np.dtype('O')

True