Modelling O2 saturation is difficult because we observed trends/signals in the SmartCare data that are medically hard to explain or counterintuitive. Hence, we are doing a complementary analysis of the O2 saturation patterns using the Breathe data.

Boxplots of O2 saturation grouped by ID:
- ordered by predicted FEV1
- ordered by avg FEV1 % predicted
- ordered by avg FEV1

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px

# Import biology module
import sys
sys.path.append('../data')
import biology as bio

# Output directory for saved figures; it is the first argument of the path
# handed to fig.write_image in the final plotting cell.
plotsdir = "../../../../PlotsSmartcare/O2_FEV1/"


In [2]:
# Patient sheet: only workbook columns A, J, K and L are read (they load as
# ID, Age, Sex and Height), and the ID is stored as text.
df_patient = pd.read_excel(
    '../../../../DataFiles/BR/PredModInputData.xlsx',
    sheet_name="brPatient",
    usecols="A, J, K, L",
).astype({'ID': str})
df_patient

Unnamed: 0,ID,Age,Sex,Height
0,101,53,Male,173.0
1,102,45,Male,176.0
2,103,39,Female,161.0
3,104,25,Female,143.0
4,105,26,Female,165.0
...,...,...,...,...
253,354,20,Male,166.0
254,355,23,Male,177.0
255,356,44,Female,162.0
256,357,24,Female,164.0


In [3]:
# Measurement sheet: only workbook columns A, E, G, H and J are read.
df_meas = pd.read_excel(
    '../../../../DataFiles/BR/PredModInputData.xlsx',
    sheet_name="BRphysdata",
    usecols="A, E, G, H , J",
)

In [4]:
# Store SmartCareID as text and give the FEV column the name FEV1
df_meas = (
    df_meas
    .astype({'SmartCareID': str})
    .rename(columns={'FEV': 'FEV1'})
)
df_meas.head(15)

Unnamed: 0,SmartCareID,RecordingType,Date_TimeRecorded,FEV1,O2Saturation
0,101,CalorieRecording,2019-02-20 00:00:00.000,0.0,0.0
1,101,CoughRecording,2019-02-20 08:13:40.546,0.0,0.0
2,101,FEF2575Recording,2019-02-20 08:16:56.942,0.53,0.0
3,101,FEV075Recording,2019-02-20 08:16:56.942,1.11,0.0
4,101,FEV1DivFEV6Recording,2019-02-20 08:16:56.942,0.5,0.0
5,101,FEV1Recording,2019-02-20 08:16:56.942,1.31,0.0
6,101,FEV6Recording,2019-02-20 08:16:56.942,2.6,0.0
7,101,HasColdOrFluRecording,2019-02-20 08:13:37.175,0.0,0.0
8,101,HasHayFeverRecording,2019-02-20 08:13:37.175,0.0,0.0
9,101,LungFunctionRecording,2019-02-20 08:16:56.942,0.0,0.0


In [5]:
# Only the FEV1 and O2 saturation recordings are kept; every other recording type is dropped
is_wanted_recording = df_meas['RecordingType'].isin(['FEV1Recording', 'O2SaturationRecording'])
df_meas = df_meas[is_wanted_recording]

In [6]:
# Preview the rows that survived the RecordingType filter
df_meas.head()

Unnamed: 0,SmartCareID,RecordingType,Date_TimeRecorded,FEV1,O2Saturation
5,101,FEV1Recording,2019-02-20 08:16:56.942,1.31,0.0
12,101,O2SaturationRecording,2019-02-20 08:12:48.263,0.0,97.0
23,101,FEV1Recording,2019-02-21 08:32:41.028,1.29,0.0
30,101,O2SaturationRecording,2019-02-21 08:28:11.712,0.0,96.0
41,101,FEV1Recording,2019-02-22 09:17:57.892,1.32,0.0


In [7]:
# In one pass:
#   - turn every 0.00 into NaN,
#   - derive DateRecorded (calendar date only) from Date_TimeRecorded,
#   - drop the Date_TimeRecorded and RecordingType columns.
df_meas = (
    df_meas
    .replace(0.00, np.nan)
    .assign(DateRecorded=lambda frame: frame['Date_TimeRecorded'].dt.date)
    .drop(columns=['Date_TimeRecorded', 'RecordingType'])
)
df_meas.head()

Unnamed: 0,SmartCareID,FEV1,O2Saturation,DateRecorded
5,101,1.31,,2019-02-20
12,101,,97.0,2019-02-20
23,101,1.29,,2019-02-21
30,101,,96.0,2019-02-21
41,101,1.32,,2019-02-22


In [8]:
# Merge rows with same SmartCareId and DateRecorded, taking the non NaN value

# Aggregation used to collapse a group of values into a single value
def custom_aggregation(series):
    """Return the only non-NaN value of ``series``.

    Returns NaN when every value is NaN (or the series is empty).

    Raises:
        ValueError: if the series holds more than one non-NaN value.
    """
    present = series[series.notna()]
    n_present = present.shape[0]
    if n_present == 0:
        return np.nan
    if n_present == 1:
        return present.iloc[0]
    raise ValueError(f"More than 1 non NaN value in group: {present.tolist()}")

# One row per (SmartCareID, DateRecorded): each measurement column is reduced
# with custom_aggregation
df_meas = (
    df_meas
    .groupby(['SmartCareID', 'DateRecorded'])[['FEV1', 'O2Saturation']]
    .agg(custom_aggregation)
)

In [9]:
# Preview the aggregated table, indexed by (SmartCareID, DateRecorded)
df_meas.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,FEV1,O2Saturation
SmartCareID,DateRecorded,Unnamed: 2_level_1,Unnamed: 3_level_1
101,2019-02-20,1.31,97.0
101,2019-02-21,1.29,96.0
101,2019-02-22,1.32,96.0
101,2019-02-23,1.28,97.0
101,2019-02-24,1.33,98.0


In [10]:
# Missing-value audit, then keep only rows where both measurements are present
missing = df_meas.isna()
# Rows where FEV1 and O2Saturation are both NaN
print(missing.all(axis=1).sum())
# Rows where at least one of FEV1 / O2Saturation is NaN
print(missing.any(axis=1).sum())
# Row count before dropping
print(len(df_meas))
df_meas = df_meas.dropna()
# Row count after dropping
print(len(df_meas))

0
6414
26812
20398


In [43]:
# Attach each patient's Age, Sex and Height to their measurements.
# This is a left join from the measurements (SmartCareID) onto the patient
# sheet (ID), so every measurement row is kept.
df = pd.merge(
    df_meas,
    df_patient,
    how='left',
    left_on='SmartCareID',
    right_on='ID',
)
df

Unnamed: 0,FEV1,O2Saturation,ID,Age,Sex,Height
0,1.31,97.0,101,53,Male,173.0
1,1.29,96.0,101,53,Male,173.0
2,1.32,96.0,101,53,Male,173.0
3,1.28,97.0,101,53,Male,173.0
4,1.33,98.0,101,53,Male,173.0
...,...,...,...,...,...,...
20393,4.33,99.0,358,18,Male,177.0
20394,4.35,99.0,358,18,Male,177.0
20395,4.30,98.0,358,18,Male,177.0
20396,4.30,97.0,358,18,Male,177.0


In [44]:
# Predicted FEV1 for every row, from the biology module's calc_predicted_fev1
# (called with Height, Age and Sex; its "Predicted FEV1" entry is kept)
predicted_fev1 = df.apply(
    lambda row: bio.calc_predicted_fev1(row.Height, row.Age, row.Sex)["Predicted FEV1"],
    axis=1,
)
df["Predicted FEV1"] = predicted_fev1
# Measured FEV1 expressed as a percentage of the predicted value
df["FEV1 % Predicted"] = df["FEV1"] / df["Predicted FEV1"] * 100

# Drop every patient (ID) that has fewer than 10 rows, and report how much was removed
tmp_shape = df.shape[0]
tmp_ids = df.groupby('ID').size()
df = df.groupby('ID').filter(lambda grp: len(grp) >= 10)
rows_removed = tmp_shape - df.shape[0]
patients_removed = tmp_ids.shape[0] - df.groupby('ID').size().shape[0]
print(f"Removed {rows_removed}/{tmp_shape} rows, {patients_removed}/{tmp_ids.shape[0]} patients")
df

Removed 149/20398 rows, 40/214 patients


Unnamed: 0,FEV1,O2Saturation,ID,Age,Sex,Height,Predicted FEV1,FEV1 % Predicted
0,1.31,97.0,101,53,Male,173.0,3.412,38.393904
1,1.29,96.0,101,53,Male,173.0,3.412,37.807737
2,1.32,96.0,101,53,Male,173.0,3.412,38.686987
3,1.28,97.0,101,53,Male,173.0,3.412,37.514654
4,1.33,98.0,101,53,Male,173.0,3.412,38.980070
...,...,...,...,...,...,...,...,...
20393,4.33,99.0,358,18,Male,177.0,4.599,94.150902
20394,4.35,99.0,358,18,Male,177.0,4.599,94.585780
20395,4.30,98.0,358,18,Male,177.0,4.599,93.498587
20396,4.30,97.0,358,18,Male,177.0,4.599,93.498587


In [45]:
# Keep only rows whose O2 saturation is at least 85
df = df.loc[df['O2Saturation'].ge(85)]

In [46]:
# Per-patient average of a measurement column, plus a display label built from it
def compute_avg(df, col_name, unit):
    """Add the per-ID mean of ``col_name`` and a matching label column.

    Two columns are written on a copy of ``df``:

    * ``"<col_name>_avg"``: mean of ``col_name`` over all rows sharing the
      row's ``ID``.
    * ``"ID (avg <col_name>)"``: text of the form ``"<ID> (<avg><unit>)"``,
      with the average rounded to one decimal.

    Calling the function again on its own output overwrites those two columns
    instead of colliding with them, so the notebook cell can be re-run safely.
    An empty frame is handled as well.

    Args:
        df: frame with an ``ID`` column and a ``col_name`` column.
        col_name: name of the column to average.
        unit: text appended to the rounded average inside the label.

    Returns:
        A new DataFrame; the input frame is not modified.
    """
    avg_col = f"{col_name}_avg"
    label_col = f"ID (avg {col_name})"

    out = df.copy()
    # transform('mean') returns one value per input row, so it can be assigned
    # directly. Unlike join(..., rsuffix='_avg') it does not clash with an
    # already existing "<col_name>_avg" column.
    out[avg_col] = out.groupby('ID')[col_name].transform('mean')
    # Built-in round() on plain Python floats keeps the label text the same as
    # the row-wise formatting this replaces.
    out[label_col] = [
        f"{patient_id} ({str(round(avg, 1))}{unit})"
        for patient_id, avg in zip(out['ID'], out[avg_col])
    ]
    return out

# Add the per-patient average and label columns for both measured quantities
for col_name, unit in [('FEV1 % Predicted', '%'), ('FEV1', 'L')]:
    df = compute_avg(df, col_name, unit)


In [47]:
# Label of the form "<ID> (<predicted FEV1 rounded to 1 decimal>L)" for every row
df["ID (Predicted FEV1)"] = [
    f"{patient_id} ({str(round(predicted, 1))}L)"
    for patient_id, predicted in zip(df['ID'], df['Predicted FEV1'])
]

In [48]:
# Display the frame, which now carries the two average columns and the three "ID (...)" label columns
df

Unnamed: 0,FEV1,O2Saturation,ID,Age,Sex,Height,Predicted FEV1,FEV1 % Predicted,FEV1 % Predicted_avg,ID (avg FEV1 % Predicted),FEV1_avg,ID (avg FEV1),ID (Predicted FEV1)
0,1.31,97.0,101,53,Male,173.0,3.412,38.393904,43.475809,101 (43.5%),1.483395,101 (1.5L),101 (3.4L)
1,1.29,96.0,101,53,Male,173.0,3.412,37.807737,43.475809,101 (43.5%),1.483395,101 (1.5L),101 (3.4L)
2,1.32,96.0,101,53,Male,173.0,3.412,38.686987,43.475809,101 (43.5%),1.483395,101 (1.5L),101 (3.4L)
3,1.28,97.0,101,53,Male,173.0,3.412,37.514654,43.475809,101 (43.5%),1.483395,101 (1.5L),101 (3.4L)
4,1.33,98.0,101,53,Male,173.0,3.412,38.980070,43.475809,101 (43.5%),1.483395,101 (1.5L),101 (3.4L)
...,...,...,...,...,...,...,...,...,...,...,...,...,...
20393,4.33,99.0,358,18,Male,177.0,4.599,94.150902,93.775327,358 (93.8%),4.312727,358 (4.3L),358 (4.6L)
20394,4.35,99.0,358,18,Male,177.0,4.599,94.585780,93.775327,358 (93.8%),4.312727,358 (4.3L),358 (4.6L)
20395,4.30,98.0,358,18,Male,177.0,4.599,93.498587,93.775327,358 (93.8%),4.312727,358 (4.3L),358 (4.6L)
20396,4.30,97.0,358,18,Male,177.0,4.599,93.498587,93.775327,358 (93.8%),4.312727,358 (4.3L),358 (4.6L)


In [51]:
# Draw and save one O2 saturation boxplot per patient for each of the three
# orderings listed in the introduction, instead of hand-editing `col` and
# re-running the cell.
# 'Predicted FEV1' is used directly; the measured quantities are ordered by
# their per-patient average (the "<col>_avg" columns built by compute_avg).
# 'FEV1' is deliberately last, so that `col`, `title`, `fig` and the row order
# of `df` finish exactly as they did when only the last assignment to `col`
# took effect.
for col in ['Predicted FEV1', 'FEV1 % Predicted', 'FEV1']:
    sort_col = col if col == 'Predicted FEV1' else f"{col}_avg"
    label_col = f"ID ({col if col == 'Predicted FEV1' else 'avg ' + col})"

    # Sort ascending so the boxes appear from lowest to highest value
    # (the x axis follows the order in which the labels occur in the data).
    df = df.sort_values(by=[sort_col])

    title = f"Breathe - Boxplots for O2 Sat, ordered by {col} ({df.shape[0]} points)"

    # The title is shown on the figure as well as used in the file name
    fig = px.box(
            df,
            x=label_col,
            y="O2Saturation",
            title=title,
        )
    # Show the mean in each box
    fig.update_traces(boxmean=True)

    # Update fig size
    fig.update_layout(height=600, width=3000)
    fig.show()
    fig.write_image("{}/Factors - {}.pdf".format(plotsdir, title))