The modelling of O2 saturation is difficult because we observed trends/signals in the SmartCare data that are medically hard to explain or counterintuitive. Hence, we are running a complementary analysis of the O2 saturation patterns using the Breathe data.

Boxplots of O2 saturation grouped by ID:
- ordered by predicted FEV1
- ordered by avg FEV1 % predicted
- ordered by avg FEV1

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

# Import biology module
import sys
sys.path.append('../data')
import biology as bio


plotsdir = "../../../../PlotsSmartcare/O2_FEV1/"


# Data processing

In [2]:
# Patient demographics from the "brPatient" sheet (spreadsheet columns A, J, K, L).
# ID is cast to string right away.
df_patient = pd.read_excel(
    '../../../../DataFiles/BR/PredModInputData.xlsx',
    sheet_name="brPatient",
    usecols="A, J, K, L",
).astype({'ID': str})
df_patient

Unnamed: 0,ID,Age,Sex,Height
0,101,53,Male,173.0
1,102,45,Male,176.0
2,103,39,Female,161.0
3,104,25,Female,143.0
4,105,26,Female,165.0
...,...,...,...,...
253,354,20,Male,166.0
254,355,23,Male,177.0
255,356,44,Female,162.0
256,357,24,Female,164.0


In [3]:
# Raw measurements from the "BRphysdata" sheet (spreadsheet columns A, E, G, H, J)
df_meas = pd.read_excel(
    '../../../../DataFiles/BR/PredModInputData.xlsx',
    sheet_name="BRphysdata",
    usecols="A, E, G, H , J",
)

In [4]:
# SmartCareID is handled as a string; the FEV column is called FEV1 from here on
df_meas = df_meas.astype({'SmartCareID': str}).rename(columns={'FEV': 'FEV1'})
df_meas.head(15)

Unnamed: 0,SmartCareID,RecordingType,Date_TimeRecorded,FEV1,O2Saturation
0,101,CalorieRecording,2019-02-20 00:00:00.000,0.0,0.0
1,101,CoughRecording,2019-02-20 08:13:40.546,0.0,0.0
2,101,FEF2575Recording,2019-02-20 08:16:56.942,0.53,0.0
3,101,FEV075Recording,2019-02-20 08:16:56.942,1.11,0.0
4,101,FEV1DivFEV6Recording,2019-02-20 08:16:56.942,0.5,0.0
5,101,FEV1Recording,2019-02-20 08:16:56.942,1.31,0.0
6,101,FEV6Recording,2019-02-20 08:16:56.942,2.6,0.0
7,101,HasColdOrFluRecording,2019-02-20 08:13:37.175,0.0,0.0
8,101,HasHayFeverRecording,2019-02-20 08:13:37.175,0.0,0.0
9,101,LungFunctionRecording,2019-02-20 08:16:56.942,0.0,0.0


In [5]:
# Keep only the two recording types used in this analysis
kept_recording_types = ['FEV1Recording', 'O2SaturationRecording']
df_meas = df_meas[df_meas['RecordingType'].isin(kept_recording_types)]

In [6]:
df_meas.head()

Unnamed: 0,SmartCareID,RecordingType,Date_TimeRecorded,FEV1,O2Saturation
5,101,FEV1Recording,2019-02-20 08:16:56.942,1.31,0.0
12,101,O2SaturationRecording,2019-02-20 08:12:48.263,0.0,97.0
23,101,FEV1Recording,2019-02-21 08:32:41.028,1.29,0.0
30,101,O2SaturationRecording,2019-02-21 08:28:11.712,0.0,96.0
41,101,FEV1Recording,2019-02-22 09:17:57.892,1.32,0.0


In [7]:
# 0.00 entries become NaN, the timestamp is reduced to a calendar date
# (DateRecorded), and the two columns that are no longer needed are dropped.
df_meas = (
    df_meas.replace(0.00, np.nan)
    .assign(DateRecorded=lambda frame: frame['Date_TimeRecorded'].dt.date)
    .drop(columns=['Date_TimeRecorded', 'RecordingType'])
)
df_meas.head()

Unnamed: 0,SmartCareID,FEV1,O2Saturation,DateRecorded
5,101,1.31,,2019-02-20
12,101,,97.0,2019-02-20
23,101,1.29,,2019-02-21
30,101,,96.0,2019-02-21
41,101,1.32,,2019-02-22


In [8]:
# Merge rows with same SmartCareId and DateRecorded, taking the non NaN value

# Define custom aggregation function
def custom_aggregation(series):
    """Collapse one group to its single non-NaN value.

    Returns NaN when every entry of ``series`` is NaN, the value itself when
    exactly one entry is non-NaN, and raises ValueError when more than one
    non-NaN entry is present.
    """
    present = series[series.notna()]
    n_present = len(present)
    if n_present == 0:
        return np.nan
    if n_present == 1:
        return present.iloc[0]
    raise ValueError(f"More than 1 non NaN value in group: {present.tolist()}")

# One row per (SmartCareID, DateRecorded); custom_aggregation raises if FEV1 or O2Saturation has two non-NaN values for the same patient and day
df_meas = df_meas.groupby(['SmartCareID', 'DateRecorded'])[['FEV1', 'O2Saturation']].agg(custom_aggregation)

In [9]:
df_meas.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,FEV1,O2Saturation
SmartCareID,DateRecorded,Unnamed: 2_level_1,Unnamed: 3_level_1
101,2019-02-20,1.31,97.0
101,2019-02-21,1.29,96.0
101,2019-02-22,1.32,96.0
101,2019-02-23,1.28,97.0
101,2019-02-24,1.33,98.0


In [10]:
# Missing-value audit, then keep only the rows that have both measurements
nan_mask = df_meas.isna()
print(nan_mask.all(axis=1).sum())  # rows where FEV1 and O2Saturation are both NaN
print(nan_mask.any(axis=1).sum())  # rows where at least one of the two is NaN
print(len(df_meas))  # row count before dropping
df_meas = df_meas.dropna()
print(len(df_meas))  # row count after dropping

0
6414
26812
20398


In [11]:
# Attach the patient columns (Age, Sex, Height) to every measurement row by
# matching SmartCareID (an index level of df_meas) with ID in df_patient.
# validate='many_to_one' makes pandas raise a MergeError if an ID appears more
# than once in df_patient, which would otherwise silently duplicate rows.
df = df_meas.merge(
    df_patient,
    left_on='SmartCareID',
    right_on='ID',
    how='left',
    validate='many_to_one',
)
df

Unnamed: 0,FEV1,O2Saturation,ID,Age,Sex,Height
0,1.31,97.0,101,53,Male,173.0
1,1.29,96.0,101,53,Male,173.0
2,1.32,96.0,101,53,Male,173.0
3,1.28,97.0,101,53,Male,173.0
4,1.33,98.0,101,53,Male,173.0
...,...,...,...,...,...,...
20393,4.33,99.0,358,18,Male,177.0
20394,4.35,99.0,358,18,Male,177.0
20395,4.30,98.0,358,18,Male,177.0
20396,4.30,97.0,358,18,Male,177.0


In [12]:
# Compute predicted FEV1 with the LMS helpers of the biology module.
# The helpers only take Age, Sex and Height, so the prediction is evaluated
# once per patient and then mapped onto the measurement rows, instead of
# repeating the same calls for each of that patient's rows.
# Assumes Age, Sex and Height are identical on all rows of a given ID
# (they come from the patient table) — TODO confirm.
patient_rows = df.drop_duplicates(subset="ID").set_index("ID")[["Age", "Sex", "Height"]]
predicted_by_id = patient_rows.apply(
        lambda x: bio.calc_LMS_predicted_FEV1(
            bio.load_LMS_spline_vals(x.Age, x.Sex),
            bio.load_LMS_coeffs(x.Sex),
            x.Height,
            x.Age,
            x.Sex,
        )["Predicted FEV1"],
        axis=1,
    )
df["Predicted FEV1"] = df["ID"].map(predicted_by_id)
# Compute FEV1 % Predicted
df["FEV1 % Predicted"] = df["FEV1"] / df["Predicted FEV1"] * 100

# Drop patients with fewer than 10 rows (each row is one day with a measurement)
n_rows_before = df.shape[0]
n_ids_before = df.groupby('ID').size().shape[0]
df = df.groupby('ID').filter(lambda x: len(x) >= 10)
n_ids_after = df.groupby('ID').size().shape[0]
print(f"Removed {n_rows_before - df.shape[0]}/{n_rows_before} rows, {n_ids_before - n_ids_after}/{n_ids_before} patients")
df

Removed 149/20398 rows, 40/214 patients


Unnamed: 0,FEV1,O2Saturation,ID,Age,Sex,Height,Predicted FEV1,FEV1 % Predicted
0,1.31,97.0,101,53,Male,173.0,3.610061,36.287474
1,1.29,96.0,101,53,Male,173.0,3.610061,35.733466
2,1.32,96.0,101,53,Male,173.0,3.610061,36.564477
3,1.28,97.0,101,53,Male,173.0,3.610061,35.456463
4,1.33,98.0,101,53,Male,173.0,3.610061,36.841481
...,...,...,...,...,...,...,...,...
20393,4.33,99.0,358,18,Male,177.0,4.505342,96.108129
20394,4.35,99.0,358,18,Male,177.0,4.505342,96.552046
20395,4.30,98.0,358,18,Male,177.0,4.505342,95.442252
20396,4.30,97.0,358,18,Male,177.0,4.505342,95.442252


In [13]:
# Discard O2 saturation readings below 85 (noted as affecting a single individual, ID 111)
o2_in_range = df['O2Saturation'] >= 85
df = df[o2_in_range]

In [14]:
# Compute avg FEV1 % Predicted per individual
def compute_avg(df, col_name, unit):
    """Add a per-ID average of ``col_name`` and a matching label column.

    Two columns are added to a copy of ``df`` (the input is not modified):

    * ``f"{col_name}_avg"``: mean of ``col_name`` over all rows sharing the
      row's ``ID``;
    * ``f"ID (avg {col_name})"``: string ``"<ID> (<avg rounded to 1 decimal><unit>)"``.

    The function is idempotent: calling it again on its own output overwrites
    the two columns instead of producing duplicated column names.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``ID`` and ``col_name``.
    col_name : str
        Column to average per ID.
    unit : str
        Suffix appended to the rounded average in the label (e.g. ``"%"``).

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the two columns added (or refreshed).
    """
    avg_col = f"{col_name}_avg"
    label_col = f"ID (avg {col_name})"

    # Per-ID mean broadcast back to every row of that ID
    per_row_avg = df.groupby('ID')[col_name].transform('mean')

    # Built value by value so rounding uses Python's round(), as before
    labels = [
        f"{patient_id} ({str(round(avg, 1))}{unit})"
        for patient_id, avg in zip(df['ID'], per_row_avg)
    ]
    return df.assign(**{avg_col: per_row_avg, label_col: labels})

df = compute_avg(df, 'FEV1 % Predicted', '%')
df = compute_avg(df, 'FEV1', 'L')


In [15]:
# Label per row: "<ID> (<predicted FEV1 rounded to 1 decimal>L)"
df["ID (Predicted FEV1)"] = [
    f"{patient_id} ({str(round(predicted, 1))}L)"
    for patient_id, predicted in zip(df["ID"], df["Predicted FEV1"])
]

In [16]:
df

Unnamed: 0,FEV1,O2Saturation,ID,Age,Sex,Height,Predicted FEV1,FEV1 % Predicted,FEV1 % Predicted_avg,ID (avg FEV1 % Predicted),FEV1_avg,ID (avg FEV1),ID (Predicted FEV1)
0,1.31,97.0,101,53,Male,173.0,3.610061,36.287474,41.090567,101 (41.1%),1.483395,101 (1.5L),101 (3.6L)
1,1.29,96.0,101,53,Male,173.0,3.610061,35.733466,41.090567,101 (41.1%),1.483395,101 (1.5L),101 (3.6L)
2,1.32,96.0,101,53,Male,173.0,3.610061,36.564477,41.090567,101 (41.1%),1.483395,101 (1.5L),101 (3.6L)
3,1.28,97.0,101,53,Male,173.0,3.610061,35.456463,41.090567,101 (41.1%),1.483395,101 (1.5L),101 (3.6L)
4,1.33,98.0,101,53,Male,173.0,3.610061,36.841481,41.090567,101 (41.1%),1.483395,101 (1.5L),101 (3.6L)
...,...,...,...,...,...,...,...,...,...,...,...,...,...
20393,4.33,99.0,358,18,Male,177.0,4.505342,96.108129,95.724745,358 (95.7%),4.312727,358 (4.3L),358 (4.5L)
20394,4.35,99.0,358,18,Male,177.0,4.505342,96.552046,95.724745,358 (95.7%),4.312727,358 (4.3L),358 (4.5L)
20395,4.30,98.0,358,18,Male,177.0,4.505342,95.442252,95.724745,358 (95.7%),4.312727,358 (4.3L),358 (4.5L)
20396,4.30,97.0,358,18,Male,177.0,4.505342,95.442252,95.724745,358 (95.7%),4.312727,358 (4.3L),358 (4.5L)


# SpO2 boxplots per individual

In [41]:
# Plot configuration
# Column that orders the boxes: 'FEV1 % Predicted', 'Predicted FEV1' or 'FEV1'
col = 'Predicted FEV1'
# Patients to include: "both", "male" or "female"
gender = "male"

# gender option -> (value to match in df.Sex, or None for no filtering; title suffix)
gender_options = {
    "both": (None, ""),
    "male": ("Male", ", males only"),
    "female": ("Female", ", females only"),
}
if gender not in gender_options:
    # Without this guard an unknown value leaves df_plot undefined, or silently
    # reuses the df_plot left over from a previous run of this cell.
    raise ValueError(f"gender must be one of {list(gender_options)}, got {gender!r}")
sex_value, title_suffix = gender_options[gender]
df_plot = df if sex_value is None else df[df.Sex == sex_value]

# 'Predicted FEV1' is used directly; the other columns are ordered and labelled
# through their per-patient "<col>_avg" / "ID (avg <col>)" columns.
sort_col = col if col == 'Predicted FEV1' else f"{col}_avg"
label_col = f"ID ({col if col == 'Predicted FEV1' else 'avg ' + col})"
df_plot = df_plot.sort_values(by=[sort_col])

title = f"Breathe - Boxplots for O2 Sat, ordered by {col} ({df_plot.shape[0]} points{title_suffix})"

fig = px.box(
        df_plot,
        x=label_col,
        y="O2Saturation",
    )
# Add title
fig.update_layout(title=title)
# Also draw the mean in each box
fig.update_traces(boxmean=True)

# Update fig size
fig.update_layout(height=600, width=3000)
fig.show()
fig.write_image("{}/Factors - {}.pdf".format(plotsdir, title))

In [43]:
# Column whose per-patient mean is plotted against the mean O2 saturation
# (alternatives: 'Predicted FEV1', 'FEV1')
col = 'FEV1 % Predicted'

# One row per patient: mean O2 saturation and mean of `col`, ordered by the latter
df_means = (
    df.groupby('ID')[['O2Saturation', col]]
    .mean()
    .rename(columns={'O2Saturation': 'mean O2 Saturation', col: f'mean {col}'})
    .sort_values(by=f'mean {col}')
)

# Pearson correlation between the two per-patient means (off-diagonal entry)
corr = df_means.corr().iloc[0, 1]

fig = px.scatter(
    x=df_means[f'mean {col}'],
    y=df_means['mean O2 Saturation'],
)
# Small markers
fig.update_traces(marker=dict(size=3))
# Small font; the title reports the number of patients and the correlation
fig.update_layout(
    font=dict(size=8),
    title=f"Mean O2 Saturation vs mean {col} ({len(df_means)} individuals, Pearson correlation coefficient: {round(corr, 2)})",
)
# Axis labels
fig.update_xaxes(title_text=f'Mean {col}')
fig.update_yaxes(title_text='Mean O2 Saturation')
fig.show()

# Study the 60% FEV1 % Predicted cut-off

In [44]:
# Split the rows at FEV1 % Predicted = 60% (60 itself goes to the low group).
# NOTE(review): the split uses the per-measurement value, so a patient with
# readings on both sides of 60% contributes to both groups — confirm this is
# intended (the healthy-subset cells filter on 'FEV1 % Predicted_avg' instead).
is_low = df['FEV1 % Predicted'] <= 60
is_high = df['FEV1 % Predicted'] > 60
df_low = df[is_low]
df_high = df[is_high]

# Per-patient O2 saturation within each group
o2_low = df_low.groupby('ID')['O2Saturation']
o2_high = df_high.groupby('ID')['O2Saturation']

# Average O2 saturation per individual
means_low = o2_low.mean()
means_high = o2_high.mean()

# "Achievable range" per individual: 75th percentile minus 25th percentile
achievable_ranges_low = o2_low.quantile(0.75) - o2_low.quantile(0.25)
achievable_ranges_high = o2_high.quantile(0.75) - o2_high.quantile(0.25)

In [45]:
# Group-level averages of the per-patient means and of the achievable ranges
for label, per_patient in (
    ("means_low", means_low),
    ("means_high", means_high),
    ("achievable_ranges_low", achievable_ranges_low),
    ("achievable_ranges_high", achievable_ranges_high),
):
    print(f"Mean of {label}: {per_patient.mean()}")

Mean of means_low: 96.45803558105217
Mean of means_high: 97.44298835023818
Mean of achievable_ranges_low: 1.231818181818182
Mean of achievable_ranges_high: 0.9890510948905109


In [46]:
# Side-by-side boxplots of the per-patient achievable ranges for the two groups.
# The low group is built with `<= 60`, so its label reads "≤60%" rather than "<60%".
fig = go.Figure()
fig.add_trace(go.Box(y=achievable_ranges_low, name='FEV1 % Predicted ≤60%', boxmean=True))
fig.add_trace(go.Box(y=achievable_ranges_high, name='FEV1 % Predicted >60%', boxmean=True))
# Title and figure size
fig.update_layout(title="Achievable ranges of O2 saturation (75th-25th percentiles)", height=600, width=600)
fig.show()

# Study SpO2 sex bias

In [47]:
# Split by recorded sex; the assertion fails if some row is neither "Male" nor "Female"
is_male = df.Sex == "Male"
is_female = df.Sex == "Female"
df_males, df_females = df[is_male], df[is_female]
assert len(df_males) + len(df_females) == len(df)

In [48]:
df_females.O2Saturation.describe()

count    10988.000000
mean        97.156656
std          1.704953
min         85.000000
25%         96.000000
50%         98.000000
75%         98.000000
max        100.000000
Name: O2Saturation, dtype: float64

In [49]:
df_males.O2Saturation.describe()

count    9255.000000
mean       96.748664
std         1.496404
min        89.000000
25%        96.000000
50%        97.000000
75%        98.000000
max       100.000000
Name: O2Saturation, dtype: float64

Conclusions:

All: mean female SpO2 - mean male SpO2 (standard deviation in parentheses) = 97.2 (1.7%) - 96.8 (1.5%) = 0.4%

\>60% FEV1 % Predicted avg: 97.6 (1.3%) - 97.2 (1.2%) = 0.4%

\>80% FEV1 % Predicted avg: 97.9 (1.2%) - 97.3 (1.1%) = 0.6%

\>90% FEV1 % Predicted avg: 98.2 (0.9%) - 97.2 (1.2%) = 1%

____________
Only within the 18-40 yr age range (this excludes 20 females and 14 males)

All: 97.2 (1.7%) - 96.8 (1.5%) = 0.4%

\>90% FEV1 % Predicted avg: 98.0 (1.0%) - 97.3 (1.1%) = 0.7%

In [25]:
# "Healthy" subset: rows of patients whose average FEV1 % Predicted reaches the threshold
threshold_fev1_prct_pred = 60
df_healthy_males, df_healthy_females = (
    subset[subset['FEV1 % Predicted_avg'] >= threshold_fev1_prct_pred]
    for subset in (df_males, df_females)
)

In [26]:
df_healthy_females.O2Saturation.describe()

count    5576.000000
mean       97.622788
std         1.385999
min        88.000000
25%        97.000000
50%        98.000000
75%        98.000000
max       100.000000
Name: O2Saturation, dtype: float64

In [27]:
df_healthy_males.O2Saturation.describe()

count    5191.000000
mean       97.194259
std         1.189012
min        91.000000
25%        97.000000
50%        97.000000
75%        98.000000
max       100.000000
Name: O2Saturation, dtype: float64

In [28]:
# Restrict each sex to ages 18 to 40, both bounds included
df_young_males = df_males[df_males['Age'].between(18, 40)]
df_young_females = df_females[df_females['Age'].between(18, 40)]
# Report how many distinct patients remain out of each sex's total
for label, young, everyone in (
    ("females", df_young_females, df_females),
    ("males", df_young_males, df_males),
):
    print(f"{len(young.ID.unique())}/{len(everyone.ID.unique())} {label} between 18 and 40")

77/97 females between 18 and 40
63/77 males between 18 and 40


In [29]:
df_young_females.O2Saturation.describe()

count    7937.000000
mean       97.197472
std         1.695689
min        85.000000
25%        97.000000
50%        98.000000
75%        98.000000
max       100.000000
Name: O2Saturation, dtype: float64

In [30]:
df_young_males.O2Saturation.describe()

count    6533.000000
mean       96.775815
std         1.547469
min        89.000000
25%        96.000000
50%        97.000000
75%        98.000000
max       100.000000
Name: O2Saturation, dtype: float64

In [31]:
# Threshold for healthy individuals
# Re-binds threshold_fev1_prct_pred (set to 60 in the earlier healthy-subset cell) to 90;
# the following cell (young females) reads this value, so the two cells must run in order.
threshold_fev1_prct_pred = 90
df_young_males[df_young_males['FEV1 % Predicted_avg'] >= threshold_fev1_prct_pred].O2Saturation.describe()


count    956.000000
mean      97.284414
std        1.125001
min       93.000000
25%       97.000000
50%       98.000000
75%       98.000000
max      100.000000
Name: O2Saturation, dtype: float64

In [32]:
df_young_females[df_young_females['FEV1 % Predicted_avg'] >= threshold_fev1_prct_pred].O2Saturation.describe()

count    993.000000
mean      98.008056
std        0.967171
min       89.000000
25%       98.000000
50%       98.000000
75%       99.000000
max      100.000000
Name: O2Saturation, dtype: float64