In [1]:
from load_data import *
from partition import *
from desaturation_fev1 import *
from normalise import *
from exacerbation_periods import *
from smooth import *
from plot_helpers import *
from biology import *
import plotly.figure_factory as ff
import plotly.express as px
import plotly.graph_objects as go


#Settings
datadir = "../../../../SmartCareData/"
plotsdir="../../../../PlotsSmartcare/O2_FEV1/"

In [2]:
# Process O2-FEV1 data
O2_FEV1 = create_O2_FEV1_df(datadir)
# O2_FEV1 = remove_all_measurements_with_ID_216(O2_FEV1)

# Compute FEV1 % Predicted
O2_FEV1['FEV1 % Predicted'] = O2_FEV1.apply(lambda x: x["FEV1"] / x["Predicted FEV1"] * 100, axis=1)

# Partition FEV1 % Predicted in 3 groups
O2_FEV1["FEV1 % Predicted Group"] = partition_in_n_equal_groups(O2_FEV1['FEV1 % Predicted'],3)

# What are we analysing?
O2_FEV1.head()

  measurements = pd.read_csv(datadir + "mydata.csv").rename(columns={"FEV 1": "FEV1"})
  for idx, row in parser.parse():
  for idx, row in parser.parse():


Removed 11 measurements where O2 Sat > 100%, kept 123136 measurements
O2 Saturation contains 13418 measurements
FEV1 contains 10418 measurements
Removed 4684 rows with O2_FEV1 inner join, kept 68% of measurements (10100)
Removed 1008 duplicates, 9092 measurements left


Unnamed: 0,ID,Date recorded,O2 Saturation,FEV1,Age,Sex,Height,Weight,Predicted FEV1,FEV1 Set As,Pulmonary Exacerbation,Less Exacerbation,Transplant Recipients,Date Last PE Start,Date Last PE Stop,Comments,FEV1 % Predicted,FEV1 % Predicted Group
0,79,2015-09-10,95.0,1.53,27,Male,163.0,54.7,3.76,3.8,Yes,No,No,2015-07-13,2015-08-06 00:00:00,,40.691489,<42.7
1,79,2015-09-11,97.0,1.68,27,Male,163.0,54.7,3.76,3.8,Yes,No,No,2015-07-13,2015-08-06 00:00:00,,44.680851,[42.7;66.0[
2,79,2015-09-12,97.0,1.48,27,Male,163.0,54.7,3.76,3.8,Yes,No,No,2015-07-13,2015-08-06 00:00:00,,39.361702,<42.7
3,79,2015-08-13,95.0,1.63,27,Male,163.0,54.7,3.76,3.8,Yes,No,No,2015-07-13,2015-08-06 00:00:00,,43.351064,[42.7;66.0[
4,79,2015-11-07,94.0,1.52,27,Male,163.0,54.7,3.76,3.8,Yes,No,No,2015-07-13,2015-08-06 00:00:00,,40.425532,<42.7


# O2-FEV1 analysis
## Definitions
- O2 Saturation
- FEV 1
- Predicted FEV 1 in %: measure for the amount of airway obstruction either due to sputum load (non-permanent obstruction) or scars in the lungs (permanent obstruction).

## Literature
[The association between forced expiratory volume in one second (FEV1) and pulse oximetric measurements of arterial oxygen saturation (SpO2) in the patients with COPD: A preliminary study](https://pubmed.ncbi.nlm.nih.gov/24949035/).
- Context: 31 patients with COPD
- Key results: There was not statistically significant correlation between FEV1 % predicted and SpO2 values (P < 0.05), but a great correlation existed between FEV1/FVC % predicted and SpO2 values (r = 0.556, P < 0.001).

 [ANALYSIS OF CORRELATION BETWEEN FEV1/FEV6 AND OXYGEN SATURATION DURING SIX-MINUTE WALK TEST (6MWT) IN COPD PATIENTS](https://www.researchgate.net/publication/351322676_ANALYSIS_OF_CORRELATION_BETWEEN_FEV1FEV6_AND_OXYGEN_SATURATION_DURING_SIX-MINUTE_WALK_TEST_6MWT_IN_COPD_PATIENTS)
 - Need to download

Note: nobody segments the input by groups. Probably because they don't have enough data to do this.

In [None]:
## Raw scatter plots

In [6]:
# Raw plot without trendline
fig = px.scatter(O2_FEV1, y="O2 Saturation", x="FEV1")
# fig = px.scatter(O2_FEV1, y="O2 Saturation", x="FEV1", trendline="ols")
fig.update_layout(autosize=False, width=500, height=500)
fig.show()
fig.write_image(plotsdir + "FEV1-O2 raw.pdf")

fig = px.scatter(O2_FEV1, y="O2 Saturation", x="FEV1 % Predicted")
fig.update_layout(autosize=False, width=500, height=500)
fig.show()
# fig.write_image(plotsdir + "FEV1 % Predicted-O2 raw.pdf")

O2_FEV1.shape

(9092, 18)

In [7]:
var = 'FEV1'; xaxis_range = [0, 5.1]
# var = 'FEV1 % Predicted'; xaxis_range = [0, 155]

# Plot with height
O2_FEV1['Height Group'], height_labels=partition_in_n_equal_groups(O2_FEV1['Height'], 3, True)

fig = px.scatter(O2_FEV1, y="O2 Saturation", x=var, color="Height Group", marginal_x="histogram", marginal_y="histogram")
# fig.update_layout(autosize=False, width=800, height=800)
fig.update_traces(marker=dict(size=5),
                  selector=dict(mode='markers'))
# fig.update_xaxes(range=[0.9*min(O2_FEV1["FEV1"]), 1.1*max(O2_FEV1["FEV1"])])
# fig.update_yaxes(range=[0.9*min(O2_FEV1["O2 Saturation"]), max(1.1*O2_FEV1["O2 Saturation"])])
# fig.show()

for height_group in height_labels:
    mask_height=O2_FEV1['Height Group'] == height_group

    fig = px.scatter(O2_FEV1[mask_height], y="O2 Saturation", x=var,
                     title='Height group {}'.format(height_group))
    fig.update_layout(autosize=False, width=500, height=500)
    fig.update_traces(marker=dict(size=5),
                      selector=dict(mode='markers'))
    fig.update_xaxes(range=xaxis_range)
    fig.update_yaxes(range=[74, 102])
    fig.show()
    filename= "Height group " +  height_group + " " + var + "-O2 raw"
    # fig.write_image(plotsdir + filename + ".pdf")

In [11]:
predicted = False
x_var = 'Height'
lung_function_var = 'FEV1 % Predicted' if predicted else 'FEV1'

fig1, fig2, _ = desaturation_FEV1_for_variable(O2_FEV1, x_var, n_var_groups=6, predicted=predicted)
fig1.show()
# fig1.write_image(plotsdir + "Desaturation {} for {} main plot.pdf".format(lung_function_var, x_var))

fig2.show()
# fig2.write_image(plotsdir + "Desaturation {} for {} with lines.pdf".format(lung_function_var, x_var))

# Observations
# o2 10, var 15 contains a lot of noise and groups have very low number of datapoints. It gets too patient specific

## Per patient plots

In [None]:
# Same patient variability: can we observe the same pattern for each patient?
O2_FEV1.head()

# Include time evolution into the graph
def to_float(dt_time):
    return (365*dt_time.year + 12*dt_time.month + dt_time.day)/12

mindate = to_float(min(O2_FEV1["Date recorded"]))
O2_FEV1["Months since study start"]=O2_FEV1["Date recorded"].apply(lambda x: to_float(x)-mindate)



In [None]:
# Same patient variability: can we observe the same pattern for each patient?
IDs=O2_FEV1.ID.unique()
for ID in IDs:
    plot_o2_fev1_raw_for_id(O2_FEV1, ID)

## Summary plot (+ partititioning)

In [12]:
# Add exacerbated labels
# Compute exacerbation labels
# O2_FEV1_processed = compute_ex_labels_from_heuristics(antibioticsdata, patientsdata, O2_FEV1)

# Get exacerbation labels from the predictive classifier
O2_FEV1_processed = merge_pred_ex_labels_to(O2_FEV1, get_pred_ex_labels(datadir))

# Smooth
O2_FEV1_processed = smooth(O2_FEV1_processed, ['FEV1', 'FEV1 % Predicted'], smooth_func="max")
O2_FEV1_processed = smooth(O2_FEV1_processed, ['O2 Saturation'], smooth_func="mean")
print("Removed {} nan entries after max smoothing, kept {}".format(O2_FEV1_processed.isna().sum().max(), O2_FEV1_processed.shape[0]))
O2_FEV1_processed.dropna(inplace=True)

# Normalise
O2_FEV1_processed = norm_by_stable_baseline(O2_FEV1_processed,
                                            ['O2 Saturation', 'O2 Saturation smoothed', 'FEV1', 'FEV1 % Predicted', 'FEV1 smoothed', 'FEV1 % Predicted smoothed'])

# Settings
prefix, ex_column, x, y = set_x_y_vars(
      with_predicted_labels=True,
      with_predicted_fev1=False,
      is_smoothed_fev=True,
      is_smoothed_o2=False,
      is_normalised=False,
  )

is_partitioned= True
if not is_partitioned:
  # Plot O2-FEV with respective displots
  fig=plot_o2_fev_with_displots(O2_FEV1_processed, x, y, ex_column)
  fig.show()
  fig.write_image("{}/Ex labels {} {}-{}.pdf".format(plotsdir, prefix, x, y), width=1400, height=600)

  # Plot subsampled O2-FEV scatter
  fig = plot_subsampled_scatter(x, y, O2_FEV1_processed, random_state=7)
  fig.show()
else:
  # Use partitioned data. Note that this requires smoothed True and Predicted FEV1 False

  # fev1_prct_thresholds = [65, 99]
  # O2_FEV1_processed['FEV1 % Predicted smoothed Group'] = partition_given_thresholds(O2_FEV1_processed['FEV1 % Predicted smoothed'], fev1_prct_thresholds)
  # O2_FEV1_processed['Predicted FEV1 smoothed Group'] = partition_given_thresholds(O2_FEV1_processed['Predicted FEV1 smoothed'], predicted_fev1_thresholds)

  predicted_fev1_thresholds = [2.4, 3.2, 4]

  # Print number of uniaue ID
  print("There are {} different individuals".format(len(O2_FEV1_processed.ID.unique())))

  # Filter by unique ID
  O2_FEV1_processed = O2_FEV1_processed.groupby('ID').filter(lambda l: len(l) > 1)
  # Check if individual's Predicted FEV1 position wrt to groups thresholds
  O2_FEV1_processed['Predicted FEV1 Group'] = O2_FEV1_processed['Predicted FEV1'].apply(lambda l: value_to_group(l, predicted_fev1_thresholds))

  # Create a for loop for each Group
  # parition_variable = 'FEV1 % Predicted smoothed'
  parition_variable = 'Predicted FEV1'

  for fev_group in O2_FEV1_processed[parition_variable + ' Group'].unique():
    mask = O2_FEV1_processed[parition_variable + ' Group'] == fev_group
    # Count patients
    n_patients=len(O2_FEV1_processed[mask].ID.unique())

    # Plot scatter plot using plotly with O2_FEV1_processed[mask], x, y, ex_column with color from get_ex_color() and get_stable_color()
    fig = px.scatter(O2_FEV1_processed[mask], x=x, y=y, color=ex_column, color_discrete_sequence=[get_stable_color(), get_ex_color()])
    # Set x axes range to min max of O2_FEV1_processed[x]
    fig.update_xaxes(range=[O2_FEV1_processed[x].min()*0.95, O2_FEV1_processed[x].max()*1.05])
    fig.update_layout(title="{} patients, {} {}".format(n_patients, parition_variable, fev_group))

    fig.show()
    fig.write_image("{}/Ex labels {} {}-{} group {}.pdf".format(plotsdir, prefix, x, y, fev_group), width=1400, height=600)


Exacerbated labels data from the predictive classifier has 14452 entries (1445 exacerbated, 13006 not exacerbated measurements, 1 NaN)
Merging exacerbated labels into O2_FEV1
Dropped 6039 O2_FEV1 entries with NaN exacerbation label. 3053 entries remain.


NameError: name 'mean' is not defined

In [49]:
partition_given_thresholds(pd.Series([1, 2, 3, 4, 5, 6, 7, 7.1, 8, 9, 10]), [2, 5, 7])
O2_FEV1_processed.head()

Unnamed: 0,ID,Date recorded,O2 Saturation,FEV1,Age,Sex,Height,Weight,Predicted FEV1,FEV1 Set As,...,FEV1 smoothed,FEV1 % Predicted smoothed,O2 Saturation smoothed,O2 Saturation norm,O2 Saturation smoothed norm,FEV1 norm,FEV1 % Predicted norm,FEV1 smoothed norm,FEV1 % Predicted smoothed norm,FEV1 % Predicted smoothed Group
373,30,2015-12-09,95.0,0.99,33,Female,155.4,57.5,2.7,2.7,...,1.02,37.777778,96.0,-0.378788,0.636364,-0.018788,-0.695847,-0.021061,-0.780022,<60
374,30,2015-12-31,97.0,1.0,33,Female,155.4,57.5,2.7,2.7,...,1.04,38.518519,96.666667,1.621212,1.30303,-0.008788,-0.325477,-0.001061,-0.039282,<60
375,30,2015-12-13,94.0,1.04,33,Female,155.4,57.5,2.7,2.7,...,1.04,38.518519,95.333333,-1.378788,-0.030303,0.031212,1.156004,-0.001061,-0.039282,<60
376,30,2015-12-17,95.0,0.96,33,Female,155.4,57.5,2.7,2.7,...,1.04,38.518519,95.333333,-0.378788,-0.030303,-0.048788,-1.806958,-0.001061,-0.039282,<60
377,30,2016-01-10,95.0,1.03,33,Female,155.4,57.5,2.7,2.7,...,1.05,38.888889,94.666667,-0.378788,-0.69697,0.021212,0.785634,0.008939,0.331089,<60


## Test the sensitivity of random states on subsampled plots
We wonder if different random states will give largely different results, to an extend that it could challenge the model assumptions.
It turns out that, despite an effect on the x axis range, the subsampled plots are very similar.

In [None]:
# Add subsampled plot
prefix, ex_column, x, y = set_x_y_vars(with_predicted_labels=True, with_predicted_fev1=True, is_smoothed=True, is_normalised=False)

for random_state in range(1,10):
  print(random_state)
  fig = plot_subsampled_scatter(x, y, O2_FEV1_processed, random_state=random_state)
  fig.show()

## Full sized displots

In [None]:
# This part fo the code is unused now
# But the displots can be useful for the report, so I keep it here

if predicted:
    if normalised:
        xrange=[-1.8, 1.2]
    else:
        xrange=[0, 155]
    bin_size=4
else:
    if normalised:
        xrange=[-42,40]
    else:
        xrange=[0, 5.1]
    bin_size=0.14

# Raw plot with superposed exacerbation labels
fig = px.scatter(O2_FEV1_processed, y=y, x=x, color=ex_column)
# fig.update_layout(autosize=False, width=800, height=800)
fig.update_traces(marker=dict(size=5),
                  selector=dict(mode='markers'))
# fig.show()
# fig.write_image("{}/{}-{} by exacerbated labels superposed raw.pdf".format(plotsdir, x, y))

# Raw individual plot for each exacerbation label]
fig_scatter = px.scatter(O2_FEV1_processed.dropna(), y=y, x=x,
                 # color='Exacerbation Labels',
                 facet_col=ex_column)
fig_scatter.update_layout(autosize=False, width=1000, height=500)
fig_scatter.update_traces(marker=dict(size=5),
                  selector=dict(mode='markers'))
# fig_scatter.update_xaxes(range=xrange)
if not(normalised): fig_scatter.update_yaxes(range=[74, 102])
# fig_scatter.show()
# fig.write_image("{}/Ex labels {} {}-{} individual raw.pdf".format(plotsdir, prefix, x, y), width=1300, height=400)

def distplot(x, group_labels, bin_size):
    data=[]
    for exacerbation_label in group_labels:
        mask = O2_FEV1_processed[ex_column] == exacerbation_label
        data.append(O2_FEV1_processed[x][ mask ])
        print("{} measurements with {} = {}".format(sum(mask), ex_column, exacerbation_label))
    fig = ff.create_distplot(data, group_labels, bin_size=bin_size, show_rug=False)
    fig.update_xaxes(title_text=x)
    fig.update_layout(autosize=False, width=1000, height=400)
    return fig

# Create distplot for x = lung function
group_labels = [True, False] if with_predicted_labels else ["Exacerbation Period", "Stable Period"]
fig_distplot_fev1 = distplot(x, group_labels, bin_size)
fig_distplot_fev1.show()
# fig.write_image("{}/Ex labels {} {} distplot.pdf".format(plotsdir, prefix, x), width=1000, height=400)

# Create distplot for y = O2 saturation
group_labels = [True, False] if with_predicted_labels else ["Exacerbation Period", "Stable Period"]
fig_displot_o2 = distplot(y, group_labels, 1)
fig_displot_o2.show()
# fig_displot_o2.write_image("{}/Ex labels {} {} distplot.pdf".format(plotsdir, prefix, y), width=1000, height=400)

## Patient level plots to validate or validate outliers down

In [None]:
# O2-FEV1: Outliers down can happen, however if you are persistently getting low reading it means that it's not an outlier
# Checking O2 sat stable period measurements outliers
exacerbation_label = 'Stable Period'

# Case 1:
IDs = ['180', '202']
o2_saturation_threshold = 87

# Case 2: smoothed
o2_saturation_threshold = 90

# Plot
ids = O2_FEV1_processed[ (O2_FEV1_processed['Exacerbation Labels'] == exacerbation_label) & (O2_FEV1_processed['O2 Saturation'] < o2_saturation_threshold) ].ID.unique()
print(ids)
for id in ids:
    plot_o2_fev1_raw_for_id(O2_FEV1_processed, id, save=False, show=True)

In [15]:
O2_FEV1["new_col"][1:4] = np.mean(O2_FEV1['FEV1'].to_numpy().copy())
O2_FEV1["new_col"]

0       5.040000
1       1.756611
2       1.756611
3       1.756611
4       5.040000
          ...   
9087    5.040000
9088    5.040000
9089    5.040000
9090    5.040000
9091    5.040000
Name: new_col, Length: 9092, dtype: float64

## Validate smooth max

In [5]:
def plot_fev1_and_fev1_smoothed_for_id(O2_FEV1, id):
  O2_FEV1=O2_FEV1[ O2_FEV1['ID'] == id ]
  
  mode="mean"
  O2_FEV1 = smooth(O2_FEV1, ['FEV1'], mode=mode)

  # Plot
  # Create a subplots that share the same x and y axes with plotly
  fig = make_subplots(rows=1, cols=1, shared_xaxes=True, shared_yaxes=True)
  # Add scatter of FEV1 with Date recorded with a name 'FEV1' with marker opacity
  fig.add_scatter(x=O2_FEV1['Date recorded'], y=O2_FEV1['FEV1'], mode='markers+lines', name='FEV1', opacity=0.5, row=1, col=1)
  # Add scatter of FEV1 smoothed with Date recorded with a name 'FEV1 smoothed'
  fig.add_scatter(x=O2_FEV1['Date recorded'], y=O2_FEV1['FEV1 {} smoothed'.format(mode)], mode='markers+lines', name='FEV1 smoothed', opacity=0.5, row=1, col=1)
  # Update figure size
  fig.update_layout(autosize=False, width=1000, height=300, title='FEV1 and FEV1 smoothed for ID {}'.format(id))
  fig.show()

# Plot plot_fev1_and_fev1_smoothed_for_id for the first 10 IDs of O2_FEV1
for id in O2_FEV1.ID.unique()[:10]:
  plot_fev1_and_fev1_smoothed_for_id(O2_FEV1, id)

## How to smooth O2?

In [5]:
# Plots O2 saturation against Date for a given patient id
def plot_o2_saturation_for_id(df, id, plotsdir, save=False, show=False):
    mask = df['ID'] == id
    fig = px.scatter(df[mask], x='Date recorded', y='O2 Saturation')
    fig.update_layout(autosize=False, width=1000, height=400)
    fig.update_traces(marker=dict(size=5),
                      selector=dict(mode='markers'))
    if save:
        fig.write_image("{}/O2 for ID/O2 saturation for ID {}.pdf".format(plotsdir, id), width=1000, height=400)
    if show:
        fig.show()
        
# For each patient id, plot O2 saturation against Date recorded
for id in O2_FEV1.ID.unique():
    plot_o2_saturation_for_id(O2_FEV1, id, plotsdir, save=True, show=False)

## Why are there measurements with FEV1 % Pred smoothed > 70% and FEV1 smoothed < 1L?
image.png
The scatter plot of O2 vs FEV1 for the FEV1 % Predicted group between 70% and 90% shows several measurements with an FEV1 smoothed below 1L. This looks erroneous.
There are 98 measurements with FEV1 % Pred smoothed > 70% and FEV1 smoothed < 1L. 
- 92 measurements belong to individual with ID 216. There are no other measurements for this individual. Is there a problem for the data collected by this individual?

Action: add function `remove_all_measurements_with_ID_216()`

In [11]:
114 * 0.01 * 3.95 - 44 * 0.025 - 2.6

0.8030000000000008

In [8]:
# Find rows where FEV1 % Predicted smoothed > 70% and FEV1 smoothed < 1 L
mask = (O2_FEV1_processed['FEV1 % Predicted smoothed'] > 70) & (O2_FEV1_processed['FEV1 smoothed'] < 1)
print("{} measurements with FEV1 % Predicted smoothed > 70% and FEV1 < 1 L".format(sum(mask)))
# Set pandas display max row option to 100
# pd.set_option('display.max_rows', 100)
# Apply the mask and filter the folowing columns ID, FEV1, FEV1 smoothed, Age, Sex, Height, Weight, FEV1 % Predicted, FEV1 % Predicted smoothed, Is Exacerbated
O2_FEV1_processed[mask][['ID', 'Age', 'Sex', 'Height', 'Weight', 'FEV1', 'FEV1 smoothed', 'Predicted FEV1', 'FEV1 Set As', 'FEV1 % Predicted', 'FEV1 % Predicted smoothed', 'Is Exacerbated', 'O2 Saturation']].sort_values(by=['ID', 'FEV1 % Predicted smoothed'])

92 measurements with FEV1 % Predicted smoothed > 70% and FEV1 < 1 L


Unnamed: 0,ID,Age,Sex,Height,Weight,FEV1,FEV1 smoothed,Predicted FEV1,FEV1 Set As,FEV1 % Predicted,FEV1 % Predicted smoothed,Is Exacerbated,O2 Saturation
7655,216,44,Female,154.0,53.0,0.64,0.64,0.88,0.9,72.727273,72.727273,False,92.0
7637,216,44,Female,154.0,53.0,0.65,0.65,0.88,0.9,73.863636,73.863636,False,94.0
7652,216,44,Female,154.0,53.0,0.64,0.66,0.88,0.9,72.727273,75.000000,False,90.0
7653,216,44,Female,154.0,53.0,0.66,0.66,0.88,0.9,75.000000,75.000000,False,94.0
7654,216,44,Female,154.0,53.0,0.64,0.66,0.88,0.9,72.727273,75.000000,False,91.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7618,216,44,Female,154.0,53.0,0.79,0.79,0.88,0.9,89.772727,89.772727,False,92.0
7619,216,44,Female,154.0,53.0,0.75,0.79,0.88,0.9,85.227273,89.772727,False,95.0
7593,216,44,Female,154.0,53.0,0.70,0.81,0.88,0.9,79.545455,92.045455,False,92.0
7594,216,44,Female,154.0,53.0,0.81,0.81,0.88,0.9,92.045455,92.045455,False,92.0


In [9]:
# Filter measurements done by ID 216
mask = O2_FEV1_processed['ID'] == '216'

# Add scatter of FEV1 smoothed against date recorded
fig = px.scatter(O2_FEV1_processed[mask], x='Date recorded', y='FEV1 smoothed')
# Add scatter of FEV1 with trace name FEV1
fig.add_scatter(x=O2_FEV1_processed[mask]['Date recorded'], y=O2_FEV1_processed[mask]['FEV1'], mode='markers', name='FEV1')
fig.show()
# Apply the mask and filter the folowing columns ID, FEV1, FEV1 smoothed, Age, Sex, Height, Weight, FEV1 % Predicted, FEV1 % Predicted smoothed, Is Exacerbated
O2_FEV1_processed[mask][['ID', 'Date recorded', 'FEV1', 'FEV1 smoothed', 'Age', 'Sex', 'Height', 'Weight', 'FEV1 % Predicted', 'FEV1 % Predicted smoothed', 'Is Exacerbated']].sort_values(by=['FEV1 smoothed'])

Unnamed: 0,ID,Date recorded,FEV1,FEV1 smoothed,Age,Sex,Height,Weight,FEV1 % Predicted,FEV1 % Predicted smoothed,Is Exacerbated
7655,216,2017-02-21,0.64,0.64,44,Female,154.0,53.0,72.727273,72.727273,False
7637,216,2017-03-15,0.65,0.65,44,Female,154.0,53.0,73.863636,73.863636,False
7653,216,2017-01-29,0.66,0.66,44,Female,154.0,53.0,75.000000,75.000000,False
7678,216,2017-02-26,0.66,0.66,44,Female,154.0,53.0,75.000000,75.000000,False
7677,216,2017-03-12,0.62,0.66,44,Female,154.0,53.0,70.454545,75.000000,False
...,...,...,...,...,...,...,...,...,...,...,...
7617,216,2016-11-10,0.73,0.79,44,Female,154.0,53.0,82.954545,89.772727,False
7618,216,2016-11-29,0.79,0.79,44,Female,154.0,53.0,89.772727,89.772727,False
7594,216,2016-09-27,0.81,0.81,44,Female,154.0,53.0,92.045455,92.045455,False
7593,216,2016-10-01,0.70,0.81,44,Female,154.0,53.0,79.545455,92.045455,False


## What's the difference between Predicted FEV1 and FEV1 Set As? Rounded version?
The two are computed by Damian's code and documented [here](https://tristantreb.github.io/master_thesis_CF_ML/Code/smartcare/populateDerivedColsInMLTables.html)
- FEV1SetAs = round(PredictedFEV1)
- CalcFEV1SetAs is different than PredictedFEV1 because it uses a corrected Age (floor(years(patientStudyStartDate - patientDOB))), instead of the age that was entered during the study.


In [38]:
O2_FEV1.columns

Index(['ID', 'Date recorded', 'O2 Saturation', 'FEV1', 'Age', 'Sex', 'Height',
       'Weight', 'Predicted FEV1', 'FEV1 Set As', 'Pulmonary Exacerbation',
       'Less Exacerbation', 'Transplant Recipients', 'Date Last PE Start',
       'Date Last PE Stop', 'Comments', 'FEV1 % Predicted',
       'FEV1 % Predicted Group'],
      dtype='object')

# Create Factor Functions
We want the characterise the factor function that links the Unblocked FEV1 with its parents: Healthy FEV1 and Lung Damage. Here's a model of the relations betweeen those three varaibles: the unblocked FEV1 (L) of an individual is the healthy FEV1 (L), the theoretical lung function based on height, DOB, gdner, ethnicity (TBC exactly which), pejorated by the % in lung damage

Exacerbated labels data from the predictive classifier has 14452 entries (1445 exacerbated, 13006 not exacerbated measurements, 1 NaN)
Merging exacerbated labels into O2_FEV1
Dropped 6039 O2_FEV1 entries with NaN exacerbation label. 3053 entries remain.


In [14]:
# Factor function for unblocked FEV1 (L)
df_unblocked_factor=pd.DataFrame(columns=['ID,', 'Unblocked FEV1 (L)', 'Healthy FEV1 (L)'])
for id in O2_FEV1.ID.unique():
  # For a given patient id, filter the FEV1 measurements
  mask = O2_FEV1['ID'] == id
  O2_FEV1_patient = O2_FEV1[mask]
  # Find the unblocked FEV1 (L). We assume that, over the 6 months study period, the patient has done some measurements where he was not blocked
  # To avoid taking an outlier up, which is third highest FEV1 measurement
  rmax=O2_FEV1_patient['FEV1'].nlargest(3).iloc[-1]
  # Get the theoretical healthy FEV1 (L)
  healthy_fev1=O2_FEV1_patient['FEV1 Set As'].iloc[0]
  # Add the patient id, reversed max FEV1 and healthy FEV1 (L) to the dataframe
  new_row = pd.DataFrame({'ID': [id], 'Unblocked FEV1 (L)': [rmax], 'Healthy FEV1 (L)': [healthy_fev1]})
  df_unblocked_factor = pd.concat([df_unblocked_factor, new_row])


def add_lung_damage(fig, df, lung_damage_prct):
  xmax=df['Healthy FEV1 (L)'].max(); xmin=df['Healthy FEV1 (L)'].min()
  a=1-lung_damage_prct/100
  fig.add_shape(type="line", x0=xmin, y0=a*xmin, x1=xmax, y1=a*xmax, line=dict(color="Red", width=0.5))
  # Add line legend for no lung damage
  fig.add_annotation(x=1.02*xmax, y=a*xmax, text="{}%".format(lung_damage_prct), showarrow=False, font=dict(size=10, color="Red"))

# Plot a scatter of unblocked FEV1 (L) against healthy FEV1 (L)
fig = px.scatter(df_unblocked_factor, x='Healthy FEV1 (L)', y='Unblocked FEV1 (L)')
add_lung_damage(fig, df_unblocked_factor, 0)
add_lung_damage(fig, df_unblocked_factor, 50)
title="Impact of lung damage on healthy FEV1"
fig.update_layout(autosize=False, width=500, height=500, title="Impact of lung damage on healthy FEV1")
fig.update_traces(marker=dict(size=5),
                  selector=dict(mode='markers'))
fig.show()
# Save figure
fig.write_image("{}/Factors - {}.pdf".format(plotsdir, title), width=500, height=500)

In [15]:
O2_FEV1_factors = merge_pred_ex_labels_to(O2_FEV1, get_pred_ex_labels(datadir))

df_unblocked_factor=pd.DataFrame(columns=['ID,', 'Unblocked FEV1 (L)', 'Healthy FEV1 (L)'])
for id in O2_FEV1.ID.unique():
  # For a given patient id, filter the FEV1 measurements
  mask = O2_FEV1['ID'] == id
  O2_FEV1_patient = O2_FEV1[mask]
  # Find the unblocked FEV1 (L). We assume that, over the 6 months study period, the patient has done some measurements where he was not blocked
  # To avoid taking an outlier up, which is third highest FEV1 measurement
  rmax=O2_FEV1_patient['FEV1'].nlargest(3).iloc[-1]
  # Get the theoretical healthy FEV1 (L)
  healthy_fev1=O2_FEV1_patient['FEV1 Set As'].iloc[0]
  # Add the patient id, reversed max FEV1 and healthy FEV1 (L) to the dataframe
  new_row = pd.DataFrame({'ID': [id], 'Unblocked FEV1 (L)': [rmax], 'Healthy FEV1 (L)': [healthy_fev1]})
  df_unblocked_factor = pd.concat([df_unblocked_factor, new_row])

# Left join O2_FEV1_factors with df_unblocked_factor on ID
O2_FEV1_factors = pd.merge(O2_FEV1_factors, df_unblocked_factor, on='ID', how='left')

# Plot unblocked fev1 (L) in x, measured fev1 in y, and color by Is Exacerbated
fig = px.scatter(O2_FEV1_factors, x='Unblocked FEV1 (L)', y='FEV1', color='Is Exacerbated', color_discrete_sequence=[get_stable_color(), get_ex_color()])
title="Impact of % small airways blockage on unblocked FEV1"
fig.update_layout(autosize=False, width=500, height=500, title=title)
fig.update_traces(marker=dict(size=5),
                  selector=dict(mode='markers'))
fig.show() 
# Save figure
fig.write_image("{}/Factors - {}.pdf".format(plotsdir, title), width=500, height=500)

Exacerbated labels data from the predictive classifier has 14452 entries (1445 exacerbated, 13006 not exacerbated measurements, 1 NaN)
Merging exacerbated labels into O2_FEV1
Dropped 6039 O2_FEV1 entries with NaN exacerbation label. 3053 entries remain.


In [81]:
# Filter O2_FEV1 to include unique ID
O2_FEV1_factors = O2_FEV1_factors.drop_duplicates(subset=['ID'])
# Get the 15 lowest values of FEV1 Predicted in O2_FEV1
O2_FEV1_factors.sort_values(by=['Predicted FEV1']).head(15)


Unnamed: 0,ID,Date recorded,O2 Saturation,FEV1,Age,Sex,Height,Weight,Predicted FEV1,FEV1 Set As,...,Transplant Recipients,Date Last PE Start,Date Last PE Stop,Comments,FEV1 % Predicted,FEV1 % Predicted Group,Is Exacerbated,"ID,",Unblocked FEV1 (L),Healthy FEV1 (L)
2644,216,2016-09-19,94.0,0.69,44,Female,154.0,53.0,0.88,0.9,...,No,2016-06-27,2016-07-11 00:00:00,"NO CRP, admission or IV therapy during study p...",78.409091,>=66.0,False,,0.78,0.9
2555,214,2016-08-19,96.0,1.3,31,Female,166.0,54.2,1.77,1.8,...,No,2016-07-11,2016-08-19 00:00:00,patients spent time in Ireland therefore no sp...,73.446328,>=66.0,False,,1.33,1.8
2337,212,2016-07-08,98.0,1.66,29,Female,160.0,41.0,1.81,1.8,...,No,2016-06-09,2016-07-07 00:00:00,non compliant,91.712707,>=66.0,False,,1.71,1.8
2879,227,2016-10-07,97.0,1.59,46,Male,176.5,73.2,1.95,2.0,...,No,2016-08-14,2016-08-28 00:00:00,no admissions No CRP measurement during exace...,81.538462,>=66.0,False,,2.64,2.0
2738,223,2016-09-23,97.0,1.75,39,Male,166.0,68.1,1.97,2.0,...,No,2016-01-22,2016-02-05 00:00:00,"NO CRP, admission or IV therapy during study p...",88.832487,>=66.0,False,,2.22,2.0
2412,213,2016-07-17,94.0,1.89,43,Male,176.0,77.9,2.1,2.1,...,No,2016-06-14,2016-06-28 00:00:00,no admissions,90.0,>=66.0,False,,1.85,2.1
2574,215,2016-08-30,96.0,1.92,33,Male,173.0,61.0,2.14,2.1,...,No,2016-08-11,2016-08-25 00:00:00,completed study on 30/3/2017,89.719626,>=66.0,False,,2.15,2.1
719,39,2015-12-26,93.0,0.55,41,Female,152.3,47.25,2.4,2.4,...,No,2015-11-24,2015-12-08 00:00:00,"Sputum's collected, end questionnaires completed",22.916667,<42.7,False,,0.9,2.4
2394,74,2016-07-16,96.0,0.86,36,Female,154.0,56.3,2.58,2.6,...,No,2016-06-08,2016-06-16 00:00:00,Some sputums collected issues with freezer bec...,33.333333,<42.7,False,,0.91,2.6
1173,56,2016-01-21,97.0,1.87,37,Female,157.4,49.7,2.7,2.7,...,No,2015-07-01,2015-07-22 00:00:00,No admissions at during the study. This pati...,69.259259,>=66.0,False,,1.96,2.7
