In [1]:
from load_data import *
from partition import *
from desaturation_fev1 import *
from normalise import *
from exacerbation_periods import *
from smooth import *
from plot_helpers import *
from biology import *
import plotly.figure_factory as ff
import plotly.express as px
import plotly.graph_objects as go


#Settings
datadir = "../../../../SmartCareData/"
plotsdir="../../../../PlotsSmartcare/O2_FEV1/"

In [2]:
# Process O2-FEV1 data
O2_FEV1 = create_O2_FEV1_df(datadir)
# O2_FEV1 = remove_all_measurements_with_ID_216(O2_FEV1)

# Compute FEV1 % Predicted
O2_FEV1['FEV1 % Predicted'] = O2_FEV1.apply(lambda x: x["FEV1"] / x["Predicted FEV1"] * 100, axis=1)

# Partition FEV1 % Predicted in 3 groups
O2_FEV1["FEV1 % Predicted Group"] = partition_in_n_equal_groups(O2_FEV1['FEV1 % Predicted'],3)

# What are we analysing?
O2_FEV1.head()

  measurements = pd.read_csv(datadir + "mydata.csv").rename(columns={"FEV 1": "FEV1"})
  for idx, row in parser.parse():
  for idx, row in parser.parse():


Removed 11 measurements where O2 Sat > 100%, kept 123136 measurements
O2 Saturation contains 13418 measurements
FEV1 contains 10418 measurements
Removed 4684 rows with O2_FEV1 inner join, kept 68% of measurements (10100)
Removed 1008 duplicates, 9092 measurements left


Unnamed: 0,ID,Date recorded,O2 Saturation,FEV1,Age,Sex,Height,Weight,Predicted FEV1,FEV1 Set As,Pulmonary Exacerbation,Less Exacerbation,Transplant Recipients,Date Last PE Start,Date Last PE Stop,Comments,FEV1 % Predicted,FEV1 % Predicted Group
0,79,2015-09-10,95.0,1.53,27,Male,163.0,54.7,3.76,3.8,Yes,No,No,2015-07-13,2015-08-06 00:00:00,,40.691489,<42.7
1,79,2015-09-11,97.0,1.68,27,Male,163.0,54.7,3.76,3.8,Yes,No,No,2015-07-13,2015-08-06 00:00:00,,44.680851,[42.7;66.0[
2,79,2015-09-12,97.0,1.48,27,Male,163.0,54.7,3.76,3.8,Yes,No,No,2015-07-13,2015-08-06 00:00:00,,39.361702,<42.7
3,79,2015-08-13,95.0,1.63,27,Male,163.0,54.7,3.76,3.8,Yes,No,No,2015-07-13,2015-08-06 00:00:00,,43.351064,[42.7;66.0[
4,79,2015-11-07,94.0,1.52,27,Male,163.0,54.7,3.76,3.8,Yes,No,No,2015-07-13,2015-08-06 00:00:00,,40.425532,<42.7


# O2-FEV1 analysis
## Definitions
- O2 Saturation
- FEV 1
- Predicted FEV 1 in %: measure for the amount of airway obstruction either due to sputum load (non-permanent obstruction) or scars in the lungs (permanent obstruction).

## Literature
[The association between forced expiratory volume in one second (FEV1) and pulse oximetric measurements of arterial oxygen saturation (SpO2) in the patients with COPD: A preliminary study](https://pubmed.ncbi.nlm.nih.gov/24949035/).
- Context: 31 patients with COPD
- Key results: There was not statistically significant correlation between FEV1 % predicted and SpO2 values (P < 0.05), but a great correlation existed between FEV1/FVC % predicted and SpO2 values (r = 0.556, P < 0.001).

 [ANALYSIS OF CORRELATION BETWEEN FEV1/FEV6 AND OXYGEN SATURATION DURING SIX-MINUTE WALK TEST (6MWT) IN COPD PATIENTS](https://www.researchgate.net/publication/351322676_ANALYSIS_OF_CORRELATION_BETWEEN_FEV1FEV6_AND_OXYGEN_SATURATION_DURING_SIX-MINUTE_WALK_TEST_6MWT_IN_COPD_PATIENTS)
 - Need to download

Note: nobody segments the input by groups. Probably because they don't have enough data to do this.

In [None]:
## Raw scatter plots

In [None]:
# Raw plot without trendline
fig = px.scatter(O2_FEV1, y="O2 Saturation", x="FEV1")
# fig = px.scatter(O2_FEV1, y="O2 Saturation", x="FEV1", trendline="ols")
fig.update_layout(autosize=False, width=500, height=500)
fig.show()
fig.write_image(plotsdir + "FEV1-O2 raw.pdf")

fig = px.scatter(O2_FEV1, y="O2 Saturation", x="FEV1 % Predicted")
fig.update_layout(autosize=False, width=500, height=500)
fig.show()
# fig.write_image(plotsdir + "FEV1 % Predicted-O2 raw.pdf")

O2_FEV1.shape

In [None]:
var = 'FEV1'; xaxis_range = [0, 5.1]
# var = 'FEV1 % Predicted'; xaxis_range = [0, 155]

# Plot with height
O2_FEV1['Height Group'], height_labels=partition_in_n_equal_groups(O2_FEV1['Height'], 3, True)

fig = px.scatter(O2_FEV1, y="O2 Saturation", x=var, color="Height Group", marginal_x="histogram", marginal_y="histogram")
# fig.update_layout(autosize=False, width=800, height=800)
fig.update_traces(marker=dict(size=5),
                  selector=dict(mode='markers'))
# fig.update_xaxes(range=[0.9*min(O2_FEV1["FEV1"]), 1.1*max(O2_FEV1["FEV1"])])
# fig.update_yaxes(range=[0.9*min(O2_FEV1["O2 Saturation"]), max(1.1*O2_FEV1["O2 Saturation"])])
# fig.show()

for height_group in height_labels:
    mask_height=O2_FEV1['Height Group'] == height_group

    fig = px.scatter(O2_FEV1[mask_height], y="O2 Saturation", x=var,
                     title='Height group {}'.format(height_group))
    fig.update_layout(autosize=False, width=500, height=500)
    fig.update_traces(marker=dict(size=5),
                      selector=dict(mode='markers'))
    fig.update_xaxes(range=xaxis_range)
    fig.update_yaxes(range=[74, 102])
    fig.show()
    filename= "Height group " +  height_group + " " + var + "-O2 raw"
    # fig.write_image(plotsdir + filename + ".pdf")

In [None]:
predicted = False
x_var = 'Height'
lung_function_var = 'FEV1 % Predicted' if predicted else 'FEV1'

fig1, fig2, _ = desaturation_FEV1_for_variable(O2_FEV1, x_var, n_var_groups=6, predicted=predicted)
fig1.show()
# fig1.write_image(plotsdir + "Desaturation {} for {} main plot.pdf".format(lung_function_var, x_var))

fig2.show()
# fig2.write_image(plotsdir + "Desaturation {} for {} with lines.pdf".format(lung_function_var, x_var))

# Observations
# o2 10, var 15 contains a lot of noise and groups have very low number of datapoints. It gets too patient specific

## Per patient plots

In [None]:
# Same patient variability: can we observe the same pattern for each patient?
O2_FEV1.head()

# Include time evolution into the graph
def to_float(dt_time):
    return (365*dt_time.year + 12*dt_time.month + dt_time.day)/12

mindate = to_float(min(O2_FEV1["Date recorded"]))
O2_FEV1["Months since study start"]=O2_FEV1["Date recorded"].apply(lambda x: to_float(x)-mindate)

In [None]:
# Same patient variability: can we observe the same pattern for each patient?
IDs=O2_FEV1.ID.unique()
for ID in IDs:
    plot_o2_fev1_raw_for_id(O2_FEV1, ID)

## Summary plot (+ partititioning)

In [45]:
# Add exacerbated labels
# Compute exacerbation labels
# O2_FEV1_processed = compute_ex_labels_from_heuristics(antibioticsdata, patientsdata, O2_FEV1)

# Get exacerbation labels from the predictive classifier
O2_FEV1_processed = merge_pred_ex_labels_to(O2_FEV1, get_pred_ex_labels(datadir))

# Smooth
O2_FEV1_processed = smooth(O2_FEV1_processed, ['FEV1', 'FEV1 % Predicted'], mode="max")
O2_FEV1_processed = smooth(O2_FEV1_processed, ['O2 Saturation'], mode="mean")
print("Removed {} nan entries after max smoothing, kept {}".format(O2_FEV1_processed.isna().sum().max(), O2_FEV1_processed.shape[0]))
O2_FEV1_processed.dropna(inplace=True)

# Normalise
O2_FEV1_processed = norm_by_stable_baseline(O2_FEV1_processed,
                                            ['O2 Saturation', 'O2 Saturation smoothed', 'FEV1', 'FEV1 % Predicted', 'FEV1 smoothed', 'FEV1 % Predicted smoothed'])

# Settings
prefix, ex_column, x, y = set_x_y_vars(
      with_predicted_labels=True,
      with_predicted_fev1=False,
      is_smoothed_fev=True,
      is_smoothed_o2=False,
      is_normalised=False,
  )

is_partitioned= False
if not is_partitioned:
  # Plot O2-FEV with respective displots
  fig=plot_o2_fev_with_displots(O2_FEV1_processed, x, y, ex_column)
  fig.show()
  # fig.write_image("{}/Ex labels {} {}-{}.pdf".format(plotsdir, prefix, x, y), width=1400, height=600)

  # Plot subsampled O2-FEV scatter
  fig = plot_subsampled_scatter(x, y, O2_FEV1_processed, random_state=7)
  fig.show()
  
else:
  # Use partitioned data. Note that this requires smoothed True and Predicted FEV1 False

  predicted_fev1_thresholds = [2.4, 3.2, 4]

  # Print number of uniaue ID
  print("There are {} different individuals".format(len(O2_FEV1_processed.ID.unique())))

  # Filter by unique ID
  O2_FEV1_processed = O2_FEV1_processed.groupby('ID').filter(lambda l: len(l) > 1)
  # Check if individual's Predicted FEV1 position wrt to groups thresholds
  O2_FEV1_processed['Predicted FEV1 Group'] = O2_FEV1_processed['Predicted FEV1'].apply(lambda l: value_to_group(l, predicted_fev1_thresholds))

  # Create a for loop for each Group
  # parition_variable = 'FEV1 % Predicted smoothed'
  parition_variable = 'Predicted FEV1'

  for fev_group in O2_FEV1_processed[parition_variable + ' Group'].unique():
    mask = O2_FEV1_processed[parition_variable + ' Group'] == fev_group
    # Count patients
    n_patients=len(O2_FEV1_processed[mask].ID.unique())

    # Plot scatter plot using plotly with O2_FEV1_processed[mask], x, y, ex_column with color from get_ex_color() and get_stable_color()
    fig = px.scatter(O2_FEV1_processed[mask], x=x, y=y, color=ex_column, color_discrete_sequence=[get_stable_color(), get_ex_color()])
    # Set x axes range to min max of O2_FEV1_processed[x]
    fig.update_xaxes(range=[O2_FEV1_processed[x].min()*0.95, O2_FEV1_processed[x].max()*1.05])
    fig.update_layout(title="{} patients, {} {}".format(n_patients, parition_variable, fev_group))

    fig.show()
    # fig.write_image("{}/Ex labels {} {}-{} group {}.pdf".format(plotsdir, prefix, x, y, fev_group), width=1400, height=600)


Exacerbated labels data from the predictive classifier has 14452 entries (1445 exacerbated, 13006 not exacerbated measurements, 1 NaN)
Merging exacerbated labels into O2_FEV1
Dropped 0 O2_FEV1 entries with NaN exacerbation label. 7577 entries remain.


IndexError: index 3 is out of bounds for axis 0 with size 3

# Analysis of the erroneously dropped data

In [4]:
full = merge_pred_ex_labels_to(O2_FEV1, get_pred_ex_labels(datadir))
nonan = full.dropna()
print(full.shape)
print(nonan.shape)

# Select only rows where isna is True
nan = full[full.isna().any(axis=1)]
full.head()

Exacerbated labels data from the predictive classifier has 14452 entries (1445 exacerbated, 13006 not exacerbated measurements, 1 NaN)
Merging exacerbated labels into O2_FEV1
Dropped 0 O2_FEV1 entries with NaN exacerbation label. 7577 entries remain.


In [122]:
# What's nan?
nan.head()

Unnamed: 0,ID,Date recorded,O2 Saturation,FEV1,Age,Sex,Height,Weight,Predicted FEV1,FEV1 Set As,Pulmonary Exacerbation,Less Exacerbation,Transplant Recipients,Date Last PE Start,Date Last PE Stop,Comments,FEV1 % Predicted,FEV1 % Predicted Group,Is Exacerbated
0,79,2015-08-13,95.0,1.63,27,Male,163.0,54.7,3.76,3.8,Yes,No,No,2015-07-13,2015-08-06 00:00:00,,43.351064,[42.7;66.0[,True
1,79,2015-11-07,94.0,1.52,27,Male,163.0,54.7,3.76,3.8,Yes,No,No,2015-07-13,2015-08-06 00:00:00,,40.425532,<42.7,False
2,79,2015-10-14,95.0,1.66,27,Male,163.0,54.7,3.76,3.8,Yes,No,No,2015-07-13,2015-08-06 00:00:00,,44.148936,[42.7;66.0[,False
3,79,2015-10-21,93.0,1.64,27,Male,163.0,54.7,3.76,3.8,Yes,No,No,2015-07-13,2015-08-06 00:00:00,,43.617021,[42.7;66.0[,False
4,79,2015-12-11,95.0,1.5,27,Male,163.0,54.7,3.76,3.8,Yes,No,No,2015-07-13,2015-08-06 00:00:00,,39.893617,<42.7,False


In [123]:
# What's not nan?
nonan.head()

Unnamed: 0,ID,Date recorded,O2 Saturation,FEV1,Age,Sex,Height,Weight,Predicted FEV1,FEV1 Set As,Pulmonary Exacerbation,Less Exacerbation,Transplant Recipients,Date Last PE Start,Date Last PE Stop,Comments,FEV1 % Predicted,FEV1 % Predicted Group,Is Exacerbated
310,30,2015-12-10,95.0,1.06,33,Female,155.4,57.5,2.7,2.7,Yes,No,No,2015-11-23,2015-12-07 00:00:00,Advised by medical team to stop study as too a...,39.259259,<42.7,False
311,30,2015-12-14,98.0,1.02,33,Female,155.4,57.5,2.7,2.7,Yes,No,No,2015-11-23,2015-12-07 00:00:00,Advised by medical team to stop study as too a...,37.777778,<42.7,False
312,30,2015-12-09,95.0,0.99,33,Female,155.4,57.5,2.7,2.7,Yes,No,No,2015-11-23,2015-12-07 00:00:00,Advised by medical team to stop study as too a...,36.666667,<42.7,False
313,30,2015-12-31,97.0,1.0,33,Female,155.4,57.5,2.7,2.7,Yes,No,No,2015-11-23,2015-12-07 00:00:00,Advised by medical team to stop study as too a...,37.037037,<42.7,False
314,30,2015-12-13,94.0,1.04,33,Female,155.4,57.5,2.7,2.7,Yes,No,No,2015-11-23,2015-12-07 00:00:00,Advised by medical team to stop study as too a...,38.518519,<42.7,False


In [12]:
# We erronesouly removed rows where comments was NaN. There's no biological reason for this. Let's still have a look at where does the difference lie.

# Scatter plot of nan with x Date recorded, y O2 saturation, color Is Exacerbated with go library
x_col='Date recorded'
fig = go.Figure()
fig.add_trace(go.Scatter(x=nan[x_col], y=nan['O2 Saturation'], mode='markers', marker_color=nan['Is Exacerbated'].apply(lambda l: get_ex_color() if l else get_stable_color())))
# Put x axis range to min max of Date recorded of full
fig.update_xaxes(range=[full[x_col].min(), full[x_col].max()])
fig.show()
# 
fig = go.Figure()
fig.add_trace(go.Scatter(x=nonan[x_col], y=nonan['O2 Saturation'], mode='markers', marker_color=nonan['Is Exacerbated'].apply(lambda l: get_ex_color() if l else get_stable_color())))
fig.update_xaxes(range=[full[x_col].min(), full[x_col].max()])
fig.show()

In [25]:
# List IDs present in nan but not in nonan
nan_ids = nan.ID.unique()
nonan_ids = nonan.ID.unique()
print("IDs present in nan but not in nonan: {}".format([l for l in nan_ids if l not in nonan_ids]))

IDs present in nan but not in nonan: ['79', '23', '138', '24', '80', '81', '137', '139', '140', '82', '92', '99', '100', '93', '102', '113', '28', '114', '123', '115', '126', '127', '129', '128', '130', '131', '233', '78', '132', '133', '134', '136', '241', '59', '189', '187', '188', '231', '186', '141', '143', '191', '194', '195', '144', '193', '151', '197', '196', '234', '153', '66', '236', '171', '173', '175', '176', '172', '200', '179', '178', '229']
(4524, 19)


In [119]:
# Compare the displots before and after removing the rows with nan
def distplot(df):
  fig = go.Figure()
  fig.add_trace(go.Histogram(x=df['O2 Saturation'][df['Is Exacerbated']==True], histnorm='probability', marker=dict(color=get_ex_color()), name='Exacerbated'))
  fig.add_trace(go.Histogram(x=df['O2 Saturation'][df['Is Exacerbated']==False], histnorm='probability', marker=dict(color=get_stable_color()), name='Stable'))
  # Name x axis as O2 saturation (%)
  fig.update_xaxes(title_text='O2 saturation (%)')
  # Name y axis probability
  fig.update_yaxes(title_text='Probability')
  fig.update_layout(barmode="overlay")
  # Make the figure square sized
  fig.update_layout(width=600, height=300)
  fig.show()
  return -1

distplot(full)
distplot(nonan)

# distplot(full[~full.ID.isin(['141', '59', '241', '194'])])
distplot(full[~full.ID.isin(['79', '23', '138', '24', '80', '81', '137', '139', '140', '82', '92', '99', '100', '93', '102', '113', '28', '114', '123', '115', '126', 
  '127', '129', '128', '130', '131', '233', '78', '132', '133', '134', '136', '241', '59', '189', '187', '188', '231', '186', '141', '143', '191', '194', '195', '144', 
  '193', '151', '197', '196', '234', '153', '66', '236', '171', '173', '175', '176', '172', '200', '179', '178', '229'])])

-1

In [116]:
# We'd like to explain the difference in the distribution of O2 saturation in the Exacerbated and Stable groups before and after the removal of the rows with nan.
# Changes in O2 Saturation's distribution are much stronger in the Exacerbated group than in the Stable. 
# Let's have a look at the Exacerbated group only.

# Scatter plot nan with x Date recorded, y O2 saturation, color ID with px library
nan_ex = nan[nan['Is Exacerbated']==True]
nonan_ex = nonan[nonan['Is Exacerbated']==True]
full_ex = full[full['Is Exacerbated']==True]
print(nan_ex.shape)
print(nonan_ex.shape)

def set_axes(fig):
  # Set y axis range to min max of O2 saturation of full
  fig.update_yaxes(range=[full['O2 Saturation'].min()-1, full['O2 Saturation'].max()+1])
  # Set x axis range to min max of Date recorded of full
  fig.update_xaxes(range=[full[x_col].min(), full[x_col].max()])
  return -1

fig = px.scatter(nan_ex, x=x_col, y='O2 Saturation', color='ID')

# Add line of O2 saturation 95
fig.add_shape(type="line", x0=full[x_col].min(), y0=95, x1=full[x_col].max(), y1=95, line=dict(color="Black", width=0.5))
set_axes(fig)
fig.update_layout(title='Measurements in exacerbated period - With values erroneously dropped ({} points)'.format(nan_ex.shape[0]))
fig.show()

fig = px.scatter(nonan_ex, x=x_col, y='O2 Saturation', color='ID')
set_axes(fig)
fig.add_shape(type="line", x0=full[x_col].min(), y0=95, x1=full[x_col].max(), y1=95, line=dict(color="Black", width=0.5))
fig.update_layout(title='Measurements in exacerbated period - After erroneously dropping ({} points)'.format(nonan_ex.shape[0]))
fig.show()

fig = px.scatter(full_ex, x=x_col, y='O2 Saturation', color='ID')
set_axes(fig)
fig.add_shape(type="line", x0=full[x_col].min(), y0=95, x1=full[x_col].max(), y1=95, line=dict(color="Black", width=0.5))
fig.update_layout(title='Measurements in exacerbated period - After correction ({} points)'.format(full_ex.shape[0]))
fig.show()

(494, 19)
(210, 19)


## Test the sensitivity of random states on subsampled plots
We wonder if different random states will give largely different results, to an extend that it could challenge the model assumptions.
It turns out that, despite an effect on the x axis range, the subsampled plots are very similar.

In [None]:
# Add subsampled plot
prefix, ex_column, x, y = set_x_y_vars(with_predicted_labels=True, with_predicted_fev1=True, is_smoothed=True, is_normalised=False)

for random_state in range(1,10):
  print(random_state)
  fig = plot_subsampled_scatter(x, y, O2_FEV1_processed, random_state=random_state)
  fig.show()

## Full sized displots

In [46]:
# This part fo the code is unused now
# But the displots can be useful for the report, so I keep it here

if predicted:
    if normalised:
        xrange=[-1.8, 1.2]
    else:
        xrange=[0, 155]
    bin_size=4
else:
    if normalised:
        xrange=[-42,40]
    else:
        xrange=[0, 5.1]
    bin_size=0.14

# Raw plot with superposed exacerbation labels
fig = px.scatter(O2_FEV1_processed, y=y, x=x, color=ex_column)
# fig.update_layout(autosize=False, width=800, height=800)
fig.update_traces(marker=dict(size=5),
                  selector=dict(mode='markers'))
# fig.show()
# fig.write_image("{}/{}-{} by exacerbated labels superposed raw.pdf".format(plotsdir, x, y))

# Raw individual plot for each exacerbation label]
fig_scatter = px.scatter(O2_FEV1_processed.dropna(), y=y, x=x,
                 # color='Exacerbation Labels',
                 facet_col=ex_column)
fig_scatter.update_layout(autosize=False, width=1000, height=500)
fig_scatter.update_traces(marker=dict(size=5),
                  selector=dict(mode='markers'))
# fig_scatter.update_xaxes(range=xrange)
if not(normalised): fig_scatter.update_yaxes(range=[74, 102])
# fig_scatter.show()
# fig.write_image("{}/Ex labels {} {}-{} individual raw.pdf".format(plotsdir, prefix, x, y), width=1300, height=400)

def distplot(x, group_labels, bin_size):
    data=[]
    for exacerbation_label in group_labels:
        mask = O2_FEV1_processed[ex_column] == exacerbation_label
        data.append(O2_FEV1_processed[x][ mask ])
        print("{} measurements with {} = {}".format(sum(mask), ex_column, exacerbation_label))
    fig = ff.create_distplot(data, group_labels, bin_size=bin_size, show_rug=False)
    fig.update_xaxes(title_text=x)
    fig.update_layout(autosize=False, width=1000, height=400)
    return fig

# Create distplot for x = lung function
group_labels = [True, False] if with_predicted_labels else ["Exacerbation Period", "Stable Period"]
fig_distplot_fev1 = distplot(x, group_labels, bin_size)
fig_distplot_fev1.show()
# fig.write_image("{}/Ex labels {} {} distplot.pdf".format(plotsdir, prefix, x), width=1000, height=400)

# Create distplot for y = O2 saturation
group_labels = [True, False] if with_predicted_labels else ["Exacerbation Period", "Stable Period"]
fig_displot_o2 = distplot(y, group_labels, 1)
fig_displot_o2.show()
# fig_displot_o2.write_image("{}/Ex labels {} {} distplot.pdf".format(plotsdir, prefix, y), width=1000, height=400)

NameError: name 'predicted' is not defined

## Patient level plots to validate or validate outliers down

In [None]:
# O2-FEV1: Outliers down can happen, however if you are persistently getting low reading it means that it's not an outlier
# Checking O2 sat stable period measurements outliers
exacerbation_label = 'Stable Period'

# Case 1:
IDs = ['180', '202']
o2_saturation_threshold = 87

# Case 2: smoothed
o2_saturation_threshold = 90

# Plot
ids = O2_FEV1_processed[ (O2_FEV1_processed['Exacerbation Labels'] == exacerbation_label) & (O2_FEV1_processed['O2 Saturation'] < o2_saturation_threshold) ].ID.unique()
print(ids)
for id in ids:
    plot_o2_fev1_raw_for_id(O2_FEV1_processed, id, save=False, show=True)

## Validate smooth max

In [None]:
def plot_fev1_and_fev1_smoothed_for_id(O2_FEV1, id):
  O2_FEV1=O2_FEV1[ O2_FEV1['ID'] == id ]
  
  mode="mean"
  O2_FEV1 = smooth(O2_FEV1, ['FEV1'], mode=mode)

  # Plot
  # Create a subplots that share the same x and y axes with plotly
  fig = make_subplots(rows=1, cols=1, shared_xaxes=True, shared_yaxes=True)
  # Add scatter of FEV1 with Date recorded with a name 'FEV1' with marker opacity
  fig.add_scatter(x=O2_FEV1['Date recorded'], y=O2_FEV1['FEV1'], mode='markers+lines', name='FEV1', opacity=0.5, row=1, col=1)
  # Add scatter of FEV1 smoothed with Date recorded with a name 'FEV1 smoothed'
  fig.add_scatter(x=O2_FEV1['Date recorded'], y=O2_FEV1['FEV1 {} smoothed'.format(mode)], mode='markers+lines', name='FEV1 smoothed', opacity=0.5, row=1, col=1)
  # Update figure size
  fig.update_layout(autosize=False, width=1000, height=300, title='FEV1 and FEV1 smoothed for ID {}'.format(id))
  fig.show()

# Plot plot_fev1_and_fev1_smoothed_for_id for the first 10 IDs of O2_FEV1
for id in O2_FEV1.ID.unique()[:10]:
  plot_fev1_and_fev1_smoothed_for_id(O2_FEV1, id)

## How to smooth O2?

In [None]:
# Plots O2 saturation against Date for a given patient id
def plot_o2_saturation_for_id(df, id, plotsdir, save=False, show=False):
    mask = df['ID'] == id
    fig = px.scatter(df[mask], x='Date recorded', y='O2 Saturation')
    fig.update_layout(autosize=False, width=1000, height=400)
    fig.update_traces(marker=dict(size=5),
                      selector=dict(mode='markers'))
    if save:
        fig.write_image("{}/O2 for ID/O2 saturation for ID {}.pdf".format(plotsdir, id), width=1000, height=400)
    if show:
        fig.show()
        
# For each patient id, plot O2 saturation against Date recorded
for id in O2_FEV1.ID.unique():
    plot_o2_saturation_for_id(O2_FEV1, id, plotsdir, save=True, show=False)

## Why are there measurements with FEV1 % Pred smoothed > 70% and FEV1 smoothed < 1L?
image.png
The scatter plot of O2 vs FEV1 for the FEV1 % Predicted group between 70% and 90% shows several measurements with an FEV1 smoothed below 1L. This looks erroneous.
There are 98 measurements with FEV1 % Pred smoothed > 70% and FEV1 smoothed < 1L. 
- 92 measurements belong to individual with ID 216. There are no other measurements for this individual. Is there a problem for the data collected by this individual?

Action: add function `remove_all_measurements_with_ID_216()`

In [None]:
114 * 0.01 * 3.95 - 44 * 0.025 - 2.6

In [None]:
# Find rows where FEV1 % Predicted smoothed > 70% and FEV1 smoothed < 1 L
mask = (O2_FEV1_processed['FEV1 % Predicted smoothed'] > 70) & (O2_FEV1_processed['FEV1 smoothed'] < 1)
print("{} measurements with FEV1 % Predicted smoothed > 70% and FEV1 < 1 L".format(sum(mask)))
# Set pandas display max row option to 100
# pd.set_option('display.max_rows', 100)
# Apply the mask and filter the folowing columns ID, FEV1, FEV1 smoothed, Age, Sex, Height, Weight, FEV1 % Predicted, FEV1 % Predicted smoothed, Is Exacerbated
O2_FEV1_processed[mask][['ID', 'Age', 'Sex', 'Height', 'Weight', 'FEV1', 'FEV1 smoothed', 'Predicted FEV1', 'FEV1 Set As', 'FEV1 % Predicted', 'FEV1 % Predicted smoothed', 'Is Exacerbated', 'O2 Saturation']].sort_values(by=['ID', 'FEV1 % Predicted smoothed'])

In [None]:
# Filter measurements done by ID 216
mask = O2_FEV1_processed['ID'] == '216'

# Add scatter of FEV1 smoothed against date recorded
fig = px.scatter(O2_FEV1_processed[mask], x='Date recorded', y='FEV1 smoothed')
# Add scatter of FEV1 with trace name FEV1
fig.add_scatter(x=O2_FEV1_processed[mask]['Date recorded'], y=O2_FEV1_processed[mask]['FEV1'], mode='markers', name='FEV1')
fig.show()
# Apply the mask and filter the folowing columns ID, FEV1, FEV1 smoothed, Age, Sex, Height, Weight, FEV1 % Predicted, FEV1 % Predicted smoothed, Is Exacerbated
O2_FEV1_processed[mask][['ID', 'Date recorded', 'FEV1', 'FEV1 smoothed', 'Age', 'Sex', 'Height', 'Weight', 'FEV1 % Predicted', 'FEV1 % Predicted smoothed', 'Is Exacerbated']].sort_values(by=['FEV1 smoothed'])

## What's the difference between Predicted FEV1 and FEV1 Set As? Rounded version?
The two are computed by Damian's code and documented [here](https://tristantreb.github.io/master_thesis_CF_ML/Code/smartcare/populateDerivedColsInMLTables.html)
- FEV1SetAs = round(PredictedFEV1)
- CalcFEV1SetAs is different than PredictedFEV1 because it uses a corrected Age (floor(years(patientStudyStartDate - patientDOB))), instead of the age that was entered during the study.


In [None]:
O2_FEV1.columns

# Create Factor Functions
We want the characterise the factor function that links the Unblocked FEV1 with its parents: Healthy FEV1 and Lung Damage. Here's a model of the relations betweeen those three varaibles: the unblocked FEV1 (L) of an individual is the healthy FEV1 (L), the theoretical lung function based on height, DOB, gdner, ethnicity (TBC exactly which), pejorated by the % in lung damage

In [None]:
# Factor function for unblocked FEV1 (L)
df_unblocked_factor = load_patient_data(datadir)

# For each patient id
# Create np.array to store the unblocked FEV1 (L)
rmax = []
for id in O2_FEV1.ID.unique():
  mask = O2_FEV1['ID'] == id
  # Find the unblocked FEV1 (L). We assume that, over the 6 months study period, the patient has done some measurements where he was not blocked
  # To avoid taking an outlier up, which is third highest FEV1 measurement
  rmax.append(O2_FEV1['FEV1'][mask].nlargest(3).iloc[-1])
# Add rmax to df_unblocked_factor with column name Unblocked FEV1 (L)
df_unblocked_factor['Unblocked FEV1 (L)'] = pd.Series(rmax)

# Compute Unblocked FEV1 as a percentage of lung damage
# Healthy FEV1 is the FEV1 Set As
df_unblocked_factor['Unblocked FEV1 as a % of lung damage'] = 100 * (1 - df_unblocked_factor['Unblocked FEV1 (L)'] / df_unblocked_factor['FEV1 Set As'])

# Plot a scatter of Unblocked FEV1 (&) against Age
def lung_damage_with_var(df, var, plotsdir):
  fig = px.scatter(df_unblocked_factor, x=var, y='Unblocked FEV1 as a % of lung damage')
  title="Lung damage with {} ({} individuals)".format(var, len(df_unblocked_factor))
  fig.update_layout(autosize=False, width=500, height=500, title=title)
  fig.update_traces(marker=dict(size=5),
                    selector=dict(mode='markers'))
  fig.show()
  fig.write_image("{}/Factors - {}.pdf".format(plotsdir, title), width=500, height=500)

lung_damage_with_var(df_unblocked_factor, 'Age', plotsdir)
lung_damage_with_var(df_unblocked_factor, 'Sex', plotsdir)

### Exploration
#### Old factor function code

In [None]:
# Factor function for unblocked FEV1 (L)
df_unblocked_factor=pd.DataFrame(columns=['ID,', 'Unblocked FEV1 (L)', 'Healthy FEV1 (L)'])
for id in O2_FEV1.ID.unique():
  # For a given patient id, filter the FEV1 measurements
  mask = O2_FEV1['ID'] == id
  O2_FEV1_patient = O2_FEV1[mask]
  # Find the unblocked FEV1 (L). We assume that, over the 6 months study period, the patient has done some measurements where he was not blocked
  # To avoid taking an outlier up, which is third highest FEV1 measurement
  rmax=O2_FEV1_patient['FEV1'].nlargest(3).iloc[-1]
  # Get the theoretical healthy FEV1 (L)
  healthy_fev1=O2_FEV1_patient['FEV1 Set As'].iloc[0]
  # Add the patient id, reversed max FEV1 and healthy FEV1 (L) to the dataframe
  new_row = pd.DataFrame({'ID': [id], 'Unblocked FEV1 (L)': [rmax], 'Healthy FEV1 (L)': [healthy_fev1]})
  df_unblocked_factor = pd.concat([df_unblocked_factor, new_row])


def add_lung_damage(fig, df, lung_damage_prct):
  xmax=df['Healthy FEV1 (L)'].max(); xmin=df['Healthy FEV1 (L)'].min()
  a=1-lung_damage_prct/100
  fig.add_shape(type="line", x0=xmin, y0=a*xmin, x1=xmax, y1=a*xmax, line=dict(color="Red", width=0.5))
  # Add line legend for no lung damage
  fig.add_annotation(x=1.02*xmax, y=a*xmax, text="{}%".format(lung_damage_prct), showarrow=False, font=dict(size=10, color="Red"))

# Plot a scatter of unblocked FEV1 (L) against healthy FEV1 (L)
fig = px.scatter(df_unblocked_factor, x='Healthy FEV1 (L)', y='Unblocked FEV1 (L)')
add_lung_damage(fig, df_unblocked_factor, 0)
add_lung_damage(fig, df_unblocked_factor, 50)
title="Impact of lung damage on healthy FEV1"
fig.update_layout(autosize=False, width=500, height=500, title="Impact of lung damage on healthy FEV1")
fig.update_traces(marker=dict(size=5),
                  selector=dict(mode='markers'))
fig.show()
# Save figure
fig.write_image("{}/Factors - {}.pdf".format(plotsdir, title), width=500, height=500)

In [None]:
O2_FEV1_factors = merge_pred_ex_labels_to(O2_FEV1, get_pred_ex_labels(datadir))

df_unblocked_factor=pd.DataFrame(columns=['ID,', 'Unblocked FEV1 (L)', 'Healthy FEV1 (L)'])
for id in O2_FEV1.ID.unique():
  # For a given patient id, filter the FEV1 measurements
  mask = O2_FEV1['ID'] == id
  O2_FEV1_patient = O2_FEV1[mask]
  # Find the unblocked FEV1 (L). We assume that, over the 6 months study period, the patient has done some measurements where he was not blocked
  # To avoid taking an outlier up, which is third highest FEV1 measurement
  rmax=O2_FEV1_patient['FEV1'].nlargest(3).iloc[-1]
  # Get the theoretical healthy FEV1 (L)
  healthy_fev1=O2_FEV1_patient['FEV1 Set As'].iloc[0]
  # Add the patient id, reversed max FEV1 and healthy FEV1 (L) to the dataframe
  new_row = pd.DataFrame({'ID': [id], 'Unblocked FEV1 (L)': [rmax], 'Healthy FEV1 (L)': [healthy_fev1]})
  df_unblocked_factor = pd.concat([df_unblocked_factor, new_row])

# Left join O2_FEV1_factors with df_unblocked_factor on ID
O2_FEV1_factors = pd.merge(O2_FEV1_factors, df_unblocked_factor, on='ID', how='left')

# Plot unblocked fev1 (L) in x, measured fev1 in y, and color by Is Exacerbated
fig = px.scatter(O2_FEV1_factors, x='Unblocked FEV1 (L)', y='FEV1', color='Is Exacerbated', color_discrete_sequence=[get_stable_color(), get_ex_color()])
title="Impact of % small airways blockage on unblocked FEV1"
fig.update_layout(autosize=False, width=500, height=500, title=title)
fig.update_traces(marker=dict(size=5),
                  selector=dict(mode='markers'))
fig.show() 
# Save figure
fig.write_image("{}/Factors - {}.pdf".format(plotsdir, title), width=500, height=500)

#### Compare calc_predicted_fev1 with FEV1 Set As

In [None]:
df=load_patient_data(datadir)
df['Predicted FEV1 Calc (L)'] = df.apply(lambda x: calc_predicted_fev1(x), axis=1)

# Sort df by Predicted FEV1 Calc (L)
df.sort_values(by=['Predicted FEV1 Calc (L)'], inplace=True)
# Use go.scatter to plot FEV1 Predicted in y with ID in x
fig = go.Figure(data=go.Scatter(x=df['ID'], y=df['Predicted FEV1 Calc (L)'], name="Predicted FEV1 Calc (L)", mode='markers', opacity=0.9))
# Add the same with FEV1 Set As with name "FEV1 Set As"
fig.add_trace(go.Scatter(x=df['ID'], y=df['FEV1 Set As'], name="FEV1 Set As", mode='markers', opacity=0.9))
# Add the same with FEV1 Predicted Calc (L) with name "Predicted FEV1 Calc (L)"
# fig.add_trace(go.Scatter(x=df['ID'], y=df['Predicted FEV1'], name="Predicted FEV1", mode='markers', opacity=0.9))
fig.update_traces(marker=dict(size=5),
                  selector=dict(mode='markers'))  
fig.show()

In [None]:
# Show df rows for ID '60' and '66'
df[df['ID'].isin(['60', '66'])]
