In [1]:
from data_transformation import *
from exacerbation_periods import *



# settings
datadir="../../SmartCareData/"
plotsdir="../../PlotsSmartcare/O2_FEV1/"



# Process O2-FEV1 data

# Extract data and format datatypes
# Don't use convert_dtypes as it provides types that mess up when doing np calculations
measurements = pd.read_csv(datadir + "mydata.csv")
measurements['Date/Time recorded'] = pd.to_datetime(measurements['Date/Time recorded']).dt.date
measurements.rename(columns={"FEV 1": "FEV1"}, inplace=True)
print("Measurements data columns:\n{}\n".format(measurements.dtypes))

# ID mapping file
id_map = pd.read_excel(datadir + "patientidnew.xlsx", dtype={'SmartCareID': str}).drop("Study_ID", axis=1)
print("ID mapping file columns:\n{}\n".format(id_map.dtypes))

# Clinical data
# Patient data: Information describing the patient
patientsdata = pd.read_excel(datadir + "clinicaldata_updated.xlsx", sheet_name="Patients", dtype={'ID': str})
patientsdata.Weight = patientsdata.Weight.replace(to_replace='75,4',value='75.4').astype(float)
patientsdata['Study Date'] = pd.to_datetime(patientsdata['Study Date']).dt.date
print("Patient data columns:\n{}".format(patientsdata.dtypes))
# Antibiotics data
antibioticsdata = pd.read_excel(datadir + "clinicaldata_updated.xlsx", sheet_name="Antibiotics", dtype={'ID': str})
antibioticsdata['Start Date'] = pd.to_datetime(antibioticsdata['Start Date']).dt.date
antibioticsdata['Stop Date'] = pd.to_datetime(antibioticsdata['Stop Date']).dt.date
print("Antibiotics data columns:\n{}".format(antibioticsdata.dtypes))
# The datetime objects can be used to compare dates because they have an empty time 00:00:00

# Additional data cleaning
length_before_cleaning_O2 = measurements.shape[0]
measurements.drop( measurements[measurements["O2 Saturation"] > 100].index, inplace=True)
print("Removed {} O2 saturation measurements over 100%, {} left".format(length_before_cleaning_O2-measurements.shape[0], measurements.shape[0]))



# Get table with FEV1 and O2 values per day per user
O2 = extract_measure(measurements, "O2 Saturation")
FEV1 = extract_measure(measurements, "FEV1")

length_outer_join = O2.merge(FEV1, on=['User ID', 'Date/Time recorded'], how='outer').shape[0]
length_left_join  = O2.merge(FEV1, on=['User ID', 'Date/Time recorded'], how='left').shape[0]
O2_FEV1 = O2.merge(FEV1, on=['User ID', 'Date/Time recorded'], how='inner')
print("Removed {} rows with inner merge, {:.0%} of O2-FEV1 same day measurements remain".format( length_outer_join - O2_FEV1.shape[0], O2_FEV1.shape[0] /length_outer_join))

# Add patient data
# Map smartcare ids
O2_FEV1 = O2_FEV1.merge(id_map, left_on='User ID', right_on='Patient_ID', copy=True)
# Add clinical data
O2_FEV1 = O2_FEV1.merge(patientsdata, left_on='SmartCareID', right_on='ID', copy=True).drop(["User ID", "Patient_ID", "SmartCareID"], axis=1)

  measurements = pd.read_csv(datadir + "mydata.csv")


Measurements data columns:
User ID                  object
UserName                 object
Recording Type           object
Date/Time recorded       object
FEV1                    float64
FEV 10                  float64
Predicted FEV           float64
FEV 1 %                 float64
Weight in Kg            float64
O2 Saturation           float64
Pulse (BPM)             float64
Calories                float64
Rating                  float64
Temp (deg C)            float64
Sputum sample taken?     object
Activity - Steps        float64
Activity - Points       float64
dtype: object

ID mapping file columns:
Patient_ID     object
SmartCareID    object
dtype: object

Patient data columns:
ID                                       object
Hospital                                 object
Study Number                             object
Study Date                               object
DOB                              datetime64[ns]
Age                                       int64
Sex                 

  for idx, row in parser.parse():
  for idx, row in parser.parse():


# O2-FEV1 analysis
## Definitions
- O2 Saturation
- FEV 1
- Predicted FEV 1 in %: measure for the amount of airway obstruction either due to sputum load (non-permanent obstruction) or scars in the lungs (permanent obstruction).

## Literature
[The association between forced expiratory volume in one second (FEV1) and pulse oximetric measurements of arterial oxygen saturation (SpO2) in the patients with COPD: A preliminary study](https://pubmed.ncbi.nlm.nih.gov/24949035/).
- Context: 31 patients with COPD
- Key results: There was not statistically significant correlation between FEV1 % predicted and SpO2 values (P < 0.05), but a great correlation existed between FEV1/FVC % predicted and SpO2 values (r = 0.556, P < 0.001).

 [ANALYSIS OF CORRELATION BETWEEN FEV1/FEV6 AND OXYGEN SATURATION DURING SIX-MINUTE WALK TEST (6MWT) IN COPD PATIENTS](https://www.researchgate.net/publication/351322676_ANALYSIS_OF_CORRELATION_BETWEEN_FEV1FEV6_AND_OXYGEN_SATURATION_DURING_SIX-MINUTE_WALK_TEST_6MWT_IN_COPD_PATIENTS)
 - Need to download

Note: nobody segments the input by groups. Probably because they don't have enough data to do this.

In [2]:
# Make partitions
O2_FEV1['FEV1 % Predicted'] = O2_FEV1.apply(lambda x: x["FEV1"] / x["Predicted FEV1"] * 100, axis=1)
O2_FEV1["FEV1 % Predicted Group"] = partition_in_n_equal_groups(O2_FEV1['FEV1 % Predicted'],3)

# What are we analysing?
O2_FEV1.head()

Unnamed: 0,Date/Time recorded,O2 Saturation,FEV1,ID,Hospital,Study Number,Study Date,DOB,Age,Sex,...,Inconvenience Payment,Date Last PE Start,Date Last PE Stop,GP Letter Sent,Remote Monitoring App User ID,Study Email,Freezer Required,Comments,FEV1 % Predicted,FEV1 % Predicted Group
0,2015-08-12,95.0,1.49,23,PAP,1,2015-08-03,1960-07-05,55,Female,...,Yes,2014-11-03,2014-11-17 00:00:00,Yes,PAP001,papworth005,Yes,,66.816143,>=66.2
1,2015-08-14,95.0,1.29,23,PAP,1,2015-08-03,1960-07-05,55,Female,...,Yes,2014-11-03,2014-11-17 00:00:00,Yes,PAP001,papworth005,Yes,,57.847534,[42.2;66.2[
2,2015-08-21,94.0,1.35,23,PAP,1,2015-08-03,1960-07-05,55,Female,...,Yes,2014-11-03,2014-11-17 00:00:00,Yes,PAP001,papworth005,Yes,,60.538117,[42.2;66.2[
3,2015-09-05,95.0,1.3,23,PAP,1,2015-08-03,1960-07-05,55,Female,...,Yes,2014-11-03,2014-11-17 00:00:00,Yes,PAP001,papworth005,Yes,,58.295964,[42.2;66.2[
4,2015-09-07,97.0,1.46,23,PAP,1,2015-08-03,1960-07-05,55,Female,...,Yes,2014-11-03,2014-11-17 00:00:00,Yes,PAP001,papworth005,Yes,,65.470852,[42.2;66.2[


In [None]:
# Raw plot without trendline
fig = px.scatter(O2_FEV1, y="O2 Saturation", x="FEV1")
# fig = px.scatter(O2_FEV1, y="O2 Saturation", x="FEV1", trendline="ols")
fig.update_layout(autosize=False, width=500, height=500)
fig.show()
fig.write_image(plotsdir + "FEV1-O2 saturation raw.pdf")

fig = px.scatter(O2_FEV1, y="O2 Saturation", x="FEV1 % Predicted")
fig.update_layout(autosize=False, width=500, height=500)
fig.show()
fig.write_image(plotsdir + "FEV1 % Predicted-O2 saturation raw.pdf")

O2_FEV1.shape

In [None]:
# Plot with height
O2_FEV1['Height Group'], height_labels=partition_in_n_equal_groups(O2_FEV1['Height'], 3, True)

# var = 'FEV1'; xaxis_range = [0, 5.1]
var = 'FEV1 % Predicted'; xaxis_range = [0, 155]

fig = px.scatter(O2_FEV1, y="O2 Saturation", x=var, color="Height Group", marginal_x="histogram", marginal_y="histogram")
# fig.update_layout(autosize=False, width=800, height=800)
fig.update_traces(marker=dict(size=5),
                  selector=dict(mode='markers'))
# fig.update_xaxes(range=[0.9*min(O2_FEV1["FEV1"]), 1.1*max(O2_FEV1["FEV1"])])
# fig.update_yaxes(range=[0.9*min(O2_FEV1["O2 Saturation"]), max(1.1*O2_FEV1["O2 Saturation"])])
# fig.show()

for height_group in height_labels:
    mask_height=O2_FEV1['Height Group'] == height_group

    fig = px.scatter(O2_FEV1[mask_height], y="O2 Saturation", x=var,
                     title='Height group {}'.format(height_group))
    fig.update_layout(autosize=False, width=500, height=500)
    fig.update_traces(marker=dict(size=5),
                      selector=dict(mode='markers'))
    fig.update_xaxes(range=xaxis_range)
    fig.update_yaxes(range=[74, 102])
    fig.show()
    filename= "Height group " +  height_group + " " + var + "-O2 saturation raw"
    fig.write_image(plotsdir + filename + ".pdf")

In [18]:
predicted = True
x_var = 'Weight'
lung_function_var = 'FEV1 % Predicted' if predicted else 'FEV1'

fig1, fig2, _ = desaturation_FEV1_for_variable(O2_FEV1, x_var, n_var_groups=6, predicted=predicted)
fig1.show()
fig1.write_image(plotsdir + "Desaturation {} for {} main plot.pdf".format(lung_function_var, x_var))

fig2.show()
fig2.write_image(plotsdir + "Desaturation {} for {} with lines.pdf".format(lung_function_var, x_var))

# Observations
# o2 10, var 15 contains a lot of noise and groups have very low number of datapoints. It gets too patient specific

In [None]:
# Same patient variability: can we observe the same pattern for each patient?
O2_FEV1.head()

In [None]:
# Same patient variability: can we observe the same pattern for each patient?
# Include time evolution into the graph
def to_float(dt_time):
    return (365*dt_time.year + 12*dt_time.month + dt_time.day)/12

mindate = to_float(min(O2_FEV1["Date/Time recorded"]))
O2_FEV1["Months since study start"]=O2_FEV1["Date/Time recorded"].apply(lambda x: to_float(x)-mindate)

IDs=O2_FEV1.ID.unique()
for ID in IDs:
    fig = px.scatter(O2_FEV1[O2_FEV1.ID == ID], y="O2 Saturation", x="FEV1", color="Months since study start")
    # fig.update_layout(autosize=False, width=500, height=500)
    fig.update_xaxes(title_text="FEV1 (L)")
    fig.update_yaxes(title_text="O2 Saturation (%)")

    filename=ID+"-O2_FEV1"
    # fig.write_image(plotsdir + filename + ".pdf")

## Highlight exacerbated/no exacerbated states on O2-FEV1 plots
O2-FEV1 quadrants
- high o2, high fev1: concerns individuals with low % small airway blockage and low (permanent) lung damage.
- low o2, high fev1: this does not happen in CF
- high o2, low fev1: low % small airway blockage and high lung damage, probably other classes
- low o2, low fev1: high % small airway blockage and high lung damage

### Scientific reasoning using simple heuristics
Heuristics used to define the labels:
- Exacerbated: since 1 week before treatment start
- Not exacerbated: until 3 weeks before treatment start
- Between 1 and 3 weeks you throw away.
- Recovery: treatment start and stop

Expectation: We expect the % small airway blockage to be very sensitive to a lung inflammation. Hence, measurements done in exacerbated state should be most present in the low o2, low fev1 quadrant, whereas the other 2 quadrants should contain mostly measurements done in a stable period

Observations:
- The expectations are clearly not met.
- There is a surprising amount of Undefined Labels (90%) of the data. Does it makes sense that 90% of the measurements are done between 1-3 weeks prior to an exacerbation? What's the distribution of the number of days between two exacerbations?
    → Indeed for the time between two treatments in the vast majority fo the time it between 10 to 19 days which falls in the undefined period
    → Let's try to reduce this window

In [None]:
for id in list_patients(O2_FEV1):
    patient_antibioticsdata = get_rows_for_id(id, antibioticsdata)
    exacerbation_labels = get_patient_exacerbation_labels(patient_antibioticsdata, patientsdata, numdays_before_ab_start_is_exacerbated=7, numdays_before_ab_start_not_exacerbated=21)
    O2_FEV1['Exacerbation Labels'] = O2_FEV1['Date/Time recorded'].apply(lambda x: add_measurement_exacerbation_label(x, exacerbation_labels))
O2_FEV1['Exacerbation Labels'].value_counts()

In [None]:
fig = px.scatter(O2_FEV1, y="O2 Saturation", x="FEV1", color="Exacerbation Labels", marginal_x="histogram", marginal_y="histogram")
# fig.update_layout(autosize=False, width=800, height=800)
fig.update_traces(marker=dict(size=5),
                  selector=dict(mode='markers'))
# fig.update_xaxes(range=[0.9*min(O2_FEV1["FEV1"]), 1.1*max(O2_FEV1["FEV1"])])
# fig.update_yaxes(range=[0.9*min(O2_FEV1["O2 Saturation"]), max(1.1*O2_FEV1["O2 Saturation"])])
fig.show()

for exacerbation_label in O2_FEV1['Exacerbation Labels'].unique():
    mask=O2_FEV1['Exacerbation Labels'] == exacerbation_label

    fig = px.scatter(O2_FEV1[mask], y="O2 Saturation", x="FEV1",
                     title='{}'.format(exacerbation_label))
    fig.update_layout(autosize=False, width=500, height=500)
    fig.update_traces(marker=dict(size=5),
                      selector=dict(mode='markers'))
    fig.update_xaxes(range=[0, 5.1])
    fig.update_yaxes(range=[74, 102])
    fig.show()

In [None]:
# Measure time between treatments
time_between_treatments = []

for id in list_patients(O2_FEV1):
    patient_antibioticsdata = get_rows_for_id(id, antibioticsdata)
    for i in range(len(patient_antibioticsdata)-1):
        # Removing NaT input
        if patient_antibioticsdata.index[i] not in [149, 150, 315, 321]:
            time_between_treatments.append((antibioticsdata['Stop Date'][i] - antibioticsdata['Start Date'][i+1]).days)

time_between_treatments = pd.DataFrame(time_between_treatments, columns=['Time between two treatments (days)'])
# time_between_treatments = time_between_treatments.value_counts().reset_index()

fig = px.histogram(time_between_treatments, x='Time between two treatments (days)')
fig.update_layout(autosize=False, width=800, height=400)
fig.show()