In [271]:
import numpy as np
import pandas as pd
from datetime import datetime
import plotly.express as px

# settings
datadir = "../../SmartCareData/"

# Process O2-FEV1 data

In [358]:
# Extract data and format datatypes
# Don't use convert_dtypes as it provides types that mess up when doing np calculations
measurements = pd.read_csv(datadir + "mydata.csv")
measurements['Date/Time recorded'] = pd.to_datetime(measurements['Date/Time recorded']).dt.date
print("Measurements data columns:\n{}\n".format(measurements.dtypes))

# ID mapping file
id_map = pd.read_excel(datadir + "patientidnew.xlsx", dtype={'SmartCareID': str}).drop("Study_ID", axis=1)
print("ID mapping file columns:\n{}\n".format(id_map.dtypes))

# Clinical data
clinicaldata = pd.read_excel(datadir + "clinicaldata_updated.xlsx", sheet_name="Patients", dtype={'ID': str})
print("Clinical data columns:\n{}".format(clinicaldata.dtypes))

# Additional data cleaning
length_before_cleaning_O2 = measurements.shape[0]
measurements.drop( measurements[measurements["O2 Saturation"] > 100].index, inplace=True)
print("Removed {} O2 saturation measurements over 100%, {} left".format(length_before_cleaning_O2-measurements.shape[0], measurements.shape[0]))


Columns (14) have mixed types. Specify dtype option on import or set low_memory=False.



Measurements data columns:
User ID                  object
UserName                 object
Recording Type           object
Date/Time recorded       object
FEV 1                   float64
FEV 10                  float64
Predicted FEV           float64
FEV 1 %                 float64
Weight in Kg            float64
O2 Saturation           float64
Pulse (BPM)             float64
Calories                float64
Rating                  float64
Temp (deg C)            float64
Sputum sample taken?     object
Activity - Steps        float64
Activity - Points       float64
dtype: object

ID mapping file columns:
Patient_ID     object
SmartCareID    object
dtype: object

Clinical data columns:
ID                                       object
Hospital                                 object
Study Number                             object
Study Date                       datetime64[ns]
DOB                              datetime64[ns]
Age                                       int64
Sex                


Unknown extension is not supported and will be removed



In [366]:
# Get table with FEV1 and O2 values per day per user

# Extract data for one column
def extract_measure(label):
    # Could also filter by Recording Type
    data = measurements[measurements[ label ].notnull()][['User ID', 'Date/Time recorded', label]]
    print("{} contains {} measurements".format(label, data.shape[0]))
    return data

O2 = extract_measure("O2 Saturation")
FEV1 = extract_measure("FEV 1")

length_outer_join = O2.merge(FEV1, on=['User ID', 'Date/Time recorded'], how='outer').shape[0]
length_left_join  = O2.merge(FEV1, on=['User ID', 'Date/Time recorded'], how='left').shape[0]
O2_FEV1 = O2.merge(FEV1, on=['User ID', 'Date/Time recorded'], how='inner')
print("Removed {} rows with inner merge, {:.0%} of O2-FEV1 same day measurements remain".format( length_outer_join, O2_FEV1.shape[0] /length_outer_join))

# Add patient data
# Map smartcare ids
O2_FEV1 = O2_FEV1.merge(id_map, left_on='User ID', right_on='Patient_ID', copy=True)
# Add clinical data
O2_FEV1 = O2_FEV1.merge(clinicaldata, left_on='SmartCareID', right_on='ID', copy=True).set_index("ID").drop(["User ID", "Patient_ID", "SmartCareID"], axis=1)

O2 Saturation contains 14118 measurements
FEV 1 contains 11063 measurements
Removed 15641 rows with inner merge, 68% of O2-FEV1 same day measurements remain


# O2-FEV1 analysis

In [367]:
# What are we analysing?
O2_FEV1.head()

Unnamed: 0_level_0,Date/Time recorded,O2 Saturation,FEV 1,Hospital,Study Number,Study Date,DOB,Age,Sex,Height,...,Date Consent Obtained,CFQR Quest Comp,Inconvenience Payment,Date Last PE Start,Date Last PE Stop,GP Letter Sent,Remote Monitoring App User ID,Study Email,Freezer Required,Comments
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
23,2015-08-12,95.0,1.49,PAP,1,2015-08-03,1960-07-05,55,Female,157.0,...,2015-08-10,Yes,Yes,2014-11-03,2014-11-17 00:00:00,Yes,PAP001,papworth005,Yes,
23,2015-08-14,95.0,1.29,PAP,1,2015-08-03,1960-07-05,55,Female,157.0,...,2015-08-10,Yes,Yes,2014-11-03,2014-11-17 00:00:00,Yes,PAP001,papworth005,Yes,
23,2015-08-21,94.0,1.35,PAP,1,2015-08-03,1960-07-05,55,Female,157.0,...,2015-08-10,Yes,Yes,2014-11-03,2014-11-17 00:00:00,Yes,PAP001,papworth005,Yes,
23,2015-09-05,95.0,1.3,PAP,1,2015-08-03,1960-07-05,55,Female,157.0,...,2015-08-10,Yes,Yes,2014-11-03,2014-11-17 00:00:00,Yes,PAP001,papworth005,Yes,
23,2015-09-07,97.0,1.46,PAP,1,2015-08-03,1960-07-05,55,Female,157.0,...,2015-08-10,Yes,Yes,2014-11-03,2014-11-17 00:00:00,Yes,PAP001,papworth005,Yes,


In [372]:
# Scatter plot
fig = px.scatter(O2_FEV1, x="O2 Saturation", y="FEV 1", trendline="ols")
fig.update_layout(autosize=False, width=800, height=800)
fig.show()