In [None]:
# Title: Create Measurement Features
# Author: Anna Zink
# Date: May 16, 2024
# Description: Take ehr data and create a set of features 

# Set Up

In [None]:
library(viridis)    # A nice color scheme for plots.
library(ggthemes)   # Common themes to change the look and feel of plots.
library(scales)     # Graphical scales map data to aesthetics in plots.
library(skimr)      # Better summaries of data.
library(lubridate)  # Date library from the tidyverse.
library(tidyverse)  # Data wrangling packages.
library(bigrquery)  # Data extraction from Google BigQuery
library(stringr)
#library(DescTools)

In [None]:
# Copy a CSV from the workspace bucket into the working directory and read it.
#
# Args:
#   file:   file name inside the bucket folder, e.g. "msrs_2016.csv".
#   folder: bucket-relative folder; it is pasted directly between the bucket
#           name and `file`, so pass it with both slashes, e.g. "/ehr/".
# Returns:
#   The file contents as returned by read_csv().
# Stops if the gsutil copy reports a non-zero exit status, so that a stale
# local file with the same name is never read by mistake.
load_data <- function(file, folder) {
  my_bucket <- Sys.getenv("WORKSPACE_BUCKET")
  copy_log <- system(
    paste0("gsutil cp ", my_bucket, folder, file, " ."),
    intern = TRUE
  )

  # With intern = TRUE a failing command only sets the "status" attribute.
  copy_status <- attr(copy_log, "status")
  if (!is.null(copy_status) && copy_status != 0) {
    stop(
      "gsutil cp failed for ", my_bucket, folder, file,
      " (exit status ", copy_status, ")",
      call. = FALSE
    )
  }

  dsn <- read_csv(file, show_col_types = FALSE)
  return(dsn)
}

# Write a data frame to a local CSV and copy it to the bucket's /ehr/ folder.
#
# NOTE: this definition masks readr::write_csv() (attached via tidyverse) for
# the rest of the session.
#
# Args:
#   df: data frame to save.
#   fn: destination file name (written to the working directory first).
# Returns:
#   The captured output of the gsutil copy command.
# Stops if the gsutil copy reports a non-zero exit status.
write_csv <- function(df, fn) {
  # store the dataframe in the current workspace
  write_excel_csv(df, fn)

  my_bucket <- Sys.getenv("WORKSPACE_BUCKET")
  copy_log <- system(
    paste0("gsutil cp ./", fn, " ", my_bucket, "/ehr/"),
    intern = TRUE
  )

  # With intern = TRUE a failing command only sets the "status" attribute.
  copy_status <- attr(copy_log, "status")
  if (!is.null(copy_status) && copy_status != 0) {
    stop(
      "gsutil cp failed for ", fn, " (exit status ", copy_status, ")",
      call. = FALSE
    )
  }

  copy_log
}

# Define Functions to Process & Clean Measurement Data

## Process Function
- create a map of measurement names to short nicknames
- define "process" function

In [None]:
# Load one year of the "msrs" measurement file and preview it.
yr <- 2016
msrs <- load_data(sprintf("msrs_%s.csv", yr), "/ehr/")
head(msrs)

In [None]:
# Build `map`: one row per full measurement name (`og`) with its short
# nickname (`new`).
#
# The four vital-sign names are appended by hand; unique() guarantees each
# name appears once, because a repeated `og` would duplicate rows when `map`
# is merged onto the measurement data.
measure <- unique(c(
  msrs$MEASUREMENT,
  "Heart rate", "Diastolic blood pressure",
  "Systolic blood pressure", "Respiratory rate"
))
map <- data.frame(og = measure)

# Keyword matches take priority (A1C, then glucose, then triglyceride);
# everything else keeps its name up to the first " [".
map$new <- ifelse(
  grepl("A1C", toupper(map$og)), "A1C",
  ifelse(
    grepl("GLUCOSE", toupper(map$og)), "Glucose",
    ifelse(
      grepl("TRIGLYCERIDE", toupper(map$og)), "Triglyceride",
      sub(" \\[.*", "", map$og)
    )
  )
)

# Fasting glucose gets its own nickname.
map$new <- ifelse(
  map$new == "Glucose" & grepl("FASTING", toupper(map$og)),
  "Glucose_Fasting", map$new
)

# Shorten the remaining long names.
map$new <- ifelse(map$new == "Diastolic blood pressure", "Diastolic_BP",
           ifelse(map$new == "Systolic blood pressure", "Systolic_BP",
           ifelse(map$new == "Body mass index (BMI)", "BMI",
           ifelse(map$new == "Oxygen saturation in Arterial blood by Pulse oximetry", "Oxygen",
           ifelse(map$new == "Body weight", "Weight",
           ifelse(map$new == "Respiratory rate", "RR",
           ifelse(map$new == "Heart rate", "HR",
           ifelse(map$new == "Urea nitrogen", "Urea", map$new))))))))

In [None]:
# Columns of interest in the raw measurement files. Only referenced by the
# (currently disabled) column filter, which the original notes say was
# already applied upstream.
keepvars <- c(
  "MEASUREMENT", "PERSON_ID", "VISIT_OCCURRENCE_ID",
  "MEASUREMENT_DATE", "VALUE_AS_NUMBER", "UNIT"
)

# Load "<fn>_<yr>.csv" from the /ehr/ bucket folder and prepare it for
# feature building.
#
# Args:
#   yr: year used in the file name.
#   fn: file-name prefix (e.g. "msrs", "hr", "rr").
# Returns:
#   A data frame with the short measure name in `new`, VALUE_AS_NUMBER/UNIT
#   renamed to Value/Unit, and only rows with 0 < Value < 10000000.
# Relies on the global `map` (og -> new); rows whose MEASUREMENT has no entry
# in `map` are dropped by the merge.
process_data <- function(yr, fn) {
  raw <- load_data(paste0(fn, "_", yr, ".csv"), "/ehr/")

  # attach the short names, then drop columns that are no longer needed
  named <- merge(raw, map, by.x = "MEASUREMENT", by.y = "og")
  unwanted <- c("MEASUREMENT", "MEASUREMENT_CONCEPT_ID", "year")
  named <- named[, !(names(named) %in% unwanted)]

  # shorter column names for coding ease
  named <- rename(named, Value = VALUE_AS_NUMBER, Unit = UNIT)

  # keep rows with a non-missing, strictly positive value below 10,000,000
  usable <- !is.na(named$Value) & named$Value > 0 & named$Value < 10000000
  named[usable, ]
}

In [None]:
test <- process_data(yr = 2016, fn = "msrs")

## Define "Clean Values" function to convert units and remove outliers 

**Unit Conversions**
- A1C: convert to percent
    - from milligram per deciliter (mg/dL)

- Glucose: convert to milligram per deciliter
    - from gram per deciliter

- Urea: convert to milligram per deciliter
    - from millimole per liter

- Hemoglobin: convert to gram per deciliter
    - from gram per liter

- Chloride: convert to millimole per liter
    - from equivalent per liter

Note: `clean_values` below currently applies only the Hemoglobin and A1C conversions.

**Outliers** 

Use Z-score to identify outliers

Z-score = (X - mean(X)) / sd(X), computed separately for each measure

Remove observations where abs(z-score) > 3

In [None]:
# Summary statistics of `test` for every measure / unit combination, used to
# spot unit problems.
units <- test %>%
  group_by(new, Unit) %>%
  summarise(
    mean = mean(Value, na.rm = TRUE),
    median = median(Value, na.rm = TRUE),
    max = max(Value, na.rm = TRUE),
    min = min(Value, na.rm = TRUE),
    na = sum(is.na(Value)),
    n = n()
  )

In [None]:
# Convert known unit mismatches and drop per-measure outliers.
#
# Args:
#   ehr_msrs: data frame with columns `new` (short measure name), `Unit` and
#             `Value`, as produced by process_data().
# Returns:
#   The input rows that are not outliers, with `new` as the first column and
#   four added columns: `mean` and `sd` (per measure, after unit conversion),
#   `z_score`, and `outlier` (always 0 in the returned rows).
#
# A row is an outlier when abs(z_score) > 3. When the z-score is undefined
# (a measure with a single observation, or with zero spread) the row is kept:
# comparing NA against 0 in the row filter would otherwise replace those rows
# with all-NA rows.
clean_values <- function(ehr_msrs) {
  # Hemoglobin reported per liter -> per deciliter
  hgb_per_liter <- ehr_msrs$new == "Hemoglobin" &
    ehr_msrs$Unit %in% c("gram per liter", "g/L")
  ehr_msrs$Value <- ifelse(hgb_per_liter, ehr_msrs$Value / 10, ehr_msrs$Value)

  # A1C reported in mg/dL -> percent
  a1c_mg_dl <- ehr_msrs$new == "A1C" &
    ehr_msrs$Unit %in% c("milligram per deciliter", "mg/dL")
  ehr_msrs$Value <- ifelse(
    a1c_mg_dl, (ehr_msrs$Value + 46.7) / 28.7, ehr_msrs$Value
  )

  # per-measure mean and standard deviation
  by_measure <- split(ehr_msrs$Value, ehr_msrs$new)
  dist_data <- data.frame(
    new = names(by_measure),
    mean = vapply(by_measure, mean, numeric(1), na.rm = TRUE),
    sd = vapply(by_measure, sd, numeric(1), na.rm = TRUE),
    row.names = NULL,
    stringsAsFactors = FALSE
  )
  # all.x = TRUE so rows are never silently lost in the join
  ehr_msrs <- merge(ehr_msrs, dist_data, by = "new", all.x = TRUE)

  # flag and remove outliers; an undefined z-score counts as "not an outlier"
  ehr_msrs$z_score <- (ehr_msrs$Value - ehr_msrs$mean) / ehr_msrs$sd
  ehr_msrs$outlier <- ifelse(
    !is.na(ehr_msrs$z_score) & abs(ehr_msrs$z_score) > 3, 1, 0
  )
  ehr_msrs <- ehr_msrs[ehr_msrs$outlier == 0, ]

  return(ehr_msrs)
}

In [None]:
cleaned <- clean_values(ehr_msrs = test)

## Define "Collapse" function to convert to person - visit id - date 

In [None]:
# Collapse measurement-level rows to one row per person / visit / date.
#
# Args:
#   df: data frame with PERSON_ID, VISIT_OCCURRENCE_ID, MEASUREMENT_DATE and
#       feature columns; the working columns Value, Unit, mean, sd, z_score,
#       outlier and new are dropped if present.
# Returns:
#   An ungrouped tibble with one row per PERSON_ID / VISIT_OCCURRENCE_ID /
#   MEASUREMENT_DATE, where every remaining column holds its maximum within
#   the group (so a 0/1 flag is 1 if any row in the group had it).
collapse <- function(df) {
  # drop unnecessary vars
  dropvars <- c("Value", "Unit", "mean", "sd", "z_score", "outlier", "new")
  df <- df[, !(names(df) %in% dropvars)]

  # make all missings 0 (applies to every remaining column)
  df[is.na(df)] <- 0

  # collapse; .groups = "drop" returns an ungrouped result and avoids the
  # "summarise() has grouped output" message
  collapsed <- df %>%
    group_by(PERSON_ID, VISIT_OCCURRENCE_ID, MEASUREMENT_DATE) %>%
    summarize(
      across(everything(), ~ max(.x, na.rm = TRUE)),
      .groups = "drop"
    )

  return(collapsed)
}

# Run on each year

For each file: 
- Process data (process_data)
- Define features 
- collapse to person-visit-date

Because of the size of the data, features are built separately for:
- heart rate
- blood pressure
- respiratory rate 
- other measures

## Heart Rate

Define Tachycardia (fast heart rate)

In [None]:
# Strongly bias number formatting toward fixed (non-scientific) notation.
# NOTE(review): presumably so long numeric ids are not shown in scientific
# notation -- confirm.
options(scipen = 999)

# Build the heart-rate features for one year and save them as
# "hr_features_<yr>.csv".
#
# Args:
#   yr: year of the "hr_<yr>.csv" input file.
# Returns:
#   Whatever write_csv() returns.
run_hr_year <- function(yr) {
  hr <- clean_values(process_data(yr, "hr"))

  # tachycardia: value above 100
  hr$tachycardia <- ifelse(hr$Value > 100, 1, 0)

  write_csv(collapse(hr), paste0("hr_features_", yr, ".csv"))
}

In [None]:
# Heart-rate runs, one call per year; only 2023 is currently enabled.
# run_hr_year(yr = 2016)
# run_hr_year(yr = 2017)
# run_hr_year(yr = 2018)
# run_hr_year(yr = 2019)
# run_hr_year(yr = 2020)
# run_hr_year(yr = 2021)
# run_hr_year(yr = 2022)
run_hr_year(yr = 2023)

## Blood Pressure

Define high blood pressure

In [None]:
# Build the blood-pressure features for one year and save them as
# "bp_features_<yr>.csv".
#
# Args:
#   yr: year of the blood-pressure input file.
# Returns:
#   Whatever write_csv() returns.
#
# A reading is flagged as high blood pressure when it is a systolic value
# above 130 or a diastolic value above 80. The thresholds are applied per
# measure (via `new`); applying both to every row would flag any reading
# above 80, including normal systolic values.
run_bp_year <- function(yr) {
  # NOTE(review): reads "bp_<yr>.csv", following the hr_/rr_/msrs_ naming of
  # the other inputs -- confirm the blood-pressure file uses this prefix.
  bp_msrs <- process_data(yr, "bp")
  bp_msrs <- clean_values(bp_msrs)

  high_systolic <- bp_msrs$new == "Systolic_BP" & bp_msrs$Value > 130
  high_diastolic <- bp_msrs$new == "Diastolic_BP" & bp_msrs$Value > 80
  bp_msrs$high_blood_pressure <- ifelse(high_systolic | high_diastolic, 1, 0)

  bp_collapsed <- collapse(bp_msrs)
  write_csv(bp_collapsed, paste0("bp_features_", yr, ".csv"))
}

In [None]:
# Blood-pressure runs, one call per year; only 2023 is currently enabled.
# run_bp_year(yr = 2016)
# run_bp_year(yr = 2017)
# run_bp_year(yr = 2018)
# run_bp_year(yr = 2019)
# run_bp_year(yr = 2020)
# run_bp_year(yr = 2021)
# run_bp_year(yr = 2022)
run_bp_year(yr = 2023)

In [None]:
# Reload the saved 2023 blood-pressure features and check the flag.
test <- load_data(file = "bp_features_2023.csv", folder = "/ehr/")
summary(test$high_blood_pressure)

## Respiratory Rate

Define tachypnea and bradypnea

In [None]:
# Build the respiratory-rate features for one year and save them as
# "rr_features_<yr>.csv".
#
# Args:
#   yr: year of the "rr_<yr>.csv" input file.
# Returns:
#   Whatever write_csv() returns.
run_rr_year <- function(yr) {
  rr <- clean_values(process_data(yr, "rr"))

  # tachypneic: value above 20; bradypnea: value below 12
  rr$tachypneic <- ifelse(rr$Value > 20, 1, 0)
  rr$bradypnea <- ifelse(rr$Value < 12, 1, 0)

  write_csv(collapse(rr), paste0("rr_features_", yr, ".csv"))
}

In [None]:
# Respiratory-rate runs, one call per year; 2022 and 2023 are currently enabled.
# run_rr_year(yr = 2016)
# run_rr_year(yr = 2017)
# run_rr_year(yr = 2018)
# run_rr_year(yr = 2019)
# run_rr_year(yr = 2020)
# run_rr_year(yr = 2021)
run_rr_year(yr = 2022)
run_rr_year(yr = 2023)

In [None]:
# Reload the saved 2023 respiratory-rate features and check both flags.
test <- load_data(file = "rr_features_2023.csv", folder = "/ehr/")
summary(test$tachypneic)
summary(test$bradypnea)

## Define Other Measures

In [None]:
# Add one feature column per clinical definition to measurement-level data.
#
# Args:
#   ehr_msrs: data frame with `new` (short measure name) and `Value`.
# Returns:
#   `ehr_msrs` with the feature columns appended. Every flag is 1 when the
#   row belongs to the named measure and meets the threshold, 0 otherwise;
#   `weight` carries the raw value for 'Weight' rows and NA elsewhere.
define_features <- function(ehr_msrs) {
  # The two columns every rule reads; neither is modified below.
  measure <- ehr_msrs$new
  value <- ehr_msrs$Value

  # 1 when the row is for measure `name` and `hit` holds, else 0
  flag <- function(name, hit) {
    ifelse(measure == name & hit, 1, 0)
  }

  # obesity
  ehr_msrs$obesity <- flag("BMI", value >= 30)

  # body weight (raw)
  ehr_msrs$weight <- ifelse(measure == "Weight", value, NA)

  # fever is currently disabled:
  # ehr_msrs$fever <- ifelse(ehr_msrs$new == 'Temp' &
  #   (ehr_msrs$Value > 100.4 | (ehr_msrs$Value > 38 & ehr_msrs$Value < 50)), 1, 0)

  # triglyceride
  ehr_msrs$triglyceride_high <- flag("Triglyceride", value >= 175)

  # calcium
  ehr_msrs$hypocalcemia <- flag("Calcium", value < 8.5)
  ehr_msrs$hypercalcemia <- flag("Calcium", value > 10.5)

  # chloride (original note: min/max set at 2.75 & 97.5 percentile)
  ehr_msrs$hypochloremia <- flag("Chloride", value < 96)
  ehr_msrs$hyperchloremia <- flag("Chloride", value > 107)

  # creatinine (the output column keeps its existing name, `creatine`)
  ehr_msrs$creatine <- flag("Creatinine", value > 1.3)

  # diabetes, HbA1c
  ehr_msrs$prediabetes_hba1c <- flag("A1C", value >= 5.7 & value < 6.5)
  ehr_msrs$diabetes_hba1c <- flag("A1C", value >= 6.5)

  # diabetes, fasting glucose
  ehr_msrs$prediabetes_gluc_fast <-
    flag("Glucose_Fasting", value >= 100 & value < 126)
  ehr_msrs$diabetes_gluc_fast <- flag("Glucose_Fasting", value >= 126)

  # diabetes, glucose with unknown fasting status
  ehr_msrs$diabetes_gluc_na <- flag("Glucose", value >= 200)

  # hemoglobin
  ehr_msrs$anemia <- flag("Hemoglobin", value < 12)
  ehr_msrs$high_hemo <- flag("Hemoglobin", value > 15)

  # oxygen saturation
  ehr_msrs$hypoxemia <- flag("Oxygen", value < 90)

  # potassium
  ehr_msrs$hyperkalemia <- flag("Potassium", value > 5.5)
  ehr_msrs$hypokalemia <- flag("Potassium", value < 3.5)

  # sodium
  ehr_msrs$hypernatremia <- flag("Sodium", value > 145)
  ehr_msrs$hyponatremia <- flag("Sodium", value < 135)

  # urea
  ehr_msrs$low_urea <- flag("Urea", value < 8)
  ehr_msrs$high_urea <- flag("Urea", value > 24)

  ehr_msrs
}

In [None]:
# Build the features for the remaining ("msrs") measures for one year and
# save them as "msrs_features_<yr>.csv".
#
# Args:
#   yr: year of the "msrs_<yr>.csv" input file.
# Returns:
#   Whatever write_csv() returns.
run_year <- function(yr) {
  features <- define_features(clean_values(process_data(yr, "msrs")))
  write_csv(collapse(features), paste0("msrs_features_", yr, ".csv"))
}

In [None]:
# Other-measure runs, one call per year; only 2021 is currently enabled.
# run_year(yr = 2016)
# run_year(yr = 2017)
# run_year(yr = 2018)
# run_year(yr = 2019)
# run_year(yr = 2020)
run_year(yr = 2021)
# run_year(yr = 2022)
# run_year(yr = 2023)

In [None]:
# Reload the saved 2022 features for the other measures.
test <- load_data(file = "msrs_features_2022.csv", folder = "/ehr/")

In [None]:
# List the columns of the reloaded feature file.
colnames(test)