In [None]:
# Reliability Analysis
# Author: Anna Zink
# Description: Create data for reliability plots in the paper 

In [None]:
# load packages 
library(plyr)
library(tidyverse)
library(bigrquery)
library(stringr)  
library(lubridate)

# get sys environment settings
# Each value is read from the process environment; Sys.getenv() yields "" when a variable is unset.
BILLING_PROJECT_ID <- Sys.getenv('GOOGLE_PROJECT')
CDR <- Sys.getenv('WORKSPACE_CDR')
MY_BUCKET <- Sys.getenv('WORKSPACE_BUCKET')

# useful functions

# Copy one CSV from the workspace bucket into the working directory and read it.
#
# file:   name of the file inside the bucket folder, e.g. 'analysis_sample.csv'
# folder: bucket folder including leading and trailing slash, e.g. "/analysis/"
#
# Returns whatever read_csv() parses from the local copy.
# Stops when gsutil reports a non-zero exit status, so that a stale local copy
# of `file` left over from an earlier run is never read silently.
load_data<-function(file, folder){
    my_bucket <- Sys.getenv('WORKSPACE_BUCKET')
    copy_log <- system(paste0("gsutil cp ", my_bucket, folder, file, " ."), intern=TRUE)
    # with intern=TRUE a failing command only warns and records its exit code
    # in the "status" attribute of the captured output
    exit_status <- attr(copy_log, "status")
    if (!is.null(exit_status) && exit_status != 0) {
        stop("gsutil cp failed for ", my_bucket, folder, file,
             " (exit status ", exit_status, ")", call. = FALSE)
    }
    read_csv(file, show_col_types = FALSE)
}

# Write a data frame to a local CSV file and copy that file into the workspace bucket.
#
# df:     data frame to write
# fn:     local (and destination) file name
# folder: bucket folder including leading and trailing slash, e.g. "/ehr/"
#
# Returns the output captured from gsutil (character vector).
# Stops when gsutil reports a non-zero exit status instead of leaving a failed
# upload unnoticed.
# NOTE(review): the name shadows readr's write_csv once tidyverse is attached;
# kept because the rest of the notebook calls it with (df, fn, folder).
write_csv<-function(df, fn, folder) {
   write_excel_csv(df, fn)
   my_bucket <- Sys.getenv('WORKSPACE_BUCKET')
   copy_log <- system(paste0("gsutil cp ./", fn, " ", my_bucket, folder), intern=TRUE)
   # with intern=TRUE a failing command only warns and records its exit code
   # in the "status" attribute of the captured output
   exit_status <- attr(copy_log, "status")
   if (!is.null(exit_status) && exit_status != 0) {
      stop("gsutil cp failed for ", fn, " -> ", my_bucket, folder,
           " (exit status ", exit_status, ")", call. = FALSE)
   }
   copy_log
}

# Load data

In [None]:
# Analysis sample, plus the list of people who answered the self-reported health survey
sample <- load_data('analysis_sample.csv', "/analysis/")
self_sample <- load_data('answered_self_reported_health.csv', "/survey/")

# everyone on the survey list gets the flag ...
self_sample[['in_self_survey']] <- 1
# ... and the left join keeps every row of the analysis sample
sample <- merge(x = sample, y = self_sample, by = 'person_id', all.x = TRUE)
# rows without a match carry NA after the join; turn those into 0
sample$in_self_survey <- coalesce(sample$in_self_survey, 0)

# 1 when delayed_ind or afford_ind equals 1, otherwise 0 (NA stays NA)
sample$noaccess <- as.numeric(sample$delayed_ind == 1 | sample$afford_ind == 1)

In [None]:
# Per-person visit flags
visit <- load_data('by_person_ehr_visit_flag.csv', '/ehr/')

# Self-reported conditions; every row in this file counts as a self report
self <- load_data('self_conditions_long.csv', "/data/")
self[['self']] <- 1

# EHR conditions
ehr <- load_data('ehr_conditions_long.csv', "/data/")

In [None]:
# Stack the EHR rows on top of the self-report rows (id, condition and flag only)
all <- bind_rows(
    ehr,
    self[c('person_id', 'condition', 'self')]
)
# columns that exist in only one source are NA for the other source's rows; use 0 instead
all[is.na(all)] <- 0
head(all)

In [None]:
# Collapse to one row per person and condition: each flag is 1 if any of the
# stacked rows for that person/condition has it set.
# .groups = "drop" returns an ungrouped table (and silences the regrouping
# message) so later steps do not inherit a leftover person_id grouping.
byperson<- all %>% group_by(person_id, condition) %>% 
                summarise(ehr=max(ehr_ever),
                          ehr_before=max(ehr_lookback),
                          ehr_1_yr=max(ehr_1_yr_lookback),
                          ehr_2_yr=max(ehr_2_yr_lookback),
                          self=max(self),
                          .groups="drop")

In [None]:
# Inner join with the analysis sample (drops everyone outside it and brings in
# the access labels), then left join the visit flags so people without a visit
# row are kept.
merged <- byperson %>%
    merge(sample, by = 'person_id') %>%
    merge(visit, by = 'person_id', all.x = TRUE)
head(merged)

In [None]:
# Save the person-by-condition table locally and copy it to the bucket's /data/ folder
# (this is the notebook's own write_csv(df, fn, folder) helper).
write_csv(merged, 'ehr_reliability_data.csv', '/data/')

# Dataset for boxplots in figure 1

Datasets: 
- access_ehr_boxplot.csv
- access_self_boxplot.csv

Variables: 
- label
- lower_whisker
- lower_quartile 
- median
- upper_quartile
- upper_whisker
- fliers



## EHR boxplots

In [None]:
# Boxplot statistics of ehr_count for rows with afford_ind == 1.
# boxplot.stats() returns the five box values in $stats and the points beyond
# the whiskers in $out.
afford_box <- boxplot.stats(all_counts$ehr_count[all_counts$afford_ind == 1])
afford_five <- afford_box$stats
afford <- data.frame(
    label = 'afford_access',
    lower_whisker = afford_five[1],
    lower_quartile = afford_five[2],
    median = afford_five[3],
    upper_quartile = afford_five[4],
    upper_whisker = afford_five[5],
    # a single list cell holding the distinct outlier values
    fliers = I(list(unique(afford_box$out)))
)

In [None]:
# Boxplot statistics of ehr_count for rows with delayed_ind == 1.
delay_box <- boxplot.stats(all_counts$ehr_count[all_counts$delayed_ind == 1])
delay_five <- delay_box$stats
delay <- data.frame(
    label = 'delay_access',
    lower_whisker = delay_five[1],
    lower_quartile = delay_five[2],
    median = delay_five[3],
    upper_quartile = delay_five[4],
    upper_whisker = delay_five[5],
    # a single list cell holding the distinct outlier values
    fliers = I(list(unique(delay_box$out)))
)

In [None]:
# Boxplot statistics of ehr_count for rows where both delayed_ind and afford_ind are 0.
is_high_access <- all_counts$delayed_ind == 0 & all_counts$afford_ind == 0
high_box <- boxplot.stats(all_counts$ehr_count[is_high_access])
high_five <- high_box$stats
high <- data.frame(
    label = 'high_access',
    lower_whisker = high_five[1],
    lower_quartile = high_five[2],
    median = high_five[3],
    upper_quartile = high_five[4],
    upper_whisker = high_five[5],
    # a single list cell holding the distinct outlier values
    fliers = I(list(unique(high_box$out)))
)

In [None]:
# One row per access group, in the order afford / delay / high, saved to /output/
all <- do.call(rbind, list(afford, delay, high))
write_csv(all, 'access_ehr_boxplot.csv', "/output/")

## Survey boxplots

In [None]:
# Turn a numeric vector into one row of boxplot statistics.
# values: numbers to summarise; label: text stored in the `label` column.
# The five entries of boxplot.stats()$stats fill the whisker/quartile/median
# columns in order; `fliers` is a single list cell with the distinct values
# that boxplot.stats() reports in $out.
survey_box_row <- function(values, label) {
    box <- boxplot.stats(values)
    data.frame(label=label,
               lower_whisker=box$stats[1],
               lower_quartile=box$stats[2],
               median=box$stats[3],
               upper_quartile=box$stats[4],
               upper_whisker=box$stats[5],
               fliers=I(list(unique(box$out))))
}

# survey_count for rows with afford_ind == 1
afford<-survey_box_row(all_counts$survey_count[all_counts$afford_ind == 1], 'afford_access')

# survey_count for rows with delayed_ind == 1
delay<-survey_box_row(all_counts$survey_count[all_counts$delayed_ind == 1], 'delay_access')

# survey_count for rows where both indicators are 0
high<-survey_box_row(all_counts$survey_count[all_counts$delayed_ind == 0 & all_counts$afford_ind == 0], 'high_access')

In [None]:
# One row per access group, in the order afford / delay / high, saved to /output/
all <- do.call(rbind, list(afford, delay, high))
write_csv(all, 'access_self_boxplot.csv', "/output/")

In [None]:
# note: merged excludes people without a condition listed
# Agreement flags between the EHR and self-report indicators (1 = true, 0 = false)
merged <- merged %>%
    mutate(
        ehr_only = as.numeric(ehr == 1 & self == 0),
        self_only = as.numeric(ehr == 0 & self == 1),
        ehr_sr = as.numeric(ehr == 1 & self == 1),
        # uses the ehr_before indicator instead of ehr
        ehr_before_sr = as.numeric(ehr_before == 1 & self == 1),
        neither = as.numeric(ehr == 0 & self == 0),
        condition_present = as.numeric(ehr == 1 | self == 1)
    )

In [None]:
# Counts per condition: any source, EHR only, self-report only, both, neither
bycondition <- merged %>%
    group_by(condition) %>%
    summarise(
        count = sum(condition_present),
        ehr = sum(ehr_only),
        self = sum(self_only),
        ehr_sr = sum(ehr_sr),
        neither = sum(neither)
    )

# shares are taken over everyone in the sample who answered the self-report survey
n <- sum(sample$in_self_survey)
bycondition[['pct_ehr']] <- bycondition[['ehr']] / n
bycondition[['pct_sr']] <- bycondition[['self']] / n
bycondition[['pct_ehr_sr']] <- bycondition[['ehr_sr']] / n
# remainder after the three mutually exclusive categories above
bycondition[['pct_none']] <- 1 - (bycondition[['pct_ehr']] + bycondition[['pct_sr']] + bycondition[['pct_ehr_sr']])
bycondition

In [None]:
# Save the per-condition counts and shares to the bucket's /output/ folder
write_csv(bycondition, 'diag_reliability_plot.csv',"/output/")

In [None]:
# repeat by access group 
# Counts per condition and access group (noaccess). .groups = "drop" returns an
# ungrouped table instead of one still grouped by condition.
bycondition<-merged %>% group_by(condition, noaccess) %>% summarise(count=sum(condition_present),
                                                                    self=sum(self), 
                                                                    self_ehr=sum(ehr_sr), 
                                                                    ehr_before=sum(ehr_before),
                                                                    self_ehr_before=sum(ehr_before_sr),
                                                                    error=sum(ehr != self),
                                                                    .groups="drop")

# attach the number of self-report survey respondents in each access group (column n)
n<-sample %>% group_by(noaccess) %>% summarise(n=sum(in_self_survey))
bycondition<-merge(bycondition, n, by='noaccess')
# share of self-reported conditions that have no EHR record
bycondition$pct_missing_ehr<-1-bycondition$self_ehr/bycondition$self
# share of ehr_before conditions that were not self-reported
bycondition$pct_missing_sr<-1-bycondition$self_ehr_before/bycondition$ehr_before
write_csv(bycondition, 'diag_reliability_by_access_plot.csv',"/output/")

# Dataset for EHR reliability plot

For each condition group, find the missing-diagnosis rate, i.e., among people who self-reported the condition, the share with no EHR record (ehr = 0).

- Condition
- Neither
- Cant
- Delay 
- All 
- pval

In [None]:
# Two-sided two-sample proportion test for a single condition.
#
# df:   data with columns `condition`, `neither` (the grouping flag) and `num`
#       (0/1 outcome); every row counts once towards its group's denominator
# cond: condition value to test
#
# Returns the p-value of prop.test() comparing sum(num) / row count between the
# two `neither` groups, or NA when the condition does not have exactly two
# usable groups. Without that guard a single group would make prop.test() run a
# one-sample test against p = 0.5, and an empty selection would raise an error.
get_pval<-function(df, cond){

    # rows with a missing condition or group flag cannot be assigned to a group
    keep<-!is.na(df$condition) & df$condition == cond & !is.na(df$neither)
    temp_df<-df[keep, , drop=FALSE]
    if (nrow(temp_df) == 0) {
        return(NA_real_)
    }

    # per-group successes (x) and trials (n), ordered by the value of `neither`
    x<-tapply(temp_df$num, temp_df$neither, sum)
    n<-tapply(temp_df$num, temp_df$neither, length)
    usable<-!is.na(x) & !is.na(n)
    if (sum(usable) != 2) {
        return(NA_real_)
    }

    prop_test<-prop.test(as.vector(x[usable]), as.vector(n[usable]),
                         alternative="two.sided", correct=TRUE)
    prop_test$p.value

}

# Reload the saved person-by-condition table.
merged <- load_data('ehr_reliability_data.csv', '/data/')
# From here on `neither` means "neither access indicator is set":
# 1 when afford_ind and delayed_ind are both 0, otherwise 0.
merged$neither <- as.numeric(merged$afford_ind == 0 & merged$delayed_ind == 0)

## reliability among diabetes task sample

Look at reliability across all conditions for standard vs low access group

In [None]:
# Restrict to the diabetes prediction sample: keep only its person ids, flag
# them, and inner join with the reliability data.
diab <- load_data('prediction_data_updt.csv', "/diabetes/")
diab <- diab['person_id']
diab$in_diab <- 1
diab_merge <- merge(x = merged, y = diab, by = 'person_id')

In [None]:
# denom: rows with a self report; num: self-reported rows that have no EHR record
diab_merge$denom <- as.numeric(diab_merge$self == 1)
diab_merge$num <- as.numeric(diab_merge$self == 1 & diab_merge$ehr == 0)

# missed-EHR rate where neither access indicator is set (neither == 1) ...
neither <- summarise(diab_merge[diab_merge$neither == 1, ], neither = sum(num) / sum(denom))
# ... and where at least one is set (neither == 0)
low <- summarise(diab_merge[diab_merge$neither == 0, ], neither = sum(num) / sum(denom))

# reliability is reported as one minus the missed rate
print(paste0("ehr reliability for diab sample standard: ", 1 - neither))
print(paste0("ehr reliability for diab sample low: ", 1 - low))

In [None]:
# Compare the missed-EHR proportion between the two access groups,
# using only the rows that count towards the denominator.
sub <- diab_merge[diab_merge$denom == 1, ]
counts <- summarise(group_by(sub, neither), x = sum(num), n = n())
prop_test <- prop.test(
    x = counts$x,
    n = counts$n,
    alternative = "two.sided",
    correct = TRUE
)
prop_test

## create EHR reliability estimates

This calculates the EHR missingness rate; reliability is 1 minus this value.

In [None]:
# define numerator and denominator for ehr 
# denom: rows with a self report; num: self-reported rows that have no EHR record
merged$denom<-ifelse(merged$self == 1, 1, 0)
merged$num<-ifelse(merged$self == 1 & merged$ehr == 0, 1, 0)

# Missed-EHR rate (sum(num) / sum(denom)) per condition; the rate column is named `rate_name`.
missed_ehr_rate<-function(df, rate_name){
    rates<-df %>% group_by(condition) %>% summarise(rate=sum(num)/sum(denom))
    names(rates)[names(rates) == 'rate']<-rate_name
    rates
}

# calculate missed ehr rates for each condition and create one dataset 
all<-missed_ehr_rate(merged, 'all')
cant<-missed_ehr_rate(merged[merged$afford_ind == 1,], 'afford')
delay<-missed_ehr_rate(merged[merged$delayed_ind == 1,], 'delay')
neither<-missed_ehr_rate(merged[merged$neither == 1,], 'neither')
combined<-merge(all, cant, by='condition')
combined<-merge(combined, delay, by='condition')
combined<-merge(combined, neither, by='condition')

# subset to rows where denom criteria is met
subset<-merged[merged$denom == 1,]

# create datasets for comparison: each low-access group together with the neither group
delay<-subset[subset$delayed_ind == 1 | subset$neither == 1,]
afford<-subset[subset$afford_ind == 1 | subset$neither == 1,]

# delay vs neither and afford vs neither comparisons, one test per condition
combined$pval_delay<-NA
combined$pval_afford<-NA
condition_list<-unique(merged$condition)
for (cond in condition_list){
    pdelay<-get_pval(delay, cond)
    pafford<-get_pval(afford, cond)
 
    combined$pval_delay<-ifelse(combined$condition == cond, pdelay, combined$pval_delay)
    combined$pval_afford<-ifelse(combined$condition == cond, pafford, combined$pval_afford)
  
}
# Benjamini-Hochberg correction across conditions, stored next to the raw
# p-values in the same way the self-report table stores pval_low_bh
combined$pval_delay_bh<-p.adjust(combined$pval_delay, method='BH')
combined$pval_afford_bh<-p.adjust(combined$pval_afford, method='BH')
write_csv(combined, 'ehr_reliability_plot.csv',"/output/")

## create self-report reliability measure

In [None]:
# Numerator and denominator for self-report reliability: among rows with
# ehr_before == 1, the share that were also self-reported.
merged$denom <- as.numeric(merged$ehr_before == 1)
merged$num <- as.numeric(merged$self == 1 & merged$ehr_before == 1)

# Rate per condition: overall, for the low-access group (neither == 0) and for
# the standard group (neither == 1; its rate column keeps the name `neither`)
all <- merged %>%
    group_by(condition) %>%
    summarise(all = sum(num) / sum(denom))
lowaccess <- merged[merged$neither == 0, ] %>%
    group_by(condition) %>%
    summarise(lowaccess = sum(num) / sum(denom))
standard <- merged[merged$neither == 1, ] %>%
    group_by(condition) %>%
    summarise(neither = sum(num) / sum(denom))
combined <- Reduce(
    function(left, right) merge(left, right, by = 'condition'),
    list(all, lowaccess, standard)
)

# only rows that count towards the denominator enter the tests
subset <- merged[merged$denom == 1, ]

# low access vs standard comparison per condition, then Benjamini-Hochberg correction
combined$pval_low <- NA
condition_list <- unique(merged$condition)
for (cond in condition_list) {
    is_cond <- combined$condition == cond
    combined$pval_low <- ifelse(is_cond, get_pval(subset, cond), combined$pval_low)
}
combined$pval_low_bh <- p.adjust(combined$pval_low, method = 'BH')
# 1 when the low-access rate exceeds the standard rate
combined$flag <- as.numeric(combined$lowaccess > combined$neither)
write_csv(combined, 'self_reliability_plot.csv', "/output/")

## Stratify by Visit

In [None]:
# 1 when either the pre or the post visit flag is set; missing flags count as 0
merged$ehr_visit <- with(
    merged,
    as.numeric(coalesce(ehr_visit_pre, 0) | coalesce(ehr_visit_post, 0))
)
# total of the pre and post visit counts; missing counts count as 0
merged$ehr_visit_counts <- with(
    merged,
    coalesce(ehr_visit_pre_count, 0) + coalesce(ehr_visit_post_count, 0)
)

In [None]:
# Quick look at the distribution (and NA count) of the raw pre-period visit count
summary(merged$ehr_visit_pre_count)

In [None]:
# sum(num) / sum(denom) per condition and visit flag, overall and within each
# access group; num and denom are whatever was last stored on merged
all <- merged %>%
    group_by(condition, ehr_visit) %>%
    summarise(all = sum(num) / sum(denom))
cant <- merged[merged$afford_ind == 1, ] %>%
    group_by(condition, ehr_visit) %>%
    summarise(afford = sum(num) / sum(denom))
delay <- merged[merged$delayed_ind == 1, ] %>%
    group_by(condition, ehr_visit) %>%
    summarise(delay = sum(num) / sum(denom))
neither <- merged[merged$neither == 1, ] %>%
    group_by(condition, ehr_visit) %>%
    summarise(neither = sum(num) / sum(denom))

In [None]:
# Inner join the four rate tables on condition and visit flag, left to right
visit_keys <- c('condition', 'ehr_visit')
combined <- Reduce(
    function(left, right) merge(left, right, by = visit_keys),
    list(all, cant, delay, neither)
)

In [None]:
# Preview the first rows of the joined rate table
head(combined)

In [None]:
library(data.table)
# Reshape to long format: one row per condition, visit flag and rate column.
# data.table's melt() is written for data.table input; handing it a plain
# data.frame relies on a deprecated fallback to reshape2. Convert explicitly,
# then return to a data.frame so the following cells see the same kind of object.
long<-as.data.frame(melt(as.data.table(combined), id.vars=c('condition','ehr_visit')))
head(long)

In [None]:
# Keep the per-group rates (everything except 'all') for rows with ehr_visit == 1
keep_rows <- long$variable != 'all' & long$ehr_visit == 1
long_sub <- long[keep_rows, ]

# One point per condition and group, coloured by group
ggplot(data = long_sub, mapping = aes(x = value, y = condition, color = variable)) +
    geom_point() +
    facet_wrap(~ehr_visit) +
    theme_bw()

# Summarize EHR reliability across conditions

- average conditions missing by group
- how likelihood of EHR condition changes by group if self-reported

In [None]:
names(merged)
# Fill missing visit flags with 0, then derive the combined flag and the total count
merged <- merged %>%
    mutate(
        ehr_visit_pre = coalesce(ehr_visit_pre, 0),
        ehr_visit_post = coalesce(ehr_visit_post, 0),
        # 1 when either filled flag is set
        ehr_visit = as.numeric(ehr_visit_pre | ehr_visit_post),
        # missing counts are treated as 0
        ehr_visit_count = coalesce(ehr_visit_pre_count, 0) + coalesce(ehr_visit_post_count, 0)
    )

In [None]:
# average
# Restrict to person-condition rows that were self-reported (self == 1)
has_self<-merged[merged$self == 1,]

In [None]:
# Per person (with their access indicators): number of self-reported conditions
# and the share of them that carry the ehr / ehr_1_yr flag
has_self_person <- summarise(
    group_by(has_self, person_id, afford_ind, delayed_ind),
    n_conditions = n(),
    ehr = mean(ehr),
    ehr_1yr = mean(ehr_1_yr)
)
head(has_self_person)

In [None]:
# Average the per-person shares: by both indicators, by afford_ind, by delayed_ind
summarise(group_by(has_self_person, afford_ind, delayed_ind), ehr = mean(ehr), ehr_1yr = mean(ehr_1yr))
summarise(group_by(has_self_person, afford_ind), ehr = mean(ehr), ehr_1yr = mean(ehr_1yr))
summarise(group_by(has_self_person, delayed_ind), ehr = mean(ehr), ehr_1yr = mean(ehr_1yr))

In [None]:
# Linear model of the ehr flag on the access group (neither) and condition,
# fitted on self-reported rows only
mod1<-lm(ehr ~ neither + condition, data=has_self)
summary(mod1)

In [None]:
# Same model with the any-visit flag (ehr_visit) added as a covariate
mod2<-lm(ehr ~ neither + ehr_visit + condition , data=has_self)
summary(mod2)

In [None]:
# Same model with the total visit count (ehr_visit_count) as the covariate instead
mod3<-lm(ehr ~ neither + ehr_visit_count + condition , data=has_self)
summary(mod3)