In [None]:
# Title: Diabetes Model
# Author: Anna Zink
# Date: May 28, 2024
# Description: Predict new diabetes incidence 

# Set Up

In [None]:
library(viridis)    # A nice color scheme for plots.
library(ggthemes)   # Common themes to change the look and feel of plots.
library(scales)     # Graphical scales map data to aesthetics in plots.
library(skimr)      # Better summaries of data.
library(lubridate)  # Date library from the tidyverse.
library(tidyverse)  # Data wrangling packages.
library(bigrquery)  # Data extraction from Google BigQuery
library(pROC)
library(ranger)
#library(xgboost)
library(glmnet)
library(cutpointr)
library(caret)
library(broom)
library(data.table)
library(matrixStats)
library(janitor)

# disable scientific notation (so you can read full ids)
options(scipen = 999)

## Helper Functions

In [None]:
# Copy a CSV from the workspace bucket into the working directory and read it.
#
# Args:
#   file:   name of the CSV inside the bucket folder, e.g. 'analysis_sample.csv'.
#   folder: bucket sub-folder with leading and trailing slash, e.g. "/ehr/".
#
# Returns: the tibble produced by readr::read_csv().
#
# Stops with an error when the gsutil copy reports a non-zero exit status, so
# that a stale local copy of `file` left by an earlier run is never read by
# mistake.
load_data <- function(file, folder) {
  my_bucket <- Sys.getenv("WORKSPACE_BUCKET")
  copy_log <- system(paste0("gsutil cp ", my_bucket, folder, file, " ."), intern = TRUE)

  # with intern = TRUE a failing command only attaches a "status" attribute
  copy_status <- attr(copy_log, "status")
  if (!is.null(copy_status) && copy_status != 0) {
    stop("gsutil cp failed for ", my_bucket, folder, file,
         " (exit status ", copy_status, ")", call. = FALSE)
  }

  read_csv(file, show_col_types = FALSE)
}

# Write a data frame to a local CSV and copy that file to the workspace bucket.
#
# NOTE: this definition masks readr::write_csv() for the rest of the session;
# the local file itself is written with readr::write_excel_csv().
#
# Args:
#   df:     data frame to save.
#   fn:     destination file name, e.g. 'diabetes_sample.csv'.
#   folder: bucket sub-folder with leading and trailing slash, e.g. "/ehr/".
#
# Returns: the captured gsutil output, invisibly.
#
# Stops with an error when the gsutil copy reports a non-zero exit status
# instead of continuing as if the upload had succeeded.
write_csv <- function(df, fn, folder) {
  write_excel_csv(df, fn)

  my_bucket <- Sys.getenv("WORKSPACE_BUCKET")
  copy_log <- system(paste0("gsutil cp ./", fn, " ", my_bucket, folder), intern = TRUE)

  # with intern = TRUE a failing command only attaches a "status" attribute
  copy_status <- attr(copy_log, "status")
  if (!is.null(copy_status) && copy_status != 0) {
    stop("gsutil cp failed for ", fn, " -> ", my_bucket, folder,
         " (exit status ", copy_status, ")", call. = FALSE)
  }

  invisible(copy_log)
}

# Get the best classification cutoff from training labels and predicted risks.
#
# Args:
#   label: observed outcome for each row.
#   preds: predicted risk for each row; a vector or an n x 1 matrix such as
#          the one returned by predict() on a cv.glmnet fit.
#
# Returns: a single numeric threshold (pROC's "best" point on the ROC curve).
get_cutoff <- function(label, preds) {
  # pROC expects a plain numeric vector, not a one-column matrix
  scores <- as.numeric(preds)

  # direction = "<" fixes "higher score = case", which is how callers apply
  # the cutoff (score >= cutoff -> 1); the automatic direction could flip it
  roc <- pROC::roc(label, scores, direction = "<", quiet = TRUE)
  msrs <- pROC::coords(roc, x = "best", input = "threshold", ret = c("threshold"))

  # several thresholds can tie for "best"; return one so the result is scalar
  msrs[["threshold"]][1]
}


# Summarise discrimination and classification performance for one model/group.
#
# Args:
#   label:       observed outcome coded 0/1.
#   preds:       predicted risk (vector or n x 1 matrix).
#   pred_label:  predicted class coded 0/1.
#   description: free-text label stored with the measures.
#
# Returns: a named vector with auc, auc_lb, auc_ub, desc, n, sensitivity,
#   specificity, precision, accuracy and f1_score. Because `desc` is stored in
#   the same vector, every element is returned as a character string.
#   Ratios with a zero denominator come back as NaN.
get_msrs <- function(label, preds, pred_label, description) {

  msrs <- c()
  # pROC expects a numeric vector; direction = "<" fixes "higher score = case"
  # so a model that ranks in the wrong direction is not reported as AUC > 0.5
  roc <- pROC::roc(label, as.numeric(preds), direction = "<", quiet = TRUE)
  auc <- pROC::auc(roc)
  auc_ci <- pROC::ci(auc)

  msrs['auc'] <- auc
  msrs['auc_lb'] <- auc_ci[1]
  msrs['auc_ub'] <- auc_ci[3]
  msrs['desc'] <- description
  msrs['n'] <- length(label)

  # fix both factors to levels 0/1 so the table is always 2 x 2, even when
  # only one class is predicted or observed
  conf_matrix <- table(
    Predicted = factor(pred_label, levels = c(0, 1)),
    Actual = factor(label, levels = c(0, 1))
  )
  TP <- conf_matrix["1", "1"]
  TN <- conf_matrix["0", "0"]
  FP <- conf_matrix["1", "0"]
  FN <- conf_matrix["0", "1"]

  sensitivity <- TP / (TP + FN)
  specificity <- TN / (TN + FP)
  precision <- TP / (TP + FP)
  accuracy <- (TP + TN) / sum(conf_matrix)
  f1_score <- 2 * ((precision * sensitivity) / (precision + sensitivity))

  msrs['sensitivity'] <- as.numeric(sensitivity)
  msrs['specificity'] <- as.numeric(specificity)
  msrs['precision'] <- as.numeric(precision)
  msrs['accuracy'] <- as.numeric(accuracy)
  msrs['f1_score'] <- as.numeric(f1_score)

  msrs
}


# Create analytic dataset

Predict whether someone develops type 2 diabetes. Use the date they completed the health survey (or the access survey if that is missing) as the index date. Keep patients with an index date in 2018-2021 (2022 is excluded because those patients won't have a long enough follow-up window).

Exclude members with evidence of diabetes as of the index date based on EHR conditions (all years) and lab measures (2016 on).

##  Create sample and outcomes
Start with everyone in the sample and the date of the self-report (if it exists).

In [None]:
# load data
sample <- load_data('analysis_sample.csv', "/analysis/")

# date of the self-reported health survey: earliest survey date per person
self_date <- load_data('self_reported_health_byperson.csv', "/survey/")
dups <- self_date %>%
  group_by(person_id) %>%
  summarize(n = n(), self_date = min(survey_date))

sample <- merge(sample, dups[, c('person_id', 'self_date')], all.x = TRUE)

# index date = self-report survey date, falling back to maxdate of access.
# coalesce() keeps the Date class; ifelse() would return bare day counts that
# as.Date() cannot convert without an origin on R < 4.3.
sample$index_date <- dplyr::coalesce(as.Date(sample$self_date), as.Date(sample$maxdate))

# find observation period: days from the index date to the end of the data
# NOTE(review): an earlier comment said the data end on 07/01/22 while the code
# measures follow-up to 2023-10-01; confirm the CDR cutoff date.
# https://support.researchallofus.org/hc/en-us/articles/360033200232-Data-Dictionaries-for-the-Curated-Data-Repositories-CDRs
sample$obs_window <- as.numeric(difftime(as.Date("2023-10-1"), sample$index_date, units = "days"))

# keep people with at least 2-year window; which() drops people whose window
# is NA instead of turning them into all-NA rows
sample_sub <- sample[which(sample$obs_window >= 730), ]

### Load diabetes diagnosis and lab dates

In [None]:
### Build one long table of diabetes evidence (diagnoses + lab readings)
diab_dgn <- load_data("diabetes_dgn_dates.csv", "/diabetes/")
diab_lab <- load_data("hba1c_glucose_readings.csv", "/ehr/")

# reshape the diagnosis rows to the lab layout so the two can be stacked:
# the condition start becomes the measurement time and every row is positive
keepvars <- c('person_id', 'measurement_datetime', 'value_updt', 'diabetes', 'source')
diab_dgn <- diab_dgn %>%
  mutate(
    measurement_datetime = condition_start_datetime,
    value_updt = 1,
    diabetes = 1,
    source = 'Diagnosis'
  ) %>%
  select(all_of(keepvars))

# stack diagnoses on top of labs and derive the calendar date of each record
diab_tests <- rbind(diab_dgn, diab_lab)
diab_tests$diab_date <- as.Date(diab_tests$measurement_datetime)

In [None]:
# flag anyone with a lab in the pre-period
has_labs <- merge(sample_sub, diab_lab, by = 'person_id')

# convert to a calendar date first so both sides of the comparison are Dates;
# comparing a date-time column directly with a Date is not reliable
has_labs$has_labs <- ifelse(as.Date(has_labs$measurement_datetime) < has_labs$index_date, 1, 0)

# one row per person: 1 when any lab precedes the index date (NAs ignored)
has_labs <- has_labs %>%
  group_by(person_id) %>%
  summarise(has_labs = as.numeric(any(has_labs == 1, na.rm = TRUE)))
has_labs <- has_labs[which(has_labs$has_labs == 1), ]

### Exclude anyone with a diabetes diagnosis

- at least 1 type 2 diagnosis
- at least 1 hba1c or fasting glucose measures above clinical criteria
- at least 2 random glucose measures above clinical criteria

In [None]:
# subset to positive values
diab_dates <- diab_tests[!is.na(diab_tests$diabetes) & diab_tests$diabetes == 1, ]

# number each person's positive records within a source, oldest first
diab_dates <- diab_dates %>%
  arrange(person_id, source, diab_date) %>%
  group_by(person_id, source) %>%
  mutate(rank = row_number()) %>%
  ungroup()

# qualifying record: the first diagnosis / HbA1c / fasting glucose, but the
# SECOND positive random glucose, because a single random reading needs a
# confirmatory test (criteria: at least 2 random glucose measures)
single_sources <- c("Diagnosis", "fasting glucose", "HbA1c")
diab_dates$qual_diag_flag <- ifelse(
  (diab_dates$source %in% single_sources & diab_dates$rank == 1) |
    (diab_dates$source %in% "random glucose" & diab_dates$rank == 2),
  1, 0
)

# subset to qualifying records and keep exactly one (the earliest) per person;
# with_ties = FALSE prevents same-day records from duplicating a person
diab_dgn_dates <- diab_dates[diab_dates$qual_diag_flag == 1, ]
diab_dgn_dates <- diab_dgn_dates %>%
  group_by(person_id) %>%
  slice_min(order_by = diab_date, n = 1, with_ties = FALSE) %>%
  ungroup()

# subset to people in the sample
merged <- merge(sample_sub, diab_dgn_dates, by = 'person_id', all.x = TRUE)

# flag people with evidence of diabetes before the index date
merged$pre <- ifelse(is.na(merged$diab_date), 0,
              ifelse(merged$diab_date < merged$index_date, 1, 0))

In [None]:
# sample size before and after removing people flagged with diabetes evidence
# before their index date (pre == 1)
dim(merged)
no_diabetes<-merged[merged$pre == 0, ]
dim(no_diabetes)

In [None]:
# outcome: days from the index date to the qualifying diabetes record
no_diabetes$time_to_dgn <- as.numeric(
  difftime(no_diabetes$diab_date, no_diabetes$index_date, units = "days")
)

# y = 1 when that record falls within 730 days; people without a qualifying
# record (missing time_to_dgn) are y = 0
no_diabetes$y <- ifelse(
  !is.na(no_diabetes$time_to_dgn) & no_diabetes$time_to_dgn < 730,
  1, 0
)

# save the base sample for the feature-building steps
keepvars <- c('person_id', 'index_date', 'diab_date', 'time_to_dgn', 'y')
write_csv(no_diabetes[, keepvars], 'diabetes_sample.csv', "/diabetes/")

## Create predictors

Include time component by flagging things that occur in the last 3 months, last year, last 2 years

- Conditions occurring in at least 1% of the sample
- Measures 
- Drugs

In [None]:
# load sample
sample <- load_data('diabetes_sample.csv', "/diabetes/")
# create beginning period for predictors (2 year lookback window).
# %m-% rolls back to the last valid day of the month, so a Feb 29 index date
# gives Feb 28 two years earlier instead of NA.
sample$start_window <- sample$index_date %m-% lubridate::years(2)
head(sample)

### Conditions

In [None]:
# pull in condition data from 2016-2021, keeping only rows that fall inside
# each person's lookback window [start_window, index_date)
keepconds <- c('PERSON_ID', 'CONDITION_NAME', 'CONDITION_START_DATE')
conds_by_year <- lapply(seq(2016, 2021), function(yr) {
  conds <- load_data(paste0('conditions_', yr, '.csv'), "/ehr/")
  sample_conds <- merge(conds[, keepconds], sample, by.x = c('PERSON_ID'), by.y = 'person_id')
  # filter() drops rows whose comparison is NA; logical `[` indexing would
  # keep them as all-NA rows
  dplyr::filter(
    sample_conds,
    CONDITION_START_DATE >= start_window,
    CONDITION_START_DATE < index_date
  )
})
# bind once at the end instead of growing the table inside the loop
ehr_conditions <- dplyr::bind_rows(conds_by_year)
dim(ehr_conditions)

In [None]:
# one row per person x condition; nenc counts that person's records
ehr_conditions <- summarize(
  group_by(ehr_conditions, PERSON_ID, CONDITION_NAME),
  nenc = n()
)

# per condition: number of person-level rows (nobs) and distinct people (n)
conditions <- ehr_conditions %>%
  group_by(CONDITION_NAME) %>%
  summarize(nobs = n(), n = n_distinct(PERSON_ID)) %>%
  arrange(desc(n))

# share of people with each condition; the denominator is everyone with at
# least one condition row in the lookback window
npeople <- n_distinct(ehr_conditions$PERSON_ID)
conditions$pct <- conditions$n / npeople

# keep conditions seen in at least 1% of those people and build a column-safe
# short name (spaces -> "_", other non-alphanumerics removed)
subconds <- conditions[conditions$pct >= .01, ]
subconds$cond_short <- gsub("[^[:alnum:]_]", "", gsub(" ", "_", subconds$CONDITION_NAME))

ehr <- merge(ehr_conditions, subconds, by = 'CONDITION_NAME')

In [None]:
# create a set of wide conditions: one 0/1 column per retained condition,
# named dx_<cond_short>
conditions <- subconds$cond_short
# iterate over the values directly; 1:length(conditions) would run for
# i = 1 and i = 0 when no condition passes the prevalence filter
for (condition in conditions) {
  ehr[, paste0('dx_', condition)] <- ifelse(ehr$cond_short == condition, 1, 0)
}

In [None]:
# keep the person id plus every column whose name contains 'dx_'
vars <- names(ehr)
dxvars <- grep('dx_', vars, value = TRUE)
keepvars <- c('PERSON_ID', dxvars)
ehr <- ehr[, keepvars]
dim(ehr)

In [None]:
# Collapse to one row per person with data.table (more efficient here)
dt <- data.table::as.data.table(ehr)
numeric_cols <- names(dt)[vapply(dt, is.numeric, logical(1))]
# person-level maximum of every numeric column, so a dx_ flag is 1 when any
# of the person's rows has it set
byperson <- dt[, lapply(.SD, max, na.rm = TRUE), by = PERSON_ID, .SDcols = numeric_cols]

In [None]:
# the aggregation creates a duplicate PERSON_ID column; drop repeated column
# names by name rather than by position, so a dx_ column can never be removed
# by accident, then rename
byperson <- byperson[, which(!duplicated(names(byperson))), with = FALSE]
byperson <- byperson %>% rename(person_id = PERSON_ID)

In [None]:
# save the person-level dx_ indicators to the /diabetes/ bucket folder
write_csv(byperson, 'diabetes_dx.csv', "/diabetes/")

### Measures

In [None]:
# pull in measures from 2016-2021, keeping only rows that fall inside each
# person's lookback window [start_window, index_date)
msrs_by_year <- lapply(seq(2016, 2021), function(yr) {
  msrs <- load_data(paste0('msrs_features_', yr, '.csv'), "/ehr/")
  print(yr)
  msrs <- msrs[!is.na(msrs$MEASUREMENT_DATE), ]
  sample_msrs <- merge(msrs, sample, by.x = c('PERSON_ID'), by.y = 'person_id')
  # filter() drops rows whose comparison is NA; logical `[` indexing would
  # keep them as all-NA rows
  dplyr::filter(
    sample_msrs,
    MEASUREMENT_DATE >= start_window,
    MEASUREMENT_DATE < index_date
  )
})
# bind once at the end instead of growing the table inside the loop
ehr_msrs <- dplyr::bind_rows(msrs_by_year)

In [None]:
# hr, rr, bp
# pull in the hr feature files from 2016-2021, keeping only rows that fall
# inside each person's lookback window [start_window, index_date)
hr_by_year <- lapply(seq(2016, 2021), function(yr) {
  msrs <- load_data(paste0('hr_features_', yr, '.csv'), "/ehr/")
  msrs <- msrs[!is.na(msrs$MEASUREMENT_DATE), ]
  sample_msrs <- merge(msrs, sample, by.x = c('PERSON_ID'), by.y = 'person_id')
  # filter() drops rows whose comparison is NA; logical `[` indexing would
  # keep them as all-NA rows
  dplyr::filter(
    sample_msrs,
    MEASUREMENT_DATE >= start_window,
    MEASUREMENT_DATE < index_date
  )
})
# bind once at the end instead of growing the table inside the loop
hr_msrs <- dplyr::bind_rows(hr_by_year)

In [None]:
# pull in the rr feature files from 2016-2021, keeping only rows that fall
# inside each person's lookback window [start_window, index_date)
rr_by_year <- lapply(seq(2016, 2021), function(yr) {
  msrs <- load_data(paste0('rr_features_', yr, '.csv'), "/ehr/")
  msrs <- msrs[!is.na(msrs$MEASUREMENT_DATE), ]
  sample_msrs <- merge(msrs, sample, by.x = c('PERSON_ID'), by.y = 'person_id')
  # filter() drops rows whose comparison is NA; logical `[` indexing would
  # keep them as all-NA rows
  dplyr::filter(
    sample_msrs,
    MEASUREMENT_DATE >= start_window,
    MEASUREMENT_DATE < index_date
  )
})
# bind once at the end instead of growing the table inside the loop
rr_msrs <- dplyr::bind_rows(rr_by_year)

In [None]:
# pull in the bp feature files from 2016-2021, keeping only rows that fall
# inside each person's lookback window [start_window, index_date)
bp_by_year <- lapply(seq(2016, 2021), function(yr) {
  msrs <- load_data(paste0('bp_features_', yr, '.csv'), "/ehr/")
  msrs <- msrs[!is.na(msrs$MEASUREMENT_DATE), ]
  sample_msrs <- merge(msrs, sample, by.x = c('PERSON_ID'), by.y = 'person_id')
  # filter() drops rows whose comparison is NA; logical `[` indexing would
  # keep them as all-NA rows
  dplyr::filter(
    sample_msrs,
    MEASUREMENT_DATE >= start_window,
    MEASUREMENT_DATE < index_date
  )
})
# bind once at the end instead of growing the table inside the loop
bp_msrs <- dplyr::bind_rows(bp_by_year)

In [None]:
# stack the four measure tables; bind_rows() fills columns that a table does
# not have with NA
ehr_msrs <- bind_rows(list(ehr_msrs, hr_msrs, rr_msrs, bp_msrs))

# drop rows without a person id, then set every remaining missing value to zero
ehr_msrs <- ehr_msrs[!is.na(ehr_msrs$PERSON_ID), ]
ehr_msrs[is.na(ehr_msrs)] <- 0

In [None]:
# measure flags carried into the model, plus the person id
keepvars <- c(
  'PERSON_ID', 'obesity', 'hypocalcemia', 'hypercalcemia',
  'hypochloremia', 'hyperchloremia', 'creatine', 'high_blood_pressure',
  'prediabetes_gluc_fast', 'prediabetes_hba1c', 'triglyceride_high',
  'tachycardia', 'anemia', 'high_hemo', 'hypoxemia', 'hyperkalemia',
  'hypokalemia', 'tachypneic', 'bradypnea', 'hypernatremia', 'hyponatremia',
  'low_urea', 'high_urea'
)

# one row per person holding the maximum of each flag over the lookback window
ehr_msrs_updt <- ehr_msrs %>%
  select(all_of(keepvars)) %>%
  group_by(PERSON_ID) %>%
  summarize(across(everything(), ~ max(.x, na.rm = TRUE))) %>%
  rename(person_id = PERSON_ID)

In [None]:
# save the person-level measure flags to the /diabetes/ bucket folder
write_csv(ehr_msrs_updt, 'diabetes_msrs.csv', "/diabetes/")

### Drugs

create indicator for ATC_2ND classification which is approx. 92 classes

In [None]:
 # ATC lookup table; it is joined to the drug exposures on DRUG_CONCEPT_ID
 # and supplies the ATC_2nd class used for the drug indicators
 atc<-load_data('atc_classes.csv',"/ehr/")

In [None]:
# pull in drug exposures from 2016-2021 and count them per person and ATC
# class, keeping only exposures that start inside each person's lookback
# window [start_window, index_date)
drug_cols <- c('PERSON_ID', 'DRUG_CONCEPT_ID', 'DRUG_EXPOSURE_START_DATE')
drugs_by_year <- lapply(seq(2016, 2021), function(yr) {
  drugs <- load_data(paste0('drugs_', yr, '.csv'), "/ehr/")
  drugs <- drugs[!is.na(drugs$DRUG_EXPOSURE_START_DATE), drug_cols]
  sample_drugs <- merge(drugs, sample, by.x = c('PERSON_ID'), by.y = 'person_id')
  # filter() drops rows whose comparison is NA; logical `[` indexing would
  # keep them as all-NA rows
  sample_drugs <- dplyr::filter(
    sample_drugs,
    DRUG_EXPOSURE_START_DATE >= start_window,
    DRUG_EXPOSURE_START_DATE < index_date
  )

  # summarise n drugs by class (inner join: drugs without an ATC class drop out)
  sample_drugs <- merge(sample_drugs, atc, by = 'DRUG_CONCEPT_ID')
  sample_drugs %>%
    select(PERSON_ID, ATC_2nd) %>%
    group_by(PERSON_ID, ATC_2nd) %>%
    summarize(n = n(), .groups = "drop")
})
# bind once at the end instead of growing the table inside the loop
ehr_drugs <- dplyr::bind_rows(drugs_by_year)

In [None]:
# total the yearly counts into one row per person and ATC class
# (the ATC join, which drops drugs without a class, already happened when the
# yearly counts were built; this step only sums them)
classes <- ehr_drugs %>% group_by(PERSON_ID, ATC_2nd) %>% summarize(n=sum(n))

In [None]:
# 0/1 indicator for any exposure to the class; person/class pairs without an
# exposure have no row here and are filled with 0 when the table is widened
classes$any<-ifelse(classes$n>0, 1,0)

In [None]:
# one row per person with a 0/1 column per ATC class, named atc_<class>
classes_wide <- classes %>%
  ungroup() %>%
  mutate(
    atc = str_replace_all(ATC_2nd, " ", "_"),  # replace spaces with _
    # build the column name from the sanitized value so it contains no spaces
    # (the raw ATC_2nd was used before, leaving `atc` unused)
    atc2 = paste0("atc_", atc)                 # add prefix
  ) %>%
  select(PERSON_ID, atc2, any) %>%
  pivot_wider(
    names_from = atc2,
    values_from = any,
    values_fill = 0
  )

In [None]:
# use the lower-case id name that the other feature files share
classes_wide <- classes_wide %>% rename(person_id = PERSON_ID)

In [None]:
# drop atc classes present for fewer than 1% of the people in classes_wide
# (i.e. people with at least one classified drug in the lookback window)
# NOTE(review): colMeans() also averages person_id; presumably that mean is far
# above 0.01 so the id column is never dropped - confirm.
means <- colMeans(classes_wide, na.rm = TRUE)
low_cols <- names(means)[means < 0.01]
classes_wide <- select(classes_wide, -all_of(low_cols))

In [None]:
# save the person-level ATC class indicators to the /diabetes/ bucket folder
write_csv(classes_wide, 'diabetes_drugs.csv', "/diabetes/")

### SES data

- insurance indicators
- household income indicators
- ACS zip-code SES metrics

In [None]:
# load in demographics and census level data, restricted to the sample
all <- load_data('all_participant_demo.csv', "/data/")
ses <- clean_names(merge(all, sample, by = 'person_id'))

ses <- ses %>%
  mutate(
    # insurance indicators (ins_medicare and ins_medicaid are used as loaded)
    ins_uninsured = ifelse(ins_none == 1 | ins_indian == 1 | anyins_no == 1, 1, 0),
    ins_employ = ins_employer_or_union,
    ins_unknown = ifelse(ins_pmi_skip == 1 | ins_invalid == 1, 1, 0),
    ins_other = ifelse(ins_va == 1 | ins_military == 1 |
                         ins_purchased == 1 | ins_other_health_plan == 1, 1, 0),

    # household income bands
    inc_10 = inc_less_10k,
    inc_10_49 = ifelse(inc_10k_25k == 1 | inc_25k_35k == 1 | inc_35k_50k == 1, 1, 0),
    inc_50_99 = ifelse(inc_50k_75k == 1 | inc_75k_100k == 1, 1, 0),
    inc_100_199 = ifelse(inc_100k_150k == 1 | inc_150k_200k == 1, 1, 0),
    inc_200 = inc_more_200k,
    inc_unknown = inc_skip_unknown,

    # ACS measures, with missing values replaced by the sample mean
    zip_poverty = ifelse(is.na(poverty), mean(poverty, na.rm = TRUE), poverty),
    zip_vac_housing = ifelse(is.na(vacant_housing),
                             mean(vacant_housing, na.rm = TRUE), vacant_housing),
    zip_dep_index = ifelse(is.na(deprivation_index),
                           mean(deprivation_index, na.rm = TRUE), deprivation_index),
    zip_hs = ifelse(is.na(high_school_education),
                    mean(high_school_education, na.rm = TRUE), high_school_education)
  )

# variables to keep (inc_unknown is derived above but not written out)
ins_vars <- c('ins_uninsured', 'ins_medicare', 'ins_medicaid', 'ins_employ', 'ins_unknown', 'ins_other')
inc_vars <- c('inc_10', 'inc_10_49', 'inc_50_99', 'inc_100_199', 'inc_200')
zip_vars <- c('zip_poverty', 'zip_vac_housing', 'zip_dep_index', 'zip_hs')
ses_vars <- c(ins_vars, inc_vars, zip_vars)

write_csv(ses[, c('person_id', ses_vars)], "diabetes_ses.csv", "/diabetes/")

### Visits data 

Flag people who have had a visit in the pre-period and/or visit in the observation window. 

In [None]:
# restrict the visit table to people in the prediction sample
sample_ids <- unique(sample$person_id)
visits <- load_data('visits_16_23.csv', '/ehr/')
visits <- visits[visits$PERSON_ID %in% sample_ids, ]

In [None]:
# attach each person's lookback start and index date to their visits
visits_sub<-merge(visits, sample[,c('person_id','start_window','index_date')], by.x='PERSON_ID', by.y='person_id')

In [None]:
# outcome window ends two years after the index date; %m+% rolls an invalid
# date (Feb 29 + 2 years) back to the last valid day instead of returning NA
visits_sub$end_window <- visits_sub$index_date %m+% lubridate::years(2)

visit_end <- visits_sub$VISIT_END_DATE

# flag if visits fell in the predictor window [start_window, index_date)
in_predictor <- visit_end >= visits_sub$start_window & visit_end < visits_sub$index_date

# flag if visit fell in the outcome window [index_date, end_window)
in_outcome <- visit_end >= visits_sub$index_date & visit_end < visits_sub$end_window

# `%in% TRUE` maps an NA comparison (e.g. missing visit end date) to 0, so one
# incomplete visit cannot turn a person's max() flag into NA
visits_sub$predictor_flag <- ifelse(in_predictor %in% TRUE, 1, 0)
visits_sub$outcome_flag <- ifelse(in_outcome %in% TRUE, 1, 0)

In [None]:
# person-level flags: any visit in the predictor window / in the outcome window
visit_byperson <- summarise(
  group_by(visits_sub, PERSON_ID),
  visit_pred_flag = max(predictor_flag),
  visit_y_flag = max(outcome_flag)
)

In [None]:
# save the person-level visit flags to the /diabetes/ bucket folder
write_csv(visit_byperson, 'diabetes_visits.csv', "/diabetes/")

## Create analytic dataset

In [None]:
# load sample (base dataset): person_id, index_date, diab_date, time_to_dgn, y
sample<-load_data('diabetes_sample.csv',"/diabetes/")

In [None]:
# merge in access data (inner join: only people present in both tables remain)
access <- load_data('analysis_sample.csv', "/analysis/")
keepvars <- c('person_id', 'afford', 'delayed', 'afford_ind', 'delayed_ind')
access <- select(access, all_of(keepvars))
sample <- merge(sample, access, by = 'person_id')

In [None]:
# merge in self-reported conditions; every column gets a self_ prefix
self <- load_data('self_reported_health_byperson.csv', "/survey/")
names(self) <- paste0("self_", names(self))
sample <- merge(sample, self, by.x = 'person_id', by.y = 'self_person_id', all.x = TRUE)

# flag people who responded to the health survey; anyone not listed in the
# responder file gets self_missing = 1
in_self <- load_data('answered_self_reported_health.csv', "/survey/")
in_self$in_self <- 1
sample <- merge(sample, in_self, by = 'person_id', all.x = TRUE)
sample$self_missing <- ifelse(is.na(sample$in_self), 1, 0)

In [None]:
# merge in demographics; only the birth year is available, so the birth date
# is set to January 1 of that year
demo <- load_data('demographics.csv', "/survey/")
demo <- mutate(demo, birth_date = ymd(paste0(YEAR_OF_BIRTH, '-1-1')))
keepvars <- c('PERSON_ID', 'RACE', 'GENDER', 'ETHNICITY', 'birth_date')
sample <- merge(sample, demo[, keepvars], by.x = 'person_id', by.y = 'PERSON_ID', all.x = TRUE)

# age in whole years at the index date, plus a standardized copy
sample$AGE <- as.integer(
  interval(sample$birth_date, sample$index_date) / years(1)
)
sample$scaled_age <- as.numeric(scale(sample$AGE))

In [None]:
# add in clinical info: load every saved feature table
dx <- load_data('diabetes_dx.csv', "/diabetes/")
msrs <- load_data('diabetes_msrs.csv', "/diabetes/")
drugs <- load_data('diabetes_drugs.csv', "/diabetes/")
ses <- load_data('diabetes_ses.csv', "/diabetes/")
vis <- load_data('diabetes_visits.csv', "/diabetes/")

# left-join the tables keyed on person_id, in the order dx, msrs, drugs, ses
sample <- Reduce(
  function(left, right) merge(left, right, by = 'person_id', all.x = TRUE),
  list(dx, msrs, drugs, ses),
  sample
)
# the visit table still uses the upper-case id name
sample <- merge(sample, vis, by.x = 'person_id', by.y = 'PERSON_ID', all.x = TRUE)

In [None]:
# replace all missing values with zero
# NOTE(review): this applies to every column, not only the feature flags -
# e.g. diab_date / time_to_dgn of people without a diagnosis and any missing
# RACE / GENDER / ETHNICITY are zero-filled too; confirm that nothing
# downstream reads those zeros as real values.
sample[is.na(sample)] <- 0

In [None]:
# save the merged analytic dataset to the /diabetes/ bucket folder
write_csv(sample, 'analysis_data_updt.csv', "/diabetes/")

# Summarize Data

In [None]:
# reload the analytic dataset saved in the previous section
data<-load_data('analysis_data_updt.csv', "/diabetes/")

In [None]:
# derive 0/1 indicators for access, gender, race and ethnicity
race_unknown_values <- c('None Indicated', 'MPI: Skip', 'I prefer not to answer')
race_other_values <- c('More than one population',
                       'None of these',
                       'Middle Eastern or North African',
                       'Native Hawaiian or Other Pacific Islander')
ethnicity_known_values <- c("Hispanic or Latino", 'Not Hispanic or Latino')

data <- data %>%
  mutate(
    # access: noaccess = either barrier, low_access = both, hi_access = neither
    noaccess = as.numeric(afford_ind | delayed_ind),
    hi_access = as.numeric(afford_ind == 0 & delayed_ind == 0),
    low_access = as.numeric(afford_ind == 1 & delayed_ind == 1),

    # gender: anything other than Male / Female is gender_other
    male = as.numeric(GENDER == "Male"),
    fem = as.numeric(GENDER == "Female"),
    gender_other = as.numeric(!(fem | male)),

    # race
    white = as.numeric(RACE == "White"),
    black = as.numeric(RACE == "Black or African American"),
    race_unknown = as.numeric(RACE %in% race_unknown_values),
    asian = as.numeric(RACE == 'Asian'),
    ai_an = as.numeric(RACE == 'American Indian or Alaska Native'),
    race_other = as.numeric(RACE %in% race_other_values),

    # ethnicity: anything outside the two known values is ethnicity_unknown
    is_latino = as.numeric(ETHNICITY == "Hispanic or Latino"),
    not_hispanic = as.numeric(ETHNICITY == 'Not Hispanic or Latino'),
    ethnicity_unknown = as.numeric(!(ETHNICITY %in% ethnicity_known_values))
  )

In [None]:
# there are a few duplicate IDs (multiple survey dates) - keep one row each.
# Rows are ordered by person_id only, so the row kept for a duplicated id is
# simply the first one in the existing order.
data <- arrange(data, person_id)
data <- data[!duplicated(data$person_id), ]

In [None]:
# access by group: share of people with noaccess == 1 (either access barrier),
# split by the black indicator and then by the white indicator
data %>% group_by(black) %>% summarise(noaccess=mean(noaccess))
data %>% group_by(white) %>% summarise(noaccess=mean(noaccess))

In [None]:
# save the deduplicated dataset with derived indicators for the prediction steps
write_csv(data, 'prediction_data_updt.csv', "/diabetes/")

## Show predictive quality of features by access group

In [None]:
# load the prediction dataset
data<-load_data('prediction_data_updt.csv', "/diabetes/")

In [None]:
# linear model of the 0/1 outcome: does the association between obesity and
# the outcome differ by access, adjusting for age and female gender?
# (obesity*noaccess already expands to both main effects plus the interaction,
# so the extra `+ obesity` term is redundant)
mod1<-lm(y ~ obesity*noaccess + obesity + scaled_age + fem, data=data)
summary(mod1)

# Run Prediction

## Set up Data

In [None]:
# load the prediction dataset used by every model below
data<-load_data('prediction_data_updt.csv',"/diabetes/")

In [None]:
# outcome names
y <- 'y'
y_self <- 'type_2_diabetes'

# feature groups, picked up from the column names by prefix
vars <- names(data)
dgns <- grep('dx_', vars, value = TRUE)
meds <- grep('atc_', vars, value = TRUE)
msrs <- c(
  'obesity', 'hypocalcemia', 'hypercalcemia',
  'hypochloremia', 'hyperchloremia', 'creatine', 'high_blood_pressure', 'prediabetes_gluc_fast',
  'tachycardia', 'anemia', 'high_hemo', 'hypoxemia', 'hyperkalemia', 'prediabetes_hba1c', 'triglyceride_high',
  'hypokalemia', 'tachypneic', 'bradypnea', 'hypernatremia', 'hyponatremia', 'low_urea', 'high_urea'
)
# earlier demographic set: c('male','fem','white','black','is_latino','scaled_age')
demo <- c(
  'gender_other', 'fem', 'black', 'race_unknown', 'asian', 'ai_an', 'race_other',
  'is_latino', 'ethnicity_unknown', 'scaled_age'
)
self_vars <- grep('self_', names(data), value = TRUE)

# candidate predictors, followed by a listing of the columns left out of them
x_tmp <- c(dgns, demo, meds, msrs)
vars[!vars %in% x_tmp]

# remove diabetes labs & diagnosis from prediction
# NOTE(review): this only removes a column named exactly 'diabetes'; any dx_
# column that refers to diabetes stays in x - confirm that is intended.
x <- x_tmp[!x_tmp %in% 'diabetes']

## Set up 5-fold cross validation

In [None]:
# reproducible assignment of every row to one of 5 folds
set.seed(1234)

n <- nrow(data)
nfolds <- 5

# random permutation of the row numbers
shuffled <- sample(seq_len(n))
# nfolds contiguous, near-equal blocks labelled 1..nfolds
fold_ids <- cut(seq_len(n), breaks = nfolds, labels = FALSE)
# row j receives the block that its position in the permutation falls into
folds <- fold_ids[order(shuffled)]

In [None]:
# set every remaining missing value to zero before modelling, then report the
# dimensions
# NOTE(review): this covers all columns, including non-feature columns such as
# dates and time_to_dgn; confirm nothing downstream reads those zeros as real.
data[is.na(data)] <- 0
dim(data)

In [None]:
# mean of the visit flags by access indicators
# NOTE(review): the output columns are named *_miss but hold the mean of the
# visit flags, i.e. the share WITH a visit in the predictor / outcome window.
data %>% group_by(delayed_ind, afford_ind) %>% summarise(pred_miss=mean(visit_pred_flag), y_miss=mean(visit_y_flag))

## Predict output (without access)

In [None]:
##### PREDICT OUTPUT WITHOUT ACCESS
# 5-fold cross-validated lasso and ridge logistic regressions on the features
# in x. Builds one row per held-out person: fold, person_id, predicted risk
# and 0/1 label for each of the two models.
y <- 'y'

# one data frame of held-out predictions per fold, bound once after the loop
fold_preds <- vector("list", nfolds)

for (i in seq_len(nfolds)) {

  print(i)

  ### PREPROCESS DATA ###
  # logical split, so training rows stay correct even if a fold were empty
  in_test <- folds == i
  test_i <- data[in_test, ]
  train_i <- data[!in_test, ]

  test_i$fold <- i

  # design matrices for train and test, outcome for train
  train_y <- train_i[[y]]
  X <- data.matrix(train_i[, x])
  X_test <- data.matrix(test_i[, x])

  #### LASSO
  cv.lasso <- cv.glmnet(X, train_y, alpha = 1, family = "binomial")

  # find cut-off using train data (predict() returns an n x 1 matrix, so it is
  # passed on as a plain vector)
  tr_preds <- predict(cv.lasso, newx = X, s = "lambda.min", type = "response")
  cutoff <- get_cutoff(train_y, as.vector(tr_preds))

  # get predicted risk for test data and create label
  preds <- predict(cv.lasso, newx = X_test, s = "lambda.min", type = "response")
  test_i$lasso_preds <- as.vector(preds[, 1])
  test_i$lasso_label <- ifelse(test_i$lasso_preds >= cutoff, 1, 0)

  #### RIDGE
  cv.ridge <- cv.glmnet(X, train_y, alpha = 0, family = "binomial")

  # find cut-off using train data
  tr_preds <- predict(cv.ridge, newx = X, s = "lambda.min", type = "response")
  cutoff_ridge <- get_cutoff(train_y, as.vector(tr_preds))

  # get predictions
  preds_ridge <- predict(cv.ridge, newx = X_test, s = "lambda.min", type = "response")
  test_i$ridge_preds <- as.vector(preds_ridge[, 1])
  test_i$ridge_label <- ifelse(test_i$ridge_preds >= cutoff_ridge, 1, 0)

  # keep the held-out predictions of this fold
  keepvars <- c('fold', 'person_id', 'lasso_preds', 'ridge_preds', 'lasso_label', 'ridge_label')
  fold_preds[[i]] <- test_i[, keepvars]

}

# save output
tmp_data <- dplyr::bind_rows(fold_preds)
write_csv(tmp_data, 'diabetes_preds_logistic_updt.csv', "/diabetes/")

In [None]:
##### PREDICT OUTPUT WITHOUT ACCESS (random forest)
# 5-fold cross-validated ranger forests on the features in x. Builds one row
# per held-out person: fold, person_id and the share of trees voting for the
# positive class among the first 5, 50, 500 and 1000 trees.
# The outcome is stored as a factor (y_char) for this model.
data$y_char<-factor(data$y, levels=c(0,1))
y<-'y_char'

# save list of results
out_list <- vector("list", nfolds)

# compute mean over k trees
# tree_mat: matrix with one row per person and one column per tree;
# returns the row means over the first k columns (k is capped at the number of
# columns, and a matrix without columns gives NA for every row)
mean_first_k <- function(tree_mat, k) {
  k <- min(k, ncol(tree_mat))
  if (k == 0) return(rep(NA_real_, nrow(tree_mat)))
  matrixStats::rowMeans2(tree_mat[, 1:k, drop = FALSE])
}

for (i in 1:nfolds){
    
  print(i) 

  ### PREPROCESS DATA ###
  index<-which(folds==i)
  test_i<-data[index,]
  train_i<-data[-index,]
    
  test_i$fold<-i
  
  # define design matrix X and outcome y for train and test datasets
  # (X and X_test are built but ranger is given train_x / test_x directly)
  train_x<-train_i[,x]
  train_y<-train_i[[y]]
  test_x<-test_i[,x]
  test_y<-test_i[[y]]
  X<-data.matrix(train_x)
  X_test<-data.matrix(test_x)
    
  ##### RANGER 
  # one forest of 1000 trees; predict.all = TRUE asks for the prediction of
  # every individual tree instead of the aggregated forest prediction
  rf_1000<-ranger(x = train_x, y=train_y, num.trees = 1000, write.forest = TRUE, num.threads=2,min.node.size = 10, save.memory=TRUE)   
  preds <- predict(rf_1000, data = test_x, predict.all = TRUE, type='response')

  # return predictions for all trees
  # NOTE(review): tree_bin marks a tree as "positive" when its prediction equals
  # the label "1". If ranger returns per-tree predictions as level codes (1, 2)
  # rather than labels ("0", "1"), code 1 would be the NEGATIVE class and every
  # rf_preds_* column would be inverted - confirm against the ranger docs.
  tree_preds<-preds$predictions 
  if (is.factor(tree_preds)) tree_preds <- as.character(tree_preds)
  if (!is.matrix(tree_preds)) tree_preds <- as.matrix(tree_preds)
  pos_label <- levels(data$y_char)[2]  # "1"
  tree_bin <- (tree_preds == pos_label) * 1.0
    
  # find predicted probability across all trees K
  # (share of positive votes among the first K trees of the same forest)
  p5 <- mean_first_k(tree_bin, 5)
  p50 <- mean_first_k(tree_bin, 50)
  p500 <- mean_first_k(tree_bin, 500)
  p1000 <- mean_first_k(tree_bin, 1000)
    
  out_list[[i]] <- data.frame(
    fold = i,
    person_id = test_i$person_id,
    rf_preds_1000 = p1000,
    rf_preds_500  = p500,
    rf_preds_50   = p50,
    rf_preds_5    = p5,
    row.names = NULL
  )

  # free memory between folds
  rm(rf_1000, tree_preds, tree_bin, p5, p50, p500, p1000)
  gc()
}

# save output 
tmp_data <- do.call(rbind, out_list)
write_csv(tmp_data, 'diabetes_preds_rf_updt.csv', "/diabetes/")

### Find best model for diabetes 

LASSO

In [None]:
# reload the cross-validated predictions: random forest, then lasso / ridge
diab_rf<-load_data('diabetes_preds_rf_updt.csv', "/diabetes/")
diab_lr<-load_data('diabetes_preds_logistic_updt.csv', "/diabetes/")

In [None]:
# attach the observed outcome to the held-out predictions of every model
# (both prediction files carry a fold column, which merge() suffixes)
preds <- merge(
  merge(data[, c('person_id', 'y')], diab_lr, by = 'person_id'),
  diab_rf,
  by = 'person_id'
)
dim(preds)

In [None]:
# Print the AUC and its confidence interval for one set of predicted risks.
#
# Args:
#   label: observed outcome for each row.
#   preds: predicted risk for each row (vector or n x 1 matrix).
#
# Returns: the confidence interval object, invisibly (the value of the final
#   print() call).
get_auc <- function(label, preds) {
  # direction = "<" fixes "higher score = case". With the automatic direction a
  # model that ranks people the wrong way round would still show AUC > 0.5,
  # which hides errors and biases model comparisons upward.
  roc <- pROC::roc(label, as.numeric(preds), direction = "<", quiet = TRUE)
  auc <- pROC::auc(roc)
  auc_ci <- pROC::ci(auc)
  print(auc)
  print(auc_ci)
}

In [None]:
# AUC (with confidence interval) of each model's held-out predictions:
# random forest using the first 1000 / 500 / 50 / 5 trees, then lasso and ridge
# lasso does the best 
get_auc(preds$y, preds$rf_preds_1000)
get_auc(preds$y, preds$rf_preds_500)
get_auc(preds$y, preds$rf_preds_50)
get_auc(preds$y, preds$rf_preds_5)
get_auc(preds$y, preds$lasso_preds)
get_auc(preds$y, preds$ridge_preds)

## Predict output (with access)

In [None]:
##### PREDICT OUTPUT WITH ACCESS LABEL
# 5-fold cross-validated lasso on the features in x plus the two access
# indicators (delayed_ind, afford_ind). Builds one row per held-out person:
# fold, person_id, predicted risk and 0/1 label.
y <- 'y'
x_access <- c(x, 'delayed_ind', 'afford_ind')

# one data frame of held-out predictions per fold, bound once after the loop
fold_preds <- vector("list", nfolds)

for (i in seq_len(nfolds)) {

  ### PREPROCESS DATA ###
  # logical split, so training rows stay correct even if a fold were empty
  in_test <- folds == i
  test_i <- data[in_test, ]
  train_i <- data[!in_test, ]

  test_i$fold <- i

  # design matrices for train and test, outcome for train
  train_y <- train_i[[y]]
  X <- data.matrix(train_i[, x_access])
  X_test <- data.matrix(test_i[, x_access])

  #### LASSO
  cv.lasso <- cv.glmnet(X, train_y, alpha = 1, family = "binomial")

  # find cut-off using train data (predict() returns an n x 1 matrix, so it is
  # passed on as a plain vector)
  tr_preds <- predict(cv.lasso, newx = X, s = "lambda.min", type = "response")
  cutoff <- get_cutoff(train_y, as.vector(tr_preds))

  # get predicted risk for test data and create label
  preds <- predict(cv.lasso, newx = X_test, s = "lambda.min", type = "response")
  test_i$lasso_preds_access <- as.vector(preds[, 1])
  test_i$lasso_label_access <- ifelse(test_i$lasso_preds_access >= cutoff, 1, 0)

  # keep the held-out predictions of this fold
  keepvars <- c('fold', 'person_id', 'lasso_preds_access', 'lasso_label_access')
  fold_preds[[i]] <- test_i[, keepvars]

}

# save output
tmp_data <- dplyr::bind_rows(fold_preds)
write_csv(tmp_data, 'diabetes_preds_access.csv', "/diabetes/")

## Predict output (with access related features)
- add in visit history, sociodemographic information, etc. 
- scale continuous features (`as.numeric(scale(var))`)

In [None]:
# variables related to access
ins_vars <- c('ins_uninsured', 'ins_medicare', 'ins_medicaid', 'ins_employ', 'ins_unknown', 'ins_other')
inc_vars <- c('inc_10', 'inc_10_49', 'inc_50_99', 'inc_100_199', 'inc_200')
zip_vars <- c('zip_poverty', 'zip_vac_housing', 'zip_dep_index', 'zip_hs')
visits_vars <- c('visit_pred_flag', 'visit_y_flag')

# standardize each zip-level measure into a new <name>_scaled column
for (zip_var in zip_vars) {
  data[[paste0(zip_var, '_scaled')]] <- as.numeric(scale(data[[zip_var]]))
}

# model inputs: the scaled zip measures and only the pre-index visit flag
# (visit_y_flag describes the outcome window and is left out)
zip_scale_vars <- paste0(zip_vars, '_scaled')
ses_vars <- c(ins_vars, inc_vars, zip_scale_vars, 'visit_pred_flag')

In [None]:
##### PREDICT OUTPUT WITH ACCESS-RELATED (SES) FEATURES
# 5-fold cross-validated lasso on the features in x plus ses_vars (insurance,
# income, scaled zip-level measures, pre-index visit flag). Builds one row per
# held-out person: fold, person_id, predicted risk and 0/1 label.
y <- 'y'
x_ses <- c(x, ses_vars)

# one data frame of held-out predictions per fold, bound once after the loop
fold_preds <- vector("list", nfolds)

for (i in seq_len(nfolds)) {

  ### PREPROCESS DATA ###
  # logical split, so training rows stay correct even if a fold were empty
  in_test <- folds == i
  test_i <- data[in_test, ]
  train_i <- data[!in_test, ]

  test_i$fold <- i

  # design matrices for train and test, outcome for train
  train_y <- train_i[[y]]
  X <- data.matrix(train_i[, x_ses])
  X_test <- data.matrix(test_i[, x_ses])

  #### LASSO
  cv.lasso <- cv.glmnet(X, train_y, alpha = 1, family = "binomial")

  # find cut-off using train data (predict() returns an n x 1 matrix, so it is
  # passed on as a plain vector)
  tr_preds <- predict(cv.lasso, newx = X, s = "lambda.min", type = "response")
  cutoff <- get_cutoff(train_y, as.vector(tr_preds))

  # get predicted risk for test data and create label
  preds <- predict(cv.lasso, newx = X_test, s = "lambda.min", type = "response")
  test_i$lasso_preds_ses <- as.vector(preds[, 1])
  test_i$lasso_label_ses <- ifelse(test_i$lasso_preds_ses >= cutoff, 1, 0)

  # keep the held-out predictions of this fold
  keepvars <- c('fold', 'person_id', 'lasso_preds_ses', 'lasso_label_ses')
  fold_preds[[i]] <- test_i[, keepvars]

}

# save output
tmp_data <- dplyr::bind_rows(fold_preds)
write_csv(tmp_data, 'diabetes_preds_ses.csv', "/diabetes/")

## Add in self reported Xs

Add in the self-reported Xs. Open question: what to do about the survey date? A lot of people answered the survey after the cut-off period.

In [None]:
# self-reported columns that must not be used as predictors: the diabetes
# answers, aortic aneurysm and the survey date
remove_vars <- c('self_type_1_diabetes', 'self_type_2_diabetes', 'self_aortic_aneurysm', 'self_survey_date')
self_vars_updt <- self_vars[is.na(match(self_vars, remove_vars))]

# how many self-reported predictors remain, and which ones
length(self_vars_updt)
self_vars_updt

In [None]:
##### PREDICT OUTPUT WITH SELF-REPORTED X
# 5-fold cross-validated lasso on the features in x plus the self-reported
# variables in self_vars_updt. Builds one row per held-out person: fold,
# person_id, predicted risk and 0/1 label.
y <- 'y'
x_self <- c(x, self_vars_updt)

# one data frame of held-out predictions per fold, bound once after the loop
fold_preds <- vector("list", nfolds)

for (i in seq_len(nfolds)) {

  ### PREPROCESS DATA ###
  # logical split, so training rows stay correct even if a fold were empty
  in_test <- folds == i
  test_i <- data[in_test, ]
  train_i <- data[!in_test, ]

  test_i$fold <- i

  # design matrices for train and test, outcome for train
  train_y <- train_i[[y]]
  X <- data.matrix(train_i[, x_self])
  X_test <- data.matrix(test_i[, x_self])

  #### LASSO
  cv.lasso <- cv.glmnet(X, train_y, alpha = 1, family = "binomial")

  # find cut-off using train data (predict() returns an n x 1 matrix, so it is
  # passed on as a plain vector)
  tr_preds <- predict(cv.lasso, newx = X, s = "lambda.min", type = "response")
  cutoff <- get_cutoff(train_y, as.vector(tr_preds))

  # get predicted risk for test data and create label
  preds <- predict(cv.lasso, newx = X_test, s = "lambda.min", type = "response")
  test_i$lasso_preds_self <- as.vector(preds[, 1])
  test_i$lasso_label_self <- ifelse(test_i$lasso_preds_self >= cutoff, 1, 0)

  # keep the held-out predictions of this fold
  keepvars <- c('fold', 'person_id', 'lasso_preds_self', 'lasso_label_self')
  fold_preds[[i]] <- test_i[, keepvars]

}

# save output
tmp_data <- dplyr::bind_rows(fold_preds)
write_csv(tmp_data, 'diabetes_preds_self.csv', "/diabetes/")
dim(tmp_data)

# Evaluate


In [None]:
# Load the saved prediction files from the bucket (needed when the modelling
# cells above were not run in this session); they are merged in the next cell.
preds_all <- load_data(file = 'diabetes_preds_logistic_updt.csv', folder = "/diabetes/")
preds_access <- load_data(file = 'diabetes_preds_access.csv', folder = "/diabetes/")
preds_self <- load_data(file = 'diabetes_preds_self.csv', folder = "/diabetes/")
preds_ses <- load_data(file = 'diabetes_preds_ses.csv', folder = "/diabetes/")

In [None]:
# Combine all prediction sets into one table keyed on person_id.
# NOTE(review): the prediction files written in this notebook also carry a
# `fold` column, so merge() will add .x/.y suffixes to it - confirm that no
# later step relies on `fold`.
preds_tmp <- merge(x = preds_all, y = preds_access, by = 'person_id')
preds_all <- merge(x = preds_tmp, y = preds_self, by = 'person_id')
preds_all <- merge(x = preds_all, y = preds_ses, by = 'person_id')

In [None]:
# merge data in so we have access to relevant labels
data<-load_data('prediction_data_updt.csv',"/diabetes/")
test<-merge(data, preds_all, by='person_id')

In [None]:
# subset to groups of interest 
access<-test[test$noaccess == 0,]
noaccess<-test[test$afford_ind == 1 & test$delayed_ind == 1,]
cant<-test[test$afford_ind == 1,]
delay<-test[test$delayed_ind == 1,]
cant_extra<-test[test$afford>2,]
delay_extra<-test[test$delayed>2,]
black<-test[test$black == 1,]
white<-test[test$white == 1,]
hispanic<-test[test$is_latino == 1,]
black_low_access<-test[test$black == 1 & test$afford_ind == 1 & test$delayed_ind == 1,]
hispanic_low_access<-test[test$is_latino == 1 & test$afford_ind == 1 & test$delayed_ind == 1,]
white_low_access<-test[test$white == 1 & test$afford_ind == 1 & test$delayed_ind == 1,]
black_hi_access<-test[test$black == 1 & test$noaccess == 0,]
hispanic_hi_access<-test[test$is_latino == 1 & test$noaccess == 0,]
white_hi_access<-test[test$white == 1 & test$noaccess == 0,]

In [None]:
# Compute performance measures for the full sample and every subgroup.
#   test_preds : name of the column holding predicted risks
#   pred_label : name of the column holding predicted 0/1 labels
# Reads the subgroup data frames (test, access, noaccess, ...) from the
# calling environment and returns one row per group as a data frame.
summarize_results <- function(test_preds, pred_label) {

    # each entry pairs a subgroup data frame with the label used in the output;
    # the entry names become the row names of the result
    groups <- list(
        ret0  = list(df = test,                desc = 'all'),
        ret1  = list(df = access,              desc = 'high access'),
        ret2  = list(df = noaccess,            desc = 'low access'),
        ret3  = list(df = cant,                desc = 'cant'),
        ret4  = list(df = delay,               desc = 'delay'),
        ret5  = list(df = cant_extra,          desc = 'cant 3+'),
        ret6  = list(df = delay_extra,         desc = 'delay 3+'),
        ret7  = list(df = black,               desc = 'black'),
        ret8  = list(df = white,               desc = 'white'),
        ret9  = list(df = hispanic,            desc = 'hispanic'),
        ret10 = list(df = black_low_access,    desc = 'black - no access'),
        ret11 = list(df = hispanic_low_access, desc = 'hispanic - no access'),
        ret12 = list(df = white_low_access,    desc = 'white - no access'),
        ret13 = list(df = black_hi_access,     desc = 'black - access'),
        ret14 = list(df = hispanic_hi_access,  desc = 'hispanic - access'),
        ret15 = list(df = white_hi_access,     desc = 'white - access')
    )

    # same get_msrs() call for every group, evaluated in the listed order
    per_group <- lapply(groups, function(g) {
        get_msrs(g$df$y, g$df[[test_preds]], g$df[[pred_label]], g$desc)
    })

    as.data.frame(do.call(rbind, per_group))
}

In [None]:
# create a function for prepping the output 
msr_list<-c("sensitivity","specificity", "accuracy", "balanced_accuracy","precision", "f1_score", "auc")
save_metrics<-function(df, alg) {
    
    # add in balancedaccuracy
    df$balanced_accuracy<-(as.numeric(df$sensitivity) + as.numeric(df$specificity))/2

    # convert auc to numeric (type AUC in the data)
    df$auc<-as.numeric(df$auc)
    
    # convert long to wide
    for (msr in msr_list){
        df[,msr]<-as.numeric(df[,msr])
        names(df)[names(df) == msr] <- paste0(msr, '_val')
    }

    perf_long<- df %>% pivot_longer(cols = ends_with("_val"), names_to = "variables", values_to = "val")
    perf_long$n<-as.numeric(perf_long$n)
    perf_long$se<-sqrt(perf_long$val*(1-perf_long$val)*1/perf_long$n)
    perf_long$lb_se<-perf_long$val-1.96*perf_long$se
    perf_long$ub_se<-perf_long$val+1.96*perf_long$se

    # save
    fn<-paste0('diabetes_',alg,'_metrics.csv')
    write_csv(perf_long, fn, "/output/") 
}

In [None]:
# results for the base LASSO model (pROC messages silenced)
res_lasso <- suppressMessages(summarize_results("lasso_preds", "lasso_label"))
save_metrics(res_lasso, 'lasso')

In [None]:
# results for the model that includes the access label (pROC messages silenced)
res_lasso_access <- suppressMessages(
  summarize_results("lasso_preds_access", "lasso_label_access")
)
save_metrics(res_lasso_access, 'lasso_inc_access')

In [None]:
# results for the model that includes the self-reported predictors
# (pROC messages silenced)
res_lasso_self <- suppressMessages(
  summarize_results("lasso_preds_self", "lasso_label_self")
)
save_metrics(res_lasso_self, 'lasso_inc_self_x')

In [None]:
# results with SES variables
# Stored in its own object so the self-reported-X results held in
# res_lasso_self are not overwritten.
suppressMessages(
res_lasso_ses<-summarize_results("lasso_preds_ses","lasso_label_ses")
)
save_metrics(res_lasso_ses, 'lasso_inc_ses')

## Curves

In [None]:
# Label each person by access group for plotting
test[['group']] <- ifelse(test[['noaccess']] == 1, 'Low Access', 'High Access')

# Density of the base LASSO predicted risk by access group; the x-axis is
# restricted to [0, 0.10]
ggplot(data = test, mapping = aes(x = lasso_preds, color = group)) +
  geom_density(size = 1) +
  scale_color_manual(values = c('blue', 'red')) +
  labs(
    title = "Density Plot by Group",
    x = "Value",
    y = "Density"
  ) +
  theme_minimal() +
  xlim(0, .10)

## look at AUC curves

### Compare AUC by subgroup 

In [None]:
# ROC curves of the base LASSO predictions within each access subgroup.
# The statements are wrapped in { } so they form one expression; several
# newline-separated statements directly inside a call's parentheses do not
# parse.
suppressMessages({
roc_standard<-pROC::roc(access$y, access[["lasso_preds"]])
roc_cant<-pROC::roc(cant$y, cant[['lasso_preds']])
roc_delay<-pROC::roc(delay$y, delay[['lasso_preds']])
roc_cant_sev<-pROC::roc(cant_extra$y, cant_extra[['lasso_preds']])
roc_delay_sev<-pROC::roc(delay_extra$y, delay_extra[['lasso_preds']])
})

In [None]:
# Compare the AUC of the high-access group (roc_standard) with each
# access-barrier subgroup. The tests are run as unpaired (paired = FALSE)
# because each curve is built from a different subset of `test`.
roc.test(roc_standard, roc_cant, paired = FALSE)
roc.test(roc_standard, roc_delay, paired = FALSE)
roc.test(roc_standard, roc_delay_sev, paired = FALSE)
roc.test(roc_standard, roc_cant_sev, paired = FALSE)

### Compare AUC by method

In [None]:
# Compare the AUC of the base LASSO model with the three extended models on
# one dataset.
#   dsn : data frame with the outcome `y` and the four prediction columns
# Returns a one-row data frame with the four AUCs (auc_1..auc_4) and the
# DeLong p-values of model 1 versus models 2, 3 and 4.
run_aoc_comparison <- function(dsn) {

    # order matters: the first entry is the reference model
    pred_cols <- c('lasso_preds', 'lasso_preds_access',
                   'lasso_preds_self', 'lasso_preds_ses')

    outcome <- dsn[['y']]
    rocs <- lapply(pred_cols, function(col) pROC::roc(outcome, dsn[[col]]))

    # test the reference curve against each of the other three
    tests <- lapply(rocs[-1], function(other) {
        roc.test(rocs[[1]], other, method = "delong")
    })

    # Create summary dataframe
    data.frame(
      auc_1 = as.numeric(pROC::auc(rocs[[1]])),
      auc_2 = as.numeric(pROC::auc(rocs[[2]])),
      auc_3 = as.numeric(pROC::auc(rocs[[3]])),
      auc_4 = as.numeric(pROC::auc(rocs[[4]])),
      pval_1_2 = tests[[1]]$p.value,
      pval_1_3 = tests[[2]]$p.value,
      pval_1_4 = tests[[3]]$p.value
    )
}

In [None]:
# Model comparison on the full sample and three access subgroups
all <- run_aoc_comparison(dsn = test)
st <- run_aoc_comparison(dsn = access)
afford <- run_aoc_comparison(dsn = cant)
del <- run_aoc_comparison(dsn = delay)

# One row per dataset, in the order: all, high access, cant afford, delayed
res <- rbind(all, st, afford, del)
res

In [None]:
# adjust for multiple comparisons (Benjamini-Hochberg across all p-values)
pval_cols <- c("pval_1_2", "pval_1_3", "pval_1_4")

# flatten column by column, adjust jointly, then reshape column by column
pvals <- unlist(res[, pval_cols])
pvals_adj <- p.adjust(pvals, method = "BH")

res_adj <- res
res_adj[, pval_cols] <- matrix(pvals_adj, nrow = nrow(res_adj), ncol = 3)
res_adj

## How many additional people targeted?

In [None]:
# How many additional people are flagged when the self-reported predictors
# are added, by access group.
# Flag true and false positives for the base model and the self-reported model.
test <- test %>%
  mutate(
    tp = ifelse(y == 1 & lasso_label == 1, 1, 0),
    tp_self = ifelse(y == 1 & lasso_label_self == 1, 1, 0),
    fp = ifelse(y == 0 & lasso_label == 1, 1, 0),
    fp_self = ifelse(y == 0 & lasso_label_self == 1, 1, 0)
  )

# Counts per access group, then the change in true/false positives in
# absolute terms and as a share of actual positives / actual negatives.
summ <- test %>%
  group_by(noaccess) %>%
  summarise(
    n = n(),
    y = sum(y),
    p = sum(lasso_label),
    p_self = sum(lasso_label_self),
    tp = sum(tp),
    tp_self = sum(tp_self),
    fp = sum(fp),
    fp_self = sum(fp_self)
  ) %>%
  mutate(
    adl_new = tp_self - tp,
    adl_false = fp_self - fp,
    pct_tp = adl_new / y,
    pct_fp = adl_false / (n - y)
  )
summ