In [None]:
# Analysis Sample
# Author: Anna Zink
# Description: Create a sample of people with EHR data and merge in access survey data

In [None]:
# load packages
library(plyr)
library(tidyverse)
library(bigrquery)
library(stringr)  
library(lubridate)

In [None]:
BILLING_PROJECT_ID <- Sys.getenv('GOOGLE_PROJECT')
CDR <- Sys.getenv('WORKSPACE_CDR')
MY_BUCKET <- Sys.getenv('WORKSPACE_BUCKET')

In [None]:
# Copy a CSV from the workspace bucket into the working directory and read it.
#
# Args:
#   file:   Name of the CSV file inside `folder` (e.g. 'access_byperson.csv').
#   folder: Bucket folder, with leading and trailing slash (e.g. "/survey/").
#
# Returns: the data frame produced by read_csv().
#
# Stops with an error when gsutil reports a non-zero exit status, so that a
# stale local copy of `file` left over from an earlier run is never read by
# mistake.
load_data <- function(file, folder) {
    my_bucket <- Sys.getenv('WORKSPACE_BUCKET')
    remote_path <- paste0(my_bucket, folder, file)

    # shQuote() keeps paths with spaces or shell metacharacters intact.
    copy_log <- system(
        paste("gsutil cp", shQuote(remote_path), "."),
        intern = TRUE
    )

    # With intern = TRUE a non-zero exit code is reported through the
    # "status" attribute of the captured output (absent on success).
    copy_status <- attr(copy_log, "status")
    if (!is.null(copy_status) && copy_status != 0) {
        stop(
            "gsutil cp failed for ", remote_path,
            " (exit status ", copy_status, ")",
            call. = FALSE
        )
    }

    read_csv(file, show_col_types = FALSE)
}

# Write a data frame to a local CSV file and copy that file to the workspace
# bucket.
#
# NOTE: this definition masks readr::write_csv() for the rest of the session.
#
# Args:
#   df:     Data frame to save.
#   fn:     Destination file name; written to the working directory first.
#   folder: Bucket folder, with leading and trailing slash (e.g. "/ehr/").
#
# Returns: the output captured from gsutil (character vector).
#
# Stops with an error when gsutil reports a non-zero exit status, so a failed
# upload cannot go unnoticed.
write_csv <- function(df, fn, folder) {
    write_excel_csv(df, fn)

    my_bucket <- Sys.getenv('WORKSPACE_BUCKET')
    destination <- paste0(my_bucket, folder)

    # shQuote() keeps paths with spaces or shell metacharacters intact.
    copy_log <- system(
        paste("gsutil cp", shQuote(paste0("./", fn)), shQuote(destination)),
        intern = TRUE
    )

    # With intern = TRUE a non-zero exit code is reported through the
    # "status" attribute of the captured output (absent on success).
    copy_status <- attr(copy_log, "status")
    if (!is.null(copy_status) && copy_status != 0) {
        stop(
            "gsutil cp failed for ", fn, " -> ", destination,
            " (exit status ", copy_status, ")",
            call. = FALSE
        )
    }

    copy_log
}

## Pull in person ids for people with EHR and access survey

In [None]:
# Distinct person ids that are flagged as having EHR data and that have at
# least one event whose concept sits under criteria path 3000000694
# (presumably the access survey -- confirm against the CDR).
#
# page_size is an argument of bq_table_download(); it was previously passed
# to bq_project_query(), which forwards unknown arguments to the API call
# instead of using them to page the download.
sample_temp <- bq_table_download(
    bq_project_query(
        BILLING_PROJECT_ID,
        query = str_glue('
SELECT DISTINCT
   a.person_id
FROM
    `{CDR}.cb_search_all_events` a
    join `{CDR}.cb_search_person` b on a.PERSON_ID = b.PERSON_ID
WHERE b.has_ehr_data = 1
    AND  (concept_id IN (SELECT distinct concept_id FROM `{CDR}.cb_criteria`
     WHERE path LIKE "%3000000694%" and is_standard = 0 AND is_selectable = 1))
')
    ),
    page_size = 25000
)

In [None]:
# Per-person access data stored under /survey/ in the workspace bucket.
access <- load_data(file = "access_byperson.csv", folder = "/survey/")

In [None]:
# Keep the access rows whose person_id also appears in the queried id list
# (merge() keeps only matching rows unless all/all.x/all.y is set).
# Row counts seen previously:
#   v7 sample - 134513
#   v8 sample - 205186
sample <- merge(x = access, y = sample_temp, by = "person_id")
dim(sample)

In [None]:
# Load the self-reported health data; its survey date is attached to the
# analysis sample because other programs that use the sample refer to it.
self <- load_data(
    file = "self_reported_health_byperson.csv",
    folder = "/survey/"
)

In [None]:
# Give the date column a survey-specific name before it is merged into the
# sample. dplyr:: is spelled out because plyr, which also exports rename(),
# is attached in this notebook.
self <- dplyr::rename(self, self_survey_date = survey_date)

In [None]:
# A person can appear more than once; collapse to one row per person_id
# holding the earliest self_survey_date.
self <- dplyr::summarise(
    dplyr::group_by(self, person_id),
    self_survey_date = min(self_survey_date)
)
head(self)

In [None]:
# Left-join the self-report date onto the sample: all.x = TRUE keeps every
# sample row, leaving the date NA where no matching person_id exists.
sample <- merge(
    x = sample,
    y = self[, c("person_id", "self_survey_date")],
    by = "person_id",
    all.x = TRUE
)

In [None]:
# Save the finished analysis sample under /analysis/ in the workspace bucket
# (uses the write_csv() helper defined in this notebook, not readr's).
write_csv(df = sample, fn = "analysis_sample.csv", folder = "/analysis/")