In [None]:
## Title: Summarize Survey Data
## Author: Anna Zink
## Date: October 30, 2023
## Updated: February 4, 2025; re-ran for v8, removed overall health survey
## Description: Summarize the survey data from the basics and family health history surveys (the overall health survey is no longer included). 
## Notes: only queries this data for people who answered the access survey 

# set up 

In [None]:
library(plyr)
library(tidyverse)
library(bigrquery)
library(janitor)
library(data.table)

In [None]:
# Workspace settings, read once from the environment variables of the session.
BILLING_PROJECT_ID <- Sys.getenv("GOOGLE_PROJECT")
CDR <- Sys.getenv("WORKSPACE_CDR")
MY_BUCKET <- Sys.getenv("WORKSPACE_BUCKET")

In [None]:
# Copy one CSV from the workspace bucket into the working directory and read it.
#
# file   : file name inside the bucket folder, e.g. "inc_byperson.csv"
# folder : bucket folder with leading and trailing slash, e.g. "/data/"
#
# Returns the parsed file as produced by read_csv(). Stops when the copy
# fails, so that an older local file with the same name is never read by
# mistake.
load_data <- function(file, folder) {
  my_bucket <- Sys.getenv("WORKSPACE_BUCKET")
  source_uri <- paste0(my_bucket, folder, file)
  # shQuote() protects the shell command from spaces or special characters
  copy_log <- system(paste("gsutil cp", shQuote(source_uri), "."), intern = TRUE)
  # with intern = TRUE a non-zero exit code is reported in the "status" attribute
  copy_status <- attr(copy_log, "status")
  if (!is.null(copy_status) && copy_status != 0) {
    stop("gsutil cp failed for ", source_uri, " (exit status ", copy_status, ")",
         call. = FALSE)
  }
  read_csv(file, show_col_types = FALSE)
}

# Write a data frame to a local CSV file and copy that file to the workspace
# bucket.
#
# NOTE: this definition masks readr::write_csv() for the rest of the notebook;
#       the local file itself is written with write_excel_csv().
#
# df     : data frame to save
# fn     : name of the CSV file (used locally and in the bucket)
# folder : bucket folder with leading and trailing slash, e.g. "/ehr/"
#
# Returns (invisibly) the output captured from gsutil. Stops when the upload
# fails, so later cells do not silently re-read an outdated bucket copy.
write_csv <- function(df, fn, folder) {
  write_excel_csv(df, fn)
  my_bucket <- Sys.getenv("WORKSPACE_BUCKET")
  destination_uri <- paste0(my_bucket, folder)
  # shQuote() protects the shell command from spaces or special characters
  copy_log <- system(
    paste("gsutil cp", shQuote(paste0("./", fn)), shQuote(destination_uri)),
    intern = TRUE
  )
  # with intern = TRUE a non-zero exit code is reported in the "status" attribute
  copy_status <- attr(copy_log, "status")
  if (!is.null(copy_status) && copy_status != 0) {
    stop("gsutil cp failed for ", fn, " -> ", destination_uri,
         " (exit status ", copy_status, ")", call. = FALSE)
  }
  invisible(copy_log)
}

## Read in survey data

In [None]:
# Query for dataset "BASICS & HEALTH HISTORY" (domain "survey"), generated for
# All of Us Controlled Tier Dataset v8.
# The sub-query keeps PPI survey questions whose cb_criteria path contains the
# id of a SURVEY entry with concept_id 1586134 or 1740639.
survey_sql <- paste0("
    SELECT
        answer.person_id,
        answer.survey_datetime,
        answer.survey,
        answer.question_concept_id,
        answer.question,
        answer.answer_concept_id,
        answer.answer,
        answer.survey_version_concept_id,
        answer.survey_version_name  
    FROM
        `ds_survey` answer   
    WHERE
        (
            question_concept_id IN (SELECT
                DISTINCT concept_id                         
            FROM
                `cb_criteria` c                         
            JOIN
                (SELECT
                    CAST(cr.id as string) AS id                               
                FROM
                    `cb_criteria` cr                               
                WHERE
                    concept_id IN (1586134, 1740639)                               
                    AND domain_id = 'SURVEY') a 
                    ON (c.path like CONCAT('%', a.id, '.%'))                         
            WHERE
                domain_id = 'SURVEY'                         
                AND type = 'PPI'                         
                AND subtype = 'QUESTION')
        )")

# Cloud Storage location of the CSV files exported from BigQuery:
#   <WORKSPACE_BUCKET>/bq_exports/<OWNER_EMAIL>/survey_/survey_*.csv
# NOTE: The dated sub-folder is commented out below, so the path does not
#       depend on the export date. Re-enable that line to keep one folder per
#       export day.
survey_path <- file.path(
  Sys.getenv("WORKSPACE_BUCKET"),
  "bq_exports",
  Sys.getenv("OWNER_EMAIL"),
  # strftime(lubridate::now(), "%Y%m%d"),
  "survey_",
  "survey_*.csv"
)
message(
  str_glue("The data will be written to {survey_path}. Use this path when reading the data into your notebooks in the future.")
)

# Perform the query and export the dataset to Cloud Storage as CSV files.
# NOTE: You only need to run `bq_table_save` once. After that, you can
#       just read data from the CSVs in Cloud Storage.
#bq_table_save(
#  bq_dataset_query(Sys.getenv("WORKSPACE_CDR"), survey_sql, billing = Sys.getenv("GOOGLE_PROJECT")),
#  survey_path,
#  destination_format = "CSV")


# Read the data directly from Cloud Storage into memory.
# NOTE: Alternatively you can `gsutil -m cp {survey_path}` to copy these files
#       to the Jupyter disk.
#
# export_path : Cloud Storage path (may contain a wildcard) handed to
#               `gsutil ls`; every listed file is read as CSV
#
# Returns the rows of all listed files bound into one data frame. Stops when
# the listing fails or lists no files.
read_bq_export_from_workspace_bucket <- function(export_path) {
  col_types <- cols(
    survey = col_character(),
    question = col_character(),
    answer = col_character(),
    survey_version_name = col_character()
  )
  listing <- system2("gsutil", args = c("ls", export_path), stdout = TRUE, stderr = TRUE)
  # with stdout = TRUE a non-zero exit code is reported in the "status" attribute
  status <- attr(listing, "status")
  if (!is.null(status) && status != 0) {
    stop("gsutil ls failed for ", export_path, ":\n",
         paste(listing, collapse = "\n"), call. = FALSE)
  }
  # stderr is captured together with stdout, so keep only the object URLs
  csv_files <- grep("^gs://", listing, value = TRUE)
  if (length(csv_files) == 0) {
    stop("No exported files found at ", export_path, call. = FALSE)
  }
  bind_rows(
    map(csv_files, function(csv) {
      message(str_glue('Loading {csv}.'))
      read_csv(pipe(str_glue('gsutil cat {csv}')), col_types = col_types, show_col_types = FALSE)
    })
  )
}
surveydata <- read_bq_export_from_workspace_bucket(survey_path)

In [None]:
## Count the answer rows for every survey question
questions <- ddply(
  surveydata,
  c("survey", "question_concept_id", "question"),
  summarize,
  n = length(survey_version_concept_id)
)

In [None]:
# sort the questions from fewest to most answer rows
questions <- questions[order(questions[["n"]]), ]

In [None]:
# number of question entries per survey
table(questions[["survey"]])

In [None]:
# List the CSV files currently stored in the bucket's /survey/ folder.
# `my_bucket` stays defined for the cells further down that read from the bucket.
my_bucket <- Sys.getenv("WORKSPACE_BUCKET")
system(paste0("gsutil ls ", my_bucket, "/survey/*.csv"), intern = TRUE)

# Basics survey

In [None]:
basics <- questions[questions[["survey"]] == "The Basics", ]
dim(basics)

In [None]:
basics

In [None]:
# Flag the Basics questions to keep: question text containing Insurance,
# Income, Education or Employment (matched anywhere in the text, case-sensitive)
basics[["insurance_flag"]] <- ifelse(grepl("Insurance", basics[["question"]]), 1, 0)
basics[["income_flag"]] <- ifelse(grepl("Income", basics[["question"]]), 1, 0)
basics[["education_flag"]] <- ifelse(grepl("Education", basics[["question"]]), 1, 0)
basics[["employment_flag"]] <- ifelse(grepl("Employment", basics[["question"]]), 1, 0)

In [None]:
# number of topics each question matched
basics[["flag_sum"]] <- with(
  basics,
  insurance_flag + income_flag + education_flag + employment_flag
)

In [None]:
# keep the questions that matched at least one topic
basics_sub <- basics[basics[["flag_sum"]] > 0, ]
dim(basics_sub)

In [None]:
# question ids (and their answer counts) of the flagged Basics questions
basics_qs <- basics_sub[, c("n", "question_concept_id")]

In [None]:
# SUBSET TO QUESTIONS OF INTEREST
basic_df <- merge(x = surveydata, y = basics_qs, by = "question_concept_id")

In [None]:
dim(basic_df)

In [None]:
# drop everything except the columns used in the per-person tables
keepvars <- c("person_id", "survey_datetime", "question_concept_id", "question", "answer")
basic_df <- basic_df[, keepvars]
head(basic_df)

In [None]:
table(basic_df[["question"]])

In [None]:
# Print how often each answer (including NA) was given to question `q`.
# Reads the answers from the notebook-level `basic_df`.
print_ans <- function(q) {
  is_question <- basic_df$question == q
  print(table(basic_df[is_question, ]$answer, useNA = "always"))
}

In [None]:
# answer distribution of the education question
print_ans("Education Level: Highest Grade")

In [None]:
# Build a per-person table with the answers to one Basics question.
#
# q    : exact question text to keep (compared with the `question` column)
# var  : short name for the answer column; the survey timestamp is stored
#        next to it as `datetime_<var>`
# data : answer-level survey data with person_id, survey_datetime, question
#        and answer columns; defaults to the notebook-level `basic_df`
#
# Returns an ungrouped data frame with person_id, datetime_<var>, <var> and
# `counter`, the running number of the answer row within each person (in the
# row order of `data`).
subset_byperson <- function(q, var, data = basic_df) {
  datevar <- paste0("datetime_", var)
  answers <- data[data$question == q, ]
  answers[[var]] <- answers$answer
  answers[[datevar]] <- answers$survey_datetime
  # ungroup() so that callers do not inherit the person_id grouping
  answers <- answers %>%
    group_by(person_id) %>%
    mutate(counter = row_number(person_id)) %>%
    ungroup()
  answers[, c("person_id", datevar, var, "counter")]
}

# questions extracted below: insurance type, any insurance, income,
# education and employment
ins_type <- subset_byperson("Health Insurance: Insurance Type Update", "ins_type")
ins <- subset_byperson("Insurance: Health Insurance", "ins")
inc <- subset_byperson("Income: Annual Income", "inc")
edu <- subset_byperson("Education Level: Highest Grade", "edu")
work <- subset_byperson("Employment: Employment Status", "work")

#### Create insurance type data by person

In [None]:
# strip the answer prefix from the insurance type answers
ins_type[["ins_type_updt"]] <- gsub("Insurance Type Update: ", "", ins_type[["ins_type"]])

In [None]:
table(ins_type[["ins_type_updt"]])

In [None]:
head(ins_type)

In [None]:
# one row per person and survey datetime, one indicator column per insurance type
ins_type[["value"]] <- 1
ins_type_wide <- pivot_wider(
  select(ins_type, person_id, datetime_ins_type, ins_type_updt, value),
  names_from = "ins_type_updt",
  names_prefix = "ins_",
  values_from = "value"
)

In [None]:
# insurance types that were not reported come back as NA; recode them to 0
ins_type_wide <- replace(ins_type_wide, is.na(ins_type_wide), 0)

In [None]:
head(ins_type_wide)

In [None]:
write_csv(ins_type_wide, "ins_type_byperson.csv", "/data/")

#### Create insurance (any) by person

In [None]:
table(ins[["ins"]])

In [None]:
# strip the answer prefixes from the health insurance answers
ins[["ins_updt"]] <- gsub("PMI: ", "", gsub("Health Insurance: ", "", ins[["ins"]]))

In [None]:
table(ins[["ins_updt"]])

In [None]:
# pool the non-informative answers into one category
ins[["ins_updt"]] <- ifelse(
  ins[["ins_updt"]] %in% c("Prefer Not To Answer", "Dont Know", "Skip"),
  "skip_unknown",
  ins[["ins_updt"]]
)

In [None]:
table(ins[["ins_updt"]])

In [None]:
head(ins)

In [None]:
# one row per person and survey datetime, one indicator column per answer
ins[["value"]] <- 1
ins_wide <- pivot_wider(
  select(ins, person_id, datetime_ins, ins_updt, value),
  names_from = "ins_updt",
  names_prefix = "anyins_",
  values_from = "value"
)
ins_wide <- replace(ins_wide, is.na(ins_wide), 0)

In [None]:
head(ins_wide)

In [None]:
write_csv(ins_wide, "ins_any_byperson.csv", "/data/")

#### Create income dataset

In [None]:
head(inc)

In [None]:
table(inc[["inc"]])

In [None]:
# strip the answer prefixes from the income answers
inc[["inc_updt"]] <- gsub("PMI: ", "", gsub("Annual Income: ", "", inc[["inc"]]))
table(inc[["inc_updt"]])

In [None]:
# pool the non-informative answers into one category
inc[["inc_updt"]] <- ifelse(
  inc[["inc_updt"]] %in% c("Prefer Not To Answer", "Skip"),
  "skip_unknown",
  inc[["inc_updt"]]
)
table(inc[["inc_updt"]])

In [None]:
# one row per person and survey datetime, one indicator column per income answer
inc[["value"]] <- 1
inc_wide <- pivot_wider(
  select(inc, person_id, datetime_inc, inc_updt, value),
  names_from = "inc_updt",
  names_prefix = "inc_",
  values_from = "value"
)
inc_wide <- replace(inc_wide, is.na(inc_wide), 0)
head(inc_wide)

In [None]:
write_csv(inc_wide, "inc_byperson.csv", "/data/")

#### Create education dataset

In [None]:
table(edu[["edu"]])

In [None]:
# strip the answer prefixes and pool the non-informative answers
edu[["edu_updt"]] <- gsub("PMI: ", "", gsub("Highest Grade: ", "", edu[["edu"]]))
edu[["edu_updt"]] <- ifelse(
  edu[["edu_updt"]] %in% c("Prefer Not To Answer", "Skip"),
  "skip_unknown",
  edu[["edu_updt"]]
)
table(edu[["edu_updt"]])

In [None]:
# one row per person and survey datetime, one indicator column per education answer
edu[["value"]] <- 1
data_wide <- pivot_wider(
  select(edu, person_id, datetime_edu, edu_updt, value),
  names_from = "edu_updt",
  names_prefix = "edu_",
  values_from = "value"
)
data_wide <- replace(data_wide, is.na(data_wide), 0)
head(data_wide)

In [None]:
write_csv(data_wide, "edu_byperson.csv", "/data/")

#### Create work dataset

In [None]:
table(work[["work"]])

In [None]:
# strip the answer prefixes and pool the non-informative answers
work[["work_updt"]] <- gsub("PMI: ", "", gsub("Employment Status: ", "", work[["work"]]))
work[["work_updt"]] <- ifelse(
  work[["work_updt"]] %in% c("Prefer Not To Answer", "Skip"),
  "skip_unknown",
  work[["work_updt"]]
)
table(work[["work_updt"]])

In [None]:
# one row per person and survey datetime, one indicator column per employment answer
work[["value"]] <- 1
data_wide <- pivot_wider(
  select(work, person_id, datetime_work, work_updt, value),
  names_from = "work_updt",
  names_prefix = "emp_",
  values_from = "value"
)
data_wide <- replace(data_wide, is.na(data_wide), 0)
head(data_wide)

In [None]:
write_csv(data_wide, "employment_byperson.csv", "/data/")

## Basics survey per person 
Merge all per-person datasets into one basics survey file, including ins_type, ins_any, inc, edu, and work information. (Marital status is not extracted above, so it is not part of the merge.)

In [None]:
head(basic_df)

In [None]:
# master list of every person_id in basic_df, with the number of answer rows (n)
personids <- summarize(group_by(basic_df, person_id), n = n())

In [None]:
# Re-read the per-person files saved to the bucket's /data/ folder with the
# notebook's load_data() helper.
# edu is re-read as well: the `edu` object still in memory at this point is the
# answer-level table built by subset_byperson() (its wide version was only
# written to 'edu_byperson.csv'), whereas the other tables merged below are the
# wide indicator tables.
work <- load_data("employment_byperson.csv", "/data/")
inc <- load_data("inc_byperson.csv", "/data/")
ins_any <- load_data("ins_any_byperson.csv", "/data/")
ins_type <- load_data("ins_type_byperson.csv", "/data/")
edu <- load_data("edu_byperson.csv", "/data/")

In [None]:
# Left-join each table onto the master id list by person_id, in this order:
# edu, work, inc, ins_any, ins_type.
# NOTE(review): a person with several rows in more than one table gets one
#               output row per combination of those rows - confirm intended.
all <- Reduce(
  function(left, right) merge(left, right, by = "person_id", all.x = TRUE),
  list(edu, work, inc, ins_any, ins_type),
  personids
)

In [None]:
write_csv(all, "basics_survey_byperson.csv", "/data/")

# Family Health history 

- Subset to questions of interest
    - do you have a condition
    - are you being treated for it
    - are you seeing a doctor for it
    
For now, we only look at whether you indicate having a condition. The seeing-a-doctor and prescription information can be included in future iterations if desired.

In [None]:
# Query for dataset "BASICS & HEALTH HISTORY" (domain "survey"), generated for
# All of Us Controlled Tier Dataset v8.
# The sub-query keeps PPI survey questions whose cb_criteria path contains the
# id of a SURVEY entry with concept_id 1740639.
survey_sql <- paste0("
    SELECT
        answer.person_id,
        answer.survey_datetime,
        answer.survey,
        answer.question_concept_id,
        answer.question,
        answer.answer_concept_id,
        answer.answer,
        answer.survey_version_concept_id,
        answer.survey_version_name  
    FROM
        `ds_survey` answer   
    WHERE
        (
            question_concept_id IN (SELECT
                DISTINCT concept_id                         
            FROM
                `cb_criteria` c                         
            JOIN
                (SELECT
                    CAST(cr.id as string) AS id                               
                FROM
                    `cb_criteria` cr                               
                WHERE
                    concept_id IN (1740639)                               
                    AND domain_id = 'SURVEY') a 
                    ON (c.path like CONCAT('%', a.id, '.%'))                         
            WHERE
                domain_id = 'SURVEY'                         
                AND type = 'PPI'                         
                AND subtype = 'QUESTION')
        )")

# Cloud Storage location of the CSV files exported from BigQuery:
#   <WORKSPACE_BUCKET>/bq_exports/<OWNER_EMAIL>/survey_fmhx/survey_fmhx_*.csv
# NOTE: The dated sub-folder is commented out below, so the path does not
#       depend on the export date. Re-enable that line to keep one folder per
#       export day.
survey_path <- file.path(
  Sys.getenv("WORKSPACE_BUCKET"),
  "bq_exports",
  Sys.getenv("OWNER_EMAIL"),
  # strftime(lubridate::now(), "%Y%m%d"),
  "survey_fmhx",
  "survey_fmhx_*.csv"
)
message(
  str_glue("The data will be written to {survey_path}. Use this path when reading the data into your notebooks in the future.")
)

# Perform the query and export the dataset to Cloud Storage as CSV files.
# NOTE: You only need to run `bq_table_save` once. After that, you can
#       just read data from the CSVs in Cloud Storage.
#bq_table_save(
#  bq_dataset_query(Sys.getenv("WORKSPACE_CDR"), survey_sql, billing = Sys.getenv("GOOGLE_PROJECT")),
#  survey_path,
#  destination_format = "CSV")


# Read the data directly from Cloud Storage into memory.
# NOTE: Alternatively you can `gsutil -m cp {survey_path}` to copy these files
#       to the Jupyter disk.
#
# export_path : Cloud Storage path (may contain a wildcard) handed to
#               `gsutil ls`; every listed file is read as CSV
#
# Returns the rows of all listed files bound into one data frame. Stops when
# the listing fails or lists no files.
read_bq_export_from_workspace_bucket <- function(export_path) {
  col_types <- cols(
    survey = col_character(),
    question = col_character(),
    answer = col_character(),
    survey_version_name = col_character()
  )
  listing <- system2("gsutil", args = c("ls", export_path), stdout = TRUE, stderr = TRUE)
  # with stdout = TRUE a non-zero exit code is reported in the "status" attribute
  status <- attr(listing, "status")
  if (!is.null(status) && status != 0) {
    stop("gsutil ls failed for ", export_path, ":\n",
         paste(listing, collapse = "\n"), call. = FALSE)
  }
  # stderr is captured together with stdout, so keep only the object URLs
  csv_files <- grep("^gs://", listing, value = TRUE)
  if (length(csv_files) == 0) {
    stop("No exported files found at ", export_path, call. = FALSE)
  }
  bind_rows(
    map(csv_files, function(csv) {
      message(str_glue('Loading {csv}.'))
      read_csv(pipe(str_glue('gsutil cat {csv}')), col_types = col_types, show_col_types = FALSE)
    })
  )
}
fmhx <- read_bq_export_from_workspace_bucket(survey_path)

In [None]:
fmhx <- as.data.table(fmhx)

## Subset to relevant questions

In [None]:
# Disabled for now: flags the question types in `fmhx_qs` (doctor visits,
# prescriptions, age at onset, family diagnosis, "Have you ...") and splits
# the questions by flag.
if (FALSE) {
  fmhx_qs[["doc"]] <- ifelse(grepl("seeing a doctor", fmhx_qs[["question"]]), 1, 0)
  fmhx_qs[["rx"]] <- ifelse(grepl("prescribed", fmhx_qs[["question"]]), 1, 0)
  fmhx_qs[["how_old"]] <- ifelse(grepl("how old", fmhx_qs[["question"]]), 1, 0)
  fmhx_qs[["dgn"]] <- ifelse(
    grepl("Including yourself, who in your family has had", fmhx_qs[["question"]]),
    1,
    0
  )
  fmhx_qs[["have_you"]] <- ifelse(grepl("Have you", fmhx_qs[["question"]]), 1, 0)

  # one question table per flag
  dgn_qs <- fmhx_qs[fmhx_qs[["dgn"]] == 1, ]
  have_you_qs <- fmhx_qs[fmhx_qs[["have_you"]] == 1, ]
  rx_qs <- fmhx_qs[fmhx_qs[["rx"]] == 1, ]
  doc_qs <- fmhx_qs[fmhx_qs[["doc"]] == 1, ]
}

## Get diagnosis answers

In [None]:
# subset survey answers to relevant questions
# (rows whose question text contains the family-diagnosis wording)
health <- fmhx[grepl('Including yourself, who in your family has had', question)]

In [None]:
# Derive a short condition label `q` from the question text. Each step updates
# `q` by reference, so the order of the steps matters.
# 1. drop the common question stem
health[,q := gsub("Including yourself, who in your family has had", "", question)]
# 2. drop the article
# NOTE(review): " a " is removed wherever it occurs, not only at the start of
#               the remaining text - confirm no question has " a " mid-phrase.
health[,q := gsub(" a ","", q)]
# 3. drop the answer instruction (the "." is a regex wildcard here)
health[,q := gsub("Select all that apply.","", q)]
# 4. drop literal question marks
health[,q := gsub("?","", q, fixed= TRUE)]
# 5. drop parenthesised text together with the whitespace before it
health[,q := gsub("\\s*\\([^\\)]+\\)","", q)]

In [None]:
# clean things up a bit
# Derive a short answer label `a`: remove everything up to the last ":" and
# then everything up to the last "-" (both patterns are greedy).
# NOTE(review): an answer label that itself contains "-" would be cut at its
#               last hyphen - confirm against the table below.
health[,a := gsub(".*:","", health$answer)]
health[,a := gsub(".*-","", health$a)]

# distribution of the cleaned answer labels
table(health$a)

In [None]:
# keep all "self" rows
# (answer label equal to "Self" after trimming surrounding whitespace)
self <- health[trimws(a) == "Self"]

In [None]:
# replace the answer label with the indicator value 1
# NOTE(review): `a` held text before this assignment; the wide table below
#               converts its value columns with as.numeric(), so the stored
#               type here may still be character - confirm.
self[, a := 1]
head(self)

In [None]:
# calendar date of the survey response
self[,survey_date := as.Date(survey_datetime)]

In [None]:
# save the long (one row per person and condition) table to the /survey/ folder
write_csv(self,'self_reported_health_byperson_tmp.csv', "/survey/")

In [None]:
# create wide dataset of all conditions where someone answered "Self" 
# One row per person_id and survey_date; one column per condition label `q`,
# filled with the indicator `a`.
# NOTE(review): if a person has the same condition twice on one survey_date,
#               pivot_wider() would return list columns - confirm this cannot
#               happen in the export.
self_wide<- self %>% 
    select(person_id, survey_date, q, a) %>%
    pivot_wider(names_from = "q", names_prefix = "", values_from = "a")
head(self_wide) 

In [None]:
# standardise the column names and convert to a data.table by reference
self_wide <-self_wide %>% clean_names()
setDT(self_wide)

In [None]:
# Columns 1-2 are person_id and survey_date; every column from the third on is
# a condition indicator. Convert those to numeric, then recode NA to 0.
self_wide[, (3:ncol(self_wide)) := lapply(.SD, as.numeric), .SDcols = 3:ncol(self_wide)]
self_wide[is.na(self_wide)]<-0

In [None]:
# save the wide table to the /survey/ folder
write_csv(self_wide,'self_reported_health_byperson.csv', "/survey/")

## get list of responders

Create a list of all people who responded to the survey (to distinguish 0s from NAs)

In [None]:
# Everyone with at least one row in the family history export counts as a
# responder; this list separates "answered, condition not reported" from
# "did not respond".
fmhx_ids <- fmhx[, c("person_id", "survey_datetime")]
setDT(fmhx_ids)
unique_ids <- unique(fmhx_ids[, "person_id"])

# save the responder list to the /survey/ folder
write_csv(unique_ids, "answered_self_reported_health.csv", "/survey/")

In [None]:
# compare the number of people with a self-reported condition to all responders
dim(self_wide)
dim(unique_ids)