**SUMMARIZE ACCESS SURVEY**

**AUTHOR**: Anna Zink

**DATE**: April 23, 2024

**Updated**: February 4, 2025 to account for v8 data

**DESCRIPTION**: Pull answers to access survey and create flags for people who can't afford care or have delayed care.

# Set Up

In [None]:
# load packages (they must already be installed in the environment)
library(plyr)
library(tidyverse)
library(bigrquery)
library(stringr)
library(lubridate)

In [None]:
# Copy a CSV from the workspace bucket into the working directory and read it.
#
# Args:
#   file:   Name of the CSV object in the bucket; the same name is used for the
#           local copy that is read afterwards.
#   folder: Path inside the bucket. It is pasted directly between the bucket
#           name and `file`, so it needs its own leading and trailing slash
#           (e.g. "/ehr/").
#
# Returns:
#   The data frame produced by read_csv().
#
# Stops with an error if `gsutil cp` reports a non-zero exit status, so that a
# stale local file with the same name is never read by mistake.
load_data <- function(file, folder) {
  my_bucket <- Sys.getenv("WORKSPACE_BUCKET")
  remote_path <- paste0(my_bucket, folder, file)

  # Quote the remote path so unusual characters are not interpreted by the shell.
  copy_log <- system(
    paste("gsutil cp", shQuote(remote_path), "."),
    intern = TRUE
  )

  # With intern = TRUE a failing command only attaches a "status" attribute
  # (and raises a warning); turn that into a hard error.
  copy_status <- attr(copy_log, "status")
  if (!is.null(copy_status) && copy_status != 0) {
    stop("gsutil could not copy ", remote_path, call. = FALSE)
  }

  dsn <- read_csv(file, show_col_types = FALSE)
  return(dsn)
}

# Write a data frame to a local CSV file and copy that file to the workspace
# bucket.
#
# NOTE: this definition masks readr::write_csv() (attached via tidyverse) for
#       the rest of the session; the local file is written with
#       write_excel_csv() instead.
#
# Args:
#   df:     Data frame to save.
#   fn:     File name to create in the working directory and to upload.
#   folder: Destination path inside the bucket, pasted directly after the
#           bucket name (e.g. "/ehr/").
#
# Returns:
#   The captured output of the `gsutil cp` command.
#
# Stops with an error if `gsutil cp` reports a non-zero exit status.
write_csv <- function(df, fn, folder) {
  write_excel_csv(df, fn)

  my_bucket <- Sys.getenv("WORKSPACE_BUCKET")
  destination <- paste0(my_bucket, folder)

  # Quote both paths so unusual characters are not interpreted by the shell.
  copy_log <- system(
    paste("gsutil cp", shQuote(paste0("./", fn)), shQuote(destination)),
    intern = TRUE
  )

  # With intern = TRUE a failing command only attaches a "status" attribute
  # (and raises a warning); turn that into a hard error.
  copy_status <- attr(copy_log, "status")
  if (!is.null(copy_status) && copy_status != 0) {
    stop("gsutil could not copy ", fn, " to ", destination, call. = FALSE)
  }

  copy_log
}

# Download access survey 

This code is based on the query that the Cohort Builder generates when searching for the access survey.

In [None]:
# This query represents dataset "ACCESS_SURVEY" for domain "survey" and was generated for All of Us Controlled Tier Dataset v8
#
# What the SQL does:
#   * Returns one row per survey answer from `ds_survey` (person, timestamp,
#     survey, question, answer and survey-version columns).
#   * Keeps only answers whose question_concept_id belongs to a `cb_criteria`
#     row with domain 'SURVEY', type 'PPI' and subtype 'QUESTION' whose `path`
#     contains the id of the criteria row for concept_id 43528895.
#     NOTE(review): 43528895 is presumably the access survey's concept id;
#     confirm against the Cohort Builder.
#   * Table names are unqualified; presumably they are resolved against the
#     dataset passed to bq_dataset_query() in the commented-out export call
#     below -- confirm before running elsewhere.
#
# paste() receives a single string here, so with sep = "" it returns the SQL
# text unchanged.
access_survey_sql <- paste("
    SELECT
        answer.person_id,
        answer.survey_datetime,
        answer.survey,
        answer.question_concept_id,
        answer.question,
        answer.answer_concept_id,
        answer.answer,
        answer.survey_version_concept_id,
        answer.survey_version_name  
    FROM
        `ds_survey` answer   
    WHERE
        (
            question_concept_id IN (SELECT
                DISTINCT concept_id                         
            FROM
                `cb_criteria` c                         
            JOIN
                (SELECT
                    CAST(cr.id as string) AS id                               
                FROM
                    `cb_criteria` cr                               
                WHERE
                    concept_id IN (43528895)                               
                    AND domain_id = 'SURVEY') a 
                    ON (c.path like CONCAT('%', a.id, '.%'))                         
            WHERE
                domain_id = 'SURVEY'                         
                AND type = 'PPI'                         
                AND subtype = 'QUESTION')
        )", sep="")

# Build the Cloud Storage glob that matches the CSV shards exported from BigQuery.
# NOTE: The date component below is commented out, so every export is written
#       to the same location and overwrites the previous copy. Re-enable that
#       line to give each export day its own folder and keep historical copies
#       as the dataset definition changes.
access_survey_path <- file.path(
  Sys.getenv("WORKSPACE_BUCKET"), "bq_exports", Sys.getenv("OWNER_EMAIL"),
  # strftime(lubridate::now(), "%Y%m%d"),  # uncomment to keep one dated copy per export day
  "access_survey", "access_survey_*.csv"
)

# Tell the user where the export lives so the path can be reused later.
message(
  str_glue(
    'The data will be written to {access_survey_path}. Use this path when reading ',
    'the data into your notebooks in the future.'
  )
)

# Perform the query and export the dataset to Cloud Storage as CSV files.
# NOTE: You only need to run `bq_table_save` once. After that, you can
#       just read data from the CSVs in Cloud Storage.

#bq_table_save(
#  bq_dataset_query(Sys.getenv("WORKSPACE_CDR"), access_survey_sql, billing = Sys.getenv("GOOGLE_PROJECT")),
#  access_survey_path,
#  destination_format = "CSV")


# Read the data directly from Cloud Storage into memory.
# NOTE: Alternatively you can `gsutil -m cp {access_survey_path}` to copy these
#       files to the Jupyter disk.
#
# Lists every object matching `export_path` with `gsutil ls`, streams each one
# through `gsutil cat` into read_csv(), and row-binds the pieces.
#
# Args:
#   export_path: Cloud Storage path or glob (gs://...) of the exported CSVs.
#
# Returns:
#   One data frame holding the rows of all matching CSV files. The text columns
#   survey, question, answer and survey_version_name are forced to character so
#   that every shard is parsed with the same types; other columns are guessed
#   by read_csv().
#
# Stops with an error if the listing fails or matches no gs:// objects.
read_bq_export_from_workspace_bucket <- function(export_path) {
  col_types <- cols(
    survey = col_character(),
    question = col_character(),
    answer = col_character(),
    survey_version_name = col_character()
  )

  listing <- system2(
    "gsutil",
    args = c("ls", export_path),
    stdout = TRUE,
    stderr = TRUE
  )

  # A failed listing only attaches a "status" attribute; without this check the
  # error text captured from stderr would be treated as CSV paths.
  ls_status <- attr(listing, "status")
  if (!is.null(ls_status) && ls_status != 0) {
    stop(
      "`gsutil ls ", export_path, "` failed:\n",
      paste(listing, collapse = "\n"),
      call. = FALSE
    )
  }

  # stderr is captured together with stdout, so keep only real object URLs.
  csv_paths <- grep("^gs://", listing, value = TRUE)
  if (length(csv_paths) == 0) {
    stop("No exported CSV files found at ", export_path, call. = FALSE)
  }

  bind_rows(
    map(csv_paths, function(csv) {
      message(str_glue('Loading {csv}.'))
      read_csv(
        pipe(str_glue('gsutil cat {csv}')),
        col_types = col_types,
        show_col_types = FALSE
      )
    })
  )
}

# Load every exported shard of the access survey into one data frame.
data <- read_bq_export_from_workspace_bucket(access_survey_path)

In [None]:
# Show the number of rows and columns of the loaded survey answers.
dim(data)

# Create flags for questions about affordability and delayed care

The questions ask about the past 12 MONTHS.

In [None]:
# Tag every answer row by the topic named in its question text (1 = match, 0 = no match):
#   afford_flag         - question text contains "Can't Afford Care"
#   delayed_flag        - question text contains "Delayed Medical Care"
#   healthcare_int_flag - question text contains "Spoken To Professional"
data[["afford_flag"]] <- ifelse(
  grepl("Can't Afford Care", data[["question"]]), 1, 0
)
data[["delayed_flag"]] <- ifelse(
  grepl("Delayed Medical Care", data[["question"]]), 1, 0
)
data[["healthcare_int_flag"]] <- ifelse(
  grepl("Spoken To Professional", data[["question"]]), 1, 0
)

In [None]:
# Sanity check: count rows per question and flag combination to confirm that
# each flag picks up the questions we expect.
data %>%
  group_by(question, afford_flag, delayed_flag, healthcare_int_flag) %>%
  summarize(n = n())

In [None]:
# Keep the second ':'-separated piece of each answer string as the answer text.
# NOTE(review): this indexes column 2 of the split matrix, so it assumes at
# least one answer contains a ':' -- confirm for new data versions.
data[["ans"]] <- str_split(
  string = data[["answer"]],
  pattern = ":",
  simplify = TRUE
)[, 2]

In [None]:
# Recode the parsed answer text into 0/1 indicator columns.
data$ans_yes <- local({
  response <- trimws(data$ans)
  said_yes <- ifelse(response == "Yes", 1, 0)
  # On affordability questions a "Very Worried" answer also counts as yes.
  ifelse(data$afford_flag == 1 & response == "Very Worried", 1, said_yes)
})
data$ans_no <- ifelse(trimws(data$ans) == "No", 1, 0)
data$ans_dontknow <- ifelse(trimws(data$ans) == "Dont Know", 1, 0)
data$skip <- ifelse(trimws(data$ans) == "Skip", 1, 0)

# Flag people who have had a healthcare interaction within the year.
# Rows that are not the "Spoken To Professional" question get NA.
data$last_hc_int <- local({
  response <- trimws(data$ans)
  asked <- data$healthcare_int_flag == 1
  ifelse(
    asked & response %in% c('6mo Or Less', '6 Mo To 1 Year Ago'),
    'within a year',
    ifelse(
      asked & response %in% c('1 To 2 Years Ago'),
      '1-2 years ago',
      ifelse(asked, 'other', NA)
    )
  )
})

# 0/1 versions: interaction within one year, and within two years.
data$last_hc_int_1yr <- ifelse(data$last_hc_int %in% "within a year", 1, 0)
data$last_hc_int_2yr <- ifelse(
  data$last_hc_int %in% c("within a year", "1-2 years ago"), 1, 0
)

In [None]:
# Per question: number of answers, number of "yes" answers and share of "yes".
summ <- data %>%
  group_by(afford_flag, delayed_flag, question) %>%
  summarize(
    n = n(),
    n_yes = sum(ans_yes),
    yes = mean(ans_yes)
  )

In [None]:
# Keep only the affordability and delayed-care questions, preview them, and
# save the table to the bucket's /output/ folder.
access_freq <- filter(summ, delayed_flag == 1 | afford_flag == 1)
head(access_freq)
write_csv(access_freq, 'access_question_freq.csv', "/output/")

# Summarize by person

In [None]:
# Column of ones so that rows per person can be counted with sum().
data$tmp_count <- 1

# Collapse the answer-level data to one row per participant.
byperson <- summarize(
  group_by(data, person_id),
  nobs = sum(tmp_count),                           # answer rows for the person
  mindate = min(survey_datetime),                  # earliest survey timestamp
  maxdate = max(survey_datetime),                  # latest survey timestamp
  nskip = sum(skip),                               # answers recorded as "Skip"
  afford = sum(afford_flag * ans_yes),             # "yes" on affordability questions
  delayed = sum(delayed_flag * ans_yes),           # "yes" on delayed-care questions
  hc_int_1_yr = max(last_hc_int_1yr, na.rm = TRUE),
  hc_int_2_yr = max(last_hc_int_2yr, na.rm = TRUE)
)

In [None]:
# Person-level 0/1 indicators: at least one "yes" on an affordability question
# and at least one "yes" on a delayed-care question.
byperson[["afford_ind"]] <- ifelse(byperson[["afford"]] > 0, 1, 0)
byperson[["delayed_ind"]] <- ifelse(byperson[["delayed"]] > 0, 1, 0)

In [None]:
# Share of people with a healthcare interaction in the last 1 and 2 years,
# for each combination of the affordability and delayed-care indicators.
byperson %>%
  group_by(afford_ind, delayed_ind) %>%
  summarise(
    hc1 = mean(hc_int_1_yr),
    hc2 = mean(hc_int_2_yr)
  )

## Save 

In [None]:
# Save the person-level summary to the working directory and copy it to the
# bucket's /survey/ folder. This reuses the write_csv() helper defined near the
# top of this notebook, which performs the same write_excel_csv() + `gsutil cp`
# steps, instead of repeating them here.

# Data frame to store.
my_dataframe <- byperson

# Name of the file to create locally and in the bucket.
destination_filename <- 'access_byperson.csv'

# Bucket name; kept as a top-level variable in case later cells refer to it.
my_bucket <- Sys.getenv('WORKSPACE_BUCKET')

# Write the file and copy it from the current workspace to the bucket.
write_csv(my_dataframe, destination_filename, "/survey/")