**Title:** Query Diabetes Task Data

**Author:** Anna Zink

**Date:** August 18, 2025

**Description:** Pull hba1c and glucose data for the diabetes task sample


# Set Up

In [None]:
library(tidyverse)
library(bigrquery)

In [None]:
# Copy a CSV from the workspace bucket into the working directory and read it.
#
# Args:
#   file:   Name of the CSV file inside the bucket folder.
#   folder: Bucket folder, pasted verbatim between the bucket URL and `file`,
#           so it needs its leading and trailing slash (e.g. "/ehr/").
#
# Returns: the parsed file as returned by read_csv().
# Stops with an error when the gsutil copy exits with a non-zero status.
load_data <- function(file, folder) {
  my_bucket <- Sys.getenv("WORKSPACE_BUCKET")
  copy_log <- system(paste0("gsutil cp ", my_bucket, folder, file, " ."), intern = TRUE)

  # With intern = TRUE a failing command only raises a warning and stores its
  # exit code in the "status" attribute. Fail loudly here so that a stale local
  # copy of `file` is never read in place of the bucket version.
  copy_status <- attr(copy_log, "status")
  if (!is.null(copy_status) && copy_status != 0) {
    stop("gsutil cp failed (exit status ", copy_status, ") for ",
         my_bucket, folder, file, call. = FALSE)
  }

  read_csv(file, show_col_types = FALSE)
}

# Write a data frame to a local CSV file and copy that file to the workspace
# bucket.
#
# NOTE: this helper has the same name as readr::write_csv() and therefore
# shadows it for the rest of the notebook; use readr::write_csv() explicitly
# if the readr function is needed.
#
# Args:
#   df:     Data frame to save.
#   fn:     Local file name to write; the same name is used in the bucket.
#   folder: Bucket folder, pasted verbatim after the bucket URL, so it needs
#           its leading and trailing slash (e.g. "/ehr/").
#
# Returns: the captured output of the gsutil command (character vector).
# Stops with an error when the gsutil copy exits with a non-zero status.
write_csv <- function(df, fn, folder) {
  write_excel_csv(df, fn)
  my_bucket <- Sys.getenv("WORKSPACE_BUCKET")
  copy_log <- system(paste0("gsutil cp ./", fn, " ", my_bucket, folder), intern = TRUE)

  # With intern = TRUE a failing command only raises a warning and stores its
  # exit code in the "status" attribute; turn that into an error so a failed
  # upload cannot go unnoticed.
  copy_status <- attr(copy_log, "status")
  if (!is.null(copy_status) && copy_status != 0) {
    stop("gsutil cp failed (exit status ", copy_status, ") while uploading ",
         fn, " to ", my_bucket, folder, call. = FALSE)
  }

  copy_log
}

# Pull hba1c and glucose measures

In [None]:
# This query represents dataset "access_sample_glucose_hba1c_readings" for domain "measurement" and was generated for All of Us Controlled Tier Dataset v8
#
# Overview of the SQL text stored in `hba1c_glucose_sql`:
#   * Output: one row per selected `measurement` record, with the raw
#     measurement fields plus concept names looked up through LEFT JOINs on
#     `concept` (standard concept, type, operator, value, unit, visit, source).
#   * Measurement filter: `measurement_concept_id` must be a standard,
#     selectable `cb_criteria` concept whose `path` contains the id of a
#     criteria row for one of the 13 concept ids listed in the first IN (...)
#     list (presumably the HbA1c / glucose concepts -- confirm against the
#     dataset definition).
#   * Person filter: `person_id` must appear in `cb_search_person` and have at
#     least one row in `cb_search_all_events` for non-standard concept
#     43528895 or a concept found under it via `cb_criteria.path`.
#   * Table names are unqualified; they are presumably resolved against the
#     WORKSPACE_CDR dataset when the query is run through bq_dataset_query().
#   * The mixed-case `LEFT JOIn` keyword further down is left as generated.
#   * paste(..., sep="") receives a single string, so the text is stored
#     unchanged.
hba1c_glucose_sql <- paste("
    SELECT
        measurement.person_id,
        measurement.measurement_concept_id,
        m_standard_concept.concept_name as standard_concept_name,
        m_standard_concept.concept_code as standard_concept_code,
        m_standard_concept.vocabulary_id as standard_vocabulary,
        measurement.measurement_datetime,
        measurement.measurement_type_concept_id,
        m_type.concept_name as measurement_type_concept_name,
        measurement.operator_concept_id,
        m_operator.concept_name as operator_concept_name,
        measurement.value_as_number,
        measurement.value_as_concept_id,
        m_value.concept_name as value_as_concept_name,
        measurement.unit_concept_id,
        m_unit.concept_name as unit_concept_name,
        measurement.range_low,
        measurement.range_high,
        measurement.visit_occurrence_id,
        m_visit.concept_name as visit_occurrence_concept_name,
        measurement.measurement_source_value,
        measurement.measurement_source_concept_id,
        m_source_concept.concept_name as source_concept_name,
        m_source_concept.concept_code as source_concept_code,
        m_source_concept.vocabulary_id as source_vocabulary,
        measurement.unit_source_value,
        measurement.value_source_value 
    FROM
        ( SELECT
            * 
        FROM
            `measurement` measurement 
        WHERE
            (
                measurement_concept_id IN (SELECT
                    DISTINCT c.concept_id 
                FROM
                    `cb_criteria` c 
                JOIN
                    (SELECT
                        CAST(cr.id as string) AS id       
                    FROM
                        `cb_criteria` cr       
                    WHERE
                        concept_id IN (3003309, 3004410, 3005673, 3007263, 3034639, 36032094, 36304734, 40762352, 40795740, 4149519, 4184637, 4197971, 42869630)       
                        AND full_text LIKE '%_rank1]%'      ) a 
                        ON (c.path LIKE CONCAT('%.', a.id, '.%') 
                        OR c.path LIKE CONCAT('%.', a.id) 
                        OR c.path LIKE CONCAT(a.id, '.%') 
                        OR c.path = a.id) 
                WHERE
                    is_standard = 1 
                    AND is_selectable = 1)
            )  
            AND (
                measurement.PERSON_ID IN (SELECT
                    distinct person_id  
                FROM
                    `cb_search_person` cb_search_person  
                WHERE
                    cb_search_person.person_id IN (SELECT
                        criteria.person_id 
                    FROM
                        (SELECT
                            DISTINCT person_id, entry_date, concept_id 
                        FROM
                            `cb_search_all_events` 
                        WHERE
                            (concept_id IN(SELECT
                                DISTINCT c.concept_id 
                            FROM
                                `cb_criteria` c 
                            JOIN
                                (SELECT
                                    CAST(cr.id as string) AS id       
                                FROM
                                    `cb_criteria` cr       
                                WHERE
                                    concept_id IN (43528895)       
                                    AND full_text LIKE '%_rank1]%'      ) a 
                                    ON (c.path LIKE CONCAT('%.', a.id, '.%') 
                                    OR c.path LIKE CONCAT('%.', a.id) 
                                    OR c.path LIKE CONCAT(a.id, '.%') 
                                    OR c.path = a.id) 
                            WHERE
                                is_standard = 0 
                                AND is_selectable = 1) 
                            AND is_standard = 0 )) criteria ) )
            )) measurement 
    LEFT JOIN
        `concept` m_standard_concept 
            ON measurement.measurement_concept_id = m_standard_concept.concept_id 
    LEFT JOIN
        `concept` m_type 
            ON measurement.measurement_type_concept_id = m_type.concept_id 
    LEFT JOIN
        `concept` m_operator 
            ON measurement.operator_concept_id = m_operator.concept_id 
    LEFT JOIN
        `concept` m_value 
            ON measurement.value_as_concept_id = m_value.concept_id 
    LEFT JOIN
        `concept` m_unit 
            ON measurement.unit_concept_id = m_unit.concept_id 
    LEFT JOIn
        `visit_occurrence` v 
            ON measurement.visit_occurrence_id = v.visit_occurrence_id 
    LEFT JOIN
        `concept` m_visit 
            ON v.visit_concept_id = m_visit.concept_id 
    LEFT JOIN
        `concept` m_source_concept 
            ON measurement.measurement_source_concept_id = m_source_concept.concept_id", sep="")

# Build the Cloud Storage wildcard path used for the BigQuery export:
#   <WORKSPACE_BUCKET>/bq_exports/<OWNER_EMAIL>/hba1c_glucose/hba1c_glucose_*.csv
# NOTE: The date component below is commented out, so every export is written
#       to the same location and overwrites the previous copy. Re-enable it to
#       keep one copy per export day.
hba1c_glucose_path <- file.path(
  Sys.getenv("WORKSPACE_BUCKET"), "bq_exports", Sys.getenv("OWNER_EMAIL"),
  # strftime(lubridate::now(), "%Y%m%d"),  # uncomment to keep dated copies
  "hba1c_glucose", "hba1c_glucose_*.csv"
)

# Tell the user where the export lives so later notebooks can read it back.
message(
  str_glue(
    'The data will be written to {hba1c_glucose_path}. Use this path when reading ',
    'the data into your notebooks in the future.'
  )
)

# Perform the query and export the dataset to Cloud Storage as CSV files.
# NOTE: You only need to run `bq_table_save` once. After that, you can
#       just read data from the CSVs in Cloud Storage.
#bq_table_save(
#  bq_dataset_query(Sys.getenv("WORKSPACE_CDR"), hba1c_glucose_sql, billing = Sys.getenv("GOOGLE_PROJECT")),
#  hba1c_glucose_path,
#destination_format = "CSV")

In [None]:
# Read the data directly from Cloud Storage into memory.
# NOTE: Alternatively you can `gsutil -m cp` the files matched by the export
#       path to the Jupyter disk and read them from there.
#
# Args:
#   export_path: Cloud Storage path (wildcards allowed) of the exported CSV
#                files; it is passed as-is to `gsutil ls`.
#
# Returns: the rows of every listed CSV file, combined with bind_rows().
# Stops with an error when `gsutil ls` exits with a non-zero status.
read_bq_export_from_workspace_bucket <- function(export_path) {
  # Force the text columns to character so every file is parsed with the same
  # types; the remaining columns are left to read_csv's type guessing.
  col_types <- cols(standard_concept_name = col_character(), standard_concept_code = col_character(), standard_vocabulary = col_character(), measurement_type_concept_name = col_character(), operator_concept_name = col_character(), value_as_concept_name = col_character(), unit_concept_name = col_character(), visit_occurrence_concept_name = col_character(), measurement_source_value = col_character(), source_concept_name = col_character(), source_concept_code = col_character(), source_vocabulary = col_character(), unit_source_value = col_character(), value_source_value = col_character())

  # Capture stdout only. Capturing stderr as well would mix gsutil diagnostics
  # into the listing, and each of those lines would then be read as a CSV path.
  csv_paths <- system2('gsutil', args = c('ls', export_path), stdout = TRUE, stderr = "")
  ls_status <- attr(csv_paths, "status")
  if (!is.null(ls_status) && ls_status != 0) {
    stop("gsutil ls failed (exit status ", ls_status, ") for ", export_path,
         call. = FALSE)
  }

  bind_rows(
    map(csv_paths,
        function(csv) {
          message(str_glue('Loading {csv}.'))
          # Stream the file through `gsutil cat` instead of copying it to disk.
          read_csv(pipe(str_glue('gsutil cat {csv}')), col_types = col_types, show_col_types = FALSE)
        }))
}
# Load every exported CSV file into one data frame.
hba1c_glucose_df <- read_bq_export_from_workspace_bucket(export_path = hba1c_glucose_path)

# Quick sanity checks: number of rows/columns and the first five records.
dim(hba1c_glucose_df)

head(hba1c_glucose_df, n = 5)

In [None]:
# Classify each reading from its standard concept name (case-insensitive):
#   name contains "FASTING"          -> 'fasting glucose'
#   otherwise name contains "GLUCOSE" -> 'random glucose'
#   anything else (including a missing name) -> 'HbA1c'
hba1c_glucose_df <- mutate(
  hba1c_glucose_df,
  msr_cat = case_when(
    grepl('FASTING', toupper(standard_concept_name)) ~ 'fasting glucose',
    grepl('GLUCOSE', toupper(standard_concept_name)) ~ 'random glucose',
    TRUE ~ 'HbA1c'
  )
)

In [None]:
# Columns carried forward into the analysis data frame.
keepvars <- c(
  'person_id',
  'standard_concept_name',
  'msr_cat',
  'measurement_datetime',
  'value_as_concept_name',
  'value_as_number',
  'unit_concept_name',
  'range_low',
  'range_high'
)

In [None]:
# Calendar year of each measurement, used below to restrict the study window.
# year() is namespace-qualified because lubridate is only attached by
# library(tidyverse) from tidyverse 2.0.0 onwards; the qualified call works
# with older installs too.
hba1c_glucose_df$year <- lubridate::year(hba1c_glucose_df$measurement_datetime)

In [None]:
# Keep measurements taken from 2016 through 2023 and only the columns listed
# in `keepvars`.
# which() drops rows whose year is NA; indexing with the raw logical vector
# would instead insert an all-NA row for every such measurement.
df <- hba1c_glucose_df[
  which(hba1c_glucose_df$year >= 2016 & hba1c_glucose_df$year < 2024),
  keepvars
]

# Add in diabetes flag based on clinical cutoffs

- HbA1c >= 6.5
- Fasting Glucose >= 126
- Random Glucose >= 200

Exclude values of 10000000 or more (default set in the All of Us data); this matches the `value_as_number < 10000000` check used in the code below.

In [None]:
# Some results are stored as text in value_as_concept_name rather than in
# value_as_number.
#   value_2:    value_as_concept_name parsed as a number (NA when it is not
#               numeric; the coercion warnings are expected and suppressed).
#   value_updt: value_as_number when present, otherwise value_2.
df <- mutate(
  df,
  value_2 = suppressWarnings(as.numeric(as.character(value_as_concept_name))),
  value_updt = coalesce(value_as_number, value_2)
)

## HbA1c

In [None]:
# unit conversion
df$value_updt<-ifelse(df$msr_cat == 'HbA1c' 
                       & df$unit_concept_name %in% c('milligram per deciliter','mg/dL'), 
                       (df$value_updt+46.7)/28.7,
               ifelse(df$msr_cat == 'HbA1c' 
                      & df$unit_concept_name %in% c('nanogram per milliliter'), 
                       (df$value_updt/10000 + 46.7)/28.7, df$value_updt))

In [None]:
# HbA1c flag: 1 when the value is >= 6.5, 0 when it is < 6.5, NA for other
# measurement types, missing values and placeholder values (>= 10000000).
#
# The placeholder test is applied to the raw value_as_number. That column is NA
# when the number was parsed from value_as_concept_name, and `TRUE & NA` is NA,
# which would turn every such positive result into NA while negatives still
# became 0. Treat a missing value_as_number as "not a placeholder" instead.
df$hba1c_diabetes <- ifelse(
  df$msr_cat == "HbA1c" & df$value_updt >= 6.5 &
    (is.na(df$value_as_number) | df$value_as_number < 10000000),
  1,
  ifelse(df$msr_cat == "HbA1c" & df$value_updt < 6.5, 0, NA)
)

## Fasting Glucose

In [None]:
# Fasting glucose flag: 1 when the value is >= 126, 0 when it is < 126, NA for
# other measurement types, missing values and placeholder values (>= 10000000).
#
# The placeholder test is applied to the raw value_as_number. That column is NA
# when the number was parsed from value_as_concept_name, and `TRUE & NA` is NA,
# which would turn every such positive result into NA while negatives still
# became 0. Treat a missing value_as_number as "not a placeholder" instead.
df$hba1c_gluc_fast <- ifelse(
  df$msr_cat == "fasting glucose" & df$value_updt >= 126 &
    (is.na(df$value_as_number) | df$value_as_number < 10000000),
  1,
  ifelse(df$msr_cat == "fasting glucose" & df$value_updt < 126, 0, NA)
)

## Random Glucose

In [None]:
# Random glucose flag: 1 when the value is >= 200, 0 when it is < 200, NA for
# other measurement types, missing values and placeholder values (>= 10000000).
#
# The placeholder test is applied to the raw value_as_number. That column is NA
# when the number was parsed from value_as_concept_name, and `TRUE & NA` is NA,
# which would turn every such positive result into NA while negatives still
# became 0. Treat a missing value_as_number as "not a placeholder" instead.
df$hba1c_gluc_rand <- ifelse(
  df$msr_cat == "random glucose" & df$value_updt >= 200 &
    (is.na(df$value_as_number) | df$value_as_number < 10000000),
  1,
  ifelse(df$msr_cat == "random glucose" & df$value_updt < 200, 0, NA)
)

## Finalize and Output

Create a measure for diabetes 
- diabetes = 1 if the reading met the clinical criteria, 0 if it did not meet the clinical criteria, NA if there is a result but no usable value.

In [None]:
# Collapse the three per-test flags into one column: each flag is only filled
# for its own measurement type, so the first non-missing flag is the row's
# result. `source` records which test the row came from.
df <- mutate(
  df,
  diabetes = coalesce(hba1c_diabetes, hba1c_gluc_fast, hba1c_gluc_rand),
  source = msr_cat
)

In [None]:
# Columns written to the output file.
keepvars <- c(
  'person_id',
  'measurement_datetime',
  'value_updt',
  'diabetes',
  'source'
)

In [None]:
# Save the final readings and upload them to the "/ehr/" folder of the
# workspace bucket. write_csv() here is the helper defined in the set-up cell
# (df, fn, folder), not readr::write_csv().
write_csv(
  df[, keepvars],
  fn = 'hba1c_glucose_readings.csv',
  folder = "/ehr/"
)