In [1]:
import os
import pandas as pd

# Dataset prefix spliced in front of every table name in the queries of this
# notebook; the lookup raises KeyError when WORKSPACE_CDR is not set.
CDR = os.environ["WORKSPACE_CDR"]
# True whenever BIGQUERY_STORAGE_API_ENABLED is present in the environment
# (only its presence is checked, not its value).
USE_BQSTORAGE = os.environ.get("BIGQUERY_STORAGE_API_ENABLED") is not None

# Phenotype definitions keyed by phenotype name. Each entry carries the survey
# pairs and concept-ID lists that the query cells turn into SQL fragments.
PHENOTYPE_CONFIG = {
    "RA": {
        # Survey pairs that define a CASE (RA = Yes), written as
        # (question_concept_id, answer_concept_id).
        "survey_inclusion_pairs": [
            (1384593, 1385113),  # e.g. RA self-report = Yes
            # Placeholder from the original notes: a family-history question
            # (including self = Yes) could be listed here; no such pair is
            # currently part of the case definition.
        ],

        # Survey pairs that remove a participant from the CONTROLS
        # (RA = Yes by any definition).
        "survey_exclusion_pairs": [
            (1384593, 1385113),
            (836820, 1384653),
        ],

        # Standard SNOMED condition_concept_id values for an RA diagnosis.
        "condition_snomed_ids": [
            4184657, 4116441, 4344166, 4117689, 4344258, 4117686, 37108590, 37395590,
            37108605, 4116150, 134453, 4116151, 42534837, 4216972, 77630, 37108607,
            4211842, 319825, 4083556, 4142899, 4114439, 4107913, 36685024, 37108614,
            4009619, 4118008, 37108606, 4116440, 36685020, 80809, 42534836, 4271003,
            4305027, 35609010, 42534835, 256197, 4115161, 46273162, 35609009, 4035611,
            4117687, 762446, 81097, 4102493, 37119224, 37108591, 42534834,
        ],

        # Groups of SNOMED concepts whose presence removes a participant from
        # the CONTROLS; each inner list is handled as its own exclusion group.
        "exclude_snomed_groups": [
            [
                380097, 45769873, 43531578, 321822, 378743, 4228443, 4338896, 4174977, 438557, 35626042,
                4226798, 4255401, 43530690, 443592, 4266637, 4311629, 45773064, 134398, 35626044, 35626038,
                4114427, 4131908, 4226238, 201826, 4338897, 318712, 4033942, 441968, 40484648, 201531,
                37016348, 4221495, 35626041, 376114, 4095288, 45757435, 4042067, 4338901, 443729, 4225656,
                36714116, 45769830, 4227210, 45763583, 37017431, 4099214, 4308509, 4212441, 4175440, 443730,
                4099651, 4290822, 43531010, 200687, 443734, 201820, 4222876, 35626039, 45769832, 4215961,
                4152858, 43530656, 4228112, 437758, 35626068, 4252356, 46270483, 377821, 4140466, 37016349,
                45770881, 4191611, 4029423, 4032787, 45763584, 43530685, 43531616, 376979, 4223303, 443767,
                4008576, 42538169, 45770902, 443412, 4042728, 443735, 376112, 37016180, 201254, 4159742,
                443732, 435216, 37016767, 37016768, 40482801, 192279, 4009303, 195771, 35626043, 4202383,
                439770, 377552, 43531563, 45769876, 4234742, 443731, 4153217, 442793, 4196141, 376065,
                37016179, 443733, 4048028, 37016358, 4042502, 4227657, 45770830, 4193704, 201530, 4214376,
                4224254, 376683, 443238, 37016356, 37017432, 4222415, 4143857, 35626067, 4044391, 4225055,
                443727, 434164, 444094, 45757363, 4226354, 380096,
            ],
            [
                313502, 40481896, 314369, 316139, 312648, 312938, 193493, 439696, 44782690, 442603,
                314378, 444101, 42709887, 439694, 44782429, 439698, 317895, 316994, 439695, 443919,
                319826, 319034, 4183981, 43021852, 442766, 35615135, 44782728, 45768449, 443771, 314958,
                317898, 4110947, 46271022, 4249016, 43020455, 442626, 318437, 45757756, 195556, 44784439,
                43020424, 44784621, 4110948, 320128, 4263504, 442604, 201313,
            ],
        ],
    }
}


In [2]:
# ============================================
# Process configuration data to generate SQL fragments for direct use
# ============================================

def generate_survey_conditions(pairs):
    """
    Generate the SQL predicate matching any of the given survey question-answer pairs.

    Args:
        pairs: Iterable of (question_concept_id, answer_concept_id) tuples.

    Returns:
        str: One parenthesised predicate per pair, joined with ' OR '.
        When `pairs` is empty the literal 'FALSE' is returned, so that a
        query built as ``WHERE <result>`` stays syntactically valid and
        simply matches no rows (an empty string would produce broken SQL).

    Note:
        The pairs are OR-ed without an enclosing pair of parentheses; callers
        that combine the result with other predicates should wrap it in
        parentheses themselves.
    """
    conditions = []
    for question_id, answer_id in pairs:
        condition = f"""(concept_id IN ({question_id}) 
                        AND is_standard = 0  
                        AND value_source_concept_id IN ({answer_id}))"""
        conditions.append(condition)
    if not conditions:
        # No pairs configured: emit a predicate that is valid SQL and never true.
        return 'FALSE'
    return ' OR '.join(conditions)

def generate_condition_ids_str(condition_ids):
    """
    Render concept IDs as comma-separated text for the inside of a SQL IN (...) list.

    Each element is converted with str() and the pieces are joined with ', ';
    an empty input yields an empty string.
    """
    id_tokens = [str(concept_id) for concept_id in condition_ids]
    return ', '.join(id_tokens)

def generate_exclude_groups_conditions(exclude_groups):
    """
    Build one `concept_id IN (...)` SQL predicate per exclusion group.

    For every group, the predicate looks up the group's concept IDs in
    cb_criteria (rows whose full_text matches '%_rank1]%') and selects the
    standard, selectable cb_criteria concepts whose `path` contains one of
    those row ids -- the same cb_criteria lookup used by the other cohort
    queries in this notebook. Presumably this expands each seed concept to
    its subtree in the criteria hierarchy -- TODO confirm against the
    cb_criteria schema.

    Args:
        exclude_groups: Iterable of groups, each an iterable of concept IDs.

    Returns:
        list[str]: One SQL fragment per group, in input order.
    """
    def _concept_tree_filter(seed_ids):
        # Comma-separated seed IDs for the inner IN (...) list.
        seed_ids_sql = ', '.join(str(seed) for seed in seed_ids)
        return f"""
            concept_id IN (
                SELECT DISTINCT c.concept_id
                FROM `{CDR}.cb_criteria` c
                JOIN (
                    SELECT CAST(cr.id AS string) AS id
                    FROM `{CDR}.cb_criteria` cr
                    WHERE concept_id IN ({seed_ids_sql})
                      AND full_text LIKE '%_rank1]%'
                ) a
                ON (
                    c.path LIKE CONCAT('%.', a.id, '.%')
                    OR c.path LIKE CONCAT('%.', a.id)
                    OR c.path LIKE CONCAT(a.id, '.%')
                    OR c.path = a.id
                )
                WHERE is_standard = 1
                  AND is_selectable = 1
            )
        """

    return [_concept_tree_filter(seed_ids) for seed_ids in exclude_groups]

# Materialise the SQL fragments for the RA phenotype once; the query cells
# splice these strings into their SQL text.
RA_config = PHENOTYPE_CONFIG['RA']

# Case definition: survey predicate and seed condition IDs.
RA_survey_inclusion_conditions = generate_survey_conditions(RA_config['survey_inclusion_pairs'])
RA_condition_snomed_ids_str = generate_condition_ids_str(RA_config['condition_snomed_ids'])

# Control exclusions: survey predicate and one concept filter per SNOMED group.
RA_survey_exclusion_conditions = generate_survey_conditions(RA_config['survey_exclusion_pairs'])
RA_exclude_group_conditions = generate_exclude_groups_conditions(RA_config['exclude_snomed_groups'])

# Summary of what was configured: (heading, config key, unit label) per section.
banner = "=" * 60
summary_rows = (
    ("\n1. Survey Inclusion Conditions (for cases):", 'survey_inclusion_pairs', "pairs"),
    ("\n2. Survey Exclusion Conditions (for controls):", 'survey_exclusion_pairs', "pairs"),
    ("\n3. Condition SNOMED IDs:", 'condition_snomed_ids', "IDs"),
    ("\n4. Exclude SNOMED Groups:", 'exclude_snomed_groups', "groups"),
)

print(banner)
print("RA Phenotype Configuration Processed:")
print(banner)
for heading, config_key, unit in summary_rows:
    print(heading)
    print(f"   {len(RA_config[config_key])} {unit}")
# Per-group sizes follow directly under the "Exclude SNOMED Groups" section.
for group_number, group_ids in enumerate(RA_config['exclude_snomed_groups'], start=1):
    print(f"   Group {group_number}: {len(group_ids)} IDs")
print(banner)

RA Phenotype Configuration Processed:

1. Survey Inclusion Conditions (for cases):
   1 pairs

2. Survey Exclusion Conditions (for controls):
   2 pairs

3. Condition SNOMED IDs:
   47 IDs

4. Exclude SNOMED Groups:
   2 groups
   Group 1: 136 IDs
   Group 2: 47 IDs


In [3]:
# Demographics of RA cases: participants matching the survey case criteria or
# the condition (SNOMED) criteria, limited to those flagged
# has_whole_genome_variant = 1.
# Generated for All of Us Controlled Tier Dataset v7
RA_Survey_person_sql = f"""
    SELECT
        person.person_id,
        p_gender_concept.concept_name as gender,
        person.birth_datetime as date_of_birth,
        p_race_concept.concept_name as race,
        p_ethnicity_concept.concept_name as ethnicity,
        p_sex_at_birth_concept.concept_name as sex_at_birth 
    FROM
        `{CDR}.person` person 
    LEFT JOIN
        `{CDR}.concept` p_gender_concept 
            ON person.gender_concept_id = p_gender_concept.concept_id 
    LEFT JOIN
        `{CDR}.concept` p_race_concept 
            ON person.race_concept_id = p_race_concept.concept_id 
    LEFT JOIN
        `{CDR}.concept` p_ethnicity_concept 
            ON person.ethnicity_concept_id = p_ethnicity_concept.concept_id 
    LEFT JOIN
        `{CDR}.concept` p_sex_at_birth_concept 
            ON person.sex_at_birth_concept_id = p_sex_at_birth_concept.concept_id  
    WHERE
        person.PERSON_ID IN (
            SELECT DISTINCT person_id  
            FROM `{CDR}.cb_search_person` cb_search_person  
            WHERE
                -- Include participants meeting survey or condition criteria
                cb_search_person.person_id IN (
                    -- Survey-based case identification
                    SELECT criteria.person_id 
                    FROM (
                        SELECT DISTINCT person_id, entry_date, concept_id 
                        FROM `{CDR}.cb_search_all_events` 
                        WHERE {RA_survey_inclusion_conditions}
                    ) criteria 
                    
                    UNION DISTINCT
                    
                    -- Condition-based case identification (SNOMED codes)
                    SELECT criteria.person_id 
                    FROM (
                        SELECT DISTINCT person_id, entry_date, concept_id 
                        FROM `{CDR}.cb_search_all_events` 
                        WHERE concept_id IN (
                            SELECT DISTINCT c.concept_id 
                            FROM `{CDR}.cb_criteria` c 
                            JOIN (
                                SELECT CAST(cr.id as string) AS id       
                                FROM `{CDR}.cb_criteria` cr       
                                WHERE concept_id IN ({RA_condition_snomed_ids_str})       
                                    AND full_text LIKE '%_rank1]%'
                            ) a 
                            ON (c.path LIKE CONCAT('%.', a.id, '.%') 
                                OR c.path LIKE CONCAT('%.', a.id) 
                                OR c.path LIKE CONCAT(a.id, '.%') 
                                OR c.path = a.id) 
                            WHERE is_standard = 1 AND is_selectable = 1
                        ) 
                        AND is_standard = 1
                    ) criteria
                )
                -- Restrict to participants with whole genome variant data
                AND cb_search_person.person_id IN (
                    SELECT person_id 
                    FROM `{CDR}.cb_search_person` p 
                    WHERE has_whole_genome_variant = 1
                )
        )"""

# Run the query, then persist the result next to the notebook as parquet.
RA_Survey_person_df = pd.read_gbq(
    RA_Survey_person_sql,
    dialect="standard",
    use_bqstorage_api=USE_BQSTORAGE,
    progress_bar_type="tqdm_notebook",
)
RA_Survey_person_df.to_parquet('RA_Survey_person_df.parquet', index=False)

# One row per returned person record.
print(f"RA cases identified: {RA_Survey_person_df.shape[0]}")

Downloading:   0%|          | 0/82384 [00:00<?, ?rows/s]

RA cases identified: 82384


In [4]:
# Collect the distinct question_concept_ids referenced by either survey pair
# list; they restrict the survey extract to the relevant questions.
survey_question_id_pool = [entry[0] for entry in RA_config['survey_inclusion_pairs']]
survey_question_id_pool += [entry[0] for entry in RA_config['survey_exclusion_pairs']]
RA_survey_question_ids = list(set(survey_question_id_pool))
RA_survey_question_ids_str = ', '.join(str(question_id) for question_id in RA_survey_question_ids)

# Survey answers to those questions for the RA case cohort (same cohort
# subquery as the case person query).
RA_Survey_survey_sql = f"""
    SELECT
        answer.person_id,
        answer.survey_datetime,
        answer.survey,
        answer.question_concept_id,
        answer.question,
        answer.answer_concept_id,
        answer.answer  
    FROM
        `{CDR}.ds_survey` answer   
    WHERE
        -- Filter for relevant survey questions
        question_concept_id IN ({RA_survey_question_ids_str})
        
        -- Restrict to RA case participants
        AND answer.PERSON_ID IN (
            SELECT DISTINCT person_id  
            FROM `{CDR}.cb_search_person` cb_search_person  
            WHERE
                -- Include participants meeting survey or condition criteria
                cb_search_person.person_id IN (
                    -- Survey-based case identification
                    SELECT criteria.person_id 
                    FROM (
                        SELECT DISTINCT person_id, entry_date, concept_id 
                        FROM `{CDR}.cb_search_all_events` 
                        WHERE {RA_survey_inclusion_conditions}
                    ) criteria 
                    
                    UNION DISTINCT
                    
                    -- Condition-based case identification (SNOMED codes)
                    SELECT criteria.person_id 
                    FROM (
                        SELECT DISTINCT person_id, entry_date, concept_id 
                        FROM `{CDR}.cb_search_all_events` 
                        WHERE concept_id IN (
                            SELECT DISTINCT c.concept_id 
                            FROM `{CDR}.cb_criteria` c 
                            JOIN (
                                SELECT CAST(cr.id as string) AS id       
                                FROM `{CDR}.cb_criteria` cr       
                                WHERE concept_id IN ({RA_condition_snomed_ids_str})       
                                    AND full_text LIKE '%_rank1]%'
                            ) a 
                            ON (c.path LIKE CONCAT('%.', a.id, '.%') 
                                OR c.path LIKE CONCAT('%.', a.id) 
                                OR c.path LIKE CONCAT(a.id, '.%') 
                                OR c.path = a.id) 
                            WHERE is_standard = 1 AND is_selectable = 1
                        ) 
                        AND is_standard = 1
                    ) criteria
                )
                -- Restrict to participants with whole genome variant data
                AND cb_search_person.person_id IN (
                    SELECT person_id 
                    FROM `{CDR}.cb_search_person` p 
                    WHERE has_whole_genome_variant = 1
                )
        )"""

# Run the query, then persist the result next to the notebook as parquet.
RA_Survey_survey_df = pd.read_gbq(
    RA_Survey_survey_sql,
    dialect="standard",
    use_bqstorage_api=USE_BQSTORAGE,
    progress_bar_type="tqdm_notebook",
)
RA_Survey_survey_df.to_parquet('RA_Survey_survey_df.parquet', index=False)

# One row per returned survey answer.
print(f"Survey responses retrieved: {RA_Survey_survey_df.shape[0]}")

Downloading:   0%|          | 0/41041 [00:00<?, ?rows/s]

Survey responses retrieved: 41041


In [5]:
# This query represents dataset "RA_case" for domain "condition" and was generated for All of Us Controlled Tier Dataset v7
#
# It returns the condition_occurrence rows whose condition_concept_id falls in
# the RA SNOMED concept set, for participants in the RA case cohort (survey OR
# condition criteria, limited to has_whole_genome_variant = 1), with concept
# names joined in for the standard, type, visit, source and status concepts.
#
# The survey part of the cohort uses RA_survey_inclusion_conditions, exactly
# like the case person and case survey queries, instead of a hard-coded
# question/answer pair. That keeps all three case extracts on the same cohort
# whenever PHENOTYPE_CONFIG['RA']['survey_inclusion_pairs'] is edited.
RA_Survey_condition_sql = """
    SELECT
        c_occurrence.person_id,
        c_occurrence.condition_concept_id,
        c_standard_concept.concept_name as standard_concept_name,
        c_standard_concept.concept_code as standard_concept_code,
        c_standard_concept.vocabulary_id as standard_vocabulary,
        c_occurrence.condition_start_datetime,
        c_occurrence.condition_end_datetime,
        c_occurrence.condition_type_concept_id,
        c_type.concept_name as condition_type_concept_name,
        c_occurrence.stop_reason,
        c_occurrence.visit_occurrence_id,
        visit.concept_name as visit_occurrence_concept_name,
        c_occurrence.condition_source_value,
        c_occurrence.condition_source_concept_id,
        c_source_concept.concept_name as source_concept_name,
        c_source_concept.concept_code as source_concept_code,
        c_source_concept.vocabulary_id as source_vocabulary,
        c_status.concept_name as condition_status_concept_name 
    FROM
        ( SELECT
            * 
        FROM
            `""" + CDR + """.condition_occurrence` c_occurrence 
        WHERE
            (
                condition_concept_id IN (
                    SELECT DISTINCT c.concept_id 
                    FROM `""" + CDR + """.cb_criteria` c 
                    JOIN (
                        SELECT CAST(cr.id as string) AS id       
                        FROM `""" + CDR + """.cb_criteria` cr       
                        WHERE concept_id IN (""" + RA_condition_snomed_ids_str + """)       
                          AND full_text LIKE '%_rank1]%'
                    ) a 
                    ON (c.path LIKE CONCAT('%.', a.id, '.%') 
                        OR c.path LIKE CONCAT('%.', a.id) 
                        OR c.path LIKE CONCAT(a.id, '.%') 
                        OR c.path = a.id) 
                    WHERE
                        is_standard = 1 
                        AND is_selectable = 1
                )
            )  
            AND (
                c_occurrence.PERSON_ID IN (
                    SELECT DISTINCT person_id  
                    FROM `""" + CDR + """.cb_search_person` cb_search_person  
                    WHERE
                        cb_search_person.person_id IN (
                            SELECT criteria.person_id 
                            FROM (
                                SELECT DISTINCT person_id, entry_date, concept_id 
                                FROM `""" + CDR + """.cb_search_all_events` 
                                WHERE
                                    (""" + RA_survey_inclusion_conditions + """)
                            ) criteria 
                            UNION DISTINCT
                            SELECT criteria.person_id 
                            FROM (
                                SELECT DISTINCT person_id, entry_date, concept_id 
                                FROM `""" + CDR + """.cb_search_all_events` 
                                WHERE
                                    (concept_id IN (
                                        SELECT DISTINCT c.concept_id 
                                        FROM `""" + CDR + """.cb_criteria` c 
                                        JOIN (
                                            SELECT CAST(cr.id as string) AS id       
                                            FROM `""" + CDR + """.cb_criteria` cr       
                                            WHERE concept_id IN (""" + RA_condition_snomed_ids_str + """)       
                                              AND full_text LIKE '%_rank1]%'
                                        ) a 
                                        ON (c.path LIKE CONCAT('%.', a.id, '.%') 
                                            OR c.path LIKE CONCAT('%.', a.id) 
                                            OR c.path LIKE CONCAT(a.id, '.%') 
                                            OR c.path = a.id) 
                                        WHERE
                                            is_standard = 1 
                                            AND is_selectable = 1
                                    )
                                    AND is_standard = 1
                                    )
                            ) criteria
                        )
                        AND cb_search_person.person_id IN (
                            SELECT person_id 
                            FROM `""" + CDR + """.cb_search_person` p 
                            WHERE has_whole_genome_variant = 1
                        )
                )
            )
        ) c_occurrence 
    LEFT JOIN
        `""" + CDR + """.concept` c_standard_concept 
            ON c_occurrence.condition_concept_id = c_standard_concept.concept_id 
    LEFT JOIN
        `""" + CDR + """.concept` c_type 
            ON c_occurrence.condition_type_concept_id = c_type.concept_id 
    LEFT JOIN
        `""" + CDR + """.visit_occurrence` v 
            ON c_occurrence.visit_occurrence_id = v.visit_occurrence_id 
    LEFT JOIN
        `""" + CDR + """.concept` visit 
            ON v.visit_concept_id = visit.concept_id 
    LEFT JOIN
        `""" + CDR + """.concept` c_source_concept 
            ON c_occurrence.condition_source_concept_id = c_source_concept.concept_id 
    LEFT JOIN
        `""" + CDR + """.concept` c_status 
            ON c_occurrence.condition_status_concept_id = c_status.concept_id
"""

# Execute query and save results
RA_Survey_condition_df = pd.read_gbq(
    RA_Survey_condition_sql,
    dialect="standard",
    use_bqstorage_api=USE_BQSTORAGE,
    progress_bar_type="tqdm_notebook"
)

RA_Survey_condition_df.to_parquet('RA_Survey_condition_df.parquet', index=False)
print(f"Condition records retrieved: {len(RA_Survey_condition_df)}")

Downloading:   0%|          | 0/1500679 [00:00<?, ?rows/s]

Condition records retrieved: 1500679


In [6]:
# One "AND person_id NOT IN (...)" clause per entry of
# RA_exclude_group_conditions; the clauses are concatenated and appended to
# the control cohort filter.
exclude_snomed_notin_sql_parts = [
    f"""
            AND cb_search_person.person_id NOT IN (
                SELECT criteria.person_id
                FROM (
                    SELECT DISTINCT person_id, entry_date, concept_id
                    FROM `{CDR}.cb_search_all_events`
                    WHERE
                        ({group_filter}
                         AND is_standard = 1)
                ) criteria
            )
    """
    for group_filter in RA_exclude_group_conditions
]
exclude_snomed_notin_sql = "".join(exclude_snomed_notin_sql_parts)

# This query represents dataset "RA_control_all_age" for domain "person":
# demographics of participants flagged has_whole_genome_variant = 1 who match
# none of the RA survey/SNOMED criteria and none of the exclusion groups.
RA_Control_person_sql = f"""
    SELECT
        person.person_id,
        p_gender_concept.concept_name as gender,
        person.birth_datetime as date_of_birth,
        p_race_concept.concept_name as race,
        p_ethnicity_concept.concept_name as ethnicity,
        p_sex_at_birth_concept.concept_name as sex_at_birth 
    FROM
        `{CDR}.person` person 
    LEFT JOIN
        `{CDR}.concept` p_gender_concept 
            ON person.gender_concept_id = p_gender_concept.concept_id 
    LEFT JOIN
        `{CDR}.concept` p_race_concept 
            ON person.race_concept_id = p_race_concept.concept_id 
    LEFT JOIN
        `{CDR}.concept` p_ethnicity_concept 
            ON person.ethnicity_concept_id = p_ethnicity_concept.concept_id 
    LEFT JOIN
        `{CDR}.concept` p_sex_at_birth_concept 
            ON person.sex_at_birth_concept_id = p_sex_at_birth_concept.concept_id  
    WHERE
        person.PERSON_ID IN (
            SELECT DISTINCT person_id  
            FROM `{CDR}.cb_search_person` cb_search_person  
            WHERE
                -- Keep only participants with WGS data
                cb_search_person.person_id IN (
                    SELECT person_id 
                    FROM `{CDR}.cb_search_person` p 
                    WHERE has_whole_genome_variant = 1
                ) 
                -- Exclude RA cases: survey-based + RA SNOMED
                AND cb_search_person.person_id NOT IN (
                    SELECT criteria.person_id 
                    FROM (
                        -- Survey-based RA case definitions (RA Yes)
                        SELECT DISTINCT person_id, entry_date, concept_id 
                        FROM `{CDR}.cb_search_all_events` 
                        WHERE
                            ({RA_survey_exclusion_conditions})
                    ) criteria
                    UNION DISTINCT
                    SELECT criteria.person_id 
                    FROM (
                        -- Condition-based RA definitions via SNOMED
                        SELECT DISTINCT person_id, entry_date, concept_id 
                        FROM `{CDR}.cb_search_all_events` 
                        WHERE
                            concept_id IN (
                                SELECT DISTINCT c.concept_id 
                                FROM `{CDR}.cb_criteria` c 
                                JOIN (
                                    SELECT CAST(cr.id AS string) AS id
                                    FROM `{CDR}.cb_criteria` cr
                                    WHERE concept_id IN ({RA_condition_snomed_ids_str})
                                      AND full_text LIKE '%_rank1]%'
                                ) a
                                ON (
                                    c.path LIKE CONCAT('%.', a.id, '.%')
                                    OR c.path LIKE CONCAT('%.', a.id)
                                    OR c.path LIKE CONCAT(a.id, '.%')
                                    OR c.path = a.id
                                )
                                WHERE is_standard = 1
                                  AND is_selectable = 1
                            )
                            AND is_standard = 1
                    ) criteria
                )
{exclude_snomed_notin_sql}
        )
"""


# Run the query, then persist the result next to the notebook as parquet.
RA_Control_person_df = pd.read_gbq(
    RA_Control_person_sql,
    dialect="standard",
    use_bqstorage_api=USE_BQSTORAGE,  # Storage API is used only when the flag is set
    progress_bar_type="tqdm_notebook",
)
RA_Control_person_df.to_parquet('RA_Control_person_df.parquet', index=False)

# One row per returned person record.
print(f"Control persons retrieved: {RA_Control_person_df.shape[0]}")

Downloading:   0%|          | 0/105702 [00:00<?, ?rows/s]

Control persons retrieved: 105702


In [7]:
# This query represents dataset "RA_control_all_age" for domain "survey"
#
# It returns the answers to the RA-related survey questions for the control
# cohort (has_whole_genome_variant = 1, no RA survey/SNOMED match, none of the
# exclusion groups), using the same cohort filter as RA_Control_person_sql.
#
# The SELECT list includes question_concept_id and answer_concept_id so the
# control extract has the same columns as the case survey extract
# (RA_Survey_survey_sql); without them the two survey tables cannot be
# compared or stacked on concept IDs.
RA_Control_survey_sql = """
    SELECT
        answer.person_id,
        answer.survey_datetime,
        answer.survey,
        answer.question_concept_id,
        answer.question,
        answer.answer_concept_id,
        answer.answer  
    FROM
        `""" + CDR + """.ds_survey` answer   
    WHERE
        -- Restrict to RA-related survey questions
        question_concept_id IN (""" + RA_survey_question_ids_str + """)
        AND answer.PERSON_ID IN (
            SELECT DISTINCT person_id  
            FROM `""" + CDR + """.cb_search_person` cb_search_person  
            WHERE
                -- Keep only participants with WGS data
                cb_search_person.person_id IN (
                    SELECT person_id 
                    FROM `""" + CDR + """.cb_search_person` p 
                    WHERE has_whole_genome_variant = 1
                ) 
                -- Exclude RA cases: survey-based + RA SNOMED (same as RA_Control_person_sql)
                AND cb_search_person.person_id NOT IN (
                    SELECT criteria.person_id 
                    FROM (
                        -- Survey-based RA case definitions (RA = Yes)
                        SELECT DISTINCT person_id, entry_date, concept_id 
                        FROM `""" + CDR + """.cb_search_all_events` 
                        WHERE
                            (""" + RA_survey_exclusion_conditions + """)
                    ) criteria 
                    UNION DISTINCT
                    SELECT criteria.person_id 
                    FROM (
                        -- Condition-based RA case definitions via SNOMED
                        SELECT DISTINCT person_id, entry_date, concept_id 
                        FROM `""" + CDR + """.cb_search_all_events` 
                        WHERE
                            concept_id IN (
                                SELECT DISTINCT c.concept_id 
                                FROM `""" + CDR + """.cb_criteria` c 
                                JOIN (
                                    SELECT CAST(cr.id AS string) AS id       
                                    FROM `""" + CDR + """.cb_criteria` cr       
                                    WHERE concept_id IN (""" + RA_condition_snomed_ids_str + """)       
                                      AND full_text LIKE '%_rank1]%'
                                ) a 
                                ON (
                                    c.path LIKE CONCAT('%.', a.id, '.%')
                                    OR c.path LIKE CONCAT('%.', a.id)
                                    OR c.path LIKE CONCAT(a.id, '.%')
                                    OR c.path = a.id
                                )
                                WHERE is_standard = 1 
                                  AND is_selectable = 1
                            )
                            AND is_standard = 1
                    ) criteria
                )
""" + exclude_snomed_notin_sql + """
        )
"""

# Execute query and save results
RA_Control_survey_df = pd.read_gbq(
    RA_Control_survey_sql,
    dialect="standard",
    use_bqstorage_api=USE_BQSTORAGE,  # Use BigQuery Storage API if enabled
    progress_bar_type="tqdm_notebook"
)

RA_Control_survey_df.to_parquet('RA_Control_survey_df.parquet', index=False)
print(f"Control survey responses retrieved: {len(RA_Control_survey_df)}")


Downloading:   0%|          | 0/31354 [00:00<?, ?rows/s]

Control survey responses retrieved: 31354
