In [1]:
import pandas as pd
import os
import numpy as np

<span style="font-size:20pt; font-weight:bold;">RA Case</span>

In [2]:
# Load the RA survey cohort's person table from a local parquet file.
# Later cells read person_id and date_of_birth from the frames derived from it.
RA_Survey_person_df = pd.read_parquet('RA_Survey_person_df.parquet')

In [3]:
# Load the survey question/answer rows for the RA survey cohort from a local parquet file.
# Later cells read its person_id, question, answer and survey_datetime columns.
RA_Survey_survey_df = pd.read_parquet('RA_Survey_survey_df.parquet')

In [4]:
# Keep only the rows that answer the RA health-care-provider follow-up question.
ra_provider_question = 'Are you still seeing a doctor or health care provider for rheumatoid arthritis (RA)?'
is_provider_question = RA_Survey_survey_df['question'] == ra_provider_question
ra_survey_info = RA_Survey_survey_df.loc[is_provider_question]

# Tabulate how that question was answered.
ra_survey_info['answer'].value_counts()

answer
Are you still seeing a doctor or health care provider for rheumatoid arthritis (RA)? - Yes    3879
Are you still seeing a doctor or health care provider for rheumatoid arthritis (RA)? - No      855
PMI: Skip                                                                                       31
Name: count, dtype: int64

In [5]:
# Left-join each person's provider-question answer (and its timestamp) onto the person table.
survey_answer_cols = ['person_id', 'survey_datetime', 'answer']
ra_case = pd.merge(
    RA_Survey_person_df,
    ra_survey_info[survey_answer_cols],
    how='left',
    on='person_id',
)

In [6]:
# Restrict the case cohort to rows whose answer is exactly the "Yes" option (optional step).
yes_answer = 'Are you still seeing a doctor or health care provider for rheumatoid arthritis (RA)? - Yes'
ra_case = ra_case.loc[ra_case['answer'] == yes_answer]

In [7]:
# Load the condition records for the RA survey cohort from a local parquet file.
# Later cells group it by person_id and read condition_start_datetime.
# NOTE(review): presumably already restricted to RA diagnosis codes — confirm against the extraction step.
RA_Survey_condition_df = pd.read_parquet("RA_Survey_condition_df.parquet")

In [8]:
# Display (rows, columns) of the condition table as a quick size check.
RA_Survey_condition_df.shape

(1500679, 18)

In [9]:
# Earliest condition record per participant: the row holding the minimum condition_start_datetime.
first_ra_row_idx = RA_Survey_condition_df.groupby('person_id')['condition_start_datetime'].idxmin()
ra_condition = RA_Survey_condition_df.loc[first_ra_row_idx]

# Distinct participants that appear in the condition table at all.
person_id_with_ra_code = list(set(RA_Survey_condition_df['person_id']))
n_with_ra_code = len(person_id_with_ra_code)
print(f"{n_with_ra_code} had main diagnosis code")

80996 had main diagnosis code


In [10]:
# extract EHR record
# Pull every condition_occurrence row (person, concept, start datetime) for the case cohort.

# Unique participant IDs, cast to int so that only numeric literals are interpolated into the SQL.
ra_case_person_ids = sorted({int(pid) for pid in ra_case['person_id']})
if not ra_case_person_ids:
    raise ValueError("ra_case contains no person_id values; cannot query condition_occurrence")

# Join the IDs explicitly: formatting a Python tuple renders a single ID as "(123,)",
# which is not a valid SQL value list.
ra_case_id_list_sql = ", ".join(str(pid) for pid in ra_case_person_ids)

ra_case_ehr_code_sql = """
    SELECT
        person_id,
        condition_concept_id,
        condition_start_datetime
    FROM
        `""" + os.environ["WORKSPACE_CDR"] + f""".condition_occurrence` 
    WHERE person_id in ({ra_case_id_list_sql})
    """
ra_case_ehr_code = pd.read_gbq(
    ra_case_ehr_code_sql,
    dialect="standard",
    # use the BigQuery Storage API only when the environment advertises it
    use_bqstorage_api=("BIGQUERY_STORAGE_API_ENABLED" in os.environ),
    progress_bar_type="tqdm_notebook")

Downloading:   0%|          | 0/1889273 [00:00<?, ?rows/s]

In [11]:
# Keep, for each participant, only the row with the latest condition_start_datetime.
last_ehr_row_idx = ra_case_ehr_code.groupby('person_id')['condition_start_datetime'].idxmax()
ra_case_ehr_code = ra_case_ehr_code.loc[last_ehr_row_idx]

In [12]:
# Put the last EHR datetime (_ehr) next to the first RA-code datetime (_ra) for each participant.
last_ehr = ra_case_ehr_code[['person_id', 'condition_start_datetime']]
first_ra = ra_condition[['person_id', 'condition_start_datetime']]
ra_case_ehr_code = pd.merge(last_ehr, first_ra, how='left', on='person_id', suffixes=('_ehr', '_ra'))
print(f"{len(ra_case_ehr_code)} has EHR record")

3294 has EHR record


In [13]:
# ra_code_YN: 1 when an RA-code datetime was matched, 0 when the participant has EHR rows but no RA code.
# Participants with no EHR rows are absent from this frame, so they end up missing after a left merge.
ehr_and_ra_dates = ra_case_ehr_code[['condition_start_datetime_ehr', 'condition_start_datetime_ra']]
# Row-wise earlier of the two datetimes.
ra_case_ehr_code['condition_start_datetime'] = ehr_and_ra_dates.min(axis=1)

has_ra_code = ra_case_ehr_code['condition_start_datetime_ra'].notna()
ra_case_ehr_code['ra_code_YN'] = has_ra_code.astype(int)

In [14]:
# Attach the per-participant EHR summary (event datetime, last EHR datetime, RA-code flag) to the cases.
ehr_summary_cols = ['person_id', 'condition_start_datetime', 'condition_start_datetime_ehr', 'ra_code_YN']
ra_case = pd.merge(ra_case, ra_case_ehr_code[ehr_summary_cols], how='left', on='person_id')

In [15]:
# ra_event_survey: 1 when the answer contains "Yes", 0 when it contains "No" or "Skip", NaN otherwise.
answer_text = ra_case['answer']
said_yes = answer_text.str.contains("Yes", na=False)
said_no_or_skip = answer_text.str.contains("No", na=False) | answer_text.str.contains("Skip", na=False)

# Conditions are evaluated in order, so "Yes" takes precedence.
survey_ans = [said_yes, said_no_or_skip]

ra_case['ra_event_survey'] = np.select(survey_ans, [1, 0], default=np.nan)

In [16]:
# Analyze RA diagnosis code distribution
print("RA Diagnosis Code Analysis")
print("=" * 40)

ra_code_counts = ra_case['ra_code_YN'].value_counts(dropna=False)
total_count = len(ra_case)

# Labels for the two coded values. Missing values are detected with pd.isna below:
# value_counts reports them as float NaN, which does not match a pd.NA dictionary key.
ra_code_labels = {1: "Has RA code", 0: "No RA code"}

for value, count in ra_code_counts.items():
    label = "Missing" if pd.isna(value) else ra_code_labels.get(value, f"Value: {value}")
    percentage = (count / total_count * 100)
    print(f"{label:<15}: {count:>6} ({percentage:>5.1f}%)")

print(f"\nTotal participants: {total_count}")

RA Diagnosis Code Analysis
Has RA code    :   2491 ( 64.2%)
No RA code     :    803 ( 20.7%)
Value: nan     :    585 ( 15.1%)

Total participants: 3879


In [17]:
# Check distribution of RA healthcare provider survey responses
print("RA Healthcare Provider Survey Response Distribution:")
print("=" * 55)
ra_survey_counts = ra_case['ra_event_survey'].value_counts(dropna=False)
total_responses = len(ra_case)

# Define labels for better readability.
# Missing values are handled with pd.isna in the loop: value_counts reports them as float NaN,
# which does not match a pd.NA dictionary key.
response_labels = {
   1: "Still seeing provider (Yes)",
   0: "Not seeing provider (No)",
}
missing_label = "Missing/No response"

for value, count in ra_survey_counts.items():
   label = missing_label if pd.isna(value) else response_labels.get(value, f"Value: {value}")
   percentage = (count / total_responses * 100)
   print(f"{label:<25}: {count:>6} ({percentage:>5.1f}%)")

print(f"\nTotal participants: {total_responses}")

RA Healthcare Provider Survey Response Distribution:
Still seeing provider (Yes):   3879 (100.0%)

Total participants: 3879


In [18]:
# Calculate age at RA diagnosis for participants with main diagnosis code
ra_case_with_ra_date = ra_case.merge(
    ra_case_ehr_code[['person_id', 'condition_start_datetime_ra']], on='person_id', how='left'
)

# Guard on the RA-code datetime itself (the value being subtracted), not on survey_datetime:
# a row without an RA code must yield a missing age whatever its survey status.
# Age is days since date_of_birth floor-divided by 365, so leap days are not accounted for.
time_to_code_ra = ra_case_with_ra_date.apply(
    lambda row: ((row['condition_start_datetime_ra'] - row['date_of_birth']).days // 365)
    if pd.notna(row['condition_start_datetime_ra']) else np.nan,
    axis=1
)

# Convert to DataFrame and calculate summary statistics
time_to_code_ra = pd.DataFrame(time_to_code_ra, columns=['time_to_code'])
ra_code_median = time_to_code_ra['time_to_code'].median()
ra_code_Q1 = time_to_code_ra['time_to_code'].quantile(0.25)
ra_code_Q3 = time_to_code_ra['time_to_code'].quantile(0.75)

# Display summary statistics
print("Age at RA diagnosis (years):")
print(f"  Median: {ra_code_median:.1f}")
print(f"  IQR: {ra_code_Q1:.1f} - {ra_code_Q3:.1f}")

Age at RA diagnosis (years):
  Median: 54.0
  IQR: 44.0 - 62.0


<span style="font-size:20pt; font-weight:bold;">RA Control</span>

In [19]:
# Load the control cohort's person table from a local parquet file.
# Later cells read person_id and date_of_birth from the frames derived from it.
RA_Control_person_df = pd.read_parquet("RA_Control_person_df.parquet")

In [20]:
# All participant IDs in the control person table (before any down-sampling).
ra_control_id = RA_Control_person_df['person_id']
n_controls_total = len(ra_control_id)
print(f"Total size of control cohort: {n_controls_total}")
n_sampled = 0.1 * n_controls_total
# Downsample
# (the random draw below is disabled; a saved list of IDs is read from person_ids.csv instead)
# ra_control_id = ra_control_id.sample(
#         n= int(n_sampled), 
#         random_state=123).tolist()
ra_control_id2 = pd.read_csv("person_ids.csv")['x'].tolist()
ra_case_id = ra_case['person_id'].tolist()
# Drop any saved ID that also belongs to the case cohort.
ra_control_id = list(set(ra_control_id2) - set(ra_case_id))
n_controls_kept = len(ra_control_id)
print(f"Size of downsampled control cohort: {n_controls_kept}")

Total size of control cohort: 105702
Size of downsampled control cohort: 14473


In [21]:
# Load the survey rows for the control cohort from a local parquet file.
# Later cells read its person_id, answer and survey_datetime columns.
RA_Control_survey_df = pd.read_parquet("RA_Control_survey_df.parquet")

In [22]:
# a participant may provide multiple answers at the same datetime; keep only the first row per person_id.
repeat_rows = RA_Control_survey_df.duplicated(subset='person_id', keep='first')
RA_Control_survey_df = RA_Control_survey_df[~repeat_rows]

In [23]:
# Tabulate the answer column of the control survey table.
# Only one row per person_id remains at this point, so each participant contributes one answer.
RA_Control_survey_df['answer'].value_counts()

answer
PMI: Skip                                                                                  24590
Including yourself, who in your family has had rheumatoid arthritis (RA)? - Grandparent     2027
Including yourself, who in your family has had rheumatoid arthritis (RA)? - Mother          1996
Including yourself, who in your family has had rheumatoid arthritis (RA)? - Sibling          764
Including yourself, who in your family has had rheumatoid arthritis (RA)? - Father           671
Including yourself, who in your family has had rheumatoid arthritis (RA)? - Daughter         115
Including yourself, who in your family has had rheumatoid arthritis (RA)? - Son               42
Name: count, dtype: int64

In [24]:
# Left-join survey_datetime onto the control person table. No `on=` is given, so pandas joins on the
# columns the two frames have in common. Controls without a survey row get a missing survey_datetime.
# Per the original note: a non-missing survey_datetime is read as a "No" RA survey answer,
# a missing one (NaT) as "did not take the survey".
control_survey_times = RA_Control_survey_df[["person_id", "survey_datetime"]]
RA_Control_person_df = pd.merge(RA_Control_person_df, control_survey_times, how="left")

In [25]:
# Pull every condition_occurrence row (person, concept, start datetime) for the sampled controls.

# Unique participant IDs, cast to int so that only numeric literals are interpolated into the SQL.
ra_control_person_ids = sorted({int(pid) for pid in ra_control_id})
if not ra_control_person_ids:
    raise ValueError("ra_control_id is empty; cannot query condition_occurrence")

# Join the IDs explicitly: formatting a Python tuple renders a single ID as "(123,)",
# which is not a valid SQL value list.
ra_control_id_list_sql = ", ".join(str(pid) for pid in ra_control_person_ids)

ra_control_ehr_code_sql = """
    SELECT
        person_id,
        condition_concept_id,
        condition_start_datetime
    FROM
        `""" + os.environ["WORKSPACE_CDR"] + f""".condition_occurrence` 
    WHERE person_id in ({ra_control_id_list_sql})
    """
ra_control_ehr_code = pd.read_gbq(
    ra_control_ehr_code_sql,
    dialect="standard",
    # use the BigQuery Storage API only when the environment advertises it
    use_bqstorage_api=("BIGQUERY_STORAGE_API_ENABLED" in os.environ),
    progress_bar_type="tqdm_notebook")

ra_control_ehr_code.shape

Downloading:   0%|          | 0/1038090 [00:00<?, ?rows/s]

(1038090, 3)

In [26]:
# last ehr record: keep, per participant, the row with the latest condition_start_datetime
last_control_ehr_idx = ra_control_ehr_code.groupby('person_id')['condition_start_datetime'].idxmax()
ra_control_ehr_code = ra_control_ehr_code.loc[last_control_ehr_idx]
ra_control_ehr_code.shape

(8407, 3)

In [27]:
# Person rows whose person_id is in the sampled control ID list.
in_sampled_controls = RA_Control_person_df['person_id'].isin(ra_control_id)
RA_Control_person_df_sampled = RA_Control_person_df.loc[in_sampled_controls]

sampled_shape = RA_Control_person_df_sampled.shape
print(f"Sampled DataFrame shape: {sampled_shape}")

Sampled DataFrame shape: (10273, 7)


In [28]:
# In control, age at condition start time (X) is equivalent to last EHR follow-up time (C),
# so the last EHR datetime is attached under the condition_start_datetime name.
# No `on=` is given: pandas joins on the columns the two frames have in common.
control_last_ehr = ra_control_ehr_code[['person_id', 'condition_start_datetime']]
ra_control = pd.merge(RA_Control_person_df_sampled, control_last_ehr, how='left')

In [29]:
# Controls: the last EHR datetime doubles as the "last EHR" column used for cases.
ra_control['condition_start_datetime_ehr'] = ra_control['condition_start_datetime']

# Flags are 0 where the underlying datetime exists and missing otherwise.
# Missing is encoded as NaN (float column) rather than None (object column) so these columns use
# the same numeric encoding as ra_code_YN / ra_event_survey in ra_case.
ra_control['ra_code_YN'] = np.where(ra_control['condition_start_datetime'].notna(), 0, np.nan)
ra_control['ra_event_survey'] = np.where(ra_control['survey_datetime'].notna(), 0, np.nan)

In [30]:
# Stack cases and controls into one frame.
# ignore_index=True gives the result a fresh 0..n-1 index; without it each input keeps its own
# row labels, so labels present in both inputs would appear twice.
ra_df = pd.concat([ra_case, ra_control], ignore_index=True)
ra_df.shape

(14152, 12)

In [31]:
def _age_in_whole_years(row, event_col):
    """Return (row[event_col] - row['date_of_birth']) in days floor-divided by 365.

    Returns None when row[event_col] is missing.
    """
    if pd.notna(row[event_col]):
        return (row[event_col] - row['date_of_birth']).days // 365
    return None


# new age column -> datetime column the age is measured at
_age_columns = {
    'age_at_survey_event': 'survey_datetime',                # follow-up time (C)
    'age_at_condition_event': 'condition_start_datetime',    # age at condition start time (X)
    'age_at_last_ehr': 'condition_start_datetime_ehr',       # last ehr follow-up time
}

for _age_col, _event_col in _age_columns.items():
    ra_df[_age_col] = ra_df.apply(_age_in_whole_years, axis=1, event_col=_event_col)

In [32]:
# Participants whose ra_event_survey flag equals 1.0 (answered 'yes' in the RA main questionnaire)
answered_yes = ra_df['ra_event_survey'] == 1.0
ra_survey_yes = ra_df.loc[answered_yes]

# Median and quartiles of age at the survey event
yes_survey_age = ra_survey_yes['age_at_survey_event']
survey_yes_median = yes_survey_age.median()
survey_yes_Q1 = yes_survey_age.quantile(0.25)
survey_yes_Q3 = yes_survey_age.quantile(0.75)

# Print the report between two 100-character rules
_yes_rule = "=" * 100
for _line in (
    _yes_rule,
    "Participants who answered 'yes' in the RA main questionnaire - Age Statistics",
    _yes_rule,
    f"Number of participants: {len(ra_survey_yes)}",
    f"Median age: {survey_yes_median} years",
    f"25th percentile: {survey_yes_Q1} years",
    f"75th percentile: {survey_yes_Q3} years",
    _yes_rule,
):
    print(_line)

Participants who answered 'yes' in the RA main questionnaire - Age Statistics
Number of participants: 3879
Median age: 61.0 years
25th percentile: 51.0 years
75th percentile: 69.0 years


In [33]:
# Rows of the combined frame that belong to the sampled control IDs
is_control = ra_df['person_id'].isin(ra_control_id)
control_group = ra_df[is_control]

# Median and quartiles of age at the last EHR record
control_last_ehr_age = control_group['age_at_last_ehr']
control_ehr_followup_median = control_last_ehr_age.median()
control_ehr_followup_Q1 = control_last_ehr_age.quantile(0.25)
control_ehr_followup_Q3 = control_last_ehr_age.quantile(0.75)

# Print the report between two 55-character rules
_ehr_rule = "=" * 55
for _line in (
    _ehr_rule,
    "Control Group - EHR Follow-up Statistics",
    _ehr_rule,
    f"Number of participants: {len(control_group)}",
    f"Median age: {control_ehr_followup_median} years",
    f"25th percentile: {control_ehr_followup_Q1} years",
    f"75th percentile: {control_ehr_followup_Q3} years",
    _ehr_rule,
):
    print(_line)

Control Group - EHR Follow-up Statistics
Number of participants: 10273
Median age: 44.0 years
25th percentile: 30.0 years
75th percentile: 59.0 years


In [34]:
# Median and quartiles of the control participants' age at the survey event
control_survey_age = control_group['age_at_survey_event']
control_survey_median = control_survey_age.median()
control_survey_Q1 = control_survey_age.quantile(0.25)
control_survey_Q3 = control_survey_age.quantile(0.75)

# Print the report between two 60-character rules
_survey_rule = "=" * 60
for _line in (
    _survey_rule,
    "Control Group - Survey Age Distribution",
    _survey_rule,
    f"Number of participants: {len(control_group)}",
    f"Median age: {control_survey_median} years",
    f"25th percentile: {control_survey_Q1} years",
    f"75th percentile: {control_survey_Q3} years",
    _survey_rule,
):
    print(_line)


Control Group - Survey Age Distribution
Number of participants: 10273
Median age: 50.0 years
25th percentile: 33.0 years
75th percentile: 64.0 years


# Combine with SNPs

In [35]:
# This snippet assumes you run setup first

# This code copies file in your Google Bucket and loads it into a dataframe

# Replace 'test.csv' with THE NAME of the file you're going to download from the bucket (don't delete the quotation marks)
name_of_file_in_bucket = 'RA_wgs.csv'

########################################################################
##
################# DON'T CHANGE FROM HERE ###############################
##
########################################################################

# get the bucket name; without it the gsutil source path cannot be built
my_bucket = os.getenv('WORKSPACE_BUCKET')
if not my_bucket:
    raise RuntimeError("WORKSPACE_BUCKET is not set; cannot locate the file in the bucket")

# copy csv file from the bucket to the current working space
# os.system returns the command's exit status; anything non-zero means the copy failed, in which case
# reading the local path would either fail or silently pick up a stale file.
copy_status = os.system(f"gsutil cp '{my_bucket}/data/{name_of_file_in_bucket}' .")
if copy_status != 0:
    raise RuntimeError(f"gsutil cp failed for {name_of_file_in_bucket} (exit status {copy_status})")

print(f'[INFO] {name_of_file_in_bucket} is successfully downloaded into your working space')
# load the downloaded csv file into a dataframe
snp_df = pd.read_csv(name_of_file_in_bucket)

Copying gs://fc-secure-28df46b0-6f9d-4443-ae5f-cb0492e90c24/data/RA_wgs.csv...
/ [1 files][ 19.2 MiB/ 19.2 MiB]                                                
Operation completed over 1 objects/19.2 MiB.                                     


[INFO] RA_wgs.csv is successfully downloaded into your working space


In [36]:
# merge data: left-join the selected SNP columns onto the combined cohort by person_id
snp_columns = [
    'person_id',
    '12:111446804:T:C',
    '12:45976333:C:G',
    '13:39781776:T:C',
    '14:104920174:G:A',
    '14:68287978:G:A',
    '1:116738074:C:T',
    '5:143224856:A:G',
    '6:159082054:A:G',
    '6:36414159:G:GA',
    '9:34710263:G:A',
]
ra_df = pd.merge(ra_df, snp_df[snp_columns], how='left', on='person_id')
ra_df.shape

(14152, 25)

# Save document

In [37]:
# set up
import os
import subprocess
import numpy as np
import pandas as pd

In [38]:
# This snippet assumes you run setup first

# This code saves your dataframe into a csv file in a "data" folder in Google Bucket

# Replace df with THE NAME OF YOUR DATAFRAME
my_dataframe = ra_df

# Replace 'test.csv' with THE NAME of the file you're going to store in the bucket (don't delete the quotation marks)
destination_filename = 'RA_data3_5.csv'

########################################################################
##
################# DON'T CHANGE FROM HERE ###############################
##
########################################################################

# write the dataframe (without its index) to a csv file next to the notebook
my_dataframe.to_csv(destination_filename, index=False)

# bucket name comes from the workspace environment
my_bucket = os.getenv('WORKSPACE_BUCKET')

# gsutil command that copies the local csv into the bucket's data/ folder
args = ["gsutil", "cp", "./" + destination_filename, str(my_bucket) + "/data/"]
# capture both streams so they can be inspected below
output = subprocess.run(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

# show what gsutil wrote to stderr (raw bytes)
output.stderr

b'Copying file://./RA_data3_5.csv [Content-Type=text/csv]...\n/ [0 files][    0.0 B/  2.7 MiB]                                                \r-\r- [1 files][  2.7 MiB/  2.7 MiB]                                                \r\\\r\nOperation completed over 1 objects/2.7 MiB.                                      \n'