## Extracting Time-to-Dementia Follow-Up Data

In [1]:
# Load all packages
import numpy as np
import pandas as pd
import openpyxl

In [2]:
# Step 1: read the SpO2 sheet of the mastersheet and keep the identifier columns
# for every participant we have EEG/SpO2 data for
spo2_data = pd.read_excel("mastersheet.xlsx", sheet_name="SPO2Final")
relevant_pids = spo2_data.loc[:, ["nsrrid", "PID", "Parent Study"]].copy()
print("Number participants in total:", relevant_pids.shape[0])

Number participants in total: 5793


In [3]:
# Load the ARIC and FHS datasets with dementia follow-up data, along with the
# number of days from ARIC/FHS Visit 1 to the PSG date.
# Paths are written with forward slashes, which Python resolves on Windows as
# well as macOS/Linux (backslash-separated paths only resolve on Windows).

aric_outcomes = pd.read_csv("datasets_FHS_ARIC_CHS_SHHS/dataset_ARIC_2024b/Main_Study/V8T/CSV/status81_np.csv")
fhs_outcomes = pd.read_csv("datasets_FHS_ARIC_CHS_SHHS/dataset_Framingham_Offspring_2023b/Datasets/CSV/vr_demsurv_2018_a_1281d.csv")

# Read PID as text and strip whitespace. The outcome tables are merged with this
# one on "PID" after their identifiers have been converted to stripped strings,
# so the key must have the same representation here or rows silently fail to match.
days_to_psg = pd.read_csv("parent_shhs_pids.csv", dtype={"PID": str})
days_to_psg = days_to_psg[["PID", "days_studyv1"]].copy()
days_to_psg["PID"] = days_to_psg["PID"].str.strip()

### FHS-OFF1 Data

In [4]:
# How many FOFF participants in the SpO2 dataset have no record in fhs_outcomes?

# Put the key columns on a common footing: string dtype, surrounding whitespace removed
for _frame, _column in ((spo2_data, "PID"), (spo2_data, "Parent Study"), (fhs_outcomes, "PID")):
    _frame[_column] = _frame[_column].astype(str).str.strip()

# SpO2 rows that belong to the FOFF parent study
spo2_foff_filtered = spo2_data.loc[spo2_data["Parent Study"].eq("FOFF")]

# FOFF PIDs present in the SpO2 data but absent from the dementia follow-up file
spo2_missing_foff_participants = set(spo2_foff_filtered["PID"]).difference(fhs_outcomes["PID"])

# Report
print(f"Number of FOFF PIDs in spo2 but not in fhs_outcomes: {len(spo2_missing_foff_participants)}")

# Note: per the FHS-OFF1 documentation, these 6 PIDs may not have consented to information sharing

Number of FOFF PIDs in spo2 but not in fhs_outcomes: 6


In [5]:
# Extract relevant follow-up data for FOFF

# PIDs of every participant in the SpO2 dataset (all parent studies).
# Kept under this name because later cells reference it.
spo2_pids = set(spo2_data["PID"])

# Match FHS outcomes against FOFF PIDs only. Filtering with the all-cohort set
# would also keep an FHS record whose PID happens to equal an identifier from
# a different parent study.
foff_pids = set(spo2_data.loc[spo2_data["Parent Study"] == "FOFF", "PID"])
fhs_filtered = fhs_outcomes[fhs_outcomes["PID"].isin(foff_pids)]

fhs_dementia = fhs_filtered[["PID", "DEM_STATUS", "DEM_SURVDATE"]].copy()

# Checking count of participants with dementia (1) and without (0)
print(fhs_dementia["DEM_STATUS"].value_counts(dropna=False))

DEM_STATUS
0    553
1     62
Name: count, dtype: int64


In [6]:
# Pull out the competing event (date of death, where recorded) from the FHS survival dataset
# Note: dod = date of death
# The path uses forward slashes only, so it resolves on Windows, macOS and Linux.
fhs_dod = pd.read_csv("datasets_FHS_ARIC_CHS_SHHS/dataset_Framingham_Offspring_2023b/Datasets/CSV/vr_survdth_2019_a_1337d.csv")

# Standardize datatypes (non-numeric DATEDTH entries become NaN)
fhs_dod["PID"] = fhs_dod["PID"].astype(str).str.strip()
fhs_dod["DATEDTH"] = pd.to_numeric(fhs_dod["DATEDTH"], errors="coerce")

# Keep only FOFF participants we have SpO2 data for and copy the relevant columns.
# The PID set is restricted to FOFF so identifiers from other parent studies
# cannot match an FHS record by coincidence.
foff_pids = set(spo2_data.loc[spo2_data["Parent Study"] == "FOFF", "PID"])
fhs_dod_filtered = fhs_dod[fhs_dod["PID"].isin(foff_pids)]
fhs_dod_final = fhs_dod_filtered[["PID", "DATEDTH"]].copy()

# Computed outside the f-string: nesting double quotes inside a double-quoted
# f-string is a SyntaxError on Python versions before 3.12.
n_with_dod = fhs_dod_final["DATEDTH"].notna().sum()

print("Number participants total:", len(fhs_dod_final))
print(f"Number participants with DoD: {n_with_dod}")

Number participants total: 615
Number participants with DoD: 217


In [7]:
# Attach date of death to the dementia outcomes; participants without a death record keep NaN
fhs_merged = fhs_dementia.merge(fhs_dod_final, how="left", on="PID")
print(fhs_merged.shape[0])

615


In [8]:
# Merge with PSG date (days_studyv1 from days_to_psg)
fhs_final_dementia = pd.merge(fhs_merged, days_to_psg, on="PID", how="left")

# A left merge adds rows when a PID occurs more than once in days_to_psg, which
# would double-count participants in every later step. Fail loudly instead.
assert len(fhs_final_dementia) == len(fhs_merged), (
    "Merging days_to_psg changed the number of FHS rows; check days_to_psg for duplicate PIDs"
)

In [9]:
# Verify that no participant has a dementia diagnosis recorded on/after DOD, to ensure data harmony

# Masks and counts are built first so the f-strings contain no nested double
# quotes (a SyntaxError on Python versions before 3.12).
has_dementia = fhs_final_dementia["DEM_STATUS"] == 1
has_dod = fhs_final_dementia["DATEDTH"].notna()
n_fhs_dementia_and_dod = (has_dod & has_dementia).sum()

print(f"Number participants with dementia and dod: {n_fhs_dementia_and_dod}")

# Dementia cases whose diagnosis date is strictly before the DOD.
# Rows without a DOD compare as False and are therefore excluded.
dod_dementia_comparison = fhs_final_dementia[
    has_dementia &
    (fhs_final_dementia["DEM_SURVDATE"] < fhs_final_dementia["DATEDTH"])
]

print(f"Number with accurate data: {len(dod_dementia_comparison)}")

# The two counts must agree; a mismatch means at least one diagnosis is dated on or after the DOD.
assert len(dod_dementia_comparison) == n_fhs_dementia_and_dod, (
    "Some FHS participants have a dementia diagnosis dated on or after their date of death"
)

# In the recorded run, all 56 participants with dementia and a DOD were diagnosed
# before the DOD; the comparison is strict, so none were on the same day.

Number participants with dementia and dod: 56
Number with accurate data: 56


In [10]:
# Calculate DOD and dementia diagnosis date relative to the PSG date
# Note: DEM_SURVDATE is either the date of dementia diagnosis OR the date of last follow up for participants without dementia
fhs_final_dementia["fhs_dementia_from_psg"] = fhs_final_dementia["DEM_SURVDATE"] - fhs_final_dementia["days_studyv1"]
fhs_final_dementia["fhs_dod_from_psg"] = fhs_final_dementia["DATEDTH"] - fhs_final_dementia["days_studyv1"]

# Computed outside the f-string: nesting double quotes inside a double-quoted
# f-string is a SyntaxError on Python versions before 3.12.
n_fhs_dementia = (fhs_final_dementia["DEM_STATUS"] == 1).sum()

print(f"Number FHS participants with data: {len(fhs_final_dementia)}")
print(f"Number with dementia diagnosis: {n_fhs_dementia}")

Number FHS participants with data: 615
Number with dementia diagnosis: 62


### ARIC Data

In [11]:
# How many ARIC participants in the SpO2 dataset have no ARIC follow-up record?
aric_ids_as_text = aric_outcomes["ID_C"].astype(str)
aric_outcomes["ID_C"] = aric_ids_as_text.str.strip()

# SpO2 rows that belong to the ARIC parent study
spo2_aric_filtered = spo2_data.loc[spo2_data["Parent Study"].eq("ARIC")]

# ARIC PIDs present in the SpO2 dataset but absent from the ARIC outcomes
spo2_aric = set(spo2_aric_filtered["PID"]).difference(aric_outcomes["ID_C"])

# Report
print(f"Number of ARIC PIDs in spo2 but not in aric_outcomes: {len(spo2_aric)}")

Number of ARIC PIDs in spo2 but not in aric_outcomes: 90


In [12]:
# Extract follow-up data for ARIC

# Match ARIC outcomes against ARIC PIDs only, so identifiers from other parent
# studies cannot match an ARIC record by coincidence.
aric_pids = set(spo2_data.loc[spo2_data["Parent Study"] == "ARIC", "PID"])
aric_filtered = aric_outcomes[aric_outcomes["ID_C"].isin(aric_pids)]

# Keep the outcome columns and rename the key to "PID" so it lines up with days_to_psg.
# rename() returns a new frame, so no in-place mutation is needed.
aric_dementia = (
    aric_filtered[["ID_C", "DEMDXL3_81", "COXDATE_DEMDXL3_81_FOLLOWUPDAYS", "DATEOFDEATH_FOLLOWUPDAYS"]]
    .rename(columns={"ID_C": "PID"})
)

# Count of participants with dementia (1) and without (0)
print(aric_dementia["DEMDXL3_81"].value_counts(dropna=False))

DEMDXL3_81
0    1400
1     424
Name: count, dtype: int64


In [13]:
# Merge days from parent-study Visit 1 to PSG (days_studyv1)
aric_final_dementia = pd.merge(aric_dementia, days_to_psg, on="PID", how="left")

# A left merge adds rows when a PID occurs more than once in days_to_psg, which
# would double-count participants in every later step. Fail loudly instead.
assert len(aric_final_dementia) == len(aric_dementia), (
    "Merging days_to_psg changed the number of ARIC rows; check days_to_psg for duplicate PIDs"
)

In [14]:
# Ensure that dod is on/after dementia diagnosis date

# Masks and counts are built first so the f-string contains no nested double
# quotes (a SyntaxError on Python versions before 3.12).
has_dementia_aric = aric_final_dementia["DEMDXL3_81"] == 1
has_dod_aric = aric_final_dementia["DATEOFDEATH_FOLLOWUPDAYS"].notna()
n_aric_dementia_and_dod = (has_dod_aric & has_dementia_aric).sum()

print(f"Number participants with dementia and dod: {n_aric_dementia_and_dod}")

# Dementia cases whose diagnosis date is on or before the DOD.
# Rows without a DOD compare as False and are therefore excluded.
dod_dementia_comparison = aric_final_dementia[
    has_dementia_aric &
    (aric_final_dementia["COXDATE_DEMDXL3_81_FOLLOWUPDAYS"] <= aric_final_dementia["DATEOFDEATH_FOLLOWUPDAYS"])
]

print("Number with accurate data:", len(dod_dementia_comparison))

# The <= check above cannot tell "before" from "same day", so count same-day
# events explicitly instead of asserting it in a comment.
n_same_day = (
    dod_dementia_comparison["COXDATE_DEMDXL3_81_FOLLOWUPDAYS"]
    == dod_dementia_comparison["DATEOFDEATH_FOLLOWUPDAYS"]
).sum()
print("Number with dementia diagnosis and death on the same day:", n_same_day)

# The first two counts must agree; a mismatch means at least one diagnosis is dated after the DOD.
assert len(dod_dementia_comparison) == n_aric_dementia_and_dod, (
    "Some ARIC participants have a dementia diagnosis dated after their date of death"
)

# In the recorded run, all 318 participants with dementia and a DOD had the DOD on or after the diagnosis date.

Number participants with dementia and dod: 318
Number with accurate data: 318


In [15]:
# Calculate DOD/dementia diagnosis date relative to the PSG date
# Note: COXDATE_DEMDXL3_81_FOLLOWUPDAYS is either the date of dementia diagnosis OR the date of last follow up for participants without dementia
aric_final_dementia["aric_dementia_from_psg"] = aric_final_dementia["COXDATE_DEMDXL3_81_FOLLOWUPDAYS"] - aric_final_dementia["days_studyv1"]
aric_final_dementia["aric_dod_from_psg"] = aric_final_dementia["DATEOFDEATH_FOLLOWUPDAYS"] - aric_final_dementia["days_studyv1"]

# Computed outside the f-string: nesting double quotes inside a double-quoted
# f-string is a SyntaxError on Python versions before 3.12.
n_aric_dementia = (aric_final_dementia["DEMDXL3_81"] == 1).sum()

print(f"Number ARIC participants with data: {len(aric_final_dementia)}")
print(f"Number with dementia diagnosis: {n_aric_dementia}")

Number ARIC participants with data: 1824
Number with dementia diagnosis: 424


### Merge datasets to mastersheet

In [16]:
# Export the FHS follow-up table to the "DementiaFHS" sheet of the mastersheet.
# Off by default so that Restart & Run All never modifies mastersheet.xlsx;
# set the flag to True to perform the write.
WRITE_FHS_SHEET = False

if WRITE_FHS_SHEET:
    # "replace" rewrites the sheet from scratch, so rows/columns left over from
    # an earlier, larger export cannot linger underneath the new data (which
    # "overlay" allows). NOTE(review): this also drops any other content on
    # the sheet - confirm nothing else is stored there.
    with pd.ExcelWriter("mastersheet.xlsx", mode="a", engine="openpyxl", if_sheet_exists="replace") as writer:
        fhs_final_dementia.to_excel(writer, sheet_name="DementiaFHS", index=False)

In [17]:
# Export the ARIC follow-up table to the "DementiaARIC" sheet of the mastersheet.
# Off by default so that Restart & Run All never modifies mastersheet.xlsx;
# set the flag to True to perform the write.
WRITE_ARIC_SHEET = False

if WRITE_ARIC_SHEET:
    # "replace" rewrites the sheet from scratch, so rows/columns left over from
    # an earlier, larger export cannot linger underneath the new data (which
    # "overlay" allows). NOTE(review): this also drops any other content on
    # the sheet - confirm nothing else is stored there.
    with pd.ExcelWriter("mastersheet.xlsx", mode="a", engine="openpyxl", if_sheet_exists="replace") as writer:
        aric_final_dementia.to_excel(writer, sheet_name="DementiaARIC", index=False)