# Preparing Mediation Dataset 
This notebook compiles all relevant data, applies the inclusion/exclusion criteria (i.e., removes participants with missing data or exclusionary medications), and exports the final dataset to a CSV for mediation analysis. The dataset is also split into separate files, one per stratum, for the stratified mediation analyses.

Finally, descriptive statistics for covariates and exposure/mediator/outcome variables are generated. 

In [1]:
# Load packages
import numpy as np
import pandas as pd
import openpyxl
from openpyxl import load_workbook
from functools import reduce

In [2]:
# Read the SpO2 and EEG sheets from the master workbook
filepath = "mastersheet.xlsx"
spo2_df, eeg_df = (
    pd.read_excel(filepath, sheet_name=sheet_name)
    for sheet_name in ("SPO2Final", "EEGFinal")
)

In [3]:
# Read the dementia follow-up sheets (one per cohort: FHS and ARIC)
dementia_fhs_df, dementia_aric_df = (
    pd.read_excel(filepath, sheet_name=sheet_name)
    for sheet_name in ("DementiaFHS", "DementiaARIC")
)

In [4]:
# Read the covariate, medication, and physical-activity sheets
covariates, medications, aric_phys, fhs_phys = (
    pd.read_excel(filepath, sheet_name=sheet_name)
    for sheet_name in ("covariates", "medications", "Aric_phys", "FHS_phys")
)

In [5]:
# Find the PIDs that appear in the SpO2 data, the EEG data, AND at least one
# of the two dementia follow-up sheets.

# Normalise PIDs in BOTH dementia sheets (string, surrounding whitespace
# removed) before building the set, so the union is built from the same
# representation that the isin() filtering on these frames compares against.
dementia_fhs_df["PID"] = dementia_fhs_df["PID"].astype(str).str.strip()
dementia_aric_df["PID"] = dementia_aric_df["PID"].astype(str).str.strip()

pids_spo2 = set(spo2_df['PID'])
pids_eeg = set(eeg_df['PID'])
pids_dementia = set(dementia_fhs_df['PID']).union(set(dementia_aric_df['PID']))

# Participants present in all three sources
pids_final = pids_spo2 & pids_eeg & pids_dementia
print(len(pids_final))

2439


In [6]:
# Normalise PID formatting in both cohorts (string, no surrounding whitespace)
dementia_fhs_df["PID"] = dementia_fhs_df["PID"].astype(str).str.strip()
dementia_aric_df["PID"] = dementia_aric_df["PID"].astype(str).str.strip()

# Keep only the rows whose PID belongs to the analysis set
fhs_filtered = dementia_fhs_df.loc[dementia_fhs_df["PID"].isin(pids_final)]
aric_filtered = dementia_aric_df.loc[dementia_aric_df["PID"].isin(pids_final)]

# Stack the two cohorts into a single follow-up table with a fresh index
dementia_final = pd.concat((fhs_filtered, aric_filtered), ignore_index=True)

print(f"dementia_final shape: {len(dementia_final)}")

dementia_final shape: 2439


### Apply exclusions (exclusionary meds/missing data/etc) 

In [7]:
# Show the column names available in the covariates table
print(covariates.columns)

Index(['PID', 'nsrr_age', 'nsrr_sex', 'nsrr_race', 'nsrr_bmi', 'educat',
       'alcoh', 'evsmok15', 'smknow15', 'cigday15', 'avesmk15'],
      dtype='object')


In [8]:
# Tier 1 covariates: every column from nsrr_age through nsrr_bmi (inclusive)
stage1_cols = covariates.loc[:, "nsrr_age":"nsrr_bmi"].columns

# A row is valid when none of its Tier 1 values is missing
valid_data = ~covariates[stage1_cols].isna().any(axis=1)
valid_pid_count = covariates["PID"][valid_data].nunique()

print(f"PIDs with all stage 1 covariates: {valid_pid_count}")
print("stage 1 cov = age, race, bmi, sex")

PIDs with all stage 1 covariates: 2438
stage 1 cov = age, race, bmi, sex


In [9]:
# Tier 2 covariates: education, alcohol use, smoking history
stage2_cols = ["educat", "alcoh", "evsmok15"]

# A row is valid when none of its Tier 2 values is missing
valid_data = ~covariates[stage2_cols].isna().any(axis=1)
valid_pid_count = covariates["PID"][valid_data].nunique()

print(f"PIDs with all stage 2 covariates: {valid_pid_count}")
print("stage 2 cov = education, alc use, smoking history")

PIDs with all stage 2 covariates: 2399
stage 2 cov = education, alc use, smoking history


In [10]:
# Tier 1 and Tier 2 covariates together
stage1_and2 = [*stage1_cols, *stage2_cols]

# A row is valid when every Tier 1 and Tier 2 value is present
valid_data = ~covariates[stage1_and2].isna().any(axis=1)
valid_pid_count = covariates["PID"][valid_data].nunique()

print(f"PIDs with all stage 1 and 2 covariates: {valid_pid_count}")

PIDs with all stage 1 and 2 covariates: 2398


In [11]:
# Tier 3 -- exclusionary medications: oral hypoglycemics, insulins, oral
# steroids, sympathomimetics, asthma steroids, benzodiazepines
exclusionary_meds = medications.loc[
    :, ["PID", *medications.loc[:, "ohga1":"benzod1"].columns]
]
print("exclusionary med columns:", exclusionary_meds.columns)

# Flag rows where at least one exclusionary medication column equals 1
mask = (exclusionary_meds.loc[:, "ohga1":"benzod1"] == 1).any(axis=1)
excluded_df = exclusionary_meds.loc[mask]

# PIDs of the flagged rows (one entry per flagged row)
excluded_pids = excluded_df["PID"]
print(f"number taking exclusionary meds: {len(excluded_pids)}")

exclusionary med columns: Index(['PID', 'ohga1', 'insuln1', 'ostrd1', 'sympth1', 'istrd1', 'benzod1'], dtype='object')
number taking exclusionary meds: 336


In [12]:
# Tier 3 -- how many participants have complete medication data?

# Every medication column from tca1 through thry1 must be non-missing
med_cols = medications.loc[:, "tca1":"thry1"].columns
valid_data = ~medications[med_cols].isna().any(axis=1)
valid_pid_count = medications["PID"][valid_data].nunique()
print(f"PIDs with all medication data: {valid_pid_count}")

# Keep PIDs with complete medication data that are NOT on an exclusionary med
on_exclusionary_med = medications["PID"].isin(excluded_pids)
valid_med_pids = medications.loc[valid_data & ~on_exclusionary_med, "PID"].unique()
print(f"Number of included participants after medicine exclusions: {len(valid_med_pids)}")

PIDs with all medication data: 2431
Number of included participants after medicine exclusions: 2095


In [13]:
# Apply the exclusions counted in the cells above.

# Step 1: PIDs whose Tier 1 + Tier 2 covariates are all present
valid_covariate_data = ~covariates[stage1_and2].isna().any(axis=1)
valid_covariate_pids = covariates["PID"][valid_covariate_data]

# Step 2: keep only those that also passed the medication step
final_valid_pids = set(valid_med_pids).intersection(valid_covariate_pids)

# Step 3: report how many participants remain
print(f"Final participant count with complete Stage 1+2 covariates and valid medication data : {len(final_valid_pids)}")

Final participant count with complete Stage 1+2 covariates and valid medication data : 2067


In [14]:
# Tier 1-3 PIDs are settled; next, prepare the physical-activity data.

# ARIC: keep only the PID and RPAC8 (the binary exercise variable), and
# normalise the PID to a stripped string.
aric_phys = aric_phys[["PID", "RPAC8"]].copy()
aric_phys["PID"] = aric_phys["PID"].astype(str).str.strip()

In [15]:
# FHS records physical activity per sport/activity type. Build one score per
# participant by summing the relevant variables (columns ending in "_2").
activity_cols = fhs_phys.filter(regex='_2$', axis=1)

# min_count=1 makes the row sum NaN when every "_2" value is missing.
# Without it pandas returns 0 for an all-NaN row, so a participant with no
# activity data at all would look like a valid zero score and a NaN check on
# total_activity_score could never detect missing data.
# NOTE(review): assumes an all-missing row means "no data", not "no activity"
# -- confirm against the FHS codebook.
fhs_phys['total_activity_score'] = activity_cols.sum(axis=1, min_count=1)

# Keep only the identifier and the derived score
relevant_fhs = fhs_phys[["PID", "total_activity_score"]].copy()


In [16]:
# Check whether any "_2" column disagrees with its "_4" partner on missingness
cols_2 = [name for name in fhs_phys.columns if name.endswith("_2")]

# Collected as (col_2, col_4, number_of_mismatching_rows)
mismatched = []

for col_2 in cols_2:
    col_4 = f"{col_2[:-2]}_4"
    if col_4 not in fhs_phys.columns:
        continue  # no partner column to compare against
    # True where exactly one of the two values is null
    mismatch_mask = fhs_phys[col_2].isna() ^ fhs_phys[col_4].isna()
    n_mismatch = mismatch_mask.sum()
    if n_mismatch:
        mismatched.append((col_2, col_4, n_mismatch))

# Show results
print("Number of column pairs with mismatches:", len(mismatched))

Number of column pairs with mismatches: 11


In [17]:
# Exclude participants whose "_2"/"_4" columns disagree on missingness

# Start from "no mismatch" for every row, then OR in each mismatched pair
mismatch_rows = pd.Series(False, index=fhs_phys.index)
for col_2, col_4, _ in mismatched:
    mismatch_rows |= fhs_phys[col_2].isna() != fhs_phys[col_4].isna()

# Build the cleaned frame once, keeping only rows without any mismatch
fhs_phys_cleaned = fhs_phys.loc[~mismatch_rows].copy()

print(f"Excluded {mismatch_rows.sum()} participants due to data mismatches")
print(f"Remaining participants: {len(fhs_phys_cleaned)}")


Excluded 29 participants due to data mismatches
Remaining participants: 575


In [18]:
# Count missing physical-activity values for ARIC participants
aric_phys["PID"] = aric_phys["PID"].astype(str).str.strip()
aric_nan_count = aric_phys["RPAC8"].isna().sum()
print(f"ARIC participants with any NaN in physical activity data: {aric_nan_count}")

# Count missing physical-activity scores for FHS participants
fhs_phys_cleaned["PID"] = fhs_phys_cleaned["PID"].astype(str).str.strip()
fhs_nan_count = fhs_phys_cleaned["total_activity_score"].isna().sum()
print(f"Number of NaNs in total-activity_score: {fhs_nan_count}")


ARIC participants with any NaN in physical activity data: 0
Number of NaNs in total-activity_score: 0


In [19]:
# Collect the PIDs that have physical-activity data in each cohort
pids_fhs_phys = set(fhs_phys_cleaned["PID"])
print(f"FHS valid pids: {len(pids_fhs_phys)}")

pids_aric_phys = set(aric_phys["PID"])
print(f"ARIC valid pids: {len(pids_aric_phys)}")

# Pool the two cohorts
pids_phys_valid = pids_fhs_phys | pids_aric_phys
print("ARIC and FHS pids together:", len(pids_phys_valid))

# Keep only PIDs that also have valid Tier 1-Tier 3 data
pids_with_all_data = final_valid_pids.intersection(pids_phys_valid)

# Report the result
print(f"Total participants with complete covariates, meds, and physical activity data: {len(pids_with_all_data)}")


FHS valid pids: 575
ARIC valid pids: 1813
ARIC and FHS pids together: 2388
Total participants with complete covariates, meds, and physical activity data: 2028


### Creating CSV File for Mediation

Now that we have 2028 participants with every required data point, we can assemble the CSV file that will be loaded into R for the mediation analysis.

Columns needed: 
* PID
* Data for 2 SpO2 exposures
* Data for 3 EEG mediators 
* The number of days to dementia diagnosis (from PSG) OR the number of follow up days
* The dementia event (as 0 or 1)
* Tier 1 covariates: bmi, race, age, sex
* Tier 2 covariates: education, alc, smoking
* Tier 3 covariates: binary indicators for the two medication groups
* Physical activity data (RPAC8 from ARIC and the created total_activity_score for FHS) 

In [20]:
# Build one dataframe per data source, to be merged into one big dataframe
# before the final checks and exclusions.

# Exposures (SpO2): keep the two exposure columns for the included PIDs
spo2_df["PID"] = spo2_df["PID"].astype(str)
spo2_included = spo2_df["PID"].isin(pids_with_all_data)
filtered_spo2 = spo2_df.loc[spo2_included, ["PID", "hb_desat", "avg_spo2_no_desat_NREM"]]
print(len(filtered_spo2))
print(filtered_spo2.columns)

2028
Index(['PID', 'hb_desat', 'avg_spo2_no_desat_NREM'], dtype='object')


In [21]:
# EEG variables (the mediators): keep the three EEG columns for the included PIDs
eeg_df["PID"] = eeg_df["PID"].astype(str)
eeg_included = eeg_df["PID"].isin(pids_with_all_data)
filtered_eeg = eeg_df.loc[eeg_included, ["PID", "SP_DENS_all_C", "SP_CHIRP_all_C", "SP_R_PHASE_IF_all_C"]]
print(len(filtered_eeg))
print(filtered_eeg.columns)

2028
Index(['PID', 'SP_DENS_all_C', 'SP_CHIRP_all_C', 'SP_R_PHASE_IF_all_C'], dtype='object')


In [22]:
# FHS dementia follow-up: event status plus day offsets from the PSG
dementia_fhs_df["PID"] = dementia_fhs_df["PID"].astype(str)
fhs_included = dementia_fhs_df["PID"].isin(pids_with_all_data)
dementia_fhs = dementia_fhs_df.loc[fhs_included, ["PID", "DEM_STATUS","fhs_dementia_from_psg","fhs_dod_from_psg"]]

print(len(dementia_fhs))
print(dementia_fhs.columns)

489
Index(['PID', 'DEM_STATUS', 'fhs_dementia_from_psg', 'fhs_dod_from_psg'], dtype='object')


In [23]:
# ARIC dementia follow-up: event status plus day offsets from the PSG
dementia_aric_df["PID"] = dementia_aric_df["PID"].astype(str)
aric_included = dementia_aric_df["PID"].isin(pids_with_all_data)
dementia_aric = dementia_aric_df.loc[aric_included, ["PID", "DEMDXL3_81","aric_dementia_from_psg","aric_dod_from_psg"]]

print(len(dementia_aric))
print(dementia_aric.columns)

1539
Index(['PID', 'DEMDXL3_81', 'aric_dementia_from_psg', 'aric_dod_from_psg'], dtype='object')


In [24]:
# Sanity check: the ARIC and FHS follow-up rows together should account for
# every included participant. Computed from the data rather than typed in,
# so the check stays correct if the upstream filtering changes.
len(dementia_aric) + len(dementia_fhs)

2028

In [25]:
# Give both cohorts the same column names, then stack them into one table
fhs_renames = {
    "fhs_dementia_from_psg": "days_psg_to_dementia",
    "fhs_dod_from_psg": "days_psg_to_death",
    "DEM_STATUS": "dementia_event",
}
aric_renames = {
    "aric_dementia_from_psg": "days_psg_to_dementia",
    "aric_dod_from_psg": "days_psg_to_death",
    "DEMDXL3_81": "dementia_event",
}
fhs = dementia_fhs.rename(columns=fhs_renames)
aric = dementia_aric.rename(columns=aric_renames)

dementia = pd.concat((fhs, aric), ignore_index=True)
print("Number participants:", len(dementia))
print("Columns:", dementia.columns)
print("PIDs with follow up data:", dementia["days_psg_to_dementia"].notna().sum())
print("PIDs with DoD:", dementia["days_psg_to_death"].notna().sum())

Number participants: 2028
Columns: Index(['PID', 'dementia_event', 'days_psg_to_dementia', 'days_psg_to_death'], dtype='object')
PIDs with follow up data: 2028
PIDs with DoD: 882


In [26]:
# Tier 1 covariates (age, sex, race, BMI) for the included participants
covariates["PID"] = covariates["PID"].astype(str)
# Take an explicit copy: new columns are assigned to filtered_cov afterwards,
# and assigning into a selection of `covariates` is a chained-assignment
# hazard (SettingWithCopyWarning).
filtered_cov = covariates.loc[covariates["PID"].isin(pids_with_all_data), ["PID", "nsrr_age", "nsrr_sex", "nsrr_race", "nsrr_bmi"]].copy()
print(len(filtered_cov))

2028


In [27]:
# Tier 2 covariates are recoded into binary indicators, starting with smoking.

# smknow15 == 1 -> 1; any other value (including missing) -> 0.
# The assignment aligns on the row index shared with `covariates`.
is_current_smoker = covariates["smknow15"] == 1
filtered_cov["smoking"] = is_current_smoker.astype(int)

print(filtered_cov["smoking"].value_counts(dropna=False).sort_index())
print(filtered_cov["smoking"].sum())


smoking
0    1816
1     212
Name: count, dtype: int64
212


In [28]:
# Binary alcohol indicator

# alcoh > 0 -> 1; otherwise (0 or missing) -> 0.
# The assignment aligns on the row index shared with `covariates`.
drinks_alcohol = covariates["alcoh"] > 0
filtered_cov["alcohol"] = drinks_alcohol.astype(int)

print(filtered_cov["alcohol"].value_counts(dropna=False).sort_index())
print(filtered_cov["alcohol"].sum())


alcohol
0     980
1    1048
Name: count, dtype: int64
1048


In [29]:
# Binary education indicator

# educat of 3 or 4 -> 1; any other value -> 0.
# The assignment aligns on the row index shared with `covariates`.
higher_education = covariates["educat"].isin([3, 4])
filtered_cov["education"] = higher_education.astype(int)

print(filtered_cov["education"].value_counts(dropna=False).sort_index())
print(filtered_cov["education"].sum())

education
0    1173
1     855
Name: count, dtype: int64
855


In [30]:
# Recode sex and race as binary indicators.
# Labels are lower-cased and stripped before mapping; a label that is not in
# the mapping becomes NaN.

# Sex: male -> 0, female -> 1
sex_codes = {"male": 0, "female": 1}
sex_labels = filtered_cov["nsrr_sex"].astype(str).str.strip().str.lower()
filtered_cov["sex"] = sex_labels.map(sex_codes)
print(filtered_cov["sex"].value_counts(dropna=False))
filtered_cov = filtered_cov.drop(columns=["nsrr_sex"])

# Race: white -> 0, black or african american -> 1
race_codes = {"white": 0, "black or african american": 1}
race_labels = filtered_cov["nsrr_race"].astype(str).str.strip().str.lower()
filtered_cov["race"] = race_labels.map(race_codes)
print(filtered_cov["race"].value_counts(dropna=False))
filtered_cov = filtered_cov.drop(columns=["nsrr_race"])

# Shorter names for the age and BMI columns
filtered_cov = filtered_cov.rename(columns={"nsrr_age": "age", "nsrr_bmi": "bmi"})

sex
0    1014
1    1014
Name: count, dtype: int64
race
0    2027
1       1
Name: count, dtype: int64


In [31]:
# To avoid collinearity, drop the participants coded race == 1 (Black).
# Select every matching PID instead of only the first one, so the cell
# neither leaves extra matches behind nor raises IndexError when none exist.
pids_to_drop = filtered_cov.loc[filtered_cov["race"] == 1, "PID"]
filtered_cov = filtered_cov[~filtered_cov["PID"].isin(pids_to_drop)].copy()
print(len(filtered_cov))
print(filtered_cov["race"].value_counts())

2027
race
0    2027
Name: count, dtype: int64


In [32]:
# Put the covariate columns in a fixed, readable order
new_order = ["PID", "age", "sex", "race", "bmi", "education", "smoking", "alcohol"]
filtered_cov = filtered_cov.loc[:, new_order]
print(filtered_cov.columns)

Index(['PID', 'age', 'sex', 'race', 'bmi', 'education', 'smoking', 'alcohol'], dtype='object')


In [33]:
# Now moving to Tier 3 (medications): list the medication columns to see
# which ones are available for grouping
medications.columns

Index(['PID', 'tca1', 'ntca1', 'slpill15', 'htnmed1', 'loop1', 'hctz1',
       'hctzk1', 'ccbir1', 'ccbsr1', 'alpha1', 'alphad1', 'beta1', 'betad1',
       'ccb1', 'ace1', 'aced1', 'vaso1', 'vasod1', 'pvdl1', 'diuret1',
       'lipid1', 'warf1', 'pdei1', 'dig1', 'anar1a1', 'anar1b1', 'anar1c1',
       'anar31', 'asa1', 'ntg1', 'ohga1', 'insuln1', 'ostrd1', 'sympth1',
       'istrd1', 'benzod1', 'estrgn1', 'progst1', 'premar1', 'nsaid1', 'basq1',
       'niac1', 'thry1'],
      dtype='object')

In [34]:
# Restrict the medication table to the included participants
medications = medications.loc[medications["PID"].isin(pids_with_all_data)].copy()

# Column ranges that define each medication group
group1_meds = medications.loc[:, "tca1":"slpill15"].columns
group2_meds = medications.loc[:, "htnmed1":"ntg1"].columns
exclude_meds = medications.loc[:, "ohga1":"benzod1"].columns

# A participant is flagged 1 for a group when any column in it equals 1
medications["group1_med"] = (medications[group1_meds] == 1).any(axis=1).astype(int)
medications["group2_med"] = (medications[group2_meds] == 1).any(axis=1).astype(int)

# Optional check: joint distribution of the two flags
print(medications[["group1_med", "group2_med"]].value_counts(dropna=False))

# Final dataframe: PID plus the two group indicators
med_relevant_pids = medications.loc[:, ["PID", "group1_med", "group2_med"]].copy()


group1_med  group2_med
0           1             1058
            0              800
1           1              107
            0               63
Name: count, dtype: int64


In [35]:
# Pull the FHS and ARIC exercise data for the included participants
# (used for the sensitivity analysis)
aric_included = aric_phys["PID"].isin(pids_with_all_data)
filtered_aric_phys = aric_phys.loc[aric_included].copy()

fhs_included = fhs_phys_cleaned["PID"].isin(pids_with_all_data)
filtered_fhs_phys = fhs_phys_cleaned.loc[fhs_included].copy()
fhs_phys_for_dataset = filtered_fhs_phys.loc[:, ["PID", "total_activity_score"]].copy()

In [36]:
# Combine both cohorts into a single "exercise" column.
# NOTE(review): the FHS values are a summed activity score while the ARIC
# values come from RPAC8 (described earlier as binary), so the column mixes
# two scales -- confirm this is handled in the downstream analysis.
fhs_phys_for_dataset = fhs_phys_for_dataset.rename(
    columns={"total_activity_score": "exercise"}
)
filtered_aric_phys = filtered_aric_phys.rename(
    columns={"RPAC8": "exercise"}
)

exercise = pd.concat((fhs_phys_for_dataset, filtered_aric_phys), ignore_index=True)


### Merge all dfs together 

In [37]:
# Merge every per-source dataframe into one table keyed on PID:
# filtered_spo2, filtered_eeg, dementia, filtered_cov, med_relevant_pids, exercise
dfs_to_merge = [filtered_spo2, filtered_eeg, dementia, filtered_cov, med_relevant_pids, exercise]

# Inner-merge them one by one. validate="one_to_one" makes pandas raise a
# MergeError if a PID is duplicated on either side, instead of silently
# multiplying rows for that participant.
merged_df = reduce(
    lambda left, right: pd.merge(left, right, on="PID", how="inner", validate="one_to_one"),
    dfs_to_merge,
)
print("Columns:", merged_df.columns)
print("PIDs:", len(merged_df))

Columns: Index(['PID', 'hb_desat', 'avg_spo2_no_desat_NREM', 'SP_DENS_all_C',
       'SP_CHIRP_all_C', 'SP_R_PHASE_IF_all_C', 'dementia_event',
       'days_psg_to_dementia', 'days_psg_to_death', 'age', 'sex', 'race',
       'bmi', 'education', 'smoking', 'alcohol', 'group1_med', 'group2_med',
       'exercise'],
      dtype='object')
PIDs: 2027


In [38]:
# Final checks.
# First: is any value missing? Every column is checked EXCEPT the date of
# death, since not every participant has a recorded DoD.
cols_to_check = merged_df.columns.difference(["days_psg_to_death"])
missing_flags = merged_df[cols_to_check].isna()
rows_with_missing = merged_df[missing_flags.any(axis=1)]

print("Number PIDs missing data:", len(rows_with_missing))
# List, per affected row, which of the checked columns are missing
for idx, row_flags in missing_flags.loc[rows_with_missing.index].iterrows():
    missing_cols = row_flags[row_flags].index.tolist()
    print(f"Missing columns: {missing_cols}")

# Need to drop 2 PIDs with missing data

Number PIDs missing data: 2
Missing columns: ['SP_R_PHASE_IF_all_C']
Missing columns: ['SP_R_PHASE_IF_all_C']


In [39]:
# PIDs of the rows flagged as having missing values
bad_pids = rows_with_missing["PID"]

# Keep every row whose PID is not flagged
has_missing = merged_df["PID"].isin(bad_pids)
merged_df_cleaned = merged_df.loc[~has_missing].copy()

print(f"Final cleaned merged_df: {len(merged_df_cleaned)} participants")

Final cleaned merged_df: 2025 participants


In [40]:
# Exclude participants whose dementia day offset is negative, i.e. the
# recorded dementia date precedes the PSG.
# NOTE(review): an offset of exactly 0 is kept -- confirm whether a diagnosis
# on the day of the PSG should also be excluded.
diagnosed_before_psg = merged_df_cleaned["days_psg_to_dementia"] < 0
invalid_pids = merged_df_cleaned[diagnosed_before_psg]
print("number invalid pids:", len(invalid_pids))

# PIDs to remove
bad_pids = invalid_pids["PID"].tolist()

# Drop them from the cleaned merged dataframe and renumber the rows
keep_rows = ~merged_df_cleaned["PID"].isin(bad_pids)
final_cleaned_df = merged_df_cleaned.loc[keep_rows].reset_index(drop=True)

# Final number of participants
print("Final cleaned participants:", len(final_cleaned_df))

number invalid pids: 2
Final cleaned participants: 2023


In [41]:
# Export final df to csv 
#final_cleaned_df.to_csv("NAME FILE.csv", index=False)

### Exporting medication counts for reference 

In [42]:
# Tabulate, for the final participants, how often each value occurs in every
# medication column (missing values included)
final_pids = set(final_cleaned_df["PID"])
meds_check = medications[medications["PID"].isin(final_pids)]

results = [
    {"Medication": col, "Value": value, "Count": count}
    for col in meds_check.columns
    if col != "PID"
    for value, count in meds_check[col].value_counts(dropna=False).sort_index().items()
]

# Convert to DataFrame and export to CSV
results_df = pd.DataFrame(results)
#results_df.to_csv("medication_value_counts.csv", index=False)


### Prepping datasets for stratifications

In [43]:
# Load the previously exported mediation dataset that the stratified files
# are cut from.
# NOTE(review): this reads a dated CSV from disk rather than final_cleaned_df,
# and the descriptive-statistics section loads a differently named file
# ("final_mediation_data_7.18.csv") -- confirm both refer to the intended export.
total_df = pd.read_csv("med_model_dataset_7.8.2025.csv")

In [44]:
# Stratify by sex (0 is male, 1 is female)
df_male = total_df.loc[total_df["sex"] == 0]
df_female = total_df.loc[total_df["sex"] == 1]

print("Number males:", len(df_male))
print("Number females:", len(df_female))

# Save to CSV
#df_female.to_csv("mediation_dataset_female.csv", index=False)
#df_male.to_csv("mediation_dataset_male.csv", index=False)

# Number of dementia events in each stratum
female_events = df_female["dementia_event"].sum()
male_events = df_male["dementia_event"].sum()
print("Female dementia count:", female_events)
print("Male dementia count:", male_events)


Number males: 1010
Number females: 1013
Female dementia count: 208
Male dementia count: 186


In [45]:
# Stratify by age at a threshold of 65 (age 65 itself falls in the upper group)
df_below_65 = total_df.loc[total_df["age"] < 65]
df_above_65 = total_df.loc[total_df["age"] >= 65]
print("Number below 65:", len(df_below_65))
print("Number above 65:", len(df_above_65))

# Save to CSV
#df_below_65.to_csv("mediation_dataset_below65.csv", index=False)
#df_above_65.to_csv("mediation_dataset_above65.csv", index=False)

# Number of dementia events in each stratum
above_events = df_above_65["dementia_event"].sum()
below_events = df_below_65["dementia_event"].sum()
print("Above 65 dementia count:", above_events)
print("Below 65 dementia count:", below_events)

Number below 65: 1310
Number above 65: 713
Above 65 dementia count: 250
Below 65 dementia count: 144


In [46]:
# Stratify by BMI category. Participants with BMI below 18.5 fall into none
# of the three strata.
bmi = total_df["bmi"]
normal_bmi = total_df[(18.5 <= bmi) & (bmi < 25)]
overweight_bmi = total_df[(25 <= bmi) & (bmi < 30)]
obese_bmi = total_df[30 <= bmi]

# Number of participants per stratum
print(f"number with normal BMI: {len(normal_bmi)}")
print(f"number with overweight BMI: {len(overweight_bmi)}")
print(f"number with obese BMI: {len(obese_bmi)}")

# Dementia event counts per stratum
print("Normal BMI dementia cases:", normal_bmi["dementia_event"].sum())
print("Overweight BMI dementia cases:", overweight_bmi["dementia_event"].sum())
print("Obese BMI dementia cases:", obese_bmi["dementia_event"].sum())

# Save to CSV
#normal_bmi.to_csv("mediation_dataset_normal.csv", index=False)
#overweight_bmi.to_csv("mediation_dataset_overweight.csv", index=False)
#obese_bmi.to_csv("mediation_dataset_obese.csv", index=False)

number with normal BMI: 464
number with overweight BMI: 867
number with obese BMI: 688
Normal BMI dementia cases: 94
Overweight BMI dementia cases: 172
Obese BMI dementia cases: 126


In [47]:
# Stratify by age (threshold 65) and sex (0 = male, 1 = female)
is_female = total_df["sex"] == 1
is_male = total_df["sex"] == 0
is_below_65 = total_df["age"] < 65
is_65_or_older = total_df["age"] >= 65

females_below_65 = total_df[is_below_65 & is_female]
females_above_65 = total_df[is_65_or_older & is_female]
print("Females below 65:", females_below_65["dementia_event"].sum())
print("Females above 65:", females_above_65["dementia_event"].sum())

males_below_65 = total_df[is_below_65 & is_male]
males_above_65 = total_df[is_65_or_older & is_male]
print("Males below 65:", males_below_65["dementia_event"].sum())
print("Males above 65:", males_above_65["dementia_event"].sum())

# Save to CSV (only the 65-and-older strata are written out)
females_above_65.to_csv("mediation_dataset_females_above65.csv", index=False)
males_above_65.to_csv("mediation_dataset_males_above65.csv", index=False)


# Number of participants in the exported strata
print(f"Number females above 65: {len(females_above_65)}")
print(f"Number males above 65: {len(males_above_65)}")

Females below 65: 79
Females above 65: 129
Males below 65: 65
Males above 65: 121
Number females above 65: 321
Number males above 65: 392


### Generating descriptive stats from final mediation dataset 

In [48]:
# Load the exported final mediation dataset used for all descriptive
# statistics below
final_df = pd.read_csv("final_mediation_data_7.18.csv")

In [49]:
# Show the columns of the loaded dataset
final_df.columns

Index(['PID', 'hb_desat', 'avg_spo2_no_desat_NREM', 'SP_DENS_all_C',
       'SP_CHIRP_all_C', 'SP_R_PHASE_IF_all_C', 'dementia_event',
       'days_psg_to_dementia', 'days_psg_to_death', 'age', 'sex', 'race',
       'bmi', 'education', 'smoking', 'alcohol', 'group1_med', 'group2_med',
       'exercise'],
      dtype='object')

In [50]:
# Value counts for the binary Tier 1-Tier 3 covariates
cols = ("sex", "education", "smoking", "group1_med", "group2_med")

for col_name in cols:
    print(f"Column: {col_name}")
    print(final_df[col_name].value_counts(dropna=False).sort_index())
    print("\n")

# Participants flagged for neither medication group
on_no_medication = (final_df['group1_med'] == 0) & (final_df['group2_med'] == 0)
print(f"Number with no medications: {on_no_medication.sum()}")

Column: sex
sex
0    1010
1    1013
Name: count, dtype: int64


Column: education
education
0    1170
1     853
Name: count, dtype: int64


Column: smoking
smoking
0    1812
1     211
Name: count, dtype: int64


Column: group1_med
group1_med
0    1854
1     169
Name: count, dtype: int64


Column: group2_med
group2_med
0     861
1    1162
Name: count, dtype: int64


Number with no medications: 799


In [51]:
# Age distribution: overall mean/SD plus counts per decade

print(f"Mean: {final_df['age'].mean().round(2)}")
print(f"Std: {round(final_df['age'].std(), 2)}")

# Left-closed decade bins [40, 50), [50, 60), ..., [80, 90)
decade_edges = range(40, 100, 10)
groups = pd.cut(final_df['age'], bins=decade_edges, right=False)
age_counts = groups.value_counts().sort_index()


def _decade_label(interval):
    # Render an interval such as [40, 50) as the inclusive label "40-49"
    return f"{interval.left}-{interval.right - 1}"


age_counts.index = age_counts.index.map(_decade_label)

print(age_counts)

# NOTE: final group is only 80-81
print(f"Oldest age: {final_df['age'].max()}")

Mean: 61.49
Std: 6.87
age
40-49     72
50-59    763
60-69    903
70-79    281
80-89      4
Name: count, dtype: int64
Oldest age: 81


In [52]:
# BMI distribution by clinical category, plus overall mean/SD

# Left-closed bins: each category includes its lower edge and excludes its upper
bmi_labels = ['Underweight', 'Normal', 'Overweight', 'Obesity I', 'Obesity II', 'Obesity III']
bmi_bins = [0, 18.5, 25, 30, 35, 40, float('inf')]

bmi_groups = pd.cut(final_df['bmi'], bins=bmi_bins, labels=bmi_labels, right=False)
bmi_counts = bmi_groups.value_counts().sort_index()

print(bmi_counts)

bmi_mean = final_df['bmi'].mean().round(2)
bmi_std = round(final_df['bmi'].std(), 2)
print(f"Mean: {bmi_mean}")
print(f"Std: {bmi_std}")

bmi
Underweight      4
Normal         464
Overweight     867
Obesity I      476
Obesity II     153
Obesity III     59
Name: count, dtype: int64
Mean: 28.67
Std: 5.0


In [53]:
# Alcohol consumption by clinical category -- females

# The final dataset only stores a binary alcohol flag, so pull the original
# alcoh values back in from the covariates table.
# PIDs are compared as strings on both sides so the lookup still matches if
# read_csv parsed the PID column as numbers.
final_pids = set(final_df["PID"].astype(str))
covariates_subset = covariates[covariates["PID"].astype(str).isin(final_pids)].copy()
print(covariates_subset.columns)

female_alcoh = covariates_subset.loc[covariates_subset["nsrr_sex"] == "female", ["PID", "alcoh"]]
print(len(female_alcoh))

# Category masks. "Moderate" is a range (0, 1] rather than an equality test,
# so non-integer values between 0 and 1 are not left uncategorised; this
# mirrors the range-based definition used for males.
# NOTE(review): the "2-4 drinks" label below overlaps with ">=4"; the mask
# itself covers values above 1 and below 4.
no_drinking = (female_alcoh['alcoh'] == 0)
moderate = (female_alcoh['alcoh'] > 0) & (female_alcoh['alcoh'] <= 1)
mod_heavy = (female_alcoh['alcoh'] > 1) & (female_alcoh['alcoh'] < 4)
heavy = (female_alcoh['alcoh'] >= 4)

female_drinking_counts = {
    "No drinking": no_drinking.sum(),
    "Moderate (1 drink)": moderate.sum(),
    "Mod_Heavy (2-4 drinks)": mod_heavy.sum(),
    "Heavy (>=4 drinks)": heavy.sum(),
}

# Print counts
for category, count in female_drinking_counts.items():
    print(f"{category}: {count}")

Index(['PID', 'nsrr_age', 'nsrr_sex', 'nsrr_race', 'nsrr_bmi', 'educat',
       'alcoh', 'evsmok15', 'smknow15', 'cigday15', 'avesmk15'],
      dtype='object')
1013
No drinking: 575
Moderate (1 drink): 138
Mod_Heavy (2-4 drinks): 129
Heavy (>=4 drinks): 171


In [54]:
# Alcohol consumption by clinical category -- males

male_alcoh = covariates_subset.loc[covariates_subset["nsrr_sex"] == "male", ["PID", "alcoh"]]
print(len(male_alcoh))

# Category masks over the male alcoh values
male_drinks = male_alcoh['alcoh']
no_drinking = male_drinks == 0
moderate = (0 < male_drinks) & (male_drinks <= 2)
mod_heavy = (2 < male_drinks) & (male_drinks < 5)
heavy = 5 <= male_drinks

male_drinking_counts = {
    "No drinking": no_drinking.sum(),
    "Moderate (2 drinks or less)": moderate.sum(),
    "Mod_Heavy (2-5 drinks)": mod_heavy.sum(),
    "Heavy (>=5 drinks)": heavy.sum(),
}

# Print counts
for category in male_drinking_counts:
    print(f"{category}: {male_drinking_counts[category]}")


1010
No drinking: 401
Moderate (2 drinks or less): 169
Mod_Heavy (2-5 drinks): 113
Heavy (>=5 drinks): 327


In [55]:
# Mean ± SD of each exposure/mediator for males, females, and everyone
variables = ["avg_spo2_no_desat_NREM", "hb_desat", "SP_DENS_all_C", "SP_CHIRP_all_C", "SP_R_PHASE_IF_all_C"]

# (column label, subset of final_df); sex 0 = male, 1 = female
group_frames = (
    ("Male", final_df[final_df['sex'] == 0]),
    ("Female", final_df[final_df['sex'] == 1]),
    ("Total", final_df),
)

results = []
for metric in variables:
    row = {"Metric": metric}
    for group_name, group_df in group_frames:
        # Non-numeric entries are coerced to NaN before summarising
        vals = pd.to_numeric(group_df[metric], errors='coerce')
        if vals.empty:
            row[group_name] = "N/A"
            continue
        row[group_name] = f"{vals.mean().round(2)} ± {round(vals.std(), 2)}"
    results.append(row)

table = pd.DataFrame(results)
print(table)

                   Metric           Male         Female          Total
0  avg_spo2_no_desat_NREM   94.94 ± 1.51   95.57 ± 1.54   95.25 ± 1.56
1                hb_desat  73.44 ± 68.13  49.15 ± 50.91  61.27 ± 61.33
2           SP_DENS_all_C    2.46 ± 0.93     2.77 ± 0.9    2.62 ± 0.93
3          SP_CHIRP_all_C   -0.19 ± 0.12   -0.24 ± 0.12   -0.22 ± 0.12
4     SP_R_PHASE_IF_all_C    0.47 ± 0.21    0.52 ± 0.22    0.49 ± 0.22


In [56]:
# Days from PSG to dementia among participants with a dementia event
dementia_event = final_df.loc[final_df["dementia_event"] == 1].copy()
print("Num events:", len(dementia_event))

days_to_dementia = dementia_event['days_psg_to_dementia']
print(f"Days to dementia: {days_to_dementia.mean().round(2)} ± {round(days_to_dementia.std(), 2)}")

# Same summary split by sex (1 = female, 0 = male)
for label, value in (("Female", 1), ("Male", 0)):
    group = dementia_event[dementia_event["sex"] == value]
    if group.empty:
        print(f"{label}: No events")
        continue
    mean_val = round(group["days_psg_to_dementia"].mean(), 2)
    std_val = round(group["days_psg_to_dementia"].std(), 2)
    print(f"{label}: {mean_val} ± {std_val}")
    print(f"Number in {label}: {len(group)}")

Num events: 394
Days to dementia: 6086.17 ± 1691.55
Female: 6080.77 ± 1678.44
Number in Female: 208
Male: 6092.21 ± 1710.61
Number in Male: 186


In [57]:
# Follow-up time among participants without a dementia event
followup = final_df.loc[final_df["dementia_event"] == 0].copy()
print("Num events:", len(followup))

followup_days = followup['days_psg_to_dementia']
print(f"Days of follow up time: {followup_days.mean().round(2)} ± {round(followup_days.std(), 2)}")

# Same summary split by sex (1 = female, 0 = male)
for label, value in (("Female", 1), ("Male", 0)):
    group = followup[followup["sex"] == value]
    if group.empty:
        print(f"{label}: No events")
        continue
    mean_val = round(group["days_psg_to_dementia"].mean(), 2)
    std_val = round(group["days_psg_to_dementia"].std(), 2)
    print(f"{label}: {mean_val} ± {std_val}")
    print(f"Number in {label}: {len(group)}")

Num events: 1629
Days of follow up time: 6849.91 ± 2219.45
Female: 7153.89 ± 2013.0
Number in Female: 805
Male: 6552.95 ± 2367.88
Number in Male: 824


In [58]:
# Days from PSG to death among participants with a recorded date of death
has_dod = final_df["days_psg_to_death"].notna()
dod = final_df.loc[has_dod].copy()
print("Num events:", len(dod))

days_to_death = dod['days_psg_to_death']
print(f"Days until death: {days_to_death.mean().round(2)} ± {round(days_to_death.std(), 2)}")

# Same summary split by sex (1 = female, 0 = male)
for label, value in (("Female", 1), ("Male", 0)):
    group = dod[dod["sex"] == value]
    if group.empty:
        print(f"{label}: No events")
        continue
    mean_val = round(group["days_psg_to_death"].mean(), 2)
    std_val = round(group["days_psg_to_death"].std(), 2)
    print(f"{label}: {mean_val} ± {std_val}")
    print(f"Number in {label}: {len(group)}")


Num events: 878
Days until death: 5612.15 ± 2154.89
Female: 5871.24 ± 2008.03
Number in Female: 376
Male: 5418.09 ± 2241.05
Number in Male: 502


In [59]:
# Days from dementia diagnosis to death, among participants who have both a
# dementia event and a recorded date of death.

# Each condition is parenthesised explicitly: `&` binds tighter than `==`,
# so `a & b == 1` is evaluated as `(a & b) == 1`, which only gives the
# intended rows when dementia_event happens to hold integer 0/1 values.
has_dod = final_df["days_psg_to_death"].notna()
has_dementia = final_df["dementia_event"] == 1
dementia_dod = final_df[has_dod & has_dementia].copy()
print("Num events:", len(dementia_dod))

# Both offsets are measured from the PSG, so their difference is the time
# between diagnosis and death
dementia_dod["dementia_to_dod"] = dementia_dod["days_psg_to_death"] - dementia_dod["days_psg_to_dementia"]

print(f"Days from dem diagnosis to death: {dementia_dod['dementia_to_dod'].mean().round(2)} ± {round(dementia_dod['dementia_to_dod'].std(), 2)}")

# Same summary split by sex (1 = female, 0 = male)
for label, value in [("Female", 1), ("Male", 0)]:
    group = dementia_dod[dementia_dod["sex"] == value]
    if not group.empty:
        mean_val = round(group["dementia_to_dod"].mean(), 2)
        std_val = round(group["dementia_to_dod"].std(), 2)
        print(f"{label}: {mean_val} ± {std_val}")
        print(f"Number in {label}: {len(group)}")
    else:
        print(f"{label}: No events")


Num events: 303
Days from dem diagnosis to death: 1094.03 ± 1069.57
Female: 1219.12 ± 1150.14
Number in Female: 155
Male: 963.02 ± 964.68
Number in Male: 148
