## Generating dataset with physical activity as a covariate

This notebook converts the ARIC and FHS physical activity data into a single binary `exercise` variable (1 = exercises, 0 = does not) that is used as a covariate in the corresponding mediation model script.

In [1]:
# Load necessary libraries 
import numpy as np
import pandas as pd

In [2]:
# Load physical activity data from mastersheet.
# Both cohort sheets live in the same workbook, so they are requested in a
# single read_excel call (a list of sheet names returns a dict of DataFrames
# keyed by sheet name) instead of opening and parsing the file twice.
phys_sheets = pd.read_excel("mastersheet.xlsx", sheet_name=["Aric_phys", "FHS_phys"])
aric_data = phys_sheets["Aric_phys"]
fhs_data = phys_sheets["FHS_phys"]

In [3]:
# Read the mediation model dataset and collect its final list of participant IDs.
# PID is cast to str so it can be matched against the (also string) cohort PIDs.
mediation_dataset = pd.read_csv("mediation_model_dataset.csv").astype({"PID": str})
pids = list(mediation_dataset["PID"])


In [4]:
# Cast the PID column of both cohort sheets to str so it is comparable with `pids`
for cohort_data in (fhs_data, aric_data):
    cohort_data["PID"] = cohort_data["PID"].astype(str)

# Keep only the participants that appear in the final mediation PID list
fhs_in_final = fhs_data["PID"].isin(pids)
aric_in_final = aric_data["PID"].isin(pids)
fhs_final = fhs_data.loc[fhs_in_final].copy()
aric_final = aric_data.loc[aric_in_final].copy()

# Number of retained participants per cohort (FHS first, then ARIC)
print(len(fhs_final), len(aric_final), sep="\n")

486
1537


In [5]:
# RPAC8: did the participant exercise / play sports during the year?
# Tabulate every response, including missing ones, before recoding it.
rpac8_responses = aric_final["RPAC8"]
rpac8_responses.value_counts(dropna=False)

RPAC8
Y    1135
N     402
Name: count, dtype: int64

In [6]:
# Start with the ARIC cohort

# PID-indexed RPAC8 answers recoded as 'Y' → 1 and 'N' → 0
yes_no_to_binary = {"Y": 1, "N": 0}
rpac8_by_pid = aric_final.set_index("PID")["RPAC8"]
rpac8_map = rpac8_by_pid.map(yes_no_to_binary)

# Look up each mediation participant's PID in that mapping to create the new column
mediation_dataset["exercise"] = mediation_dataset["PID"].map(rpac8_map)

exercise_counts = mediation_dataset["exercise"].value_counts(dropna=False).sort_index()
print(exercise_counts)

# The 486 NaN values belong to the FHS participants

exercise
0.0     402
1.0    1135
NaN     486
Name: count, dtype: int64


In [7]:
# Same procedure for the FHS cohort

# FHS splits activity into categories; the variables ending in _2 contain the
# number of sports/activities played over the year for each category.
# Adding them up row-wise gives one exercise score per participant.
fhs_subset = fhs_final.filter(regex="_2$")
fhs_final["exercise_score"] = fhs_subset.sum(axis="columns")

exercise_score_counts = fhs_final["exercise_score"].value_counts(dropna=False).sort_index()
print(exercise_score_counts)

exercise_score
0.0      2
1.0      2
3.0      3
4.0      6
5.0      5
        ..
133.0    1
146.0    1
157.0    1
171.0    1
285.0    1
Name: count, Length: 99, dtype: int64


In [8]:
# Now convert the exercise score into a binary: 1 if any activity was reported, 0 if none.
#
# The score is a row-wise sum, and pandas sums an all-missing row to 0. A participant
# with no activity data at all would therefore be coded as 0 ("no exercise") and become
# indistinguishable from a participant who truly reported zero activities. To avoid
# silently imputing "no exercise", rows whose *_2 activity columns are all missing are
# left as NaN. The column is float (0.0 / 1.0 / NaN) so that it can hold missing values.
fhs_activity_columns = fhs_final.filter(regex="_2$")
has_activity_data = fhs_activity_columns.notna().any(axis=1)

fhs_final["exercise_binary"] = (
    (fhs_final["exercise_score"] > 0).astype(float).where(has_activity_data)
)

# dropna=False so that participants without activity data show up in the tabulation
print(fhs_final["exercise_binary"].value_counts(dropna=False).sort_index())

exercise_binary
0      2
1    484
Name: count, dtype: int64


In [9]:
# Merge the FHS values into the overall df using a PID → exercise_binary mapping

# Make sure both PID columns are strings BEFORE the mapping is built, so the
# dictionary keys and the lookup values are guaranteed to be of the same type.
mediation_dataset["PID"] = mediation_dataset["PID"].astype(str)
fhs_final["PID"] = fhs_final["PID"].astype(str)

# Create mapping
exercise_map = fhs_final.set_index("PID")["exercise_binary"].to_dict()

# Only update missing values, rather than replacing the ARIC participant values.
# `map` looks up every PID at once (PIDs absent from the mapping give NaN) and
# `fillna` writes those values only into rows whose "exercise" is still missing,
# which replaces the slow row-by-row iterrows loop with one vectorised step.
fhs_exercise = mediation_dataset["PID"].map(exercise_map)
mediation_dataset["exercise"] = mediation_dataset["exercise"].fillna(fhs_exercise)

print(mediation_dataset["exercise"].value_counts(dropna=False).sort_index())

exercise
0.0     404
1.0    1619
Name: count, dtype: int64


In [10]:
# Store the dataset, now including the binary "exercise" column, as a CSV to run the mediation model.
# The export is currently disabled (commented out); uncomment the line below to write the file.
# NOTE(review): the frame built in this notebook is `mediation_dataset`; no `med_dataset`
# variable is defined here, so the export must reference `mediation_dataset`.
# NOTE(review): the file name keeps its existing spelling ("datset") in case the mediation
# model script reads that exact name - confirm before renaming.

#mediation_dataset.to_csv("exercise_med_datset.csv", index=False)

## Mediation Figures 
This section builds two clean, formatted tables from the mediation output: one with the model performance metrics and one with the mediation effect estimates.

In [11]:
# Build the formatted performance-results table for the significant pathways only


def _format_metric_with_ci(row, value_col, lower_col, upper_col):
    """Return "value\\n[lower–upper]" for one row, every number shown with 3 decimals."""
    return f"{row[value_col]:.3f}\n[{row[lower_col]:.3f}–{row[upper_col]:.3f}]"


# Read the results sheet and discard the columns that are irrelevant for this table
results = pd.read_excel("med_results_physAct_7.9.2025.xlsx", sheet_name="Group4")
unused_columns = ["ACME_Bootstrap_Success", "Covariate_Set", "N_Used", "ADE_Bootstrap_Success", "AUC_5y"]
results = results.drop(columns=unused_columns)

# Label slice: every column from "ACME" through "ADE_pval_empirical" (inclusive) is removed
effect_columns = results.loc[:, "ACME":"ADE_pval_empirical"].columns
results = results.drop(columns=effect_columns)

# Give the remaining metric columns their display names
display_names = {
    "Std_Dev_Y": "SD (Mediator)",
    "C_Index": "C-Index",
    "AUC_10y": "AUC (10y)",
    "AUC_15y": "AUC (15y)",
}
results = results.rename(columns=display_names)

# Replace the raw exposure/mediator variable names with readable labels
exposure_map = {
    "avg_spo2_no_desat_NREM": "Avg Baseline SpO₂ NREM",
    "hb_desat": "Hypoxic Burden"
}
mediator_map = {
    "SP_DENS_all_C": "Spindle Density",
    "SP_CHIRP_all_C": "Spindle Chirp",
    "SP_R_PHASE_IF_all_C": "SO Phase Coupling"
}
for label_col, label_map in (("Exposure", exposure_map), ("Mediator", mediator_map)):
    results[label_col] = results[label_col].replace(label_map)

# Round the metric values to 3 decimals
cols_to_round = ["RMSE", "SD (Mediator)", "C-Index", "AUC (10y)", "AUC (15y)"]
results[cols_to_round] = results[cols_to_round].round(3)

# Combine every metric with its confidence interval: the value on the first
# line and the CI in brackets on the line underneath
ci_bounds = {
    "C-Index": ("C_Index_CI_Lower", "C_Index_CI_Upper"),
    "AUC (10y)": ("AUC_10y_CI_Lower", "AUC_10y_CI_Upper"),
    "AUC (15y)": ("AUC_15y_CI_Lower", "AUC_15y_CI_Upper"),
}
for metric, (lower, upper) in ci_bounds.items():
    results[f"{metric} [95% CI]"] = results.apply(
        _format_metric_with_ci, axis=1, args=(metric, lower, upper)
    )

# The individual value and bound columns are no longer needed once combined
combined_away = list(ci_bounds)
for bounds in ci_bounds.values():
    combined_away.extend(bounds)
results = results.drop(columns=combined_away)

print(results)

                 Exposure           Mediator   RMSE  SD (Mediator)  \
0  Avg Baseline SpO₂ NREM    Spindle Density  0.866          0.932   
1  Avg Baseline SpO₂ NREM      Spindle Chirp  0.116          0.123   
2  Avg Baseline SpO₂ NREM  SO Phase Coupling  0.204          0.215   
3          Hypoxic Burden    Spindle Density  0.864          0.932   
4          Hypoxic Burden      Spindle Chirp  0.116          0.123   
5          Hypoxic Burden  SO Phase Coupling  0.204          0.215   

       C-Index [95% CI]    AUC (10y) [95% CI]    AUC (15y) [95% CI]  
0  0.787\n[0.765–0.808]  0.833\n[0.773–0.894]  0.822\n[0.786–0.858]  
1  0.784\n[0.762–0.806]  0.826\n[0.764–0.888]  0.816\n[0.779–0.854]  
2  0.784\n[0.762–0.806]  0.825\n[0.763–0.887]  0.816\n[0.779–0.853]  
3  0.787\n[0.765–0.809]  0.834\n[0.773–0.894]  0.823\n[0.786–0.859]  
4  0.784\n[0.762–0.806]  0.826\n[0.764–0.889]  0.817\n[0.779–0.854]  
5  0.784\n[0.762–0.806]  0.825\n[0.763–0.887]  0.817\n[0.779–0.854]  


In [12]:
# Export the formatted performance-metrics table (`results`) to Excel.
# The export is currently disabled; uncomment it and replace the "NAME FILE.xlsx"
# placeholder with the desired output file name.
# NOTE: the next cell reassigns `results` to the raw sheet, so this export has to be
# run before that cell.
#results.to_excel("NAME FILE.xlsx", index=False)

In [13]:
# Same approach as the performance table, this time for the mediation results df


def _format_decimals(value, places):
    """Render a number with a fixed count of decimals; missing values become ""."""
    return f"{value:.{places}f}" if pd.notna(value) else ""


def _format_effect_with_ci(row, value_col, lower_col, upper_col):
    """Return "estimate\\n[low, high]" for one row of already-formatted strings.

    Hypoxic Burden rows were multiplied by -1, so their stored lower/upper bounds
    have traded places and are printed in swapped order.
    """
    if row["Exposure"] == "Hypoxic Burden":
        lower_col, upper_col = upper_col, lower_col
    return f"{row[value_col]}\n[{row[lower_col]}, {row[upper_col]}]"


# Select the effect estimates, their CIs and p-values from the raw sheet
results = pd.read_excel("med_results_physAct_7.9.2025.xlsx", sheet_name="Group4")
effect_columns = [
    "Exposure", "Mediator",
    "ACME", "ACME_CI_Lower", "ACME_CI_Upper", "ACME_pval_empirical",
    "ADE", "ADE_CI_Lower", "ADE_CI_Upper", "ADE_pval_empirical",
    "Total_Effect", "Total_Effect_CI_Lower", "Total_Effect_CI_Upper", "TE_pval_empirical",
]
data = results[effect_columns].copy()

# Replace the raw exposure/mediator variable names with readable labels
exposure_map = {
    "avg_spo2_no_desat_NREM": "Avg Desat-Removed Basal SpO₂ (NREM)",
    "hb_desat": "Hypoxic Burden"
}
mediator_map = {
    "SP_DENS_all_C": "Spindle Density",
    "SP_CHIRP_all_C": "Spindle Chirp",
    "SP_R_PHASE_IF_all_C": "Spindle-SO Coupling Phase"
}
for label_col, label_map in (("Exposure", exposure_map), ("Mediator", mediator_map)):
    data[label_col] = data[label_col].replace(label_map)

# Sign transform for the Hypoxic Burden rows (multiplied by -1 to enhance interpretability)
mask = data["Exposure"] == "Hypoxic Burden"
cols_to_flip = [
    "ACME", "ACME_CI_Lower", "ACME_CI_Upper",
    "ADE", "ADE_CI_Lower", "ADE_CI_Upper",
    "Total_Effect", "Total_Effect_CI_Lower", "Total_Effect_CI_Upper"
]
for col in cols_to_flip:
    flipped = pd.to_numeric(data.loc[mask, col], errors='coerce').mul(-1)
    data.loc[mask, col] = flipped

# p-values always show 3 decimal places
pval_cols = ["ACME_pval_empirical", "ADE_pval_empirical", "TE_pval_empirical"]
for col in pval_cols:
    data[col] = data[col].apply(_format_decimals, args=(3,))

# Every other numeric column (so not Exposure/Mediator or the p-values) shows 1 decimal place
left_as_is = {"Exposure", "Mediator", *pval_cols}
one_decimal_cols = [
    name for name in data.columns
    if name not in left_as_is and pd.api.types.is_numeric_dtype(data[name])
]
for col in one_decimal_cols:
    data[col] = data[col].apply(_format_decimals, args=(1,))

# Put each estimate on the first line with its CI in brackets underneath
# (the bound swap for Hypoxic Burden happens inside the row formatter)
ci_bounds = {
    "ACME": ("ACME_CI_Lower", "ACME_CI_Upper"),
    "ADE": ("ADE_CI_Lower", "ADE_CI_Upper"),
    "Total_Effect": ("Total_Effect_CI_Lower", "Total_Effect_CI_Upper"),
}
for metric, (lower, upper) in ci_bounds.items():
    data[f"{metric} [95% CI]"] = data.apply(
        _format_effect_with_ci, axis=1, args=(metric, lower, upper)
    )

print(data.columns)

# The separate CI bound columns are now redundant
separate_ci_cols = [bound for bounds in ci_bounds.values() for bound in bounds]
data = data.drop(columns=separate_ci_cols)

# Final display names for the p-value and total-effect columns
final_names = {
    "ACME_pval_empirical": "ACME \np-value",
    "ADE_pval_empirical": "ADE p-value",
    "TE_pval_empirical": "TE p-value",
    "Total_Effect [95% CI]": "TE [95% CI]",
}
data = data.rename(columns=final_names)

# Keep only the presentation columns, in table order
col_order = [
    "Exposure", "Mediator",
    "ACME [95% CI]", "ACME \np-value",
    "ADE [95% CI]", "ADE p-value",
    "TE [95% CI]", "TE p-value"
]
data = data[col_order]

print(data)

Index(['Exposure', 'Mediator', 'ACME', 'ACME_CI_Lower', 'ACME_CI_Upper',
       'ACME_pval_empirical', 'ADE', 'ADE_CI_Lower', 'ADE_CI_Upper',
       'ADE_pval_empirical', 'Total_Effect', 'Total_Effect_CI_Lower',
       'Total_Effect_CI_Upper', 'TE_pval_empirical', 'ACME [95% CI]',
       'ADE [95% CI]', 'Total_Effect [95% CI]'],
      dtype='object')
                              Exposure                   Mediator  \
0  Avg Desat-Removed Basal SpO₂ (NREM)            Spindle Density   
1  Avg Desat-Removed Basal SpO₂ (NREM)              Spindle Chirp   
2  Avg Desat-Removed Basal SpO₂ (NREM)  Spindle-SO Coupling Phase   
3                       Hypoxic Burden            Spindle Density   
4                       Hypoxic Burden              Spindle Chirp   
5                       Hypoxic Burden  Spindle-SO Coupling Phase   

          ACME [95% CI] ACME \np-value              ADE [95% CI] ADE p-value  \
0   138.5\n[4.9, 354.6]          0.034  539.3\n[-1675.8, 2676.2]       0.645   
1  

In [14]:
# Export the formatted mediation results table (`data`) to Excel.
# The export is currently disabled; uncomment it and replace the "NAME FILE.xlsx"
# placeholder with the desired output file name.
#data.to_excel("NAME FILE.xlsx", index=False)