In [15]:
import pandas as pd
import pingouin as pg
import numpy as np
from pathlib import Path
from tqdm import tqdm

# suppress warnings
import warnings

warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning)
warnings.filterwarnings("ignore", message=".*column_view.*")

anonymizer = True

root_dir = Path.cwd()


def anondir(path: Path, prefix=root_dir) -> Path:
    """Hide the project root in ``path`` behind the ``<living-park>`` placeholder.

    Args:
        path: Path to anonymize.
        prefix: Leading directory to hide; defaults to the value ``root_dir``
            had when this function was defined.

    Returns:
        ``path`` untouched when the module-level ``anonymizer`` flag is false.
        Otherwise ``<living-park>/<remainder>`` when ``path`` lies under
        ``prefix``, or ``path`` (as a ``Path``) when it does not.
    """
    if not anonymizer:
        return path
    try:
        # Match on whole path components: a sibling such as "<prefix>-old/x"
        # must not be reported as living inside the root, which a plain
        # substring replacement would do.
        remainder = Path(path).relative_to(prefix)
    except ValueError:
        # Not located under prefix: nothing to hide.
        return Path(path)
    return Path("<living-park>") / remainder


# Resolve both locations up front; only the output directory is created here.
stats_dir = Path(root_dir).joinpath("stats_QCed", "sampled")
output_dir = root_dir.joinpath("ancova_ieee")

display(f"Running in root dir: {anondir(root_dir)}")
print(f"Stats directory: {anondir(stats_dir)}")

# exist_ok=True keeps this cell safe to re-run.
output_dir.mkdir(parents=True, exist_ok=True)
print(f"Output directory: {anondir(output_dir)}")

'Running in root dir: <living-park>'

Stats directory: <living-park>/stats_QCed/sampled
Output directory: <living-park>/ancova_ieee


In [16]:
def get_cohort_stats():
    """Load the QC'ed longitudinal cohort table and keep the clinical columns.

    Reads ``cohort/longitudinal_cohort_qced.csv`` under ``root_dir``, derives
    ``PD_status`` from ``dx_group`` (``"PD-non-MCI"`` becomes ``"PD"``, any
    other label is kept as is), exposes ``NP3TOT`` under the name ``UPDRS``
    and prints the number of unique ``PATNO`` values per group and overall.

    Returns:
        pandas.DataFrame: the cohort restricted to the identifier, visit,
        group and covariate columns listed at the end of the function.
    """
    cohort_csv = root_dir / "cohort" / "longitudinal_cohort_qced.csv"
    cohort = pd.read_csv(cohort_csv)
    print(f"Load cohort stats: {anondir(cohort_csv)}")

    group_labels = {"PD-non-MCI": "PD", "HC": "HC"}
    cohort = cohort.assign(
        PD_status=cohort["dx_group"].replace(group_labels)
    ).rename(columns={"NP3TOT": "UPDRS"})

    def count_subjects(status):
        # Unique subjects in one diagnostic group.
        return cohort.loc[cohort["PD_status"] == status, "PATNO"].nunique()

    print(f"Number of PD-non-MCI subjects: {count_subjects('PD')}")
    print(f"Number of HC subjects: {count_subjects('HC')}")
    print(f"Total number of subjects: {cohort['PATNO'].nunique()}")

    kept_columns = [
        "PATNO",
        "first_visit",
        "second_visit",
        "PD_status",
        "SEX",
        "AGE_AT_VISIT",
        "UPDRS",
        "durationT2_T1_y",
    ]
    return cohort[kept_columns]


# Load the clinical cohort table once; later cells pass it to the ANCOVA helpers.
df_clinical = get_cohort_stats()

Load cohort stats: <living-park>/cohort/longitudinal_cohort_qced.csv
Number of PD-non-MCI subjects: 112
Number of HC subjects: 89
Total number of subjects: 201


# ANCOVA

## Cortical

In [17]:
def read_table(filename, hemi, measure):
    """Load one hemisphere's aparc table and normalise its column names.

    Args:
        filename: Path of the tab-separated table to read. ``None`` falls back
            to ``root_dir / "table_ieee" / f"{hemi}.aparc.{measure}.tsv"``.
        hemi: Hemisphere tag (the notebook uses ``"lh"`` and ``"rh"``); stored
            in a ``hemi`` column and stripped from the column names.
        measure: Measure name; ``_{measure}`` is stripped from the column names.

    Returns:
        pandas.DataFrame: the table with cleaned column names, a ``hemi``
        column, and the ``aparc.{measure}`` column renamed to ``PATNO_id``.
    """
    # Honour the caller's path; it used to be silently overwritten here.
    if filename is None:
        filename = root_dir / "table_ieee" / f"{hemi}.aparc.{measure}.tsv"

    df = pd.read_csv(filename, sep="\t")
    df["hemi"] = hemi

    # Strip the hemisphere and measure decorations, in the same order as before.
    for token in (f"{hemi}.", f"{hemi}_", f"_{measure}"):
        df.columns = [name.replace(token, "") for name in df.columns]

    # After stripping, the identifier column is left as "aparc.<measure>".
    return df.rename(columns={f"aparc.{measure}": "PATNO_id"})


def read_measure(measure):
    """Stack the left- and right-hemisphere tables of ``measure``.

    Each hemisphere is loaded with ``read_table`` from
    ``table_ieee/{hemi}.aparc.{measure}.tsv`` under ``root_dir``; the left
    hemisphere rows come first in the returned frame.
    """
    hemisphere_tables = [
        read_table(
            root_dir / "table_ieee" / f"{hemi}.aparc.{measure}.tsv", hemi, measure
        )
        for hemi in ("lh", "rh")
    ]
    return pd.concat(hemisphere_tables, axis=0)


def get_metric_visit(metric, cohort_df, visit):
    """Return the long-format ``metric`` table for one visit, joined to the cohort.

    Args:
        metric: Measure name handed to ``read_measure``; also the name of the
            value column in the result.
        cohort_df: Cohort table providing ``first_visit``/``second_visit``,
            ``AGE_AT_VISIT``, ``SEX``, ``durationT2_T1_y`` and ``PD_status``.
        visit: ``1`` to match on ``first_visit``, ``2`` for ``second_visit``.

    Returns:
        pandas.DataFrame: one row per (``PATNO_id``, ``hemi``, ``region``) whose
        ``PATNO_id`` equals the selected visit column of some cohort row.

    Raises:
        ValueError: if ``visit`` is neither 1 nor 2.
    """
    if visit not in (1, 2):
        raise ValueError("Visit must be 1 or 2")
    visit_col = "second_visit" if visit == 2 else "first_visit"

    # Wide table (one column per region) -> long table (one row per region).
    long_df = read_measure(metric).melt(
        id_vars=["PATNO_id", "hemi"], var_name="region", value_name=metric
    )

    covariates = cohort_df[
        [visit_col, "AGE_AT_VISIT", "SEX", "durationT2_T1_y", "PD_status"]
    ]
    merged = long_df.merge(
        covariates, left_on="PATNO_id", right_on=visit_col, how="inner"
    )

    # Unparseable entries become NaN rather than raising.
    for column in (metric, "AGE_AT_VISIT", "durationT2_T1_y"):
        if column in merged.columns:
            merged[column] = pd.to_numeric(merged[column], errors="coerce")

    return merged


def get_longitudinal_metric(metric, cohort_df):
    """Compute the relative change of ``metric`` between the two visits.

    Args:
        metric: Measure name handed to ``get_metric_visit``.
        cohort_df: Cohort table handed to ``get_metric_visit``.

    Returns:
        pandas.DataFrame: one row per (``PATNO``, ``region``, ``hemi``) present
        at both visits. The un-suffixed columns hold the second-visit values,
        and ``{metric}_change`` holds ``(next - baseline) / baseline``, set to
        NaN when the baseline value is zero.

    Raises:
        ValueError: if either visit has no rows, or no record is shared by
            both visits.
    """
    baseline_df = get_metric_visit(metric, cohort_df=cohort_df, visit=1)
    next_df = get_metric_visit(metric, cohort_df=cohort_df, visit=2)

    if baseline_df.empty:
        raise ValueError("No baseline data available")
    if next_df.empty:
        raise ValueError("No longitudinal data available")

    # The subject key is whatever precedes the first "_" in PATNO_id.
    baseline_df["PATNO"] = baseline_df["PATNO_id"].str.split("_").str[0]
    next_df["PATNO"] = next_df["PATNO_id"].str.split("_").str[0]

    # Pair each subject/region/hemisphere across the two visits.
    change_df = pd.merge(
        baseline_df,
        next_df,
        on=["PATNO", "region", "hemi"],
        suffixes=("_baseline", "_next"),
    )

    if change_df.empty:
        raise ValueError("No matching records found between baseline and next visit")

    baseline_values = change_df[f"{metric}_baseline"]
    relative_change = (change_df[f"{metric}_next"] - baseline_values) / baseline_values
    # A zero baseline yields +/-inf, which downstream NaN handling does not
    # remove; mark the change as missing instead so the row can be dropped.
    change_df[f"{metric}_change"] = relative_change.where(baseline_values != 0)

    # Keep only the second-visit copy of every duplicated column, un-suffixed.
    change_df = change_df.drop(columns=change_df.filter(regex="_baseline$").columns)
    change_df = change_df.rename(columns=lambda name: name.replace("_next", ""))

    return change_df

In [18]:
import pingouin as pg


def compute_ancova(measure, cohort_df, force):
    """Run a per-region ANCOVA on the longitudinal change of ``measure``.

    For every hemisphere/region pair, ``{measure}_change`` is modelled with
    ``PD_status`` as the between factor and ``AGE_AT_VISIT``, ``SEX`` and
    ``durationT2_T1_y`` as covariates.

    Args:
        measure: Measure name handed to ``get_longitudinal_metric``.
        cohort_df: Cohort table handed to ``get_longitudinal_metric``.
        force: When true, always recompute and overwrite the CSV. When false,
            an existing ``ancova_longitudinal_{measure}.csv`` in ``output_dir``
            is loaded and returned instead of recomputing.

    Returns:
        pandas.DataFrame: columns ``hemisphere``, ``region``, ``F``, ``pval``,
        also written to ``output_dir``.
    """
    filename = output_dir / f"ancova_longitudinal_{measure}.csv"
    # The flag used to be ignored; it now guards the expensive recomputation.
    if not force and filename.exists():
        return pd.read_csv(filename)

    df = get_longitudinal_metric(measure, cohort_df)

    # Collect rows in a list and build the frame once, instead of growing a
    # DataFrame one .loc assignment at a time.
    records = []
    for hemi in df["hemi"].unique():
        for region in df["region"].unique():
            df_region = df[(df["hemi"] == hemi) & (df["region"] == region)]
            ancova = pg.ancova(
                data=df_region,
                dv=f"{measure}_change",
                between="PD_status",
                covar=["AGE_AT_VISIT", "SEX", "durationT2_T1_y"],
            )
            # First row of the pingouin table; presumably the PD_status term
            # (confirm against the pingouin version in use).
            records.append(
                {
                    "hemisphere": hemi,
                    "region": region,
                    "F": ancova["F"].values[0],
                    "pval": ancova["p-unc"].values[0],
                }
            )

    ancova_df = pd.DataFrame(records, columns=["hemisphere", "region", "F", "pval"])
    ancova_df.to_csv(filename, index=False)

    return ancova_df

In [19]:
# Run the longitudinal ANCOVA for each cortical measure, in this order.
ancova_volume, ancova_thickness, ancova_area = (
    compute_ancova(cortical_measure, df_clinical, force=True)
    for cortical_measure in ("volume", "thickness", "area")
)

In [20]:
# Regions with uncorrected p < 0.05 for volume, strongest effect first.
ancova_volume.loc[ancova_volume["pval"].lt(0.05)].sort_values(by="F", ascending=False)

Unnamed: 0,hemisphere,region,F,pval
54,rh,parstriangularis,5.263752,0.022885
40,rh,entorhinal,4.491546,0.035382
6,lh,inferiorparietal,4.134082,0.043441
28,lh,superiortemporal,3.966986,0.047856


In [21]:
# Regions with uncorrected p < 0.05 for thickness, strongest effect first.
ancova_thickness.loc[ancova_thickness["pval"].lt(0.05)].sort_values(
    by="F", ascending=False
)

Unnamed: 0,hemisphere,region,F,pval
16,lh,parsopercularis,4.47463,0.035726


In [22]:
# Regions with uncorrected p < 0.05 for area, strongest effect first.
ancova_area.loc[ancova_area["pval"].lt(0.05)].sort_values(by="F", ascending=False)

Unnamed: 0,hemisphere,region,F,pval
16,lh,parsopercularis,5.953314,0.015622
6,lh,inferiorparietal,5.003512,0.026477
28,lh,superiortemporal,4.172141,0.042497
37,rh,bankssts,4.071769,0.045035


## Subcortical Volume

In [None]:
def get_subcortical_volume_visit(cohort_df, visit):
    """Return the long-format aseg volume table for one visit, joined to the cohort.

    Args:
        cohort_df: Cohort table providing ``first_visit``/``second_visit``,
            ``AGE_AT_VISIT``, ``SEX``, ``PD_status`` and ``durationT2_T1_y``.
        visit: ``1`` to match on ``first_visit``, ``2`` for ``second_visit``.

    Returns:
        pandas.DataFrame: one row per (``PATNO_id``, ``region``) whose
        ``PATNO_id`` equals the selected visit column of some cohort row, with
        the value stored in ``volume``.

    Raises:
        ValueError: if ``visit`` is neither 1 nor 2.
    """
    # Validate before touching the disk, as get_metric_visit does.
    if visit not in [1, 2]:
        raise ValueError("Visit must be 1 or 2")
    visit_col = "first_visit" if visit == 1 else "second_visit"

    filename = root_dir / "table_ieee" / "aseg.volume.tsv"
    df = pd.read_csv(filename, sep="\t")
    df = df.rename(columns={"Measure:volume": "PATNO_id"})

    # Wide table (one column per structure) -> long table (one row per structure).
    df = df.melt(id_vars=["PATNO_id"], var_name="region", value_name="volume")

    clinical_columns = [
        visit_col,
        "AGE_AT_VISIT",
        "SEX",
        "PD_status",
        "durationT2_T1_y",
    ]

    merged_df = pd.merge(
        df,
        cohort_df[clinical_columns],
        left_on="PATNO_id",
        right_on=visit_col,
        how="inner",
    )

    # Unparseable entries become NaN rather than raising.
    numeric_cols = ["volume", "AGE_AT_VISIT", "durationT2_T1_y"]
    for col in numeric_cols:
        if col in merged_df.columns:
            merged_df[col] = pd.to_numeric(merged_df[col], errors="coerce")

    return merged_df


def get_ancova_subcortical_volume_longitudinal(cohort_df):
    """Run a per-region ANCOVA on the longitudinal change of aseg volumes.

    The relative change ``(next - baseline) / baseline`` of every column of the
    aseg table is modelled with ``PD_status`` as the between factor and
    ``AGE_AT_VISIT``, ``SEX`` and ``durationT2_T1_y`` as covariates. A zero
    baseline gives a missing change rather than +/-inf.

    Args:
        cohort_df: Cohort table handed to ``get_subcortical_volume_visit``.

    Returns:
        pandas.DataFrame: columns ``region``, ``F``, ``pval``; regions whose
        model fails get NaN in both statistics. The table is also written to
        ``ancova_longitudinal_subcortical_volume.csv`` in ``output_dir``.

    Raises:
        ValueError: if either visit has no rows, or no record is shared by
            both visits.
    """
    baseline_df = get_subcortical_volume_visit(cohort_df, visit=1)
    next_df = get_subcortical_volume_visit(cohort_df, visit=2)

    if baseline_df.empty:
        raise ValueError("No baseline data available")
    if next_df.empty:
        raise ValueError("No longitudinal data available")

    # The subject key is whatever precedes the first "_" in PATNO_id.
    baseline_df["PATNO"] = baseline_df["PATNO_id"].str.split("_").str[0]
    next_df["PATNO"] = next_df["PATNO_id"].str.split("_").str[0]

    # Pair each subject/region across the two visits.
    change_df = pd.merge(
        baseline_df,
        next_df,
        on=["PATNO", "region"],
        suffixes=("_baseline", "_next"),
    )

    if change_df.empty:
        raise ValueError("No matching records found between baseline and next visit")

    baseline_volume = change_df["volume_baseline"]
    relative_change = (change_df["volume_next"] - baseline_volume) / baseline_volume
    # A zero baseline yields +/-inf, which downstream NaN handling does not
    # remove; mark the change as missing instead so the row can be dropped.
    change_df["volume_change"] = relative_change.where(baseline_volume != 0)

    # Keep only the second-visit copy of every duplicated column, un-suffixed.
    change_df = change_df.drop(columns=change_df.filter(regex="_baseline$").columns)
    change_df = change_df.rename(columns=lambda name: name.replace("_next", ""))

    # Collect rows in a list and build the frame once, instead of growing a
    # DataFrame one .loc assignment at a time.
    records = []
    for region in change_df["region"].unique():
        df_region = change_df[change_df["region"] == region]
        try:
            ancova = pg.ancova(
                data=df_region,
                dv="volume_change",
                between="PD_status",
                covar=["AGE_AT_VISIT", "SEX", "durationT2_T1_y"],
            )
            # First row of the pingouin table; presumably the PD_status term
            # (confirm against the pingouin version in use).
            F, pval = ancova["F"].values[0], ancova["p-unc"].values[0]
        except Exception as e:
            # Best effort: report the failure and keep the region as NaN.
            print("Error:", e)
            print(f"Skipping region: {region}")
            F, pval = np.nan, np.nan
        records.append({"region": region, "F": F, "pval": pval})

    ancova_df = pd.DataFrame(records, columns=["region", "F", "pval"])

    filename = output_dir / "ancova_longitudinal_subcortical_volume.csv"
    ancova_df.to_csv(filename, index=False)

    return ancova_df


# Run the subcortical ANCOVA; the function also writes the table to a CSV in output_dir.
ancova_results = get_ancova_subcortical_volume_longitudinal(df_clinical)

In [26]:
# Show the per-region subcortical ANCOVA table.
ancova_results

Unnamed: 0,region,F,pval
0,Left-Lateral-Ventricle,0.075442,0.783876
1,Left-Inf-Lat-Vent,0.557336,0.456273
2,Left-Cerebellum-White-Matter,0.447747,0.504232
3,Left-Cerebellum-Cortex,0.486503,0.486358
4,Left-Thalamus,0.059013,0.808328
...,...,...,...
59,MaskVol-to-eTIV,0.308570,0.579223
60,lhSurfaceHoles,0.215870,0.642745
61,rhSurfaceHoles,0.001155,0.972924
62,SurfaceHoles,0.549602,0.459410
