In [336]:
import livingpark_utils
import pandas as pd
import numpy as np

from collections import defaultdict
from itertools import combinations
from typing import Dict
from pathlib import Path

from collections.abc import Iterable

import rich
from rich.console import Console
from rich.table import Table

# When True, printed paths hide the user-specific project root.
anonymizer = True

# Project root; the cohort directory and default anonymization prefix derive from it.
root_dir = Path.cwd()


def anondir(path: Path, prefix=root_dir) -> Path:
    """Replace the leading ``prefix`` of ``path`` with the ``<living-park>`` placeholder.

    Args:
        path: Path (or path string) to display.
        prefix: Directory to hide; defaults to the notebook's ``root_dir``.

    Returns:
        ``path`` unchanged when ``anonymizer`` is False. Otherwise a ``Path`` in
        which a leading ``prefix`` is replaced by ``<living-park>``; a path that
        is not located under ``prefix`` is returned as a ``Path`` without changes.
    """
    if not anonymizer:
        return path
    try:
        # relative_to only matches whole leading path components, so a sibling
        # directory such as "<prefix>2/x" or an inner occurrence of the prefix
        # text is never rewritten (plain str.replace rewrote both).
        relative = Path(path).relative_to(prefix)
    except ValueError:
        return Path(path)
    return Path("<living-park>") / relative


# Folder that holds the cohort CSV files this notebook reads and writes.
cohort_dir = root_dir / "cohort"
# anondir() hands its argument back untouched when `anonymizer` is off, so the
# same two calls print the right thing in either mode.
print(f"Current working directory: {anondir(root_dir)}")
print(f"Cohort directory: {anondir(cohort_dir)}")

random_seed = 2
utils = livingpark_utils.LivingParkUtils()
utils.notebook_init()

import warnings

# Silence every Python warning for the rest of the session.
warnings.filterwarnings("ignore")

Current working directory: <living-park>
Cohort directory: <living-park>/cohort
This notebook was run on 2025-11-12 21:37:18 UTC +0000


In [337]:
import livingpark_utils
from livingpark_utils.download import ppmi

# Fresh LivingPark helper; it replaces the `utils` object created in the first cell.
utils = livingpark_utils.LivingParkUtils()
print("Study files directory:", anondir(utils.study_files_dir))
# Downloader bound to the study-files directory; handed to get_study_files below.
downloader = ppmi.Downloader(utils.study_files_dir)

# PPMI study-data tables requested for this notebook.
required_files = [
    "Demographics.csv",
    "Age_at_visit.csv",
    "Primary_Clinical_Diagnosis.csv",
    "Cognitive_Categorization.csv",
    "Medical_Conditions_Log.csv",
    "Concomitant_Medication_Log.csv",
    "MDS-UPDRS_Part_III.csv",
    "Participant_Status.csv",
    "Socio-Economics.csv",
    "Montreal_Cognitive_Assessment__MoCA_.csv",
    "PD_Diagnosis_History.csv",
    "LEDD_Concomitant_Medication_Log.csv",
]

# notebook_init() is called a second time here; it prints the run timestamp again.
utils.notebook_init()
# The recorded output shows "Download skipped" when no requested file is missing.
utils.get_study_files(required_files, default=downloader)

Study files directory: <living-park>/inputs/study_files
This notebook was run on 2025-11-12 21:37:18 UTC +0000
Download skipped: No missing files!


In [338]:
from livingpark_utils.scripts import run

# Runs the LivingPark MRI-metadata script.
# NOTE(review): presumably this is what produces the MRI_info.csv read further down — confirm.
run.mri_metadata()

# Cohort preparation

We will build a cohort using data from the Parkinson's Progression Markers Initiative (PPMI). We will select patients with Parkinson's disease (PD), both with Mild Cognitive Impairment (PD-MCI) and without it (PD-non-MCI), as well as healthy controls (HC). The cohort will be built directly from PPMI Study Data so that it can be replicated and updated whenever necessary.

We will use the LivingPark utils library to download files from the notebook. If files are already present in the notebook cache, they won't be downloaded again. Otherwise, you will need to enter your PPMI username and password. **In case you don't have a PPMI account, you can request one [here](http://ppmi-info.org).**

In [339]:
import os
import pandas as pd


# Read data files
def pd_from_csv(file_name):
    """Load one CSV from the study-files directory into a DataFrame.

    Args:
        file_name: Name of a CSV file located in ``utils.study_files_dir``.

    Returns:
        The DataFrame produced by ``pd.read_csv`` with default options.
    """
    csv_path = os.path.join(utils.study_files_dir, file_name)
    return pd.read_csv(csv_path)


# Each table is trimmed to the columns listed in `fields` right after loading.

# Demographics
fields = ["PATNO", "SEX", "BIRTHDT"]
dem = pd_from_csv("Demographics.csv")[fields]

# Age at visit
fields = ["PATNO", "EVENT_ID", "AGE_AT_VISIT"]
age = pd_from_csv("Age_at_visit.csv")[fields]

# Education
fields = ["PATNO", "EDUCYRS"]
edu = pd_from_csv("Socio-Economics.csv")[fields]

# Diagnosis
fields = ["PATNO", "EVENT_ID", "PRIMDIAG", "OTHNEURO"]
diag = pd_from_csv("Primary_Clinical_Diagnosis.csv")[fields]

# Dx status
fields = ["PATNO", "COHORT"]
dx_status = pd_from_csv("Participant_Status.csv")[fields]

# PD dx history / disease duration calc
fields = ["PATNO", "EVENT_ID", "PDDXDT"]
pd_hist = pd_from_csv("PD_Diagnosis_History.csv")[fields]

# Cognitive Categorization
fields = ["PATNO", "EVENT_ID", "COGSTATE"]
cog_cat = pd_from_csv("Cognitive_Categorization.csv")[fields]

# UPDRS and Hoehn & Yahr
# Only the cleaned UPDRS file is loaded: an earlier read of the raw
# MDS-UPDRS_Part_III.csv into `updrs` was overwritten on the very next
# statement without being used, so it has been removed.
# NOTE(review): nothing in this notebook creates MDS_UPDRS_Part_III_clean.csv
# (the pd_status import below is commented out), so the file apparently has to
# exist in the study-files directory beforehand — confirm.
# from livingpark_utils.scripts import pd_status
fields = ["PATNO", "EVENT_ID", "PDSTATE", "NP3TOT", "NHY", "PDTRTMNT"]
updrs = pd_from_csv("MDS_UPDRS_Part_III_clean.csv")[fields]

We will also need file `MRI_info.csv` produced by another LivingPark notebook available at https://github.com/LivingPark-MRI/livingpark-utils/blob/main/livingpark_utils/notebooks/mri_metadata.ipynb. This file contains a list of T1-weighted MRI images. 
    

In [340]:
import numpy as np

# Calculate disease duration

from dateutil.parser import parse
from dateutil.relativedelta import relativedelta

# Diagnosis dates recorded at the screening (SC) visit, skipping missing ones.
pdxhist = pd_hist[(pd_hist["EVENT_ID"] == "SC") & pd_hist["PDDXDT"].notna()]

# Visit dates (INFODT) are taken from the cleaned UPDRS table.
fields = ["PATNO", "EVENT_ID", "INFODT"]
pd_dur = pd_from_csv("MDS_UPDRS_Part_III_clean.csv")[fields]

# Attach each subject's diagnosis date to every visit row; subjects without a
# screening diagnosis date get NaN from the map.
PDDXDT_map = dict(zip(pdxhist["PATNO"].values, pdxhist["PDDXDT"].values))
pd_dur["PDDXDT"] = pd_dur["PATNO"].map(PDDXDT_map)


def parse_date(row):
    """Return the whole months elapsed between diagnosis and the visit in ``row``.

    Args:
        row: Mapping or Series holding date strings under ``"INFODT"`` (visit
            date) and ``"PDDXDT"`` (diagnosis date).

    Returns:
        ``years * 12 + months`` of the calendar difference INFODT - PDDXDT, or
        ``np.nan`` when either date is missing.
    """
    # pd.isna recognises None, NaN and NaT; an identity test against np.nan
    # misses NaN objects that are not that exact singleton.
    if pd.isna(row["PDDXDT"]) or pd.isna(row["INFODT"]):
        return np.nan
    elapsed = relativedelta(parse(row["INFODT"]), parse(row["PDDXDT"]))
    # `.months` alone is only the remainder once whole years are split off, so
    # the years have to be folded back in to get the real duration.
    return elapsed.years * 12 + elapsed.months


# Row-wise duration since diagnosis, computed by parse_date.
# NOTE(review): pd_dur is not referenced by any later cell visible in this notebook — confirm it is still needed.
pd_dur["PDXDUR"] = pd_dur.apply(parse_date, axis=1)
# The raw date columns are dropped once the duration has been derived.
pd_dur.drop(labels=["INFODT", "PDDXDT"], inplace=True, axis=1)

In [341]:
# MRI availability

mri = pd_from_csv("MRI_info.csv")
# Expose the visit / subject identifiers under the key names used by the merges
# below (EVENT_ID, PATNO), then drop the source columns. The previous in-place
# recoding of "Sex" was dead code: the column is dropped right here, so it is
# no longer recoded first.
mri = mri.assign(EVENT_ID=mri["Visit code"], PATNO=mri["Subject ID"]).drop(
    columns=["Subject ID", "Visit code", "Visit", "Age", "Sex"]
)
# Displayed as the cell output: the five visits with the most rows.
mri.groupby("EVENT_ID").size().sort_values(ascending=False).head(5)

EVENT_ID
BL     2937
V04     653
V10     468
V06     437
ST       10
dtype: int64

## Pair visits

In [342]:
# Find visit pairs

from collections import defaultdict
from itertools import combinations
from typing import Dict

visit2month = {
    "BL": 0,
    "V01": 3,
    "V02": 6,
    "V03": 9,
    "V04": 12,
    "V05": 18,
    "V06": 24,
    "V07": 30,
    "V08": 36,
    "V09": 42,
    "V10": 48,
    "V11": 54,
    "V12": 60,
    "V13": 72,
    "V14": 84,
    "V15": 96,
    "V16": 108,
    "V17": 120,
    "V18": 132,
    "V19": 144,
    "V20": 156,
}


def find_visit_pairs(months: int) -> Dict[str, str]:
    """Return the pairs of visits closest to each other, given a target time difference in months."""

    closest_diff = float("inf")
    closest_pairs = defaultdict(dict)

    for (visit1, month1), (visit2, month2) in combinations(visit2month.items(), 2):
        current_diff = abs(abs(month1 - month2) - months)
        if current_diff <= closest_diff:
            closest_diff = current_diff
            closest_pairs[closest_diff][visit1] = visit2

    return closest_pairs[closest_diff]

# Select HC

In [343]:
# diagnosis - use screening instead of baseline when PRIMDIAG is missing at baseline

# Diagnosis
# fields = ["PATNO", "EVENT_ID", "PRIMDIAG", "OTHNEURO"]
# diag = pd_from_csv("Primary_Clinical_Diagnosis.csv")[fields]

diag_bl = diag[diag["EVENT_ID"] == "BL"]
diag_other = diag[diag["EVENT_ID"] != "BL"]
# Relabel screening (SC) rows as baseline. A new frame is built instead of
# calling Series.mask(..., inplace=True) on a column of the filtered frame:
# that chained in-place update only raises a (silenced) warning on older pandas
# and has no effect at all once Copy-on-Write is enabled.
# NOTE(review): every SC row is relabelled, not only those of subjects whose
# baseline PRIMDIAG is missing, so a subject can end up with two "BL" rows —
# confirm this matches the intent stated in the comment above.
diag_other = diag_other.assign(
    EVENT_ID=diag_other["EVENT_ID"].mask(diag_other["EVENT_ID"] == "SC", "BL")
)

diag_hc = pd.concat([diag_bl, diag_other]).drop_duplicates()

In [344]:
# merge into a single df
# Start from the MRI rows that have a matching diagnosis row (inner merge), then
# left-join the remaining tables so that missing values do not drop a scan.
df_hc = (
    mri.merge(diag_hc, on=["PATNO", "EVENT_ID"])
    .merge(age, on=["PATNO", "EVENT_ID"], how="left")
    .merge(dem, on=["PATNO"], how="left")
    .merge(dx_status, on=["PATNO"], how="left")  # check
    .merge(edu, on=["PATNO"], how="left")
    .merge(cog_cat, on=["PATNO", "EVENT_ID"], how="left")
    .merge(pd_hist, on=["PATNO", "EVENT_ID"], how="left")
    .drop_duplicates()
    # Keep only subjects that appear at more than one distinct visit.
    .groupby("PATNO")
    .filter(lambda g: g["EVENT_ID"].nunique() > 1)
)

In [345]:
# find how many visit pairs are available for specific group
def find_unique_visit_pairs(df, events, group, get_set_event):
    """Print, for every pair of visits, how many subjects are present at both.

    Args:
        df: Frame handed unchanged to ``get_set_event``.
        events: Visit codes to combine pairwise, in order.
        group: Group label used in the printed header.
        get_set_event: Callable ``(df, event) -> set`` of subject identifiers.

    Pairs without any shared subject are not printed. Returns None.
    """
    print(f"Unique {group} subjects per visit pairs:")
    for first_visit, second_visit in combinations(events, 2):
        shared = get_set_event(df, first_visit) & get_set_event(df, second_visit)
        if not shared:
            continue
        month_gap = visit2month[second_visit] - visit2month[first_visit]
        print(
            f"{first_visit:3} & {second_visit:3} = {len(shared):>3}"
            f" | Month difference: {month_gap}"
        )

In [346]:
# find how many visit pairs are available for HC group
def get_set_event(df, event):
    """Return the set of PATNO values whose row at ``event`` has PRIMDIAG == 17."""
    at_event = df["EVENT_ID"] == event
    has_hc_code = df["PRIMDIAG"] == 17
    return set(df.loc[at_event & has_hc_code, "PATNO"].values)


# Visits screened for pairs; the counts are printed by find_unique_visit_pairs.
events = ["BL", "V04", "V06", "V08", "V10"]
find_unique_visit_pairs(df_hc, events, "HC", get_set_event)

Unique HC subjects per visit pairs:
BL  & V04 = 104 | Month difference: 12
BL  & V06 =  45 | Month difference: 24
BL  & V08 =   2 | Month difference: 36
BL  & V10 = 116 | Month difference: 48
V04 & V06 =  16 | Month difference: 12
V04 & V08 =   1 | Month difference: 24
V04 & V10 =  13 | Month difference: 36
V06 & V08 =   2 | Month difference: 12
V06 & V10 =  19 | Month difference: 24


In [347]:
def pairs_hc(arg):
    """
    Return HC subjects that have a visit pair with the specified time difference in months.

    Args:
        arg: Target spacing between the two visits, in months; forwarded to
            ``find_visit_pairs``.

    Returns:
        Rows of ``df_hc`` joined with the same subject's paired later visit
        (columns of the later visit carry the ``_NX`` suffix), restricted to
        rows where ``PRIMDIAG`` equals 17 at both visits.
    """
    visit_pairs = find_visit_pairs(arg)
    visit_df = df_hc.copy()
    # Visit code each row should be paired with (NaN when the visit has no partner).
    visit_df["NEXT_VISIT"] = visit_df["EVENT_ID"].map(visit_pairs)

    # Self-merge (inner): each row is matched to the row of the same subject at
    # NEXT_VISIT. The dropped columns are therefore not duplicated with an
    # "_NX" suffix.
    visit_df = visit_df.merge(
        visit_df.drop(
            ["AGE_AT_VISIT", "SEX", "NEXT_VISIT", "EDUCYRS"],
            axis=1,
        ),
        left_on=[
            "PATNO",
            "NEXT_VISIT",
        ],
        right_on=[
            "PATNO",
            "EVENT_ID",
        ],
        suffixes=(None, "_NX"),
    ).drop_duplicates()

    return visit_df.loc[(visit_df["PRIMDIAG"] == 17) & (visit_df["PRIMDIAG_NX"] == 17)]

In [348]:
# build database of all available HC
hc_12 = pairs_hc(12)
hc_24 = pairs_hc(24)
hc_36 = pairs_hc(36)
hc = pd.concat([hc_12, hc_24, hc_36], ignore_index=True)
# Keep only rows whose Participant_Status COHORT code is 2 or 4.
hc = hc.loc[hc["COHORT"].isin([2, 4])]
# One row per subject: drop_duplicates keeps the first occurrence, so a
# 12-month pair is preferred over a 24-month one, and 24 over 36.
hc = hc.drop_duplicates(subset=["PATNO"])
hc["dx_group"] = "HC"
print("Unique HC number before selection: ", hc["PATNO"].unique().size)

Unique HC number before selection:  138


# Data aggregation for PD

In [349]:
# Merge into a single df for PD
# Unlike df_hc, the merges on diag, dem, dx_status and cog_cat are inner joins
# here, so a scan is kept only when all of those tables have a matching row.
df = (
    mri.merge(diag, on=["PATNO", "EVENT_ID"])
    .merge(age, on=["PATNO", "EVENT_ID"], how="left")
    .merge(dem, on=["PATNO"])
    .merge(edu, on=["PATNO"], how="left")
    .merge(dx_status, on=["PATNO"])
    .merge(pd_hist, on=["PATNO", "EVENT_ID"], how="left")
    .merge(cog_cat, on=["PATNO", "EVENT_ID"])
    .drop_duplicates()
    # Keep only subjects that appear at more than one distinct visit.
    .groupby("PATNO")
    .filter(lambda g: g["EVENT_ID"].nunique() > 1)
)

In [350]:
# Pair PD-non-MCI
def get_set_event(df, event):
    """Return the PATNO set of rows at ``event`` that meet the PD-non-MCI filter.

    A row qualifies when PRIMDIAG == 1, COGSTATE == 1, COHORT == 1 and
    OTHNEURO is null.
    """
    at_event = df["EVENT_ID"] == event
    pd_codes = (df["PRIMDIAG"] == 1) & (df["COHORT"] == 1)
    cognition = df["COGSTATE"] == 1
    no_other_neuro = df["OTHNEURO"].isnull()
    selected = df.loc[at_event & pd_codes & cognition & no_other_neuro, "PATNO"]
    return set(selected.values)


# Same visit list as for HC; counts use the PD-non-MCI filter defined just above.
events = ["BL", "V04", "V06", "V08", "V10"]
find_unique_visit_pairs(df, events, "PD-non-MCI", get_set_event)

Unique PD-non-MCI subjects per visit pairs:
BL  & V04 = 151 | Month difference: 12
BL  & V06 =  60 | Month difference: 24
V04 & V06 = 123 | Month difference: 12
V04 & V10 =  58 | Month difference: 36
V06 & V10 =  81 | Month difference: 24


In [351]:
# Pair PD-MCI
def get_set_event(df, event):
    """Return the PATNO set of rows at ``event`` that meet the PD-MCI filter.

    A row qualifies when PRIMDIAG == 1, COGSTATE == 2, COHORT == 1 and
    OTHNEURO is null.
    """
    at_event = df["EVENT_ID"] == event
    pd_codes = (df["PRIMDIAG"] == 1) & (df["COHORT"] == 1)
    cognition = df["COGSTATE"] == 2
    no_other_neuro = df["OTHNEURO"].isnull()
    selected = df.loc[at_event & pd_codes & cognition & no_other_neuro, "PATNO"]
    return set(selected.values)


# Same visit list again; counts use the PD-MCI filter defined just above.
events = ["BL", "V04", "V06", "V08", "V10"]
find_unique_visit_pairs(df, events, "PD-MCI", get_set_event)

Unique PD-MCI subjects per visit pairs:
BL  & V04 =  12 | Month difference: 12
BL  & V06 =   3 | Month difference: 24
V04 & V06 =  11 | Month difference: 12
V04 & V10 =   6 | Month difference: 36
V06 & V10 =  16 | Month difference: 24


# Select PD-MCI patients

In [352]:
def get_mci_patients(df):
    """Keep the paired-visit rows that meet the PD-MCI filter at both visits.

    At the first visit and at the paired visit (``_NX`` columns) a row must
    have COGSTATE == 2, PRIMDIAG == 1, COHORT == 1 and a null OTHNEURO.
    """
    first_visit_ok = (
        (df["COGSTATE"] == 2)
        & (df["PRIMDIAG"] == 1)
        & (df["COHORT"] == 1)
        & df["OTHNEURO"].isnull()
    )
    paired_visit_ok = (
        (df["COGSTATE_NX"] == 2)
        & (df["PRIMDIAG_NX"] == 1)
        & (df["COHORT_NX"] == 1)
        & df["OTHNEURO_NX"].isnull()
    )
    return df.loc[first_visit_ok & paired_visit_ok]


def pairs_mci(arg):
    """Return PD-MCI rows of ``df`` that have a visit pair ``arg`` months apart.

    Args:
        arg: Target spacing between the two visits, in months; forwarded to
            ``find_visit_pairs``.

    Returns:
        Rows of ``df`` joined with the same subject's paired later visit
        (``_NX`` suffix), filtered by ``get_mci_patients``.
    """
    visit_pairs = find_visit_pairs(arg)
    visit_df = df.copy()
    # Visit code each row should be paired with (NaN when the visit has no partner).
    visit_df["NEXT_VISIT"] = visit_df["EVENT_ID"].map(visit_pairs)

    # Self-merge (inner): match each row to the same subject's row at NEXT_VISIT.
    visit_df = visit_df.merge(
        visit_df.drop(
            ["AGE_AT_VISIT", "SEX", "NEXT_VISIT", "EDUCYRS"],
            axis=1,
        ),
        left_on=[
            "PATNO",
            "NEXT_VISIT",
        ],
        right_on=[
            "PATNO",
            "EVENT_ID",
        ],
        suffixes=(None, "_NX"),
    ).drop_duplicates()

    return get_mci_patients(visit_df)

In [353]:
# Pairs are added in order of increasing spacing; after each step only the first
# row per subject is kept, so a subject's shortest available spacing is used.
mci = pairs_mci(12)
mci = mci.drop_duplicates(subset=["PATNO"])

mci_24 = pairs_mci(24)
mci = pd.concat([mci, mci_24], ignore_index=True)
mci = mci.drop_duplicates(subset=["PATNO"])

mci_36 = pairs_mci(36)
mci = pd.concat([mci, mci_36], ignore_index=True)
mci = mci.drop_duplicates(subset=["PATNO"])

mci["dx_group"] = "PD-MCI"
print("There are ", len(mci), " PD-MCI patients.")

There are  36  PD-MCI patients.


# Select PD-non-MCI patients

In [354]:
def get_nonmci_patients(df):
    """Keep the paired-visit rows that meet the PD-non-MCI filter at both visits.

    At the first visit and at the paired visit (``_NX`` columns) a row must
    have COGSTATE == 1, PRIMDIAG == 1, COHORT == 1 and a null OTHNEURO.
    """
    first_visit_ok = (
        (df["COGSTATE"] == 1)
        & (df["PRIMDIAG"] == 1)
        & (df["COHORT"] == 1)
        & df["OTHNEURO"].isnull()
    )
    paired_visit_ok = (
        (df["COGSTATE_NX"] == 1)
        & (df["PRIMDIAG_NX"] == 1)
        & (df["COHORT_NX"] == 1)
        & df["OTHNEURO_NX"].isnull()
    )
    return df.loc[first_visit_ok & paired_visit_ok]


def pairs_nonmci(arg):
    """Return PD-non-MCI rows of ``df`` that have a visit pair ``arg`` months apart.

    Args:
        arg: Target spacing between the two visits, in months; forwarded to
            ``find_visit_pairs``.

    Returns:
        Rows of ``df`` joined with the same subject's paired later visit
        (``_NX`` suffix), filtered by ``get_nonmci_patients``.
    """
    visit_pairs = find_visit_pairs(arg)
    visit_df = df.copy()
    # Visit code each row should be paired with (NaN when the visit has no partner).
    visit_df["NEXT_VISIT"] = visit_df["EVENT_ID"].map(visit_pairs)

    # Self-merge (inner): match each row to the same subject's row at NEXT_VISIT.
    visit_df = visit_df.merge(
        visit_df.drop(
            ["AGE_AT_VISIT", "SEX", "NEXT_VISIT", "EDUCYRS"],
            axis=1,
        ),
        left_on=[
            "PATNO",
            "NEXT_VISIT",
        ],
        right_on=[
            "PATNO",
            "EVENT_ID",
        ],
        suffixes=(None, "_NX"),
    ).drop_duplicates()

    return get_nonmci_patients(visit_df)

In [355]:
wo_mci_12 = pairs_nonmci(12)
wo_mci_24 = pairs_nonmci(24)
wo_mci_36 = pairs_nonmci(36)
wo_mci_all = pd.concat([wo_mci_12, wo_mci_24, wo_mci_36], ignore_index=True)
# One row per subject; the first occurrence wins, i.e. the shortest spacing available.
wo_mci_all = wo_mci_all.drop_duplicates(subset=["PATNO"])
wo_mci_all["dx_group"] = "PD-non-MCI"

print("There are ", len(wo_mci_all), " PD-non-MCI patients.")

There are  273  PD-non-MCI patients.


## cohort to download

In [357]:
# Stack the three groups. If a subject appeared in more than one group, the row
# from the earliest listed group (PD-MCI, then PD-non-MCI, then HC) would be kept.
cohort = pd.concat([mci, wo_mci_all, hc], ignore_index=True)
cohort = cohort.drop_duplicates(subset=["PATNO"])
print("There are ", len(cohort), " unique subjects.")

There are  447  unique subjects.


In [358]:
# calculate time difference between the visits (Duration T2-T1)
# "Study Date" / "Study Date_NX" are the scan dates of the first and the paired visit.
cohort["Study Date"] = pd.to_datetime(cohort["Study Date"])
cohort["Study Date_NX"] = pd.to_datetime(cohort["Study Date_NX"])
# Timedelta between the two scans, and the same span expressed in years.
cohort["durationT2_T1"] = cohort["Study Date_NX"] - cohort["Study Date"]
cohort["durationT2_T1_y"] = cohort["durationT2_T1"].dt.days / 365.25

## Descriptive statistics (computational cohort)

To calculate descriptive statistics we exclude images that failed preprocessing in the next steps. PPMI's Data Usage Agreement prevents us from publicly sharing subjects' identifiers.

In [359]:
def set_patno_id(df):
    """Add a ``PATNO_id`` column of the form ``sub-<PATNO>_ses-<EVENT_ID>`` to ``df``.

    The frame is modified in place; nothing is returned.
    """
    subject_part = "sub-" + df["PATNO"].astype(str)
    session_part = "_ses-" + df["EVENT_ID"]
    df["PATNO_id"] = subject_part + session_part

In [None]:
def into_cross_sectional(cohort: pd.DataFrame, verbose: bool = False) -> pd.DataFrame:
    """Convert the paired-visit cohort into one row per subject and visit.

    Columns ending in ``_NX`` describe the follow-up visit and their
    un-suffixed counterparts the baseline visit; every other column except
    ``NEXT_VISIT`` is shared by both visits. Each input row yields a baseline
    row (``timepoint == "T1"``) and a follow-up row (``timepoint == "T2"``).
    The follow-up ``AGE_AT_VISIT`` is the shared ``AGE_AT_VISIT`` plus
    ``durationT2_T1_y``, rounded to one decimal.

    Args:
        cohort: Paired-visit frame; must contain ``NEXT_VISIT``, ``PATNO``,
            ``EVENT_ID``, ``AGE_AT_VISIT`` and ``durationT2_T1_y``.
        verbose: Print the column partition when True.

    Returns:
        The stacked frame, sorted by ``PATNO_id``, with ``EVENT_ID`` renamed to
        ``visit``. Columns follow the input column order.

    Raises:
        AssertionError: If baseline and follow-up frames end up with different columns.
    """
    suffix = "_NX"
    columns_all = cohort.columns.to_list()
    columns_followup = [col for col in columns_all if col.endswith(suffix)]
    # Strip only the trailing suffix; str.replace would also remove an inner "_NX".
    columns_baseline = [col[: -len(suffix)] for col in columns_followup]
    columns_followup += ["NEXT_VISIT"]
    visit_specific = set(columns_baseline) | set(columns_followup)
    # Keep the input column order. Going through list(set(...)) made the order
    # of the shared columns, and of the saved CSV header, vary between runs.
    columns_shared = [col for col in columns_all if col not in visit_specific]
    if verbose:
        print(f"All columns {columns_all}")
        print(f"Follow-up columns {columns_followup}")
        print(f"Baseline columns {columns_baseline}")
        print(f"Shared columns {columns_shared}")

    cohort_stat_baseline = cohort[columns_shared + columns_baseline].copy()
    cohort_stat_followup = (
        cohort[columns_shared + columns_followup]
        .rename(
            columns={
                col: col[: -len(suffix)]
                for col in columns_followup
                if col != "NEXT_VISIT"
            }
        )
        # NEXT_VISIT duplicates the follow-up EVENT_ID and is not kept.
        .drop(columns=["NEXT_VISIT"])
    )
    cohort_stat_followup["AGE_AT_VISIT"] = (
        cohort_stat_followup["AGE_AT_VISIT"] + cohort_stat_followup["durationT2_T1_y"]
    ).round(1)

    # PATNO_id must be built while the visit code is still called EVENT_ID.
    set_patno_id(cohort_stat_baseline)
    set_patno_id(cohort_stat_followup)

    cohort_stat_baseline = cohort_stat_baseline.rename(columns={"EVENT_ID": "visit"})
    cohort_stat_followup = cohort_stat_followup.rename(columns={"EVENT_ID": "visit"})

    assert (
        cohort_stat_baseline.columns.to_list() == cohort_stat_followup.columns.to_list()
    ), "baseline and follow-up frames must expose the same columns"

    cohort_stat_baseline["timepoint"] = "T1"
    cohort_stat_followup["timepoint"] = "T2"

    cohort_stat = pd.concat(
        [cohort_stat_baseline, cohort_stat_followup], ignore_index=True
    )
    return cohort_stat.sort_values(by=["PATNO_id"])

In [387]:
# One row per subject and visit, for the complete cohort built above.
cohort_stat_raw = into_cross_sectional(cohort)
unique_patno_ids = cohort_stat_raw["PATNO"].nunique()
filename = cohort_dir / "cross-sectional_cohort_raw.csv"
cohort_stat_raw.to_csv(filename, index=False)
print(f"{unique_patno_ids} unique subjects found")
print(f"Saved cross-sectional_cohort_raw.csv to {anondir(filename.resolve())}")

447 unique subjects found
Saved cross-sectional_cohort_raw.csv to <living-park>/cohort/cross-sectional_cohort_raw.csv


In [388]:
# Execution statistics file; its "subject_visit" column is matched against PATNO_id.
vip_results = pd.read_csv(root_dir / "cohort" / "vip_executions_stats_info.csv")
# Select only the subject visits that have been processed on the VIP platform,
# since rerunning the cohort builder will download new MRI metadata
cohort_stat_exp = cohort_stat_raw[
    cohort_stat_raw["PATNO_id"].isin(vip_results["subject_visit"])
]
unique_patno_ids = cohort_stat_exp["PATNO"].nunique()
filename = cohort_dir / "cross-sectional_cohort_exp.csv"
cohort_stat_exp.to_csv(filename, index=False)
print(f"{unique_patno_ids} unique subjects found")
print(f"Saved cross-sectional_cohort_exp.csv to {anondir(filename.resolve())}")

317 unique subjects found
Saved cross-sectional_cohort_exp.csv to <living-park>/cohort/cross-sectional_cohort_exp.csv


In [389]:
# Quality-control result file; its "subject_visit" column is matched against PATNO_id.
qc_results = pd.read_csv(
    root_dir
    / "cohort"
    / "vip_executions_stats_info_2visits_passed_qc_with_26_repetitions.csv"
)
# Keep only the subject visits listed in the QC file.
cohort_stat_qced = cohort_stat_raw[
    cohort_stat_raw["PATNO_id"].isin(qc_results["subject_visit"])
]
# Counts distinct PATNO values, i.e. subjects rather than subject visits.
unique_patno_ids = cohort_stat_qced["PATNO"].nunique()
filename = cohort_dir / "cross-sectional_cohort_qced.csv"
cohort_stat_qced.to_csv(filename, index=False)
print(f"{unique_patno_ids} unique PATNO id found")
print(f"Saved cross-sectional_cohort_qced.csv to {anondir(filename.resolve())}")

267 unique PATNO id found
Saved cross-sectional_cohort_qced.csv to <living-park>/cohort/cross-sectional_cohort_qced.csv


In [307]:
from collections.abc import Iterable

import rich
from rich.console import Console
from rich.table import Table


def cohort_summary(*, hc, nc, mci, title):
    """Print a rich table of descriptive statistics for the three groups.

    Args:
        hc: Rows of the HC group.
        nc: Rows of the PD-non-MCI group.
        mci: Rows of the PD-MCI group.
        title: Title shown above the table.

    Each frame needs the columns PATNO, AGE_AT_VISIT, SEX and EDUCYRS.
    """

    def gen_row(D, *, agg, col, f="4.1f", sep=" ± "):
        """Format one table row: statistic(s) ``agg`` of ``col`` for each describe() frame in ``D``."""
        # Several statistics (e.g. mean and std) are joined with `sep` in one cell.
        if not isinstance(agg, str) and isinstance(agg, Iterable):
            return [f"{sep}".join([f"{d.loc[a][col]:{f}}" for a in agg]) for d in D]
        else:
            return [f"{d.loc[agg][col]:{f}}" for d in D]

    def gender_ratio(df):
        """Return "<count>, <percent>%" of unique subjects with SEX == 1."""
        male_count = df[df["SEX"] == 1]["PATNO"].nunique()
        return f"{male_count:.0f}, {male_count / df['PATNO'].nunique() * 100:.1f}%"

    # describe() supplies the count / mean / std / min / max rows read by gen_row.
    D = [hc.describe(), nc.describe(), mci.describe()]

    table = Table(title=title, box=rich.box.SIMPLE_HEAVY, show_footer=True)

    table.add_column("Subject groups", footer="Values expressed as mean ± SD.")
    table.add_column("HC")
    table.add_column("PD-non-MCI")
    table.add_column("PD-MCI")

    table.add_row("n", *gen_row(D, agg="count", col="PATNO", f=".0f"))
    table.add_row("Age (y)", *gen_row(D, agg=["mean", "std"], col="AGE_AT_VISIT"))
    table.add_row(
        "Age range", *gen_row(D, agg=["min", "max"], col="AGE_AT_VISIT", sep=" - ")
    )
    table.add_row(
        "Gender (male, %)", gender_ratio(hc), gender_ratio(nc), gender_ratio(mci)
    )
    table.add_row("Education (y)", *gen_row(D, agg=["mean", "std"], col="EDUCYRS"))

    console = Console()
    console.print(table)

### Demographic of raw data

In [None]:
def show_demographics(cohort, timepoint="T1"):
    """Print the demographics table for the rows of ``cohort`` at ``timepoint``.

    Args:
        cohort: Cross-sectional frame with ``timepoint`` and ``dx_group`` columns.
        timepoint: Value of ``timepoint`` to summarise (default ``"T1"``).
    """
    at_timepoint = cohort[cohort["timepoint"] == timepoint]
    by_group = {
        label: at_timepoint[at_timepoint["dx_group"] == label]
        for label in ("HC", "PD-non-MCI", "PD-MCI")
    }
    cohort_summary(
        hc=by_group["HC"],
        nc=by_group["PD-non-MCI"],
        mci=by_group["PD-MCI"],
        title="Demographic and clinical characteristics",
    )

In [392]:
# Baseline (T1) demographics of the full cohort.
show_demographics(cohort_stat_raw)

### Demographics of data preprocessed with freesurfer-fuzzy

In [393]:
# Baseline (T1) demographics of the subject visits found in the VIP results file.
show_demographics(cohort_stat_exp)

### Demographic of data QCed with 26 repetitions

In [394]:
# Baseline (T1) demographics of the subject visits found in the QC results file.
show_demographics(cohort_stat_qced)

In [None]:
from scipy.stats import ttest_ind, chi2_contingency
import numpy as np
import pandas as pd
from rich.console import Console
from rich.table import Table
from rich import box

# Module-level rich console used by show_group_diff_rich for printing.
console = Console()


def test_group_diff_age(group1, group2):
    t, p = ttest_ind(group1["AGE_AT_VISIT"], group2["AGE_AT_VISIT"])
    return t, p


def test_group_diff_edu(group1, group2):
    t, p = ttest_ind(group1["EDUCYRS"], group2["EDUCYRS"], nan_policy="omit")
    return t, p


def test_group_diff_duration(group1, group2):
    t, p = ttest_ind(group1["durationT2_T1_y"], group2["durationT2_T1_y"])
    return t, p


def test_group_diff_sex(group1, group2):
    a1 = group1["SEX"].value_counts()[0]
    a2 = group1["SEX"].value_counts()[1]
    b1 = group2["SEX"].value_counts()[0]
    b2 = group2["SEX"].value_counts()[1]

    obs = np.array([[a1, a2], [b1, b2]])
    chi2_test = chi2_contingency(obs)
    return chi2_test


def show_group_diff_rich(
    group1,
    group2,
    alpha: float = 0.05,
):
    """
    Computes:
        - Age t-test
        - Education t-test
        - Duration t-test
        - Sex chi² test
    And prints results in a Rich table with:
        - bold red p-values if significant (< alpha)
        - bold statistics for significant results

    Args:
        group1: First group; needs the columns AGE_AT_VISIT, EDUCYRS,
            durationT2_T1_y and SEX.
        group2: Second group, with the same columns.
        alpha: Significance threshold applied to every p-value.

    The table and the expected-frequency table of the chi² test are printed
    through the module-level ``console``; nothing is returned.
    """
    # ---- Compute tests using your existing functions ----
    t_age, p_age = test_group_diff_age(group1, group2)
    t_edu, p_edu = test_group_diff_edu(group1, group2)
    t_dur, p_dur = test_group_diff_duration(group1, group2)
    # The third value (degrees of freedom) is not displayed.
    chi2_stat, chi2_p, _, chi2_table = test_group_diff_sex(group1, group2)

    # ---- Rich Table ----
    table = Table(
        title="Group Difference Tests",
        box=box.SIMPLE_HEAVY,
        show_lines=False,
        header_style="bold cyan",
    )

    table.add_column("Variable", justify="left")
    table.add_column("Statistic", justify="right")
    table.add_column("p-value", justify="right")
    table.add_column("Significant", justify="center")

    # Helper to format and color p-values
    def fmt_p(p):
        if p < alpha:
            return f"[bold red]{p:.4f}[/]"
        else:
            return f"{p:.4f}"

    # Helper to embolden the statistic of a significant test
    def fmt_stat(stat, p):
        if p < alpha:
            return f"[bold]{stat:.4f}[/]"
        else:
            return f"{stat:.4f}"

    # Add rows
    table.add_row(
        "Age", fmt_stat(t_age, p_age), fmt_p(p_age), "✅" if p_age < alpha else "–"
    )

    table.add_row(
        "Education",
        fmt_stat(t_edu, p_edu),
        fmt_p(p_edu),
        "✅" if p_edu < alpha else "–",
    )

    table.add_row(
        "Duration (T2 - T1)",
        fmt_stat(t_dur, p_dur),
        fmt_p(p_dur),
        "✅" if p_dur < alpha else "–",
    )

    table.add_row(
        "Sex (Chi²)",
        fmt_stat(chi2_stat, chi2_p),
        fmt_p(chi2_p),
        "✅" if chi2_p < alpha else "–",
    )

    console.print(table)

    # Optional: print contingency table below
    # (chi2_table holds the expected frequencies returned by chi2_contingency)
    console.print("\n[bold]Sex contingency table:[/]")
    console.print(pd.DataFrame(chi2_table))


def show_group_differences(cohort, timepoint="T1"):
    """Print the group-difference tests of all PD patients versus HC.

    Args:
        cohort: Cross-sectional frame with ``timepoint`` and ``dx_group`` columns.
        timepoint: Value of ``timepoint`` to analyse (default ``"T1"``).

    PD-non-MCI and PD-MCI rows are pooled into one group before the comparison.
    """
    snapshot = cohort[cohort["timepoint"] == timepoint]

    def members(label):
        return snapshot[snapshot["dx_group"] == label]

    patients = pd.concat([members("PD-non-MCI"), members("PD-MCI")], ignore_index=True)
    show_group_diff_rich(patients, members("HC"))

In [402]:
# PD (both subgroups pooled) versus HC at T1, full cohort.
show_group_differences(cohort_stat_raw)

In [403]:
# PD (both subgroups pooled) versus HC at T1, subject visits found in the VIP results file.
show_group_differences(cohort_stat_exp)

In [404]:
# PD (both subgroups pooled) versus HC at T1, subject visits found in the QC results file.
show_group_differences(cohort_stat_qced)

# Define PD cohorts with UPDRS

In [315]:
# Merge into a single df for PD

# Same pipeline as `df`, with one extra inner merge on the cleaned UPDRS table:
# only visits that have a UPDRS row are kept.
df_clinical = (
    mri.merge(diag, on=["PATNO", "EVENT_ID"])
    .merge(age, on=["PATNO", "EVENT_ID"], how="left")
    .merge(dem, on=["PATNO"])
    .merge(edu, on=["PATNO"], how="left")
    .merge(dx_status, on=["PATNO"])
    .merge(pd_hist, on=["PATNO", "EVENT_ID"], how="left")
    .merge(cog_cat, on=["PATNO", "EVENT_ID"])
    .merge(updrs, on=["PATNO", "EVENT_ID"])  #
    .drop_duplicates()
    # Keep only subjects that appear at more than one distinct visit.
    .groupby("PATNO")
    .filter(lambda g: g["EVENT_ID"].nunique() > 1)
)

In [316]:
# find PD-non-MCI with UPDRS score


def get_nonmci_with_UPDRS_patients(df):
    """Keep paired-visit rows meeting the PD-non-MCI filter with PDSTATE "OFF" at both visits.

    At the first visit and at the paired visit (``_NX`` columns) a row must
    have COGSTATE == 1, PRIMDIAG == 1, COHORT == 1, a null OTHNEURO and
    PDSTATE == "OFF".
    """
    first_visit_ok = (
        (df["COGSTATE"] == 1)
        & (df["PRIMDIAG"] == 1)
        & (df["COHORT"] == 1)
        & df["OTHNEURO"].isnull()
        & (df["PDSTATE"] == "OFF")
    )
    paired_visit_ok = (
        (df["COGSTATE_NX"] == 1)
        & (df["PRIMDIAG_NX"] == 1)
        & (df["COHORT_NX"] == 1)
        & df["OTHNEURO_NX"].isnull()
        & (df["PDSTATE_NX"] == "OFF")
    )
    return df.loc[first_visit_ok & paired_visit_ok]


def pairs_nonmci_with_UPDRS(arg):
    """Return PD-non-MCI rows of ``df_clinical`` with a visit pair ``arg`` months apart.

    Args:
        arg: Target spacing between the two visits, in months; forwarded to
            ``find_visit_pairs``.

    Returns:
        Rows of ``df_clinical`` joined with the same subject's paired later
        visit (``_NX`` suffix), filtered by ``get_nonmci_with_UPDRS_patients``.
    """
    visit_pairs = find_visit_pairs(arg)
    visit_df = df_clinical.copy()
    # Visit code each row should be paired with (NaN when the visit has no partner).
    visit_df["NEXT_VISIT"] = visit_df["EVENT_ID"].map(visit_pairs)

    # Self-merge (inner): match each row to the same subject's row at NEXT_VISIT.
    visit_df = visit_df.merge(
        visit_df.drop(
            ["AGE_AT_VISIT", "SEX", "NEXT_VISIT", "EDUCYRS"],
            axis=1,
        ),
        left_on=[
            "PATNO",
            "NEXT_VISIT",
        ],
        right_on=[
            "PATNO",
            "EVENT_ID",
        ],
        suffixes=(None, "_NX"),
    ).drop_duplicates()

    return get_nonmci_with_UPDRS_patients(visit_df)

In [317]:
# NOTE: these assignments rebind the wo_mci_* names that the earlier
# "Select PD-non-MCI patients" section also assigns, so their content depends
# on which of the two cells ran last.
wo_mci_12 = pairs_nonmci_with_UPDRS(12)
wo_mci_24 = pairs_nonmci_with_UPDRS(24)
wo_mci_36 = pairs_nonmci_with_UPDRS(36)
wo_mci_all = pd.concat([wo_mci_12, wo_mci_24, wo_mci_36], ignore_index=True)
# One row per subject; the first occurrence wins, i.e. the shortest spacing available.
wo_mci_all = wo_mci_all.drop_duplicates(subset=["PATNO"])
wo_mci_all["dx_group"] = "PD-non-MCI"

# Require a UPDRS III total score at both visits.
wo_mci_all = wo_mci_all[~wo_mci_all["NP3TOT"].isna()]
wo_mci_all = wo_mci_all[~wo_mci_all["NP3TOT_NX"].isna()]

print("There are ", len(wo_mci_all), " PD-non-MCI patients.")

There are  139  PD-non-MCI patients.


In [318]:
# calculate the time difference between the visits
# pd_clinical_raw is another name for the wo_mci_all object, not a copy: the
# columns added below are added to that same object.
pd_clinical_raw = wo_mci_all

pd_clinical_raw["Study Date"] = pd.to_datetime(pd_clinical_raw["Study Date"])
pd_clinical_raw["Study Date_NX"] = pd.to_datetime(pd_clinical_raw["Study Date_NX"])
pd_clinical_raw["durationT2_T1"] = (
    pd_clinical_raw["Study Date_NX"] - pd_clinical_raw["Study Date"]
)
pd_clinical_raw["durationT2_T1_y"] = pd_clinical_raw["durationT2_T1"].dt.days / 365.25

# visit ID
set_patno_id(pd_clinical_raw)

# calculate the change of UPDRS score
# NOTE(review): computed as first visit minus paired visit, so a positive value
# means the score was lower at the paired visit — confirm this sign convention.
pd_clinical_raw["NP3TOT_change"] = (
    pd_clinical_raw["NP3TOT"] - pd_clinical_raw["NP3TOT_NX"]
)

unique_patno_ids = pd_clinical_raw["PATNO_id"].nunique()
print(f"{unique_patno_ids} unique PD PATNO id found")
filename = cohort_dir / "pd_longitudinal_raw.csv"
pd_clinical_raw.to_csv(filename, index=False)
print(f"Saved pd_longitudinal_raw.csv to {anondir(filename.resolve())}")

139 unique PD PATNO id found
Saved pd_longitudinal_raw.csv to <living-park>/cohort/pd_longitudinal_raw.csv


In [319]:
# Keep the rows whose PATNO_id (built from the first visit) appears in the VIP results file.
pd_clinical_exp = pd_clinical_raw[
    pd_clinical_raw["PATNO_id"].isin(vip_results["subject_visit"])
]
unique_patno_ids = pd_clinical_exp["PATNO_id"].nunique()
print(f"{unique_patno_ids} unique PD PATNO id found")
filename = cohort_dir / "pd_longitudinal_exp.csv"
pd_clinical_exp.to_csv(filename, index=False)
print(f"Saved pd_longitudinal_exp.csv to {anondir(filename.resolve())}")

125 unique PD PATNO id found
Saved pd_longitudinal_exp.csv to <living-park>/cohort/pd_longitudinal_exp.csv


In [320]:
# Keep the rows whose PATNO_id (built from the first visit) appears in the QC results file.
pd_clinical_qced = pd_clinical_raw[
    pd_clinical_raw["PATNO_id"].isin(qc_results["subject_visit"])
]
unique_patno_ids = pd_clinical_qced["PATNO_id"].nunique()
print(f"{unique_patno_ids} unique PD PATNO id found")
filename = cohort_dir / "pd_longitudinal_qced.csv"
pd_clinical_qced.to_csv(filename, index=False)
print(f"Saved pd_longitudinal_qced.csv to {anondir(filename.resolve())}")

112 unique PD PATNO id found
Saved pd_longitudinal_qced.csv to <living-park>/cohort/pd_longitudinal_qced.csv


## Descriptive statistics (clinical cohort)

In [321]:
from collections.abc import Iterable

import rich
from rich.console import Console
from rich.table import Table


def clinical_cohort_summary(*, hc, nc, title):
    """Print a rich table of descriptive statistics for HC and PD-non-MCI.

    Args:
        hc: Rows of the HC group.
        nc: Rows of the PD-non-MCI group.
        title: Title shown above the table.

    Both frames need PATNO, AGE_AT_VISIT, SEX, EDUCYRS and durationT2_T1_y;
    ``nc`` additionally needs NP3TOT and NP3TOT_NX.
    """

    def gen_row(D, *, agg, col, f="4.1f", sep=" ± "):
        """Format one table row: statistic(s) ``agg`` of ``col`` for each describe() frame in ``D``."""
        # Several statistics (e.g. mean and std) are joined with `sep` in one cell.
        if not isinstance(agg, str) and isinstance(agg, Iterable):
            return [f"{sep}".join([f"{d.loc[a][col]:{f}}" for a in agg]) for d in D]
        else:
            return [f"{d.loc[agg][col]:{f}}" for d in D]

    def gender_ratio(df):
        """Return "<count>, <percent>%" of unique subjects with SEX == 1."""
        male_count = df[df["SEX"] == 1]["PATNO"].nunique()
        return f"{male_count:.0f}, {male_count / df['PATNO'].nunique() * 100:.1f}%"

    # describe() supplies the count / mean / std / min / max rows read by gen_row.
    D = [hc.describe(), nc.describe()]

    table = Table(title=title, box=rich.box.SIMPLE_HEAVY, show_footer=True)

    table.add_column("Subject groups", footer="Values expressed as mean ± SD.")
    table.add_column("HC")
    table.add_column("PD-non-MCI")
    # table.add_column("[italic]p")  # TODO

    table.add_row("n", *gen_row(D, agg="count", col="PATNO", f=".0f"))
    table.add_row("Age (y)", *gen_row(D, agg=["mean", "std"], col="AGE_AT_VISIT"))
    table.add_row(
        "Age range", *gen_row(D, agg=["min", "max"], col="AGE_AT_VISIT", sep=" - ")
    )
    table.add_row("Gender (male, %)", gender_ratio(hc), gender_ratio(nc))
    table.add_row("Education (y)", *gen_row(D, agg=["mean", "std"], col="EDUCYRS"))
    # UPDRS rows: the HC cell is left empty and only the PD frame (D[1:]) is summarised.
    table.add_row(
        "UPDRS III OFF baseline", "", *gen_row(D[1:], agg=["mean", "std"], col="NP3TOT")
    )
    table.add_row(
        "UPDRS III OFF follow-up",
        "",
        *gen_row(D[1:], agg=["mean", "std"], col="NP3TOT_NX"),
    )
    table.add_row(
        "Duration T2 - T1 (y)", *gen_row(D, agg=["mean", "std"], col="durationT2_T1_y")
    )

    console = Console()
    console.print(table)

### Demographic clinical raw population

In [322]:
# Reload the PD table from disk; after the CSV round trip the date and timedelta
# columns come back as plain text.
filename = cohort_dir / "pd_longitudinal_raw.csv"
pd_clinical_raw = pd.read_csv(filename)
filename = cohort_dir / "hc_longitudinal_raw.csv"
# NOTE(review): hc_raw is not defined in any cell visible in this notebook, so
# this line raises NameError on a fresh kernel — restore the cell that builds it.
hc_raw.to_csv(filename, index=False)

pd_nonmci_clinical_raw = pd_clinical_raw[pd_clinical_raw["dx_group"] == "PD-non-MCI"]
# NOTE(review): written to the current working directory, unlike the other
# outputs, which go to cohort_dir — confirm this is intended.
pd_nonmci_clinical_raw.to_csv("pd_nonmci_longitudinal_raw.csv", index=False)

clinical_cohort_summary(
    hc=hc_raw,
    nc=pd_nonmci_clinical_raw,
    title="Demographic and clinical characteristics",
)

In [323]:
# Reload the PD table from disk; after the CSV round trip the date and timedelta
# columns come back as plain text.
filename = cohort_dir / "pd_longitudinal_exp.csv"
pd_clinical_exp = pd.read_csv(filename)
filename = cohort_dir / "hc_longitudinal_exp.csv"
# NOTE(review): hc_exp is not defined in any cell visible in this notebook, so
# this line raises NameError on a fresh kernel — restore the cell that builds it.
hc_exp.to_csv(filename, index=False)

pd_nonmci_clinical_exp = pd_clinical_exp[pd_clinical_exp["dx_group"] == "PD-non-MCI"]
# NOTE(review): written to the current working directory, unlike the other
# outputs, which go to cohort_dir — confirm this is intended.
pd_nonmci_clinical_exp.to_csv("pd_nonmci_longitudinal_exp.csv", index=False)

clinical_cohort_summary(
    hc=hc_exp,
    nc=pd_nonmci_clinical_exp,
    title="Demographic and clinical characteristics",
)

In [324]:
# Reload the PD table from disk; after the CSV round trip the date and timedelta
# columns come back as plain text.
filename = cohort_dir / "pd_longitudinal_qced.csv"
pd_clinical_qced = pd.read_csv(filename)
filename = cohort_dir / "hc_longitudinal_qced.csv"
# NOTE(review): hc_qced is not defined in any cell visible in this notebook, so
# this line raises NameError on a fresh kernel — restore the cell that builds it.
hc_qced.to_csv(filename, index=False)

pd_nonmci_clinical_qced = pd_clinical_qced[pd_clinical_qced["dx_group"] == "PD-non-MCI"]
# NOTE(review): written to the current working directory, unlike the other
# outputs, which go to cohort_dir — confirm this is intended.
pd_nonmci_clinical_qced.to_csv("pd_nonmci_longitudinal_qced.csv", index=False)

clinical_cohort_summary(
    hc=hc_qced,
    nc=pd_nonmci_clinical_qced,
    title="Demographic and clinical characteristics",
)

In [325]:
# PD-non-MCI versus HC, raw clinical cohort.
show_group_diff_rich(pd_nonmci_clinical_raw, hc_raw, alpha=0.05)

In [326]:
# PD-non-MCI versus HC, clinical cohort restricted to the VIP results file.
show_group_diff_rich(pd_nonmci_clinical_exp, hc_exp, alpha=0.05)

In [327]:
# PD-non-MCI versus HC, clinical cohort restricted to the QC results file.
show_group_diff_rich(pd_nonmci_clinical_qced, hc_qced, alpha=0.05)

In [328]:
def build_clinical_cohort(df):
    """Add ``first_visit``, ``second_visit`` and ``PATNO_id`` identifiers to ``df``.

    ``first_visit`` is built from EVENT_ID and ``second_visit`` from
    EVENT_ID_NX, both in the form ``sub-<PATNO>_ses-<visit>``. The frame is
    modified in place and also returned.
    """
    subject_part = "sub-" + df["PATNO"].astype(str)
    df["first_visit"] = subject_part + "_ses-" + df["EVENT_ID"]
    df["second_visit"] = subject_part + "_ses-" + df["EVENT_ID_NX"]
    set_patno_id(df)
    return df

In [329]:
# Stack PD-non-MCI and HC rows, add the visit identifiers and save the result.
clinical_cohort_raw = pd.concat([pd_nonmci_clinical_raw, hc_raw], ignore_index=True)
clinical_cohort_raw = build_clinical_cohort(clinical_cohort_raw)
filename = cohort_dir / "longitudinal_cohort_raw.csv"
clinical_cohort_raw.to_csv(filename, index=False)
print(f"Saved longitudinal_cohort_raw.csv to {anondir(filename.resolve())}")

Saved longitudinal_cohort_raw.csv to <living-park>/cohort/longitudinal_cohort_raw.csv


In [330]:
# Stack PD-non-MCI and HC rows, add the visit identifiers and save the result.
clinical_cohort_exp = pd.concat([pd_nonmci_clinical_exp, hc_exp], ignore_index=True)
clinical_cohort_exp = build_clinical_cohort(clinical_cohort_exp)
filename = cohort_dir / "longitudinal_cohort_exp.csv"
clinical_cohort_exp.to_csv(filename, index=False)
print(f"Saved longitudinal_cohort_exp.csv to {anondir(filename.resolve())}")

Saved longitudinal_cohort_exp.csv to <living-park>/cohort/longitudinal_cohort_exp.csv


In [331]:
# Stack PD-non-MCI and HC rows, add the visit identifiers and save the result.
clinical_cohort_qced = pd.concat([pd_nonmci_clinical_qced, hc_qced], ignore_index=True)
clinical_cohort_qced = build_clinical_cohort(clinical_cohort_qced)
filename = cohort_dir / "longitudinal_cohort_qced.csv"
clinical_cohort_qced.to_csv(filename, index=False)
print(f"Saved longitudinal_cohort_qced.csv to {anondir(filename.resolve())}")

Saved longitudinal_cohort_qced.csv to <living-park>/cohort/longitudinal_cohort_qced.csv
