# Partial correlation longitudinal

## Purpose

Compute the partial correlation between the longitudinal change in brain metrics and the longitudinal change in UPDRS score.
The metrics are cortical thickness, cortical surface area, cortical volume, and subcortical volume.

- UPDRS score change: $T_2 - T_1$
- Metric relative change (signed, as computed in the code below): $\dfrac{T_2-T_1}{T_1}$


## Definition

Pingouin [method](https://pingouin-stats.org/build/html/generated/pingouin.partial_corr.html#pingouin.partial_corr)

Partial correlation [1] measures the degree of association between x and y, after removing the effect of one or more controlling variables (covar or $Z$). Practically, this is achieved by calculating the correlation coefficient between the residuals of two linear regressions:

$$x \sim Z, y \sim Z$$

Like the correlation coefficient, the partial correlation coefficient takes on a value in the range from –1 to 1, where 1 indicates a perfect positive association.

The semipartial correlation is similar to the partial correlation, with the exception that the set of controlling variables is only removed for either x or y, but not both.

Pingouin uses the method described in [2] to calculate the (semi)partial correlation coefficients and associated p-values. This method is based on the inverse covariance matrix and is significantly faster than the traditional regression-based method. Results have been tested against the ppcor R package.

## Get info about subjects

In [34]:
import pandas as pd
from IPython.display import display
import os
from tqdm import tqdm
from pathlib import Path

import warnings

# Keep cell outputs readable: hide every UserWarning-category warning and any
# warning whose message matches ".*column_view.*".
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", message=".*column_view.*")

# force: forwarded as `force=` to the pipeline calls further down; when True the
# cached parquet files are ignored and every step is recomputed.
force = True
# anonymizer: when True, anondir() masks the root directory in printed paths.
anonymizer = True

# Every input and output path in this notebook is resolved from the current
# working directory.
root_dir = Path.cwd()


def anondir(path: Path, prefix=root_dir) -> Path:
    """Mask the root directory in *path* so printed paths are not user-specific.

    When the module-level ``anonymizer`` flag is falsy, *path* is returned
    unchanged. Otherwise every occurrence of ``str(prefix)`` in ``str(path)``
    is replaced by the literal ``<living-park>`` and the result is wrapped in
    a new ``Path``.
    """
    if anonymizer:
        masked = str(path).replace(str(prefix), "<living-park>")
        return Path(masked)
    return path


# Report the (possibly anonymized) working directory.
display(f"Running in root dir: {anondir(root_dir)}")
# Directory holding the per-metric parquet files read by the pipeline.
stats_dir = Path(root_dir) / "stats_QCed" / "sampled"
print(f"Stats directory: {anondir(stats_dir)}")

# Directory receiving the cached parquet files and the summary CSVs; created
# here if it does not exist yet.
output_dir = root_dir / "partial_correlation"
output_dir.mkdir(parents=True, exist_ok=True)
print(f"Output directory: {anondir(output_dir)}")

'Running in root dir: <living-park>'

Stats directory: <living-park>/stats_QCed/sampled
Output directory: <living-park>/partial_correlation


In [35]:
def get_cohort_stats():
    """Load the QCed longitudinal cohort and return its PD-non-MCI rows.

    Reads ``cohort/longitudinal_cohort_qced.csv`` under ``root_dir``, derives
    ``PD_status`` from ``dx_group`` ("PD-non-MCI" becomes "PD"), keeps only
    the rows whose ``dx_group`` is "PD-non-MCI", renames ``NP3TOT_change`` to
    ``UPDRS_change`` and prints subject counts.

    Because the frame is filtered before counting, the printed HC count only
    reflects what is left after that filter.

    Returns:
        DataFrame restricted to the clinical columns used downstream.
    """
    cohort_csv = root_dir / "cohort" / "longitudinal_cohort_qced.csv"
    cohort = pd.read_csv(cohort_csv)
    print(f"Load cohort stats: {anondir(cohort_csv)}")

    status_map = {"PD-non-MCI": "PD", "HC": "HC"}
    cohort["PD_status"] = cohort["dx_group"].replace(status_map)

    # Keep PD-non-MCI subjects only, then expose the UPDRS change under the
    # name used by the rest of the notebook.
    cohort = cohort[cohort.dx_group == "PD-non-MCI"]
    cohort = cohort.rename(columns={"NP3TOT_change": "UPDRS_change"})

    is_pd = cohort["PD_status"] == "PD"
    is_hc = cohort["PD_status"] == "HC"
    n_pd = cohort.loc[is_pd, "PATNO"].nunique()
    n_hc = cohort.loc[is_hc, "PATNO"].nunique()
    print(f"Number of PD-non-MCI subjects: {n_pd}")
    print(f"Number of HC subjects: {n_hc}")
    print(f"Total number of subjects: {cohort['PATNO'].nunique()}")

    kept_columns = [
        "PATNO",
        "first_visit",
        "second_visit",
        "PD_status",
        "SEX",
        "AGE_AT_VISIT",
        "UPDRS_change",
        "durationT2_T1_y",
    ]
    return cohort[kept_columns]


# Clinical table (PD-non-MCI rows only) passed as `cohort_df` to every pipeline
# call below.
df_clinical = get_cohort_stats()

Load cohort stats: <living-park>/cohort/longitudinal_cohort_qced.csv
Number of PD-non-MCI subjects: 112
Number of HC subjects: 0
Total number of subjects: 112


In [36]:
def assert_number_of_repetitions(df, hemisphere=None, repetitions=26):
    """
    Check that every subjects/region group holds exactly `repetitions` entries.

    Rows are grouped by "subjects" and "region" (plus "hemisphere" when
    `hemisphere` is truthy) and the non-null count of every remaining column
    is compared with `repetitions`. An AssertionError is raised on any
    mismatch; its message reports, per column, how many groups failed.
    """
    group_keys = ["subjects", "region"]
    if hemisphere:
        group_keys.append("hemisphere")

    has_expected_count = df.groupby(group_keys).count() == repetitions
    assert (
        has_expected_count.all().all()
    ), f"Not all subjects/regions have {repetitions} repetitions. {has_expected_count[~has_expected_count].count()}"

## Code
### Create longitudinal dataframe

In [37]:
from pathlib import Path
import numpy as np
import pandas as pd


def _ensure_parent(path: Path) -> None:
    """Create the directory that will contain *path*, including missing ancestors."""
    parent_dir = path.parent
    parent_dir.mkdir(parents=True, exist_ok=True)


def _cached_parquet(path: Path, force: bool = False) -> pd.DataFrame | None:
    if path.exists() and not force:
        return pd.read_parquet(path)
    return None


def get_visit_at_metric_df(
    metric: str,
    visit: int,
    hemisphere: bool,
    cohort_df: pd.DataFrame,
    stats_dir: Path,
    output_dir: Path,
    force: bool = False,
) -> pd.DataFrame:
    """
    Build (or load from cache) the long-form table of `metric` for one visit.

    The wide table ``<stats_dir>/<metric>.parquet`` is melted to one row per
    (repetition, subject_visit[, hemisphere], region) and inner-joined with
    the clinical columns of `cohort_df` on the visit identifier
    ("first_visit" for visit 1, "second_visit" for visit 2).

    Resulting columns:
      ['repetition','subject_visit','dx_group','PD_status',('hemisphere',) 'region', metric,
       'AGE_AT_VISIT','SEX','UPDRS_change','durationT2_T1_y']

    The result is cached as
    ``<output_dir>/partial_correlation_visit-<baseline|longitudinal>_df_<metric>.parquet``;
    pass ``force=True`` to ignore an existing cache file.

    Raises:
        ValueError: if `visit` is neither 1 nor 2.
    """
    if visit == 1:
        visit_name, visit_col = "baseline", "first_visit"
    elif visit == 2:
        visit_name, visit_col = "longitudinal", "second_visit"
    else:
        raise ValueError("visit must be 1 or 2")

    cache_path = (
        output_dir / f"partial_correlation_visit-{visit_name}_df_{metric}.parquet"
    )
    hit = _cached_parquet(cache_path, force=force)
    if hit is not None:
        return hit

    table = pd.read_parquet(stats_dir / f"{metric}.parquet")
    if "PATNO_id" in table.columns:
        table = table.drop(columns=["PATNO_id"])
    if hemisphere and "hemi" in table.columns:
        table = table.rename(columns={"hemi": "hemisphere"})

    hemi_key = ["hemisphere"] if hemisphere else []

    # Number the rows of each subject_visit (and hemisphere) 1, 2, 3, ... in
    # file order; that running index is the repetition id.
    table["repetition"] = table.groupby(["subject_visit", *hemi_key]).cumcount() + 1

    # Wide -> long: every remaining column becomes a "region" row.
    identifiers = ["repetition", "subject_visit", "dx_group", "PD_status", *hemi_key]
    melted = table.melt(id_vars=identifiers, var_name="region", value_name=metric)
    melted["region"] = melted["region"].str.replace(f"_{metric}", "", regex=False)

    # Attach the clinical covariates of the requested visit; the join key is
    # dropped afterwards because it duplicates subject_visit.
    clinical = cohort_df[
        [visit_col, "AGE_AT_VISIT", "SEX", "UPDRS_change", "durationT2_T1_y"]
    ]
    merged = melted.merge(
        clinical,
        left_on="subject_visit",
        right_on=visit_col,
        how="inner",
    )
    merged = merged.drop(columns=[visit_col])

    # Coerce the analysis columns to numbers; unparseable entries become NaN.
    for col in (metric, "AGE_AT_VISIT", "UPDRS_change", "durationT2_T1_y"):
        if col in merged.columns:
            merged[col] = pd.to_numeric(merged[col], errors="coerce")

    _ensure_parent(cache_path)
    merged.to_parquet(cache_path)
    return merged


# ===================== partial correlation + change ===================


def compute_partial_correlation(
    change_df: pd.DataFrame,
    metric: str,
    hemisphere: bool,
    output_dir: Path,
    force: bool = False,
) -> pd.DataFrame:
    """
    Compute the partial correlation of every (repetition, region[, hemisphere]) cell:
      x = <metric>_change, y = UPDRS_change, covar = [AGE_AT_VISIT, SEX, durationT2_T1_y]

    Args:
        change_df: long-form table holding the columns listed above plus
            'repetition', 'region' and, when `hemisphere` is true, 'hemisphere'.
        metric: metric name; selects the ``<metric>_change`` column and the
            cache file name.
        hemisphere: whether cells are additionally split by hemisphere.
        output_dir: directory of the cache file
            ``partial_correlation_longitudinal_<metric>.parquet``.
        force: recompute even if the cache file exists.

    Returns:
        DataFrame with columns ['repetition','region',('hemisphere',) 'r','p-val','n'],
        one row per cell, ordered by the sorted cell key.

    Raises:
        ValueError: if a required column is missing from `change_df`.

    Warns:
        RuntimeWarning: once, if the computation failed for some cells. Those
            cells are left out of the result; the warning reports how many
            were skipped and the first error.
    """
    cache = output_dir / f"partial_correlation_longitudinal_{metric}.parquet"
    cached = _cached_parquet(cache, force=force)
    if cached is not None:
        return cached

    x_col = f"{metric}_change"
    y_col = "UPDRS_change"
    covariates = ["AGE_AT_VISIT", "SEX", "durationT2_T1_y"]
    group_keys = ["repetition", "region"] + (["hemisphere"] if hemisphere else [])

    required = [x_col, y_col, *covariates, *group_keys]
    missing = [c for c in required if c not in change_df.columns]
    if missing:
        raise ValueError(f"Missing required columns: {missing}")

    # Imported only once real work is needed, so cache hits and validation
    # errors do not depend on pingouin being installed.
    import warnings

    import pingouin as pg

    rows = []
    failures = []
    # A single groupby pass replaces re-filtering the whole frame for every
    # cell. Rows whose key contains NaN are not part of any group, which
    # matches the previous equality-mask selection.
    for key, cell in change_df.groupby(group_keys, sort=True):
        try:
            pc = pg.partial_corr(
                data=cell,
                x=x_col,
                y=y_col,
                covar=covariates,
                method="pearson",
            )
            stats = (pc["r"].iloc[0], pc["p-val"].iloc[0], pc["n"].iloc[0])
        except Exception as exc:  # best effort: one bad cell must not abort the run
            failures.append((key, exc))
            continue
        rows.append((*key, *stats))

    if failures:
        first_key, first_exc = failures[0]
        # RuntimeWarning rather than the default UserWarning: this notebook
        # filters out UserWarning, which would hide the message.
        warnings.warn(
            f"partial_corr failed for {len(failures)} of "
            f"{len(failures) + len(rows)} cells of metric {metric!r}; "
            f"those cells were skipped. First failure at "
            f"{dict(zip(group_keys, first_key))}: {first_exc!r}",
            RuntimeWarning,
            stacklevel=2,
        )

    out = pd.DataFrame(rows, columns=[*group_keys, "r", "p-val", "n"])

    _ensure_parent(cache)
    out.to_parquet(cache)
    return out


# ============================== significance ==========================


def compute_significance(
    pcorr_df: pd.DataFrame,
    hemisphere: bool = False,
    alpha: float = 0.05,
) -> pd.DataFrame:
    """
    Summarise partial-correlation results per region (and per hemisphere when
    `hemisphere` is true and the column exists).

    A test counts as significant when its 'p-val' is strictly below `alpha`.
    For each group the summary holds the number of tests, the number and
    proportion of significant ones, and mean/std/min/max of 'r' and 'p-val'
    as well as mean/min/max of 'n'.

    Returns an empty DataFrame for empty input; otherwise the summary sorted
    by 'proportion_significant' in descending order. The input is not modified.
    """
    if pcorr_df.empty:
        return pd.DataFrame()

    p_column = "p-val"
    tests = pcorr_df.assign(significant=pcorr_df[p_column] < alpha)

    group_keys = ["region"]
    if hemisphere and "hemisphere" in tests.columns:
        group_keys.append("hemisphere")

    # output column -> (source column, aggregation); insertion order is the
    # column order of the result.
    summary_spec = {
        "n_tests": ("r", "count"),
        "n_significant": ("significant", "sum"),
        "proportion_significant": ("significant", "mean"),
        "mean_r": ("r", "mean"),
        "std_r": ("r", "std"),
        "min_r": ("r", "min"),
        "max_r": ("r", "max"),
        "mean_p": (p_column, "mean"),
        "std_p": (p_column, "std"),
        "min_p": (p_column, "min"),
        "max_p": (p_column, "max"),
        "mean_n": ("n", "mean"),
        "min_n": ("n", "min"),
        "max_n": ("n", "max"),
    }
    per_group = tests.groupby(group_keys, dropna=False)
    summary = per_group.agg(**summary_spec).reset_index()

    return summary.sort_values("proportion_significant", ascending=False)


# ============================ orchestration ===========================


def run_partial_correlation_pipeline(
    metric: str,
    hemisphere: bool,
    cohort_df: pd.DataFrame,
    stats_dir: Path,
    output_dir: Path,
    alpha: float = 0.05,
    force: bool = False,
) -> pd.DataFrame:
    """
    Full pipeline (visit 1 & 2 → relative change → partial corr → aggregated significance).

    Args:
        metric: metric name, used for file names and column names.
        hemisphere: whether the metric table is split by hemisphere.
        cohort_df: clinical table forwarded to get_visit_at_metric_df.
        stats_dir: directory of the per-metric parquet inputs.
        output_dir: directory for cache files and the summary CSV.
        alpha: significance threshold applied to each p-value.
        force: ignore cached intermediate files and recompute.

    Returns:
        The aggregated significance DataFrame, or an empty DataFrame when
        either visit table or their merge is empty.

    Side effects:
        Writes the aggregated table to
        ``<output_dir>/partial_correlation_longitudinal_<metric>.csv``.
        The called steps also write their parquet caches to `output_dir`.
    """
    # visit dfs
    v1 = get_visit_at_metric_df(
        metric,
        visit=1,
        hemisphere=hemisphere,
        cohort_df=cohort_df,
        stats_dir=stats_dir,
        output_dir=output_dir,
        force=force,
    )
    v2 = get_visit_at_metric_df(
        metric,
        visit=2,
        hemisphere=hemisphere,
        cohort_df=cohort_df,
        stats_dir=stats_dir,
        output_dir=output_dir,
        force=force,
    )
    if v1.empty or v2.empty:
        return pd.DataFrame()

    # Subject key shared by both visits: the part of subject_visit before the
    # first underscore.
    v1 = v1.assign(PATNO=v1["subject_visit"].astype(str).str.split("_").str[0])
    v2 = v2.assign(PATNO=v2["subject_visit"].astype(str).str.split("_").str[0])

    merge_keys = ["PATNO", "region", "repetition"] + (
        ["hemisphere"] if hemisphere else []
    )
    m = v1.merge(v2, on=merge_keys, suffixes=("_baseline", "_next"))
    if m.empty:
        return pd.DataFrame()

    # Signed relative change (T2 - T1) / T1. A zero baseline yields +/-inf,
    # which would corrupt the correlation for the whole cell, so it is stored
    # as missing instead.
    # NOTE(review): this relies on pingouin.partial_corr dropping rows with
    # NaN, in which case the reported n shrinks for that cell. Confirm against
    # the installed pingouin version.
    baseline = m[f"{metric}_baseline"]
    relative_change = (m[f"{metric}_next"] - baseline) / baseline
    m[f"{metric}_change"] = relative_change.replace([np.inf, -np.inf], np.nan)

    # Keep the visit-2 copy of every duplicated column (covariates included)
    # and strip only the trailing merge suffix from its name.
    m = m.drop(columns=[c for c in m.columns if c.endswith("_baseline")])
    m = m.rename(columns=lambda name: name.removesuffix("_next"))

    # partial corr + aggregation
    pcorr = compute_partial_correlation(
        m, metric=metric, hemisphere=hemisphere, output_dir=output_dir, force=force
    )
    sig = compute_significance(pcorr, hemisphere=hemisphere, alpha=alpha)

    out_csv = output_dir / f"partial_correlation_longitudinal_{metric}.csv"
    _ensure_parent(out_csv)
    sig.to_csv(out_csv, index=False)

    return sig

### Compute partial correlation

### Get significant partial correlation among repetitions

## Cortical volume

In [38]:
# Enhanced single metric analysis
# Run the full longitudinal pipeline for cortical volume, split by hemisphere.
# Each (repetition, region, hemisphere) cell is tested at alpha = 0.05; the
# per-region summary is returned and also written as CSV into output_dir.
significance_volume_df = run_partial_correlation_pipeline(
    metric="volume",
    cohort_df=df_clinical,
    hemisphere=True,
    alpha=0.05,
    force=force,
    stats_dir=stats_dir,
    output_dir=output_dir,
)

In [39]:
# Show the summary, sorted by the proportion of significant repetitions
# (highest first).
significance_volume_df

Unnamed: 0,region,hemisphere,n_tests,n_significant,proportion_significant,mean_r,std_r,min_r,max_r,mean_p,std_p,min_p,max_p,mean_n,min_n,max_n
65,superiortemporal,rh,26,23,0.884615,-0.246935,0.044037,-0.334088,-0.139742,0.024223,0.034047,0.000681,0.165545,103.0,103,103
47,pericalcarine,rh,26,14,0.538462,-0.208797,0.043951,-0.306372,-0.101547,0.056222,0.064649,0.001934,0.314752,103.0,103,103
25,isthmuscingulate,rh,26,12,0.461538,-0.205774,0.078341,-0.335865,-0.075474,0.101281,0.124746,0.000635,0.455479,103.0,103,103
50,posteriorcingulate,lh,26,11,0.423077,-0.175459,0.058740,-0.308590,-0.086083,0.127361,0.114796,0.001786,0.394440,103.0,103,103
31,lingual,rh,26,8,0.307692,-0.160721,0.045732,-0.238967,-0.071027,0.144332,0.112354,0.016646,0.482540,103.0,103,103
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60,superiorfrontal,lh,26,0,0.000000,-0.076954,0.050750,-0.189074,0.014178,0.484581,0.260005,0.059565,0.961157,103.0,103,103
67,supramarginal,rh,26,0,0.000000,-0.081056,0.037152,-0.166014,0.008771,0.443998,0.207675,0.098791,0.950306,103.0,103,103
69,temporalpole,rh,26,0,0.000000,0.047960,0.035786,-0.023370,0.132164,0.641787,0.211750,0.189931,0.974971,103.0,103,103
70,transversetemporal,lh,26,0,0.000000,-0.004633,0.044954,-0.095999,0.075429,0.730744,0.185563,0.342052,0.964362,103.0,103,103


## Cortical thickness

In [40]:
# Enhanced single metric analysis
# Run the full longitudinal pipeline for cortical thickness, split by
# hemisphere. Each (repetition, region, hemisphere) cell is tested at
# alpha = 0.05; the per-region summary is returned and also written as CSV
# into output_dir.
significance_thickness_df = run_partial_correlation_pipeline(
    metric="thickness",
    cohort_df=df_clinical,
    hemisphere=True,
    alpha=0.05,
    force=force,
    stats_dir=stats_dir,
    output_dir=output_dir,
)

In [41]:
# Show the summary, sorted by the proportion of significant repetitions
# (highest first).
significance_thickness_df

Unnamed: 0,region,hemisphere,n_tests,n_significant,proportion_significant,mean_r,std_r,min_r,max_r,mean_p,std_p,min_p,max_p,mean_n,min_n,max_n
27,isthmuscingulate,rh,26,16,0.615385,-0.228568,0.065451,-0.398828,-0.116790,0.052784,0.067575,0.000040,0.247196,103.0,103,103
67,superiortemporal,rh,26,14,0.538462,-0.208142,0.030961,-0.272072,-0.159081,0.046312,0.029201,0.006174,0.113902,103.0,103,103
32,lingual,lh,26,9,0.346154,-0.176968,0.049827,-0.269073,-0.103176,0.111925,0.091687,0.006789,0.307010,103.0,103,103
37,middletemporal,rh,26,6,0.230769,-0.160394,0.039795,-0.227026,-0.068794,0.139022,0.117279,0.023119,0.496440,103.0,103,103
49,pericalcarine,rh,26,4,0.153846,-0.129145,0.055466,-0.241265,-0.050124,0.260588,0.193079,0.015600,0.620423,103.0,103,103
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66,superiortemporal,lh,26,0,0.000000,-0.048172,0.040476,-0.137347,0.068927,0.615212,0.213877,0.172986,0.968824,103.0,103,103
69,supramarginal,rh,26,0,0.000000,0.036440,0.029007,-0.026066,0.104251,0.714164,0.179654,0.301971,0.930359,103.0,103,103
71,temporalpole,rh,26,0,0.000000,0.051958,0.043284,-0.026672,0.150253,0.622935,0.247032,0.135666,0.997448,103.0,103,103
72,transversetemporal,lh,26,0,0.000000,0.024412,0.037746,-0.054253,0.121505,0.737752,0.185631,0.228496,0.999259,103.0,103,103


## Cortical surface area

In [42]:
# Enhanced single metric analysis
# Run the full longitudinal pipeline for cortical surface area, split by
# hemisphere. Each (repetition, region, hemisphere) cell is tested at
# alpha = 0.05; the per-region summary is returned and also written as CSV
# into output_dir.
significance_area_df = run_partial_correlation_pipeline(
    metric="area",
    cohort_df=df_clinical,
    hemisphere=True,
    alpha=0.05,
    force=force,
    stats_dir=stats_dir,
    output_dir=output_dir,
)

In [43]:
# Show the summary, sorted by the proportion of significant repetitions
# (highest first).
significance_area_df

Unnamed: 0,region,hemisphere,n_tests,n_significant,proportion_significant,mean_r,std_r,min_r,max_r,mean_p,std_p,min_p,max_p,mean_n,min_n,max_n
8,caudalmiddlefrontal,lh,26,7,0.269231,-0.138635,0.071590,-0.253730,-0.028133,0.263558,0.253996,0.010858,0.781128,103.0,103,103
11,cuneus,rh,26,6,0.230769,-0.140175,0.056307,-0.222049,-0.020206,0.225046,0.218815,0.026394,0.841844,103.0,103,103
26,isthmuscingulate,lh,26,6,0.230769,-0.137635,0.077348,-0.276742,0.031601,0.258337,0.255629,0.005315,0.986876,103.0,103,103
70,temporalpole,lh,26,6,0.230769,0.109643,0.070125,-0.012543,0.225016,0.367239,0.292223,0.024397,0.901428,103.0,103,103
49,pericalcarine,rh,26,5,0.192308,-0.149386,0.055491,-0.285944,-0.072014,0.189921,0.149350,0.003927,0.476459,103.0,103,103
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66,superiortemporal,lh,26,0,0.000000,-0.047426,0.050403,-0.170870,0.006190,0.677184,0.290519,0.089179,0.990687,103.0,103,103
69,supramarginal,rh,26,0,0.000000,-0.092289,0.034499,-0.162801,-0.012522,0.386134,0.189986,0.105586,0.901589,103.0,103,103
68,supramarginal,lh,26,0,0.000000,-0.041134,0.072998,-0.176854,0.156024,0.571259,0.296899,0.078372,0.968563,103.0,103,103
72,transversetemporal,lh,26,0,0.000000,-0.055217,0.058489,-0.163330,0.054698,0.533187,0.245060,0.104443,0.944165,103.0,103,103


## Subcortical volume

In [None]:
# Enhanced single metric analysis
# Run the full longitudinal pipeline for subcortical volume. No hemisphere
# split is applied here, so cells are (repetition, region) pairs tested at
# alpha = 0.05; the per-region summary is returned and also written as CSV
# into output_dir.
significance_subcortical_volume_df = run_partial_correlation_pipeline(
    metric="subcortical_volume",
    cohort_df=df_clinical,
    hemisphere=False,
    alpha=0.05,
    force=force,
    stats_dir=stats_dir,
    output_dir=output_dir,
)

In [45]:
# Show the summary, sorted by the proportion of significant repetitions
# (highest first).
significance_subcortical_volume_df

Unnamed: 0,region,n_tests,n_significant,proportion_significant,mean_r,std_r,min_r,max_r,mean_p,std_p,min_p,max_p,mean_n,min_n,max_n
3,Brain-Stem,26,14,0.538462,-0.207944,0.049522,-0.282099,-0.095351,0.064541,0.079899,0.004462,0.345337,103.0,103,103
1,4th-Ventricle,26,8,0.307692,-0.154761,0.05776,-0.247957,-0.046455,0.182322,0.174717,0.012869,0.646262,103.0,103,103
39,Right-Inf-Lat-Vent,26,7,0.269231,-0.160437,0.065627,-0.324879,-0.061484,0.173268,0.14891,0.000974,0.543394,103.0,103,103
29,Left-vessel,26,6,0.230769,0.140798,0.070996,-0.01461,0.271266,0.243631,0.243394,0.006335,0.885287,103.0,103,103
43,Right-Thalamus,26,4,0.153846,-0.159558,0.043368,-0.24366,-0.056292,0.144991,0.124273,0.01457,0.578016,103.0,103,103
35,Right-Caudate,26,2,0.076923,-0.139307,0.045054,-0.210987,-0.044338,0.206749,0.159174,0.035107,0.661368,103.0,103,103
47,SubCortGrayVol,26,2,0.076923,-0.113875,0.047039,-0.214412,-0.00939,0.304864,0.205776,0.032182,0.926126,103.0,103,103
17,Left-Amygdala,26,1,0.038462,-0.031571,0.076606,-0.239297,0.120336,0.582419,0.256904,0.016492,0.920234,103.0,103,103
42,Right-Putamen,26,1,0.038462,0.082097,0.067087,-0.033313,0.213103,0.43627,0.285444,0.033275,0.909844,103.0,103,103
34,Right-Amygdala,26,1,0.038462,-0.092763,0.051643,-0.227832,-0.000266,0.411993,0.259954,0.022623,0.997906,103.0,103,103
