# Partial correlation at baseline

## Purpose

Compute partial correlation between metrics and UPDRS score at baseline.
Metrics are cortical thickness, area, volume and subcortical volume.

## Definition

Pingouin [method](https://pingouin-stats.org/build/html/generated/pingouin.partial_corr.html#pingouin.partial_corr)

Partial correlation [1] measures the degree of association between x and y, after removing the effect of one or more controlling variables (covar or $Z$). Practically, this is achieved by calculating the correlation coefficient between the residuals of two linear regressions:

$$x \sim Z, y \sim Z$$

Like the correlation coefficient, the partial correlation coefficient takes on a value in the range from –1 to 1, where 1 indicates a perfect positive association.

The semipartial correlation is similar to the partial correlation, with the exception that the set of controlling variables is only removed for either x or y, but not both.

Pingouin uses the method described in [2] to calculate the (semi)partial correlation coefficients and associated p-values. This method is based on the inverse covariance matrix and is significantly faster than the traditional regression-based method. Results have been tested against the ppcor R package.

## Get info about subjects

In [32]:
import pandas as pd
from IPython.display import display
import os
from tqdm import tqdm
from pathlib import Path

import warnings

warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", message=".*column_view.*")

force = True
anonymizer = True

root_dir = Path.cwd()


def anondir(path: Path, prefix=root_dir) -> Path:
    """Anonymize a directory path by replacing user-specific parts with <root>."""
    if not anonymizer:
        return path
    path_str = str(path).replace(str(prefix), "<living-park>")
    return Path(path_str)


display(f"Running in root dir: {anondir(root_dir)}")
stats_dir = Path(root_dir) / "stats_QCed" / "sampled"
print(f"Stats directory: {anondir(stats_dir)}")

output_dir = root_dir / "ancova"
output_dir.mkdir(parents=True, exist_ok=True)
print(f"Output directory: {anondir(output_dir)}")

'Running in root dir: <living-park>'

Stats directory: <living-park>/stats_QCed/sampled
Output directory: <living-park>/ancova


In [33]:
def get_cohort_stats():
    filename = root_dir / "cohort" / "longitudinal_cohort_qced.csv"
    df_clinical = pd.read_csv(filename)
    print(f"Load cohort stats: {anondir(filename)}")
    columns = [
        "PATNO",
        "first_visit",
        "second_visit",
        "dx_group",
        "SEX",
        "AGE_AT_VISIT",
        "UPDRS",
    ]
    df_clinical.rename(columns={"NP3TOT": "UPDRS"}, inplace=True)
    print(
        f"Number of PD-non-MCI subjects: {df_clinical[df_clinical['dx_group']=='PD-non-MCI']['PATNO'].nunique()}"
    )
    print(
        f"Number of HC subjects: {df_clinical[df_clinical['dx_group']=='HC']['PATNO'].nunique()}"
    )
    print(f"Total number of subjects: {df_clinical['PATNO'].nunique()}")
    return df_clinical[columns]


df_clinical = get_cohort_stats()

Load cohort stats: <living-park>/cohort/longitudinal_cohort_qced.csv
Number of PD-non-MCI subjects: 112
Number of HC subjects: 89
Total number of subjects: 201


In [34]:
def assert_number_of_repetitions(df, hemisphere=None, repetitions=26):
    """
    Assert that each subjects/region has exactly N repetitions.
    """
    groups = ["subjects", "region"] + (["hemisphere"] if hemisphere else [])
    grouped = df.groupby(groups).count() == repetitions
    assert (
        grouped.all().all()
    ), f"Not all subjects/regions have {repetitions} repetitions. {grouped[grouped == False].count()}"

## Code
### Create baseline dataframe

In [None]:
from pathlib import Path
import numpy as np
import pandas as pd
import pingouin as pg


# ============================== helpers ===============================


def _ensure_parent(path: Path) -> None:
    path.parent.mkdir(parents=True, exist_ok=True)


def _cached(path: Path, force: bool = False) -> pd.DataFrame | None:
    if path.exists() and not force:
        return pd.read_parquet(path)
    return None


def _require_cols(df: pd.DataFrame, cols: list[str]) -> None:
    missing = [c for c in cols if c not in df.columns]
    if missing:
        raise ValueError(f"Missing required columns: {missing}")


def get_baseline_df(
    metric: str,
    hemisphere: bool,
    cohort_df: pd.DataFrame,
    stats_dir: Path,
    output_dir: Path,
    force: bool = False,
    verbose: bool = False,
) -> pd.DataFrame:
    """
    Build (or load) baseline long-form df with columns:
    ['repetition','subjects','dx_group','PD_status',('hemisphere',) 'region', metric,
     'AGE_AT_VISIT','SEX']
    """
    cache = output_dir / f"baseline_df_{metric}.parquet"
    df = _cached(cache, force=force)
    if df is not None:
        if verbose:
            print(f"[baseline] loaded cache: {cache}")
        return df

    src = stats_dir / f"{metric}.parquet"
    wide = pd.read_parquet(src)

    # tidy columns
    if "PATNO_id" in wide:
        wide = wide.drop(columns=["PATNO_id"])
    if hemisphere and "hemi" in wide:
        wide = wide.rename(columns={"hemi": "hemisphere"})

    # repetition per subject(/hemisphere)
    gcols = ["subject_visit"] + (["hemisphere"] if hemisphere else [])
    wide["repetition"] = wide.groupby(gcols).cumcount() + 1

    # melt to long
    id_vars = ["repetition", "subject_visit", "dx_group", "PD_status"] + (
        ["hemisphere"] if hemisphere else []
    )
    long = wide.melt(id_vars=id_vars, var_name="region", value_name=metric)
    long["region"] = long["region"].str.replace(f"_{metric}", "", regex=False)

    # merge cohort
    clinical_cols = ["first_visit", "AGE_AT_VISIT", "SEX"]
    base = long.merge(
        cohort_df[clinical_cols],
        left_on="subject_visit",
        right_on="first_visit",
        how="inner",
    ).drop(columns=["first_visit"])

    # numeric coercion
    for c in (metric, "AGE_AT_VISIT", "SEX"):
        if c in base:
            base[c] = pd.to_numeric(base[c], errors="coerce")

    _ensure_parent(cache)
    base.to_parquet(cache)
    if verbose:
        print(f"[baseline] saved: {cache}  rows={len(base)}")
    return base


def compute_ancova(
    baseline_df: pd.DataFrame,
    metric: str,
    hemisphere: bool,
    output_dir: Path,
    force: bool = False,
    verbose: bool = False,
) -> pd.DataFrame:
    """
    One ANCOVA per (repetition, region[, hemisphere]) with covariates AGE_AT_VISIT, SEX.
    Returns columns: ['repetition','region',('hemisphere',) 'F','p-val','np2','n']
    """
    cache = output_dir / f"ancova_baseline_{metric}.parquet"
    out = _cached(cache, force=force)
    if out is not None:
        if verbose:
            print(f"[ancova] loaded cache: {cache}")
        return out

    req = {metric, "dx_group", "AGE_AT_VISIT", "SEX", "repetition", "region"}
    if hemisphere:
        req.add("hemisphere")
    _require_cols(baseline_df, list(req))

    group_cols = ["repetition", "region"] + (["hemisphere"] if hemisphere else [])
    covars = ["AGE_AT_VISIT", "SEX"]

    def _one(g: pd.DataFrame) -> pd.Series:
        data = g[[metric, "dx_group"] + covars].dropna().copy()
        # require at least 2 groups and variation in covariates
        if len(data) < 4 or data["dx_group"].nunique() < 2:
            return pd.Series(
                {"F": np.nan, "p-val": np.nan, "np2": np.nan, "n": len(data)}
            )
        if any(data[c].nunique() < 2 for c in covars):
            return pd.Series(
                {"F": np.nan, "p-val": np.nan, "np2": np.nan, "n": len(data)}
            )

        # types
        data[metric] = pd.to_numeric(data[metric], errors="coerce")
        data["AGE_AT_VISIT"] = pd.to_numeric(data["AGE_AT_VISIT"], errors="coerce")
        data["SEX"] = pd.to_numeric(data["SEX"], errors="coerce")
        data["dx_group"] = data["dx_group"].astype(str)
        data = data.dropna()

        if len(data) < 4 or data["dx_group"].nunique() < 2:
            return pd.Series(
                {"F": np.nan, "p-val": np.nan, "np2": np.nan, "n": len(data)}
            )

        anc = pg.ancova(data=data, dv=metric, between="dx_group", covar=covars)
        row = anc.loc[anc["Source"] == "dx_group"].iloc[0]
        return pd.Series(
            {
                "F": float(row["F"]),
                "p-val": float(row["p-val"]),
                "np2": float(row["np2"]),
                "n": len(data),
            }
        )

    res = (
        baseline_df.groupby(group_cols, sort=False, dropna=False)
        .apply(_one, include_groups=False)
        .reset_index()
    )

    _ensure_parent(cache)
    res.to_parquet(cache)
    if verbose:
        print(f"[ancova] saved: {cache}  tests={len(res)}")
    return res


def compute_significance(
    ancova_df: pd.DataFrame,
    hemisphere: bool = False,
    alpha: float = 0.05,
) -> pd.DataFrame:
    """
    Aggregate ANCOVA results per region (and hemisphere if present).
    """
    if ancova_df.empty:
        return pd.DataFrame()

    df = ancova_df.copy()
    pcol = "p-val"

    df["significant"] = df[pcol] < alpha

    gcols = ["region"] + (
        ["hemisphere"] if hemisphere and "hemisphere" in df.columns else []
    )
    agg = (
        df.groupby(gcols, dropna=False)
        .agg(
            n_correlations=("F", "count"),  # kept name for plot compatibility
            n_significant=("significant", "sum"),
            proportion_significant=("significant", "mean"),
            mean_F=("F", "mean"),
            std_F=("F", "std"),
            min_F=("F", "min"),
            max_F=("F", "max"),
            mean_p=(pcol, "mean"),
            std_p=(pcol, "std"),
            min_p=(pcol, "min"),
            max_p=(pcol, "max"),
            mean_n=("n", "mean"),
            min_n=("n", "min"),
            max_n=("n", "max"),
        )
        .reset_index()
    )

    return agg.sort_values("proportion_significant", ascending=False)


def report_significance_df(
    metric: str,
    hemisphere: bool,
    cohort_df: pd.DataFrame,
    stats_dir: Path,
    output_dir: Path,
    alpha: float = 0.05,
    force: bool = False,
    verbose: bool = False,
) -> pd.DataFrame:
    """
    Full ANCOVA pipeline (cache-aware) → region-level summary with corrections.
    Saves intermediate parquet caches; returns the aggregated significance DataFrame.
    """
    if verbose:
        print(
            f"\n[run] ANCOVA for metric={metric}  hemisphere={hemisphere}  alpha={alpha}"
        )

    base = get_baseline_df(
        metric,
        hemisphere,
        cohort_df,
        stats_dir,
        output_dir,
        force=force,
        verbose=verbose,
    )
    if base.empty:
        return pd.DataFrame()

    ancova = compute_ancova(
        base, metric, hemisphere, output_dir, force=force, verbose=verbose
    )
    if ancova.empty:
        return pd.DataFrame()

    sig = compute_significance(ancova, hemisphere=hemisphere, alpha=alpha)
    # also save a CSV summary for convenience
    out_csv = output_dir / f"ancova_baseline_{metric}.csv"
    _ensure_parent(out_csv)
    sig.to_csv(out_csv, index=False)
    if verbose:
        print(f"[run] summary saved: {out_csv}  regions={len(sig)}")
    return sig

### Compute partial correlation

### Get significant partial correlation among repetitions

## Cortical volume

In [None]:
significance_volume_df = report_significance_df(
    metric="volume",
    hemisphere=True,
    cohort_df=df_clinical,
    stats_dir=stats_dir,
    output_dir=output_dir,
    alpha=0.05,
    force=force,
    verbose=True,
)

In [37]:
significance_volume_df

Unnamed: 0,region,hemisphere,n_correlations,n_significant,proportion_significant,mean_F,std_F,min_F,max_F,mean_p,std_p,min_p,max_p,mean_n,min_n,max_n
0,BrainSegVolNotVent,lh,26,26,1.0,13.399542,0.217807,13.000617,13.764480,0.000325,0.000035,0.000269,0.000395,201.0,201.0,201.0
1,BrainSegVolNotVent,rh,26,26,1.0,13.399542,0.217807,13.000617,13.764480,0.000325,0.000035,0.000269,0.000395,201.0,201.0,201.0
5,caudalanteriorcingulate,rh,26,26,1.0,20.122921,0.853234,18.392172,21.695956,0.000013,0.000006,0.000006,0.000028,201.0,201.0,201.0
11,eTIV,rh,26,26,1.0,5.773366,0.003862,5.764263,5.779840,0.017199,0.000037,0.017137,0.017285,201.0,201.0,201.0
10,eTIV,lh,26,26,1.0,5.773366,0.003862,5.764263,5.779840,0.017199,0.000037,0.017137,0.017285,201.0,201.0,201.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71,supramarginal,rh,26,0,0.0,1.439813,0.325592,1.039556,2.373853,0.237912,0.047523,0.124987,0.309175,201.0,201.0,201.0
73,temporalpole,rh,26,0,0.0,1.018179,0.705229,0.000001,3.068400,0.378311,0.200207,0.081384,0.999158,201.0,201.0,201.0
72,temporalpole,lh,26,0,0.0,0.522363,0.693554,0.008070,2.940053,0.587346,0.243171,0.087980,0.928510,201.0,201.0,201.0
76,visit,lh,0,0,0.0,,,,,,,,,0.0,0.0,0.0


## Cortical thickness

In [None]:
significance_thickness_df = report_significance_df(
    metric="thickness",
    hemisphere=True,
    cohort_df=df_clinical,
    stats_dir=stats_dir,
    output_dir=output_dir,
    alpha=0.05,
    force=force,
    verbose=True,
)

In [39]:
significance_thickness_df

Unnamed: 0,region,hemisphere,n_correlations,n_significant,proportion_significant,mean_F,std_F,min_F,max_F,mean_p,std_p,min_p,max_p,mean_n,min_n,max_n
0,BrainSegVolNotVent,lh,26,26,1.0,13.399542,0.217807,13.000617,13.764480,0.000325,0.000035,0.000269,0.000395,201.0,201.0,201.0
1,BrainSegVolNotVent,rh,26,26,1.0,13.399542,0.217807,13.000617,13.764480,0.000325,0.000035,0.000269,0.000395,201.0,201.0,201.0
12,eTIV,lh,26,26,1.0,5.773366,0.003862,5.764263,5.779840,0.017199,0.000037,0.017137,0.017285,201.0,201.0,201.0
13,eTIV,rh,26,26,1.0,5.773366,0.003862,5.764263,5.779840,0.017199,0.000037,0.017137,0.017285,201.0,201.0,201.0
38,paracentral,lh,26,26,1.0,9.316861,1.260675,7.537144,13.283722,0.003077,0.001649,0.000342,0.006603,201.0,201.0,201.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75,temporalpole,rh,26,0,0.0,0.234257,0.315950,0.000164,1.561599,0.695486,0.178589,0.212914,0.989801,201.0,201.0,201.0
76,transversetemporal,lh,26,0,0.0,0.370424,0.253298,0.021371,1.005610,0.576617,0.142146,0.317187,0.883923,201.0,201.0,201.0
77,transversetemporal,rh,26,0,0.0,0.087573,0.076315,0.002631,0.272192,0.792918,0.102096,0.602451,0.959145,201.0,201.0,201.0
78,visit,lh,0,0,0.0,,,,,,,,,0.0,0.0,0.0


## Cortical surface area

In [None]:
significance_area_df = report_significance_df(
    metric="area",
    hemisphere=True,
    cohort_df=df_clinical,
    stats_dir=stats_dir,
    output_dir=output_dir,
    alpha=0.05,
    force=force,
    verbose=True,
)

In [41]:
significance_area_df

Unnamed: 0,region,hemisphere,n_correlations,n_significant,proportion_significant,mean_F,std_F,min_F,max_F,mean_p,std_p,min_p,max_p,mean_n,min_n,max_n
0,BrainSegVolNotVent,lh,26,26,1.0,13.399542,0.217807,13.000617,13.764480,0.000325,0.000035,0.000269,0.000395,201.0,201.0,201.0
1,BrainSegVolNotVent,rh,26,26,1.0,13.399542,0.217807,13.000617,13.764480,0.000325,0.000035,0.000269,0.000395,201.0,201.0,201.0
2,WhiteSurfArea,lh,26,26,1.0,6.355475,0.364762,5.487191,7.281893,0.012745,0.002584,0.007570,0.020154,201.0,201.0,201.0
3,WhiteSurfArea,rh,26,26,1.0,6.397003,0.305417,5.886809,7.167544,0.012379,0.001966,0.008049,0.016155,201.0,201.0,201.0
7,caudalanteriorcingulate,rh,26,26,1.0,18.513658,1.162239,15.849141,20.442008,0.000031,0.000019,0.000011,0.000096,201.0,201.0,201.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72,supramarginal,lh,26,0,0.0,1.905837,0.223738,1.495186,2.309517,0.171039,0.025204,0.130187,0.222874,201.0,201.0,201.0
74,temporalpole,lh,26,0,0.0,0.405434,0.392327,0.005242,1.683223,0.592633,0.200099,0.196014,0.942355,201.0,201.0,201.0
73,supramarginal,rh,26,0,0.0,1.410813,0.310493,0.975909,2.225576,0.242470,0.048118,0.137342,0.324423,201.0,201.0,201.0
78,visit,lh,0,0,0.0,,,,,,,,,0.0,0.0,0.0


## Subcortical volume

In [None]:
significance_subcortical_volume_df = report_significance_df(
    metric="subcortical_volume",
    hemisphere=False,
    cohort_df=df_clinical,
    stats_dir=stats_dir,
    output_dir=output_dir,
    alpha=0.05,
    force=force,
    verbose=True,
)

In [43]:
significance_subcortical_volume_df

Unnamed: 0,region,n_correlations,n_significant,proportion_significant,mean_F,std_F,min_F,max_F,mean_p,std_p,min_p,max_p,mean_n,min_n,max_n
6,BrainSegVolNotVent,26,26,1.0,13.399542,0.217807,13.000617,13.764480,0.000325,0.000035,0.000269,0.000395,201.0,201.0,201.0
4,BrainSegVol,26,26,1.0,10.781256,0.186851,10.441947,11.082691,0.001219,0.000117,0.001040,0.001444,201.0,201.0,201.0
3,Brain-Stem,26,26,1.0,11.286962,0.563686,10.311742,12.790127,0.000974,0.000264,0.000438,0.001544,201.0,201.0,201.0
51,SubCortGrayVol,26,26,1.0,11.541809,0.527482,10.718958,12.739816,0.000851,0.000210,0.000450,0.001253,201.0,201.0,201.0
52,SupraTentorialVol,26,26,1.0,8.436945,0.187294,8.120364,8.806338,0.004116,0.000404,0.003374,0.004842,201.0,201.0,201.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54,SurfaceHoles,26,0,0.0,1.184861,0.536308,0.001419,2.332138,0.314649,0.163843,0.128332,0.969984,201.0,201.0,201.0
61,rejected_images,0,0,0.0,,,,,,,,,201.0,201.0,201.0
64,rhSurfaceHoles,26,0,0.0,0.809578,0.698753,0.003435,2.566223,0.460402,0.236280,0.110770,0.953322,201.0,201.0,201.0
65,subject,0,0,0.0,,,,,,,,,0.0,0.0,0.0
