# Partial correlation at baseline

## Purpose

Compute partial correlation between metrics and UPDRS score at baseline.
Metrics are cortical thickness, area, volume and subcortical volume.

## Definition

Pingouin [method](https://pingouin-stats.org/build/html/generated/pingouin.partial_corr.html#pingouin.partial_corr)

Partial correlation [1] measures the degree of association between x and y, after removing the effect of one or more controlling variables (covar or $Z$). Practically, this is achieved by calculating the correlation coefficient between the residuals of two linear regressions:

$$x \sim Z, y \sim Z$$

Like the correlation coefficient, the partial correlation coefficient takes on a value in the range from –1 to 1, where 1 indicates a perfect positive association.

The semipartial correlation is similar to the partial correlation, with the exception that the set of controlling variables is only removed for either x or y, but not both.

Pingouin uses the method described in [2] to calculate the (semi)partial correlation coefficients and associated p-values. This method is based on the inverse covariance matrix and is significantly faster than the traditional regression-based method. Results have been tested against the ppcor R package.

## Get info about subjects

In [27]:
import pandas as pd
from IPython.display import display
import os
from tqdm import tqdm
from pathlib import Path

import warnings

warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", message=".*column_view.*")

force = True
anonymizer = True

root_dir = Path.cwd()


def anondir(path: Path, prefix=root_dir) -> Path:
    """Anonymize a directory path by replacing user-specific parts with <root>."""
    if not anonymizer:
        return path
    path_str = str(path).replace(str(prefix), "<living-park>")
    return Path(path_str)


display(f"Running in root dir: {anondir(root_dir)}")
stats_dir = Path(root_dir) / "stats_QCed" / "sampled"
print(f"Stats directory: {anondir(stats_dir)}")

output_dir = root_dir / "partial_correlation"
output_dir.mkdir(parents=True, exist_ok=True)
print(f"Output directory: {anondir(output_dir)}")

'Running in root dir: <living-park>'

Stats directory: <living-park>/stats_QCed/sampled
Output directory: <living-park>/partial_correlation


In [28]:
def get_cohort_stats():
    filename = root_dir / "cohort" / "longitudinal_cohort_qced.csv"
    df_clinical = pd.read_csv(filename)
    print(f"Load cohort stats: {anondir(filename)}")
    columns = [
        "PATNO",
        "first_visit",
        "second_visit",
        "dx_group",
        "SEX",
        "AGE_AT_VISIT",
        "UPDRS",
    ]
    df_clinical = df_clinical[df_clinical.dx_group == "PD-non-MCI"]
    df_clinical.rename(columns={"NP3TOT": "UPDRS"}, inplace=True)
    print(
        f"Number of PD-non-MCI subjects: {df_clinical[df_clinical['dx_group']=='PD-non-MCI']['PATNO'].nunique()}"
    )
    print(
        f"Number of HC subjects: {df_clinical[df_clinical['dx_group']=='HC']['PATNO'].nunique()}"
    )
    print(f"Total number of subjects: {df_clinical['PATNO'].nunique()}")
    return df_clinical[columns]


df_clinical = get_cohort_stats()

Load cohort stats: <living-park>/cohort/longitudinal_cohort_qced.csv
Number of PD-non-MCI subjects: 112
Number of HC subjects: 0
Total number of subjects: 112


In [29]:
def assert_number_of_repetitions(df, hemisphere=None, repetitions=26):
    """
    Assert that each subjects/region has exactly N repetitions.
    """
    groups = ["subjects", "region"] + (["hemisphere"] if hemisphere else [])
    grouped = df.groupby(groups).count() == repetitions
    assert (
        grouped.all().all()
    ), f"Not all subjects/regions have {repetitions} repetitions. {grouped[grouped == False].count()}"

## Code
### Create baseline dataframe

In [30]:
from pathlib import Path
import numpy as np
import pandas as pd
import pingouin as pg


def _ensure_parquet_parent(path: Path) -> None:
    path.parent.mkdir(parents=True, exist_ok=True)


def _cached(path: Path, force: bool = False) -> pd.DataFrame | None:
    if path.exists() and not force:
        return pd.read_parquet(path)
    return None


def get_baseline_df(
    metric: str,
    hemisphere: bool,
    cohort_df: pd.DataFrame,
    stats_dir: Path,
    output_dir: Path,
    force: bool = False,
) -> pd.DataFrame:
    """
    Build (or load) baseline long-form df with columns:
    ['repetition','subjects','dx_group','PD_status',('hemisphere',) 'region', metric,
     'AGE_AT_VISIT','SEX','UPDRS']
    """
    cache = output_dir / f"baseline_df_{metric}.parquet"
    df = _cached(cache, force=force)
    if df is not None:
        return df

    src = stats_dir / f"{metric}.parquet"
    wide = pd.read_parquet(src)

    # tidy up columns
    if "PATNO_id" in wide:
        wide = wide.drop(columns=["PATNO_id"])
    if hemisphere and "hemi" in wide:
        wide = wide.rename(columns={"hemi": "hemisphere"})

    # repetition index per subject(/hemisphere)
    group_cols = ["subject_visit"] + (["hemisphere"] if hemisphere else [])
    wide["repetition"] = wide.groupby(group_cols).cumcount() + 1

    # melt to long format
    id_vars = ["repetition", "subject_visit", "dx_group", "PD_status"] + (
        ["hemisphere"] if hemisphere else []
    )
    long = wide.melt(id_vars=id_vars, var_name="region", value_name=metric)
    long["region"] = long["region"].str.replace(f"_{metric}", "", regex=False)

    # merge with clinical cohort
    clinical_cols = ["first_visit", "AGE_AT_VISIT", "SEX", "UPDRS"]
    base = long.merge(
        cohort_df[clinical_cols],
        left_on="subject_visit",
        right_on="first_visit",
        how="inner",
    ).drop(columns=["first_visit"])

    # numeric coercion
    for c in (metric, "AGE_AT_VISIT", "UPDRS"):
        if c in base:
            base[c] = pd.to_numeric(base[c], errors="coerce")

    _ensure_parquet_parent(cache)
    base.to_parquet(cache)
    return base


def compute_partial_correlation(
    baseline_df: pd.DataFrame,
    metric: str,
    hemisphere: bool,
    output_dir: Path,
    force: bool = False,
) -> pd.DataFrame:
    """
    Returns columns:
    ['repetition','region',('hemisphere',) 'r','p-val','n']
    """
    cache = output_dir / f"partial_correlation_baseline_{metric}.parquet"
    out = _cached(cache, force=force)
    if out is not None:
        return out

    req = {metric, "UPDRS", "AGE_AT_VISIT", "SEX", "repetition", "region"}
    if hemisphere:
        req.add("hemisphere")
    missing = req.difference(baseline_df.columns)
    if missing:
        raise ValueError(f"Missing required columns: {sorted(missing)}")

    group_cols = ["repetition", "region"] + (["hemisphere"] if hemisphere else [])

    def _one(g: pd.DataFrame) -> pd.Series:
        data = g[[metric, "UPDRS", "AGE_AT_VISIT", "SEX"]].dropna()
        if len(data) < 4:  # need at least 4 rows for partial corr with 2 covariates
            return pd.Series({"r": np.nan, "p-val": np.nan, "n": len(data)})
        pc = pg.partial_corr(
            data=data,
            x=metric,
            y="UPDRS",
            covar=["AGE_AT_VISIT", "SEX"],
            method="pearson",
        ).iloc[0]
        return pd.Series({"r": pc["r"], "p-val": pc["p-val"], "n": pc["n"]})

    res = (
        baseline_df.groupby(group_cols, sort=False, dropna=False)
        .apply(_one, include_groups=False)
        .reset_index()
    )

    _ensure_parquet_parent(cache)
    res.to_parquet(cache)
    return res


def compute_significance(
    pcorr_df: pd.DataFrame,
    hemisphere: bool = False,
    alpha: float = 0.05,
    correction_method: str | None = None,
) -> pd.DataFrame:
    if pcorr_df.empty:
        return pcorr_df

    df = pcorr_df.copy()
    pcol = "p-val"
    df["significant"] = df[pcol] < alpha

    group_cols = ["region"] + (["hemisphere"] if hemisphere else [])
    agg = (
        df.groupby(group_cols, dropna=False)
        .agg(
            n_correlations=("r", "count"),
            n_significant=("significant", "sum"),
            proportion_significant=("significant", "mean"),
            mean_r=("r", "mean"),
            std_r=("r", "std"),
            min_r=("r", "min"),
            max_r=("r", "max"),
            mean_p=(pcol, "mean"),
            std_p=(pcol, "std"),
            min_p=(pcol, "min"),
            max_p=(pcol, "max"),
            mean_n=("n", "mean"),
            min_n=("n", "min"),
            max_n=("n", "max"),
        )
        .reset_index()
    )

    return agg.sort_values("proportion_significant", ascending=False)


def report_significance_df(
    metric: str,
    hemisphere: bool,
    cohort_df: pd.DataFrame,
    stats_dir: Path,
    output_dir: Path,
    alpha: float = 0.05,
    force: bool = False,
) -> pd.DataFrame:
    base = get_baseline_df(
        metric, hemisphere, cohort_df, stats_dir, output_dir, force=force
    )
    if base.empty:
        return pd.DataFrame()

    pc = compute_partial_correlation(base, metric, hemisphere, output_dir, force=force)
    if pc.empty:
        return pd.DataFrame()

    sig = compute_significance(pc, hemisphere=hemisphere, alpha=alpha)
    # Save a concise CSV summary next to parquet caches
    out_csv = output_dir / f"partial_correlation_baseline_{metric}.csv"
    _ensure_parquet_parent(out_csv)
    sig.to_csv(out_csv, index=False)
    return sig

### Compute partial correlation

### Get significant partial correlation among repetitions

## Cortical volume

In [None]:
# Enhanced single metric analysis
significance_volume_df = report_significance_df(
    metric="volume",
    cohort_df=df_clinical,
    hemisphere=True,
    alpha=0.05,
    force=force,
    stats_dir=stats_dir,
    output_dir=output_dir,
)

In [32]:
significance_volume_df

Unnamed: 0,region,hemisphere,n_correlations,n_significant,proportion_significant,mean_r,std_r,min_r,max_r,mean_p,std_p,min_p,max_p,mean_n,min_n,max_n
19,inferiorparietal,rh,26,26,1.000000,-0.278495,0.011700,-0.306837,-0.259056,0.003447,0.001273,0.001113,0.006280,112.0,112.0,112.0
17,fusiform,rh,26,26,1.000000,-0.231369,0.016178,-0.262136,-0.203698,0.016473,0.007635,0.005667,0.032808,112.0,112.0,112.0
50,posteriorcingulate,lh,26,26,1.000000,-0.329263,0.013347,-0.357154,-0.305667,0.000509,0.000278,0.000128,0.001165,112.0,112.0,112.0
48,postcentral,lh,26,23,0.884615,-0.209897,0.014306,-0.228798,-0.176875,0.029647,0.013277,0.016207,0.064532,112.0,112.0,112.0
20,inferiortemporal,lh,26,19,0.730769,-0.199291,0.016801,-0.237965,-0.174311,0.039600,0.015333,0.012302,0.068566,112.0,112.0,112.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72,temporalpole,lh,26,0,0.000000,-0.105046,0.041666,-0.164142,-0.008375,0.315322,0.210670,0.086618,0.930803,112.0,112.0,112.0
74,transversetemporal,lh,26,0,0.000000,-0.160955,0.013038,-0.186729,-0.138226,0.095907,0.026729,0.050787,0.149844,112.0,112.0,112.0
75,transversetemporal,rh,26,0,0.000000,-0.030325,0.019411,-0.076392,-0.002914,0.758440,0.145618,0.427653,0.975897,112.0,112.0,112.0
76,visit,lh,0,0,0.000000,,,,,,,,,0.0,0.0,0.0


## Cortical thickness

In [None]:
# Enhanced single metric analysis
significance_thickness_df = report_significance_df(
    metric="thickness",
    cohort_df=df_clinical,
    hemisphere=True,
    alpha=0.05,
    force=force,
    stats_dir=stats_dir,
    output_dir=output_dir,
)

In [34]:
significance_thickness_df

Unnamed: 0,region,hemisphere,n_correlations,n_significant,proportion_significant,mean_r,std_r,min_r,max_r,mean_p,std_p,min_p,max_p,mean_n,min_n,max_n
2,MeanThickness,lh,26,26,1.0,-0.213096,0.010237,-0.235564,-0.191018,0.026244,0.007354,0.013236,0.045611,112.0,112.0,112.0
49,pericalcarine,rh,26,26,1.0,-0.286054,0.020747,-0.330090,-0.238575,0.003131,0.002588,0.000429,0.012074,112.0,112.0,112.0
50,postcentral,lh,26,26,1.0,-0.268224,0.027923,-0.305487,-0.207189,0.006852,0.007150,0.001173,0.029869,112.0,112.0,112.0
51,postcentral,rh,26,26,1.0,-0.243720,0.018655,-0.268867,-0.192547,0.012082,0.009062,0.004509,0.043875,112.0,112.0,112.0
54,precentral,lh,26,26,1.0,-0.261483,0.024561,-0.314201,-0.229479,0.007417,0.005001,0.000829,0.015884,112.0,112.0,112.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67,superiorfrontal,rh,26,0,0.0,-0.116372,0.012265,-0.148736,-0.096329,0.229594,0.047680,0.120957,0.316778,112.0,112.0,112.0
74,temporalpole,lh,26,0,0.0,-0.108149,0.037721,-0.186460,-0.041163,0.294226,0.169465,0.051127,0.669402,112.0,112.0,112.0
76,transversetemporal,lh,26,0,0.0,-0.121213,0.016965,-0.153461,-0.084337,0.213945,0.066600,0.109460,0.381033,112.0,112.0,112.0
78,visit,lh,0,0,0.0,,,,,,,,,0.0,0.0,0.0


## Cortical surface area

In [None]:
# Enhanced single metric analysis
significance_area_df = report_significance_df(
    metric="area",
    cohort_df=df_clinical,
    hemisphere=True,
    alpha=0.05,
    force=force,
    stats_dir=stats_dir,
    output_dir=output_dir,
)

In [36]:
significance_area_df

Unnamed: 0,region,hemisphere,n_correlations,n_significant,proportion_significant,mean_r,std_r,min_r,max_r,mean_p,std_p,min_p,max_p,mean_n,min_n,max_n
52,posteriorcingulate,lh,26,26,1.000000,-0.253218,0.018507,-0.286035,-0.214952,0.008776,0.005036,0.002452,0.024126,112.0,112.0,112.0
21,inferiorparietal,rh,26,26,1.000000,-0.217396,0.010081,-0.242718,-0.196797,0.023251,0.006169,0.010622,0.039337,112.0,112.0,112.0
33,lingual,rh,26,16,0.615385,0.192370,0.012182,0.163968,0.210751,0.045812,0.014748,0.027104,0.086957,112.0,112.0,112.0
22,inferiortemporal,lh,26,11,0.423077,-0.184302,0.018033,-0.221180,-0.152601,0.057999,0.023769,0.020228,0.111486,112.0,112.0,112.0
63,rostralmiddlefrontal,rh,26,3,0.115385,-0.168012,0.021415,-0.205105,-0.112738,0.087212,0.049328,0.031595,0.240939,112.0,112.0,112.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75,temporalpole,rh,26,0,0.000000,-0.050410,0.033389,-0.104933,0.014883,0.597587,0.204088,0.275276,0.940880,112.0,112.0,112.0
76,transversetemporal,lh,26,0,0.000000,-0.052540,0.015283,-0.087900,-0.019084,0.590160,0.107022,0.361169,0.843132,112.0,112.0,112.0
77,transversetemporal,rh,26,0,0.000000,0.077202,0.024953,0.018463,0.115116,0.436983,0.154561,0.231094,0.848173,112.0,112.0,112.0
78,visit,lh,0,0,0.000000,,,,,,,,,0.0,0.0,0.0


## Subcortical volume

In [None]:
# Enhanced single metric analysis
significance_subcortical_volume_df = report_significance_df(
    metric="subcortical_volume",
    cohort_df=df_clinical,
    hemisphere=False,
    alpha=0.05,
    force=force,
    stats_dir=stats_dir,
    output_dir=output_dir,
)

In [38]:
significance_subcortical_volume_df

Unnamed: 0,region,n_correlations,n_significant,proportion_significant,mean_r,std_r,min_r,max_r,mean_p,std_p,min_p,max_p,mean_n,min_n,max_n
0,3rd-Ventricle,26,26,1.000000,-0.207182,0.004283,-0.212652,-0.197233,0.030041,0.003582,0.025720,0.038894,112.0,112.0,112.0
44,Right-Putamen,26,26,1.000000,-0.260590,0.014321,-0.288111,-0.236561,0.006562,0.003084,0.002272,0.012841,112.0,112.0,112.0
25,Left-Putamen,26,26,1.000000,-0.229334,0.011554,-0.249906,-0.210527,0.016728,0.005547,0.008463,0.027272,112.0,112.0,112.0
56,WM-hypointensities,26,23,0.884615,0.205034,0.013256,0.174355,0.228090,0.033386,0.013133,0.016549,0.068494,112.0,112.0,112.0
36,Right-Amygdala,26,19,0.730769,-0.204496,0.025757,-0.250011,-0.164973,0.038347,0.024256,0.008435,0.085015,112.0,112.0,112.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61,rejected_images,0,0,0.000000,,,,,,,,,112.0,112.0,112.0
62,rhCerebralWhiteMatterVol,26,0,0.000000,0.022496,0.006259,0.009807,0.033926,0.815903,0.050421,0.724949,0.919011,112.0,112.0,112.0
63,rhCortexVol,26,0,0.000000,-0.166977,0.004282,-0.175170,-0.156850,0.081541,0.008098,0.067192,0.101753,112.0,112.0,112.0
65,subject,0,0,0.000000,,,,,,,,,0.0,0.0,0.0
