# Partial correlation longitudinal (IEEE)

## Purpose

Compute the longitudinal partial correlation between brain metrics and the UPDRS score.
The metrics are cortical thickness, cortical surface area, cortical volume, and subcortical volume.

- UPDRS score change: $T_2 - T_1$
- Metric rate change (signed relative change, as computed in the code below): $\dfrac{T_2-T_1}{T_1}$

## Definition

Pingouin [method](https://pingouin-stats.org/build/html/generated/pingouin.partial_corr.html#pingouin.partial_corr)

Partial correlation [1] measures the degree of association between x and y, after removing the effect of one or more controlling variables (covar or $Z$). Practically, this is achieved by calculating the correlation coefficient between the residuals of two linear regressions:

$$x \sim Z, y \sim Z$$

Like the correlation coefficient, the partial correlation coefficient takes on a value in the range from –1 to 1, where 1 indicates a perfect positive association.

The semipartial correlation is similar to the partial correlation, with the exception that the set of controlling variables is only removed for either x or y, but not both.

Pingouin uses the method described in [2] to calculate the (semi)partial correlation coefficients and associated p-values. This method is based on the inverse covariance matrix and is significantly faster than the traditional regression-based method. Results have been tested against the ppcor R package.

## Get info about subjects

In [1]:
import pandas as pd
from IPython.display import display
import os
from tqdm import tqdm
from pathlib import Path

import warnings

# Silence every UserWarning, plus any warning whose message mentions
# "column_view", so the notebook output stays readable.
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", message=".*column_view.*")

# NOTE(review): this module-level flag does not appear to be read by the code
# visible in this notebook; the compute calls pass force=True literally.
force = True
# When True, anondir() masks the root directory in printed paths.
anonymizer = True

# Base directory for the cohort, stats and output paths: the current working directory.
root_dir = Path.cwd()


def anondir(path: Path, prefix=root_dir) -> Path:
    """Return *path* with every occurrence of *prefix* masked as ``<living-park>``.

    The path is returned untouched when the module-level ``anonymizer``
    switch is off. ``prefix`` defaults to the value ``root_dir`` had when
    this function was defined.
    """
    if anonymizer:
        masked = str(path).replace(str(prefix), "<living-park>")
        return Path(masked)
    return path


# Report the working location, then derive the input (stats) and output folders from it.
display(f"Running in root dir: {anondir(root_dir)}")
stats_dir = Path(root_dir).joinpath("stats_QCed", "sampled")
print(f"Stats directory: {anondir(stats_dir)}")

# The output folder is created up front so later cells can write CSV files into it.
output_dir = root_dir.joinpath("partial_correlation_ieee")
output_dir.mkdir(exist_ok=True, parents=True)
print(f"Output directory: {anondir(output_dir)}")

'Running in root dir: <living-park>'

Stats directory: <living-park>/stats_QCed/sampled
Output directory: <living-park>/partial_correlation_ieee


In [2]:
def get_cohort_stats():
    """Load the QCed longitudinal cohort and keep the PD-non-MCI rows.

    Reads ``cohort/longitudinal_cohort_qced.csv`` under ``root_dir``, derives
    ``PD_status`` from ``dx_group``, keeps only rows whose ``dx_group`` is
    ``"PD-non-MCI"``, renames ``NP3TOT_change`` to ``UPDRS_change`` and prints
    subject counts.

    Returns:
        The filtered table restricted to the clinical columns listed in
        ``keep`` below.
    """
    cohort_path = root_dir / "cohort" / "longitudinal_cohort_qced.csv"
    cohort = pd.read_csv(cohort_path)
    print(f"Load cohort stats: {anondir(cohort_path)}")

    cohort["PD_status"] = cohort["dx_group"].replace({"PD-non-MCI": "PD", "HC": "HC"})
    selected = cohort[cohort["dx_group"] == "PD-non-MCI"].rename(
        columns={"NP3TOT_change": "UPDRS_change"}
    )

    def count_subjects(status):
        # Unique participants carrying the given PD_status label.
        return selected.loc[selected["PD_status"] == status, "PATNO"].nunique()

    # Counts are taken after the PD-non-MCI filter, so they describe the
    # filtered table rather than the full CSV.
    print(f"Number of PD-non-MCI subjects: {count_subjects('PD')}")
    print(f"Number of HC subjects: {count_subjects('HC')}")
    print(f"Total number of subjects: {selected['PATNO'].nunique()}")

    keep = [
        "PATNO",
        "first_visit",
        "second_visit",
        "PD_status",
        "SEX",
        "AGE_AT_VISIT",
        "UPDRS_change",
        "durationT2_T1_y",
    ]
    return selected[keep]


# Clinical table shared by every partial-correlation cell below.
df_clinical = get_cohort_stats()

Load cohort stats: <living-park>/cohort/longitudinal_cohort_qced.csv
Number of PD-non-MCI subjects: 112
Number of HC subjects: 0
Total number of subjects: 112


## Partial correlation

In [None]:
def read_table(filename, hemi, measure):
    """Read one per-hemisphere aparc table and normalise its column names.

    Args:
        filename: Path of the tab-separated table to read.
        hemi: Hemisphere label (e.g. ``"lh"`` or ``"rh"``); stored in a new
            ``hemi`` column and stripped from the column names.
        measure: Measure name (e.g. ``"thickness"``); the ``_<measure>``
            part is stripped from the column names.

    Returns:
        DataFrame whose ``aparc.<measure>`` identifier column is renamed to
        ``PATNO_id``, with one column per remaining table column and a
        trailing ``hemi`` column.
    """
    # Read the file the caller asked for. The path used to be rebuilt from
    # hemi/measure, which silently ignored the `filename` argument.
    df = pd.read_csv(filename, sep="\t")
    df["hemi"] = hemi

    def clean(column):
        # "lh.aparc.thickness" -> "aparc.thickness"; "lh_bankssts_thickness" -> "bankssts"
        return (
            column.replace(f"{hemi}.", "")
            .replace(f"{hemi}_", "")
            .replace(f"_{measure}", "")
        )

    df.columns = [clean(c) for c in df.columns]
    return df.rename(columns={f"aparc.{measure}": "PATNO_id"})


def read_measure(measure):
    """Stack the left- and right-hemisphere aparc tables for *measure*.

    Rows of the left hemisphere come first, followed by the right hemisphere.
    """
    per_hemisphere = [
        read_table(f"table_ieee/{hemi}.aparc.{measure}.tsv", hemi, measure)
        for hemi in ("lh", "rh")
    ]
    return pd.concat(per_hemisphere, axis=0)


def get_metric_visit(metric, cohort_df, visit):
    """Return the long-format cortical metric for one visit joined with clinical data.

    Args:
        metric: Measure name passed to ``read_measure``; also the name of the
            value column in the result.
        cohort_df: Clinical table holding ``first_visit`` / ``second_visit``
            identifiers and the covariate columns.
        visit: ``1`` for the first visit, ``2`` for the second.

    Returns:
        One row per (scan, hemisphere, region) whose ``PATNO_id`` matches the
        selected visit column of ``cohort_df``.

    Raises:
        ValueError: If ``visit`` is neither 1 nor 2.
    """
    if visit not in (1, 2):
        raise ValueError("Visit must be 1 or 2")

    if visit == 1:
        visit_col = "first_visit"
    else:
        visit_col = "second_visit"

    # Wide table (one column per region) -> long table (one row per region).
    long_df = read_measure(metric).melt(
        id_vars=["PATNO_id", "hemi"],
        var_name="region",
        value_name=metric,
    )

    clinical = cohort_df[
        [visit_col, "AGE_AT_VISIT", "SEX", "durationT2_T1_y", "UPDRS_change"]
    ]
    merged = long_df.merge(
        clinical,
        left_on="PATNO_id",
        right_on=visit_col,
        how="inner",
    )

    # Force numeric dtypes; values that cannot be parsed become NaN.
    for col in (metric, "AGE_AT_VISIT", "SEX", "UPDRS_change", "durationT2_T1_y"):
        if col not in merged.columns:
            continue
        merged[col] = pd.to_numeric(merged[col], errors="coerce")

    return merged


def get_longitudinal_metric(metric, cohort_df):
    """Pair both visits per subject/region/hemisphere and add the relative change.

    The change column ``<metric>_change`` is ``(next - baseline) / baseline``.
    Baseline-suffixed columns are dropped afterwards and the ``_next`` suffix
    is removed, so the remaining clinical columns come from the second-visit
    merge.

    Raises:
        ValueError: If either visit has no data, or no record can be paired.
    """
    baseline = get_metric_visit(metric, cohort_df=cohort_df, visit=1)
    followup = get_metric_visit(metric, cohort_df=cohort_df, visit=2)

    checks = (
        (baseline, "No baseline data available"),
        (followup, "No longitudinal data available"),
    )
    for frame, message in checks:
        if frame.empty:
            raise ValueError(message)

    # The subject id is the part of the scan id before the first underscore.
    for frame in (baseline, followup):
        frame["PATNO"] = frame["PATNO_id"].str.split("_").str[0]

    paired = baseline.merge(
        followup,
        on=["PATNO", "region", "hemi"],
        suffixes=("_baseline", "_next"),
    )
    if paired.empty:
        raise ValueError("No matching records found between baseline and next visit")

    before = paired[f"{metric}_baseline"]
    after = paired[f"{metric}_next"]
    paired[f"{metric}_change"] = (after - before) / before

    baseline_columns = [name for name in paired.columns if name.endswith("_baseline")]
    paired = paired.drop(columns=baseline_columns)
    return paired.rename(columns=lambda name: name.replace("_next", ""))

### Cortical

In [None]:
from tqdm import tqdm
import pandas as pd
import pingouin as pg


def compute_partial_correlation(metric, clinical_df, force=False):
    """Partial correlation between a cortical metric's change and the UPDRS change.

    For every hemisphere/region pair, correlates ``<metric>_change`` with
    ``UPDRS_change`` (Pearson) while controlling for ``AGE_AT_VISIT``, ``SEX``
    and ``durationT2_T1_y``. The result is also written to
    ``partial_correlation_longitudinal_<metric>.csv`` in ``output_dir``.

    Args:
        metric: Cortical measure name understood by ``get_longitudinal_metric``.
        clinical_df: Clinical table forwarded to ``get_longitudinal_metric``.
        force: Currently unused; kept so existing calls that pass it keep working.

    Returns:
        DataFrame with columns ``region``, ``hemisphere``, ``r``, ``p-val``
        and ``n``, one row per hemisphere/region pair.
    """
    df = get_longitudinal_metric(metric, clinical_df)

    columns = ["region", "hemisphere", "r", "p-val", "n"]
    covariates = ["AGE_AT_VISIT", "SEX", "durationT2_T1_y"]
    regions = df["region"].unique()

    # Collect rows in a list and build the frame once: repeated
    # `.loc[len(df)] = ...` appends are slow and leave object-dtype columns.
    rows = []
    for hemi in ("lh", "rh"):
        hemi_df = df[df["hemi"] == hemi]
        for region in regions:
            data = hemi_df[hemi_df["region"] == region]
            pc = pg.partial_corr(
                data=data,
                x=f"{metric}_change",
                y="UPDRS_change",
                covar=covariates,
                method="pearson",
            )
            # The result is indexed by the method label, not by integers, so
            # read the single row by position explicitly. Plain `[0]` relies
            # on a positional fallback that pandas has deprecated.
            rows.append(
                [
                    region,
                    hemi,
                    pc["r"].iloc[0],
                    pc["p-val"].iloc[0],
                    pc["n"].iloc[0],
                ]
            )

    partial_correlation_df = pd.DataFrame(rows, columns=columns)

    filename = output_dir / f"partial_correlation_longitudinal_{metric}.csv"
    partial_correlation_df.to_csv(filename, index=False)

    return partial_correlation_df

In [None]:
# One result table (and one CSV in output_dir) per cortical measure.
pcorr_thickness = compute_partial_correlation(
    metric="thickness", clinical_df=df_clinical, force=True
)
pcorr_area = compute_partial_correlation(
    metric="area", clinical_df=df_clinical, force=True
)
pcorr_volume = compute_partial_correlation(
    metric="volume", clinical_df=df_clinical, force=True
)

In [None]:
# Thickness regions with p < 0.05 (no multiple-comparison correction is applied here), highest r first.
pcorr_thickness.loc[pcorr_thickness["p-val"] < 0.05].sort_values("r", ascending=False)

Unnamed: 0,region,hemisphere,r,p-val,n
65,superiortemporal,rh,-0.218192,0.029196,103
11,lingual,lh,-0.222207,0.026284,103


In [None]:
# Area regions with p < 0.05 (no multiple-comparison correction is applied here), highest r first.
pcorr_area.loc[pcorr_area["p-val"] < 0.05].sort_values("r", ascending=False)

Unnamed: 0,region,hemisphere,r,p-val,n
26,superiorfrontal,lh,-0.207496,0.038314,103


In [None]:
# Volume regions with p < 0.05 (no multiple-comparison correction is applied here), highest r first.
pcorr_volume.loc[pcorr_volume["p-val"] < 0.05].sort_values("r", ascending=False)

Unnamed: 0,region,hemisphere,r,p-val,n
16,parsopercularis,lh,-0.206751,0.039029,103
21,posteriorcingulate,lh,-0.216721,0.030328,103
57,posteriorcingulate,rh,-0.247867,0.012903,103
64,superiortemporal,rh,-0.293217,0.00307,103


## Subcortical Volume


In [None]:
import numpy as np


def get_subcortical_volume_visit(cohort_df, visit):
    """Return long-format subcortical volumes for one visit joined with clinical data.

    Reads ``table_ieee/aseg.volume.tsv`` (relative to the working directory),
    reshapes it to one row per (scan, region) and inner-joins it with the
    clinical table on the selected visit identifier.

    Args:
        cohort_df: Clinical table holding ``first_visit`` / ``second_visit``
            identifiers and the covariate columns.
        visit: ``1`` for the first visit, ``2`` for the second.

    Returns:
        DataFrame with ``PATNO_id``, ``region``, ``volume``, the visit column
        and the clinical covariates.

    Raises:
        ValueError: If ``visit`` is neither 1 nor 2.
    """
    # Validate before touching the disk so a bad argument fails fast and is
    # not masked by a file-reading error.
    if visit not in [1, 2]:
        raise ValueError("Visit must be 1 or 2")

    df = pd.read_csv("table_ieee/aseg.volume.tsv", sep="\t")
    df = df.rename(columns={"Measure:volume": "PATNO_id"})

    # Wide table (one column per structure) -> long table (one row per structure).
    df = df.melt(id_vars=["PATNO_id"], var_name="region", value_name="volume")

    visit_col = "first_visit" if visit == 1 else "second_visit"

    clinical_columns = [
        visit_col,
        "AGE_AT_VISIT",
        "SEX",
        "UPDRS_change",
        "durationT2_T1_y",
    ]

    merged_df = pd.merge(
        df,
        cohort_df[clinical_columns],
        left_on="PATNO_id",
        right_on=visit_col,
        how="inner",
    )

    # Force numeric dtypes; values that cannot be parsed become NaN. SEX is
    # coerced as well, matching get_metric_visit, because it is later used as
    # a covariate.
    numeric_cols = ["volume", "AGE_AT_VISIT", "SEX", "UPDRS_change", "durationT2_T1_y"]
    for col in numeric_cols:
        if col in merged_df.columns:
            merged_df[col] = pd.to_numeric(merged_df[col], errors="coerce")

    return merged_df


def get_pcorr_subcortical_volume_longitudinal(cohort_df):
    """Partial correlation between subcortical volume change and UPDRS change.

    Pairs both visits per subject/region, computes the relative volume change
    ``(next - baseline) / baseline`` and, for every region, correlates it with
    ``UPDRS_change`` (Pearson) while controlling for ``AGE_AT_VISIT``, ``SEX``
    and ``durationT2_T1_y``. The result is also written to
    ``partial_correlation_longitudinal_subcortical_volume.csv`` in
    ``output_dir``.

    Args:
        cohort_df: Clinical table forwarded to ``get_subcortical_volume_visit``.

    Returns:
        DataFrame with columns ``region``, ``r``, ``p-val`` and ``n``. A
        region whose correlation fails gets NaN for ``r`` and ``p-val``, and
        ``n`` is then the number of rows available for that region.

    Raises:
        ValueError: If either visit has no data, or no record can be paired.
    """
    baseline_df = get_subcortical_volume_visit(cohort_df, visit=1)
    next_df = get_subcortical_volume_visit(cohort_df, visit=2)

    if baseline_df.empty:
        raise ValueError("No baseline data available")
    if next_df.empty:
        raise ValueError("No longitudinal data available")

    # The subject id is the part of the scan id before the first underscore.
    baseline_df["PATNO"] = baseline_df["PATNO_id"].str.split("_").str[0]
    next_df["PATNO"] = next_df["PATNO_id"].str.split("_").str[0]

    # Compute change
    columns_to_merge = ["PATNO", "region"]

    change_df = pd.merge(
        baseline_df,
        next_df,
        on=columns_to_merge,
        suffixes=("_baseline", "_next"),
    )

    if change_df.empty:
        raise ValueError("No matching records found between baseline and next visit")

    change_df["volume_change"] = (
        change_df["volume_next"] - change_df["volume_baseline"]
    ) / change_df["volume_baseline"]

    change_df = change_df.drop(columns=change_df.filter(regex="_baseline$").columns)
    change_df = change_df.rename(columns=lambda x: x.replace("_next", ""))

    columns = ["region", "r", "p-val", "n"]
    covariates = ["AGE_AT_VISIT", "SEX", "durationT2_T1_y"]

    # Collect rows in a list and build the frame once: repeated
    # `.loc[len(df)] = ...` appends are slow and leave object-dtype columns.
    rows = []
    for region in change_df["region"].unique():
        # Selected outside the try block so it is always defined for the
        # fallback row below.
        data = change_df[change_df["region"] == region]
        try:
            pc = pg.partial_corr(
                data=data,
                x="volume_change",
                y="UPDRS_change",
                covar=covariates,
                method="pearson",
            )
            # The result is indexed by the method label, so read the single
            # row by position explicitly instead of the deprecated `[0]`.
            r = pc["r"].iloc[0]
            pval = pc["p-val"].iloc[0]
            n = pc["n"].iloc[0]
        except Exception as e:
            # Best effort: keep going with the other regions. The fallback
            # used to reuse `n`, which was unbound if the first region failed
            # and stale (from the previous region) otherwise.
            print(f"Error processing region {region}: {e}")
            r, pval, n = np.nan, np.nan, len(data)
        rows.append([region, r, pval, n])

    partial_correlation_df = pd.DataFrame(rows, columns=columns)

    filename = output_dir / "partial_correlation_longitudinal_subcortical_volume.csv"
    partial_correlation_df.to_csv(filename, index=False)

    return partial_correlation_df


# Result table (also saved as CSV in output_dir) for the subcortical volumes.
pcorr_subcortical_volume = get_pcorr_subcortical_volume_longitudinal(
    cohort_df=df_clinical
)

In [None]:
# Subcortical regions with p < 0.05 (no multiple-comparison correction is applied here), highest r first.
pcorr_subcortical_volume.loc[pcorr_subcortical_volume["p-val"] < 0.05].sort_values(
    "r", ascending=False
)

Unnamed: 0,region,r,p-val,n
16,Left-vessel,0.2066,0.039176,103
