# Statistical tests for consistency tests

In [1]:
import pandas as pd
from pathlib import Path

anonymizer = True

root_dir = Path.cwd()


def anondir(path: Path, prefix=root_dir) -> Path:
    """Anonymize a directory path by replacing user-specific parts with <root>."""
    if not anonymizer:
        return path
    path_str = str(path).replace(str(prefix), "<living-park>")
    return Path(path_str)


print(f"Root directory: {anondir(root_dir)}")
output_dir = root_dir / "inconsistency"
output_dir.mkdir(parents=True, exist_ok=True)
print(f"Output dir: {anondir(output_dir)}")

Root directory: <living-park>
Output dir: <living-park>/inconsistency


## Ansari-Bradley test

Perform the Ansari-Bradley test for equal scale parameters.

The Ansari-Bradley test is a non-parametric test for the equality of the scale parameter of the distributions from which two samples were drawn. The null hypothesis states that the ratio of the scale of the distribution underlying x to the scale of the distribution underlying y is 1.

In [2]:
from scipy.stats import ansari
from statsmodels.stats.multitest import fdrcorrection, multipletests

subcortical_regions = [
    "Left-Thalamus",
    "Left-Caudate",
    "Left-Putamen",
    "Left-Pallidum",
    "Left-Hippocampus",
    "Left-Amygdala",
    "Left-Accumbens-area",
    "Right-Thalamus",
    "Right-Caudate",
    "Right-Putamen",
    "Right-Pallidum",
    "Right-Hippocampus",
    "Right-Amygdala",
    "Right-Accumbens-area",
]

cortical_regions = [
    "bankssts",
    "caudalanteriorcingulate",
    "caudalmiddlefrontal",
    "cuneus",
    "entorhinal",
    "fusiform",
    "inferiorparietal",
    "inferiortemporal",
    "isthmuscingulate",
    "lateraloccipital",
    "lateralorbitofrontal",
    "lingual",
    "medialorbitofrontal",
    "middletemporal",
    "parahippocampal",
    "paracentral",
    "parsopercularis",
    "parsorbitalis",
    "parstriangularis",
    "pericalcarine",
    "postcentral",
    "posteriorcingulate",
    "precentral",
    "precuneus",
    "rostralanteriorcingulate",
    "rostralmiddlefrontal",
    "superiorfrontal",
    "superiorparietal",
    "superiortemporal",
    "supramarginal",
    "frontalpole",
    "temporalpole",
    "transversetemporal",
    "insula",
]

def compare_variances(sample_low, sample_high):
    # Perform Ansari-Bradley test for variance sample_high > variance sample_low
    ansari_result = ansari(sample_low, sample_high, alternative="less")
    return {
        "test_name": "Ansari-Bradley",
        "statistic": ansari_result.statistic,
        "p_value": ansari_result.pvalue,
        "is_greater": ansari_result.pvalue < 0.05,
    }


from pathlib import Path
import pandas as pd
import numpy as np

# expects: root_dir, subcortical_regions, cortical_regions, compare_variances


def compare_variances_all(
    test: str, metric: str, regions: list[str] | None = None
) -> pd.DataFrame:
    """
    Compare dispersion (e.g., variance) between baseline and longitudinal MCA samples
    for each region (and hemisphere when present). Works for both subcortical and
    cortical metrics by auto-detecting a 'hemisphere' column.

    Parameters
    ----------
    test : {"ancova", "partial_correlation"}
        Which statistical test produced the samples. Chooses coef: 'F' for ancova, 'r' otherwise.
    metric : str
        Metric name used in filenames, e.g. 'subcortical_volume', 'area', 'thickness', 'volume'.
    regions : list[str] | None
        Optional region list to restrict/override. If None, uses cortical_regions when
        hemisphere column exists, else subcortical_regions.

    Returns
    -------
    pd.DataFrame
        One row per (region[, hemisphere]) with the result from compare_variances(...)
        plus identifying columns.
    """
    # --- load data ---
    base_path = Path(root_dir) / test
    f_baseline = base_path / f"{test}_baseline_{metric}.parquet"
    f_longitudinal = base_path / f"{test}_longitudinal_{metric}.parquet"

    if not f_baseline.exists() or not f_longitudinal.exists():
        raise FileNotFoundError(f"Missing parquet(s): {f_baseline} or {f_longitudinal}")

    df_base = pd.read_parquet(f_baseline)
    df_long = pd.read_parquet(f_longitudinal)

    # --- choose coefficient ---
    coef = "F" if test == "ancova" else "r"
    if coef not in df_base.columns or coef not in df_long.columns:
        raise KeyError(f"Column '{coef}' not found in input Parquet files.")

    # --- detect hemispheres / choose region list ---
    has_hemi = ("hemisphere" in df_base.columns) and ("hemisphere" in df_long.columns)

    if regions is None:
        regions = cortical_regions if has_hemi else subcortical_regions

    # --- helper: center vector safely ---
    def _center(arr: np.ndarray) -> np.ndarray:
        if arr.size == 0:
            return arr
        m = np.nanmean(arr)
        return arr - (0.0 if np.isnan(m) else m)

    rows = []

    if has_hemi:
        # Cortical-style: iterate region × hemisphere
        # Use intersection of hemispheres present in both tables
        hemis = sorted(
            set(df_base["hemisphere"].unique()) & set(df_long["hemisphere"].unique())
        )
        for region in regions:
            for hemi in hemis:
                base_vals = df_base[
                    (df_base["region"] == region) & (df_base["hemisphere"] == hemi)
                ][coef].to_numpy()
                long_vals = df_long[
                    (df_long["region"] == region) & (df_long["hemisphere"] == hemi)
                ][coef].to_numpy()

                # drop NaNs
                base_vals = base_vals[~np.isnan(base_vals)]
                long_vals = long_vals[~np.isnan(long_vals)]

                if base_vals.size == 0 or long_vals.size == 0:
                    continue

                base_vals = _center(base_vals)
                long_vals = _center(long_vals)

                res = compare_variances(
                    base_vals, long_vals
                )  # expected to return a dict-like
                res = dict(res)  # ensure mutable
                res["region"] = region
                res["hemisphere"] = hemi
                res["test"] = test
                res["metric"] = metric
                rows.append(res)
    else:
        # Subcortical-style: iterate region only
        for region in regions:
            base_vals = df_base[df_base["region"] == region][coef].to_numpy()
            long_vals = df_long[df_long["region"] == region][coef].to_numpy()

            base_vals = base_vals[~np.isnan(base_vals)]
            long_vals = long_vals[~np.isnan(long_vals)]

            if base_vals.size == 0 or long_vals.size == 0:
                continue

            base_vals = _center(base_vals)
            long_vals = _center(long_vals)

            res = compare_variances(base_vals, long_vals)
            res = dict(res)
            res["region"] = region
            res["test"] = test
            res["metric"] = metric
            rows.append(res)

    if not rows:
        return pd.DataFrame()

    return pd.DataFrame(rows).reset_index(drop=True)




## Subcortical

In [3]:
df = pd.DataFrame()
metric = "subcortical_volume"
for analysis in ["ancova", "partial_correlation"]:
    results = compare_variances_all(analysis, metric)
    results_less = results[results["is_greater"]]
    perc = results_less.size / results.size if results.size > 0 else 0
    results["analysis"] = analysis
    results["metric"] = metric
    fdr_significance, fdr_pvalue = fdrcorrection(results.p_value)
    bonferroni = multipletests(results.p_value, method="bonferroni")[-1]
    print(
        f"{analysis} {metric} regions with variance lower than baseline: {results_less.size} / {results.size} ({perc:<.2%})"
    )
    print(
        f"{analysis} {metric} regions with variance lower than baseline: {results_less.shape[0]} / {results.shape[0]} ({results_less.shape[0]/results.shape[0]:<.2%})"
    )
    results["fdr"] = fdr_pvalue
    results["fdr_significance"] = fdr_significance
    results["bonferroni"] = bonferroni
    df = pd.concat([df, results])

df_subcortical_ansari = df

ancova subcortical_volume regions with variance lower than baseline: 14 / 112 (14.29%)
ancova subcortical_volume regions with variance lower than baseline: 2 / 14 (14.29%)
partial_correlation subcortical_volume regions with variance lower than baseline: 98 / 112 (100.00%)
partial_correlation subcortical_volume regions with variance lower than baseline: 14 / 14 (100.00%)


## Cortical

In [4]:
df = pd.DataFrame()
for analysis in ["ancova", "partial_correlation"]:
    for metric in ["thickness", "area", "volume"]:
        results = compare_variances_all(analysis, metric)
        results['analysis'] = analysis
        results['metric'] = metric
        fdr_significance, fdr_pvalue = fdrcorrection(results.p_value)    
        bonferroni = multipletests(results.p_value, method='bonferroni')[-1]
        results['fdr'] = fdr_pvalue
        results['fdr_significance'] = fdr_significance
        results['bonferroni'] = bonferroni
        results_less = results[results["is_greater"]]
        print(
            f"{analysis} {metric} regions with variance lower than baseline: {results_less.shape[0]} / {results.shape[0]} ({results_less.shape[0]/results.shape[0]:<.2%})"
        )
        df = pd.concat([df, results])

df_cortical_ansari = df

ancova thickness regions with variance lower than baseline: 33 / 68 (48.53%)


ancova area regions with variance lower than baseline: 40 / 68 (58.82%)
ancova volume regions with variance lower than baseline: 26 / 68 (38.24%)
partial_correlation thickness regions with variance lower than baseline: 61 / 68 (89.71%)
partial_correlation area regions with variance lower than baseline: 65 / 68 (95.59%)
partial_correlation volume regions with variance lower than baseline: 65 / 68 (95.59%)


## Save results

In [7]:
df_ansari = pd.concat([df_cortical_ansari, df_subcortical_ansari])
filename = output_dir / "variance_baseline_longitudinal_comparison.csv"
output_dir.mkdir(parents=True, exist_ok=True)
df_ansari.to_csv(filename, index=False)
df_ansari

Unnamed: 0,test_name,statistic,p_value,is_greater,region,hemisphere,test,metric,analysis,fdr,fdr_significance,bonferroni
0,Ansari-Bradley,454.0,4.842454e-05,True,bankssts,lh,ancova,thickness,ancova,1.431682e-04,True,0.000735
1,Ansari-Bradley,469.0,2.660440e-06,True,bankssts,rh,ancova,thickness,ancova,9.521575e-06,True,0.000735
2,Ansari-Bradley,322.0,8.587203e-01,False,caudalanteriorcingulate,lh,ancova,thickness,ancova,1.000000e+00,False,0.000735
3,Ansari-Bradley,501.0,3.140859e-10,True,caudalanteriorcingulate,rh,ancova,thickness,ancova,5.339460e-09,True,0.000735
4,Ansari-Bradley,479.0,2.635516e-07,True,caudalmiddlefrontal,lh,ancova,thickness,ancova,1.120094e-06,True,0.000735
...,...,...,...,...,...,...,...,...,...,...,...,...
9,Ansari-Bradley,489.0,1.749623e-08,True,Right-Putamen,,partial_correlation,subcortical_volume,partial_correlation,8.164908e-08,True,0.003571
10,Ansari-Bradley,469.0,2.660440e-06,True,Right-Pallidum,,partial_correlation,subcortical_volume,partial_correlation,6.207693e-06,True,0.003571
11,Ansari-Bradley,493.0,5.120714e-09,True,Right-Hippocampus,,partial_correlation,subcortical_volume,partial_correlation,6.704812e-08,True,0.003571
12,Ansari-Bradley,408.0,1.888417e-02,True,Right-Amygdala,,partial_correlation,subcortical_volume,partial_correlation,1.888417e-02,True,0.003571
