# Display statistics about consitency analaysis

In [1]:
import pandas as pd
from pathlib import Path

anonymizer = False

root_dir = Path.cwd().parent.parent


def anondir(path: Path, prefix=root_dir) -> Path:
    """Anonymize a directory path by replacing user-specific parts with <root>."""
    if not anonymizer:
        return path
    path_str = str(path).replace(str(prefix), "<living-park>")
    return Path(path_str)


print(f"Root directory: {anondir(root_dir)}")
output_dir = root_dir / "results" / "inconsistency"
output_dir.mkdir(parents=True, exist_ok=True)
print(f"Output dir: {anondir(output_dir)}")
input_dir = root_dir / "results" / "longitudinal" 
print(f"Input dir: {anondir(input_dir)}")

Root directory: /home/yohan/Work/livingpark-numerical-variability/freesurfer-fuzzy
Output dir: /home/yohan/Work/livingpark-numerical-variability/freesurfer-fuzzy/results/inconsistency
Input dir: /home/yohan/Work/livingpark-numerical-variability/freesurfer-fuzzy/results/longitudinal


## Subcortical volume

In [2]:
import os
from pathlib import Path
import pandas as pd

# --- configuration ---
regions = [
    "Left-Thalamus",
    "Left-Caudate",
    "Left-Putamen",
    "Left-Pallidum",
    "Left-Hippocampus",
    "Left-Amygdala",
    "Left-Accumbens-area",
    "Right-Thalamus",
    "Right-Caudate",
    "Right-Putamen",
    "Right-Pallidum",
    "Right-Hippocampus",
    "Right-Amygdala",
    "Right-Accumbens-area",
]


TESTS = ["ancova", "partial_correlation"]
ANALYSES = ["baseline", "longitudinal"]
METRIC = "subcortical_volume"


def print_stats_subcortical_volume(metric: str = METRIC) -> pd.DataFrame:
    """
    Load CSVs for the given metric, keep the target regions, and compute
    a 'fluctuating' flag: proportion_significant strictly between 0 and 1.
    Returns a tidy dataframe with columns: region, test, analysis, fluctuating.
    """
    rows = []
    for test in TESTS:
        for analysis in ANALYSES:
            csv_path = input_dir / test / f"{test}_{analysis}_{metric}_stats.csv"
            if not csv_path.exists():
                print(f"Warning: missing file {anondir(csv_path)}")
                # Skip missing files rather than crashing
                continue            
            df = pd.read_csv(csv_path)
            df = df[df["region"].isin(regions)].copy()

            # Ensure numeric then flag strictly between 0 and 1
            ps = pd.to_numeric(df["proportion_significant"], errors="coerce")
            df["fluctuating"] = (0 < ps) & (ps < 1)

            df["test"] = test
            df["analysis"] = analysis
            rows.append(df[["region", "test", "analysis", "fluctuating"]])

    if not rows:
        # Return an empty, well-typed frame if nothing was found
        return pd.DataFrame(columns=["region", "test", "analysis", "fluctuating"])

    return pd.concat(rows, ignore_index=True)


def print_stats_summary() -> pd.DataFrame:
    """
    Summarize fluctuations by:
      - baseline (across tests)
      - longitudinal (across tests)
      - ancova (across analyses)
      - partial_correlation (across analyses)
      - at_least_one_fluctuation (across all four)
    For each category, report:
      - n_fluctuating_pairs (sum over relevant pairs)
      - n_possible_pairs (regions x number of pairs in the category)
      - percentage = 100 * n_fluctuating_pairs / n_possible_pairs
    """
    tidy = print_stats_subcortical_volume(METRIC)
    if tidy.empty:
        return pd.DataFrame(
            columns=[
                "category",
                "n_fluctuating_pairs",
                "n_possible_pairs",
                "percentage",
            ]
        )

    # Pivot to have one column per pair (test_analysis) with boolean values
    tidy["pair"] = tidy["test"] + "_" + tidy["analysis"]
    pivot = tidy.pivot_table(
        index="region",
        columns="pair",
        values="fluctuating",
        aggfunc="any",  # if multiple rows per region/pair, treat as any fluctuation
        fill_value=False,
    ).astype(bool)

    # Ensure all four expected columns exist
    for col in [
        "ancova_baseline",
        "ancova_longitudinal",
        "partial_correlation_baseline",
        "partial_correlation_longitudinal",
    ]:
        if col not in pivot.columns:
            pivot[col] = False

    n_regions = pivot.shape[0]

    # Helper to compute counts & percentage for a set of pair columns
    def summarize(category_name: str, pair_cols: list[str]) -> dict:
        n_fluct = (
            pivot[pair_cols].to_numpy().sum()
        )  # sums True over all regions & pairs
        n_pairs = n_regions * len(pair_cols)
        pct = 100.0 * n_fluct / n_pairs if n_pairs > 0 else 0.0
        return {
            "category": category_name,
            "n_fluctuating_pairs": int(n_fluct),
            "n_possible_pairs": int(n_pairs),
            "percentage": round(pct, 0),
        }

    # Define categories as sets of pairs
    ab = "ancova_baseline"
    al = "ancova_longitudinal"
    pb = "partial_correlation_baseline"
    pl = "partial_correlation_longitudinal"

    summaries = [
        summarize("ancova baseline", [ab]),
        summarize("ancova longitudinal", [al]),
        summarize("partial_correlation baseline", [pb]),
        summarize("partial_correlation longitudinal",[pl]),
        summarize("baseline", [ab, pb]),
        summarize("longitudinal", [al, pl]),
        summarize("ancova", [ab, al]),
        summarize("partial_correlation", [pb, pl]),
        summarize("at_least_one_fluctuation", [ab, al, pb, pl]),
    ]

    return pd.DataFrame(summaries)

In [3]:
sv = print_stats_summary()
sv['metric'] = "subcortical_volume"
sv

Unnamed: 0,category,n_fluctuating_pairs,n_possible_pairs,percentage,metric
0,ancova baseline,4,14,29.0,subcortical_volume
1,ancova longitudinal,2,14,14.0,subcortical_volume
2,partial_correlation baseline,4,14,29.0,subcortical_volume
3,partial_correlation longitudinal,5,14,36.0,subcortical_volume
4,baseline,8,28,29.0,subcortical_volume
5,longitudinal,7,28,25.0,subcortical_volume
6,ancova,6,28,21.0,subcortical_volume
7,partial_correlation,9,28,32.0,subcortical_volume
8,at_least_one_fluctuation,15,56,27.0,subcortical_volume


## Cortical region

In [4]:
import os
from pathlib import Path
import pandas as pd

# ---- configuration ----
cortical_regions = [
    "bankssts",
    "caudalanteriorcingulate",
    "caudalmiddlefrontal",
    "cuneus",
    "entorhinal",
    "fusiform",
    "inferiorparietal",
    "inferiortemporal",
    "isthmuscingulate",
    "lateraloccipital",
    "lateralorbitofrontal",
    "lingual",
    "medialorbitofrontal",
    "middletemporal",
    "parahippocampal",
    "paracentral",
    "parsopercularis",
    "parsorbitalis",
    "parstriangularis",
    "pericalcarine",
    "postcentral",
    "posteriorcingulate",
    "precentral",
    "precuneus",
    "rostralanteriorcingulate",
    "rostralmiddlefrontal",
    "superiorfrontal",
    "superiorparietal",
    "superiortemporal",
    "supramarginal",
    "frontalpole",
    "temporalpole",
    "transversetemporal",
    "insula",
]


TESTS = ["ancova", "partial_correlation"]
ANALYSES = ["baseline", "longitudinal"]


def _ensure_cols(df: pd.DataFrame, cols: list[str]) -> None:
    miss = [c for c in cols if c not in df.columns]
    if miss:
        raise ValueError(f"Missing required columns in CSV: {miss}")


def print_stats_cortical(metric: str) -> pd.DataFrame:
    """
    Load CSVs for cortical *volumes* metric, filter to cortical_regions,
    compute 'fluctuating' = proportion_significant strictly between 0 and 1.
    Returns tidy dataframe with columns: region, hemisphere, test, analysis, fluctuating.
    Skips any missing CSVs gracefully.
    """
    print(f"Proportion of regions fluctuating for {metric}")
    rows = []
    for test in TESTS:
        for analysis in ANALYSES:
            csv_path = input_dir / test / f"{test}_{analysis}_{metric}_stats.csv"
            if not csv_path.exists():
                print(f"Warning: missing file {anondir(csv_path)}")
                # Skip missing files rather than crashing
                continue

            df = pd.read_csv(csv_path)
            _ensure_cols(df, ["region", "proportion_significant"])

            # Keep only targeted cortical regions (region names without hemi)
            df = df[df["region"].isin(cortical_regions)].copy()

            # Hemispheres (if not present, create a placeholder)
            if "hemisphere" not in df.columns:
                df["hemisphere"] = "both"  # or set to "" if you prefer

            # Compute strict in-between flag; coerce to numeric first
            ps = pd.to_numeric(df["proportion_significant"], errors="coerce")
            df["fluctuating"] = (0 < ps) & (ps < 1)

            df["test"] = test
            df["analysis"] = analysis

            rows.append(df[["region", "hemisphere", "test", "analysis", "fluctuating"]])

    if not rows:
        return pd.DataFrame(
            columns=["region", "hemisphere", "test", "analysis", "fluctuating"]
        )

    return pd.concat(rows, ignore_index=True)


def print_stats_summary(metric: str) -> pd.DataFrame:
    """
    Summarize fluctuations by categories:
      - baseline (across tests)
      - longitudinal (across tests)
      - ancova (across analyses)
      - partial_correlation (across analyses)
      - at_least_one_fluctuation (across all four)
    Returns a dataframe with counts and percentages per category.
    """
    tidy = print_stats_cortical(metric)
    if tidy.empty:
        return pd.DataFrame(
            columns=[
                "category",
                "n_fluctuating_pairs",
                "n_possible_pairs",
                "percentage",
            ]
        )

    tidy["pair"] = tidy["test"] + "_" + tidy["analysis"]
    pivot = tidy.pivot_table(
        index=["region", "hemisphere"],
        columns="pair",
        values="fluctuating",
        aggfunc="any",  # any fluctuation across potential duplicates
        fill_value=False,
    ).astype(bool)

    # Ensure all four expected pair columns exist
    expected = [
        "ancova_baseline",
        "ancova_longitudinal",
        "partial_correlation_baseline",
        "partial_correlation_longitudinal",
    ]
    for col in expected:
        if col not in pivot.columns:
            pivot[col] = False

    n_units = pivot.shape[0]  # region×hemisphere units

    def summarize(category: str, pair_cols: list[str]) -> dict:
        n_fluct = (
            pivot[pair_cols].to_numpy().sum()
        )  # sum of True across all units×pairs
        n_pairs = n_units * len(pair_cols)
        pct = 100.0 * n_fluct / n_pairs if n_pairs else 0.0
        return {
            "category": category,
            "n_fluctuating_pairs": int(n_fluct),
            "n_possible_pairs": int(n_pairs),
            "percentage": round(pct, 0),
        }

    ab = "ancova_baseline"
    al = "ancova_longitudinal"
    pb = "partial_correlation_baseline"
    pl = "partial_correlation_longitudinal"

    out = [
        summarize("ancova baseline", [ab]),
        summarize("ancova longitudinal", [al]),
        summarize("partial_correlation baseline", [pb]),
        summarize("partial_correlation longitudinal", [pl]),
        summarize("baseline", [ab, pb]),
        summarize("longitudinal", [al, pl]),
        summarize("ancova", [ab, al]),
        summarize("partial_correlation", [pb, pl]),
        summarize("at_least_one_fluctuation", [ab, al, pb, pl]),
    ]
    return pd.DataFrame(out)

In [5]:
ct = print_stats_summary("thickness")
ct['metric'] = "thickness"
ct

Proportion of regions fluctuating for thickness


Unnamed: 0,category,n_fluctuating_pairs,n_possible_pairs,percentage,metric
0,ancova baseline,11,68,16.0,thickness
1,ancova longitudinal,9,68,13.0,thickness
2,partial_correlation baseline,19,68,28.0,thickness
3,partial_correlation longitudinal,17,68,25.0,thickness
4,baseline,30,136,22.0,thickness
5,longitudinal,26,136,19.0,thickness
6,ancova,20,136,15.0,thickness
7,partial_correlation,36,136,26.0,thickness
8,at_least_one_fluctuation,56,272,21.0,thickness


In [6]:
sa = print_stats_summary("area")
sa['metric'] = 'area'
sa

Proportion of regions fluctuating for area


Unnamed: 0,category,n_fluctuating_pairs,n_possible_pairs,percentage,metric
0,ancova baseline,18,68,26.0,area
1,ancova longitudinal,36,68,53.0,area
2,partial_correlation baseline,4,68,6.0,area
3,partial_correlation longitudinal,26,68,38.0,area
4,baseline,22,136,16.0,area
5,longitudinal,62,136,46.0,area
6,ancova,54,136,40.0,area
7,partial_correlation,30,136,22.0,area
8,at_least_one_fluctuation,84,272,31.0,area


In [7]:
cv = print_stats_summary("volume")
cv['metric']='volume'
cv

Proportion of regions fluctuating for volume


Unnamed: 0,category,n_fluctuating_pairs,n_possible_pairs,percentage,metric
0,ancova baseline,14,68,21.0,volume
1,ancova longitudinal,20,68,29.0,volume
2,partial_correlation baseline,8,68,12.0,volume
3,partial_correlation longitudinal,36,68,53.0,volume
4,baseline,22,136,16.0,volume
5,longitudinal,56,136,41.0,volume
6,ancova,34,136,25.0,volume
7,partial_correlation,44,136,32.0,volume
8,at_least_one_fluctuation,78,272,29.0,volume


In [8]:
consistency = pd.concat([sa,ct,cv,sv])
filename = output_dir / "p-value.csv"
consistency.to_csv(filename, index=False)
consistency

Unnamed: 0,category,n_fluctuating_pairs,n_possible_pairs,percentage,metric
0,ancova baseline,18,68,26.0,area
1,ancova longitudinal,36,68,53.0,area
2,partial_correlation baseline,4,68,6.0,area
3,partial_correlation longitudinal,26,68,38.0,area
4,baseline,22,136,16.0,area
5,longitudinal,62,136,46.0,area
6,ancova,54,136,40.0,area
7,partial_correlation,30,136,22.0,area
8,at_least_one_fluctuation,84,272,31.0,area
0,ancova baseline,11,68,16.0,thickness


In [15]:
consistency[consistency['category']=='at_least_one_fluctuation']

Unnamed: 0,category,n_fluctuating_pairs,n_possible_pairs,percentage,metric
8,at_least_one_fluctuation,84,272,31.0,area
8,at_least_one_fluctuation,56,272,21.0,thickness
8,at_least_one_fluctuation,78,272,29.0,volume
8,at_least_one_fluctuation,15,56,27.0,subcortical_volume
