# Living-park cross-sectional analysis

## Compute TSV and Parquet tables for area, volume and thickness measurements

In [21]:
import glob
import json
import os
from pathlib import Path

import numpy as np
import pandas as pd

import joblib
from freesurfer.aparcstats2table import main as aparcstats2table
from freesurfer.asegstats2table import main as asegstats2table

# When True, printed paths have the project root masked by anondir().
anonymizer = True

# Project root: the directory the notebook is started from.
root_dir = Path.cwd()


def anondir(path: Path, prefix=root_dir) -> Path:
    """Mask `prefix` inside `path` with the "<living-park>" placeholder.

    If the module-level `anonymizer` flag is falsy, `path` is returned
    unchanged. Otherwise every occurrence of `str(prefix)` in `str(path)` is
    replaced by "<living-park>" and the result is wrapped in a new `Path`.
    """
    if anonymizer:
        masked = str(path).replace(str(prefix), "<living-park>")
        return Path(masked)
    return path


# Report the working directories (masked through anondir) and define the
# input/output locations relative to the project root.
print(f"Running in root dir: {anondir(root_dir)}")
# Folder the stats tools read from (one sub-folder per repetition, see the
# compute_* functions below).
input_dir = root_dir / "vip_outputs"
print(f"Input directory: {anondir(input_dir)}")
# Folder receiving the generated TSV tables; created if missing.
output_dir = root_dir / "tables_QCed"
output_dir.mkdir(parents=True, exist_ok=True)
print(f"Output directory: {anondir(output_dir)}")

Running in root dir: <living-park>
Input directory: <living-park>/vip_outputs
Output directory: <living-park>/tables_QCed


### Build TSV tables from raw data

In [34]:
# Table of pipeline executions; the columns used in this notebook are
# subject_visit, subject and repetition.
dataset = pd.read_csv(
    root_dir
    / "cohort"
    / "vip_executions_stats_info_2visits_passed_qc_with_26_repetitions.csv"
)
# QC-ed cohort; its PATNO_id values are matched against dataset["subject_visit"],
# so despite the variable name they identify subject/visit pairs.
cohort = pd.read_csv(root_dir / "cohort" / "cross-sectional_cohort_qced.csv")
subjects_in_cohort = cohort["PATNO_id"].unique()
# Keep only the executions that belong to the cohort.
dataset = dataset[dataset["subject_visit"].isin(subjects_in_cohort)]
# NOTE(review): the first count comes from the cohort file and the second from
# the filtered dataset, so the "in dataset" / "in cohort" labels look swapped —
# confirm the intended wording.
print(f"Total subject/visit in dataset: {len(subjects_in_cohort)}")
print(f"Unique subjects in cohort: {dataset['subject'].nunique()}")

Total subject/visit in dataset: 534
Unique subjects in cohort: 267


In [23]:
measurements = ["volume", "thickness", "area"]

# Map "rep<k>" -> list of the subject_visit ids recorded for repetition k.
rsv = {
    f"rep{rep_id}": dataset.loc[
        dataset["repetition"] == rep_id, "subject_visit"
    ].tolist()
    for rep_id in dataset["repetition"].unique()
}


def compute_aseg_volume(subjects_dir, repetition, subjects, output_dir):
    """Build the aseg volume table of one repetition with `asegstats2table`.

    The tool is pointed at `<subjects_dir>/<repetition>` and asked to write
    `<output_dir>/<repetition>.aseg.volume.tsv` for the given `subjects`.
    """
    repetition_dir = os.path.join(subjects_dir, repetition)
    table_path = os.path.join(output_dir, f"{repetition}.aseg.volume.tsv")

    cli_args = [f"--sd={repetition_dir}", "--skip", "--subjects"]
    cli_args.extend(subjects)
    cli_args += ["--meas=volume", f"--tablefile={table_path}"]

    asegstats2table(cli_args)


def _compute_aparc_tables(subjects_dir, repetition, subjects, output_dir, measure):
    """Build the lh and rh aparc tables of one measure for one repetition.

    Shared implementation of compute_volume / compute_thickness /
    compute_area, which previously were three copies differing only in the
    measure name.

    For each hemisphere, `aparcstats2table` is pointed at
    `<subjects_dir>/<repetition>` and asked to write
    `<output_dir>/<repetition>.<hemi>.aparc.<measure>.tsv`.
    """
    subjects_dir = os.path.join(subjects_dir, repetition)
    for hemi in ["lh", "rh"]:
        tablefile = os.path.join(
            output_dir, f"{repetition}.{hemi}.aparc.{measure}.tsv"
        )
        args = [
            f"--sd={subjects_dir}",
            "--skip",
            "--parc=aparc",
            f"--hemi={hemi}",
            "--subjects",
            *subjects,
            f"--meas={measure}",
            f"--tablefile={tablefile}",
        ]
        aparcstats2table(args)


def compute_volume(subjects_dir, repetition, subjects, output_dir):
    """Write the per-hemisphere aparc volume tables of one repetition."""
    _compute_aparc_tables(subjects_dir, repetition, subjects, output_dir, "volume")


def compute_thickness(subjects_dir, repetition, subjects, output_dir):
    """Write the per-hemisphere aparc thickness tables of one repetition."""
    _compute_aparc_tables(
        subjects_dir, repetition, subjects, output_dir, "thickness"
    )


def compute_area(subjects_dir, repetition, subjects, output_dir):
    """Write the per-hemisphere aparc area tables of one repetition."""
    _compute_aparc_tables(subjects_dir, repetition, subjects, output_dir, "area")


# Build the tables of every repetition in parallel, one pass per table kind
# (aseg volume first, then aparc volume, thickness and area).
for compute_tables in (
    compute_aseg_volume,
    compute_volume,
    compute_thickness,
    compute_area,
):
    # Binding the result to `_` keeps the list of None results out of the
    # cell output.
    _ = joblib.Parallel(n_jobs=-1, verbose=10)(
        joblib.delayed(compute_tables)(input_dir, repetition, subjects, output_dir)
        for repetition, subjects in rsv.items()
    )

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  34 | elapsed:    2.7s remaining:   28.1s
[Parallel(n_jobs=-1)]: Done   7 out of  34 | elapsed:    3.3s remaining:   12.6s
[Parallel(n_jobs=-1)]: Done  11 out of  34 | elapsed:    3.4s remaining:    7.1s
[Parallel(n_jobs=-1)]: Done  15 out of  34 | elapsed:    3.5s remaining:    4.4s
[Parallel(n_jobs=-1)]: Done  19 out of  34 | elapsed:    3.6s remaining:    2.8s
[Parallel(n_jobs=-1)]: Done  23 out of  34 | elapsed:    5.3s remaining:    2.5s
[Parallel(n_jobs=-1)]: Done  27 out of  34 | elapsed:    5.5s remaining:    1.4s
[Parallel(n_jobs=-1)]: Done  31 out of  34 | elapsed:    6.0s remaining:    0.6s
[Parallel(n_jobs=-1)]: Done  34 out of  34 | elapsed:    6.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  34 | elapsed:    3.1s remaining:   31.6s
[Parallel(n_jobs=-1)]: Done   7 out of  34 | e

### Read TSV tables

In [24]:
# Collect every TSV table written to output_dir and group the paths by kind.
# glob returns matches in arbitrary order; sorting makes the grouped lists
# (and anything built from them) reproducible from run to run.
filename = output_dir / "*.tsv"
tsv_tables = sorted(glob.glob(str(filename)))
# {"volume": [path, ...]} for the aseg tables
aseg_tables_group = {}
# {measure: {hemi: [path, ...]}} for the aparc tables
aparc_tables_group = {}
for tsv_table in tsv_tables:
    # File names are "<rep>.aseg.volume.tsv" or
    # "<rep>.<hemi>.aparc.<measure>.tsv" (see the compute_* functions).
    fields = os.path.basename(tsv_table).split(".")
    if fields[1] == "aseg":
        aseg_tables_group.setdefault("volume", []).append(tsv_table)
    elif fields[1] in ["lh", "rh"]:
        hemi = fields[1]
        measure = fields[3]
        aparc_tables_group.setdefault(measure, {}).setdefault(hemi, []).append(
            tsv_table
        )

In [25]:
# Summarise how many tables were found per measure (and hemisphere).
# The loop variables are deliberately not called `tsv_tables`, so the full
# list of table paths built in the previous cell is not overwritten.
print("Parcellation tables found:")
for measure, tables in aparc_tables_group.items():
    for hemi, hemi_tables in tables.items():
        print(f"\tFound {len(hemi_tables)} for {measure} {hemi} tables")
print("Segmentation tables found:")
for measure, aseg_tables in aseg_tables_group.items():
    print(f"\tFound {len(aseg_tables)} for {measure} tables")

Parcellation tables found:
	Found 34 for volume rh tables
	Found 34 for volume lh tables
	Found 34 for thickness rh tables
	Found 34 for thickness lh tables
	Found 34 for area lh tables
	Found 34 for area rh tables
Segmentation tables found:
	Found 34 for volume tables


### Merge tables 

In [26]:
print(f"root_dir: {anondir(root_dir)}")
# Merged tables go under stats_QCed/, the unsampled ones under stats_QCed/raw/.
stats_dir = Path(root_dir) / "stats_QCed"
raw_stats_dir = stats_dir / "raw"
stats_dir.mkdir(parents=True, exist_ok=True)
raw_stats_dir.mkdir(parents=True, exist_ok=True)

root_dir: <living-park>


In [27]:
import pandas as pd

# Cohort table used by read_tsv (as the module-level `cohort`) to filter rows
# and to look up each subject/visit's dx_group.
cohort_filename = root_dir / "cohort" / "cross-sectional_cohort_qced.csv"
cohort = pd.read_csv(cohort_filename)


def read_tsv(tsv):
    """Load one stats table and annotate it with cohort metadata.

    Parameters
    ----------
    tsv : str
        Path to a tab-separated table named "<rep>.aseg.volume.tsv" or
        "<rep>.<hemi>.aparc.<measure>.tsv", where "<rep>" is "rep" followed
        by an integer.

    Returns
    -------
    DataFrame
        The table restricted to the rows whose first column matches a
        `PATNO_id` of the module-level `cohort` frame, with:

        * a leading "lh_" / "rh_" stripped from the column names,
        * the first column renamed to "subject_visit",
        * added columns "subject", "visit" (the two "_"-separated parts of
          "subject_visit"), "repetition", "dx_group", "PD_status" ("PD" when
          "PD" occurs in dx_group, else "HC"), "PATNO_id", and "hemi" for
          per-hemisphere tables.
    """
    fields = os.path.basename(tsv).split(".")
    repetition = fields[0].replace("rep", "")

    df = pd.read_csv(tsv, sep="\t")
    # Strip the hemisphere marker only where it is a prefix, so a column that
    # merely contains "lh_" / "rh_" keeps its name.
    df = df.rename(
        columns=lambda column: column.removeprefix("lh_").removeprefix("rh_")
    )
    df = df.rename(columns={df.columns[0]: "subject_visit"})

    # One diagnosis per PATNO_id; the first occurrence wins on duplicates.
    dx_by_id = cohort.drop_duplicates("PATNO_id").set_index("PATNO_id")["dx_group"]

    # .copy() gives an independent frame, so the column assignments below do
    # not write into a slice of the table that was read.
    df = df[df["subject_visit"].isin(dx_by_id.index)].copy()
    df["subject"] = df["subject_visit"].apply(lambda x: x.split("_")[0])
    df["visit"] = df["subject_visit"].apply(lambda x: x.split("_")[1])
    df["repetition"] = int(repetition)
    # Every remaining row is a cohort member (see the filter above), so one
    # vectorised lookup replaces the per-row scan of the cohort table.
    df["dx_group"] = df["subject_visit"].map(dx_by_id)
    df["PD_status"] = df["dx_group"].apply(lambda x: "PD" if "PD" in x else "HC")
    df["PATNO_id"] = df["subject_visit"]

    # Take the hemisphere from the file name rather than from the whole path,
    # so a directory name containing ".lh." or ".rh." cannot mis-tag a table.
    if len(fields) > 1 and fields[1] in ("lh", "rh"):
        df["hemi"] = fields[1]
    return df


def concat_tables(tables, hemi=True):
    """Read a group of TSV tables with `read_tsv` and stack them row-wise.

    With `hemi=True`, `tables` is a mapping holding a list of paths under the
    "lh" and "rh" keys; the left-hemisphere tables are stacked first, then
    the right-hemisphere ones. With `hemi=False`, `tables` is a plain
    iterable of paths.
    """
    if not hemi:
        return pd.concat([read_tsv(path) for path in tables])

    left_frames = [read_tsv(path) for path in tables["lh"]]
    left_stacked = pd.concat(left_frames)
    right_frames = [read_tsv(path) for path in tables["rh"]]
    right_stacked = pd.concat(right_frames)
    return pd.concat((left_stacked, right_stacked))


# Merge the per-repetition tables of each measure (both hemispheres for the
# aparc measures) and store the result as Parquet under raw_stats_dir.
thickness_df = concat_tables(aparc_tables_group["thickness"])
thickness_df.to_parquet(raw_stats_dir / "thickness.parquet")

area_df = concat_tables(aparc_tables_group["area"])
area_df.to_parquet(raw_stats_dir / "area.parquet")

volume_df = concat_tables(aparc_tables_group["volume"])
volume_df.to_parquet(raw_stats_dir / "volume.parquet")

# aseg tables are not split by hemisphere, hence hemi=False.
subcortical_volume = concat_tables(aseg_tables_group["volume"], hemi=False)
subcortical_volume.to_parquet(raw_stats_dir / "subcortical_volume.parquet")

### Load pre-computed tables

In [28]:
# Reload the merged tables from the Parquet files under raw_stats_dir.
thickness_df = pd.read_parquet(raw_stats_dir / "thickness.parquet")
area_df = pd.read_parquet(raw_stats_dir / "area.parquet")
volume_df = pd.read_parquet(raw_stats_dir / "volume.parquet")
subcortical_volume_df = pd.read_parquet(raw_stats_dir / "subcortical_volume.parquet")

In [29]:
import pandas as pd
from hashlib import sha1


def keep_first_rows(
    df: pd.DataFrame,
    dataset: pd.DataFrame,
    n: int = 26,
    hemi: bool = False,
    subject_col: str = "subject_visit",
    repetition_col: str = "repetition",
    hemi_col: str = "hemi",
    reject_col: str = "rejected_images",
    merge_left_on: tuple | list = None,
    merge_right_on: tuple | list = None,
    sort_by: list | None = None,
    verbose: bool = True,
) -> pd.DataFrame:
    """
    Keep the first n rows per subject (and per hemisphere if hemi=True) after
    merging with `dataset` and removing rejected images.

    Parameters
    ----------
    df : DataFrame
        Measurement dataframe (e.g., cortical thickness).
    dataset : DataFrame
        Metadata dataframe to merge (must share subject/repetition keys).
    n : int
        Number of rows to keep per group.
    hemi : bool
        If True, enforce n per (subject, hemi). Otherwise per subject.
    subject_col, repetition_col, hemi_col, reject_col : str
        Column names in `df`.
    merge_left_on, merge_right_on : list/tuple
        Keys to merge on. Defaults to [subject_col, repetition_col] for both.
    sort_by : list | None
        Extra sort keys (in addition to grouping keys). Defaults to [repetition_col].
    verbose : bool
        Print progress info.

    Returns
    -------
    DataFrame
        Sampled dataframe with exactly n rows per group (subject or subject×hemi).
        Subjects/groups with < n available rows are dropped (and reported).
    """

    # Make safe copies and align merge columns if needed
    left = df.copy()
    right = dataset.copy()

    # ----- Merge
    merged = pd.merge(
        left, right, on=["subject_visit", "subject", "visit", "repetition"], how="inner"
    )

    if verbose:
        print(
            f"Keeping {n} rows per {'subject×hemi' if hemi else 'subject'} (hemi={hemi})"
        )
        print(f"Rows before filtering: {merged.shape[0]}")

    # ----- Remove rejected images (if the column exists)
    if reject_col in merged.columns:
        nb_before = merged.shape[0]
        merged = merged[~merged[reject_col].astype(bool)]
        if verbose:
            print(f"Removed rejected images: {nb_before - merged.shape[0]}")
    else:
        if verbose:
            print(
                f"Warning: '{reject_col}' column not found; skipping rejection filter."
            )

    if merged.empty:
        if verbose:
            print("No data left after merge/filtering.")
        return merged

    # ----- Sort for deterministic head(n)
    group_keys = [subject_col] + (
        [hemi_col] if hemi and (hemi_col in merged.columns) else []
    )
    if hemi and hemi_col not in merged.columns:
        raise KeyError(f"hemi=True but '{hemi_col}' column is missing.")

    if sort_by is None:
        sort_by = [repetition_col] if repetition_col in merged.columns else []
    sort_keys = [k for k in (group_keys + sort_by) if k in merged.columns]
    if sort_keys:
        merged = merged.sort_values(by=sort_keys)

    # ----- Sample first n per group
    sampled = merged.groupby(group_keys, as_index=False, group_keys=False).head(n)

    # ----- Enforce exactly n per group (drop incomplete groups)
    counts = sampled.groupby(group_keys).size().reset_index(name="k")
    keepers = counts[counts["k"] == n][group_keys]
    sampled_strict = sampled.merge(keepers, on=group_keys, how="inner")

    # Report dropped groups
    dropped = counts[counts["k"] < n]
    if not dropped.empty and verbose:
        dropped_subjects = dropped[subject_col].unique()
        # Stable, short encodings for display
        encoded = [sha1(str(s).encode()).hexdigest()[:8] for s in dropped_subjects]
        print(f"Groups with < {n} rows dropped: {len(dropped_subjects)} subjects")
        print(f"Encoded subject ids (first 8 hex): {encoded}")

    # ----- Assertions & summary
    if hemi:
        # Check per-hemi counts
        for h in sampled_strict[hemi_col].unique():
            sub_h = sampled_strict[sampled_strict[hemi_col] == h]
            c = sub_h.groupby(subject_col).size()
            assert (c == n).all(), f"Found a {hemi_col}={h} group with != {n} rows."
        if verbose:
            n_subj_lh = sampled_strict.query(f"{hemi_col} == 'lh'")[
                subject_col
            ].nunique()
            n_subj_rh = sampled_strict.query(f"{hemi_col} == 'rh'")[
                subject_col
            ].nunique()
            print(f"Subjects (lh): {n_subj_lh} | Subjects (rh): {n_subj_rh}")
            print(f"Total subjects: {sampled_strict[subject_col].nunique()}")
    else:
        c = sampled_strict.groupby(subject_col).size()
        assert (c == n).all(), f"Found a subject with != {n} rows."
        if verbose:
            print(f"Total subjects: {sampled_strict[subject_col].nunique()}")

    if verbose:
        print(f"Final rows kept: {len(sampled_strict)}")

    return sampled_strict


# Apply keep_first_rows to each measure: per (subject_visit, hemi) for the
# cortical tables, per subject_visit for the subcortical volumes.
print("Keeping 26 repetitions for each subject")
print("\nCortical thickness")
thickness_sampled_df = keep_first_rows(thickness_df, dataset, n=26, hemi=True)

print("\nCortical area")
area_sampled_df = keep_first_rows(area_df, dataset, n=26, hemi=True)

print("\nCortical volume")
volume_sampled_df = keep_first_rows(volume_df, dataset, n=26, hemi=True)

print("\nSubcortical volume")
subcortical_volume_sampled_df = keep_first_rows(
    subcortical_volume_df, dataset, n=26, hemi=False
)

Keeping 26 repetitions for each subject

Cortical thickness
Keeping 26 rows per subject×hemi (hemi=True)
Rows before filtering: 32200
Removed rejected images: 0
Subjects (lh): 534 | Subjects (rh): 534
Total subjects: 534
Final rows kept: 27768

Cortical area
Keeping 26 rows per subject×hemi (hemi=True)
Rows before filtering: 32200
Removed rejected images: 0
Subjects (lh): 534 | Subjects (rh): 534
Total subjects: 534
Final rows kept: 27768

Cortical volume
Keeping 26 rows per subject×hemi (hemi=True)
Rows before filtering: 32200
Removed rejected images: 0
Subjects (lh): 534 | Subjects (rh): 534
Total subjects: 534
Final rows kept: 27768

Subcortical volume
Keeping 26 rows per subject (hemi=False)
Rows before filtering: 16100
Removed rejected images: 0
Total subjects: 534
Final rows kept: 13884


### Save sampled tables

In [30]:
# Write the sampled tables, plus a small JSON description, under
# stats_QCed/sampled.
sampled_stats_dir = stats_dir / "sampled"
os.makedirs(sampled_stats_dir, exist_ok=True)
print(f"Sampled stats directory: {anondir(sampled_stats_dir)}")

# Human-readable labels for the saved tables; "sample_size" repeats the n=26
# passed to keep_first_rows above and must be kept in sync with it by hand.
description = {
    "sample_size": 26,
    "thickness": "Cortical thickness",
    "area": "Cortical area",
    "volume": "Cortical volume",
    "subcortical_volume": "Subcortical volume",
}
with open(sampled_stats_dir / "description.json", "w") as f:
    json.dump(description, f, indent=4)

thickness_sampled_df.to_parquet(sampled_stats_dir / "thickness.parquet")
area_sampled_df.to_parquet(sampled_stats_dir / "area.parquet")
volume_sampled_df.to_parquet(sampled_stats_dir / "volume.parquet")
subcortical_volume_sampled_df.to_parquet(
    sampled_stats_dir / "subcortical_volume.parquet"
)

Sampled stats directory: <living-park>/stats_QCed/sampled
