# RBC Loading & Modeling Template

This notebook demonstrates how to:

1. Load participant metadata from the Reproducible Brain Charts (RBC).  
2. Retrieve FreeSurfer region-surface stats for participants (read from the PNC FreeSurfer dataset).  
3. Clean and filter data.  
4. Plot simple histograms of the target variable.  
5. Prepare a subset of subjects and features for modeling.


### Import necessary packages

In [None]:
# ---------------------------------------------------------------------------
# Imports
# ---------------------------------------------------------------------------

from pathlib import Path             # for handling filesystem paths
import re                            # for regex string cleaning
import numpy as np                   # for numeric operations
import pandas as pd                  # for tabular data handling
from rbclib import RBCPath           # RBC-specific path handling



### Data paths & participant metadata

In [None]:
# ---------------------------------------------------------------------------
# Locate the RBC participant tables
# ---------------------------------------------------------------------------
rbcdata_path = Path("/home/jovyan/shared/data/RBC")
train_filepath = rbcdata_path.joinpath("train_participants.tsv")
test_filepath = rbcdata_path.joinpath("test_participants.tsv")

# ---------------------------------------------------------------------------
# Read both tab-separated participant tables
# ---------------------------------------------------------------------------

# One read per file, unpacked in (train, test) order
train_data, test_data = (
    pd.read_csv(tsv_file, sep="\t")
    for tsv_file in (train_filepath, test_filepath)
)

# Stack train rows on top of test rows; ignore_index renumbers rows from 0
all_data = pd.concat([train_data, test_data], ignore_index=True)

all_data.head()


### FreeSurfer helper functions

In [None]:
# ---------------------------------------------------------------------------
# FreeSurfer utility functions
# ---------------------------------------------------------------------------

def load_fsdata_raw(participant_id, local_cache_dir=Path.home() / "cache"):
    """
    Load the raw FreeSurfer region-surface-stats TSV file for a PNC participant.

    Parameters
    ----------
    participant_id : str or int
        RBC participant identifier (without 'sub-' prefix).
    local_cache_dir : Path or str
        Local cache directory passed to ``RBCPath``. It is created,
        together with any missing parent directories, if it does not exist.

    Returns
    -------
    pd.DataFrame
        Long-form FreeSurfer stats for the participant, as read from the
        tab-separated ``sub-<id>_regionsurfacestats.tsv`` file.
    """
    local_cache_dir = Path(local_cache_dir)
    # parents=True so a nested cache location does not fail on a missing parent
    local_cache_dir.mkdir(parents=True, exist_ok=True)

    pnc_fspath = RBCPath(
        "rbc://PNC_FreeSurfer/freesurfer",
        local_cache_dir=local_cache_dir
    )
    # The subject label names both the directory and the file inside it
    subject_label = f"sub-{participant_id}"
    tsv_path = pnc_fspath / subject_label / f"{subject_label}_regionsurfacestats.tsv"

    return pd.read_csv(tsv_path, sep="\t")


def filter_by_atlas(df, atlas_substr):
    """
    Keep the rows whose ``atlas`` value contains a literal substring.

    Parameters
    ----------
    df : pd.DataFrame
        Table with an ``atlas`` column.
    atlas_substr : str
        Text to look for. It is matched literally (not as a regular
        expression) and case-insensitively.

    Returns
    -------
    pd.DataFrame
        Copy of the matching rows.

    Raises
    ------
    ValueError
        If no row matches; the message lists the available atlas names.
    """
    atlas_col = df["atlas"]
    # regex=False: atlas names such as "aparc.DKTatlas" contain ".", which
    # would act as a wildcard if the substring were treated as a pattern.
    # notna(): astype(str) turns missing values into text, and that text must
    # not be matchable.
    mask = atlas_col.notna() & atlas_col.astype(str).str.contains(
        atlas_substr, case=False, regex=False, na=False
    )
    if not mask.any():
        # str() each name so the join also works for non-string atlas values
        available = sorted(str(name) for name in atlas_col.dropna().unique())
        raise ValueError(
            f"No atlas rows contained '{atlas_substr}'. "
            f"Available atlases: {', '.join(available)}"
        )
    return df.loc[mask].copy()


def select_measure(df, measure):
    """
    Reduce a FreeSurfer table to its identifying columns plus one measure.

    Parameters
    ----------
    df : pd.DataFrame
        Table expected to contain ``subject_id``, ``atlas``, ``hemisphere``,
        ``StructName`` and the requested measure column.
    measure : str
        Name of the measure column to keep.

    Returns
    -------
    pd.DataFrame
        Copy of ``df`` restricted to those five columns, in that order.

    Raises
    ------
    ValueError
        If any of the five columns is absent from ``df``.
    """
    wanted = ["subject_id", "atlas", "hemisphere", "StructName", measure]

    # Collect every absent column so the error reports all of them at once
    absent = []
    for name in wanted:
        if name not in df.columns:
            absent.append(name)

    if absent:
        raise ValueError(f"Missing columns: {absent}")

    return df[wanted].copy()


def sanitize(col):
    """
    Turn an arbitrary string into a column-name-safe identifier.

    Every run of characters other than ASCII letters and digits becomes a
    single underscore, and underscores at either end are removed.
    """
    non_alnum_run = re.compile(r"[^0-9A-Za-z]+")
    underscore_run = re.compile(r"_{2,}")

    replaced = non_alnum_run.sub("_", col)
    # The first pass already emits one underscore per run, so this second
    # pass leaves the string unchanged; it is kept to mirror the original steps.
    collapsed = underscore_run.sub("_", replaced)
    return collapsed.strip("_")


def pivot(df, measure):
    """
    Pivot long-form FreeSurfer stats into a wide format with one row per subject.

    Parameters
    ----------
    df : pd.DataFrame
        Long-form table with ``subject_id``, ``hemisphere``, ``atlas``,
        ``StructName`` and the ``measure`` column.
    measure : str
        Name of the column holding the values to spread into wide columns.

    Returns
    -------
    pd.DataFrame
        One row per distinct ``subject_id`` (in order of first appearance).
        The first column is ``subject_id``; every other column is named
        ``sanitize("<hemisphere>_<atlas>_<StructName>_<measure>")``. If two
        rows of the same subject map to the same column name, the later row
        wins. An empty input yields an empty frame that has only the
        ``subject_id`` column.
    """
    if df.empty:
        # Nothing to pivot: return an empty table instead of failing on .iloc[0]
        return pd.DataFrame(columns=["subject_id"])

    records = []
    # Group by subject so a table holding several subjects produces one row
    # each rather than a single row labelled with the first subject's id.
    # sort=False keeps input order; dropna=False keeps a missing subject id
    # as its own group instead of silently discarding those rows.
    for subj, rows in df.groupby("subject_id", sort=False, dropna=False):
        record = {"subject_id": subj}
        cells = zip(
            rows["hemisphere"], rows["atlas"], rows["StructName"], rows[measure]
        )
        for hemisphere, atlas, struct_name, value in cells:
            raw = f"{hemisphere}_{atlas}_{struct_name}_{measure}"
            record[sanitize(raw)] = value
        records.append(record)

    return pd.DataFrame(records)


def load_and_pivot_fsdata(participant_id, atlas, measure,
                          local_cache_dir=Path.home() / "cache"):
    """
    Run the full per-participant pipeline and return the wide-format result.

    The steps are, in order: ``load_fsdata_raw`` (read the participant's
    FreeSurfer table), ``filter_by_atlas`` (keep rows for ``atlas``),
    ``select_measure`` (keep the id columns plus ``measure``) and ``pivot``.
    """
    long_stats = select_measure(
        filter_by_atlas(
            load_fsdata_raw(participant_id, local_cache_dir),
            atlas,
        ),
        measure,
    )
    return pivot(long_stats, measure)


### Remove uninformative subjects (with plots)

In [None]:
# ---------------------------------------------------------------------------
# Remove subjects at the minimum p_factor value and plot distributions
# ---------------------------------------------------------------------------

# Plot histogram of p_factor for all training subjects
# NOTE(review): both hist() calls run in this one cell and may end up drawn
# on the same axes — confirm the two distributions are distinguishable.
train_data["p_factor"].hist(bins=100)

# Compute the minimum once and reuse it for the report and for the mask
min_p_factor = train_data["p_factor"].min()
min_val_idx = train_data["p_factor"] == min_p_factor

# Print minimum value and its count.
# Counting mask hits avoids a label lookup in value_counts(), which raises
# KeyError when the minimum is NaN (no non-missing p_factor values).
print("Minimum p_factor:", min_p_factor)
print("Number of subjects with min value:", int(min_val_idx.sum()))

# Remove subjects with minimum p_factor (to avoid bias in modeling).
# Both halves are copied so later edits to either one never touch train_data.
removed_train_data = train_data[min_val_idx].copy()
clean_train_data   = train_data[~min_val_idx].copy()

# Plot histogram after cleaning
clean_train_data["p_factor"].hist()


### Sample a small subset of subjects

In [None]:
# ---------------------------------------------------------------------------
# Work with a small subset of subjects (for speed)
# ---------------------------------------------------------------------------

clean_train_data = clean_train_data.reset_index(drop=True)
num_subjects = 5
random_seed = None   # set to an int to draw the same subset on every run
idx_max = clean_train_data.shape[0]

# Randomly select subject row positions WITHOUT replacement, so the same
# subject is never loaded twice; never ask for more rows than exist.
rng = np.random.default_rng(random_seed)
rand_ii = rng.choice(idx_max, size=min(num_subjects, idx_max), replace=False)
# After reset_index the row labels equal the positions, so .loc is valid here
rand_subjects = clean_train_data.loc[rand_ii, "participant_id"].values

print("Randomly selected subjects:", rand_subjects)

# Load FreeSurfer stats for selected subjects
dfs = []
for subject_id in rand_subjects:
    df = load_fsdata_raw(subject_id)
    dfs.append(df)

# Stack the per-subject long-form tables into one table
training_subset = pd.concat(dfs, ignore_index=True)

training_subset.head()


### Pick atlas & measure

In [None]:
# ---------------------------------------------------------------------------
# Choose the atlas and the brain measure used for modeling
# ---------------------------------------------------------------------------

# Show what the loaded subset offers before choosing
print("Atlases:", training_subset["atlas"].unique())
print("Available columns (measures):", training_subset.columns.tolist())

# Example selection
atlas, measure = "aparc.DKTatlas", "GrayVol"

# First narrow to the chosen atlas, then keep the id columns plus the measure
fsdata_by_atlas = filter_by_atlas(df=training_subset, atlas_substr=atlas)
fsdata_by_measure = select_measure(df=fsdata_by_atlas, measure=measure)

fsdata_by_measure.head()
