In [1]:
# We will need the RBCPath type from the rbclib package to load data from the RBC.
from rbclib import RBCPath

# We'll also want to load some data directly from the filesystem.
from pathlib import Path

# We'll want to load/process some of the data using pandas and numpy.
import pandas as pd
import numpy as np
import re

# Metadata

In [2]:
# Participant meta-data is generally located in the BIDS repository for each
# study:
rbcdata_path = Path('/home/jovyan/shared/data/RBC')
train_filepath = rbcdata_path / 'train_participants.tsv'
test_filepath = rbcdata_path / 'test_participants.tsv'

# Load the PNC participants TSV files...
with train_filepath.open('r') as f:
    train_data = pd.read_csv(f, sep='\t')
with test_filepath.open('r') as f:
    test_data = pd.read_csv(f, sep='\t')

# We can also concatenate the two datasets into a single dataset of all
# study participants. `ignore_index=True` renumbers the rows 0..N-1; without
# it the test rows would reuse the row labels of the train rows, so a label
# lookup such as `all_data.loc[0]` would return two participants.
all_data = pd.concat([train_data, test_data], ignore_index=True)

# Display the full dataframe:
all_data

Unnamed: 0,participant_id,study,study_site,session_id,wave,age,sex,race,ethnicity,bmi,handedness,participant_education,parent_1_education,parent_2_education,p_factor,internalizing_mcelroy_harmonized_all_samples,externalizing_mcelroy_harmonized_all_samples,attention_mcelroy_harmonized_all_samples,cubids_acquisition_group
0,1000393599,PNC,PNC1,PNC1,1,15.583333,Male,Black,not Hispanic or Latino,22.15,Right,9th Grade,Complete primary,Complete secondary,0.589907,-0.449373,-0.630780,-1.842178,1
1,1001970838,PNC,PNC1,PNC1,1,17.833333,Male,Other,Hispanic or Latino,23.98,Right,11th Grade,Complete tertiary,Complete tertiary,-0.659061,0.531072,0.392751,0.190706,1
2,1007995238,PNC,PNC1,PNC1,1,13.750000,Female,Other,not Hispanic or Latino,23.77,Right,6th Grade,Complete tertiary,Complete primary,-1.608375,-0.744118,-0.314187,-0.432662,1
3,1011497669,PNC,PNC1,PNC1,1,16.666667,Male,White,not Hispanic or Latino,29.68,Right,9th Grade,Complete tertiary,Complete tertiary,-1.233807,-0.896835,-0.449099,0.111167,1
4,1017092387,PNC,PNC1,PNC1,1,18.666667,Female,Black,not Hispanic or Latino,23.24,Right,11th Grade,Complete primary,Complete primary,-0.923100,-0.313455,2.204168,-0.782266,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
529,969649154,PNC,PNC1,PNC1,1,12.333333,Male,White,not Hispanic or Latino,17.38,Right,5th Grade,Complete tertiary,Complete secondary,,-0.148520,0.556444,0.024228,1
530,970890500,PNC,PNC1,PNC1,1,18.166667,Female,White,not Hispanic or Latino,30.89,Right,11th Grade,Complete secondary,Complete secondary,,0.993806,1.578177,-0.373470,1
531,975856179,PNC,PNC1,PNC1,1,11.000000,Male,White,not Hispanic or Latino,15.67,Right,4th Grade,Complete primary,Complete secondary,,-1.026645,-0.582212,1.333857,1
532,984757368,PNC,PNC1,PNC1,1,13.416667,Male,Black,not Hispanic or Latino,16.66,Right,5th Grade,Complete primary,,,0.360029,-0.515655,1.509584,114


In [3]:
# Worked example using subject 1000393599. The data live in the repo
# github.com:ReproBrainChart/PNC_FreeSurfer, which can be browsed at
# https://github.com/ReproBrainChart/PNC_FreeSurfer/tree/main
subject_id = 1000393599
sub_path = RBCPath(f'rbc://PNC_FreeSurfer/freesurfer/sub-{subject_id}')

# Sanity check: the subject path must be a directory.
assert sub_path.is_dir()

# The per-region surface statistics table sits directly inside that directory.
stats_filepath = sub_path / f'sub-{subject_id}_regionsurfacestats.tsv'

# Read the TSV with pandas.
print(f"Loading {stats_filepath} ...")
with stats_filepath.open('r') as f:
    data = pd.read_csv(f, sep='\t')

# Show which columns and which atlases the table contains.
print(data.columns)
print(data['atlas'].unique())

Loading rbc://PNC_FreeSurfer/freesurfer/sub-1000393599/sub-1000393599_regionsurfacestats.tsv ...
Index(['subject_id', 'session_id', 'atlas', 'hemisphere', 'StructName',
       'NumVert', 'SurfArea', 'GrayVol', 'ThickAvg', 'ThickStd', 'MeanCurv',
       'GausCurv', 'FoldInd', 'CurvInd', 'Index', 'SegId', 'Mean_wgpct',
       'StdDev_wgpct', 'Min_wgpct', 'Max_wgpct', 'Range_wgpct', 'SNR_wgpct',
       'Mean_piallgi', 'StdDev_piallgi', 'Min_piallgi', 'Max_piallgi',
       'Range_piallgi'],
      dtype='object')
['aparc.DKTatlas' 'aparc.a2009s' 'aparc' 'BA_exvivo' 'AAL' 'CC200' 'CC400'
 'glasser' 'gordon333dil' 'HOCPATh25' 'Juelich' 'PALS_B12_Brodmann'
 'Schaefer2018_1000Parcels_17Networks_order'
 'Schaefer2018_1000Parcels_7Networks_order'
 'Schaefer2018_100Parcels_17Networks_order'
 'Schaefer2018_100Parcels_7Networks_order'
 'Schaefer2018_200Parcels_17Networks_order'
 'Schaefer2018_200Parcels_7Networks_order'
 'Schaefer2018_300Parcels_17Networks_order'
 'Schaefer2018_300Parcels_7Networks_

In [4]:
def load_fsdata_raw(participant_id, local_cache_dir=Path.home() / "cache"):
    """
    Load the raw FreeSurfer TSV for a PNC participant.

    Parameters
    ----------
    participant_id : int or str
        Participant identifier; used to build the `sub-<id>` directory and
        file names.
    local_cache_dir : path-like, optional
        Directory handed to `RBCPath` as its local cache. It is created
        (including any missing parent directories) if it does not exist.

    Returns
    -------
    pandas.DataFrame
        The long-form contents of `sub-<id>_regionsurfacestats.tsv`.
    """
    local_cache_dir = Path(local_cache_dir)
    # parents=True so that a nested cache location (e.g. ~/work/cache) can be
    # used without having to create the intermediate directories by hand.
    local_cache_dir.mkdir(parents=True, exist_ok=True)
    pnc_fspath = RBCPath(
        "rbc://PNC_FreeSurfer/freesurfer",
        local_cache_dir=local_cache_dir
    )
    subdir = pnc_fspath / f"sub-{participant_id}"
    tsv_path = subdir / f"sub-{participant_id}_regionsurfacestats.tsv"

    return pd.read_csv(tsv_path, sep="\t")


def filter_by_atlas(df, atlas_substr):
    """
    Keep only the rows whose `atlas` value contains `atlas_substr`.

    The match is a literal, case-insensitive substring test: characters such
    as `.` or `(` in `atlas_substr` have no special (regex) meaning. Rows with
    a missing `atlas` value never match.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with an `atlas` column.
    atlas_substr : str
        Text to look for inside the atlas names.

    Returns
    -------
    pandas.DataFrame
        A copy of the matching rows.

    Raises
    ------
    TypeError
        If `atlas_substr` is not a string (e.g. a tuple created by a stray
        trailing comma).
    ValueError
        If no row matches; the message lists the available atlas names.
    """
    if not isinstance(atlas_substr, str):
        raise TypeError(
            f"atlas_substr must be a string, got {type(atlas_substr).__name__}"
        )

    atlas_col = df["atlas"]
    # Track missing values explicitly: astype(str) would turn them into the
    # text 'nan'/'None', which a search term could then accidentally match.
    present = atlas_col.notna()
    mask = present & atlas_col.astype(str).str.contains(
        atlas_substr, case=False, na=False, regex=False
    )
    if not mask.any():
        available = sorted(atlas_col[present].astype(str).unique())
        raise ValueError(
            f"No atlas rows contained '{atlas_substr}'.\n"
            f"Available atlas names:\n  " + "\n  ".join(available)
        )
    df_by_atlas = df[mask].copy()
    return df_by_atlas

def select_measure(df, measure):
    """
    Return a copy of `df` restricted to the identifying columns
    (subject_id, atlas, hemisphere, StructName) plus the `measure` column.

    Raises ValueError naming every required column that `df` lacks.
    """
    wanted = ["subject_id", "atlas", "hemisphere", "StructName", measure]
    missing = list(filter(lambda name: name not in df.columns, wanted))
    if len(missing) > 0:
        raise ValueError(f"Missing columns: {missing}")
    return df[wanted].copy()

def sanitize(col):
    """
    Turn an arbitrary label into an identifier-friendly column name:
    every run of characters other than ASCII letters and digits becomes a
    single underscore, and underscores at either end are removed.
    """
    cleaned = col
    # first pass: non-alphanumeric runs -> '_'; second pass: collapse any
    # repeated underscores
    for pattern in (r'[^0-9A-Za-z]+', r'_{2,}'):
        cleaned = re.sub(pattern, '_', cleaned)
    return cleaned.strip('_')

def pivot(df, measure):
    """
    Collapse a long table into a one-row DataFrame.

    The row holds `subject_id` (taken from the first row of `df`) followed by
    one column per input row, named by sanitizing
    "<hemisphere>_<atlas>_<StructName>_<measure>" and holding that row's
    value of `measure`.
    """
    first_subject = df['subject_id'].iloc[0]
    features = {
        sanitize(
            f"{record['hemisphere']}_{record['atlas']}_{record['StructName']}_{measure}"
        ): record[measure]
        for _, record in df.iterrows()
    }
    # subject_id goes first; the feature columns follow in row order
    return pd.DataFrame([{'subject_id': first_subject, **features}])

def load_and_pivot_fsdata(participant_id,
                          atlas,
                          measure,
                          local_cache_dir=Path.home()/"cache"):
    """
    Build the single-row wide table for one participant.

    Steps, in order:
      load_fsdata_raw  -> raw long-form table
      filter_by_atlas  -> rows of the requested atlas
      select_measure   -> identifying columns + the one measure
      pivot            -> single-row wide DataFrame (returned)
    """
    long_table = load_fsdata_raw(participant_id, local_cache_dir)
    atlas_rows = filter_by_atlas(long_table, atlas)
    measure_rows = select_measure(atlas_rows, measure)
    return pivot(measure_rows, measure)



In [5]:
# Try the full pipeline on a single participant: DKT atlas, average thickness.
subject_wide = load_and_pivot_fsdata(
    participant_id=1000393599, atlas="aparc.DKTatlas", measure="ThickAvg"
)

subject_wide

Unnamed: 0,subject_id,lh_aparc_DKTatlas_caudalanteriorcingulate_ThickAvg,lh_aparc_DKTatlas_caudalmiddlefrontal_ThickAvg,lh_aparc_DKTatlas_cuneus_ThickAvg,lh_aparc_DKTatlas_entorhinal_ThickAvg,lh_aparc_DKTatlas_fusiform_ThickAvg,lh_aparc_DKTatlas_inferiorparietal_ThickAvg,lh_aparc_DKTatlas_inferiortemporal_ThickAvg,lh_aparc_DKTatlas_isthmuscingulate_ThickAvg,lh_aparc_DKTatlas_lateraloccipital_ThickAvg,...,rh_aparc_DKTatlas_precentral_ThickAvg,rh_aparc_DKTatlas_precuneus_ThickAvg,rh_aparc_DKTatlas_rostralanteriorcingulate_ThickAvg,rh_aparc_DKTatlas_rostralmiddlefrontal_ThickAvg,rh_aparc_DKTatlas_superiorfrontal_ThickAvg,rh_aparc_DKTatlas_superiorparietal_ThickAvg,rh_aparc_DKTatlas_superiortemporal_ThickAvg,rh_aparc_DKTatlas_supramarginal_ThickAvg,rh_aparc_DKTatlas_transversetemporal_ThickAvg,rh_aparc_DKTatlas_insula_ThickAvg
0,sub-1000393599,2.87,2.882,2.019,3.655,2.738,2.573,2.869,2.371,2.09,...,2.785,2.58,2.882,2.702,2.896,2.381,3.113,2.792,2.658,3.238


# Create full demo + brain data

In [6]:
# Atlas and measure to extract for every participant. These must be plain
# strings: a trailing comma after the value would turn each one into a
# 1-tuple, which the loading functions cannot match against the data.
ATLAS = "aparc.DKTatlas"
MEASURE = "ThickAvg"


# We'll display a progress bar `prog` as we go also:
from ipywidgets import IntProgress
prog = IntProgress(min=0, max=len(all_data))
display(prog)

# Demographic / phenotype columns copied from the participants table.
demo = [
    'age',
    'sex',
    'race',
    'ethnicity',
    'bmi',
    'handedness',
    'participant_education',
    'parent_1_education',
    'parent_2_education',
    'p_factor'
]


records = []
# IDs of participants whose FreeSurfer table could not be loaded or pivoted,
# kept so that the gaps are visible instead of silently ignored.
fs_missing = []

for row in all_data.itertuples(index=False):
    # start record with participant_id + all demo vars
    rec = {col: getattr(row, col) for col in ['participant_id'] + demo}

    # try to load & pivot; if successful, merge in all FS cols
    try:
        freesurfer = (
            load_and_pivot_fsdata(
                participant_id=rec['participant_id'],
                atlas=ATLAS,
                measure=MEASURE,
                local_cache_dir=Path.home()/"cache"
            )
            .drop(columns='subject_id')  # remove duplicate ID col
            .iloc[0]
            .to_dict()
        )
        rec.update(freesurfer)
    except (FileNotFoundError, ValueError):
        # leave rec with only ID+demo if FS data is missing/blank, but
        # remember which participant it was
        fs_missing.append(rec['participant_id'])

    records.append(rec)
    prog.value += 1

all_demo_and_brain = pd.DataFrame(records)

if fs_missing:
    print(f"No FreeSurfer data for {len(fs_missing)} of {len(records)} participants (see `fs_missing`).")

# split into train/test (test participants have no p_factor)
train_vars = all_demo_and_brain[all_demo_and_brain['p_factor'].notna()]
test_vars  = all_demo_and_brain[all_demo_and_brain['p_factor'].isna()]

all_demo_and_brain.head()

IntProgress(value=0, max=1601)

Unnamed: 0,participant_id,age,sex,race,ethnicity,bmi,handedness,participant_education,parent_1_education,parent_2_education,...,rh_aparc_DKTatlas_rostralanteriorcingulate_ThickAvg,rh_aparc_DKTatlas_rostralmiddlefrontal_ThickAvg,rh_aparc_DKTatlas_superiorfrontal_ThickAvg,rh_aparc_DKTatlas_superiorparietal_ThickAvg,rh_aparc_DKTatlas_superiortemporal_ThickAvg,rh_aparc_DKTatlas_supramarginal_ThickAvg,rh_aparc_DKTatlas_transversetemporal_ThickAvg,rh_aparc_DKTatlas_insula_ThickAvg,lh_aparc_DKTatlas_frontalpole_ThickAvg,rh_aparc_DKTatlas_temporalpole_ThickAvg
0,1000393599,15.583333,Male,Black,not Hispanic or Latino,22.15,Right,9th Grade,Complete primary,Complete secondary,...,2.882,2.702,2.896,2.381,3.113,2.792,2.658,3.238,,
1,1001970838,17.833333,Male,Other,Hispanic or Latino,23.98,Right,11th Grade,Complete tertiary,Complete tertiary,...,2.795,2.68,2.864,2.188,3.084,2.587,2.462,3.114,,
2,1007995238,13.75,Female,Other,not Hispanic or Latino,23.77,Right,6th Grade,Complete tertiary,Complete primary,...,2.797,2.624,2.87,2.53,3.092,2.854,2.529,3.438,,
3,1011497669,16.666667,Male,White,not Hispanic or Latino,29.68,Right,9th Grade,Complete tertiary,Complete tertiary,...,2.946,2.687,2.867,2.307,2.992,2.822,2.507,3.179,,
4,1017092387,18.666667,Female,Black,not Hispanic or Latino,23.24,Right,11th Grade,Complete primary,Complete primary,...,3.282,2.725,3.036,2.279,2.934,2.688,2.439,3.197,,


In [7]:
# Save the combined demographic + FreeSurfer table to the home directory
# (the row index is not written).
all_demo_and_brain.to_csv(path_or_buf="~/DKTatlas_ThickAvg.csv", index=False)