## Notebook to keep track of mri derivatives

### Raw tabular data is here
    `/home/nikhil/projects/Parkinsons/ppmi/derivatives`

### Currently collating and tracking following pipelines:
    - BIDS
    - fMRIPrep
    - freesurfer

In [1]:
import numpy as np
import pandas as pd
from itertools import product

In [5]:
ppmi_dir = "/home/nikhil/projects/Parkinsons/ppmi/"
demographics_dir = f"{ppmi_dir}tabular/demographics/"

bids_status_dir = f"{ppmi_dir}scratch/bids_status/current/"

derivatives_dir = f"{ppmi_dir}derivatives/"

bids_status_csv = f"{bids_status_dir}bids_status.csv"

## tmp subject counts on fmriprep dir
fmriprep_ses_subject_ids_dir = f"{derivatives_dir}subject_ids/"

## status_dir
bagel_csv = f"{ppmi_dir}/derivatives/bagel.csv"

In [6]:
current_mr_proc_manifest_csv = f"{demographics_dir}/mr_proc_manifest.csv"
current_mr_proc_manifest_df = pd.read_csv(current_mr_proc_manifest_csv)

n_participants = len(current_mr_proc_manifest_df["participant_id"].unique())
print(f"n_participants: {n_participants}")

dx_groups = current_mr_proc_manifest_df["group"].unique()
print(f"groups: {dx_groups}")
select_dex_groups = ['PD', 'Prodromal', 'Control', 'SWEDD']
print(f"select groups: {select_dex_groups}")
current_mr_proc_manifest_df = current_mr_proc_manifest_df[current_mr_proc_manifest_df["group"].isin(select_dex_groups)]

n_participants = len(current_mr_proc_manifest_df["participant_id"].unique())
print(f"n_participants: {n_participants}")
current_mr_proc_manifest_df.head()

n_participants: 1216
groups: ['PD' 'Prodromal' 'Control' 'SWEDD']
select groups: ['PD', 'Prodromal', 'Control', 'SWEDD']
n_participants: 1216


Unnamed: 0,participant_id,age,sex,group
0,100001,67,M,PD
1,100005,53,M,PD
2,100006,56,F,PD
3,100007,67,M,PD
4,100012,66,F,PD


### Check BIDS participants

In [22]:
bids_df = pd.read_csv(bids_status_csv)
bids_ids = bids_df["participant_id"].unique()
n_bids_ids = len(bids_ids)
print(f"n_participants: {n_bids_ids}")

bids_df = bids_df[bids_df["anat"] !=0 ]

bids_df.groupby("session_id").count()

n_participants: 1131


Unnamed: 0_level_0,participant_id,anat,dwi
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,6,6,6
1,977,977,977
5,248,248,248
7,244,244,244
9,2,2,2
11,160,160,160
21,10,10,10
30,1,1,1
90,7,7,7
91,1,1,1


### Check fmriprep output tally
- 0,,'Screening'
- 1,0,'Baseline'
- 5,12,'Month 12'
- 7,24,'Month 24'
- 9,36,'Month 36'
- 11,48,'Month 48'
- 30,,'Premature Withdrawal'
- 21,,'Symptomatic Therapy'
- 90,,'Unscheduled Visit 01'

In [8]:
session_dict = {0: "Screening",
                1: "Baseline", 
                5: "Month 12",
                7: "Month 24",
                9: "Month 36",
                11: "Month 48"
                }

for ses_id, ses_name in session_dict.items():
    ses_participant_ids_txt = f"{fmriprep_ses_subject_ids_dir}/PPMI_ses-{ses_id}_fmriprep_anat_20.2.7_participant_ids.txt"
    participants_list = list(pd.read_csv(ses_participant_ids_txt, header=None)[0].unique())
    n_participants = len(participants_list)

    print(f"{ses_name}, n_participants: {n_participants}")

Screening, n_participants: 3
Baseline, n_participants: 430
Month 12, n_participants: 238
Month 24, n_participants: 237
Month 36, n_participants: 2
Month 48, n_participants: 157


### Baseline fmriprep proc

In [17]:
ses_participant_ids_txt = f"{fmriprep_ses_subject_ids_dir}/PPMI_ses-1_fmriprep_anat_20.2.7_participant_ids.txt"
baseline_participants_list = list(pd.read_csv(ses_participant_ids_txt, header=None)[0].unique())
n_participants = len(baseline_participants_list)
print(f"n_participants: {n_participants}")

n_participants: 430


In [18]:
# Demographics of processed fmriprep ids
proc_df = current_mr_proc_manifest_df[current_mr_proc_manifest_df["participant_id"].isin(baseline_participants_list)]
print(f"n_proc_df: {len(proc_df)}")
proc_df.groupby(["group"]).count()["participant_id"]

n_proc_df: 0


Series([], Name: participant_id, dtype: int64)

### Check bagel.csv 
    - currently only tracking freesurfer

In [34]:
bagel_df = pd.read_csv(bagel_csv).rename(columns={"Unnamed: 0":"participant_id"})
bagel_df["bids_id"] = "sub-" + bagel_df["participant_id"].astype(str)
bagel_df = bagel_df[bagel_df["Global_run_status"]]
bagel_df.head()

Unnamed: 0,participant_id,session,pipeline,version,Global_run_status,Phase_DKT_stats,workflow,bids_id
12,100891,1,,6.0.1,True,True,freesurfer,sub-100891
13,100898,1,,6.0.1,True,True,freesurfer,sub-100898
16,100952,1,,6.0.1,True,True,freesurfer,sub-100952
20,101038,1,,6.0.1,True,True,freesurfer,sub-101038
21,101039,1,,6.0.1,True,True,freesurfer,sub-101039


### Subjects to be processed

In [35]:
def get_missing_participant_ids(bids_df, bagel_df):
    bids_ids = set(bids_df["participant_id"].unique())
    bagel_ids = set(bagel_df["bids_id"].unique())
    missing_ids = bids_ids - bagel_ids
    return list(missing_ids)

In [38]:
save_ids = False
participants_list_dir = f"{ppmi_dir}/scratch/participant_lists/"
for ses in [1,5,7,9,11]:
    df1 = bids_df[bids_df["session_id"]==ses]
    df2 = bagel_df[bagel_df["session"]==ses]

    missing_ids = get_missing_participant_ids(df1, df2)
    print(f"n_participants reprocess: {len(missing_ids)}")

    if save_ids: 
        missing_csv = f"{participants_list_dir}/participants_reprocess_ses-{ses}.csv"
        missing_df = pd.DataFrame()
        missing_df["participant_id"] = missing_ids
        missing_df["session_id"] = ses
        
        missing_df.to_csv(missing_csv, index=None)

n_participants reprocess: 554
n_participants reprocess: 25
n_participants reprocess: 23
n_participants reprocess: 0
n_participants reprocess: 11
