## Notebook to reorganize PPMI BIDS and derivative processing
    - Vincent has multiple releases
        - bids, sdMRI, T1only, livingpark

    - Steps: BIDS Merge
        - fix participants.tsv for any participants mismatch with BIDS dir
        - count available participants, sessions, and modalities (bids_tracker.py) across releases
        - copy additional participants from releases (i.e. ver_T1, ver_livingpark) into current (i.e. mr_proc) BIDS dir
        - copy additional sessions from releases into current participants dirs
        - copy additional modalities from releases into current participants/session dirs

    - Steps: Derivatives Merge
        - TODO

In [1]:
import numpy as np
import pandas as pd
from itertools import product

### Proc tracker paths

In [2]:
# Root directory that holds the processing-status files of every release.
proc_status_dir = "/home/nikhil/projects/Parkinsons/ppmi/proc_status/"
# Status directory of the current release; note that it ends with "/".
current_dir = f"{proc_status_dir}current/"

# Status directories of the other releases (each also ends with "/").
ver_sdMRI_dir = f"{proc_status_dir}ver_sdMRI/"
# The T1 release lives in a directory named "ver_T1w".
ver_T1_dir = f"{proc_status_dir}ver_T1w/"
ver_LP1_dir = f"{proc_status_dir}ver_LP1/"

### Fix PPMI participants.tsv
    - ver_T1w has mismatched participants

In [3]:
# participant_tsv_to_fix = f"{ver_T1_dir}participants_to_fix.tsv"
# participant_tsv = f"{ver_T1_dir}participants.tsv"
# mismatched_csv = f"{ver_T1_dir}mismatched_participants.csv"

# participant_df = pd.read_csv(participant_tsv_to_fix, sep="\t")
# mismatched_df = pd.read_csv(mismatched_csv)[["participant_id","status"]]

# print(f"number of participants: {len(participant_df)}")
# participant_df.head()

# drop_participants = mismatched_df[mismatched_df["status"] == "participants_missing_in_bids_dir"]["participant_id"].values
# add_participants = mismatched_df[mismatched_df["status"] == "participants_missing_in_tsv"]["participant_id"].values

# print(f"n_drop_participants: {len(drop_participants)}, n_add_participants: {len(add_participants)}")

# add_df = pd.DataFrame()
# add_df["participant_id"] = add_participants
# add_df["age"] = None
# add_df["sex"] = None
# add_df["group"] = None

# participant_df = participant_df[~participant_df["participant_id"].isin(drop_participants)]
# participant_df = pd.concat([participant_df, add_df])
# print(f"number of participants after add/drop: {len(participant_df)}")

# # Save participant.tsv
# # participant_df.to_csv(participant_tsv, index=None, sep="\t")

### Check BIDS availability
    - participants in bids_current_df == participants in bids_sdMRI_df = 569
        - Including counts for each session and modality
    - len(current_participants - T1_participants ) = 104
    - len(T1_participants - current_participants) = 552

In [3]:
# Load each release's BIDS status table and collect its unique participant IDs.
bids_current_df = pd.read_csv(f"{current_dir}bids_status.csv")
current_participants = set(bids_current_df["participant_id"].values)

bids_T1_df = pd.read_csv(f"{ver_T1_dir}ver_T1_bids_status.csv")
T1_participants = set(bids_T1_df["participant_id"].values)

bids_sdMRI_df = pd.read_csv(f"{ver_sdMRI_dir}ver_sdMRI_bids_status.csv")
sdMRI_participants = set(bids_sdMRI_df["participant_id"].values)

bids_LP_df = pd.read_csv(f"{ver_LP1_dir}ver_LP1_bids_status.csv")
LP_participants = set(bids_LP_df["participant_id"].values)

# One summary line per release: number of unique participants.
release_summary = [
    f"current: {len(current_participants)}",
    f"sdMRI: {len(sdMRI_participants)}",
    f"T1: {len(T1_participants)}",
    f"LP: {len(LP_participants)}",
]
print("\n".join(release_summary))

# Preview the current status table as the cell output.
bids_current_df.head()

current: 1131
sdMRI: 569
T1: 1017
LP: 21


Unnamed: 0,participant_id,session_id,anat,dwi
0,sub-42860,30,0,0
1,sub-42860,5,0,0
2,sub-42860,7,0,0
3,sub-42860,1,2,2
4,sub-42860,90,0,0


### Get session-modality count

In [4]:
def get_session_counts(df, modalities):
    """Print per-session availability counts for each modality column.

    The sorted array of unique ``session_id`` values is printed first. Then,
    for every session, one line is printed with a list of
    ``(modality, count)`` pairs, where ``count`` is the number of rows of that
    session whose ``modality`` column is greater than zero.

    Args:
        df: table with a ``session_id`` column and one column per modality.
        modalities: names of the modality columns to tally.

    Returns:
        None; the counts are only printed.
    """
    sessions = np.sort(df["session_id"].unique())
    print(sessions)
    for session in sessions:
        in_session = df["session_id"] == session
        # len() keeps the counts as plain Python ints in the printed list.
        counts = [
            (name, len(df[in_session & (df[name] > 0)]))
            for name in modalities
        ]
        print(f"session: {session}, {counts}")

In [5]:
# Modality columns of the BIDS status tables to tally per session.
modalities = ["anat","dwi"]
# Print the session-wise modality counts for the current release.
get_session_counts(bids_current_df,modalities)

[ 0  1  5  7  9 11 21 30 90 91]
session: 0, [('anat', 6), ('dwi', 4)]
session: 1, [('anat', 977), ('dwi', 473)]
session: 5, [('anat', 248), ('dwi', 247)]
session: 7, [('anat', 244), ('dwi', 247)]
session: 9, [('anat', 2), ('dwi', 2)]
session: 11, [('anat', 160), ('dwi', 205)]
session: 21, [('anat', 10), ('dwi', 10)]
session: 30, [('anat', 1), ('dwi', 1)]
session: 90, [('anat', 7), ('dwi', 6)]
session: 91, [('anat', 1), ('dwi', 1)]


In [6]:
# Print the same session-wise modality counts for the T1 release table.
get_session_counts(bids_T1_df,modalities)

[ 0  1  5  7  9 11 21 30 90 91]
session: 0, [('anat', 6), ('dwi', 0)]
session: 1, [('anat', 974), ('dwi', 0)]
session: 5, [('anat', 248), ('dwi', 0)]
session: 7, [('anat', 244), ('dwi', 0)]
session: 9, [('anat', 2), ('dwi', 0)]
session: 11, [('anat', 160), ('dwi', 0)]
session: 21, [('anat', 10), ('dwi', 0)]
session: 30, [('anat', 1), ('dwi', 0)]
session: 90, [('anat', 7), ('dwi', 0)]
session: 91, [('anat', 1), ('dwi', 0)]


### Identify additional participants to be copied into current
    - First from ver_T1
    - Second from ver_LP

In [9]:
# Participants present in the LP release but missing from the current release.
participants_to_add = LP_participants - current_participants
print(f"Number of new participants to be added: {len(participants_to_add)}")

# Sort the IDs so the table (and any CSV saved from it) has a reproducible row
# order; iterating a set directly yields an arbitrary order.
df = pd.DataFrame({"participant_id": sorted(participants_to_add)})
# proc_status_dir already ends with "/", so no extra separator is needed.
# df.to_csv(f"{proc_status_dir}LP_participants_to_add_into_current.csv", index=None)

Number of new participants to be added: 21


#### Update current participants.tsv

In [10]:
# Merge the LP-only participants into the current participants.tsv table.
# current_dir and ver_LP1_dir already end with "/", so the file name is
# appended directly (the same convention used when loading the status CSVs).
updated_participant_tsv = f"{current_dir}updated_participants.tsv"

current_participants_df = pd.read_csv(f"{current_dir}participants.tsv", sep="\t")
LP_participants_df = pd.read_csv(f"{ver_LP1_dir}participants.tsv", sep="\t")

# Keep only the LP rows whose participant is not yet in the current release.
LP_participants_df_to_add_df = LP_participants_df[LP_participants_df["participant_id"].isin(participants_to_add)]

print(f"n_current: {len(current_participants_df)}, n_LP_to_add: {len(LP_participants_df_to_add_df)}")

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent and, like append, keeps the original row indices by default.
current_participants_df = pd.concat([current_participants_df, LP_participants_df_to_add_df])
print(f"n_add: {len(LP_participants_df_to_add_df)}, n_updated: {len(current_participants_df)}")

# Drop duplicates (if any); the first occurrence of each participant_id wins,
# i.e. rows already in the current table take precedence over the LP rows.
current_participants_df = current_participants_df.drop_duplicates("participant_id", keep="first")
print(f"after dropping duplicates n_updated: {len(current_participants_df)}")

# Save participant.tsv
# current_participants_df.to_csv(updated_participant_tsv, index=None, sep="\t")

n_current: 1121, n_T1: 21
n_add: 21, n_updated: 1142
after dropping duplicates n_updated: 1131


### Identify additional sessions to be copied into current