### Notebook to generate participant IDs and populate `mr_proc_manifest.csv` for the NIMHANS PD-YLO dataset

In [1]:
import pandas as pd
import numpy as np
import glob
import os
import re
from pathlib import Path
import shutil

In [2]:
# Dataset locations. Every directory value ends with "/" because the cells
# below build child paths by plain string concatenation.
dataset_root = "/home/nikhil/projects/NIMHANS/data/PD_YLO/"
# Scanner export tree holding one sub-directory per participant.
raw_dicom_dir = f"{dataset_root}downloads/YLOPD/YLOPD_49/"
# Destination for the flat, participant-level dicom directories.
dicom_dir = f"{dataset_root}dicom/"
# dataset_root already carries the trailing separator, so none is added here.
mr_proc_manifest = f"{dataset_root}tabular/demographics/mr_proc_manifest.csv"

In [25]:
def search_dicoms(root_dicom_dir):
    """Search and return the dicom files from a scanner dicom-dir-tree output.

    Args:
        root_dicom_dir: Root of the directory tree to walk recursively.

    Returns:
        Set of full paths, one per file found anywhere under
        ``root_dicom_dir``. The set is empty when the directory does not
        exist, because ``os.walk`` silently yields nothing in that case.
    """
    # Walk the directory handed in by the caller rather than a module-level
    # path, so the result is limited to the requested tree.
    filelist = [
        os.path.join(root, file)
        for root, _dirs, files in os.walk(root_dicom_dir)
        for file in files
    ]

    # Full paths produced by a single walk are always distinct; only base
    # names can collide (same file name in different sub-directories), which
    # is what would clash if the files were copied into one flat directory.
    n_dcms = len(filelist)
    n_unique_names = len({os.path.basename(f) for f in filelist})

    if n_unique_names != n_dcms:
        n_duplicates = n_dcms - n_unique_names
        print(f"Duplicate dicom names found for {n_duplicates} dcms")

    return set(filelist)

def copy_dicoms(filelist, dicom_dir):
    """Copy dicoms from a scanner dicom-dir-tree output into a flat participant-level dir.

    Args:
        filelist: Iterable of source file paths to copy.
        dicom_dir: Destination directory, with or without a trailing
            separator. It is created (including missing parents) when absent.

    Files are copied under their base name only. If ``dicom_dir`` already
    exists, nothing is copied and a message is printed instead.
    """
    dest_dir = Path(dicom_dir)
    if dest_dir.is_dir():
        print("participant dicoms already exist")
        return

    dest_dir.mkdir(parents=True)
    for src in filelist:
        # Join with a real path separator so the copy lands inside dest_dir
        # even when the caller omitted the trailing "/".
        shutil.copyfile(src, dest_dir / os.path.basename(src))


In [28]:
# Participants listed in the manifest.
manifest_df = pd.read_csv(mr_proc_manifest)

participants = manifest_df["participant_id"].values
n_participants = len(participants)

# Entries already present in the reorganized dicom dir. Listing `dicom_dir`
# itself keeps this in sync with the location the copies below are written to.
participant_dicom_dirs = os.listdir(dicom_dir)
n_participant_dicom_dirs = len(participant_dicom_dirs)

# Manifest participants that do not have a matching entry in the dicom dir yet.
# NOTE(review): this compares manifest values against directory names, so it
# presumably expects participant_id to be read as strings - confirm.
dicom_reorg_participants = set(participants) - set(participant_dicom_dirs)
n_dicom_reorg_participants = len(dicom_reorg_participants)

print(f"n_particitpants: {n_participants} \
    n_particitpant_dicom_dirs: {n_participant_dicom_dirs} \
    dicom_reorg_participants: {n_dicom_reorg_participants}")

if dicom_reorg_participants:
    for participant in dicom_reorg_participants:
        print(f"participant_id: {participant}")
        # Gather this participant's files from the scanner export tree ...
        participant_raw_dicom_dir = f"{raw_dicom_dir}{participant}/"
        raw_dcm_list = search_dicoms(participant_raw_dicom_dir)
        print(f"n_raw_dicom: {len(raw_dcm_list)}")
        # ... and copy them into the participant's flat dicom dir.
        participant_dicom_dir = f"{dicom_dir}{participant}/"
        copy_dicoms(raw_dcm_list, participant_dicom_dir)

else:
    print("No new participants found for dicom reorg...")

n_particitpants: 1     n_particitpant_dicom_dirs: 1     dicom_reorg_participants: 0
No new participants found for dicom reorg...
