In [None]:
import csv
import re
from pathlib import Path

import pandas as pd

# Do not truncate long cell values (e.g. UIDs, storage URLs) when displaying DataFrames.
pd.set_option("display.max_colwidth", None)

In [None]:
# Names of the submissions to process. Each name is passed to
# process_submission(), which uses it both as the input sub-directory name and
# as the suffix of the output "packages_<name>" directory.
SUBMISSIONS = [
    # Currently excluded from processing:
    # "midrc-ricord-2021-08-20",
    "midrc-ricord-2021-09-02",
    "midrc-ricord-2021-09-22",
    "midrc-ricord-2021-10-06",
    "midrc-ricord-2021-10-26",
    "RSNA_20211117",
    "RSNA_20211214",
    "RSNA_20220105",
    "RSNA_20220114",
    "RSNA_20220124",
    "RSNA_20220211",
    "RSNA_20220214",
]

In [None]:
# Machine-specific default locations, used only when process_submission() is
# called without explicit roots. Pass `packages_root` / `rsna_root` to run the
# notebook against a different directory layout.
DEFAULT_PACKAGES_ROOT = Path("/Users/andrewprokhorenkov/CTDS/projects/midrc/midrc_indexing_scripts/packages")
DEFAULT_RSNA_ROOT = Path("/Users/andrewprokhorenkov/CTDS/projects/midrc/s3-data/raw/rsna")

# Column order of every per-series package TSV.
PACKAGE_FIELDNAMES = ["file_name", "file_size", "md5sum", "case_id", "study_id", "series_id", "instance_id", "storage_urls"]

# Matches the trailing run of digits and dots at the end of a submitter id.
_ID_REGEX = re.compile(r"([\d\.]+)$")


def _strip_case_prefix(value):
    """Return ``value`` without a leading "Case_"."""
    return value.removeprefix("Case_")


def _trailing_uid(value):
    """Return the trailing digits-and-dots run of ``value``.

    Raises AttributeError when ``value`` does not end with such a run
    (``re.search`` returns None in that case).
    """
    return _ID_REGEX.search(value).group(0)


def process_submission(SUBMISSION, packages_root=None, rsna_root=None):
    """Build per-series package TSVs for one submission.

    Reads the imaging manifest, study, series and instance TSV files found in
    ``<rsna_root>/<SUBMISSION>``, joins them, and writes one
    ``cases/<case_id>/<study_id>/<series_id>.tsv`` file per series under
    ``<packages_root>/packages_<SUBMISSION>``, plus a ``packages.txt`` file
    listing the relative path of every series file (one per line, in order of
    first appearance in the joined data).

    Each series file is written in a single pass and overwritten on re-run, so
    calling this function twice for the same submission yields the same files.

    Parameters
    ----------
    SUBMISSION : str
        Name of the submission directory under ``rsna_root``.
    packages_root : str or Path, optional
        Directory under which ``packages_<SUBMISSION>`` is created.
        Defaults to ``DEFAULT_PACKAGES_ROOT``.
    rsna_root : str or Path, optional
        Directory containing the submission directories.
        Defaults to ``DEFAULT_RSNA_ROOT``.

    Raises
    ------
    IndexError
        If the manifest or the study file is missing from the submission
        directory.
    """
    print(SUBMISSION)

    packages_root = Path(DEFAULT_PACKAGES_ROOT if packages_root is None else packages_root)
    rsna_root = Path(DEFAULT_RSNA_ROOT if rsna_root is None else rsna_root)

    # useful paths for data manipulation
    PACKAGES_PATH = packages_root / f"packages_{SUBMISSION}"
    PACKAGES_PATH.mkdir(parents=True, exist_ok=True)

    SUBMISSION_PATH = rsna_root / SUBMISSION

    # get all the necessary files; glob() order is arbitrary, so sort to make
    # the selection (and the concatenation order below) deterministic
    image_manifest_file = sorted(SUBMISSION_PATH.glob("imaging_data_manifest_*.tsv"))[0]
    studies_file = sorted(SUBMISSION_PATH.glob("*imaging_study_*.tsv"))[0]
    series_files = sorted(SUBMISSION_PATH.glob("*_series_*.tsv"))
    instance_files = sorted(SUBMISSION_PATH.glob("*_instance_*.tsv"))

    # Alternative patterns that were used at some point (kept for reference):
    # instance_files = list(SUBMISSION_PATH.glob("midrc_*_instance_*.tsv"))
    # instance_files = list(SUBMISSION_PATH.glob("midrc_*_image_*.tsv"))
    # studies_file = list(SUBMISSION_PATH.glob("midrc_imaging_study_*.tsv"))[0]

    # load all files into pandas DF's

    image_manifest = pd.read_csv(image_manifest_file, sep="\t")

    rename_columns_studies = {
        "cases.submitter_id": "case_id",
        "cases": "case_id", # for "midrc-ricord-2021-08-20"
        "study_uid": "study_id",
    }

    studies = pd.read_csv(studies_file, sep="\t").rename(columns=rename_columns_studies)
    studies["case_id"] = studies["case_id"].apply(_strip_case_prefix)
    studies = studies[["study_id", "case_id"]]

    rename_columns_series = {
        "mr_exams.submitter_id": "study_id",
        "ct_scans.submitter_id": "study_id",
        "radiography_exams.submitter_id": "study_id",
        "case_ids": "case_id",
        "series_uid": "series_id",
    }

    all_series = pd.concat(
        [
            pd.read_csv(path, sep="\t").rename(columns=rename_columns_series)[
                ["series_id", "study_id", "case_id"]
            ]
            for path in series_files
        ]
    )
    all_series["case_id"] = all_series["case_id"].apply(_strip_case_prefix)
    all_series["study_id"] = all_series["study_id"].apply(_trailing_uid)

    rename_columns_instances = {
        "cr_series.submitter_id": "series_id",
        "dx_series.submitter_id": "series_id",
        "ct_series.submitter_id": "series_id",
        "mr_series.submitter_id": "series_id",
        "submitter_id": "instance_id",
        "case_ids": "case_id",
    }

    all_instances = pd.concat(
        [
            pd.read_csv(path, sep="\t").rename(columns=rename_columns_instances)
            for path in instance_files
        ]
    )
    all_instances["case_id"] = all_instances["case_id"].apply(_strip_case_prefix)
    all_instances["instance_id"] = all_instances["instance_id"].apply(_trailing_uid)
    all_instances["series_id"] = all_instances["series_id"].apply(_trailing_uid)

    all_instances = all_instances[["instance_id", "series_id", "case_id", "file_name", "file_size", "md5sum", "storage_urls"]]

    # Each merge is an inner join on whatever columns the two frames share.
    merged = image_manifest.merge(all_instances).merge(all_series).merge(studies)
    merged["file_name"] = merged["instance_id"].apply(lambda v: f"{v}.dcm")
    # Keep instance_id: it is one of the columns written to the package files.
    merged = merged[PACKAGE_FIELDNAMES]

    # Sanity check: compare the row counts to see whether the joins dropped
    # or duplicated manifest rows.
    print(f"{image_manifest.shape}\n{merged.shape}")

    list_of_packages = []

    # sort=False keeps series in order of first appearance; rows inside each
    # group keep their original relative order.
    by_series = merged.groupby(["case_id", "study_id", "series_id"], sort=False)

    for (case_id, study_id, series_id), series_rows in by_series:
        list_of_packages.append(f"./cases/{case_id}/{study_id}/{series_id}.tsv\n")

        folder = PACKAGES_PATH / "cases" / case_id / study_id
        folder.mkdir(parents=True, exist_ok=True)

        # Write the whole series at once ("w", not "a") so re-runs do not
        # append duplicate rows; newline="" is what the csv module expects.
        with open(folder / f"{series_id}.tsv", mode="w", newline="") as f:
            writer = csv.DictWriter(f, delimiter="\t", fieldnames=PACKAGE_FIELDNAMES)
            writer.writeheader()
            writer.writerows(series_rows.to_dict("records"))

    with open(PACKAGES_PATH / "packages.txt", "w") as f:
        f.writelines(list_of_packages)

In [None]:
# Process every configured submission in list order; each call prints the
# submission name followed by the manifest and merged frame shapes.
for SUBMISSION in SUBMISSIONS:
    process_submission(SUBMISSION)