In [None]:
import csv
from itertools import chain
import locale

# Pin the process locale to en_US; the locale.atoi() call that parses the
# "file_size" column further down interprets number strings according to it.
locale.setlocale(locale.LC_ALL, "en_US.UTF-8")
import re
from pathlib import Path

import numpy as np
import pandas as pd

# Do not truncate long cell values (paths, UIDs, URLs) when frames are displayed.
pd.set_option("display.max_colwidth", None)


In [None]:
# Known submissions. Each entry is a folder path relative to RSNA_PATH (it is
# joined onto that root to build SUBMISSION_PATH) and is also embedded in the
# name of the output packages folder.
SUBMISSIONS = [
    "acrimage/2021/06",
    "acrimage/2021/07",
    "acrimage/2021/08",
    "acrimage/2021/0827",  # fu
    "acrimage/2021/09",
    "acrimage/2021/10/batch6",
    "acrimage/2021/10/batch7",
    "acrimage/2021/11",
    "acrimage/2021/ACRPETAL_20211220",
    "acrimage/2022/ACR_20220107",
    "ACR_20211115",
    "ACR_20220107",
    "ACR_20220218",
]


In [None]:
# Pick the submission to process in this run; index 3 is "acrimage/2021/0827".
# Change the index to process a different entry of SUBMISSIONS.
SUBMISSION = SUBMISSIONS[3]
# Echo the selection as the cell output.
SUBMISSION


In [None]:
# Output root for the per-series package files generated for this submission.
PACKAGES_PATH = Path(
    f"/Users/andrewprokhorenkov/CTDS/projects/midrc/indexing-data/packages_acr/packages_{SUBMISSION}"
)
PACKAGES_PATH.mkdir(parents=True, exist_ok=True)

# Root of the processed submissions; the selected submission lives beneath it.
RSNA_PATH = Path("/Users/andrewprokhorenkov/CTDS/projects/midrc/processed-s3")
SUBMISSION_PATH = RSNA_PATH / SUBMISSION

# Image/instance manifests are named differently from submission to submission,
# hence the list of patterns. The patterns overlap (for example a file called
# "image_manifest_x.txt" matches both "*image_manifest*.txt" and "image_*.txt"),
# so de-duplicate while keeping discovery order; otherwise the same manifest
# would be parsed more than once.
image_manifest_file = list(
    dict.fromkeys(
        chain(
            SUBMISSION_PATH.glob("**/CIRR*.txt"),
            SUBMISSION_PATH.glob("**/*image_manifest*.txt"),
            SUBMISSION_PATH.glob("**/image_*.txt"),
            SUBMISSION_PATH.glob("image_*.tsv"),
            SUBMISSION_PATH.glob("*_instance_*.tsv"),
        )
    )
)
# studies_file = list(SUBMISSION_PATH.glob("*imaging_study_*.tsv"))[0]

# Series files: nested .txt files or top-level .tsv files.
series_files = list(
    chain(
        SUBMISSION_PATH.glob("**/*_series_*.txt"),
        SUBMISSION_PATH.glob("*_series_*.tsv"),
    )
)
# instance_files = list(SUBMISSION_PATH.glob("*_instance_*.tsv"))

# Fail here with a readable message instead of letting pd.concat() raise
# "No objects to concatenate" in a later cell.
if not image_manifest_file or not series_files:
    raise FileNotFoundError(
        f"Expected image manifest and series files under {SUBMISSION_PATH}; "
        f"found {len(image_manifest_file)} manifest file(s) and "
        f"{len(series_files)} series file(s)"
    )


In [None]:
# Header aliases seen across the series files, mapped onto the canonical
# column names used by the rest of the notebook.
rename_columns = {
    "case_ids": "case_id",
    "Subject_ID": "case_id",
    "series_uid": "series_id",
    "dr_exams.submitter_id": "study_id",
    "radiography_exam.submitter_id": "study_id",
    "radiography_exams.submitter_id": "study_id",
    "ct_scan.submitter_id": "study_id",
    "ct_scans.submitter_id": "study_id",
    "mr_exams.submitter_id": "study_id",
    "nm_exams.submitter_id": "study_id",
    "pt_scans.submitter_id": "study_id",
    "pr_exams.submitter_id": "study_id",
    "rf_exams.submitter_id": "study_id",
    "series-submitter": "submitter_id",
}

# Read every tab-separated series file with harmonised headers, then stack the
# frames under a fresh 0..n-1 index.
series_frames = [
    pd.read_csv(series_path, sep="\t").rename(columns=rename_columns)
    for series_path in series_files
]
series = pd.concat(series_frames, ignore_index=True)

# series["study_id"] = series["study_id"].apply(lambda v: v.split("_")[1])

# Keep only the identifier columns, one row per distinct combination.
series = series.loc[:, ["series_id", "study_id", "case_id"]].drop_duplicates()

series


In [None]:
# Header aliases seen across the image/instance manifests, mapped onto the
# canonical column names used by the rest of the notebook.
rename_columns = {
    "case_ids": "case_id",
    "study_uid": "study_id",
    "ct_scans.submitter_id": "study_id",
    "radiography_exam.submitter_id": "study_id",
    "series_uid": "series_id",
    "series.submitter_id": "series_id",
    "cr_series.submitter_id": "series_id",
    "ct_series.submitter_id": "series_id",
    "dx_series.submitter_id": "series_id",
    "*md5sum": "md5sum",
    "mdsum": "md5sum",
    "*file_name": "file_name",
    "*file_size": "file_size",
    "submitter_id": "instance_id",
    "object_id": "instance_id",
}

# Read every tab-separated manifest with harmonised headers and stack them.
instance_frames = [
    pd.read_csv(manifest_path, sep="\t").rename(columns=rename_columns)
    for manifest_path in image_manifest_file
]
instances = pd.concat(instance_frames, ignore_index=True)

# instances["series_id"] = instances["series_id"].apply(lambda v: v.split("_")[1])
# instances["study_id"] = instances["study_id"].apply(lambda v: v.split("_")[1])

# `series` always carries a study_id column. When a manifest brings its own
# (see the study_id aliases above), merging on case_id/series_id alone would
# produce study_id_x / study_id_y and the column selection below would raise
# KeyError. Drop the manifest's copy so study_id always comes from the series
# table, exactly as it does for manifests that have no study column.
if "study_id" in instances.columns:
    instances = instances.drop(columns="study_id")

# Inner join: only instances whose (case_id, series_id) is listed in `series`
# are kept, and each one picks up that series' study_id.
instances = instances.merge(series, on=["case_id", "series_id"])

# Sizes may arrive as locale-formatted strings. Convert only the string values:
# a column concatenated from several manifests can mix ints and strings, and
# locale.atoi() fails on non-string input. Checking for "not numeric" rather
# than "object dtype" also covers pandas' dedicated string dtypes.
if not pd.api.types.is_numeric_dtype(instances["file_size"]):
    instances["file_size"] = instances["file_size"].map(
        lambda size: locale.atoi(size) if isinstance(size, str) else size
    )

# Keep the package columns in their output order, one row per distinct record.
instances = instances[
    [
        "file_name",
        "file_size",
        "md5sum",
        "case_id",
        "study_id",
        "series_id",
        "instance_id",
        "storage_urls",
    ]
].drop_duplicates()

# instances


In [None]:
# Column order of every per-series TSV file.
fieldnames = [
    "file_name",
    "file_size",
    "md5sum",
    "case_id",
    "study_id",
    "series_id",
    "instance_id",
    "storage_urls",
]

# Bucket the instance rows by series. Dicts keep insertion order, so series are
# processed (and listed in packages.txt) in order of first appearance.
rows_by_series = {}
for row in instances.to_dict("records"):
    series_key = (row["case_id"], row["study_id"], row["series_id"])
    rows_by_series.setdefault(series_key, []).append(row)

list_of_packages = []

for (case_id, study_id, series_id), series_rows in rows_by_series.items():
    list_of_packages.append(f"./cases/{case_id}/{study_id}/{series_id}.tsv\n")

    folder = PACKAGES_PATH / "cases" / case_id / study_id
    folder.mkdir(parents=True, exist_ok=True)

    # Write each series file exactly once, in "w" mode: appending row by row
    # made a re-run of this cell duplicate every row in the existing files,
    # while packages.txt below is overwritten. newline="" is what the csv
    # module requires so it can control the line endings itself.
    with open(folder / f"{series_id}.tsv", mode="w", newline="") as f:
        writer = csv.DictWriter(f, delimiter="\t", fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(series_rows)

# Index of all generated series files, one relative path per line.
with open(PACKAGES_PATH / "packages.txt", "w") as f:
    f.writelines(list_of_packages)
