In [None]:
import csv
import hashlib
import json
import locale
import random
import sys
import uuid
from itertools import chain
from pathlib import PosixPath, Path

import numpy as np
import pandas as pd

# Raise the csv module's per-field size cap to sys.maxsize.
# Presumably needed for the large `package_contents` column parsed further down - TODO confirm.
csv.field_size_limit(sys.maxsize)

In [None]:
# Dedicated PRNG with a fixed seed, so anything drawn from it is repeatable across runs.
rnd = random.Random(42)

In [None]:
# Submission batches to process, as paths relative to `submission_path`.
# Commented-out entries are kept for reference and are skipped.
submissions = [
    # "2021/06",
    # "2021/07",
    "2021/08",
    "2021/0827",
    # "packages_acrimage/PETAL",
    "2021/09",
    "2021/10/batch6",
    "2021/10/batch7",
    "2021/11",
    # "packages_acrimage/2021/ACRPETAL_20211220",
    # "packages_acrimage/2022/ACR_20220107",
    # "ACR_20211115",
    # "ACR_20220107",
    # "packages_ACR_20220218",
]

# Root folder holding one sub-folder per submission batch.
submission_path = PosixPath("~/CTDS/projects/midrc/ssot-s3/replicated-data-acr/acrclinical").expanduser()

# Output tree next to `submission_path`, with one sub-folder per manifest category.
to_index_path = submission_path.joinpath("..", "to_index_acr")
for category in ("open", "seq", "remove", "missing"):
    to_index_path.joinpath(category).mkdir(parents=True, exist_ok=True)

# TSV that assigns each case to a dataset (read in a later cell).
sequestration_master_file_path = PosixPath("~/CTDS/projects/midrc/indexing-data/sequestration/master_sequestration_locations_31927_2022-09-07.tsv").expanduser()

In [None]:
RAW_DATA = PosixPath("~/CTDS/projects/midrc/ssot-s3").expanduser()

# RemoveHeads files all share one layout. Each row names an imaging_study to drop
# through its `submitter_id`, which is formatted as
# <case>_<study_id>
# NOTE(review): only the second "_"-separated piece is kept as study_id, which
# assumes case ids never contain an underscore - confirm.
remove_heads_files = RAW_DATA.glob("**/RemoveHeads*.txt")
remove_heads_studies = pd.concat(
    [
        pd.read_csv(path, sep="\t")["submitter_id"]
        .str.split("_", expand=True)
        .rename(columns={0: "case_id", 1: "study_id"})[["study_id"]]
        for path in remove_heads_files
    ]
).reset_index(drop=True)

# The deletion files carry the same information in two other layouts
# (one for imaging_study, one for images), so their headers are normalised first.
rename_columns = {
    "*type": "type",
    "*submitter_id": "submitter_id",
    "study_uid": "study_id",
}

deletion_imaging_study_files = RAW_DATA.glob("**/deletion_imaging_*.tsv")
deletion_imaging_study_studies = pd.concat(
    [
        pd.read_csv(path, sep="\t").rename(columns=rename_columns)[["study_id"]]
        for path in deletion_imaging_study_files
    ]
).reset_index(drop=True)

# Union of both sources; the bare array of study ids is what later cells match against.
studies_to_delete = pd.concat(
    [remove_heads_studies, deletion_imaging_study_studies]
).reset_index(drop=True)
match_studies_to_delete = studies_to_delete["study_id"].values

In [None]:
# Map each case id to its dataset, taken from the "case_ids" and "dataset"
# columns of the sequestration master TSV. If a case id appears on several rows,
# the last row wins.
seq_master = {}
# newline="" leaves line-ending handling to the csv module, as its documentation
# requires for file objects passed to a reader.
with open(sequestration_master_file_path, newline="") as sequestration_master_file:
    reader = csv.DictReader(sequestration_master_file, delimiter="\t")

    for row in reader:
        seq_master[row["case_ids"]] = row["dataset"]

In [None]:
# Bare expression: shows the resolved submissions root as this cell's output.
submission_path

In [None]:
# Preview which manifest files the glob patterns pick up for a single submission.
# `submission` is bound explicitly so the cell runs on a fresh kernel instead of
# relying on a value left behind by a loop in another cell.
# NOTE(review): previewing the first configured batch is an assumption; change
# the index to inspect another one.
submission = submissions[0]
sub_path = Path(submission_path) / submission

# Image/instance manifests: several naming schemes, searched recursively for .txt
# and only at the top level for .tsv.
image_manifest_file = list(
    chain(
        sub_path.glob("**/CIRR*.txt"),
        sub_path.glob("**/*image_manifest*.txt"),
        sub_path.glob("**/image_*.txt"),
        sub_path.glob("image_*.tsv"),
        sub_path.glob("*_instance_*.tsv"),
    )
)
# studies_file = list(SUBMISSION_PATH.glob("*imaging_study_*.tsv"))[0]
series_files = list(
    chain(
        sub_path.glob("**/*_series_*.txt"),
        sub_path.glob("*_series_*.tsv"),
    )
)
# Cell output: the recursively found series .txt files only.
list(sub_path.glob("**/*_series_*.txt"))

In [None]:
# Build the `series` and `instances` tables of a submission from its raw manifest files.
# The loop stops after the first submission (see `break`); the code that wrote
# per-series package files is kept, commented out, below it.
# Requires `numpy as np` and `locale` from the imports cell.
for submission in submissions:
    print(submission)
    # packages_path = Path(output_path) / submission
    # packages_path.mkdir(parents=True, exist_ok=True)

    sub_path = Path(submission_path) / submission

    # Image/instance manifests come in several naming schemes.
    image_manifest_file = list(
        chain(
            sub_path.glob("**/CIRR*.txt"),
            sub_path.glob("**/*image_manifest*.txt"),
            sub_path.glob("**/image_*.txt"),
            sub_path.glob("image_*.tsv"),
            sub_path.glob("*_instance_*.tsv"),
        )
    )
    # studies_file = list(SUBMISSION_PATH.glob("*imaging_study_*.tsv"))[0]
    series_files = list(
        chain(
            sub_path.glob("**/*_series_*.txt"),
            sub_path.glob("*_series_*.tsv"),
        )
    )
    print(series_files)
    # instance_files = list(SUBMISSION_PATH.glob("*_instance_*.tsv"))

    # Header variants seen in series files, mapped onto one common schema.
    rename_columns = {
        "case_ids": "case_id",
        "Subject_ID": "case_id",
        "series_uid": "series_id",
        "dr_exams.submitter_id": "study_id",
        "radiography_exam.submitter_id": "study_id",
        "radiography_exams.submitter_id": "study_id",
        "ct_scan.submitter_id": "study_id",
        "ct_scans.submitter_id": "study_id",
        "mr_exams.submitter_id": "study_id",
        "nm_exams.submitter_id": "study_id",
        "pt_scans.submitter_id": "study_id",
        "pr_exams.submitter_id": "study_id",
        "rf_exams.submitter_id": "study_id",
        "series-submitter": "submitter_id",
        "imaging_studies.submitter_id": "study_id",
    }

    # ignore_index=True already yields a fresh 0..n-1 index, so no reset_index is needed.
    series = pd.concat(
        [pd.read_csv(path, sep="\t").rename(columns=rename_columns) for path in series_files],
        ignore_index=True,
    )

    # series["study_id"] = series["study_id"].apply(lambda v: v.split("_")[1])

    series = series[["series_id", "study_id", "case_id"]].drop_duplicates()

    # series

    # Header variants seen in image manifests, mapped onto one common schema.
    rename_columns = {
        "case_ids": "case_id",
        "study_uid": "study_id",
        "ct_scans.submitter_id": "study_id",
        "radiography_exam.submitter_id": "study_id",
        "series_uid": "series_id",
        "series.submitter_id": "series_id",
        "cr_series.submitter_id": "series_id",
        "ct_series.submitter_id": "series_id",
        "dx_series.submitter_id": "series_id",
        "*md5sum": "md5sum",
        "mdsum": "md5sum",
        "*file_name": "file_name",
        "*file_size": "file_size",
        "submitter_id": "instance_id",
        "object_id": "instance_id",
    }

    instances = pd.concat(
        [pd.read_csv(path, sep="\t").rename(columns=rename_columns) for path in image_manifest_file],
        ignore_index=True,
    )

    # instances["series_id"] = instances["series_id"].apply(lambda v: v.split("_")[1])
    # instances["study_id"] = instances["study_id"].apply(lambda v: v.split("_")[1])

    # Inner join: instances without a matching (case_id, series_id) series row are dropped.
    instances = instances.merge(series, on=["case_id", "series_id"])

    # Sizes read as text (object dtype) are parsed to int.
    # NOTE(review): locale.atoi only strips the thousands separator of the active
    # locale; confirm whether a setlocale() call is expected before this point.
    if instances["file_size"].dtype == np.dtype("O"):
        instances["file_size"] = instances["file_size"].apply(locale.atoi)
    instances = instances[
        [
            "file_name",
            "file_size",
            "md5sum",
            "case_id",
            "study_id",
            "series_id",
            "instance_id",
            "storage_urls",
        ]
    ].drop_duplicates()

    # instances

    list_of_packages = []

    # Stop after the first submission: everything past this point is disabled.
    break

    # for _, row in instances.iterrows():
    #     case_id = row["case_id"]
    #     study_id = row["study_id"]
    #     series_id = row["series_id"]

    #     series_path = f"./cases/{case_id}/{study_id}/{series_id}.tsv\n"
    #     if series_path not in list_of_packages:
    #         list_of_packages.append(series_path)

    #     folder = packages_path / "cases" / case_id / study_id

    #     folder.mkdir(parents=True, exist_ok=True)

    #     series_file = folder / f"{series_id}.tsv"
    #     series_file_exist = series_file.exists()

    #     with open(series_file, mode="a") as f:
    #         fieldnames = [
    #             "file_name",
    #             "file_size",
    #             "md5sum",
    #             "case_id",
    #             "study_id",
    #             "series_id",
    #             "instance_id",
    #             "storage_urls",
    #         ]
    #         writer = csv.DictWriter(f, delimiter="\t", fieldnames=fieldnames)

    #         if not series_file_exist:
    #             writer.writeheader()
    #         writer.writerow(row.to_dict())

    # with open(packages_path / "packages.txt", "w") as f:
    #     f.writelines(list_of_packages)

In [None]:
# Turn each submission's package listings into indexing manifests: one TSV per
# category (open / seq / remove / missing) under `to_index_path`.

# Study ids flagged for deletion, as a set so each row is checked with a hash
# look-up instead of a scan over the whole array.
deleted_study_ids = set(match_studies_to_delete)

# Bucket prefix and serialized authz list for each dataset value in `seq_master`.
dataset_targets = {
    "Open": ("s3://open-data-midrc/", json.dumps(["/programs/Open/projects/A1"])),
    "Seq": ("s3://sequestered-data-midrc/", json.dumps(["/programs/SEQ_Open/projects/A3"])),
}

# Column order of every manifest written below.
fieldnames = [
    "record_type",
    "guid",
    "md5",
    "size",
    "authz",
    "url",
    "file_name",
    "package_contents",
]

for submission in submissions:
    package_files = submission_path / submission / "packages"

    print(package_files)

    open_packages = []
    seq_packages = []
    to_remove_packages = []
    missing_packages = []

    for package_filepath in package_files.iterdir():
        # newline="" leaves line-ending handling to the csv module (see csv docs).
        with open(package_filepath, newline="") as package_file:
            reader = csv.DictReader(package_file, delimiter="\t")
            for row in reader:
                item = row

                # file_name must have exactly three "/"-separated parts:
                # <case_id>/<study_id>/<last part, unused>
                file_name = item["file_name"]
                case_id, study_id, _ = file_name.split("/")

                # in case study_id has a prefix, keep only what follows the last "_"
                study_id = study_id.split("_")[-1]

                # The column is parsed as JSON after swapping single for double quotes.
                # NOTE(review): a value that itself contains an apostrophe would
                # break this conversion - confirm it cannot occur.
                package_contents = json.loads(item["package_contents"].replace("'", "\""))
                for p in package_contents:
                    p["size"] = int(p["size"])

                item["package_contents"] = json.dumps(package_contents)

                # Cases absent from the master list (or with another dataset value)
                # get an empty bucket and authz and land in the "missing" manifest.
                dataset = seq_master.get(case_id, None)
                bucket, authz = dataset_targets.get(dataset, ("", ""))

                item["authz"] = authz
                item["url"] = f"{bucket}{item['url']}"

                # Packages of deleted studies are only listed for removal; they get no guid.
                if study_id in deleted_study_ids:
                    to_remove_packages.append(item)
                    continue

                # Deterministic GUID: md5 of "<md5><size>", rendered as a UUID with
                # the version bits forced to 4.
                m = hashlib.md5()
                m.update(f"{item['md5']}{item['size']}".encode('utf-8'))
                item["guid"] = f"dg.MD1R/{uuid.UUID(m.hexdigest(), version=4)}"
                if dataset == "Open":
                    open_packages.append(item)
                elif dataset == "Seq":
                    seq_packages.append(item)
                else:
                    missing_packages.append(item)

    # Output suffix: last path component of the submission without a "packages_"
    # prefix, e.g. "2021/10/batch6" -> "batch6".
    # NOTE(review): batches sharing a last component would overwrite each other's
    # files - confirm this cannot happen for the configured list.
    suffix = submission.split('/')[-1].removeprefix('packages_')
    outputs = [
        (f"open/new_packages_open_{suffix}.tsv", open_packages),
        (f"seq/new_packages_seq_{suffix}.tsv", seq_packages),
        (f"remove/new_packages_remove_{suffix}.tsv", to_remove_packages),
        (f"missing/new_packages_missing_{suffix}.tsv", missing_packages),
    ]

    for filename, packages in outputs:
        # Categories with no rows produce no file.
        if not packages:
            continue
        with open(to_index_path / filename, "w", newline="") as f:
            writer = csv.DictWriter(f, delimiter="\t", fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(packages)