In [None]:
import ast
import csv
import json
import sys
from pathlib import PosixPath

# Raise the csv module's per-field size limit to sys.maxsize so that very long
# cells are not rejected with "field larger than field limit".
# NOTE(review): presumably needed for the package_contents column parsed
# further down in this notebook — confirm.
csv.field_size_limit(sys.maxsize)

In [None]:
# Submission batches known to this notebook. Commented-out entries are not
# iterated on this run; the trailing note on each is the status that was
# recorded for it. Uncomment a name to include it again.
submissions = [
    # "packages_RSNA_20211117",  # done
    # "packages_RSNA_20211214",  # partially done, indexed
    # "packages_RSNA_20220105",  # partially done, indexed
    # "packages_RSNA_20220114",  # partially done, indexed
    # "packages_RSNA_20220124",  # partially done, indexed
    # "packages_RSNA_20220211",  # partially done, indexed
    # "packages_RSNA_20220214",  # partially done, indexed
    # "packages_midrc-ricord-2021-08-20",  # done
    # "packages_midrc-ricord-2021-09-02",  # done
    # "packages_midrc-ricord-2021-09-22",  # done
    # "packages_midrc-ricord-2021-10-06",  # done
    # "packages_midrc-ricord-2021-10-26",  # done
    "packages_RSNA_20220228",
    "packages_RSNA_20220308",
    "packages_RSNA_20220314",
]

# Root folder; each submission is read from <submission_path>/<submission>/packages.
_submission_root = "~/CTDS/projects/midrc/indexing-data/packages_rsna"
submission_path = PosixPath(_submission_root).expanduser()

# Output manifests are written next to the input root, split into one
# sub-folder per destination. The folders are created up front.
to_index_path = submission_path.joinpath("..", "to_index_rsna")
for _subdir in ("open", "seq", "missing"):
    to_index_path.joinpath(_subdir).mkdir(parents=True, exist_ok=True)

# Tab-separated master sheet whose "case_ids" and "dataset" columns are loaded
# into the seq_master lookup.
_sequestration_master = (
    "~/CTDS/projects/midrc/indexing-data/sequestration/"
    "master_sequestration_locations_23333_2022-06-28.tsv"
)
sequestration_master_file_path = PosixPath(_sequestration_master).expanduser()

In [None]:
# Build the lookup used to route packages: value of the master sheet's
# "case_ids" column -> value of its "dataset" column. If a key occurs on more
# than one row, the last row wins.
# NOTE(review): the column is named "case_ids" (plural), but the lookup key used
# later is a single id taken from the first path component of file_name —
# confirm each cell holds exactly one id.
#
# newline="" is what the csv module asks for on file objects, so that line
# endings and newlines embedded in quoted fields are handled by the csv parser
# itself rather than by text-mode translation.
with open(sequestration_master_file_path, newline="") as sequestration_master_file:
    reader = csv.DictReader(sequestration_master_file, delimiter="\t")
    seq_master = {row["case_ids"]: row["dataset"] for row in reader}

In [None]:
def _parse_package_contents(raw):
    """Decode one ``package_contents`` cell into Python objects.

    The text is tried as strict JSON first, then as a Python literal (the
    single-quoted ``repr`` form), and only as a last resort with the legacy
    blanket swap of ``'`` for ``"``. Relying on the swap alone breaks on any
    value that itself contains an apostrophe or a double quote.

    Args:
        raw: Text of the ``package_contents`` column for one row.

    Returns:
        The decoded value. The caller iterates it and expects every element
        to be a dict with a ``size`` entry.

    Raises:
        json.JSONDecodeError: If none of the three decoders accepts ``raw``.
    """
    try:
        return json.loads(raw)
    except json.JSONDecodeError:
        pass
    try:
        return ast.literal_eval(raw)
    except (ValueError, TypeError, SyntaxError):
        # Legacy path, e.g. single-quoted text that still uses JSON's
        # true/false/null, which is not a valid Python literal.
        return json.loads(raw.replace("'", "\""))


# Sequestration label -> (bucket prefix for the url, authz value written to the
# manifest). Any other label, including a case id that is absent from
# seq_master, gets empty strings for both.
_ROUTES = {
    "Open": ("s3://open-data-midrc/", json.dumps(["/programs/Open/projects/R1"])),
    "Ignore": ("s3://open-data-midrc/", json.dumps(["/programs/TCIA/projects/RICORD"])),
    "Seq": ("s3://sequestered-data-midrc/", json.dumps(["/programs/SEQ_Open/projects/R3"])),
}

# Column order of every output manifest. DictWriter is left at its default
# extrasaction, so a row carrying a column that is not listed here raises
# ValueError instead of being silently dropped.
fieldnames = [
    "record_type",
    "guid",
    "md5",
    "size",
    "authz",
    "urls",
    "file_name",
    "package_contents",
]

for submission in submissions:
    package_files = submission_path / submission / "packages"

    print(package_files)

    open_packages = []
    seq_packages = []
    missing_packages = []

    # iterdir() yields entries in arbitrary order and also returns directories
    # and hidden files (e.g. .DS_Store). Keep regular, non-hidden files only and
    # sort them so the output row order is reproducible between runs.
    package_filepaths = sorted(
        path
        for path in package_files.iterdir()
        if path.is_file() and not path.name.startswith(".")
    )

    for package_filepath in package_filepaths:
        # newline="" lets the csv module handle line endings and newlines
        # embedded in quoted fields itself.
        with open(package_filepath, newline="") as package_file:
            reader = csv.DictReader(package_file, delimiter="\t")
            for item in reader:
                # The lookup key is the first path component of file_name.
                case_id = item["file_name"].split("/")[0]

                # Re-serialise package_contents as strict JSON with integer sizes.
                package_contents = _parse_package_contents(item["package_contents"])
                for p in package_contents:
                    p["size"] = int(p["size"])
                item["package_contents"] = json.dumps(package_contents)

                # Rows whose case id is not in seq_master are kept (not
                # skipped) and end up in the "missing" manifest.
                dataset = seq_master.get(case_id)
                bucket, authz = _ROUTES.get(dataset, ("", ""))

                item["authz"] = authz
                item["urls"] = f"{bucket}{item['url']}"
                del item["url"]

                if dataset in ("Open", "Ignore"):
                    open_packages.append(item)
                elif dataset == "Seq":
                    seq_packages.append(item)
                else:
                    missing_packages.append(item)

    suffix = submission.removeprefix("packages_")
    datasets = [
        (f"open/packages_open_{suffix}.tsv", open_packages),
        (f"seq/packages_seq_{suffix}.tsv", seq_packages),
        (f"missing/packages_missing_{suffix}.tsv", missing_packages),
    ]

    for filename, rows in datasets:
        # No file is written for a destination that received no rows.
        if not rows:
            continue
        with open(to_index_path / filename, "w", newline="") as f:
            writer = csv.DictWriter(f, delimiter="\t", fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(rows)