In [None]:
import os
from pathlib import Path

import pandas as pd

In [None]:
# Root directory that every dataset "dir" entry in `paths` is resolved against.
# Set the MIDRC_DATA_DIR environment variable to point the notebook at another
# location; the original hardcoded path is kept as the fallback so existing
# runs behave exactly as before.
DATA_DIR = Path(os.environ.get("MIDRC_DATA_DIR", "/Users/andrew/CTDS/misc-projects/midrc/data/"))

In [None]:
# Inputs for split_seq, keyed by dataset name. For each entry:
#   dir                   - dataset folder, joined onto DATA_DIR
#   imaging_data_manifest - manifest TSV inside that folder
#   other_files           - per-modality instance TSVs inside that folder
#   open_file / seq_file  - case lists, given relative to "dir"
paths = {}

paths["midrc-ricord-2021-10-06"] = dict(
    dir="s3-data/rsna/midrc-ricord-2021-10-06/",
    imaging_data_manifest="imaging_data_manifest_2021_10_08.tsv",
    other_files=[
        "midrc_ct_instance_419639-2021-10-13.tsv",
        "midrc_cr_instance_419639-2021-10-13.tsv",
        "midrc_dx_instance_419639-2021-10-13.tsv",
    ],
    open_file="../../../sequestered-data-split/open_rsna_uniq.txt",
    seq_file="../../../sequestered-data-split/seq_rsna_uniq.txt",
)

paths["midrc-ricord-2021-10-26"] = dict(
    dir="s3-data/rsna/midrc-ricord-2021-10-26/",
    imaging_data_manifest="imaging_data_manifest_2021_10_28.tsv",
    other_files=[
        "midrc_cr_instance_419639-2021-10-29.tsv",
        "midrc_ct_instance_419639-2021-10-29.tsv",
        "midrc_mr_instance_419639-2021-10-29.tsv",
        "midrc_dx_instance_419639-2021-10-29.tsv",
    ],
    open_file="../../../sequestered-data-split/open_rsna_uniq.txt",
    seq_file="../../../sequestered-data-split/seq_rsna_uniq.txt",
)

In [None]:
def split_seq(input_dict, output_dir=None):
    """Split one dataset's imaging manifest into "open" and "seq" manifests.

    The manifest is joined to the instance files on ``file_name``, then each
    row is labelled by looking its ``case_ids`` value up in the open and
    sequestered case lists. Rows whose case appears in neither list are
    treated as open.

    Four files are written, each named after the manifest file with
    ``imaging_data_manifest`` replaced by a prefixed variant:

    * ``open_imaging_data_manifest...``: tab-separated rows labelled "open"
    * ``seq_imaging_data_manifest...``: tab-separated rows labelled "seq"
    * ``raw_open_imaging_data_manifest...`` and
      ``raw_seq_imaging_data_manifest...``: one storage URL per line with the
      ``s3://storage.ir.rsna.ai/`` prefix removed, no header.

    Parameters
    ----------
    input_dict : dict
        Must provide ``"dir"`` (joined onto ``DATA_DIR``) and, relative to
        that directory, ``"imaging_data_manifest"``, ``"other_files"`` (a list
        of instance TSVs), ``"open_file"`` and ``"seq_file"``. Both case lists
        are read with ``pd.read_csv`` and must expose a ``case_ids`` column,
        because that is the merge key.
    output_dir : path-like, optional
        Directory to write the four outputs into; it is created if missing.
        When omitted the files are written to the current working directory,
        which is the historical behaviour.

    Returns
    -------
    None
    """
    dataset_dir = DATA_DIR.joinpath(input_dict["dir"])
    manifest_path = dataset_dir.joinpath(input_dict["imaging_data_manifest"])
    instance_paths = [dataset_dir.joinpath(name) for name in input_dict["other_files"]]
    open_cases_path = dataset_dir.joinpath(input_dict["open_file"])
    seq_cases_path = dataset_dir.joinpath(input_dict["seq_file"])

    if output_dir is None:
        # Relative, empty path: outputs land in the current working directory.
        out_dir = Path()
    else:
        out_dir = Path(output_dir)
        out_dir.mkdir(parents=True, exist_ok=True)

    def output_path(prefix):
        # Output names are derived from the manifest's own file name.
        return out_dir / manifest_path.name.replace(
            "imaging_data_manifest", f"{prefix}imaging_data_manifest"
        )

    manifest = pd.read_csv(manifest_path, sep="\t").reset_index(drop=True)
    instances = (
        pd.concat([pd.read_csv(path, sep="\t") for path in instance_paths])
        .sort_values(by="file_name")
        .reset_index(drop=True)
    )

    open_cases = pd.read_csv(open_cases_path).assign(status="open")
    seq_cases = pd.read_csv(seq_cases_path).assign(status="seq")
    # NOTE(review): a case_ids value present in both lists yields one "open"
    # and one "seq" row per file after the merge below, so that file would be
    # written to both outputs. Confirm the two lists are disjoint.
    case_status = pd.concat([open_cases, seq_cases])

    # The "_x" suffix selects the manifest's copy of columns that exist in
    # both the manifest and the instance files.
    move_files = manifest.merge(instances, on="file_name")[
        ["file_name", "case_ids", "file_size_x", "md5sum_x", "storage_urls_x"]
    ]

    # Right merge keeps every manifest row, including cases in neither list.
    all_files = pd.merge(case_status, move_files, how="right", on="case_ids")
    all_files["status"] = all_files["status"].fillna("open")

    for label in ("open", "seq"):
        subset = all_files.loc[all_files["status"] == label]
        subset.to_csv(output_path(f"{label}_"), sep="\t", index=False)

        raw_keys = subset["storage_urls_x"].str.replace(
            "s3://storage.ir.rsna.ai/", "", regex=False
        )
        raw_keys.to_csv(output_path(f"raw_{label}_"), index=False, header=False)

In [None]:
# Write the open / sequestered manifests for each configured dataset in turn.
for dataset_name in ("midrc-ricord-2021-10-06", "midrc-ricord-2021-10-26"):
    split_seq(paths[dataset_name])

In [None]:
def _manifest_vs_instances(input_dict):
    """Right-merge instance file names onto one dataset's manifest file names.

    The inputs are re-read here because the frames that split_seq builds are
    local to that function and are not available at notebook scope.
    The result has a ``file_name`` column and a ``_merge`` indicator column;
    ``right_only`` marks manifest entries with no matching instance record.
    """
    dataset_dir = DATA_DIR.joinpath(input_dict["dir"])
    imaging_data_manifest = pd.read_csv(
        dataset_dir.joinpath(input_dict["imaging_data_manifest"]), sep="\t"
    )
    files = pd.concat(
        [pd.read_csv(dataset_dir.joinpath(name), sep="\t") for name in input_dict["other_files"]]
    )
    return pd.merge(files.file_name, imaging_data_manifest.file_name, how='right', indicator=True)


# One merged frame per dataset, stacked under a "dataset" index level so the
# check covers every entry in `paths`.
leftover = pd.concat(
    {name: _manifest_vs_instances(input_dict) for name, input_dict in paths.items()},
    names=["dataset"],
)

leftover[leftover['_merge'] == 'right_only']["file_name"]