In [2]:
import numpy as np
import pandas as pd
import os
from posixpath import join
import yaml



In [3]:
def ecvl_yaml(name, description, filenames, labels, train_ids, valid_ids, test_ids):
    d = {
        "name"        : name,
        "description" : description,
        "classes"     : [], 
        "images"      : [],
        "split"       : dict(training = train_ids, 
                            validation = valid_ids, 
                            test=test_ids)
    }
    imgs = []
    for fn, l in zip(filenames, labels):
        imgs.append({
            "location": fn,
            "label": l
        })
    d["images"] = imgs
    d["classes"] = sorted(list(set(labels)))
    return d

In [7]:
# mimic normal vs rest, unbalanced
# 3 models, dataset stays the same, random seed changes

in_file = join("/mnt/datasets/mimic-cxr/training_data/mimic", "normal_bin_unbal.tsv")  # input dataset
exp_fld = "/mnt/datasets/uc5/UC5_pipeline_forked/experiments_eddl/mimic/normal_unbal"  # output fld
img_fld = "/mnt/datasets/mimic-cxr/mimic-cxr-jpg/physionet.org/files/mimic-cxr-jpg/2.0.0"

dataset = pd.read_csv(in_file, sep="\t")
def adjust_path(path):
    path = path[:-len(".dcm")]
    path = join(img_fld, path + ".jpg")
    return path
display(dataset.T)

dataset["path"] = dataset["path"].apply(lambda path: adjust_path(path))
dataset = dataset.reset_index().set_index(pd.RangeIndex(len(dataset)))
splits = ["train", "validate", "test"]
filenames = []
labels = []
ids = {}
for s in splits:
    split = dataset.loc[dataset.split == s]
    print(f"{s}: {split.shape}")
    filenames += split.path.tolist()
    labels += split.target.tolist()
    ids[s] = split.index.values.tolist()
print(f"filenames, {len(filenames)}")
print(f"labels, {len(labels)}")

print(f"train:", len(ids["train"]))
print(f"validate:", len(ids["validate"]))
print(f"test:", len(ids["test"]))

yml_ds = ecvl_yaml(filenames, labels, ids["train"], ids["validate"], ids["test"])
for k, v in yml_ds.items():
    print(f"{k}: {len(v)}")

n_folds = 3
for i in range(n_folds):
    out_fld = join(  exp_fld, f"fold_{i}" )
    os.makedirs(out_fld, exist_ok=True)
    fn = join(out_fld, "dataset.yml")
    
    with open(fn, "w") as fout:
        yaml.safe_dump(yml_ds, fout, default_flow_style=None)
    print(f"saved {fn}")
#     # takes up to 

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,361316,361317,361318,361319,361320,361321,361322,361323,361324,361325
dicom_id,02aa804e-bde0afdd-112c0b34-7bc16630-4e384014,174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962,2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab,e084de3b-be89b11e-20fe3f9f-9c8d8dfe-4cfd202c,68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714,fffabebf-74fd3a1f-673b6b41-96ec0ac9-2ab69818,ea030e7a-2e3b1346-bc518786-7a8fd698-f673b44c,096052b7-d256dc40-453a102b-fa7d01c6-1b22c6b4,b79e55c3-735ce5ac-64412506-cdc9ea79-f1af521f,dcfeeac4-1597e318-d0e6736a-8b2c2238-47ac3f1b,...,ebe1c050-b5cd68d5-7eb2b544-8906f022-e95300a4,3616a226-c86c9ea2-733dbb59-4b1530a0-ba7b25dc,4998e40c-698af874-8c293856-85757f55-1a4817e4,6e57e7d7-a4f8c909-e7540997-0280112e-14da108f,727e2aa5-ddfdd2ff-b5723867-520a758e-c81ca8e2,313f1d75-23648c34-dd36ecad-5d0f94e8-93a40356,4d0251eb-cc875c55-fde85f43-3a9d7888-c62772b8,f6a7a470-9e057a45-d244e0e5-3efe1422-bb946478,43636aec-98ae2504-18f03cac-4fe8e211-62049c6d,d122eb74-bc404dd2-45a05cd3-18505b72-5058fbdd
target,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0
split,train,train,train,train,train,train,train,train,train,train,...,test,test,test,test,test,test,test,test,test,test
path,files/p10/p10000032/s50414267/02aa804e-bde0afd...,files/p10/p10000032/s50414267/174413ec-4ec4c1f...,files/p10/p10000032/s53189527/2a2277a9-b0ded15...,files/p10/p10000032/s53189527/e084de3b-be89b11...,files/p10/p10000032/s53911762/68b5c4b1-227d048...,files/p10/p10000032/s53911762/fffabebf-74fd3a1...,files/p10/p10000032/s56699142/ea030e7a-2e3b134...,files/p10/p10000764/s57375967/096052b7-d256dc4...,files/p10/p10000764/s57375967/b79e55c3-735ce5a...,files/p10/p10000764/s57375967/dcfeeac4-1597e31...,...,files/p19/p19991135/s56918032/ebe1c050-b5cd68d...,files/p19/p19991135/s57096024/3616a226-c86c9ea...,files/p19/p19991135/s57096024/4998e40c-698af87...,files/p19/p19991135/s57096024/6e57e7d7-a4f8c90...,files/p19/p19991135/s57757467/727e2aa5-ddfdd2f...,files/p19/p19991135/s58283482/313f1d75-23648c3...,files/p19/p19991135/s58283482/4d0251eb-cc875c5...,files/p19/p19991135/s58283482/f6a7a470-9e057a4...,files/p19/p19991135/s59381316/43636aec-98ae250...,files/p19/p19991135/s59381316/d122eb74-bc404dd...


train: (353622, 5)
validate: (2867, 5)
test: (4837, 5)
filenames, 361326
labels, 361326
train: 353622
validate: 2867
test: 4837


TypeError: ecvl_yaml() missing 2 required positional arguments: 'valid_ids' and 'test_ids'

In [8]:
# mimic view frontal/lateral, unbalanced
# 3 models, dataset stays the same, random seed changes

in_file = join("/mnt/datasets/mimic-cxr/training_data/mimic", "frontal_lateral_mimic.tsv")  # input dataset
exp_fld = "/mnt/datasets/uc5/UC5_pipeline_forked/experiments_eddl/mimic/view_unbal"  # output fld
img_fld = "/mnt/datasets/mimic-cxr/mimic-cxr-jpg/physionet.org/files/mimic-cxr-jpg/2.0.0"

dataset = pd.read_csv(in_file, sep="\t")
def adjust_path(path):
    path = path[:-len(".dcm")]
    path = join(img_fld, path + ".jpg")
    return path
display(dataset.T)

dataset["path"] = dataset["path"].apply(lambda path: adjust_path(path))
dataset = dataset.reset_index().set_index(pd.RangeIndex(len(dataset)))
splits = ["train", "validate", "test"]
filenames = []
labels = []
ids = {}
for s in splits:
    split = dataset.loc[dataset.split == s]
    print(f"{s}: {split.shape}")
    filenames += split.path.tolist()
    labels += split.target.tolist()
    ids[s] = split.index.values.tolist()
print(f"filenames, {len(filenames)}")
print(f"labels, {len(labels)}")

yml_ds = ecvl_yaml("view classifier", "frontal-lateral, unbalanced", filenames, labels, ids["train"], ids["validate"], ids["test"])
for k, v in yml_ds.items():
    print(f"{k}: {len(v)}")

n_folds = 3
for i in range(n_folds):
    out_fld = join(  exp_fld, f"fold_{i}" )
    os.makedirs(out_fld, exist_ok=True)
    fn = join(out_fld, "dataset.yml")
    
    with open(fn, "w") as fout:
        yaml.safe_dump(yml_ds, fout, default_flow_style=None)
    print(f"saved {fn}")
#     # takes up to 





Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,361316,361317,361318,361319,361320,361321,361322,361323,361324,361325
dicom_id,02aa804e-bde0afdd-112c0b34-7bc16630-4e384014,174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962,2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab,e084de3b-be89b11e-20fe3f9f-9c8d8dfe-4cfd202c,68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714,fffabebf-74fd3a1f-673b6b41-96ec0ac9-2ab69818,ea030e7a-2e3b1346-bc518786-7a8fd698-f673b44c,096052b7-d256dc40-453a102b-fa7d01c6-1b22c6b4,b79e55c3-735ce5ac-64412506-cdc9ea79-f1af521f,dcfeeac4-1597e318-d0e6736a-8b2c2238-47ac3f1b,...,ebe1c050-b5cd68d5-7eb2b544-8906f022-e95300a4,3616a226-c86c9ea2-733dbb59-4b1530a0-ba7b25dc,4998e40c-698af874-8c293856-85757f55-1a4817e4,6e57e7d7-a4f8c909-e7540997-0280112e-14da108f,727e2aa5-ddfdd2ff-b5723867-520a758e-c81ca8e2,313f1d75-23648c34-dd36ecad-5d0f94e8-93a40356,4d0251eb-cc875c55-fde85f43-3a9d7888-c62772b8,f6a7a470-9e057a45-d244e0e5-3efe1422-bb946478,43636aec-98ae2504-18f03cac-4fe8e211-62049c6d,d122eb74-bc404dd2-45a05cd3-18505b72-5058fbdd
target,1,0,1,0,1,1,1,1,0,0,...,1,0,1,0,1,1,1,1,0,1
split,train,train,train,train,train,train,train,train,train,train,...,test,test,test,test,test,test,test,test,test,test
path,files/p10/p10000032/s50414267/02aa804e-bde0afd...,files/p10/p10000032/s50414267/174413ec-4ec4c1f...,files/p10/p10000032/s53189527/2a2277a9-b0ded15...,files/p10/p10000032/s53189527/e084de3b-be89b11...,files/p10/p10000032/s53911762/68b5c4b1-227d048...,files/p10/p10000032/s53911762/fffabebf-74fd3a1...,files/p10/p10000032/s56699142/ea030e7a-2e3b134...,files/p10/p10000764/s57375967/096052b7-d256dc4...,files/p10/p10000764/s57375967/b79e55c3-735ce5a...,files/p10/p10000764/s57375967/dcfeeac4-1597e31...,...,files/p19/p19991135/s56918032/ebe1c050-b5cd68d...,files/p19/p19991135/s57096024/3616a226-c86c9ea...,files/p19/p19991135/s57096024/4998e40c-698af87...,files/p19/p19991135/s57096024/6e57e7d7-a4f8c90...,files/p19/p19991135/s57757467/727e2aa5-ddfdd2f...,files/p19/p19991135/s58283482/313f1d75-23648c3...,files/p19/p19991135/s58283482/4d0251eb-cc875c5...,files/p19/p19991135/s58283482/f6a7a470-9e057a4...,files/p19/p19991135/s59381316/43636aec-98ae250...,files/p19/p19991135/s59381316/d122eb74-bc404dd...


train: (353622, 5)
validate: (2867, 5)
test: (4837, 5)
filenames, 361326
labels, 361326
name: 15
description: 27
classes: 2
images: 361326
split: 3
saved /mnt/datasets/uc5/UC5_pipeline_forked/experiments_eddl/mimic/view_unbal/fold_0/dataset.yml
saved /mnt/datasets/uc5/UC5_pipeline_forked/experiments_eddl/mimic/view_unbal/fold_1/dataset.yml
saved /mnt/datasets/uc5/UC5_pipeline_forked/experiments_eddl/mimic/view_unbal/fold_2/dataset.yml
