In [1]:
import json
from collections import namedtuple
import pandas as pd
from pathlib import Path
from tqdm import tqdm
from skimage.io import imread, imsave
from skimage.transform import rescale
import numpy as np

Annotation = namedtuple("Annotation", ["filename", "is_nodule", "is_generated", "x", "y", "w", "h", "area"])

output_dir = Path("luna16_sliced")
output_dir.mkdir(parents=True, exist_ok=True)

def coco2ann(filename: str):
    with open(filename) as f:
        coco = json.load(f)
    images = {image["id"]: image["file_name"] for image in coco["images"]}
    annotations = [
        Annotation(images[ann["image_id"]], 1, 0, ann["bbox"][0], ann["bbox"][1], ann["bbox"][2], ann["bbox"][3], ann["area"])
        for ann in coco["annotations"]
    ]

    return annotations

In [2]:
ann = coco2ann("/homes/xz1919/train.json") + coco2ann("/homes/xz1919/test.json")
df = pd.DataFrame(ann)
df.to_csv(output_dir / "labels.csv", index=False)
df

Unnamed: 0,filename,is_nodule,is_generated,x,y,w,h,area
0,1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222...,1,0,402.5,151.5,7.0,7.0,49.0
1,1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222...,1,0,40.5,207.5,9.0,9.0,81.0
2,1.3.6.1.4.1.14519.5.2.1.6279.6001.100398138793...,1,0,380.0,265.0,10.0,10.0,100.0
3,1.3.6.1.4.1.14519.5.2.1.6279.6001.100621383016...,1,0,362.0,321.0,30.0,30.0,900.0
4,1.3.6.1.4.1.14519.5.2.1.6279.6001.100621383016...,1,0,362.5,234.5,33.0,33.0,1089.0
...,...,...,...,...,...,...,...,...
1137,1.3.6.1.4.1.14519.5.2.1.6279.6001.994459772950...,1,0,110.0,163.0,6.0,6.0,36.0
1138,1.3.6.1.4.1.14519.5.2.1.6279.6001.994459772950...,1,0,189.0,331.0,6.0,6.0,36.0
1139,1.3.6.1.4.1.14519.5.2.1.6279.6001.994459772950...,1,0,38.5,217.5,7.0,7.0,49.0
1140,1.3.6.1.4.1.14519.5.2.1.6279.6001.997611074084...,1,0,206.5,279.5,41.0,41.0,1681.0


In [9]:
df.filename[0]

'1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222365663678666836860_117.png'

# Add diffusion-generated images

In [52]:
def copy_from_generated(root: Path, output_dir: Path, df: pd.DataFrame, get_filename, output_prefix: str = "diffusion", scale: int = 1, is_generated: int = 1):
    df = df.copy()
    total_nr_missing = 0
    done_indices = []

    for filename, mini_df in tqdm(df.groupby("filename")):
        series_uid, vZ = filename.rstrip(".png").split("_", 2)
        vZ = int(vZ)

        generated_image_filename = root / get_filename(series_uid, vZ)

        if not generated_image_filename.exists():
            # print(f"{generated_image_filename} does not exist!")
            total_nr_missing += 1
            continue

        img = imread(generated_image_filename, as_gray=True)
        if scale > 1:
            img = rescale(img, scale)
        output_filename = f"{output_prefix}_{series_uid}_{vZ}.png"
        imsave(output_dir / output_filename, (img * 255.0).astype(np.uint8))

        df["filename"].loc[mini_df.index] = output_filename
        done_indices += list(mini_df.index)

    print("Number of missing images:", total_nr_missing)
    df_out = df.loc[done_indices]
    df_out["is_generated"] = is_generated
    df_out.reset_index(drop=True, inplace=True)
    return df_out

In [53]:
dg_root = Path("/vol/bitbucket/xz1919/diffusion-generated-images/with-nodule/RESULTS/ADE20K-SDM-256CH/images")
df_diffusion_generated = copy_from_generated(dg_root, output_dir, df, lambda series_uid, vZ: f"{series_uid}_{vZ}.png.png", "diffusion", scale=2, is_generated=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
100%|██████████| 1132/1132 [00:02<00:00, 558.60it/s]

Number of missing images: 47





In [54]:
gan_root = Path("/vol/bitbucket/xz1919/GAN-generated-images")
df_gan_generated = copy_from_generated(gan_root, output_dir, df, lambda series_uid, vZ: f"{series_uid}_{vZ}-fake.png", "gan", scale=1, is_generated=2)

100%|██████████| 1132/1132 [00:02<00:00, 383.58it/s]

Number of missing images: 3





In [55]:
df_all = pd.concat([df, df_diffusion_generated, df_gan_generated]).reset_index(drop=True)
df_all.to_csv(output_dir / "labels_all.csv")
df_all

Unnamed: 0,filename,is_nodule,is_generated,x,y,w,h,area
0,1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222...,1,0,402.5,151.5,7.0,7.0,49.0
1,1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222...,1,0,40.5,207.5,9.0,9.0,81.0
2,1.3.6.1.4.1.14519.5.2.1.6279.6001.100398138793...,1,0,380.0,265.0,10.0,10.0,100.0
3,1.3.6.1.4.1.14519.5.2.1.6279.6001.100621383016...,1,0,362.0,321.0,30.0,30.0,900.0
4,1.3.6.1.4.1.14519.5.2.1.6279.6001.100621383016...,1,0,362.5,234.5,33.0,33.0,1089.0
...,...,...,...,...,...,...,...,...
3371,gan_1.3.6.1.4.1.14519.5.2.1.6279.6001.97042894...,1,2,108.5,264.5,15.0,15.0,225.0
3372,gan_1.3.6.1.4.1.14519.5.2.1.6279.6001.97542662...,1,2,339.0,371.0,12.0,12.0,144.0
3373,gan_1.3.6.1.4.1.14519.5.2.1.6279.6001.97908301...,1,2,94.0,297.0,12.0,12.0,144.0
3374,gan_1.3.6.1.4.1.14519.5.2.1.6279.6001.99445977...,1,2,110.0,163.0,6.0,6.0,36.0


In [20]:
df_all = pd.read_csv(output_dir / "labels_all.csv")

In [21]:
CocoImage = namedtuple("CocoImage", ["file_name", "id", "height", "width"])
CocoAnnotation = namedtuple("CocoAnnotation", ["id", "image_id", "segmentation", "area", "category_id", "bbox", "iscrowd"])

def generate_coco(df: pd.DataFrame, output_filename: Path):
    coco_images = []
    coco_annotations = []
    ann_count = 0

    for image_id, (filename, anns) in enumerate(df.groupby("filename", sort=False)):
        coco_images.append(CocoImage(filename, image_id, 512, 512))

        for _, ann in anns.iterrows():
            coco_annotations.append(CocoAnnotation(ann_count, image_id, [], ann.area, 0, [ann.x, ann.y, ann.w, ann.h], 0))
            ann_count += 1
    
    coco_annotations = list(map(lambda x: x._asdict() if hasattr(x, "_asdict") else x, coco_annotations))
    coco_images = list(map(lambda x: x._asdict() if hasattr(x, "_asdict") else x, coco_images))

    coco = {
        "images": coco_images,
        "annotations": coco_annotations,
        "categories": [{
            "id": 0,
            "name": "nodule",
        }]
    }

    with open(output_filename, "w") as f:
        json.dump(coco, f, indent=2)

In [22]:
from sklearn.model_selection import KFold

def save_ann_kfold(df: pd.DataFrame, use_generated_nodules: int, root: Path): # use_generated_nodule: 0 - no, 1 - diffusion, 2 - gan
    df_dataset = df[(df.is_generated == 0) | (df.is_generated == use_generated_nodules)]
    df_dataset = df_dataset.sample(len(df_dataset), random_state=57).reset_index(drop=True)

    kf = KFold(n_splits=10)

    for i, (train_index, test_index) in enumerate(kf.split(df_dataset)):
        df_train = df_dataset.iloc[train_index]
        df_test = df_dataset.iloc[test_index]
        df_test = df_test[df_test.is_generated == 0]
        print("df_test len:", len(df_test))
        name = ["original", "diffusion", "gan"][use_generated_nodules]
        generate_coco(df_train, root / f"train_kfold_{name}_{i}.json")
        generate_coco(df_test, root / f"test_kfold_{name}_{i}.json")

In [24]:
# save_ann_kfold(df_all, 0, output_dir)
# save_ann_kfold(df_all, 1, output_dir / "kfold10_new")
save_ann_kfold(df_all, 2, output_dir / "kfold10_new")

df_test len: 110
df_test len: 108
df_test len: 133
df_test len: 113
df_test len: 108
df_test len: 115
df_test len: 117
df_test len: 114
df_test len: 114
df_test len: 110


# Validate images

In [8]:
for i, row in df_all.iterrows():
    try:
        img = imread(output_dir / row.filename, as_gray=True)
        if img.shape != (512, 512):
            raise Exception(f"Bad image shape: {img.shape}")
        if not img.std() > 0.1:
            raise Exception("Image contrast too low")
        if any(np.isnan(img.flatten())):
            raise Exception("Image has NaN")
    except Exception as e:
        print(f"{row.filename} has errors: {e}")

In [1]:
df_gan_fold = pd.DataFrame(coco2ann(output_dir / "kfold10/train_kfold_gan_4.json"))
df_gan_fold = df_gan_fold.sample(frac=1.0)
generate_coco(df_gan_fold, output_dir / "kfold10_shuffled/train_kfold_gan_4.json")

NameError: name 'pd' is not defined