# Generates annotation files used for MMClassification

## Input

A directory of cropped images along with `labels_all.csv`, consisting of samples from the original dataset AND samples generated by the diffusion model.

The directory can be generated with `crop.ipynb`.

## Output

* `train_ann.txt`: training set EXCLUDING diffusion-generated images. To make the dataset balanced, we dropped some non-nodule images.
* `train_gen_ann.txt`: training set INCLUDING diffusion-generated nodule images AND duplicates of randomly chosen non-nodule samples, added to keep the two classes balanced.
    * Diffusion-generated non-nodule images are not used, since non-nodule images are already abundant in the original dataset.
* `test_ann.txt`: testing set, which is kept balanced and does not include images generated by the diffusion model.

In [20]:
import pandas as pd
from pathlib import Path

# Directory that holds the cropped images together with ``labels_all.csv``.
root = Path("luna16_cropped")

# Annotation files produced by this notebook; all of them live directly under ``root``.
ann_train_file = root.joinpath("train_ann.txt")          # training data excluding generated images
ann_train_gen_file = root.joinpath("train_gen_ann.txt")  # training data including generated images and padding of non-nodules
ann_test_file = root.joinpath("test_ann.txt")            # testing data

# Load the label table and preview its first rows.
labels_csv = root.joinpath("labels_all.csv")
df = pd.read_csv(labels_csv)
df.head()

Unnamed: 0,filename,is_nodule,vX,vY,vZ,is_generated
0,1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222...,1,406,155,117,0
1,1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222...,1,45,212,78,0
2,1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222...,0,225,348,82,0
3,1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222...,0,213,288,173,0
4,1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222...,0,131,397,86,0


In [21]:
def save_ann(df: pd.DataFrame, filename: "str | Path"):
    """Write one ``<filename> <is_nodule>`` line per row of *df* to *filename*.

    Args:
        df: Table with at least the columns ``filename`` and ``is_nodule``.
        filename: Destination path of the annotation file. An existing file
            is overwritten.
    """
    # Iterating the two columns directly avoids building a Series per row
    # (as ``iterrows`` does) and keeps each column's own dtype.
    lines = [
        f"{name} {label}\n"
        for name, label in zip(df["filename"], df["is_nodule"])
    ]
    # Explicit encoding so the output does not depend on the platform default.
    with open(filename, "w", encoding="utf-8") as f:
        f.writelines(lines)

In [22]:
# Fraction of each class that goes into the training split; the rest is the test split.
train_proportion = 0.8

# Row masks reused below.
original_mask = df.is_generated == 0
nodule_mask = df.is_nodule == 1

nr_nodules_dataset = int((nodule_mask & original_mask).sum())
print("number of nodules in the dataset:", nr_nodules_dataset)
l = nr_nodules_dataset

# Keep as many original non-nodules as there are original nodules.
df_nodule = df[nodule_mask & original_mask].iloc[:l]
df_non_nodule = df[(df.is_nodule == 0) & original_mask].iloc[:l]

# Number of rows per class in the training split.
n_train = int(l * train_proportion)

df_train = pd.concat([df_nodule.iloc[:n_train], df_non_nodule.iloc[:n_train]])
df_test = pd.concat([df_nodule.iloc[n_train:], df_non_nodule.iloc[n_train:]])

df_generated_nodule = df[nodule_mask & (df.is_generated == 1)]

# One randomly drawn training non-nodule per generated nodule
# (these rows are already in df_train, so they end up duplicated).
non_nodule_padding = df_non_nodule.iloc[:n_train].sample(
    len(df_generated_nodule), random_state=90
)
df_train_gen = pd.concat([df_train, df_generated_nodule, non_nodule_padding])

for split_df, split_file in (
    (df_train, ann_train_file),
    (df_train_gen, ann_train_gen_file),
    (df_test, ann_test_file),
):
    save_ann(split_df, split_file)

number of nodules in the dataset: 1524


In [23]:
from sklearn.model_selection import KFold

def save_ann_kfold(df: pd.DataFrame, use_generated_nodules: int, root: Path):
    """Write 10-fold cross-validation annotation files under *root*.

    For every fold ``i`` two files are written with ``save_ann``:
    ``train_kfold_<name>_<i>.txt`` and ``test_kfold_<name>_<i>.txt``, where
    ``<name>`` is ``original``, ``diffusion`` or ``gan`` depending on
    *use_generated_nodules*. The nodule proportion of each test fold is
    printed.

    Args:
        df: Label table with the columns ``filename``, ``is_nodule`` and
            ``is_generated``.
        use_generated_nodules: Which generated nodules to add to the original
            ones: 0 - none, 1 - diffusion, 2 - gan. Rows are matched by
            ``is_generated == use_generated_nodules``.
        root: Directory the annotation files are written to.

    Raises:
        ValueError: If *use_generated_nodules* is not 0, 1 or 2.
    """
    names = ("original", "diffusion", "gan")
    # Validate before doing any work; a negative value would otherwise
    # silently index the name list from the end.
    if use_generated_nodules not in range(len(names)):
        raise ValueError(
            f"use_generated_nodules must be 0, 1 or 2, got {use_generated_nodules!r}"
        )
    name = names[use_generated_nodules]

    is_original = df.is_generated == 0
    if use_generated_nodules:
        nodule_source = is_original | (df.is_generated == use_generated_nodules)
    else:
        nodule_source = is_original
    df_nodule = df[(df.is_nodule == 1) & nodule_source]

    # Only original non-nodules are used, so generated non-nodule rows
    # (from either generator) never end up in the training folds.
    df_non_nodule = df[(df.is_nodule == 0) & is_original][:len(df_nodule)]

    df_dataset = pd.concat([df_nodule, df_non_nodule])
    # Shuffle once with a fixed seed; KFold below then splits the shuffled
    # rows into contiguous folds.
    df_dataset = df_dataset.sample(len(df_dataset), random_state=57).reset_index(drop=True)

    kf = KFold(n_splits=10)

    for i, (train_index, test_index) in enumerate(kf.split(df_dataset)):
        df_train = df_dataset.iloc[train_index]
        df_test = df_dataset.iloc[test_index]
        # Generated images are dropped from the test fold, so a test fold can
        # hold fewer nodules than non-nodules when generated nodules are used.
        df_test = df_test[df_test.is_generated == 0]
        print("Nodule proportion:", df_test.is_nodule.mean())
        save_ann(df_train, root / f"train_kfold_{name}_{i}.txt")
        save_ann(df_test, root / f"test_kfold_{name}_{i}.txt")

In [24]:
# Write the k-fold annotation files once per nodule source:
# 0 - original only, 1 - original + diffusion, 2 - original + gan.
for nodule_source in (0, 1, 2):
    save_ann_kfold(df, nodule_source, root)

Nodule proportion: 0.46557377049180326
Nodule proportion: 0.5311475409836065
Nodule proportion: 0.4852459016393443
Nodule proportion: 0.5344262295081967
Nodule proportion: 0.5442622950819672
Nodule proportion: 0.5016393442622951
Nodule proportion: 0.49836065573770494
Nodule proportion: 0.4918032786885246
Nodule proportion: 0.48026315789473684
Nodule proportion: 0.46710526315789475
Nodule proportion: 0.4128686327077748
Nodule proportion: 0.4128686327077748
Nodule proportion: 0.390745501285347
Nodule proportion: 0.3835978835978836
Nodule proportion: 0.3769633507853403
Nodule proportion: 0.382051282051282
Nodule proportion: 0.42159383033419023
Nodule proportion: 0.41711229946524064
Nodule proportion: 0.424
Nodule proportion: 0.3951612903225806
Nodule proportion: 0.4148936170212766
Nodule proportion: 0.39313984168865435
Nodule proportion: 0.38944723618090454
Nodule proportion: 0.40106951871657753
Nodule proportion: 0.3763157894736842
Nodule proportion: 0.3756345177664975
Nodule proportion: