In [1]:
# Create a directory under working to save everything
%mkdir vinbigdata-mlp-1024-images
%cd vinbigdata-mlp-1024-images

/kaggle/working/vinbigdata-mlp-1024-images


In [2]:
import os
import shutil
from functools import partial

import cv2
import numpy as np
import pandas as pd
import pydicom
from pathos.multiprocessing import ProcessingPool as Pool
from PIL import Image
from pydicom.pixel_data_handlers.util import apply_voi_lut
from skmultilearn.model_selection import iterative_train_test_split
from tqdm.auto import tqdm

In [3]:
%mkdir images

In [4]:
# Add in images from NIH dataset
# Side length, in pixels, used by the helpers below: resize_img resizes to
# targetsize x targetsize, get_box_coords scales boxes by targetsize / 1024,
# and get_yolo_coords divides by targetsize to normalise.
targetsize = 1024

def translate_class_name(name):
    """Translate an NIH finding label into the class name written to the training CSV.

    Only the labels that are spelled differently are remapped ('Mass' and
    'Nodule' collapse into the single 'Nodule/Mass' class); any other label
    is returned unchanged.
    """
    renamed_labels = {
        'Effusion': 'Pleural effusion',
        'Infiltrate': 'Infiltration',
        'Mass': 'Nodule/Mass',
        'Nodule': 'Nodule/Mass',
    }
    return renamed_labels.get(name, name)

def get_class_id(name):
    """Return the integer class id associated with an NIH finding label.

    'Mass' and 'Nodule' share one id, mirroring translate_class_name.

    Raises:
        Exception: if ``name`` is not one of the supported NIH labels.
    """
    ids_by_label = {
        'Atelectasis': 1,
        'Effusion': 10,
        'Cardiomegaly': 3,
        'Infiltrate': 6,
        'Pneumothorax': 12,
        'Mass': 8,
        'Nodule': 8,
    }
    if name not in ids_by_label:
        raise Exception(f'Unknown name {name}')
    return ids_by_label[name]
    
def resize_img(name, base_dir='/kaggle/input/data/', dest_dir='./images'):
    """Resize one NIH image to ``targetsize`` x ``targetsize`` and save it under ``dest_dir``.

    The file is searched for in the ``images_001`` .. ``images_012`` folders
    below ``base_dir``; the first folder that contains it wins. If no folder
    contains the file, nothing is written.

    Args:
        name: Image file name, e.g. the 'Image Index' value of an NIH row.
        base_dir: Root of the NIH dataset (defaults to the Kaggle input path).
        dest_dir: Directory the resized image is written to; created if missing.

    Returns:
        None.
    """
    for folder_no in range(1, 13):
        src_path = os.path.join(base_dir, f"images_{folder_no:03d}/images/{name}")
        if not os.path.isfile(src_path):
            continue
        os.makedirs(dest_dir, exist_ok=True)
        # Image.open keeps the file handle open lazily; the context manager
        # releases it deterministically, even if resize() raises.
        with Image.open(src_path) as im:
            resized = im.resize((targetsize, targetsize))
        resized.save(os.path.join(dest_dir, name))
        return

def copy_image(name, base_dir='/kaggle/input/data/', dest_dir='./images'):
    """Copy one NIH image, unmodified, into ``dest_dir``.

    The file is searched for in the ``images_001`` .. ``images_012`` folders
    below ``base_dir``; the first folder that contains it wins. If no folder
    contains the file, nothing is copied.

    Args:
        name: Image file name, e.g. the 'Image Index' value of an NIH row.
        base_dir: Root of the NIH dataset (defaults to the Kaggle input path).
        dest_dir: Directory the image is copied into; created if missing.

    Returns:
        None.
    """
    for folder_no in range(1, 13):
        src_path = os.path.join(base_dir, f"images_{folder_no:03d}/images/{name}")
        if not os.path.isfile(src_path):
            continue
        os.makedirs(dest_dir, exist_ok=True)
        # Plain-Python copy instead of the %cp shell magic: no subprocess per
        # image, and file names with spaces or shell metacharacters are safe.
        shutil.copy(src_path, os.path.join(dest_dir, name))
        return
    
    
def get_box_coords(row):
    """Turn one NIH bounding-box row into corner coordinates at the target size.

    ``row`` is indexed with the keys 'Bbox [x', 'y', 'w' and 'h]' (origin
    plus width/height). Every value is multiplied by ``targetsize / 1024``.

    Returns:
        dict with keys 'x_min', 'x_max', 'y_min', 'y_max'.
    """
    # Rescale from the 1024 reference used by the annotations to the new image size.
    scale = targetsize / 1024
    left = row['Bbox [x'] * scale
    right = left + row['w'] * scale
    top = row['y'] * scale
    bottom = top + row['h]'] * scale
    return dict(x_min=left, x_max=right, y_min=top, y_max=bottom)

def get_yolo_coords(box):
    """Convert a corner-style box into centre/size values normalised by ``targetsize``.

    Args:
        box: mapping with 'x_min', 'x_max', 'y_min' and 'y_max'.

    Returns:
        dict with 'x_center', 'y_center', 'width' and 'height', each divided
        by ``targetsize``.
    """
    x_lo, x_hi = box['x_min'], box['x_max']
    y_lo, y_hi = box['y_min'], box['y_max']
    return {
        'x_center': (x_lo + x_hi) / (2 * targetsize),
        'y_center': (y_lo + y_hi) / (2 * targetsize),
        'width': (x_hi - x_lo) / targetsize,
        'height': (y_hi - y_lo) / targetsize,
    }

# NIH metadata: per-image entries (used for the view position) and the
# bounding-box annotations that get merged into the training set.
nihf = pd.read_csv('/kaggle/input/data/Data_Entry_2017.csv')
nih = pd.read_csv('/kaggle/input/data/BBox_List_2017.csv')

# Existing training annotations that the NIH boxes are appended to.
df = pd.read_csv('/kaggle/input/vinbig1024stratified/train.csv')

# Build the image -> view lookup once, instead of filtering the whole
# metadata frame again for every bounding-box row.
view_by_image = dict(zip(nihf['Image Index'], nihf['View Position']))

to_append = []
copied_images = set()

for idx, row in nih.iterrows():
    image_name = row['Image Index']

    # Skip Pneumonia as it's not in VinbigData
    if row['Finding Label'] == 'Pneumonia':
        continue

    # Only include images from NIH that have the same view.
    # Plain indexing (not .get) keeps a missing metadata entry a hard error.
    if view_by_image[image_name] != 'PA':
        continue

    # An image can carry several boxes; copy the file only once.
    if image_name not in copied_images:
        copy_image(image_name)
        copied_images.add(image_name)

    # Corner coordinates are stored as-is (get_yolo_coords is not applied here).
    box_coords = get_box_coords(row)
    class_name = translate_class_name(row['Finding Label'])
    class_id = get_class_id(row['Finding Label'])
    # Strip the file extension off
    img_id = os.path.splitext(image_name)[0]
    to_append.append({'image_id': img_id, 'class_name': class_name, 'class_id': class_id, **box_coords})

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported way to add the collected rows.
df = pd.concat([df, pd.DataFrame(to_append)], ignore_index=True)

In [5]:
# Write the combined annotation table into the current working directory;
# index=False leaves the DataFrame's row index out of the file.
df.to_csv('train_with_nih.csv', index=False)

In [6]:
%%bash
# Zip the contents of the folder and delete the original files.
# Abort on the first failing command so the folder is never deleted
# unless the archive was written successfully.
set -euo pipefail
zip -r -q ../output.zip .
cd ..
rm -rf vinbigdata-mlp-1024-images