In [1]:
# Create a directory under working to save everything
%mkdir vinbigdata-mlp-1024-images
%cd vinbigdata-mlp-1024-images

/kaggle/working/vinbigdata-mlp-1024-images


In [2]:
import numpy as np
import pydicom
import cv2
import os
from pydicom.pixel_data_handlers.util import apply_voi_lut
from tqdm.auto import tqdm
from pathos.multiprocessing import ProcessingPool as Pool
from functools import partial
import pandas as pd

In [3]:
def process_xray(path):

    # Read the dicom file
    dicom = pydicom.read_file(path)

    # VOI LUT (if available by DICOM device) is used to transform raw DICOM data to "human-friendly" view
    pixels = apply_voi_lut(dicom.pixel_array, dicom)
    
    # Depending on this value, X-ray may look inverted - fix that:
    if dicom.PhotometricInterpretation == "MONOCHROME1":
        pixels = np.amax(pixels) - pixels
    
    # This is rescaling all the pixel values to be in the RGB range
    pixels = pixels - np.min(pixels)
    pixels = pixels / np.max(pixels)
    pixels = (pixels * 255.0).astype(np.uint8)
    
    # Resize the image
    pixels = cv2.resize(pixels, (640, 640))
    
    meta = {
        'orig_width': dicom.get('Columns'),
        'orig_height': dicom.get('Rows'),
        'age': dicom.get('PatientAge'),
        'sex': dicom.get('PatientSex')
    }
    
    return pixels, meta

def resize_and_save(img_filename, test_train):
    # Preprocess and resize the x-ray
    fullpath = os.path.join(f'../../input/vinbigdata-chest-xray-abnormalities-detection/{test_train}/', img_filename)
    pixels, meta = process_xray(fullpath)

    # Save the image
    cv2.imwrite(os.path.join(test_train, img_filename.replace('.dicom','.png')), pixels)
    
    # Return the image id and meta data
    img_id = img_filename.replace('.dicom', '') 
    return (img_id, meta)

%mkdir train
%mkdir test

meta_data = []

for test_train in ['train', 'test']:
    imgs = os.listdir(f'../../input/vinbigdata-chest-xray-abnormalities-detection/{test_train}/')
#     !! REMOVE !!
#     imgs = imgs[:100]
    
    if test_train == 'train':
        # Remove this image as it's smaller than 1024 and we don't want to scale up
        imgs.remove('b1c50f14b8ed2fe9d3478b115600eee3.dicom')
    
    # Process all the images in parallel
    worker_fn = partial(resize_and_save, test_train=test_train)
    with Pool(4) as p:
        meta_data.extend(list(tqdm(p.imap(worker_fn, imgs), total=len(imgs))))
                
# Convert the meta data to a dict for fast lookup
meta_data = dict(meta_data)


  0%|          | 0/14999 [00:00<?, ?it/s]

  f"The (0028,0101) 'Bits Stored' value ({ds.BitsStored}-bit) "
  f"The (0028,0101) 'Bits Stored' value ({ds.BitsStored}-bit) "
  f"The (0028,0101) 'Bits Stored' value ({ds.BitsStored}-bit) "
  f"The (0028,0101) 'Bits Stored' value ({ds.BitsStored}-bit) "
  f"The (0028,0101) 'Bits Stored' value ({ds.BitsStored}-bit) "
  f"The (0028,0101) 'Bits Stored' value ({ds.BitsStored}-bit) "
  f"The (0028,0101) 'Bits Stored' value ({ds.BitsStored}-bit) "
  f"The (0028,0101) 'Bits Stored' value ({ds.BitsStored}-bit) "


  0%|          | 0/3000 [00:00<?, ?it/s]

In [4]:
def calc_iou(boxA, boxB):
    # determine the (x, y)-coordinates of the intersection rectangle
    xA = max(boxA[0], boxB[0])
    yA = max(boxA[1], boxB[1])
    xB = min(boxA[2], boxB[2])
    yB = min(boxA[3], boxB[3])
    # compute the area of intersection rectangle
    interArea = max(0, xB - xA + 1) * max(0, yB - yA + 1)
    # compute the area of both the prediction and ground-truth
    # rectangles
    boxAArea = (boxA[2] - boxA[0] + 1) * (boxA[3] - boxA[1] + 1)
    boxBArea = (boxB[2] - boxB[0] + 1) * (boxB[3] - boxB[1] + 1)
    # compute the intersection over union by taking the intersection
    # area and dividing it by the sum of prediction + ground-truth
    # areas - the interesection area
    iou = interArea / float(boxAArea + boxBArea - interArea)
    # return the intersection over union value
    return iou

In [5]:
# Preprocess the CSV file
data_dir = "../../input/vinbigdata-chest-xray-abnormalities-detection/"
df = pd.read_csv(f"{data_dir}/train.csv")

# Add additional meta data columns with default values
df['age'] = None
df['sex'] = None
df['orig_width'] = None
df['orig_height'] = None
df['x_center'] = None
df['y_center'] = None
df['width'] = None
df['height'] = None

# Delete entries for anomolous image that's smaller than 1024x1024
df.drop(df[df['image_id'] == 'b1c50f14b8ed2fe9d3478b115600eee3'].index, inplace=True)

img_dir = os.path.join(data_dir, 'train')

# Scale the annotated boxes to the new image dimensions
for idx, row in df.iterrows():
    if row['image_id'] in meta_data:
        meta = meta_data[row['image_id']]
    else:
        continue

    if row['class_id'] != 14:
        # Scale the bounding box co-ords
        new_xmin = round(row['x_min'] * (640 / meta['orig_width']))
        new_xmax = round(row['x_max'] * (640 / meta['orig_width']))
        new_ymin = round(row['y_min'] * (640 / meta['orig_height']))
        new_ymax = round(row['y_max'] * (640 / meta['orig_height']))
        
        df.at[idx, 'x_min'] = new_xmin
        df.at[idx, 'x_max'] = new_xmax
        df.at[idx, 'y_min'] = new_ymin
        df.at[idx, 'y_max'] = new_ymax
        
        # Convert to yolo format
        df.at[idx, 'x_center'] = (new_xmin + new_xmax) / 640
        df.at[idx, 'y_center'] = (new_ymin + new_ymax) / 640
        df.at[idx, 'width'] = (new_xmax - new_xmin) / 640
        df.at[idx, 'height'] = (new_ymax - new_ymin) / 640
            
    # Set the other meta data
    df.at[idx, 'age'] = meta['age']
    df.at[idx, 'sex'] = meta['sex']
    df.at[idx, 'orig_width'] = meta['orig_width']
    df.at[idx, 'orig_height'] = meta['orig_height']
    

# Save a copy of the training set containing all the annotations
df.to_csv('train_all_annotations.csv', index=False)

In [6]:
df

Unnamed: 0,image_id,class_name,class_id,rad_id,x_min,y_min,x_max,y_max,age,sex,orig_width,orig_height,x_center,y_center,width,height
0,50a418190bc3fb1ef1633bf9678929b3,No finding,14,R11,,,,,,M,2332,2580,,,,
1,21a10246a5ec7af151081d0cd6d65dc9,No finding,14,R7,,,,,,,2954,3159,,,,
2,9a5094b2563a1ef3ff50dc5c7ff71345,Cardiomegaly,3,R10,213.0,377.0,509.0,502.0,061Y,F,2080,2336,1.128125,1.373438,0.462500,0.195312
3,051132a778e61a86eb147c7c6f564dfe,Aortic enlargement,0,R10,351.0,165.0,448.0,226.0,,F,2304,2880,1.248438,0.610938,0.151562,0.095312
4,063319de25ce7edb9b1c6b8881290140,No finding,14,R10,,,,,022Y,F,2540,3072,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67909,936fd5cff1c058d39817a08f58b72cae,No finding,14,R1,,,,,,,2444,3200,,,,
67910,ca7e72954550eeb610fe22bf0244b7fa,No finding,14,R1,,,,,,O,1994,2430,,,,
67911,aa17d5312a0fb4a2939436abca7f9579,No finding,14,R8,,,,,,O,2048,2500,,,,
67912,4b56bc6d22b192f075f13231419dfcc8,Cardiomegaly,3,R8,251.0,307.0,546.0,411.0,052Y,F,1968,2040,1.245313,1.121875,0.460938,0.162500


In [7]:
# Now build some datasets that have some annotations removed

# For each image want to average out annotations by separate radiologists that
# identify the same abnormality.
df_half = df.copy()

# Get the rows with annotations
ann_df = df_half[df_half['class_id'] != 14]

for name, group in ann_df.groupby(['image_id', 'class_id']):
    avgs = []
    for idx, row in group.iterrows():
        box = [row['x_min'], row['y_min'], row['x_max'], row['y_max']]
        yolo_box = np.array([row['x_center'], row['y_center'], row['width'], row['height']])
        
        if avgs:
            intbox_or_default = next((a for a in avgs if calc_iou(a['box'], box) > 0), None)
            
            if intbox_or_default:
                df_half.drop(idx, inplace=True)
                intbox_or_default['yolo_box'] = intbox_or_default['yolo_box'] + yolo_box
                intbox_or_default['count'] += 1
                
            else:
                avgs.append({'idx':idx, 'box':box, 'yolo_box': yolo_box, 'count':1})
        else:
            avgs.append({'idx':idx, 'box':box, 'yolo_box': yolo_box, 'count':1})
        
    for avg in avgs:
        df_half.at[avg['idx'], 'x_center'] = avg['x_center'] / avg['count']
        df_half.at[avg['idx'], 'y_center'] = avg['y_center'] / avg['count']
        df_half.at[avg['idx'], 'width'] = avg['width'] / avg['count']
        df_half.at[avg['idx'], 'height'] = avg['height'] / avg['count']
            
df_half.to_csv('train.csv', index=False)

KeyError: 'x_center'

In [8]:
# # Now build some datasets that have some annotations removed

# # For each image want to remove annotations by separate radiologists that
# # identify the same abnormality. Use threshold of 0.5 the same threshold
# # used to determine correctness of predictions.
# df_half = df.copy()

# # Get the rows with annotations
# ann_df = df_half[df_half['class_id'] != 14]

# for name, group in ann_df.groupby(['image_id', 'class_id']):
#     existing = []
#     for idx, row in group.iterrows():
#         box = [row['x_min'], row['y_min'], row['x_max'], row['y_max']]
#         if existing:
#             if any(map(lambda x : calc_iou(x, box) > 0.5, existing)):
#                 df_half.drop(idx, inplace=True)
#             else:
#                 existing.append(box)
#         else:
#             existing.append(box)
            
# df_half.to_csv('train.csv', index=False)

In [9]:
# # Now build some datasets that have some annotations removed

# # For each image want to remove annotations by separate radiologists that
# # identify the same abnormality. Use threshold of 0.75

# df_tq = df.copy()

# # Get the rows with annotations
# ann_df = df_tq[df_tq['class_id'] != 14]

# for name, group in ann_df.groupby(['image_id', 'class_id']):
#     existing = []
#     for idx, row in group.iterrows():
#         box = [row['x_min'], row['y_min'], row['x_max'], row['y_max']]
#         if existing:
#             if any(map(lambda x : calc_iou(x, box) > 0.75, existing)):
#                 df_tq.drop(idx, inplace=True)
#             else:
#                 existing.append(box)
#         else:
#             existing.append(box)
            
# df_tq.to_csv('train_75.csv', index=False)

In [10]:
%%bash
# Zip the contents of the folder and delete the original files
zip -r -q ../output.zip .
cd ..
rm -rf vinbigdata-mlp-1024-images