In [None]:
import os
from PIL import Image
from ast import literal_eval
import pandas as pd
from tqdm.auto import tqdm
import numpy as np
import pydicom
import glob
import cv2
from pydicom.pixel_data_handlers.util import apply_voi_lut

In [None]:
# Edge length, in pixels, of the square images and masks this notebook writes;
# it is also embedded in the output folder and metadata file names.
SIZE = 512
# Root folder for the input CSVs and the DICOM tree; outputs are written under it as well.
DATA_PATH = './data'

In [None]:
def read_xray(path, voi_lut=True, fix_monochrome=True):
    """Load a DICOM file and return its pixels rescaled to 8-bit grayscale.

    Args:
        path: Location of the DICOM file to read.
        voi_lut: When true, pass the raw pixels through ``apply_voi_lut`` so
            that the VOI LUT recorded by the DICOM device (if available) is
            used to produce a "human-friendly" view.
        fix_monochrome: When true, invert images whose
            ``PhotometricInterpretation`` is ``"MONOCHROME1"``, which would
            otherwise look inverted.

    Returns:
        A ``np.uint8`` array min-max rescaled to the 0..255 range. An image
        without any contrast (all pixels equal) is returned as all zeros.
    """
    # dcmread is the supported entry point; read_file is a deprecated alias.
    dicom = pydicom.dcmread(path)
    pixels = dicom.pixel_array
    data = apply_voi_lut(pixels, dicom) if voi_lut else pixels
    # Do the arithmetic in floating point: with narrow signed integer pixels
    # the subtractions below could otherwise wrap around.
    data = data.astype(np.float64)
    if fix_monochrome and dicom.PhotometricInterpretation == "MONOCHROME1":
        data = np.amax(data) - data
    data = data - np.min(data)
    peak = np.max(data)
    if peak == 0:
        # Constant image: dividing by the peak would yield NaNs.
        return np.zeros(data.shape, dtype=np.uint8)
    data = data / peak
    return (data * 255).astype(np.uint8)

def resize(array, size, keep_ratio=False, resample=Image.LANCZOS):
    """Turn a pixel array into a PIL image bounded by a ``size`` x ``size`` box.

    Args:
        array: Pixel data accepted by ``Image.fromarray``.
        size: Edge length of the target square, in pixels.
        keep_ratio: If true, the image is fitted into the square with
            ``Image.thumbnail``; otherwise it is resized to exactly
            ``size`` x ``size``.
        resample: Resampling filter handed to PIL.

    Returns:
        The resulting ``PIL.Image.Image``.
    """
    target = (size, size)
    picture = Image.fromarray(array)
    if not keep_ratio:
        return picture.resize(target, resample)
    # thumbnail() modifies the image in place and returns nothing.
    picture.thumbnail(target, resample)
    return picture

def img_mask(row, dim_x, dim_y, size):
    """Rasterise the bounding boxes of one annotation row into a square mask.

    Args:
        row: Mapping-like record (e.g. a pandas Series) whose ``'boxes'``
            entry is either a string holding a Python-literal list of dicts
            with ``x``, ``y``, ``width`` and ``height`` keys, or a missing
            value (NaN/None) when there are no boxes.
        dim_x: Width of the original image, in pixels.
        dim_y: Height of the original image, in pixels.
        size: Edge length of the square output mask.

    Returns:
        ``np.uint8`` array of shape ``(size, size)``: 255 inside every box
        (scaled from the original dimensions to ``size``), 0 elsewhere.
    """
    img = np.zeros((size, size), dtype=np.uint8)
    wratio = size / dim_x
    hratio = size / dim_y
    raw_boxes = row['boxes']
    # Look at the 'boxes' cell only. Testing the whole row for NaNs would
    # discard valid boxes whenever any unrelated column is missing.
    boxes = literal_eval(raw_boxes) if isinstance(raw_boxes, str) else []
    for box in boxes:
        left = int(box['x'] * wratio)
        top = int(box['y'] * hratio)
        right = left + int(box['width'] * wratio)
        bottom = top + int(box['height'] * hratio)
        # thickness=-1 asks OpenCV for a filled rectangle.
        img = cv2.rectangle(img, (left, top), (right, bottom), 255, thickness=-1)
    return img

In [None]:
# Load the image-level and study-level training tables from DATA_PATH.
df_train_img = pd.read_csv(f'{DATA_PATH}/train_image_level.csv')
df_train_sty = pd.read_csv(f'{DATA_PATH}/train_study_level.csv')

In [None]:
# Strip the '_study' marker from the study-level id to obtain the key shared
# with the image-level table. The guard keeps the cell re-runnable: once 'id'
# has been dropped there is nothing left to derive.
if 'id' in df_train_sty.columns:
    df_train_sty['StudyInstanceUID'] = df_train_sty['id'].apply(lambda x: x.replace('_study', ''))
    del df_train_sty['id']

# Attach the study-level columns to every image row (inner join on the study
# UID). The merge is skipped when all of those columns are already present, so
# running the cell twice does not create duplicated `_x` / `_y` columns.
study_cols = [col for col in df_train_sty.columns if col != 'StudyInstanceUID']
if not study_cols or not set(study_cols).issubset(df_train_img.columns):
    df_train_img = df_train_img.merge(df_train_sty, on='StudyInstanceUID')

In [None]:
# Resolve one DICOM path per row of df_train_img.
# NOTE(review): the lookup is keyed by StudyInstanceUID only, so all rows that
# share a study receive the same file - confirm this is intended.
paths = []
counter = 0
for sid in tqdm(df_train_img['StudyInstanceUID']):
    # Sorted so the selected file does not depend on filesystem listing order.
    matches = sorted(glob.glob(f'{DATA_PATH}/train/{sid}/*/*'))
    if matches:
        paths.append(matches[0])
    else:
        # An explicit emptiness check replaces a bare `except:` that also hid
        # unrelated errors. '' marks a row without a file; the conversion
        # loop skips such rows.
        paths.append('')
        counter += 1
print('no dicom images:', counter)
df_train_img['path'] = paths

In [None]:
# Convert every resolved DICOM into a SIZE x SIZE PNG plus a same-sized box
# mask, recording the original dimensions of each converted image.
counter = 0
images_paths = []
dim_x = []
dim_y = []
load_path = f'{DATA_PATH}/train/'
save_path = f'{DATA_PATH}/train_{SIZE}/'
save_path_masks = f'{DATA_PATH}/train_{SIZE}_masks/'
os.makedirs(save_path, exist_ok=True)
os.makedirs(save_path_masks, exist_ok=True)
for idx, row in tqdm(df_train_img.iterrows(), desc='train', total=len(df_train_img)):
    file = row['path']
    if file == '':
        # No DICOM was found for this row.
        counter += 1
        continue
    # Swap only the extension: a plain str.replace('dcm', 'png') would also
    # rewrite a 'dcm' occurring inside the file name itself.
    png_name = os.path.splitext(os.path.basename(file))[0] + '.png'
    xray = read_xray(file)
    height, width = xray.shape[0], xray.shape[1]
    img = resize(xray, size=SIZE) # keep_ratio=True to have original aspect ratio
    img.save(save_path + png_name)
    images_paths.append(png_name)
    dim_x.append(width)
    dim_y.append(height)
    # The mask is drawn directly at SIZE x SIZE from the original dimensions.
    mask = Image.fromarray(img_mask(row, width, height, size=SIZE))
    mask.save(save_path_masks + png_name)
print('files omitted:', counter)

In [None]:
# Persist one metadata row per converted image: PNG file name plus the
# original width (dim_x) and height (dim_y).
df = pd.DataFrame({'img': images_paths, 'dim_x': dim_x, 'dim_y': dim_y})
df.to_csv(f'{DATA_PATH}/train_meta_{SIZE}.csv', index=False)