# Convert images from DICOM format to PNG or JPG format

In [1]:
from tqdm import tqdm_notebook as tqdm
import os
import pydicom
import glob
from PIL import Image
import joblib
%matplotlib inline 
import pandas as pd

## Create a folder containing all images

In [2]:
# Locations of the source DICOM files and of the converted images.
PATH="./rsna-pneumonia-detection-challenge"
train_PATH=PATH+'/stage_2_train_images/'
dicom_paths = glob.glob(train_PATH+'*dcm')
# Target (width, height). It has to be defined here: png_path embeds size[0]
# and the conversion cells pass `size` on, so leaving it commented out makes
# this cell fail with a NameError.
size = (512,512)
png_path = f'./data/size{size[0]}/stage_2_train_images/'


def dicom_to_png(dcm_path, png_path, size=None):
    """Convert one DICOM file to a PNG image saved inside ``png_path``.

    Args:
        dcm_path: Path of the source DICOM file.
        png_path: Output directory; created if it does not exist yet.
        size: Optional ``(width, height)`` handed to ``Image.resize``.
            ``None`` keeps the image at its original resolution.
    """
    # dcmread is the current name of the deprecated pydicom.read_file alias.
    im = pydicom.dcmread(dcm_path).pixel_array
    img = Image.fromarray(im)
    if size is not None:
        img = img.resize(size)  # size = (width, height)
    # Derive the output name with os.path so it does not depend on '/'
    # separators or on the extension being exactly three characters long.
    stem = os.path.splitext(os.path.basename(dcm_path))[0]
    # exist_ok=True: several parallel workers may try to create the folder
    # at once; a separate exists() check followed by makedirs() can raise
    # FileExistsError in that case.
    os.makedirs(png_path, exist_ok=True)
    img.save(os.path.join(png_path, stem + '.png'))

def prepare_images_njobs(dicom_paths, png_path, size=None, n_jobs=-1):
    """Run ``dicom_to_png`` over every path in ``dicom_paths`` via joblib.

    Args:
        dicom_paths: Iterable of DICOM file paths to convert.
        png_path: Output directory forwarded to ``dicom_to_png``.
        size: Optional target size forwarded to ``dicom_to_png``.
        n_jobs: Forwarded to ``joblib.Parallel``.
    """
    runner = joblib.Parallel(n_jobs=n_jobs)
    convert = joblib.delayed(dicom_to_png)
    # tqdm wraps the input list so progress is shown as tasks are consumed.
    tasks = (convert(path, png_path, size) for path in tqdm(dicom_paths))
    runner(tasks)

In [None]:
# Smoke test: convert only the first 10 files, passing `size`, before
# launching the full run.
prepare_images_njobs(dicom_paths[:10], png_path, size)

In [None]:
# Running all files.
# `size` is passed explicitly: without it the function falls back to
# size=None and saves unresized images into the size-named png_path folder.
prepare_images_njobs(dicom_paths, png_path, size)

In [None]:
# Number of entries currently in the output folder.
len(os.listdir(png_path))

## Create a folder containing only PA view position images

In [2]:
# final_df is the same table as train_df in the EDA notebook
df = pd.read_csv('final_df.csv')

In [4]:
# List the columns available in the loaded table.
df.columns

Index(['Unnamed: 0', 'Unnamed: 0_x', 'SOPInstanceUID', 'AccessionNumber',
       'PatientName', 'PatientID', 'PatientBirthDate', 'PatientSex',
       'PatientAge', 'ViewPosition', 'StudyInstanceUID', 'SeriesInstanceUID',
       'StudyID', 'PatientOrientation', 'Unnamed: 0_y', 'patientId', 'x', 'y',
       'width', 'height', 'Target', 'class'],
      dtype='object')

In [10]:
# Keep only the first row for each PatientID; later duplicates are dropped.
df_patient = df.drop_duplicates('PatientID', keep='first')

In [25]:
# Source DICOM folder and output folder for the PA-only export.
PATH = "./rsna-pneumonia-detection-challenge"
train_PATH = f"{PATH}/stage_2_train_images/"
dicom_paths = glob.glob(f"{train_PATH}*dcm")
size = (1024, 1024)  # (width, height); size[0] also names the output folder
png_path = "./data/size{}/PA/".format(size[0])


def dicom_to_png(dcm_path, png_path, size=None):
    """Save one DICOM file as a JPEG in ``png_path`` if its view is 'PA'.

    The view position is looked up in the module-level ``df_patient`` frame:
    the file name without its extension is matched against the ``patientId``
    column and the ``ViewPosition`` of the first matching row is used.

    Args:
        dcm_path: Path of the source DICOM file.
        png_path: Output directory; created if it does not exist yet.
        size: Optional ``(width, height)`` handed to ``Image.resize``.
            ``None`` keeps the image at its original resolution.

    Files whose id has no row in ``df_patient`` are skipped rather than
    raising ``IndexError``, since their view position is unknown.
    """
    # exist_ok=True: parallel workers may race to create the folder, and a
    # separate exists() check followed by makedirs() can raise
    # FileExistsError. The folder is created even when nothing is saved.
    os.makedirs(png_path, exist_ok=True)

    # os.path handles any separator and any extension length.
    img_id = os.path.splitext(os.path.basename(dcm_path))[0]
    views = df_patient.loc[df_patient['patientId'] == img_id, 'ViewPosition'].values
    # Decide before touching the pixel data so that non-PA files are never
    # decoded.
    if len(views) == 0 or views[0] != 'PA':
        return

    # dcmread is the current name of the deprecated pydicom.read_file alias.
    im = pydicom.dcmread(dcm_path).pixel_array
    img = Image.fromarray(im)
    # Honour the caller's `size` (it used to be overwritten with None here,
    # which made the resize unreachable); skip the call when the image
    # already has the requested dimensions.
    if size is not None and img.size != tuple(size):
        img = img.resize(size)  # size = (width, height)
    img.save(os.path.join(png_path, img_id + '.jpg'))

def prepare_images_njobs(dicom_paths, png_path, size=None, n_jobs=-1):
    """Apply ``dicom_to_png`` to each entry of ``dicom_paths`` with joblib.

    Args:
        dicom_paths: Iterable of DICOM file paths to process.
        png_path: Output directory forwarded to ``dicom_to_png``.
        size: Optional target size forwarded to ``dicom_to_png``.
        n_jobs: Forwarded to ``joblib.Parallel``.
    """
    pool = joblib.Parallel(n_jobs=n_jobs)
    job = joblib.delayed(dicom_to_png)
    # tqdm wraps the input so a progress bar advances as tasks are consumed.
    pending = (job(one_path, png_path, size) for one_path in tqdm(dicom_paths))
    pool(pending)

In [26]:
# Smoke test: process only the first 10 files before launching the full run.
prepare_images_njobs(dicom_paths[:10], png_path, size)

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




In [32]:
# Process the complete list of DICOM files.
prepare_images_njobs(dicom_paths, png_path, size)

HBox(children=(IntProgress(value=0, max=26684), HTML(value='')))




### Confirm the number of PA images

In [33]:
# Number of entries currently in the PA output folder.
len(os.listdir(png_path))

14511

In [34]:
# Rows of df_patient whose ViewPosition is 'PA'.
df_PA = df_patient[df_patient['ViewPosition']=='PA']

In [35]:
# (rows, columns) of the PA subset; compare the row count with the number of
# saved files.
df_PA.shape

(14511, 22)

In [38]:
# df_PA.to_csv('./data/size1024/df_PA.csv')