## Unzip Data

In [14]:
import os

if 'test_dcm' not in os.listdir('data/new_xray/raw_data'):
    !unzip 'data/new_xray/raw_data/stage_2_test_images.zip' -d 'data/new_xray/raw_data/test_dcm'
if 'train_dcm' not in os.listdir('data/new_xray/raw_data'):
    !unzip 'data/new_xray/raw_data/stage_2_train_images.zip' -d 'data/new_xray/raw_data/train_dcm'

## Merge CSV information

In [17]:
import os
import pandas as pd

path = 'data/new_xray/raw_data/'

# Read each CSV and discard repeated rows in the same expression.
labels = pd.read_csv(f'{path}stage_2_train_labels.csv').drop_duplicates(
    subset=['patientId', 'x', 'y', 'width', 'height', 'Target']
)
class_info = pd.read_csv(f'{path}stage_2_detailed_class_info.csv').drop_duplicates(
    subset=['patientId', 'class']
)

# Left join: every label row is kept and gains the class columns of its patient.
details = pd.merge(labels, class_info, on='patientId', how='left')

details.to_csv('data/new_xray/merged_details.csv', index=False)

## Convert DCM to PNG

The conversion code below is adapted from:
https://github.com/vivek8981/DICOM-to-JPG/blob/master/convert-extract.py

### Convert and extract DCM info:

In [24]:
import pydicom as dicom
import matplotlib.pyplot as plt
import os
import cv2
import PIL
import pandas as pd
import csv
from tqdm.notebook import tqdm


# This CSV is not part of the dataset: download it beforehand from
# https://github.com/vivek8981/DICOM-to-JPG
# Its "Description" column lists the DICOM attributes that dcm_to_png exports.
DICOM_DESCRIPTION_CSV = 'data/new_xray/other/dicom_image_description.csv'
dicom_image_description = pd.read_csv(DICOM_DESCRIPTION_CSV)


def _dicom_element_text(ds, field):
    """Return the printable value of element `field` in dataset `ds`, or '' if absent."""
    try:
        element = ds.data_element(field)
    except KeyError:
        # NOTE(review): pydicom appears to raise KeyError (rather than return None)
        # when the keyword is valid but the element is missing from this file;
        # treat that the same as "not available".
        element = None
    if element is None:
        return ''
    # str(element) looks like "(gggg, eeee) Name   VR: 'value'"; keep only what
    # follows the first ": " and strip the quotes.
    text = str(element).replace("'", "")
    colon = text.find(":")
    if colon < 0:
        return text
    return text[colon + 2:]


def dcm_to_png(dcm_path, output_path, output_csv_path, dicom_image_description, png=True):
    """
    Converts DCM files to PNG files and stores information from DCM to csv.

    Every `*.dcm` file found directly inside `dcm_path` is read, its pixel data
    is written as an image with the same base name into `output_path`, and one
    CSV row holding the requested DICOM attributes is appended per file.
    Files are processed in sorted name order so the CSV row order is reproducible.

    Arguments:
        dcm_path {str} -- The directory where the DCM files are stored.
        output_path {str} -- The directory where the output PNGs will be stored
            (created if it does not exist).
        output_csv_path {str} -- Output directory + name of the CSV containing info from DCM files.
        dicom_image_description {pd.DataFrame} -- DataFrame containing DCM code converter;
            its "Description" column provides the attribute names and the CSV header.

    Keyword Arguments:
        png {bool} -- Write '.png' images when True, '.jpg' otherwise. (default: {True})

    Raises:
        OSError -- If an image could not be written to `output_path`.
    """
    extension = '.png' if png else '.jpg'
    fieldnames = list(dicom_image_description["Description"])

    # cv2.imwrite does not create missing directories; it just reports failure.
    os.makedirs(output_path, exist_ok=True)
    csv_dir = os.path.dirname(output_csv_path)
    if csv_dir:
        os.makedirs(csv_dir, exist_ok=True)

    # Ignore anything that is not a DICOM file (hidden files, stray CSVs, ...).
    dcm_files = sorted(
        name for name in os.listdir(dcm_path) if name.lower().endswith('.dcm')
    )

    with open(output_csv_path, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter=',')
        writer.writerow(fieldnames)
        for dcm_name in tqdm(dcm_files):
            ds = dicom.dcmread(os.path.join(dcm_path, dcm_name))
            image_name = os.path.splitext(dcm_name)[0] + extension
            image_path = os.path.join(output_path, image_name)
            # imwrite signals failure through its return value, not an exception.
            if not cv2.imwrite(image_path, ds.pixel_array):
                raise OSError('Could not write image: ' + image_path)
            writer.writerow([_dicom_element_text(ds, field) for field in fieldnames])

### Execute

In [32]:
# Root folder that receives the converted images and the metadata CSVs.
base_path_output = 'data/new_xray/'

# `path` is the raw-data directory assigned in the CSV-merge cell above.
dcm_path_train = f'{path}train_dcm'
output_path_train = f'{base_path_output}train_png'
output_path_csv_train = f'{base_path_output}train_dcm_png_info.csv'

dcm_path_test = f'{path}test_dcm'
output_path_test = f'{base_path_output}test_png'
output_path_csv_test = f'{base_path_output}test_dcm_png_info.csv'

# Convert the train split first, then the test split.
for dcm_dir, png_dir, info_csv in (
    (dcm_path_train, output_path_train, output_path_csv_train),
    (dcm_path_test, output_path_test, output_path_csv_test),
):
    dcm_to_png(dcm_dir, png_dir, info_csv, dicom_image_description)

HBox(children=(IntProgress(value=0, max=26684), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3000), HTML(value='')))


