In [1]:
# This notebook preprocesses the CBIS-DDSM data:
# it loads the images with their respective labels and saves the result to a CSV file

In [2]:
# Path configuration
# ------------------
# - DATA_PATH: path to the data (built from the user's home directory below,
#   not read from an OS environment variable), e.g.
#   C:\Users\{home}\Data

# imports
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import pydicom
import glob 
import shutil
import random
from tqdm import tqdm

#import image contrast enhancement
from skimage import exposure


# This notebook imports the CBIS-DDSM dataset,
# makes a subset of the dataset for fast training and testing,
# and saves the subset in a new folder.
# The subset will be used in the next notebooks.
# The subset will be composed of 2 folders: train and test;
# each folder will have 2 subfolders: benign and malignant;
# each subfolder will have the images of the respective class.

# The dataset is available at: CBISDDSM_PATH

# All datasets are expected under "<user home>/Data".
home_directory = os.path.expanduser('~')
DATA_PATH = os.path.join(home_directory, 'Data')

# CBIS-DDSM root, plus one folder per calcification split.
CBISDDSM_PATH = os.path.join(DATA_PATH, 'CBIS-DDSM')
CALC_TEST_PATH, CALC_TRAIN_PATH = (
    os.path.join(CBISDDSM_PATH, split_folder)
    for split_folder in ('CALC-TEST', 'CALC-TRAIN')
)
# in CBIS-DDSM there are 2 folders for CALC
# and 2 folders for MASS
# each folder has the images and the metadata

# the metadata are in CSV format (metadata.csv)
# the images are in DICOM (.dcm) format

# the metadata are in the root of the folder
# the images are in subfolders in CBIS-DDSM

# import train and test metadata

def _read_cbis_csv(*relative_parts):
    """Read a CSV located at ``CBISDDSM_PATH`` joined with ``relative_parts``."""
    return pd.read_csv(os.path.join(CBISDDSM_PATH, *relative_parts))


# Per-split metadata tables for the calcification (CALC) cases.
# No index_col is given, so the index is whatever pandas infers from the file.
train_calc = _read_cbis_csv('CALC-TRAIN', 'metadata.csv')
test_calc = _read_cbis_csv('CALC-TEST', 'metadata.csv')

# Case-description (annotation) tables for the same two splits.
train_calc_annotations = _read_cbis_csv('calc_case_description_train_set.csv')
test_calc_annotations = _read_cbis_csv('calc_case_description_test_set.csv')

# Original path columns of the case-description tables. They are superseded by
# the resolved 'cropped_file' column and are removed from BOTH splits; the
# train split used to keep a stale 'cropped image file path' column that the
# test split already dropped.
_STALE_PATH_COLUMNS = ['ROI mask file path', 'cropped image file path', 'image file path']


def _series_location_lookup(metadata):
    """Map each series identifier of ``metadata`` to its 'File Location' value.

    The frame's index is consulted first, which reproduces the original lookup
    (``metadata.index == case``, first match wins). The metadata is read
    without ``index_col``, so the index may be a plain integer range that can
    never equal an identifier string; in that case the identifiers are taken
    from a 'Series UID' column when one exists.

    NOTE(review): the 'Series UID' column name is assumed from the usual
    layout of the download manifest - confirm against the actual metadata.csv.
    """
    locations = metadata['File Location']
    lookup = {}
    for series_id, location in zip(metadata.index, locations):
        lookup.setdefault(series_id, location)
    if 'Series UID' in metadata.columns:
        for series_id, location in zip(metadata['Series UID'], locations):
            lookup.setdefault(series_id, location)
    return lookup


def _series_directory(root, file_location):
    """Turn a manifest-relative 'File Location' into a directory under ``root``.

    Both '/' and '\\' are accepted as separators and a leading './' (or '.\\')
    is removed only when present, so the result uses the separator of the
    running platform instead of a hard-coded backslash.
    """
    relative = file_location.replace('\\', '/')
    if relative.startswith('./'):
        relative = relative[2:]
    return os.path.join(root, *relative.split('/'))


def _smallest_dicom(series_dir):
    """Return the smallest ``.dcm`` file found (recursively) under ``series_dir``.

    Per the notebook's working assumption, a series folder holds the ROI mask
    and the cropped image, and the cropped image is the smaller file. On a
    size tie the first file returned by ``glob`` is kept.

    Raises:
        FileNotFoundError: if no ``.dcm`` file exists under ``series_dir``.
    """
    candidates = glob.glob(os.path.join(series_dir, '**', '*.dcm'), recursive=True)
    if not candidates:
        raise FileNotFoundError('No .dcm files found under {!r}'.format(series_dir))
    return min(candidates, key=os.path.getsize)


def _attach_cropped_files(annotations, metadata, root):
    """Return a copy of ``annotations`` with 'Case' and 'cropped_file' columns.

    Args:
        annotations: case-description table; must have 'ROI mask file path'.
        metadata: manifest table; must have 'File Location'.
        root: folder of the split that the 'File Location' values are relative to.

    Returns:
        A new DataFrame with
        - 'Case': the second-to-last '/'-separated component of the ROI mask path,
        - 'cropped_file': path of the smallest .dcm file of that case,
        and without the original path columns listed in ``_STALE_PATH_COLUMNS``.

    Raises:
        KeyError: if a case has no matching row in ``metadata``.
        FileNotFoundError: if a case folder contains no .dcm file.
    """
    annotated = annotations.copy()
    annotated['Case'] = annotated['ROI mask file path'].apply(
        lambda mask_path: mask_path.split('/')[-2]
    )

    locations = _series_location_lookup(metadata)
    cropped_paths = []
    for case in annotated['Case']:
        if case not in locations:
            raise KeyError('No metadata row found for case {!r}'.format(case))
        cropped_paths.append(_smallest_dicom(_series_directory(root, locations[case])))
    annotated['cropped_file'] = cropped_paths

    # Only drop the columns that are actually present, so a table lacking one
    # of them does not fail here.
    stale = [column for column in _STALE_PATH_COLUMNS if column in annotated.columns]
    return annotated.drop(columns=stale)


# Resolve the cropped image of every annotated case, once per split, each
# against the metadata and folder of its own split.
train_calc_annotations = _attach_cropped_files(train_calc_annotations, train_calc, CALC_TRAIN_PATH)
test_calc_annotations = _attach_cropped_files(test_calc_annotations, test_calc, CALC_TEST_PATH)

# Write both processed annotation tables as CSV files into the current working
# directory, leaving the row index out of the output.
for frame, csv_name in (
    (train_calc_annotations, 'train_calc_annotations.csv'),
    (test_calc_annotations, 'test_calc_annotations.csv'),
):
    frame.to_csv(csv_name, index=False)