In [16]:
# This notebook pre-processes the CBIS-DDSM data:
# it loads the images with their respective labels and saves the result in a CSV file

In [17]:
# Environment variables
# ---------------------
# - DATA_PATH: path to the data
# C:\Users\{home}\Data

# imports
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

import glob 


#import image contrast enhancement
from skimage import exposure


# This notebook imports the CBIS-DDSM dataset,
# makes a subset of the dataset for fast training and testing,
# and saves the subset in a new folder.
# The subset will be used in the next notebooks.
# The subset will be composed of 2 folders: train and test.
# Each folder will have 2 subfolders: benign and malignant.
# Each subfolder will hold the images of the respective class.

# The dataset is available at: CBISDDSM_PATH

# Resolve the data root.
# DATA_PATH can be overridden through the DATA_PATH environment variable;
# when the variable is unset or empty, it falls back to ~/Data.
home_directory = os.path.expanduser('~')
DATA_PATH = os.environ.get('DATA_PATH') or os.path.join(home_directory, 'Data')

# CBIS-DDSM root folder
CBISDDSM_PATH = os.path.join(DATA_PATH, 'CBIS-DDSM')

# Folders that hold the DICOM case directories of each CALC split
CALC_TEST_PATH = os.path.join(CBISDDSM_PATH, 'CALC-TEST', 'CBIS-DDSM')
CALC_TRAIN_PATH = os.path.join(CBISDDSM_PATH, 'CALC-TRAIN', 'CBIS-DDSM')

# metadata.csv of each CALC split
train_calc = pd.read_csv(os.path.join(CBISDDSM_PATH, 'CALC-TRAIN', 'metadata.csv'))
test_calc = pd.read_csv(os.path.join(CBISDDSM_PATH, 'CALC-TEST', 'metadata.csv'))

# Case-description (annotation) files of each CALC split
train_calc_annotations = pd.read_csv(os.path.join(CBISDDSM_PATH, 'calc_case_description_train_set.csv'))
test_calc_annotations = pd.read_csv(os.path.join(CBISDDSM_PATH, 'calc_case_description_test_set.csv'))

# Build, for every annotated case, the local directory that holds its files.
# The file name stored in 'cropped image file path' changed, so only the
# directory part of that path is kept and re-rooted under the split folder.
def _local_case_dir(split_root, relative_file_path):
    """Return the directory of `relative_file_path` under `split_root`, with a trailing '/'."""
    # everything before the last '/' (empty string when there is no '/')
    parent_dir = relative_file_path.rpartition('/')[0]
    # the trailing '/' lets a file pattern be appended directly to the result
    return os.path.join(split_root, parent_dir) + '/'


train_calc_annotations['dir'] = train_calc_annotations['cropped image file path'].apply(
    lambda relative_path: _local_case_dir(CALC_TRAIN_PATH, relative_path)
)
test_calc_annotations['dir'] = test_calc_annotations['cropped image file path'].apply(
    lambda relative_path: _local_case_dir(CALC_TEST_PATH, relative_path)
)

# show the first directory of each split as a sanity check
print(train_calc_annotations['dir'][0])
print(test_calc_annotations['dir'][0])

def _find_cropped_files(case_dirs):
    """List the .dcm files of every case directory and pick the smallest one.

    The smallest file (by size on disk) of each directory is the one stored
    as the "cropped" file -- presumably the cropped ROI image is always
    smaller than the other DICOM files of the case; confirm against the data.

    Parameters
    ----------
    case_dirs : iterable of str
        Directory paths ending with a path separator.

    Returns
    -------
    (file_cases, cropped_files) : tuple of lists
        `file_cases[i]` is the sorted list of .dcm paths found directly in
        `case_dirs[i]`; `cropped_files[i]` is the smallest of those files.

    Raises
    ------
    FileNotFoundError
        If a directory contains no .dcm file.
    """
    file_cases = []
    cropped_files = []
    for case_dir in case_dirs:
        # glob.escape keeps glob metacharacters in the directory name literal.
        # `recursive=True` is not passed: it only matters for '**' patterns.
        # Sorting makes the result independent of the OS listing order.
        dcm_files = sorted(glob.glob(glob.escape(case_dir) + '*.dcm'))
        if not dcm_files:
            raise FileNotFoundError('no .dcm file found in {!r}'.format(case_dir))
        file_cases.append(dcm_files)
        # min() returns the first smallest entry, like files[np.argmin(sizes)]
        cropped_files.append(min(dcm_files, key=os.path.getsize))
    return file_cases, cropped_files


# for the train dataset
file_cases, cropped_files = _find_cropped_files(train_calc_annotations['dir'])
print(file_cases[0])

# add the cropped files to the dataframe
train_calc_annotations['cropped_file'] = cropped_files

# delete columns that are not necessary
# NOTE(review): unlike the test split below, 'cropped image file path' is kept
# here, so the two exported tables have different columns -- confirm intent.
train_calc_annotations = train_calc_annotations.drop(
    columns=['ROI mask file path', 'image file path']
)

# for the test dataset
file_cases, cropped_files = _find_cropped_files(test_calc_annotations['dir'])
print(file_cases[0])

# add the cropped files to the dataframe
test_calc_annotations['cropped_file'] = cropped_files

# delete columns that are not necessary
test_calc_annotations = test_calc_annotations.drop(
    columns=['ROI mask file path', 'cropped image file path', 'image file path']
)

# Write each annotation table to its own CSV file in the current working
# directory, without the DataFrame index column.
for annotations, csv_name in (
    (train_calc_annotations, 'train_calc_annotations2.csv'),
    (test_calc_annotations, 'test_calc_annotations2.csv'),
):
    annotations.to_csv(csv_name, index=False)

/home/vdgaete-pc/Data/CBIS-DDSM/CALC-TRAIN/CBIS-DDSM/Calc-Training_P_00005_RIGHT_CC_1/1.3.6.1.4.1.9590.100.1.2.328778919012412769218080124214088709081/1.3.6.1.4.1.9590.100.1.2.393344010211719049419601138200355094682/
/home/vdgaete-pc/Data/CBIS-DDSM/CALC-TEST/CBIS-DDSM/Calc-Test_P_00038_LEFT_CC_1/1.3.6.1.4.1.9590.100.1.2.161465562211359959230647609981488894942/1.3.6.1.4.1.9590.100.1.2.419081637812053404913157930753972718515/
['/home/vdgaete-pc/Data/CBIS-DDSM/CALC-TRAIN/CBIS-DDSM/Calc-Training_P_00005_RIGHT_CC_1/1.3.6.1.4.1.9590.100.1.2.328778919012412769218080124214088709081/1.3.6.1.4.1.9590.100.1.2.393344010211719049419601138200355094682/1-1.dcm', '/home/vdgaete-pc/Data/CBIS-DDSM/CALC-TRAIN/CBIS-DDSM/Calc-Training_P_00005_RIGHT_CC_1/1.3.6.1.4.1.9590.100.1.2.328778919012412769218080124214088709081/1.3.6.1.4.1.9590.100.1.2.393344010211719049419601138200355094682/1-2.dcm']
['/home/vdgaete-pc/Data/CBIS-DDSM/CALC-TEST/CBIS-DDSM/Calc-Test_P_00038_LEFT_CC_1/1.3.6.1.4.1.9590.100.1.2.1614655622

In [18]:
#