In [4]:
# Esse notebook tem como objetivo realizar o pré-processamento dos dados do CBIS-DDSM
# bem como a criação de um dataset com as imagens de treino e teste para o modelo



In [5]:
# Variáveis de ambiente
# ---------------------
# - DATA_PATH: caminho para os dados
# C:\Users\{home}\Data

# imports
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import pydicom
import glob 
import shutil
import random
from tqdm import tqdm

#import image contrast enhancement
from skimage import exposure


# This notebook imports the CBIS-DDSM dataset,
# makes a subset of the dataset for fast training and testing,
# and saves the subset in a new folder.
# The subset will be used in the next notebooks.
# The subset will be composed of 2 folders: train and test.
# Each folder will have 2 subfolders: benign and malignant.
# Each subfolder will have the images of the respective class.

# The dataset is located at: CBISDDSM_PATH

# Root of all datasets: <user home>/Data
home_directory = os.path.expanduser('~')
DATA_PATH = os.path.join(home_directory, 'Data')
# CBIS-DDSM
CBISDDSM_PATH = os.path.join(DATA_PATH, 'CBIS-DDSM', 'manifest-ZkhPvrLo5216730872708713142')
# MIAS
MIAS_PATH = os.path.join(DATA_PATH, 'MIAS')
# INbreast
INBREAST_PATH = os.path.join(DATA_PATH, 'INbreast')

# Check existence of paths.
# The root data directory must already exist; it is never created implicitly.
if not os.path.exists(DATA_PATH):
    raise FileNotFoundError('Data path not found: {}'.format(DATA_PATH))

# The per-dataset directories are created when missing. `exist_ok=True` makes
# this idempotent, so the cell can be re-run safely.
for dataset_path in (CBISDDSM_PATH, MIAS_PATH, INBREAST_PATH):
    os.makedirs(dataset_path, exist_ok=True)

# CBIS-DDSM
# ---------

# The dataset is available at: https://wiki.cancerimagingarchive.net/display/Public/CBIS-DDSM
# It is expected to be already downloaded and extracted under DATA_PATH.
# The images are in DICOM format.

# The dataset ships with a CSV index file: metadata.csv

# Load metadata.
# A freshly created (empty) CBIS-DDSM directory has no metadata.csv, so fail
# with an explicit message instead of an error from deep inside pandas.
metadata_csv_path = os.path.join(CBISDDSM_PATH, 'metadata.csv')
if not os.path.isfile(metadata_csv_path):
    raise FileNotFoundError(
        'CBIS-DDSM metadata not found: {} '
        '(download and extract the dataset first)'.format(metadata_csv_path)
    )

metadata = pd.read_csv(metadata_csv_path)

# check metadata



                                                   Series UID  Collection  \
1.3.6.1.4.1.9590.100.1.2.3741159975118890730213...  CBIS-DDSM         NaN   
1.3.6.1.4.1.9590.100.1.2.1743903611126467477186...  CBIS-DDSM         NaN   
1.3.6.1.4.1.9590.100.1.2.4190816378120534049131...  CBIS-DDSM         NaN   
1.3.6.1.4.1.9590.100.1.2.1886139557101704178030...  CBIS-DDSM         NaN   
1.3.6.1.4.1.9590.100.1.2.2448769975138750902395...  CBIS-DDSM         NaN   

                                                                               3rd Party Analysis  \
1.3.6.1.4.1.9590.100.1.2.3741159975118890730213...  https://doi.org/10.7937/K9/TCIA.2016.7O02S9CY   
1.3.6.1.4.1.9590.100.1.2.1743903611126467477186...  https://doi.org/10.7937/K9/TCIA.2016.7O02S9CY   
1.3.6.1.4.1.9590.100.1.2.4190816378120534049131...  https://doi.org/10.7937/K9/TCIA.2016.7O02S9CY   
1.3.6.1.4.1.9590.100.1.2.1886139557101704178030...  https://doi.org/10.7937/K9/TCIA.2016.7O02S9CY   
1.3.6.1.4.1.9590.100.1.2.2448769

In [6]:
# Import CBIS-DDSM data
# csv


# import all csv in the folder
def import_csv(path):
    """Collect the paths of all CSV files found under ``path``.

    The directory tree rooted at ``path`` is walked recursively with
    ``os.walk`` and every file whose name ends with ``.csv`` (exact,
    case-sensitive match) is recorded.

    Args:
        path: Root directory to search.

    Returns:
        list[str]: One entry per matching file, each built by joining the
        directory being visited with the file name, in ``os.walk`` order.
    """
    return [
        os.path.join(folder, name)
        for folder, _subfolders, names in os.walk(path)
        for name in names
        if name.endswith(".csv")
    ]

# save each csv as a dataframe
def save_csv_as_df(csv_list):
    """Load every CSV file in ``csv_list`` into a pandas DataFrame.

    Despite the name, nothing is written to disk: each path is only read
    with ``pd.read_csv`` using its default options.

    Args:
        csv_list: Iterable of CSV file paths. Any iterable is accepted
            (list, tuple, generator, ...), not only sized sequences.

    Returns:
        list[pandas.DataFrame]: One DataFrame per path, in input order.
    """
    # Iterate directly instead of indexing via range(len(...)); this also
    # allows generators and other non-indexable iterables.
    return [pd.read_csv(csv_path) for csv_path in csv_list]

# print header 
def print_header(df_list):
    """Print a preview of every DataFrame in ``df_list``.

    For each item, the result of its ``head()`` method is written to
    standard output with ``print``, in input order.

    Args:
        df_list: Iterable of objects exposing ``head()`` (e.g. pandas
            DataFrames). Any iterable is accepted, not only sized sequences.

    Returns:
        None
    """
    # Iterate directly instead of indexing via range(len(...)); this also
    # allows generators and other non-indexable iterables.
    for frame in df_list:
        print(frame.head())

