## Create subdirectory containing images for each class

In [22]:
import os
import random
import pandas as pd
from shutil import copyfile


In [2]:
# Load both label tables; the paths are relative to the notebook's working directory.
main_data, extra_data = (
    pd.read_csv(csv_path)
    for csv_path in ('data/data_labels_mainData.csv', 'data/data_labels_extraData.csv')
)

In [3]:
# Distinct cell type names present in the main label table.
main_data['cellTypeName'].unique().tolist()

['fibroblast', 'inflammatory', 'others', 'epithelial']

In [4]:
# Cell type name -> integer class index, numbered in the order listed here.
# NOTE(review): assumes the CSV's cellType column uses this same numbering - confirm.
cellTypeNametoIdx = {
    name: idx
    for idx, name in enumerate(('fibroblast', 'inflammatory', 'epithelial', 'others'))
}

In [5]:
# Remove the three named columns in place; they are not read again in the cells below.
main_data.drop(columns=['InstanceID', 'patientID', 'cellTypeName'], inplace=True)

In [6]:
# Image file names grouped by the integer cellType code (0-3).
fibroblast_imgs = main_data.loc[main_data.cellType == 0, 'ImageName'].tolist()
inflamatory_imgs = main_data.loc[main_data.cellType == 1, 'ImageName'].tolist()
epithelial_imgs = main_data.loc[main_data.cellType == 2, 'ImageName'].tolist()
others_imgs = main_data.loc[main_data.cellType == 3, 'ImageName'].tolist()

In [7]:
# Print the number of images per class (one per line), then the grand total.
class_counts = [
    len(names)
    for names in (fibroblast_imgs, inflamatory_imgs, epithelial_imgs, others_imgs)
]
print(*class_counts, sep='\n')
print(sum(class_counts))

1888
2543
4079
1386
9896


In [8]:
# Image-name lists indexed by class number: img_list[k] holds the names for class k.
img_list = [
    fibroblast_imgs,
    inflamatory_imgs,
    epithelial_imgs,
    others_imgs,
]

## Create a separate directory for each class's image data

In [9]:
# Number of cell-type classes.
no_classes = 4

# Path fragments carry a leading '/' because paths in this notebook are built
# by plain string concatenation rather than os.path.join.
DATA_DIR = '/data'
CELL_TYPE_DIR = '/cell_type'
TRAINING_DIR = '/training'
TESTING_DIR = '/testing'

# Absolute locations are anchored at the notebook's working directory.
CUR_DIR = os.getcwd()
IMG_SOURCE_DIR = ''.join([CUR_DIR, DATA_DIR, '/patch_images'])

## Create data directory for CELL TYPE classification problem

## Create a separate directory for each class's image data

In [10]:
# Root of the cell-type problem, and the 'source' folder that holds one
# sub-folder per class, named by the class number.
SOURCE_DIR = f'{CUR_DIR}{DATA_DIR}{CELL_TYPE_DIR}'
CELL_TYPE_SOURCE_DIR = f'{SOURCE_DIR}/source'
FIBROBLAST_SUB_DIR = f'{CELL_TYPE_SOURCE_DIR}/0'
INFLAMATORY_SUB_DIR = f'{CELL_TYPE_SOURCE_DIR}/1'
EPITHELIAL_SUB_DIR = f'{CELL_TYPE_SOURCE_DIR}/2'
OTHERS_SUB_DIR = f'{CELL_TYPE_SOURCE_DIR}/3'

In [11]:
# Create the cell-type root and its 'source' folder.
# exist_ok=True makes the cell safe to re-run and avoids the race between
# checking for the folder and creating it; missing parents are created too.
os.makedirs(SOURCE_DIR, exist_ok=True)
os.makedirs(CELL_TYPE_SOURCE_DIR, exist_ok=True)

In [12]:
# Create one source sub-folder per class.
# exist_ok=True makes the cell safe to re-run; missing parents are created too.
for class_dir in (
    FIBROBLAST_SUB_DIR,
    INFLAMATORY_SUB_DIR,
    EPITHELIAL_SUB_DIR,
    OTHERS_SUB_DIR,
):
    os.makedirs(class_dir, exist_ok=True)

In [13]:
def copy_img_data(class_number, img_names=None, source_dir=None, dest_root=None):
    """Copy every image of one class into that class's sub-folder.

    Each file ``<source_dir>/<name>`` is copied to
    ``<dest_root>/<class_number>/<name>``. The destination folder is created
    if it does not exist yet.

    Args:
        class_number: Integer class index; also the destination folder name.
        img_names: File names to copy. Defaults to ``img_list[class_number]``.
        source_dir: Folder holding the original images. Defaults to
            ``IMG_SOURCE_DIR``.
        dest_root: Folder that contains the per-class sub-folders. Defaults to
            ``CELL_TYPE_SOURCE_DIR``.

    Raises:
        OSError: If a source file is missing or a copy fails.
    """
    # Fall back to the notebook-level globals so existing single-argument
    # calls keep working; explicit arguments allow reuse for other layouts.
    if img_names is None:
        img_names = img_list[class_number]
    if source_dir is None:
        source_dir = IMG_SOURCE_DIR
    if dest_root is None:
        dest_root = CELL_TYPE_SOURCE_DIR

    dest_dir = os.path.join(dest_root, str(class_number))
    os.makedirs(dest_dir, exist_ok=True)
    for name in img_names:
        copyfile(os.path.join(source_dir, name), os.path.join(dest_dir, name))

In [14]:
# Populate the per-class source folders, one class index at a time (0 .. no_classes - 1).
for class_number in range(no_classes):
    copy_img_data(class_number)

## Split each class's image data into train/test directories

In [15]:
# Create separate root folders for the training and testing splits. Per the
# original note, the training folder holds both the training and the
# validation data that are split off from the source data.
TRAIN_DIR = SOURCE_DIR + TRAINING_DIR
TEST_DIR = SOURCE_DIR + TESTING_DIR
# exist_ok=True makes the cell safe to re-run; missing parents are created too.
os.makedirs(TRAIN_DIR, exist_ok=True)
os.makedirs(TEST_DIR, exist_ok=True)

In [16]:
# Create one training and one testing sub-folder per class, named by the
# class number. exist_ok=True makes the cell safe to re-run.
for i in range(no_classes):
    train_dir_class_i = os.path.join(TRAIN_DIR, str(i))
    test_dir_class_i = os.path.join(TEST_DIR, str(i))

    os.makedirs(train_dir_class_i, exist_ok=True)
    os.makedirs(test_dir_class_i, exist_ok=True)

In [17]:
# Named train/test folders for each class; the numeric suffix is the class number.
FIBROBLAST_SUB_DIR_TRAIN, FIBROBLAST_SUB_DIR_TEST = f'{TRAIN_DIR}/0', f'{TEST_DIR}/0'
INFLAMATORY_SUB_DIR_TRAIN, INFLAMATORY_SUB_DIR_TEST = f'{TRAIN_DIR}/1', f'{TEST_DIR}/1'
EPITHELIAL_SUB_DIR_TRAIN, EPITHELIAL_SUB_DIR_TEST = f'{TRAIN_DIR}/2', f'{TEST_DIR}/2'
OTHERS_SUB_DIR_TRAIN, OTHERS_SUB_DIR_TEST = f'{TRAIN_DIR}/3', f'{TEST_DIR}/3'

In [23]:
#Function to split source data into training/testing by split_size
def split_data(SOURCE, TRAINING, TESTING, SPLIT_SIZE, seed=None):
    """Randomly split the files of one folder into training and testing copies.

    The regular files found directly in ``SOURCE`` are shuffled; the first
    ``int(SPLIT_SIZE * count)`` are copied to ``TRAINING`` and the rest to
    ``TESTING``. ``SOURCE`` itself is left untouched.

    Files already present in ``TRAINING`` / ``TESTING`` are not removed, so
    calling this again with a different shuffle can leave the same file in
    both folders. Clear the destinations first or pass a fixed ``seed``.

    Args:
        SOURCE: Folder containing the files to split.
        TRAINING: Destination folder for the training share (created if missing).
        TESTING: Destination folder for the testing share (created if missing).
        SPLIT_SIZE: Fraction of files that goes to ``TRAINING``, within [0, 1].
        seed: Optional seed for a reproducible split. With ``None`` the
            module-level ``random`` state is used, as before.

    Raises:
        ValueError: If ``SPLIT_SIZE`` is outside [0, 1].
    """
    if not 0 <= SPLIT_SIZE <= 1:
        raise ValueError('SPLIT_SIZE must be between 0 and 1, got %r' % (SPLIT_SIZE,))

    # Keep regular files only: os.listdir also returns sub-directories (for
    # example Jupyter's .ipynb_checkpoints), which copyfile cannot copy.
    # Sorting removes the arbitrary listing order so a seed fully fixes the split.
    files = sorted(
        name for name in os.listdir(SOURCE)
        if os.path.isfile(os.path.join(SOURCE, name))
    )

    rng = random if seed is None else random.Random(seed)
    shuffled = rng.sample(files, len(files))
    n_train = int(SPLIT_SIZE * len(shuffled))

    os.makedirs(TRAINING, exist_ok=True)
    os.makedirs(TESTING, exist_ok=True)
    for name in shuffled[:n_train]:
        copyfile(os.path.join(SOURCE, name), os.path.join(TRAINING, name))
    for name in shuffled[n_train:]:
        copyfile(os.path.join(SOURCE, name), os.path.join(TESTING, name))

# Fraction of each class's images copied to training; the remainder goes to testing.
# NOTE(review): split_data shuffles anew on every call and nothing here clears
# the destination folders, so re-running this cell may put the same image in
# both training and testing - confirm the folders are empty before re-running.
split_size = .8
for i in range(no_classes):
    class_i_source_dir, class_i_training_dir, class_i_test_dir = (
        f'{root_dir}/{i}' for root_dir in (CELL_TYPE_SOURCE_DIR, TRAIN_DIR, TEST_DIR)
    )
    split_data(class_i_source_dir, class_i_training_dir, class_i_test_dir, split_size)

## Create data directory for isCancerous classification problem

In [None]:
# Path fragment for the isCancerous problem; the leading '/' matches the other
# *_DIR fragments, which are joined by string concatenation.
IS_CANCEROUS_DIR = '/is_cancerous'

In [None]:
# Per-class folder fragments for the isCancerous problem.
# NOTE(review): positive maps to '/0' and negative to '/1' here. If the label
# column encodes cancerous as 1, these two look swapped - confirm before use.
POSITIVE_CANCEROUS_DIR = '/0'
NEGATIVE_CANCEROUS_DIR = '/1'