The purpose of this sheet is to take files in two different folders and shuffle them into sub-groups of files. The sub-groups of files should contain a mix of both folders, varying from 25% to 75% of the total count of 200 images.


In [17]:
import shutil
import os
import random
from pathlib import Path
# n = number of bags, an integer
n = 10
# c = number of classes
c = 2
# b = bag size plus or minus 50
b = 150

Very important to set the "bags" directory in the next cell correctly

In [2]:
#in_bag_folder is the input location, it should contain class name folders directly below, follwed by image name folders, followed by positive and negative folders of patches
in_bag_folder = os.path.join("d://","bin_DL_project","dsmil-wsi","WSI","CP_PDAC","annotated_bags") #possibly dynamically generate the bag folder name per run

#each output bag folder is emptied when used in create_bags
#folder to store output mixed bags
mixed_bag_folder = os.path.join("d://","bin_DL_project","dsmil-wsi","WSI","mixed_bags", "single")
os.makedirs(mixed_bag_folder, exist_ok=True)

In [44]:
classList = os.listdir(in_bag_folder)
#Sanity check that you are looking in the right place
print("The following is a list of classes")
print(classList)

def create_list_of_lists(classList):
    dict_classes = dict.fromkeys(classList, [])
    tileList_per_class = []
    for classFolder in classList:
        tileList = []
        print(classFolder)
        classFolderPath = os.path.join(in_bag_folder,classFolder)
        slideFolders = os.listdir(classFolderPath)
        positive_tiles = []
        negative_tiles = []
        #cycle through all slides of a given class to sort their patches
        for slideFolder in slideFolders:
            slideFolderPath = os.path.join(classFolderPath,slideFolder)
            #print(slideFolderPath)
            pos_neg_folders = [f for f in Path(slideFolderPath).resolve().glob('**/*') if not f.is_file()]
            #the subclasses, at this point, will be either "positive" or "negative"
            #positive patches are from within pathologist created annotations
            for sub_class_folder in pos_neg_folders:
                #print(sub_class_folder)
                image_files = os.listdir(sub_class_folder)
                #sort positive and negative images into different lists within classTiles
                #POSSIBLE WEAKNESS IF POSITIVE IS USED ELSEWHERE IN FILE PATH
                if(sub_class_folder.parts[-1] == "positive"):
                    for file in image_files:
                        full_path = os.path.join(sub_class_folder,file)
                        #print(full_path)
                        positive_tiles.append(full_path)
                else:
                    for file in image_files:
                        full_path = os.path.join(sub_class_folder,file)
                        negative_tiles.append(full_path)
                    
        tileList.append(positive_tiles)
        tileList.append(negative_tiles)
        tileList_per_class.append(tileList)
    class_tiles_dict = dict(zip(classList, tileList_per_class))
    return class_tiles_dict
    # at this point the tileList has multiple lists, one per class.
    # each sub-list in tileList contains the names of all of the tiles for that class


The following is a list of classes
['CP', 'PDAC']


In [47]:
# bag_destination_path - location to move tiles to when creating bags
# list_of_tiles - a list of lists, with tiles separated by class
# number_of_bags - create bags in distinct folders per class (PDAC or CP), up to this number
def create_bags(bag_destination_path, dict_of_tiles, number_of_bags=3):
    
    #Currently makes the assumption that there are ONLY TWO CLASSES
    
    broken_up_lists = [] #list of lists that have been chopped up randomly
    #Cycle through each list in list_of_tiles, chopping it into pieces
    for classification in dict_of_tiles.keys():
        list_of_tiles = dict_of_tiles[classification]
        for tile_class in range(0,len(list_of_tiles)):
            one_class_of_tiles = list_of_tiles[tile_class]
            #print(len(one_class_of_tiles))
            random.shuffle(one_class_of_tiles)

            single_class_batches = []
            #Stop breaking up the list once there are less than 25 tiles remaining
            while (len(one_class_of_tiles)>25):
                size = int(b/2+random.randint(-25,25))
                #print(size)
                single_class_batches.append(one_class_of_tiles[-size:])
                del one_class_of_tiles[-size:]
                #print(len(one_class_of_tiles))    
            #print(len(single_class_batches))

            #catch any remaining tiles and add them to the shortest list
            single_class_batches.sort(key=len)
            single_class_batches[0].extend(one_class_of_tiles)

            broken_up_lists.append(single_class_batches)
            #at this point, one entire class of tiles should be broken up into 50 to 100 tile batches and stored in broken_up_lists

        #sort in different directions so that shortest and longest bags of each class are paired
        #broken_up_lists[0].sort()
        #broken_up_lists[1].sort(reverse=True)

        for bag in range(0,number_of_bags):
            #create a new folder for each bag
            destination = os.path.join(bag_destination_path,classification,"bag_"+str(bag+1))
            if os.path.isdir(destination):
                shutil.rmtree(destination)
            os.makedirs(destination, exist_ok=True)

            #Cycle through each sub-class (expected 2, positive and negative), and for each class, select one portion of the broken up list
            # add files corresponding to those elements to the bag
            for x in range(0,c):
                #print(list_of_tiles[x])
                #choose one of the sub-lists, should be between 50 and 100 names
                #DANGER, I DO NOT KNOW WHY I NEED [0] HERE
                sub_list = random.sample(broken_up_lists[x],1)[0]

                for choice in sub_list:
                    #print(choice)
                    #choice_path = os.path.join(classFolderPath,choice)
                    slideName = os.path.basename(os.path.split(choice)[-2])
                    print(slideName)

                    destination_path = os.path.join(destination, slideName+"_"+os.path.basename(choice))
                    #print(choice_path)
                    #print(destination)
                    shutil.copyfile(choice, destination_path)
            
            
    

In [48]:
dict_of_tiles = create_list_of_lists(classList)
create_bags(mixed_bag_folder, dict_of_tiles, 20)

CP
PDAC
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
ne

negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
p

positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
p

negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
n

positive
positive
positive
positive
positive
positive
positive
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
p

positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
n