In [4]:
import os
import glob
import pandas as pd
import numpy as np
import random
from collections import Counter
from vit_keras import vit, utils
import tensorflow_addons as tfa
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras import Model
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import tensorflow.keras.metrics
from PIL import Image

In [2]:
working_dir = '' # PUT YOUR HOME HERE
processed_data = working_dir + "processed/"

MODEL_PARAMETERS = {"model_name" : "Mohs_BCC_SCC_2023_propsplit", #name of the model of my choosing    
                    "image_size" : 512,
                    "loss" : "categorical_crossentropy", #
                    "class_mode": "categorical", #
                    "output_layers" : 3, #
                    "output_activation" : "softmax", #
                    "activation" : "relu", 
                    "optimizer" : "sgd",
                    "num_epochs" : 30,
                    "batch_size" : 4}

experiment_folder = working_dir + MODEL_PARAMETERS["model_name"] + "/"

In [5]:
if not os.path.isdir(experiment_folder):
    print("Making a new folder for the new experiment")
    os.mkdir(experiment_folder)
else :
    print("Caution : Will overwrite existing data and models")

# Write out parameters
with open(experiment_folder+'MODEL_PARAMETERS.txt','w') as data: 
      data.write(str(MODEL_PARAMETERS))

Making a new folder for the new experiment


In [None]:
Patch_table = pd.read_csv(working_dir + "Mohs_Patch_Data.csv")
Patch_table

In [None]:
#putting all photos in one table with full path
image_pattern = raw_data + "/**/*.jpg"
file_list = glob.glob(image_pattern, recursive=True)
Training_table = pd.DataFrame(file_list,columns = ["full_path"])
Training_table["Image_File"] = [path.split("/")[10].replace(".jpg", "") for path in Training_table["full_path"]]
Training_table = Training_table[~Training_table["Image_File"].str.contains("_2x_")]
Training_table.reset_index(inplace=True, drop = True)
Training_table

In [None]:
patch_set = set(Patch_table["Image_File"])
full_image_set = set(Training_table["Image_File"])

print("In Google sheet, not in folder list")
print(patch_set.difference(full_image_set))

print("In folder list, not on google sheet")
print(full_image_set.difference(patch_set))

In [None]:
Training_table = pd.merge(Training_table, Patch_table, on='Image_File')
Training_table

In [None]:
Training_table[Training_table.isnull().any(axis=1)]

In [None]:
# Iterate through the Training Table (each row)
# Read in the image in "full_path"
# Take each patch element (1 - 6)
# Write our a new file with filename that has all metadata
crop_dictionary = {}

crop_dictionary[1] =  {"x1" :0, "y1" : 0, "x2" : 682,  "y2" : 766}
crop_dictionary[2] = {"x1" : 682, "y1" : 0, "x2" : 1364,  "y2" : 766}
crop_dictionary[3] = {"x1" :1364, "y1" : 0, "x2" : 2046,  "y2" : 766}
crop_dictionary[4] = {"x1" :0, "y1" : 766, "x2" : 682,  "y2" : 1532}
crop_dictionary[5] = {"x1" :682, "y1" : 766, "x2" : 1364,  "y2" : 1532}
crop_dictionary[6] = {"x1" :1364, "y1" : 766, "x2" : 2046,  "y2" : 1532}

dir = processed_data
for f in os.listdir(dir):
    os.remove(os.path.join(dir, f))

for index in Training_table.index:

    if index % 100 == 0:
        print("Starting " + str(index))

    out_filename = Training_table["Image_File"][index]
    
    img = Image.open(Training_table["full_path"][index])

    for i in range(1,7):

        out_filename_loop = out_filename + "_p_" + str(i) + "_" + Training_table[str(i)][index] + ".jpg"
        
        cropped_image = img.crop((crop_dictionary[i]["x1"], 
                                    crop_dictionary[i]["y1"], 
                                    crop_dictionary[i]["x2"], 
                                    crop_dictionary[i]["y2"]))

        cropped_image.save(processed_data + out_filename_loop)

In [6]:
patches = os.listdir(processed_data)
full_path = [processed_data + patch for patch in patches]
Training_table = pd.DataFrame({"full_path" : full_path, "image_name" : patches})
Training_table

Unnamed: 0,full_path,image_name
0,/home/ubuntu/Data/MOHS/processed/7_A_10x_N_13_...,7_A_10x_N_13_p_2_N.jpg
1,/home/ubuntu/Data/MOHS/processed/2_A_10x_BCC_6...,2_A_10x_BCC_6_p_4_B.jpg
2,/home/ubuntu/Data/MOHS/processed/3_A_10x_N_21_...,3_A_10x_N_21_p_2_N.jpg
3,/home/ubuntu/Data/MOHS/processed/12_B_10x_N_25...,12_B_10x_N_25_p_5_N.jpg
4,/home/ubuntu/Data/MOHS/processed/20_A_10x_N_12...,20_A_10x_N_12_p_5_Empt.jpg
...,...,...
8977,/home/ubuntu/Data/MOHS/processed/12_A_10x_BCC_...,12_A_10x_BCC_10_p_1_B.jpg
8978,/home/ubuntu/Data/MOHS/processed/12_B_10x_BCC_...,12_B_10x_BCC_44_p_5_B.jpg
8979,/home/ubuntu/Data/MOHS/processed/11_D_10x_BCC_...,11_D_10x_BCC_8_p_2_B.jpg
8980,/home/ubuntu/Data/MOHS/processed/3_A_10x_N_23_...,3_A_10x_N_23_p_2_N.jpg


In [7]:
Training_table["metadata"] = [image_name.replace(".jpg", "") for image_name in Training_table["image_name"]]

Training_table = Training_table[ Training_table[ "metadata" ].str.contains( "Empt" )==False ]
Training_table.reset_index(drop = True,inplace = True)

Training_table

Unnamed: 0,full_path,image_name,metadata
0,/home/ubuntu/Data/MOHS/processed/7_A_10x_N_13_...,7_A_10x_N_13_p_2_N.jpg,7_A_10x_N_13_p_2_N
1,/home/ubuntu/Data/MOHS/processed/2_A_10x_BCC_6...,2_A_10x_BCC_6_p_4_B.jpg,2_A_10x_BCC_6_p_4_B
2,/home/ubuntu/Data/MOHS/processed/3_A_10x_N_21_...,3_A_10x_N_21_p_2_N.jpg,3_A_10x_N_21_p_2_N
3,/home/ubuntu/Data/MOHS/processed/12_B_10x_N_25...,12_B_10x_N_25_p_5_N.jpg,12_B_10x_N_25_p_5_N
4,/home/ubuntu/Data/MOHS/processed/10_B_10x_N_2_...,10_B_10x_N_2_p_4_N.jpg,10_B_10x_N_2_p_4_N
...,...,...,...
8134,/home/ubuntu/Data/MOHS/processed/12_A_10x_BCC_...,12_A_10x_BCC_10_p_1_B.jpg,12_A_10x_BCC_10_p_1_B
8135,/home/ubuntu/Data/MOHS/processed/12_B_10x_BCC_...,12_B_10x_BCC_44_p_5_B.jpg,12_B_10x_BCC_44_p_5_B
8136,/home/ubuntu/Data/MOHS/processed/11_D_10x_BCC_...,11_D_10x_BCC_8_p_2_B.jpg,11_D_10x_BCC_8_p_2_B
8137,/home/ubuntu/Data/MOHS/processed/3_A_10x_N_23_...,3_A_10x_N_23_p_2_N.jpg,3_A_10x_N_23_p_2_N


In [None]:
Training_table["patient"] = ""
Training_table["slide"] = ""
Training_table["zoom"] = ""
Training_table["cancer"] = ""
Training_table["image_number"] = ""

for index in Training_table.index:
    metadata = Training_table["metadata"][index]
    metadata = metadata.split("_")
    Training_table.loc[index,"patient"] = metadata[0]
    Training_table.loc[index,"slide"] = metadata[1]
    Training_table.loc[index,"zoom"] = metadata[2]
    Training_table.loc[index,"cancer"] = metadata[3]
    Training_table.loc[index,"image_number"] = metadata[4]
    Training_table.loc[index,"patch_number"] = metadata[6]
    Training_table.loc[index,"patch_cancer"] = metadata[7]

Training_table

In [9]:
print(Counter(Training_table["patient"]))
print(Counter(Training_table["patch_cancer"]))

counts = Training_table.groupby(["patient", "patch_cancer"]).size().reset_index(name='Count')
counts

Counter({'11': 2026, '12': 1336, '3': 560, '18': 450, '4': 428, '15': 385, '14': 366, '13': 318, '20': 281, '7': 271, '2': 244, '1': 200, '6': 193, '17': 192, '8': 179, '10': 174, '19': 148, '9': 139, '16': 125, '5': 124})
Counter({'N': 5889, 'B': 1504, 'S': 746})


Unnamed: 0,patient,patch_cancer,Count
0,1,B,15
1,1,N,185
2,10,B,25
3,10,N,149
4,11,B,503
5,11,N,1523
6,12,B,607
7,12,N,729
8,13,N,251
9,13,S,67


In [None]:
Training_table["class_set"] = "train"

random.seed(10)
test_sample = Training_table.groupby(["patient","patch_cancer"], group_keys=False).apply(lambda x:x.sample(frac=0.15)).index.tolist()
Training_table.loc[Training_table.index[test_sample],"class_set"] = "test"

random.seed(10)
val_sample = Training_table.loc[test_sample].groupby(["patient","patch_cancer"], group_keys=False).apply(lambda x:x.sample(frac=0.33)).index.tolist()
Training_table.loc[Training_table.index[val_sample],"class_set"] = "val"

Training_table.groupby(["patch_cancer", "class_set"]).size()

In [None]:
val_data = Training_table.loc[Training_table["class_set"] == "val"]
val_data.reset_index(inplace = True, drop = True)
test_data = Training_table.loc[Training_table["class_set"] == "test"]
test_data.reset_index(inplace = True, drop = True)
train_data = Training_table.loc[Training_table["class_set"] == "train"]
train_data.reset_index(inplace = True, drop = True)

In [None]:
Training_table.to_csv(experiment_folder + "full_data.csv")
val_data.to_csv(experiment_folder + "val_data.csv")
test_data.to_csv(experiment_folder + "test_data.csv")
train_data.to_csv(experiment_folder + "train_data.csv")

In [None]:
print(len(train_data["image_name"]))
print(len(np.unique(train_data["image_name"])))

print(len(test_data["image_name"]))
print(len(np.unique(test_data["image_name"])))

print(len(val_data["image_name"]))
print(len(np.unique(val_data["image_name"])))

print(len(np.unique(train_data["image_name"])) + len(np.unique(test_data["image_name"])) + len(np.unique(val_data["image_name"])))
print(len(np.unique(list(train_data["image_name"]) + list(test_data["image_name"]) + list(val_data["image_name"]))))