In [1]:
import csv
import os
import shutil

import numpy as np

In [2]:
# Ground-truth CSV files for the two splits; both are read with csv.reader,
# and the class label is taken from the last column of each row.
gt_path_train = "data/ISBI2016_ISIC_Part3_Training_GroundTruth.csv"
gt_path_test = "data/ISBI2016_ISIC_Part3_Test_GroundTruth.csv"

In [3]:
def get_class_ratio(path: str) -> float:
    """Return the fraction of benign samples listed in a ground-truth CSV.

    The label is read from the last column of every row. A row counts as
    benign when its label is ``"benign"`` or ``"0.0"`` (surrounding
    whitespace is ignored); every other non-empty row only counts toward
    the total.

    Args:
        path: Path to a comma-separated ground-truth file.

    Returns:
        Number of benign rows divided by the number of non-empty rows.

    Raises:
        ValueError: If the file contains no non-empty rows.
    """
    benign_labels = {"benign", "0.0"}
    n_samples, n_benign = 0, 0
    # newline="" is the mode the csv module documents for files it reads.
    with open(path, "r", newline="") as f:
        for row in csv.reader(f, delimiter=","):
            if not row:
                # csv.reader yields an empty list for blank lines; indexing
                # row[-1] on it would raise IndexError.
                continue
            n_samples += 1
            if row[-1].strip() in benign_labels:
                n_benign += 1

    if n_samples == 0:
        raise ValueError(f"No samples found in {path!r}")
    return n_benign / n_samples

In [4]:
# Fraction of benign samples in each split, computed from the ground-truth CSVs.
train_ratio = get_class_ratio(gt_path_train)
test_ratio = get_class_ratio(gt_path_test)
print(f"Ratio of train set is {train_ratio}")
print(f"Ratio of test set is {test_ratio}")

Ratio of train set is 0.8077777777777778
Ratio of test set is 0.8021108179419525


In [11]:
# Destination roots for the class-sorted copies of each split.
# Note that the test split is written to a folder named "val".
train_set_path = "data/train"
test_set_path = "data/val"
# Directories that hold the source images of each split.
current_train_image_dir = "data/ISBI2016_ISIC_Part3_Training_Data"
current_test_image_dir = "data/ISBI2016_ISIC_Part3_Test_Data"

# exist_ok=True keeps this cell re-runnable when the folders already exist.
os.makedirs(train_set_path, exist_ok=True)
os.makedirs(test_set_path, exist_ok=True)

In [6]:
def sort_images_in_dir_by_ground_truth(image_dir: str, gt_path: str, new_path: str) -> None:
    """Copy images into ``benign``/``malignant`` sub-folders of ``new_path``.

    Every non-empty row of the ground-truth CSV must hold exactly two
    columns: the image name without extension and its label. ``".jpg"`` is
    appended to the name, and the file is copied from ``image_dir`` to
    ``new_path/<class>/``. Labels ``"0.0"`` and ``"1.0"`` are mapped to
    ``"benign"`` and ``"malignant"``; the two class names are also accepted
    verbatim.

    Args:
        image_dir: Directory containing the source ``.jpg`` images.
        gt_path: Path to the comma-separated ground-truth file.
        new_path: Destination root; the class sub-folders are created if
            they do not exist yet.

    Raises:
        ValueError: If a row does not have exactly two columns or carries an
            unknown label.
        FileNotFoundError: If an image listed in the CSV is missing from
            ``image_dir``.
    """
    class_by_label = {
        "0.0": "benign",
        "1.0": "malignant",
        "benign": "benign",
        "malignant": "malignant",
    }
    for class_name in ("benign", "malignant"):
        os.makedirs(os.path.join(new_path, class_name), exist_ok=True)

    # newline="" is the mode the csv module documents for files it reads.
    with open(gt_path, "r", newline="") as f:
        for row in csv.reader(f, delimiter=","):
            if not row:
                # csv.reader yields an empty list for blank lines.
                continue
            filename, label = row
            filename += ".jpg"
            label = label.strip()
            if label not in class_by_label:
                raise ValueError(f"Unknown label {label!r} for image {filename!r}")

            src = os.path.join(image_dir, filename)
            dst = os.path.join(new_path, class_by_label[label], filename)
            # shutil.copy replaces the former shell `cp` call: it needs no
            # quoting for paths with spaces or shell metacharacters, works on
            # every platform, and raises instead of failing silently.
            shutil.copy(src, dst)

In [7]:
# Copy the images of each split into benign/malignant sub-folders of its
# destination root, using the split's ground-truth CSV for the labels.
sort_images_in_dir_by_ground_truth(current_train_image_dir, gt_path_train, train_set_path)
sort_images_in_dir_by_ground_truth(current_test_image_dir, gt_path_test, test_set_path)

In [8]:
def get_class_ratio_by_dir(path: str) -> float:
    """Return the share of benign entries in a class-sorted directory.

    Args:
        path: Directory that contains a ``benign`` and a ``malignant``
            sub-folder.

    Returns:
        Entry count of ``benign`` divided by the combined entry count of
        both sub-folders.
    """
    counts = {
        category: len(os.listdir(os.path.join(path, category)))
        for category in ("benign", "malignant")
    }
    total = counts["benign"] + counts["malignant"]
    return counts["benign"] / total

In [9]:
# Recompute the benign fraction from the sorted directories; the printed
# values can be compared with the ones derived from the ground-truth CSVs.
# Note: this overwrites the earlier train_ratio / test_ratio variables.
train_ratio = get_class_ratio_by_dir(train_set_path)
test_ratio = get_class_ratio_by_dir(test_set_path)
print(f"Ratio of train set is {train_ratio}")
print(f"Ratio of test set is {test_ratio}")

Ratio of train set is 0.8077777777777778
Ratio of test set is 0.8021108179419525


In [14]:
def get_inverse_class_ratio(path: str) -> np.ndarray:
    """Return the inverse relative frequency of every class folder in ``path``.

    Each sub-directory of ``path`` is treated as one class whose size is the
    number of entries it contains. For a class holding ``n_c`` of ``N``
    total entries the result is ``N / n_c``.

    Args:
        path: Directory whose sub-directories are the classes.

    Returns:
        1-D array with one value per class, ordered by sorted class-folder
        name. An empty class folder produces ``inf`` (NumPy emits a
        divide-by-zero warning).
    """
    class_counts = []
    # os.listdir returns entries in arbitrary order; sorting makes position i
    # refer to the same class on every run and every file system.
    for category in sorted(os.listdir(path)):
        class_dir = os.path.join(path, category)
        if not os.path.isdir(class_dir):
            # Stray files next to the class folders are not classes, and
            # calling os.listdir on them would raise NotADirectoryError.
            continue
        class_counts.append(len(os.listdir(class_dir)))

    counts = np.array(class_counts)
    class_ratio = np.divide(counts, np.sum(counts))
    return np.divide(1, class_ratio)

    

In [15]:
# Inverse class ratios (total entries / entries per class folder) of each split.
train_inverse = get_inverse_class_ratio(train_set_path)
test_inverse = get_inverse_class_ratio(test_set_path )



In [17]:
# Divide each inverse class ratio by the number of entries in the array,
# i.e. total / (n_classes * class_count).
# NOTE(review): presumably these are per-class loss weights — confirm against the training code.
np.divide(train_inverse, train_inverse.shape[0])

array([0.61898212, 2.60115607])