In [1]:
import shutil
import os, shutil
from skimage.exposure import match_histograms
import numpy as np
import csv

from sklearn.model_selection import train_test_split

%run ./variables.ipynb
%run ./utils.ipynb
%run ../utils/data_utils.ipynb
%run ../utils/image_utils.ipynb

# --- Dataset-building configuration ---------------------------------------
# When True, every image is histogram-matched to the reference image
# before being written to the dataset.
norm = False

# Fraction of the images handed to the test split.
TEST_SIZE = 0.1

# RANGE = [select, trim]:
#   * select: a taxon is kept only if it has at least this many images;
#   * trim:   a taxon reaching this many images is randomly cut down to it
#             (an infinite bound therefore disables trimming).
inf = float("inf")
RANGE = [0, inf]

In [3]:
# Get taxa list for filtering
# (get_taxa_list / FILTER_PATH come from the notebooks pulled in with %run.)
selected_taxa = get_taxa_list(FILTER_PATH)
print("Filter:", len(selected_taxa), "taxa to select!")

# Load the reference image used for histogram matching and keep a copy of it
# next to the saved models.
# TODO(review): absolute, machine-specific path -- consider moving it to the
# shared variables notebook.
REF_IMG_PATH = "/mnt/nvme-storage/pfauregi/datasets/atlas/ref_img.png"
ref = cv2.imread(REF_IMG_PATH, cv2.IMREAD_GRAYSCALE)
# cv2.imread does not raise on a missing/unreadable file, it returns None;
# fail here with a clear message instead of an obscure error further down.
if ref is None:
    raise FileNotFoundError("Reference image could not be read: " + REF_IMG_PATH)
cv2.imwrite(os.path.join(SAVED_MODELS_ROOT, "ref_img.png"), ref)

# Fetching files
# Builds taxa_dict: taxon -> list of {"source": <path of the input image>,
#                                     "target": <taxon>/<file root>.png}
# for every image found in a selected taxon folder of every DATASET root.
IMAGE_EXTENSIONS = {".png", ".tiff", ".tif"}
taxa_dict = {}
for path in DATASET:
    print("Processing:",path)
    # Sorted listings: os.listdir order is arbitrary, and the order of the
    # collected files feeds the seeded train/test split later on.
    for taxon in sorted(os.listdir(path)):
        if taxon not in selected_taxa:
            continue
        dir_path = os.path.join(path, taxon)
        if not os.path.isdir(dir_path):
            # A plain file that happens to carry a taxon name.
            continue
        for file in sorted(os.listdir(dir_path)):
            source_file = os.path.join(dir_path, file)
            if not os.path.isfile(source_file):
                continue
            # splitext looks at the *last* dot only, so "img.v2.png" is kept
            # (root "img.v2") and "img.png.bak" is rejected; the comparison is
            # case-insensitive so ".PNG"/".TIF" files are picked up too.
            file_root, extension = os.path.splitext(file)
            if extension.lower() not in IMAGE_EXTENSIONS:
                continue
            # Every image is re-encoded as PNG when the dataset is built.
            # NOTE(review): two sources differing only by extension
            # (a.png / a.tif) map to the same target -- confirm this cannot happen.
            target_file = os.path.join(taxon, file_root+".png")
            taxa_dict.setdefault(taxon, []).append({"source": source_file, "target": target_file})

Processing: /mnt/nvme-storage/pfauregi/datasets/Aqualitas/


In [4]:
# Filtering
# Keep taxa with at least RANGE[0] images; taxa with more than RANGE[1]
# images are randomly cut down to RANGE[1] images.
# A dedicated, seeded generator makes the random trimming reproducible
# across runs (same seed value as the train/test split).
trim_rng = np.random.RandomState(42)
X, y = [], []
eliminated_taxa = []
for taxon, files_tmp in taxa_dict.items():
    if len(files_tmp) < RANGE[0]:
        eliminated_taxa.append(taxon)
        continue
    if len(files_tmp) > RANGE[1]:
        # Permute indices rather than the records themselves so the entries
        # stay plain dicts; int() because a slice bound must be an integer.
        # (Never reached when RANGE[1] is infinite.)
        kept = trim_rng.permutation(len(files_tmp))[:int(RANGE[1])]
        files_tmp = [files_tmp[idx] for idx in kept]
    X.extend(files_tmp)
    y.extend([taxon]*len(files_tmp))
print(len(X) ,"images detected belonging to", len(np.unique(y)), "classes found in",len(DATASET),"folder!")
print("Eliminated taxon (unsufficient number of images):", eliminated_taxa)

# Train test split
# Stratified on the taxon labels, with a fixed seed so the same images land
# in the same split on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=42,
    stratify=y,
)
taxa_dict_train, taxa_dict_test = {}, {}

n_train_classes = len(np.unique(y_train))
n_test_classes = len(np.unique(y_test))
print("Train dataset composed of", len(X_train), "images and", n_train_classes, "classes.")
print("Test dataset composed of", len(X_test), "images and", n_test_classes, "classes.")

# Building dataset
# Each selected image is read as grayscale, optionally histogram-matched to
# the reference image, passed through convert_to_square(new_size=256) and
# written as PNG under the train or test root.
# NOTE(review): check_dirs / delete_all_files_in_folder are helpers from the
# %run notebooks; presumably they create missing directories and empty the
# dataset folder -- confirm before pointing DATASET_PATH anywhere precious.
check_dirs(DATASET_PATH)
delete_all_files_in_folder(DATASET_PATH)
save_path = [TRAIN_DATASET_PATH, TEST_DATASET_PATH]
Xs = [X_train, X_test]
Ys = [y_train, y_test]
# Iterate with dedicated names so the notebook-level X / y (the full,
# filtered dataset) are not silently replaced by the test split.
for k, (path, split_entries) in enumerate(zip(save_path, Xs), start=1):
    print(k,"/",len(save_path))
    for entry in split_entries:
        source_file = entry["source"]
        target_file = os.path.join(path, entry["target"])
        check_dirs(target_file)
        img = cv2.imread(source_file, cv2.IMREAD_GRAYSCALE)
        # cv2.imread signals failure by returning None instead of raising.
        if img is None:
            raise IOError("Could not read image: " + source_file)
        if norm:
            # No `multichannel` keyword: it was deprecated and later removed
            # from scikit-image; the default already treats the inputs as
            # single-channel images on both old and new releases.
            img = match_histograms(img, ref).astype("uint8")
        img = convert_to_square(img, new_size=256)
        cv2.imwrite(target_file, img)
        
# Save dataset infos
# One row per retained taxon: its name and its number of collected images.
# NOTE(review): the count comes from taxa_dict, i.e. *before* any trimming to
# RANGE[1]; it matches the written dataset only while trimming is disabled.
# newline='' is what the csv module requires of the file object, otherwise
# blank lines appear between rows on platforms that translate "\n".
with open(os.path.join(DATASET_PATH, 'dataset_infos.csv'), 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["taxon", "n_images"])
    for taxon in taxa_dict:
        if taxon not in eliminated_taxa:
            writer.writerow([taxon, len(taxa_dict[taxon])])

print("Finished !")

8668 images detected belonging to 80 classes found in 1 folder!
Eliminated taxon (unsufficient number of images): []
Train dataset composed of 7801 images and 80 classes.
Test dataset composed of 867 images and 80 classes.
1 / 2
2 / 2
Finished !
