In [None]:
import os
import pickle

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from keras.applications.densenet import preprocess_input
from keras.preprocessing import image
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from tensorflow.keras.preprocessing.image import img_to_array
from tensorflow.keras.preprocessing.image import load_img
from tensorflow.keras.utils import to_categorical
from tqdm import tqdm

In [None]:
# Mount Google Drive at /content/drive; every dataset path used further down
# lives under this mount point. The import only exists inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
def save_dump(file_path, data, labels):
    """Pickle ``(data, labels)`` as one tuple into ``file_path``.

    Parameters
    ----------
    file_path : str
        Destination file; it is opened in binary write mode, so an existing
        file is overwritten.
    data, labels
        Objects to store. They are written together as a 2-tuple, which is the
        layout ``load_data`` unpacks.

    Returns
    -------
    None
    """
    # The context manager closes the handle even when pickling raises.
    with open(file_path, 'wb') as handle:
        pickle.dump((data, labels), handle)


def load_data(path_file):
    """Read a pickled ``(pixels, labels)`` tuple and return both parts.

    The shapes of both objects are printed, so each of them must expose a
    ``.shape`` attribute.

    Parameters
    ----------
    path_file : str
        File previously written as a pickled 2-tuple (see ``save_dump``).

    Returns
    -------
    tuple
        ``(pixels, labels)`` exactly as stored in the file.

    Notes
    -----
    ``pickle.load`` can execute arbitrary code contained in the file; only
    open files that come from a trusted source.
    """
    # The context manager closes the handle even when unpickling raises.
    with open(path_file, 'rb') as handle:
        (pixels, labels) = pickle.load(handle)
    print(pixels.shape)
    print(labels.shape)
    return pixels, labels


def get_name_image(id, dfi, s, b):
    """Build the image file name ``img<id>_<s>_<b>_DFI_<dfi>.jpg``.

    Parameters
    ----------
    id : number
        Image id; values below 100 are left-padded with zeros to three digits.
    dfi : float
        DFI value; it is multiplied by 10000 and rounded before formatting.
    s, b : str
        Name parts inserted verbatim between the id and the DFI field.

    Returns
    -------
    str
        The assembled file name, including the ``.jpg`` extension.

    NOTE(review): a scaled DFI below 1000 receives exactly one leading zero,
    so e.g. 50 becomes '050' rather than '0050' -- confirm this matches the
    real file names for very small DFI values.
    """
    if id < 10:
        padding = '00'
    elif id < 100:
        padding = '0'
    else:
        padding = ''
    stem = 'img' + padding + str(id)

    scaled_dfi = round(dfi * 10000)
    dfi_text = ('0' if scaled_dfi < 1000 else '') + str(scaled_dfi)

    return '_'.join([stem, s, b, 'DFI', dfi_text]) + '.jpg'


def get_image(image_path, target_size=(30, 30)):
    """Load one image, resize it and apply the imported ``preprocess_input``.

    Parameters
    ----------
    image_path : str
        Path of the image file to read.
    target_size : tuple of int, optional
        ``(height, width)`` the image is resized to while loading. The default
        keeps the 30x30 size this function always used; the previous
        ``(30, 30, 3)`` value carried a channel entry that is not part of the
        documented ``(img_height, img_width)`` form.

    Returns
    -------
    The array produced by ``preprocess_input(image.img_to_array(img))``.
    """
    img = image.load_img(image_path, target_size=target_size)
    return preprocess_input(image.img_to_array(img))


def get_label(dfi):
    """Map a DFI value to a class index.

    Returns 0 when ``dfi < 0.1``, 1 when ``dfi < 0.15`` and 2 otherwise.
    """
    # Upper bounds are checked in ascending order; the position of the first
    # bound that exceeds ``dfi`` is the class index.
    for class_index, upper_bound in enumerate((0.1, 0.15)):
        if dfi < upper_bound:
            return class_index
    return 2


def load_image(dataframe, path_local):
    """Load every image described by ``dataframe`` together with its label.

    Parameters
    ----------
    dataframe : iterable of rows
        Each row is indexed positionally: ``row[0]``..``row[3]`` are passed to
        ``get_name_image`` to build the file name and ``row[1]`` is passed to
        ``get_label`` to derive the class.
    path_local : str
        Directory that contains the image files. It is joined with the file
        name via ``os.path.join``, so a trailing separator is optional.

    Returns
    -------
    tuple of numpy.ndarray
        ``(data_images, data_labels)``; both shapes are printed before
        returning.
    """
    data_images = []
    data_labels = []
    for row in dataframe:
        image_name = get_name_image(row[0], row[1], row[2], row[3])
        # Named ``pixels`` so the imported ``image`` module name is not
        # shadowed inside this function.
        pixels = get_image(os.path.join(path_local, image_name))
        data_images.append(pixels)
        data_labels.append(get_label(row[1]))
    data_images = np.array(data_images)
    data_labels = np.array(data_labels)
    print(data_images.shape)
    print(data_labels.shape)
    return data_images, data_labels


def view_chart(performance, people, chart):
    """Show a horizontal bar chart with one bar per entry of ``people``.

    Parameters
    ----------
    performance : sequence of numbers
        Bar lengths; each value is also written as text at the end of its bar.
    people : sequence
        Tick labels for the bars, listed top to bottom.
    chart : str
        Title of the figure.
    """
    bar_colors = ['green', 'yellowgreen', 'dodgerblue','orange']

    fig, ax = plt.subplots()
    positions = np.arange(len(people))
    ax.barh(positions, performance, align='center', color=bar_colors)

    # Annotate each bar with its own value.
    for row, count in enumerate(performance):
        ax.text(count, row, str(count))

    ax.set_yticks(positions)
    ax.set_yticklabels(people)
    ax.invert_yaxis()  # first entry at the top
    ax.set_xlabel('Number')
    ax.set_title(chart)
    # Extra room on the right so the value text stays inside the axes.
    ax.set_xlim(0, max(performance) + 400)
    plt.show()


def load_data_image_directory(CATEGORIES, DIRECTORY, size):
    """Load a directory-per-class image dataset and split it with fixed seeds.

    Every entry of ``DIRECTORY/<category>`` is loaded with ``load_img`` at
    ``target_size=size``, converted with ``img_to_array`` and passed through
    the imported ``preprocess_input``. The category name itself is the label.

    Parameters
    ----------
    CATEGORIES : sequence of str
        Sub-directory names; each one is also used as the label string.
    DIRECTORY : str
        Root directory that contains one sub-directory per category.
    size : tuple
        Forwarded to ``load_img`` as ``target_size``.

    Returns
    -------
    tuple of 8 numpy.ndarray
        ``(data_train, labels_train, train_data, train_labels, valid_data,
        valid_labels, data_test, labels_test)`` where

        * ``data_test`` is a stratified 20% hold-out of the whole dataset,
        * ``data_train`` is the remaining 80%,
        * ``train_data`` / ``valid_data`` are a stratified 80/20 split of
          ``data_train``.
    """
    print("[INFO] loading images...")
    data = []
    labels = []

    for category in CATEGORIES:
        category_dir = os.path.join(DIRECTORY, category)
        # os.listdir returns entries in arbitrary order. Sorting fixes the
        # sample order, so the seeded splits below are reproducible across
        # runs and machines.
        for file_name in sorted(os.listdir(category_dir)):
            pixels = load_img(os.path.join(category_dir, file_name), target_size=size)
            pixels = img_to_array(pixels)
            pixels = preprocess_input(pixels)
            data.append(pixels)
            labels.append(category)

    dataset = np.array(data, dtype="float32")
    labels = np.array(labels)

    # First split: hold out 20% of everything as the test set.
    data_train, data_test, labels_train, labels_test = train_test_split(
        dataset, labels, test_size=0.2, shuffle=True,
        stratify=labels, random_state=100)

    # Second split: carve a validation set out of the training portion.
    train_data, valid_data, train_labels, valid_labels = train_test_split(
        data_train, labels_train, test_size=0.2, shuffle=True,
        stratify=labels_train, random_state=100)

    return data_train, labels_train, train_data, train_labels, valid_data, valid_labels, data_test, labels_test

In [None]:
# Dataset DFI
csv_dfi_1_6 = '/content/drive/MyDrive/human_sperm_morphology_dataset/DFI_dataset/Donors1_6_cropped_with_DFI_labels/DFIs.csv'
csv_dfi_7 = '/content/drive/MyDrive/human_sperm_morphology_dataset/DFI_dataset/Donor7_cropped_with_DFI_labels/DFIs.csv'
local = '/content/drive/MyDrive/human_sperm_morphology_dataset/DFI_dataset/Donors1_6_cropped_with_DFI_labels/'

# CSV columns 0, 1, 4 and 5 become row[0]..row[3], the positions load_image reads.
df = np.array(
    pd.read_csv(csv_dfi_1_6, usecols=[0, 1, 4, 5], index_col=False)
)
dataset, labels = load_image(df, local)

# Stratified 80/20 split with a fixed seed.
data_train, data_test, labels_train, labels_test = train_test_split(
    dataset,
    labels,
    test_size=0.2,
    shuffle=True,
    stratify=labels,
    random_state=100,
)

save_dump('/content/drive/MyDrive/human_sperm_morphology_dataset/DFI_dataset/DataImage_1_6/data_image_rgb_train.data', data_train, labels_train)
save_dump('/content/drive/MyDrive/human_sperm_morphology_dataset/DFI_dataset/DataImage_1_6/data_image_rgb_test.data', data_test, labels_test)

# Per-class sample counts (classes 0, 1, 2) for each split.
view_chart(
    [np.count_nonzero(labels_train == class_id) for class_id in (0, 1, 2)],
    ['0', '1', '2'],
    'Chart Data Train',
)

view_chart(
    [np.count_nonzero(labels_test == class_id) for class_id in (0, 1, 2)],
    ['0', '1', '2'],
    'Chart Data Test',
)

In [None]:
# Dataset SMIDS
# Load the images under SMIDS/Original at 40x40 and split them into
# train(+valid) / train / valid / test via load_data_image_directory.
DIRECTORY = "/content/drive/MyDrive/human_sperm_morphology_dataset/SMIDS/Original"
CATEGORIES = ["Abnormal_Sperm", "Non-Sperm", "Normal_Sperm"]
size_images = (40, 40)
data_train, labels_train, train_data, train_labels, valid_data, valid_labels, data_test, labels_test = load_data_image_directory(CATEGORIES, DIRECTORY, size_images)
# data_train.shape, labels_train.shape, train_data.shape, train_labels.shape, valid_data.shape, valid_labels.shape, data_test.shape, labels_test.shape

# Same loading and splitting for the images under SMIDS/GANs/1-1
# (presumably GAN-generated samples -- confirm).
DIRECTORY_GANs = "/content/drive/MyDrive/human_sperm_morphology_dataset/SMIDS/GANs/1-1"
CATEGORIES_GANs = ["Abnormal_Sperm", "Non-Sperm", "Normal_Sperm"]
data_train_gans, labels_train_gans, train_data_gans, train_labels_gans, valid_data_gans, valid_labels_gans, data_test_gans, labels_test_gans = load_data_image_directory(CATEGORIES_GANs, DIRECTORY_GANs, size_images)
# data_train_gans.shape, labels_train_gans.shape, train_data_gans.shape, train_labels_gans.shape, valid_data_gans.shape, valid_labels_gans.shape, data_test_gans.shape, labels_test_gans.shape

# Pairwise concatenation: each split of the Original set is extended with the
# matching split of the GANs set (images along axis 0, labels flattened).
global_data_train = np.concatenate((data_train, data_train_gans), axis=0)
global_labels_train = np.concatenate((labels_train, labels_train_gans), axis=None)

global_train_data = np.concatenate((train_data, train_data_gans), axis=0)
global_train_labels = np.concatenate((train_labels, train_labels_gans), axis=None)

global_valid_data = np.concatenate((valid_data, valid_data_gans), axis=0)
global_valid_labels = np.concatenate((valid_labels, valid_labels_gans), axis=None)

# NOTE(review): the test split also receives samples from the GANs directory,
# so evaluation is not restricted to Original images -- confirm this is intended.
global_data_test = np.concatenate((data_test, data_test_gans), axis=0)
global_labels_test = np.concatenate((labels_test, labels_test_gans), axis=None)

# Persist each merged split as a pickled (data, labels) tuple.
save_dump('/content/drive/MyDrive/human_sperm_morphology_dataset/SMIDS/dataset/smids_train.data', global_train_data, global_train_labels)
save_dump('/content/drive/MyDrive/human_sperm_morphology_dataset/SMIDS/dataset/smids_valid.data', global_valid_data, global_valid_labels)
save_dump('/content/drive/MyDrive/human_sperm_morphology_dataset/SMIDS/dataset/smids_datatrain.data', global_data_train, global_labels_train)
save_dump('/content/drive/MyDrive/human_sperm_morphology_dataset/SMIDS/dataset/smids_datatest.data', global_data_test, global_labels_test)

# Cell output: shapes of every merged array.
global_data_train.shape, global_labels_train.shape, global_train_data.shape, global_train_labels.shape, global_valid_data.shape, global_valid_labels.shape, global_data_test.shape, global_labels_test.shape

In [None]:
# Dataset SCIAN
# Same pipeline as the SMIDS cell, applied to the five SCIAN-Morpho categories:
# load Original and GANs/1-1 at 40x40, split each, then merge split by split.
size_images = (40, 40)
DIRECTORY = "/content/drive/MyDrive/human_sperm_morphology_dataset/SCIAN-Morpho/Original"
DIRECTORY_GANs = "/content/drive/MyDrive/human_sperm_morphology_dataset/SCIAN-Morpho/GANs/1-1"

CATEGORIES = ["01-Normal", "02-Tapered", "03-Pyriform", "04-Small", "05-Amorphous"]
data_train, labels_train, train_data, train_labels, valid_data, valid_labels, data_test, labels_test = load_data_image_directory(CATEGORIES, DIRECTORY, size_images)
data_train_gans, labels_train_gans, train_data_gans, train_labels_gans, valid_data_gans, valid_labels_gans, data_test_gans, labels_test_gans = load_data_image_directory(CATEGORIES, DIRECTORY_GANs, size_images)

# Pairwise concatenation of the Original and GANs splits
# (images along axis 0, labels flattened).
global_data_train = np.concatenate((data_train, data_train_gans), axis=0)
global_labels_train = np.concatenate((labels_train, labels_train_gans), axis=None)

global_train_data = np.concatenate((train_data, train_data_gans), axis=0)
global_train_labels = np.concatenate((train_labels, train_labels_gans), axis=None)

global_valid_data = np.concatenate((valid_data, valid_data_gans), axis=0)
global_valid_labels = np.concatenate((valid_labels, valid_labels_gans), axis=None)

# NOTE(review): the test split also receives samples from the GANs directory --
# confirm that evaluating on them is intended.
global_data_test = np.concatenate((data_test, data_test_gans), axis=0)
global_labels_test = np.concatenate((labels_test, labels_test_gans), axis=None)

# Persist each merged split as a pickled (data, labels) tuple.
save_dump('/content/drive/MyDrive/human_sperm_morphology_dataset/SCIAN-Morpho/dataset/scian_train.data', global_train_data, global_train_labels)
save_dump('/content/drive/MyDrive/human_sperm_morphology_dataset/SCIAN-Morpho/dataset/scian_valid.data', global_valid_data, global_valid_labels)

save_dump('/content/drive/MyDrive/human_sperm_morphology_dataset/SCIAN-Morpho/dataset/scian_datatrain.data', global_data_train, global_labels_train)
save_dump('/content/drive/MyDrive/human_sperm_morphology_dataset/SCIAN-Morpho/dataset/scian_datatest.data', global_data_test, global_labels_test)

# Cell output: shapes of every merged array.
global_data_train.shape, global_labels_train.shape, global_train_data.shape, global_train_labels.shape, global_valid_data.shape, global_valid_labels.shape, global_data_test.shape, global_labels_test.shape

In [None]:
# merge data
# Re-join the saved SMIDS train and test dumps into one full dataset.
# NOTE(review): these files are read from .../manhquang/haui/human-sperm/...,
# which is not the directory the SMIDS cell above writes to -- confirm the
# dumps were copied there before running this cell.

data_train, labels_train = load_data("/content/drive/MyDrive/manhquang/haui/human-sperm/dataset/smids/dataset/smids_datatrain.data")
data_test, labels_test = load_data("/content/drive/MyDrive/manhquang/haui/human-sperm/dataset/smids/dataset/smids_datatest.data")

# Stack train first, then test, for both images and labels.
data = np.concatenate((data_train, data_test), axis=0)
labels = np.concatenate((labels_train, labels_test), axis=0)

save_dump("/content/drive/MyDrive/manhquang/haui/human-sperm/dataset/smids/dataset/smids_full_data.data", data, labels)
# Cell output: shapes of the merged arrays.
data.shape, labels.shape

(4808, 40, 40, 3)
(4808,)
(1202, 40, 40, 3)
(1202,)


((6010, 40, 40, 3), (6010,))

In [None]:
# Sanity check: list the distinct label strings in the merged SMIDS dataset.
np.unique(labels)

array(['Abnormal_Sperm', 'Non-Sperm', 'Normal_Sperm'], dtype='<U14')

In [None]:
# binary data smids
# Keep only the two sperm classes by dropping every 'Non-Sperm' sample.
# A boolean mask selects images and labels in one vectorized step, so the
# arrays are built once instead of being re-created from Python lists for the
# dump and again for the shape display.
is_sperm = labels != 'Non-Sperm'
data_smids = data[is_sperm]
labels_smids = labels[is_sperm]
save_dump("/content/drive/MyDrive/manhquang/haui/human-sperm/dataset/smids/dataset/smids_binary_data.data", data_smids, labels_smids)
# Cell output: shapes and remaining label values.
data_smids.shape, labels_smids.shape, np.unique(labels_smids)

100%|██████████| 6010/6010 [00:00<00:00, 222559.59it/s]


((4026, 40, 40, 3),
 (4026,),
 array(['Abnormal_Sperm', 'Normal_Sperm'], dtype='<U14'))

In [None]:
# Load the full SCIAN and HuSHeM dumps; load_data prints both shapes for each.
# NOTE(review): no cell visible in this notebook writes scian_full_data.data or
# HuSHeM_full_data.data -- presumably they are produced elsewhere; confirm.
data_scian, labels_scian = load_data("/content/drive/MyDrive/manhquang/haui/human-sperm/dataset/scian/dataset/scian_full_data.data")
data_hushem, labels_hushem = load_data("/content/drive/MyDrive/manhquang/haui/human-sperm/dataset/hushem/dataset/HuSHeM_full_data.data")

(19259, 40, 40, 3)
(19259,)
(4216, 40, 40, 3)
(4216,)


In [None]:
# Sanity check: SCIAN array shapes and the distinct label strings.
data_scian.shape, labels_scian.shape, np.unique(labels_scian)

((19259, 40, 40, 3),
 (19259,),
 array(['01-Normal', '02-Tapered', '03-Pyriform', '04-Small',
        '05-Amorphous'], dtype='<U12'))

In [None]:
# Sanity check: HuSHeM array shapes and the distinct label strings.
data_hushem.shape, labels_hushem.shape, np.unique(labels_hushem)

((4216, 40, 40, 3),
 (4216,),
 array(['01_Normal', '02_Tapered', '03_Pyriform', '04_Amorphous'],
       dtype='<U12'))

In [None]:
# Four-class SCIAN variant: drop every '04-Small' sample.
# A boolean mask selects images and labels in one vectorized step, so the
# arrays are built once instead of being re-created from Python lists for the
# dump and again for the shape display.
not_small = labels_scian != '04-Small'
data_scian_4 = data_scian[not_small]
labels_scian_4 = labels_scian[not_small]

save_dump("/content/drive/MyDrive/manhquang/haui/human-sperm/dataset/scian/dataset/scian_4_labels_data.data", data_scian_4, labels_scian_4)
# Cell output: shapes and remaining label values.
data_scian_4.shape, labels_scian_4.shape, np.unique(labels_scian_4)

100%|██████████| 19259/19259 [00:00<00:00, 114555.14it/s]


((15966, 40, 40, 3),
 (15966,),
 array(['01-Normal', '02-Tapered', '03-Pyriform', '05-Amorphous'],
       dtype='<U12'))

In [None]:
# Binary SCIAN variant: '01-Normal' stays as is, every other class is
# relabelled '02-Abnormal'. No sample is dropped, so the image array is reused
# unchanged (same object as data_scian, not a copy) and only the labels are
# rewritten, in one vectorized step.
data_scian_binary = data_scian
labels_scian_binary = np.where(labels_scian == '01-Normal', '01-Normal', '02-Abnormal')

save_dump("/content/drive/MyDrive/manhquang/haui/human-sperm/dataset/scian/dataset/scian_binary_data.data", data_scian_binary, labels_scian_binary)
# Cell output: shapes and the two resulting label values.
data_scian_binary.shape, labels_scian_binary.shape, np.unique(labels_scian_binary)

100%|██████████| 19259/19259 [00:00<00:00, 630001.02it/s]


((19259, 40, 40, 3),
 (19259,),
 array(['01-Normal', '02-Abnormal'], dtype='<U11'))

In [None]:
# Sanity check (repeats the earlier HuSHeM inspection cell): shapes and labels.
data_hushem.shape, labels_hushem.shape, np.unique(labels_hushem)

((4216, 40, 40, 3),
 (4216,),
 array(['01_Normal', '02_Tapered', '03_Pyriform', '04_Amorphous'],
       dtype='<U12'))

In [None]:
# Binary HuSHeM variant: '01_Normal' stays as is, every other class is
# relabelled '02_Abnormal'. No sample is dropped, so the image array is reused
# unchanged (same object as data_hushem, not a copy) and only the labels are
# rewritten, in one vectorized step.
data_hushem_binary = data_hushem
labels_hushem_binary = np.where(labels_hushem == '01_Normal', '01_Normal', '02_Abnormal')

save_dump("/content/drive/MyDrive/manhquang/haui/human-sperm/dataset/hushem/dataset/hushem_binary_data.data", data_hushem_binary, labels_hushem_binary)
# Cell output: shapes and the two resulting label values.
data_hushem_binary.shape, labels_hushem_binary.shape, np.unique(labels_hushem_binary)

100%|██████████| 4216/4216 [00:00<00:00, 401689.74it/s]


((4216, 40, 40, 3), (4216,), array(['01_Normal', '02_Abnormal'], dtype='<U11'))