In [46]:
import os
import cv2
import pandas as pd
import numpy as np
# import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import random
from math import ceil
import tensorflow as tf

# Seed Python's built-in `random` module so that later draws from it
# (e.g. random.sample) are repeatable on a fresh Restart & Run All.
random.seed(42)

In [16]:
# Every path below is written relative to the project root (the folder that
# contains `data/`). Only step up one level when we are not there yet, so that
# re-running this cell does not keep climbing the directory tree.
if not os.path.isdir('data'):
    os.chdir('..')
os.getcwd()

'/home/abraham-pc/Documents/paid_gigs/malaria_prediction_cv'

In [7]:
def scan_directory(directory: str, extension: str) -> list:
    """Check specified directory and return a sorted list of the entries
    whose name ends with the specified extension.

    ``os.listdir`` returns entries in arbitrary, filesystem-dependent order.
    The result is sorted so that anything relying on list position (for
    example seeded random sampling of the returned names) is reproducible
    across machines and runs.

    Args:
        directory (str): path string to directory e.g. "./the/directory"
        extension (str): extension type to be searched for e.g. ".csv".
            Matching is a case-sensitive suffix test on the entry name.

    Returns:
        list: entry names (not full paths) with the specified extension,
        in ascending lexicographic order

    Raises:
        OSError: propagated from ``os.listdir`` when ``directory`` cannot be
            listed (e.g. it does not exist or is not a directory).
    """
    return sorted(
        filename
        for filename in os.listdir(directory)
        if filename.endswith(extension)
    )

In [13]:
# Location of the raw dataset. The disabled conversion loop below reads it as
# <raw_path>/<split>/<species>/*.png.
raw_path = './data/raw/SpeciesDataset/'
raw_train_path = 'train/'
raw_test_path = 'test/'
# Species sub-folder names; the same strings are used as class labels when the
# image reference table is built.
species = ['Falciparum', 'Malariae', 'Ovale', 'Vivax']

# Folder holding the JPEG copies of the images, one sub-folder per species.
interim_path = './data/interim/'

# One-off PNG -> JPEG conversion that populates `interim_path`. It is left
# disabled here; the cells below assume the .jpg files already exist.
# NOTE(review): scan_directory returns bare file names, so the image must be
# read through its full path (the earlier draft passed only the file name to
# cv2.imread). Both raw splits are written into the same per-species folder.
# for split in [raw_train_path, raw_test_path]:
#     for specie in species:
#         os.makedirs(interim_path + specie, exist_ok=True)
#         for image in scan_directory(raw_path + split + specie, '.png'):
#             png_img = cv2.imread(raw_path + split + specie + '/' + image)
#             cv2.imwrite(interim_path + specie + '/' + image[:-4] + '.jpg', png_img, [int(cv2.IMWRITE_JPEG_QUALITY), 100])


In [17]:
# split data into train and test by species
#
# Falciparum is undersampled to the (rounded-up) mean size of the other
# classes. The target is derived from the files actually present instead of
# hard-coded counts, so it stays correct if images are added or removed
# (for the 76 / 31 / 62 images of the other species this gives 57).
species_images = {
    specie: scan_directory(interim_path + specie, '.jpg') for specie in species
}
other_class_sizes = [
    len(images) for specie, images in species_images.items()
    if specie != 'Falciparum'
]
falciparum_target = ceil(np.mean(other_class_sizes))

# Dedicated RNG: re-running this cell always selects the same subset, instead
# of advancing the global `random` state on every execution.
sampler = random.Random(42)

species_frames = []
for specie, images in species_images.items():
    if specie == 'Falciparum':
        # Sort first so the draw does not depend on directory listing order,
        # and cap the size because random.sample raises when asked for more
        # items than are available.
        images = sampler.sample(
            sorted(images), min(falciparum_target, len(images))
        )
    full_images_path = [interim_path + specie + '/' + image for image in images]
    species_frames.append(
        pd.DataFrame({'image': full_images_path, 'species': specie})
    )

# Concatenate once (not inside the loop) and rebuild the index so every row
# has a unique label.
data_reference = pd.concat(species_frames, ignore_index=True)

# Stratified 80/20 split keeps the per-species proportions in both subsets.
train_df, test_df = train_test_split(
    data_reference,
    stratify=data_reference['species'],
    test_size=0.2,
    random_state=42
)


In [27]:
# Images per species in the full reference table (train + test combined).
data_reference['species'].value_counts()

species
Vivax         76
Malariae      62
Falciparum    57
Ovale         31
Name: count, dtype: int64

In [25]:
# Images per species in the training split.
train_df['species'].value_counts()

species
Vivax         61
Malariae      49
Falciparum    45
Ovale         25
Name: count, dtype: int64

In [26]:
# Images per species in the held-out test split.
test_df['species'].value_counts()

species
Vivax         15
Malariae      13
Falciparum    12
Ovale          6
Name: count, dtype: int64

In [49]:
# Create train data generator
processed_path = './data/processed/'
processed_train_path = 'train/'
processed_test_path = 'test/'
train_path = processed_path + processed_train_path
test_path = processed_path + processed_test_path
# The folders passed as `save_to_dir` must exist before any batch is drawn.
os.makedirs(train_path, exist_ok=True)
os.makedirs(test_path, exist_ok=True)


# Training images are rescaled to [0, 1] and randomly augmented
# (shear, zoom, horizontal flip).
train_datagen = tf.keras.preprocessing.image.ImageDataGenerator(
    rescale=1./255,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True
)
# `directory=None` because the "image" column already holds the file paths.
train_generator = train_datagen.flow_from_dataframe(
    dataframe=train_df,
    directory=None,
    x_col="image",
    y_col="species",
    class_mode="categorical",
    target_size=(224, 224),
    save_to_dir=train_path,
    batch_size=32,
    seed=42
)

# Create test data generator
# Test images are only rescaled, never augmented.
test_datagen = tf.keras.preprocessing.image.ImageDataGenerator(rescale=1./255)
test_generator = test_datagen.flow_from_dataframe(
    dataframe=test_df,
    directory=None,
    x_col="image",
    y_col="species",
    # No labels are yielded for the test set ...
    class_mode=None,
    # ... so the batches must come out in dataframe order; otherwise the
    # predictions cannot be matched back to the rows of `test_df`
    # (shuffling is enabled by default).
    shuffle=False,
    target_size=(224, 224),
    save_to_dir=test_path,
    batch_size=32,
    seed=42
)


Found 180 validated image filenames belonging to 4 classes.
Found 46 validated image filenames.


In [50]:
# Mapping from species label to the class index assigned by the train generator.
train_generator.class_indices

{'Falciparum': 0, 'Malariae': 1, 'Ovale': 2, 'Vivax': 3}