<a href="https://colab.research.google.com/github/yeabwang/malaria-diagnosis/blob/main/Malaria_diagnosis_Final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# This binary classification model detects malaria by classifying individual red blood cell images as infected or uninfected.
# It utilizes a dataset of 27,558 cell images from thin blood smear slides, containing both parasitized and healthy cells.
# The model leverages deep learning techniques to automate malaria detection, potentially aiding in faster and more accurate diagnosis.
### Link to the dataset: https://lhncbc.nlm.nih.gov/LHC-research/LHC-projects/image-processing/malaria-datasheet.html
### Classes: Parasitized and Uninfected

In [None]:
!pip install -U albumentations

In [None]:
!pip install wandb

In [1]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import cv2
from sklearn.metrics import confusion_matrix, roc_curve
import seaborn as sns
import datetime
import io
import os
import random
from google.colab import files, drive
from PIL import Image
import albumentations as A
from albumentations.core.composition import OneOf
from albumentations.pytorch import ToTensorV2
import tensorflow_datasets as tfds
import tensorflow_probability as tfp
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Layer
from tensorflow.keras.layers import Conv2D, MaxPool2D, Dense, Flatten, InputLayer, BatchNormalization, Input, Dropout, RandomFlip, RandomRotation, Resizing, Rescaling
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.metrics import BinaryAccuracy, FalsePositives, FalseNegatives, TruePositives, TrueNegatives, Precision, Recall, AUC, binary_accuracy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import Callback, CSVLogger, EarlyStopping, LearningRateScheduler, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.regularizers  import L2, L1
from tensorboard.plugins.hparams import api as hp
import wandb
from wandb.integration.keras import WandbMetricsLogger, WandbModelCheckpoint, WandbEvalCallback

In [2]:
# Authenticate this runtime with Weights & Biases (prompts for an API key).
!wandb login

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33myeabsiramersha58[0m ([33myeabsiramersha58-beijing-institute-of-technology[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [3]:
# Hyperparameters and image settings, collected in a single dict.
CONFIGURATION = dict(
    LEARNING_RATE=0.001,
    N_EPOCHS=5,
    BATCH_SIZE=128,
    DROPOUT_RATE=0.0,
    IM_SIZE=224,
    REGULARIZATION_RATE=0.0,
    N_FILTERS=6,
    KERNEL_SIZE=3,
    N_STRIDES=1,
    POOL_SIZE=2,
    N_DENSE_1=100,
    N_DENSE_2=10,
)
# The module-level wandb.config and CONFIGURATION refer to the same dict object.
wandb.config = CONFIGURATION

In [4]:
# Load the TFDS 'malaria' dataset as (image, label) pairs. Because `split` is
# given as a list, a list is returned: the 'train' split lives at dataset[0].
dataset, dataset_info = tfds.load(
    'malaria',
    split=['train'],
    shuffle_files=True,
    as_supervised=True,
    with_info=True,
)
print(dataset)

Downloading and preparing dataset 337.08 MiB (download: 337.08 MiB, generated: Unknown size, total: 337.08 MiB) to /root/tensorflow_datasets/malaria/1.0.0...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

Generating splits...:   0%|          | 0/1 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/27558 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/malaria/incomplete.BCH6HE_1.0.0/malaria-train.tfrecord*...:   0%|         …

Dataset malaria downloaded and prepared to /root/tensorflow_datasets/malaria/1.0.0. Subsequent calls will reuse this data.
[<_PrefetchDataset element_spec=(TensorSpec(shape=(None, None, 3), dtype=tf.uint8, name=None), TensorSpec(shape=(), dtype=tf.int64, name=None))>]


In [5]:
# Export every (image, label) pair of the train split to its own .npz file
# under `dataset/`, printing a progress counter every 1000 samples.
RAW_DATASET_DIR = 'dataset'
# open(..., 'wb') does not create missing parent folders, so make sure the
# output directory exists before writing into it.
os.makedirs(RAW_DATASET_DIR, exist_ok=True)

k = 0
for image, label in dataset[0]:
    with open(os.path.join(RAW_DATASET_DIR, f'malaria_dataset_{k}.npz'), mode='wb') as file:
        np.savez(file, image=image.numpy(), label=label.numpy())
    k += 1

    if k % 1000 == 0:
        print(k)

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000


In [6]:
def load_original_data():
    """Log the local ``dataset/`` directory to W&B as the ``new_dataset`` artifact.

    Opens a run in the "Malaria-Diagnosis" project, builds a ``raw_data``
    artifact carrying a description and provenance metadata, adds the whole
    ``dataset/`` directory to it and logs the artifact on the run.
    """
    artifact_description = "The Malaria dataset contains a total of 27,558 cell images with equal instances of parasitized and uninfected cells from the thin blood smear slide images of segmented cells."
    artifact_metadata = {
        "source": "TFDS",
        "homepage": "https://lhncbc.nlm.nih.gov/publication/pub9932",
        "source_code": "tfds.image_classification.Malaria",
        "version": "1.0.0",
        "download_size": "337.08 MiB",
    }

    with wandb.init(project="Malaria-Diagnosis") as run:
        raw_artifact = wandb.Artifact(
            name="new_dataset",
            type="raw_data",
            description=artifact_description,
            metadata=artifact_metadata,
        )
        raw_artifact.add_dir('dataset/')
        run.log_artifact(raw_artifact)

In [7]:
# Log the contents of `dataset/` to W&B as the `new_dataset` artifact.
load_original_data()

[34m[1mwandb[0m: Currently logged in as: [33myeabsiramersha58[0m ([33myeabsiramersha58-beijing-institute-of-technology[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


[34m[1mwandb[0m: Adding directory to artifact (./dataset)... Done. 27.7s


In [8]:
# Open a run, declare `new_dataset:v0` as an input of that run and download it.
with wandb.init(project="Malaria-Diagnosis") as run:
  artifact = run.use_artifact('yeabsiramersha58-beijing-institute-of-technology/Malaria-Diagnosis/new_dataset:v0', type='raw_data')
  # download() returns the local directory the artifact files were written to.
  artifact_dir = artifact.download()

[34m[1mwandb[0m: Downloading large artifact new_dataset:v0, 1419.75MB. 27558 files... 
[34m[1mwandb[0m:   27558 of 27558 files downloaded.  
Done. 0:1:7.6


In [12]:
# Side length (in pixels) of the square images produced by resize_rescale.
IMG_SIZE = CONFIGURATION["IM_SIZE"]

def resize_rescale(image):
    """Resize ``image`` to ``IMG_SIZE`` x ``IMG_SIZE`` and divide the result by 255."""
    target_size = (IMG_SIZE, IMG_SIZE)
    resized = tf.image.resize(image, target_size)
    return resized / 255.0

In [28]:
def preprocess_data(max_files=1000):
    """Resize/rescale raw samples and log them as the ``preprocessed_dataset`` artifact.

    Downloads ``new_dataset:v0``, loads up to ``max_files`` of its ``.npz``
    samples, applies ``resize_rescale`` to every image and writes all images
    and labels into a single ``prep_dataset.npz`` file of a new W&B artifact.

    Args:
        max_files: Maximum number of sample files to process; ``None`` means
            every file. Defaults to 1000, the subset size used so far.
    """
    with wandb.init(project="Malaria-Diagnosis") as run:
        artifact = run.use_artifact('yeabsiramersha58-beijing-institute-of-technology/Malaria-Diagnosis/new_dataset:v0', type='raw_data')
        artifact_dir = artifact.download()

        print(f"Artifact downloaded to: {artifact_dir}")

        preprocessed_data = wandb.Artifact(
            name="preprocessed_dataset",
            type="preprocessed_data",
            description="A Preprocessed version of the Malaria dataset",
        )

        # Read from the directory reported by download() instead of a
        # hard-coded relative path, so the cell does not depend on the current
        # working directory or on how W&B names its local folders.
        # os.listdir() returns entries in arbitrary order; sorting makes the
        # selected subset the same on every run.
        sample_files = sorted(
            name for name in os.listdir(artifact_dir) if name.endswith('.npz')
        )[:max_files]

        dataset_x = []
        dataset_y = []

        for f in sample_files:
            # The samples were written as plain numeric arrays, so NumPy's
            # default allow_pickle=False is sufficient (and safer).
            with np.load(os.path.join(artifact_dir, f)) as npz_array:
                x, y = npz_array['image'], npz_array['label']

            # Store NumPy arrays rather than framework tensors.
            dataset_x.append(np.asarray(resize_rescale(x)))
            dataset_y.append(y)

        with preprocessed_data.new_file("prep_dataset.npz", mode="wb") as file:
            np.savez(
                file,
                dataset_x=np.asarray(dataset_x, dtype=np.float32),
                dataset_y=np.asarray(dataset_y),
            )

        run.log_artifact(preprocessed_data)

In [29]:
# Build and log the `preprocessed_dataset` artifact from `new_dataset:v0`.
preprocess_data()

[34m[1mwandb[0m: Downloading large artifact new_dataset:v0, 1419.75MB. 27558 files... 
[34m[1mwandb[0m:   27558 of 27558 files downloaded.  
Done. 0:0:57.0


Artifact downloaded to: /content/artifacts/new_dataset:v0


In [36]:
def split_data():
    """Split the preprocessed dataset 80/10/10 and log one artifact per subset.

    Downloads ``preprocessed_dataset:v0``, reads ``prep_dataset.npz`` and cuts
    it positionally (no shuffling) into train, validation and test parts,
    which are logged as the ``train_dataset``, ``val_dataset`` and
    ``test_dataset`` artifacts respectively.
    """
    with wandb.init(project="Malaria-Diagnosis") as run:

        artifact = run.use_artifact('yeabsiramersha58-beijing-institute-of-technology/Malaria-Diagnosis/preprocessed_dataset:v0', type='preprocessed_data')
        artifact_dir = artifact.download()

        print(f"Artifact downloaded to: {artifact_dir}")

        train_data = wandb.Artifact(
            name="train_dataset",
            type="preprocessed_data",
            description="Training dataset",
        )
        val_data = wandb.Artifact(
            name="val_dataset",
            type="preprocessed_data",
            description="Validation dataset",
        )
        test_data = wandb.Artifact(
            name="test_dataset",
            type="preprocessed_data",
            description="Test dataset",
        )

        # Use the directory reported by download() rather than a hard-coded
        # relative path.
        artifact_file = os.path.join(artifact_dir, "prep_dataset.npz")

        # Both entries are plain numeric arrays, so pickle support is not needed.
        with np.load(artifact_file) as npz_arr:
            print(npz_arr.files)

            x_data, y_data = npz_arr['dataset_x'], npz_arr['dataset_y']

        train_split = 0.8
        val_split = 0.1  # whatever remains after train + val is the test set

        data_len = len(x_data)

        # Compute each boundary once so images and labels are always cut at
        # exactly the same indices.
        train_end = int(train_split * data_len)
        val_end = int((train_split + val_split) * data_len)

        subsets = (
            (train_data, "train_dataset.npz", slice(0, train_end)),
            (val_data, "val_dataset.npz", slice(train_end, val_end)),
            (test_data, "test_dataset.npz", slice(val_end, None)),
        )

        for subset_artifact, file_name, rows in subsets:
            with subset_artifact.new_file(file_name, mode="wb") as file:
                np.savez(file, dataset_x=x_data[rows], dataset_y=y_data[rows])

        for subset_artifact, _, _ in subsets:
            run.log_artifact(subset_artifact)

    print("Data split and logged successfully.")


In [37]:
# Create and log the train/val/test artifacts from `preprocessed_dataset:v0`.
split_data()

[34m[1mwandb[0m: Downloading large artifact preprocessed_dataset:v0, 574.23MB. 1 files... 
[34m[1mwandb[0m:   1 of 1 files downloaded.  
Done. 0:0:1.4


Artifact downloaded to: /content/artifacts/preprocessed_dataset:v0
['dataset_x', 'dataset_y']


Data split and logged successfully.


In [38]:
# Albumentations pipeline: geometric, colour, noise/blur and dropout
# augmentations, each applied with probability 0.5.
albumentations_transform = A.Compose([
    A.Resize(IMG_SIZE, IMG_SIZE),
    A.Rotate(limit=40, p=0.5),
    A.HorizontalFlip(p=0.5),
    A.VerticalFlip(p=0.5),
    A.RandomScale(scale_limit=0.2, p=0.5),
    A.RandomBrightnessContrast(brightness_limit=0.3, contrast_limit=0.3, p=0.5),
    A.HueSaturationValue(hue_shift_limit=20, sat_shift_limit=30, val_shift_limit=20, p=0.5),
    A.GaussNoise(p=0.5),
    A.OneOf([
        A.GaussianBlur(blur_limit=(3, 7), p=0.5),
        A.Sharpen(alpha=(0.2, 0.5), lightness=(0.5, 1.0), p=0.5)
    ], p=0.5),
    A.Affine(shear=(-15, 15), p=0.5),
    A.ElasticTransform(alpha=1, sigma=50, p=0.5),
    A.CoarseDropout(p=0.5),
    # RandomScale above may change the spatial size; resize once more so every
    # augmented image leaves the pipeline as IMG_SIZE x IMG_SIZE.
    A.Resize(IMG_SIZE, IMG_SIZE),
    # Deliberately no ToTensorV2: it emits a channels-first PyTorch tensor,
    # whereas the TensorFlow code below works on channels-last NumPy arrays.
])

def albumentations_augment(image):
    """Run ``albumentations_transform`` on one image and return it divided by 255.

    Called through ``tf.numpy_function``, so it receives and returns NumPy
    arrays; the result is a float32 ndarray.
    """
    augmented = albumentations_transform(image=image)["image"]
    # NOTE(review): the division assumes pixel values in [0, 255]. Images that
    # already went through resize_rescale are in [0, 1] - confirm the expected
    # input range before feeding such data through this function.
    return (np.asarray(augmented) / 255.0).astype(np.float32)

def tensorflow_augment(image):
    """Apply a random horizontal flip and a random brightness shift (max delta 0.3)."""
    image = tf.image.random_flip_left_right(image)
    image = tf.image.random_brightness(image, 0.3)
    return image

def augment(image, label):
    """Augment ``image`` with both pipelines; ``label`` is passed through unchanged."""
    aug_img = tf.numpy_function(func=albumentations_augment, inp=[image], Tout=tf.float32)
    # tf.numpy_function discards static shape information; restore it so
    # downstream ops and batching know the image dimensions.
    aug_img.set_shape([IMG_SIZE, IMG_SIZE, 3])

    aug_img = tensorflow_augment(aug_img)

    return aug_img, label


In [45]:
def augment_data():
    """Re-log the training set as the ``Augmented_dataset`` artifact.

    Downloads ``train_dataset:v0``, reads ``train_dataset.npz`` and writes its
    images and labels to ``aug_dataset.npz`` inside a new W&B artifact.
    """
    with wandb.init(project="Malaria-Diagnosis") as run:
        artifact = run.use_artifact('yeabsiramersha58-beijing-institute-of-technology/Malaria-Diagnosis/train_dataset:v0', type='preprocessed_data')
        artifact_dir = artifact.download()

        print(f"Artifact downloaded to: {artifact_dir}")

        augmented_data = wandb.Artifact(
            name="Augmented_dataset",
            type="preprocessed_data",
            description="An Augmented version of the Malaria train dataset",
        )

        # Use the directory reported by download() rather than a hard-coded
        # relative path.
        artifact_file = os.path.join(artifact_dir, "train_dataset.npz")

        # Both entries are plain numeric arrays, so pickle support is not needed.
        with np.load(artifact_file) as npz_arr:
            dataset_x, dataset_y = npz_arr['dataset_x'], npz_arr['dataset_y']

        # FIXME(review): the images are copied unchanged - none of the
        # augmentation helpers defined in this notebook (`augment`,
        # `albumentations_augment`) is invoked here, so the logged
        # "Augmented_dataset" is currently identical to the training set.
        dataset_x_augmented = list(dataset_x)

        print(f"Number of augmented images: {len(dataset_x_augmented)}")

        with augmented_data.new_file("aug_dataset.npz", mode="wb") as file:
            np.savez(file, dataset_x=dataset_x_augmented, dataset_y=dataset_y)

        run.log_artifact(augmented_data)

    print("Data augmented and logged successfully.")

In [46]:
# Build and log the `Augmented_dataset` artifact from `train_dataset:v0`.
augment_data()

[34m[1mwandb[0m: Downloading large artifact train_dataset:v0, 459.38MB. 1 files... 
[34m[1mwandb[0m:   1 of 1 files downloaded.  
Done. 0:0:1.1


Artifact downloaded to: /content/artifacts/train_dataset:v0
Number of augmented images: 800


Data augmented and logged successfully.
