# Introduction

Before demonstrating the different methods we used to implement our model, here is a presentation of the tools that we use throughout every step of the model development.


## Dataset builder (scripts/x_ray_dataset_builder.py)

This class is designed to create and manage image datasets for building our ML models, specifically those using TensorFlow.
This class is initialized with parameters such as the directory path containing the image data, the validation split, the subset of data to use, the color mode of the images, the batch size, and the image size.

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from x_ray_data_viz import plot_distribution, plot_mean


class Dataset:
    def __init__(
        self,
        dir_path: str,
        batch_size=40,
        color_mode="grayscale",
        image_size=(256, 256),
        label_mode="categorical",
        subset=None,
        validation_split=None,
    ):
        self.batch_size = batch_size
        self.class_names = None
        self.color_mode = color_mode
        self.dataset = None
        self.dir_path = dir_path
        self.image_size = image_size
        self.label_mode = label_mode
        self.normalized_dataset = None
        self.raw_dataset = None
        self.raw_x_dataset = []
        self.subset = subset
        self.validation_split = validation_split
        self.x_dataset = []
        self.y_dataset = []

Upon building the dataset using the build method, the class utilizes the image_dataset_from_directory function from TensorFlow's Keras API to load the images and labels directly from the specified directory. 
The images are then preprocessed using a caching mechanism to speed up future access, and optionally shuffled if the dataset is used for training. The images are also rescaled to a range between 0 and 1 for normalization purposes.

In [None]:
    def build(self, autotune, is_training=False):
        """Load the images from ``self.dir_path`` and populate the dataset attributes.

        After the call:
          * ``self.raw_dataset`` is the dataset returned by
            ``image_dataset_from_directory``;
          * ``self.dataset`` is that dataset cached, shuffled with a
            1024-element buffer when ``is_training`` is true, and prefetched;
          * ``self.normalized_dataset`` is ``self.dataset`` with the images
            multiplied by 1/255;
          * ``self.raw_x_dataset`` holds the unscaled images as one array;
          * ``self.x_dataset`` / ``self.y_dataset`` hold the rescaled images
            and their labels as arrays;
          * ``self.class_names`` holds the class names reported by the loader.

        Args:
            autotune: Value forwarded as ``buffer_size`` to ``prefetch``.
            is_training: When true, a shuffle step is added to the pipeline.

        Returns:
            ``self.normalized_dataset``.
        """
        dataset = tf.keras.utils.image_dataset_from_directory(
            self.dir_path,
            batch_size=self.batch_size,
            color_mode=self.color_mode,
            image_size=self.image_size,
            label_mode=self.label_mode,
            labels="inferred",
            seed=123,
            subset=self.subset,
            validation_split=self.validation_split,
        )

        pipeline = dataset.cache()
        if is_training:
            pipeline = pipeline.shuffle(1024)
        self.dataset = pipeline.prefetch(buffer_size=autotune)

        self.class_names = dataset.class_names
        self.raw_dataset = dataset

        # Accumulate in local lists: the attributes are rebound to ndarrays
        # below, so appending to them directly would fail (ndarray has no
        # ``append``) as soon as build() is called a second time.
        raw_images = [x for x, _ in dataset.unbatch().as_numpy_iterator()]
        self.raw_x_dataset = np.array(raw_images)

        normalization_layer = tf.keras.layers.Rescaling(1.0 / 255)
        self.normalized_dataset = self.dataset.map(
            lambda x, y: (normalization_layer(x), y)
        )

        # NOTE(review): raw_x_dataset and x_dataset come from two separate
        # passes over the data; do not assume they are index-aligned when
        # shuffling is involved -- confirm before relying on it.
        images = []
        labels = []
        for x, y in self.normalized_dataset.unbatch().as_numpy_iterator():
            images.append(x)
            labels.append(y)

        self.x_dataset = np.array(images)
        self.y_dataset = np.array(labels)

        return self.normalized_dataset


The Dataset class provides several utility methods to interact with the created dataset: getting class names, acquiring the shape of batches, and displaying images from a batch. 

In [None]:
    def get_class_names(self):
        return self.class_names

    def get_x_batch_shape(self):
        for image_batch, _ in self.dataset:
            return image_batch.shape

    def get_y_batch_shape(self):
        for _, labels_batch in self.dataset:
            return labels_batch.shape
        
    def display_images_in_batch(self, batch_index: int, dataset_name: str):
        """Show up to nine images of one batch of ``self.dataset`` in a 3x3 grid.

        Each subplot title contains ``dataset_name``, the class name at the
        arg-max position of the image's label, and ``batch_index``.

        Args:
            batch_index: Zero-based position of the batch to display.
            dataset_name: Text placed at the start of every subplot title.

        Raises:
            IndexError: If the dataset has no batch at ``batch_index``.
        """
        # skip(n).take(1) selects batch n. take(n) on its own always starts at
        # the first batch, and yields nothing at all for n == 0.
        selected = next(iter(self.dataset.skip(batch_index).take(1)), None)
        if selected is None:
            raise IndexError(f"batch index {batch_index} is out of range")
        images, labels = selected

        plt.figure(figsize=(20, 10))

        # A batch (typically the last one) may contain fewer than nine images.
        shown = min(9, images.shape[0])

        for i in range(shown):
            plt.subplot(3, 3, i + 1)
            plt.imshow(images[i].numpy().astype("uint8"), cmap="gray")
            plt.title(
                f"{dataset_name} - {self.class_names[np.argmax(labels[i])]} (batch {batch_index})"
            )
            plt.axis("off")

        plt.show()

    def display_batch_number(self, dataset_name: str):
        """Draw a bar chart of the number of images in each batch.

        The batch sizes are derived from ``len(self.x_dataset)`` and
        ``self.batch_size``: every batch is full except possibly the last one,
        which holds the remainder.

        Args:
            dataset_name: Text inserted in the chart title.
        """
        full_batches, remainder = divmod(len(self.x_dataset), self.batch_size)

        batch_sizes = [self.batch_size] * full_batches
        if remainder:
            batch_sizes.append(remainder)
        batch_indices = list(range(len(batch_sizes)))

        plt.figure(figsize=(20, 10))
        # Matplotlib 3.6 renamed the "seaborn" style to "seaborn-v0_8" and later
        # releases dropped the old name; use whichever this installation has.
        style_name = (
            "seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn"
        )
        plt.style.use(style_name)
        plt.bar(batch_indices, batch_sizes, color="#ff6f00")
        plt.xlabel("Batchs")
        plt.ylabel("Images")
        plt.title(f"Batchs and images per batch in {dataset_name}")
        plt.show()

It also offers methods to visualize the number of images in each batch and to display the distribution of labels in the dataset. 

Moreover, it includes a function to calculate and plot the mean of labels. This class is beneficial for data exploration and preprocessing steps, providing a comprehensive tool for managing image datasets in machine learning applications.

In [None]:
    def display_distribution(self, dataset_name: str):
        """Plot how often each class name occurs in ``self.y_dataset``.

        Every label row is reduced to the index of its largest entry, the
        index is translated through ``self.class_names``, and the resulting
        list of names is handed to ``plot_distribution``.

        Args:
            dataset_name: Forwarded unchanged to ``plot_distribution``.
        """
        class_indices = np.argmax(self.y_dataset, axis=1)
        label_names = [self.class_names[position] for position in class_indices]

        plot_distribution(label_names, dataset_name)

    def display_mean(self, dataset_name):
        """Plot the mean occurrence of each class in ``self.y_dataset``.

        Every label row is reduced to the index of its largest entry before
        being handed to ``plot_mean`` together with ``self.class_names``.

        Args:
            dataset_name: Forwarded unchanged to ``plot_mean``.
        """
        class_indices = np.argmax(self.y_dataset, axis=1)
        plot_mean(class_indices, self.class_names, dataset_name)

## Data visualization (scripts/x_ray_data_viz.py)

The provided code contains a set of functions designed for visualizing the results of our ML model.
These functions use matplotlib and seaborn, popular libraries for data visualization in Python, as well as sklearn for some of the calculations.

The function `plot_distribution` visualizes the frequency of each class in the dataset. It creates a bar chart where the x-axis represents the classes (types of chest X-rays) and the y-axis represents the frequency of each class.

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.metrics import auc, confusion_matrix, roc_curve
from sklearn.preprocessing import label_binarize


# Colours used for the bars of the distribution and mean-occurrence plots.
COLORS = ["#56f6ff", "#e32440"]
# Figure dimensions; the plotting functions pass them as
# figsize=(PLOT_WIDTH, PLOT_HEIGHT).
PLOT_WIDTH = 20
PLOT_HEIGHT = 10

def plot_distribution(labels, dataset_name):
    """Draw a count plot of ``labels``.

    Args:
        labels: Sequence of values to count; one bar is drawn per distinct value.
        dataset_name: Text placed at the start of the plot title.
    """
    figure_size = (PLOT_WIDTH, PLOT_HEIGHT)
    plt.figure(figsize=figure_size)
    sns.set_style("whitegrid")

    # countplot draws on the current axes and returns them.
    axes = sns.countplot(x=labels, palette=COLORS)
    axes.set_title(f"{dataset_name} - Chest x-rays distribution")
    axes.set_xlabel("Chest x-ray")
    axes.set_ylabel("Frequency")

    plt.show()

The `plot_mean` function calculates and visualizes the mean occurrence of each class in the dataset. It creates a bar chart with the x-axis representing the classes and the y-axis representing the mean occurrence (in percentage) of each class.

In [None]:
def plot_mean(labels, class_names, dataset_name):
    """Draw, for every class, the percentage of samples carrying that label.

    Args:
        labels: Class indices, one per sample; index ``i`` is counted towards
            ``class_names[i]``.
        class_names: Names of the classes; one bar is drawn per name, including
            classes that never occur in ``labels`` (drawn with height 0).
        dataset_name: Text placed at the start of the plot title.
    """
    bar_width = 0.25
    labels = np.asarray(labels)
    n_classes = len(class_names)

    # Iterate over every known class rather than over the labels actually
    # present: otherwise a class missing from the data leaves fewer bars than
    # tick labels and the plot call fails on the length mismatch.
    percentages = [
        np.mean(labels == class_index) * 100 if labels.size else 0.0
        for class_index in range(n_classes)
    ]
    index = np.arange(n_classes)

    plt.figure(figsize=(PLOT_WIDTH, PLOT_HEIGHT))
    # Matplotlib 3.6 renamed the "seaborn" style to "seaborn-v0_8" and later
    # releases dropped the old name; use whichever this installation has.
    style_name = "seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn"
    plt.style.use(style_name)
    plt.bar(
        index,
        percentages,
        bar_width,
        label=class_names,
        tick_label=class_names,  # also positions the x ticks on the bars
        color=COLORS,
    )
    plt.xlabel("X-ray images")
    plt.ylabel("Mean occurence (%)")
    plt.title(f"{dataset_name} - Mean occurence of each x-ray image")
    plt.legend()
    plt.tight_layout()
    plt.show()

The `plot_confusion_matrix` function visualizes a confusion matrix, which is a table layout that allows visualization of the performance of a classification model. The rows of the matrix represent the actual classes and the columns represent the predicted classes.

In [None]:
def plot_confusion_matrix(labels_true, labels_pred, class_names):
    """Draw the confusion matrix of ``labels_pred`` against ``labels_true`` as a heatmap.

    Args:
        labels_true: Actual labels, passed first to ``confusion_matrix``.
        labels_pred: Predicted labels, passed second to ``confusion_matrix``.
        class_names: Tick labels used on both axes of the heatmap.
    """
    counts = confusion_matrix(labels_true, labels_pred)
    heatmap_options = {
        "annot": True,
        "cmap": "YlGnBu",
        "fmt": "d",
        "xticklabels": class_names,
        "yticklabels": class_names,
    }

    plt.figure(figsize=(PLOT_WIDTH, PLOT_HEIGHT))
    sns.set_style("whitegrid")

    # heatmap draws on the current axes and returns them.
    axes = sns.heatmap(counts, **heatmap_options)
    axes.set_title("Confusion matrix")
    axes.set_xlabel("Predicted results")
    axes.set_ylabel("Actual results")

    plt.show()


This function, `plot_roc_curve`, plots the Receiver Operating Characteristic (ROC) curve for each class in a multi-class classification model. 

The ROC curve is a graphical plot that illustrates the diagnostic ability of a binary classifier system as its discrimination threshold is varied. 

The function calculates the ROC curve and the Area Under the Curve (AUC) for each class, as well as for the micro-average over all classes, and then plots these curves.

In [None]:
def plot_roc_curve(y_true, y_pred_probs, class_names, binary=False):
    """Plot ROC curves with their AUC in the legend.

    In binary mode a single curve is computed from ``y_true`` and
    ``y_pred_probs``. Otherwise one curve is computed per entry of
    ``class_names`` from the matching columns of the binarized labels and of
    ``y_pred_probs``, plus a micro-average curve over all columns.

    Args:
        y_true: True labels. In multi-class mode they are passed through
            ``label_binarize(y_true, classes=class_names)``.
        y_pred_probs: Predicted scores; in multi-class mode column ``i`` is
            scored against class ``i``.
        class_names: Class names, used for binarization and legend entries.
        binary: When true, draw the single binary curve.
    """
    # Each entry: (false positive rates, true positive rates, legend, line style).
    curves = []

    if binary:
        # One curve only: computing and plotting it once per class name would
        # only duplicate the line and its legend entry.
        fpr, tpr, _ = roc_curve(y_true, y_pred_probs)
        curves.append((fpr, tpr, f"Xray (AUC = {auc(fpr, tpr):.4f})", "-"))
    else:
        y_true_bin = label_binarize(y_true, classes=class_names)

        for i, class_name in enumerate(class_names):
            fpr, tpr, _ = roc_curve(y_true_bin[:, i], y_pred_probs[:, i])
            curves.append(
                (fpr, tpr, f"Xray {class_name} (AUC = {auc(fpr, tpr):.4f})", "-")
            )

        # Micro-average: every (sample, class) pair is one binary decision.
        fpr, tpr, _ = roc_curve(y_true_bin.ravel(), y_pred_probs.ravel())
        curves.append((fpr, tpr, f"Micro-average (AUC = {auc(fpr, tpr):.4f})", ":"))

    plt.figure(figsize=(PLOT_WIDTH, PLOT_HEIGHT))
    sns.set_style("whitegrid")

    for fpr, tpr, legend, line_style in curves:
        plt.plot(fpr, tpr, linestyle=line_style, label=legend)

    # Chance-level diagonal.
    plt.plot([0, 1], [0, 1], "k--")
    # NOTE(review): the x axis stops at a false positive rate of 0.2, as in the
    # original code -- confirm this zoom is intended.
    plt.xlim([0.0, 0.2])
    plt.ylim([0.0, 1.05])
    plt.xlabel("False positive rate")
    plt.ylabel("True positive rate")
    plt.title("Receiver Operating Characteristic curve")
    plt.legend(loc="lower right")
    plt.show()

The function `plot_history` plots the training and validation loss and accuracy recorded in the history object returned by the `model.fit()` training method.

In [None]:
def plot_history(history):
    """Plot the loss and accuracy curves stored in ``history.history``.

    The left subplot shows ``loss`` (and ``val_loss`` when present). The right
    subplot shows the accuracy metric: ``binary_accuracy`` when the history has
    it, otherwise the first non-validation key containing ``"accuracy"``
    (for example ``categorical_accuracy``), together with its ``val_``
    counterpart when present.

    Args:
        history: Object whose ``history`` attribute maps metric names to
            per-epoch value sequences.

    Raises:
        KeyError: If ``loss`` or any accuracy metric is missing.
    """
    metrics = history.history

    # Prefer the historical key, then fall back to whatever accuracy metric the
    # model was compiled with, so non-binary models can be plotted too.
    if "binary_accuracy" in metrics:
        accuracy_key = "binary_accuracy"
    else:
        accuracy_key = next(
            (
                key
                for key in metrics
                if "accuracy" in key and not key.startswith("val_")
            ),
            None,
        )
    if accuracy_key is None:
        raise KeyError("no accuracy metric found in history.history")

    plt.figure(figsize=(PLOT_WIDTH, PLOT_HEIGHT))

    plt.subplot(1, 2, 1)
    plt.plot(metrics["loss"], label="Training Loss")
    if "val_loss" in metrics:
        plt.plot(metrics["val_loss"], label="Validation Loss")
    plt.title("Training and Validation loss")
    plt.xlabel("Epochs")
    plt.ylabel("Loss")
    plt.legend()

    plt.subplot(1, 2, 2)
    plt.plot(metrics[accuracy_key], label="Training accuracy")
    validation_key = f"val_{accuracy_key}"
    if validation_key in metrics:
        plt.plot(metrics[validation_key], label="Validation accuracy")
    plt.title("Training and Validation accuracy")
    plt.xlabel("Epochs")
    plt.ylabel("Accuracy")
    plt.legend()
    plt.tight_layout()
    plt.show()

These functions provide visual insights into the distribution of the data and the performance of the classification model, making them valuable tools for model evaluation and interpretation.

## Model loader (scripts/x_ray_model_loader.py)

This class is designed to load, evaluate, and make predictions using a pre-trained deep learning model for chest X-ray image classification.
The model and data loading utilize TensorFlow and Keras, popular libraries for deep learning in Python, while the model evaluation uses scikit-learn, a library for machine learning in Python.

Upon initialization, the `ModelLoader` takes as input a batch size, a color mode, an image size and a label mode. It sets up a path to the test dataset and lists the image files to make predictions on. It then initializes a `Dataset` object to handle the test data, and builds it using TensorFlow's AUTOTUNE for optimal data loading.

In [None]:
import os
import math
import pathlib
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from sklearn.metrics import classification_report
from x_ray_data_viz import plot_confusion_matrix, plot_roc_curve
from x_ray_dataset_builder import Dataset


class ModelLoader:
    def __init__(
        self,
        btch_size=40,
        color="grayscale",
        img_size=(256, 256),
        label_mode="categorical",
    ):
        """List the prediction files and build the test dataset.

        The entries of ``data/prediction/`` are listed, then a ``Dataset`` over
        ``data/test`` is created with the given options and built with
        ``tf.data.AUTOTUNE``. No model is loaded here.

        Args:
            btch_size: Passed to ``Dataset`` as ``batch_size``.
            color: Passed to ``Dataset`` as ``color_mode``.
            img_size: Passed to ``Dataset`` as ``image_size``.
            label_mode: Passed to ``Dataset`` as ``label_mode``.
        """
        # Entry names only; the folder prefix is added again at prediction time.
        prediction_files = os.listdir("data/prediction/")

        test_data = Dataset(
            pathlib.Path("data/test"),
            batch_size=btch_size,
            color_mode=color,
            image_size=img_size,
            label_mode=label_mode,
        )
        test_data.build(tf.data.AUTOTUNE)

        # Model slots, still empty at this point.
        self.loaded_model = None
        self.probability_model = None

        # Test data taken over from the built dataset.
        self.class_names = test_data.get_class_names()
        self.test_ds = test_data.normalized_dataset
        self.x_test = test_data.x_dataset
        self.y_test = test_data.y_dataset

        self.pred_list = prediction_files


The `load` method of the class is used to load a trained model from a given path. It also sets up a probability model, which is the original model wrapped with a softmax layer. The softmax function is often used in the final layer of a neural network-based classifier to produce probabilities for each class.

In [None]:
    def load(self, model_pathname, **kwargs):
        """Load a saved Keras model and wrap it with a softmax layer.

        The loaded model is stored in ``self.loaded_model``; a ``Sequential``
        made of that model followed by a ``Softmax`` layer is stored in
        ``self.probability_model``. Progress messages are printed before and
        after.

        Args:
            model_pathname: Passed to ``tf.keras.models.load_model``.
            **kwargs: Extra keyword arguments passed to ``load_model``.
        """
        print("\n\033[94mModel loading...\033[0m")

        model = tf.keras.models.load_model(model_pathname, **kwargs)
        self.loaded_model = model

        softmax_head = tf.keras.layers.Softmax()
        self.probability_model = tf.keras.Sequential([model, softmax_head])

        print("\n\033[92mModel successfully loaded!\033[0m")

The `evaluate` method evaluates the loaded model on the test dataset. It prints the loss, the accuracy (binary or categorical, depending on the mode), the precision and the recall for the test dataset. It then calculates and displays the confusion matrix and ROC curve for the model's performance on the test data. Finally, it prints a classification report, which includes precision, recall, f1-score, and support for each class.

In [None]:
    def evaluate(self, binary=False):
        """Evaluate ``self.loaded_model`` on the test arrays and report the results.

        Prints the loss, accuracy, precision and recall returned by the model's
        ``evaluate`` call, shows the confusion matrix and the ROC curve, then
        prints the scikit-learn classification report.

        Args:
            binary: When true, predictions are thresholded at 0.5 and the true
                labels are used as stored; otherwise both are reduced with
                ``argmax`` along axis 1.
        """
        print("\n\033[94mEvaluating model...\033[0m\n")

        # The model's evaluate call must yield exactly four values, otherwise
        # this unpacking fails.
        test_loss, test_accuracy, test_precision, test_recall = (
            self.loaded_model.evaluate(self.x_test, self.y_test, verbose=2)
        )

        # Only the accuracy line differs between the two modes.
        if binary:
            accuracy_line = "\n\033[94mEvaluation binary accurancy is: %s\033[0m"
        else:
            accuracy_line = "\n\033[94mEvaluation categorical accurancy is: %s\033[0m"

        print("\n\033[94mEvaluation loss is: %s\033[0m" % test_loss)
        print(accuracy_line % test_accuracy)
        print("\n\033[94mEvaluation precision is: %s\033[0m" % test_precision)
        print("\n\033[94mEvaluation recall is: %s\n\033[0m" % test_recall)

        predictions = self.loaded_model.predict(self.x_test)

        if binary:
            y_test = self.y_test
            y_pred = (predictions > 0.5).astype(int).reshape(-1)
            roc_options = {"binary": True}
        else:
            y_test = np.argmax(self.y_test, axis=1)
            y_pred = np.argmax(predictions, axis=1)
            roc_options = {}

        plot_confusion_matrix(y_test, y_pred, class_names=self.class_names)

        # The ROC plot receives the labels as stored, not the reduced ones.
        plot_roc_curve(
            self.y_test,
            predictions,
            class_names=self.class_names,
            **roc_options,
        )

        print("\033[94m\nClassification Report:\033[0m")
        report = classification_report(
            y_test, y_pred, target_names=self.class_names, zero_division=0
        )
        print(report)



The `predict` method is used to make predictions on a set of images stored in a specific folder. It loads each image, processes it, and feeds it to the model for prediction. The predicted class and its probability are then displayed on the image and all images are shown in a grid format using matplotlib.

In [None]:
    def predict(self, color="grayscale", img_size=(180, 180), binary=False):
        """Predict every file of ``self.pred_list`` and show the results in a grid.

        Each file is loaded from ``data/prediction/``, turned into a batch of
        one image and passed to ``self.loaded_model``. The image is drawn in a
        four-column grid with the predicted class and its percentage as title.

        Args:
            color: Colour mode used when loading each image.
            img_size: Target size used when loading each image.
            binary: When true, the single output value is compared with 0.5;
                otherwise a softmax is applied to the output and the largest
                entry selects the class name.
        """
        # NOTE(review): the default img_size (180, 180) differs from the
        # (256, 256) default used for the test dataset, and images are fed
        # here without the 1/255 rescaling applied to the test data -- confirm
        # both against the model's expected input.
        num_cols = 4
        num_rows = math.ceil(len(self.pred_list) / num_cols)

        plt.figure(figsize=(20, 10))

        for position, file_name in enumerate(self.pred_list, start=1):
            img = tf.keras.utils.load_img(
                f"data/prediction/{file_name}",
                color_mode=color,
                target_size=img_size,
            )
            # Add a leading batch axis: the model is called on a batch of one.
            single_batch = tf.expand_dims(tf.keras.utils.img_to_array(img), 0)
            predictions = self.loaded_model.predict(single_batch)

            if binary:
                score = predictions[0][0]
                if score > 0.5:
                    caption = "Pneumonia ({:.2f}%)".format(100 * score)
                else:
                    caption = "Normal ({:.2f}%)".format(100 * (1 - score))
            else:
                score = tf.nn.softmax(predictions[0])
                caption = "{} ({:.2f}%)".format(
                    self.class_names[np.argmax(score)], 100 * np.max(score)
                )

            plt.subplot(num_rows, num_cols, position)
            plt.axis("off")
            plt.imshow(img, cmap="gray")
            plt.title(caption)

        plt.show()

In summary, this class provides a convenient way to load, evaluate, and use a pre-trained model for chest X-ray image classification.
It is essential because it lets us visualize the appropriate metrics to evaluate the model's performance.