In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns
style.use("ggplot")
from skimage import io
from skimage.transform import rescale, resize, downscale_local_mean
#import cv2
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, learning_curve, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc, ConfusionMatrixDisplay
from sklearn.pipeline import Pipeline
from yellowbrick.model_selection import learning_curve
import pickle 

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.preprocessing import image as Image
from tensorflow.keras.applications import VGG19
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.applications import InceptionV3
from tensorflow.keras.utils import plot_model
from keras.callbacks import ModelCheckpoint, LearningRateScheduler, CSVLogger
from keras.callbacks import ReduceLROnPlateau
import warnings
from datetime import datetime
import shutil
warnings.filterwarnings('ignore')
# Base directory of the dataset; load_from_directory() prepends it to the
# relative folder it is given.
root = "../input/lyme-disease-dataset-cleaned/"

In [None]:
def augment_img(image):
    """Return a randomly augmented version of a single image.

    The image goes through a random brightness shift, a random crop and a
    random horizontal flip, each step consuming the output of the previous one.

    Parameters:
        image: 3-D image array/tensor indexed as (height, width, channels).

    Returns:
        The augmented image, as returned by ``tf.image.random_flip_left_right``.
    """
    # NOTE(review): max_delta is expressed in the units of the input; the only
    # caller visible in this notebook passes 0-255 pixel data, for which 0.5 is
    # a very small shift -- confirm the intended strength.
    image_new = tf.image.random_brightness(image, max_delta=0.5)
    # The brightness shift can push values slightly outside the pixel range
    # (assumed 0..255 here, matching the later division by 255 in the loader),
    # and negative values would break models that require non-negative features.
    image_new = tf.clip_by_value(image_new, 0, 255)
    # The crop size is the input's own shape, so the output shape is preserved;
    # the channel count is read from the image instead of being fixed to 3.
    image_new = tf.image.random_crop(
        image_new, size=[image.shape[0], image.shape[1], image.shape[2]]
    )
    image_new = tf.image.random_flip_left_right(image_new)
    return image_new

In [None]:
def load_from_directory(path, color_mode='rgb', normalize=True, augment=False):
    """Load the ``Lyme_Negative`` / ``Lyme_Positive`` image folders into arrays.

    Images are read from ``root + path + "/Lyme_<category>"`` and resized to
    224x224. Files that cannot be loaded are reported and skipped. As a side
    effect the first loaded image is drawn with ``plt.imshow``.

    Parameters:
        path (str): Folder, relative to the module-level ``root``, that
            contains the two class sub-folders.
        color_mode (str): Forwarded to ``Image.load_img``.
        normalize (bool): If True, divide the pixel array by 255.
        augment (bool): If True, append one ``augment_img`` copy right after
            every loaded image (with the same label).

    Returns:
        tuple: ``(images, labels)`` as numpy arrays; labels are 0 for
        "Negative" and 1 for "Positive".
    """
    images = []
    labels = []
    DIMS = (224, 224, 3)
    path_root = root + path
    categories = ["Negative", "Positive"]

    # The label is the position of the class in `categories`. This matches the
    # alphabetical encoding of "Lyme_Negative"/"Lyme_Positive" and stays stable
    # even when one of the folders yields no images.
    for class_index, category in enumerate(categories):
        class_dir = os.path.join(path_root, "Lyme_" + category)
        for file_name in os.listdir(class_dir):
            img_path = os.path.join(class_dir, file_name)
            try:
                pil_img = Image.load_img(img_path, target_size=(DIMS[0], DIMS[1]), color_mode=color_mode)
                img_tensor = Image.img_to_array(pil_img)

                images.append(img_tensor)
                labels.append(class_index)

                if augment:
                    # Convert to a plain array so the final stack is homogeneous.
                    aug_img_tensor = np.asarray(augment_img(img_tensor))
                    images.append(aug_img_tensor)
                    labels.append(class_index)
            except Exception as e:
                # Best effort: report which file failed and keep loading the rest.
                print(f"Skipping {img_path}: {e}")

    images = np.array(images)
    labels = np.array(labels, dtype=np.int64)
    # Preview the first image; skipped when nothing could be loaded.
    if len(images) > 0:
        plt.imshow(Image.array_to_img(images[0]))
    if normalize:
        images /= 255.
    return (images, labels)

In [None]:
# Dataset folders, relative to `root`; each one is passed to load_from_directory().
train_path = "RashData/Train/Train_2_Cases"
test_path = "RashData/Validation/Validation_2_Cases"

In [None]:
# Training images are loaded with augment=True, so every image is followed by
# one augmented copy; the test images are loaded without augmentation.
train_images, train_labels = load_from_directory(train_path, augment=True)
test_images, test_labels = load_from_directory(test_path)

# Balanced Dataset

In [None]:
def getTrainTest(x, y, size=0.2, random_state=None):
    """Build a class-balanced train/test split for binary labels.

    Both classes are shuffled and truncated to the size of the smaller class.
    That balanced subset is split with ``train_test_split``; the samples left
    over from the larger class are appended to the test set only, so the
    training set stays balanced. The shapes of the intermediate arrays are
    printed.

    Parameters:
        x (np.ndarray): Samples, indexed along axis 0.
        y (np.ndarray): Labels aligned with ``x``; 1 = positive, 0 = negative.
        size (float): Forwarded to ``train_test_split`` as ``test_size``.
        random_state (int | None): Seed for the shuffles and the split. With
            the default ``None`` the global numpy random state is used.

    Returns:
        tuple: ``(x_train, x_test, y_train, y_test)``.
    """
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # Boolean indexing copies, so shuffling does not reorder the caller's array.
    x_positive = x[y == 1]
    x_negative = x[y == 0]
    rng.shuffle(x_positive)
    rng.shuffle(x_negative)

    # Balance on the smaller class; either class may be the larger one.
    n_bal = min(x_positive.shape[0], x_negative.shape[0])
    x_pos_bal, x_pos_rest = x_positive[:n_bal], x_positive[n_bal:]
    x_neg_bal, x_neg_rest = x_negative[:n_bal], x_negative[n_bal:]

    x_bal = np.concatenate((x_pos_bal, x_neg_bal), axis=0)
    y_bal = np.concatenate((np.ones(n_bal, dtype='int'), np.zeros(n_bal, dtype='int')))
    print(x_bal.shape)
    print(y_bal.shape)

    x_train, x_test, y_train, y_test = train_test_split(
        x_bal, y_bal, test_size=size, shuffle=True, random_state=random_state
    )
    print(x_train.shape)
    print(y_train.shape)

    # Surplus samples of the larger class are only used for evaluation.
    x_test = np.concatenate((x_test, x_neg_rest, x_pos_rest), axis=0)
    y_test = np.concatenate((
        y_test,
        np.zeros(x_neg_rest.shape[0], dtype='int'),
        np.ones(x_pos_rest.shape[0], dtype='int'),
    ))
    print(x_test.shape)
    print(y_test.shape)
    return (x_train, x_test, y_train, y_test)


In [None]:
# Pool both loaded sets and re-split them: class-balanced via getTrainTest()
# when `balanced` is True, otherwise a plain 80/20 split.
# NOTE(review): train_images was loaded with augment=True, so an image and its
# augmented copy can land on opposite sides of this split -- confirm that this
# is acceptable for the evaluation.
balanced = True
images = np.concatenate((train_images, test_images), axis=0)
labels = np.concatenate((train_labels, test_labels), axis=0)
if balanced:
    x_train, x_test, y_train, y_test = getTrainTest(images, labels)
else:
    x_train, x_test, y_train, y_test = train_test_split(images, labels, test_size = 0.2, random_state=0, shuffle=True)
    
# Flatten every image into a single feature vector (one row per sample).
x_train = np.reshape(x_train, (x_train.shape[0], -1))
x_test = np.reshape(x_test, (x_test.shape[0], -1))

In [None]:
def getConfusionMatrix(model, validation, isLabelEncoded=False):
    """Score a fitted classifier on a held-out set.

    Parameters:
        model: Fitted estimator exposing ``predict``.
        validation (tuple): ``(features, true_labels)``.
        isLabelEncoded (bool): Accepted but not used by this function.

    Returns:
        tuple: ``(display, report)`` where ``display`` is a
        ``ConfusionMatrixDisplay`` labelled Negative/Positive and ``report`` is
        the ``classification_report`` output as a dict.
    """
    features, targets = validation[0], validation[1]
    predictions = model.predict(features)

    matrix = confusion_matrix(targets, predictions)
    display = ConfusionMatrixDisplay(
        confusion_matrix=matrix,
        display_labels=["Negative", "Positive"],
    )
    report = classification_report(targets, predictions, output_dict=True)
    return (display, report)

In [None]:
# Pool the loaded train and test sets again (same concatenation as in the
# balanced-split cell) and show the resulting shapes.
images = np.concatenate((train_images, test_images), axis=0)
labels = np.concatenate((train_labels, test_labels), axis=0)
print(images.shape)
print(labels.shape)

In [None]:
# Flatten every image into one feature row: (n_samples, n_features).
data = np.reshape(images, (images.shape[0], -1))
data.shape

In [None]:
# Plain seeded 80/20 split of the flattened data.
# NOTE(review): when the notebook runs top to bottom this rebinds
# x_train/x_test/y_train/y_test, replacing the values produced by the earlier
# `balanced` cell, so the models below are fitted on this split -- confirm intended.
x_train, x_test, y_train, y_test = train_test_split(data, labels, test_size = 0.2, random_state=42, shuffle=True)

In [None]:
def plot_model_comparison(cr_list, model_names):
    """
    Plot per-class precision, F1 score, recall and support for every model
    as grouped bar charts (2x2 grid) and save the figure to
    ``./Figures/comparison.png``.

        Parameters:
            cr_list (array): Classification report dicts, one per model. Each
                must contain "Positive" and "Negative" entries holding
                "precision", "recall", "f1-score" and "support".
            model_names (array): Model names, in the same order as cr_list.
    """
    # (class key in the report, bar colour); drawing order defines the legend.
    classes = (("Positive", "crimson"), ("Negative", "darkcyan"))
    # (metric key in the report, panel title, y-axis label), row by row.
    panels = (
        ("precision", "Precision Comparison", "Precision"),
        ("f1-score", "F1 Score Comparison", "f1 score"),
        ("recall", "Recall Comparison", "recall"),
        ("support", "Support Comparison", "support"),
    )
    bar_width = 0.25
    X = np.arange(len(model_names))

    fig, axes = plt.subplots(2, 2, figsize=(20, 10))
    for ax, (metric, title, ylabel) in zip(axes.flat, panels):
        for position, (cls, color) in enumerate(classes):
            values = [cr[cls][metric] for cr in cr_list]
            ax.bar(X + position * bar_width, values, color=color, width=bar_width, label=cls)
        ax.set_title(title)
        ax.set_xlabel("Models")
        ax.set_ylabel(ylabel)
        # Put each model name under the middle of its pair of bars. Setting
        # ticks and labels separately also works on older matplotlib releases.
        ax.set_xticks(X + bar_width / 2)
        ax.set_xticklabels(model_names)
        ax.legend()

    fig.tight_layout(pad=2.0)
    # savefig does not create missing directories.
    os.makedirs("./Figures", exist_ok=True)
    fig.savefig("./Figures/comparison.png", bbox_inches="tight")

# Testing each SVM kernel and assessing its scores to choose the best kernel

In [None]:
# Class display names, used by train_svm() as classification_report target_names.
# NOTE(review): this rebinds `labels`, which until this cell held the encoded
# label array.
labels = ["Negative", "Positive"]
# Fitted SVC per kernel name, filled by train_svm().
svms = {}
kernel_names = ["poly", "sigmoid", "rbf"]
# One classification-report dict per trained kernel, in training order.
cl_kernel = []
def train_svm(kernel, x_train, y_train, x_test, y_test):
    """Fit an SVC with the given kernel and record the outcome.

    The fitted model is stored in the module-level ``svms`` dict under the
    kernel name, and its classification report (as a dict, using the
    module-level ``labels`` as target names) is appended to ``cl_kernel``.
    The kernel name is printed first. Nothing is returned.
    """
    print(kernel)
    classifier = SVC(kernel=kernel, gamma='auto', C=50, probability=True)
    classifier.fit(x_train, y_train)
    svms[kernel] = classifier

    report = classification_report(
        y_test,
        classifier.predict(x_test),
        target_names=labels,
        output_dict=True,
    )
    cl_kernel.append(report)

In [None]:
# Start from empty containers so re-running this cell does not append a second
# set of reports to cl_kernel (which would no longer line up with kernel_names
# in the comparison plot).
svms.clear()
cl_kernel.clear()
for kernel in kernel_names:
    train_svm(kernel, x_train, y_train, x_test, y_test)

In [None]:
# Compare the per-class metrics of the kernels trained above.
plot_model_comparison(cl_kernel, kernel_names)

In [None]:
# Overall accuracy of each kernel's report, in the order the kernels were trained.
for cl in cl_kernel:
    print(cl["accuracy"])

In [None]:
# Sanity-check the split. Only the last expression of a cell is displayed, so
# the shapes have to be printed explicitly to show up.
print(x_train.shape)
print(y_train.shape)
print(y_train[0:10])

# Logistic Regression

In [None]:
log_reg = LogisticRegression(random_state=42)
# `learning_curve` here is the yellowbrick quick method (imported after the
# scikit-learn one). By default it calls show(), after which plt.savefig can no
# longer capture the plot; show=False only finalizes the figure so it can be saved.
log_reg_curve = learning_curve(log_reg, x_train, y_train, cv=5, scoring='accuracy', show=False)
print(log_reg_curve)
plt.savefig("./log_reg_yellow.png", bbox_inches="tight")

In [None]:
log_reg.fit(x_train, y_train)
# Context manager so the file handle is flushed and closed deterministically.
with open("model_log_reg.sav", "wb") as model_file:
    pickle.dump(log_reg, model_file)
# Kept as the last expression so the test accuracy is actually displayed.
log_reg.score(x_test, y_test)

In [None]:
# Confusion-matrix display and classification-report dict on the held-out split.
log_reg_disp, log_reg_cr = getConfusionMatrix(log_reg, (x_test, y_test))

In [None]:
log_reg_disp.plot()
# Hide the grid lines drawn by the ggplot style. The `b` keyword was renamed to
# `visible` and later removed from matplotlib; the positional form works everywhere.
plt.grid(False)
print(log_reg_cr)

# SVM

In [None]:
# SVM with a sigmoid kernel; plot its 5-fold learning curve on the training split.
svm = SVC(kernel="sigmoid", C=20, gamma="auto")
print(learning_curve(svm, x_train, y_train, cv=5, scoring="accuracy"))

In [None]:
svm.fit(x_train, y_train)
svm_disp, svm_cr = getConfusionMatrix(svm, (x_test, y_test))
# Context manager so the file handle is flushed and closed deterministically.
with open("model_svm.sav", "wb") as model_file:
    pickle.dump(svm, model_file)

In [None]:
svm_disp.plot()
# Hide the grid lines drawn by the ggplot style. The `b` keyword was renamed to
# `visible` and later removed from matplotlib; the positional form works everywhere.
plt.grid(False)
print(svm_cr)

# K Nearest Neighbor

In [None]:
# k-nearest-neighbours with k=10; plot its 5-fold learning curve on the training split.
knn = KNeighborsClassifier(10)
print(learning_curve(knn, x_train, y_train, cv=5, scoring="accuracy"))

In [None]:
knn.fit(x_train, y_train)
knn_disp, knn_cr = getConfusionMatrix(knn, (x_test, y_test))
# Context manager so the file handle is flushed and closed deterministically.
with open("model_knn.sav", "wb") as model_file:
    pickle.dump(knn, model_file)

In [None]:
knn_disp.plot()
# Hide the grid lines drawn by the ggplot style. The `b` keyword was renamed to
# `visible` and later removed from matplotlib; the positional form works everywhere.
plt.grid(False)
print(knn_cr)

# Random Forest

In [None]:
# Random forest with 150 trees (seeded); plot its 5-fold learning curve on the training split.
rf = RandomForestClassifier(n_estimators=150, random_state=42)
print(learning_curve(rf, x_train, y_train, cv=5, scoring="accuracy"))

In [None]:
rf.fit(x_train, y_train)
rf_disp, rf_cr = getConfusionMatrix(rf, (x_test, y_test))
# Context manager so the file handle is flushed and closed deterministically.
with open("model_rf.sav", "wb") as model_file:
    pickle.dump(rf, model_file)

In [None]:
rf_disp.plot()
# Hide the grid lines drawn by the ggplot style. The `b` keyword was renamed to
# `visible` and later removed from matplotlib; the positional form works everywhere.
plt.grid(False)
print(rf_cr)

# Naive Bayes

In [None]:
# Multinomial naive Bayes; plot its 5-fold learning curve on the training split.
nb = MultinomialNB()
print(learning_curve(nb, x_train, y_train, cv=5, scoring="accuracy"))

In [None]:
nb.fit(x_train, y_train)
nb_disp, nb_cr = getConfusionMatrix(nb, (x_test, y_test))
# Context manager so the file handle is flushed and closed deterministically.
with open("model_NB.sav", "wb") as model_file:
    pickle.dump(nb, model_file)

In [None]:
nb_disp.plot()
# Hide the grid lines drawn by the ggplot style. The `b` keyword was renamed to
# `visible` and later removed from matplotlib; the positional form works everywhere.
plt.grid(False)
print(nb_cr)

# Collecting the classification reports (saved with pickle) for further visualization

In [None]:
names = ["LR", "SVM", "KNN", "RF", "NB"]
cr_list = [log_reg_cr, svm_cr, knn_cr, rf_cr, nb_cr]
# Rename the per-class entries from the encoded labels "1"/"0" to the names
# plot_model_comparison() looks up. The membership check makes the cell safe to
# re-run: reports that were already renamed are left untouched instead of
# raising KeyError.
for cr in cr_list:
    for encoded, readable in (("1", "Positive"), ("0", "Negative")):
        if encoded in cr:
            cr[readable] = cr.pop(encoded)

In [None]:
# Spot-check one report after the key rename.
print(nb_cr)

In [None]:
# Compare the per-class metrics of all five models.
plot_model_comparison(cr_list, names)

In [None]:
# Persist the model names and their reports; the context manager guarantees the
# file is flushed and closed.
with open("./crs_ann", "wb") as reports_file:
    pickle.dump({"name": names, "cr": cr_list}, reports_file)

In [None]:
# One row per model: its name and overall accuracy expressed as a percentage.
accuracy_percent = [report["accuracy"] * 100 for report in cr_list]
accuracy = pd.DataFrame({"Model": names, "Accuracy": accuracy_percent})

In [None]:
# Preview the accuracy table.
accuracy.head()

In [None]:
# (rows, columns) of the accuracy table.
accuracy.shape

In [None]:
# Line plot of overall accuracy (in percent) across the models, in table order.
sns.lineplot(x="Model", y="Accuracy", data=accuracy)