In [1]:
import pandas as pd
import numpy as np

In [2]:
dfs = pd.read_csv("Chest_xray_Corona_dataset_Summary.csv")

In [3]:
dfs.head()

Unnamed: 0.1,Unnamed: 0,Label,Label_1_Virus_category,Label_2_Virus_category,Image_Count
0,0,Normal,,,1576
1,1,Pnemonia,Stress-Smoking,ARDS,2
2,2,Pnemonia,Virus,,1493
3,3,Pnemonia,Virus,COVID-19,58
4,4,Pnemonia,Virus,SARS,4


In [4]:
# loading metadata
df = pd.read_csv("Chest_xray_Corona_Metadata.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,X_ray_image_name,Label,Dataset_type,Label_2_Virus_category,Label_1_Virus_category
0,0,IM-0128-0001.jpeg,Normal,TRAIN,,
1,1,IM-0127-0001.jpeg,Normal,TRAIN,,
2,2,IM-0125-0001.jpeg,Normal,TRAIN,,
3,3,IM-0122-0001.jpeg,Normal,TRAIN,,
4,4,IM-0119-0001.jpeg,Normal,TRAIN,,


In [5]:
# we are not going to focus on the virus category because we need to classify the images as nornam vs pneumonia
df = df[["X_ray_image_name", "Label", "Dataset_type"]]
df = df.dropna()

In [6]:
df.head()

Unnamed: 0,X_ray_image_name,Label,Dataset_type
0,IM-0128-0001.jpeg,Normal,TRAIN
1,IM-0127-0001.jpeg,Normal,TRAIN
2,IM-0125-0001.jpeg,Normal,TRAIN
3,IM-0122-0001.jpeg,Normal,TRAIN
4,IM-0119-0001.jpeg,Normal,TRAIN


In [7]:
df.tail()

Unnamed: 0,X_ray_image_name,Label,Dataset_type
5905,person1637_virus_2834.jpeg,Pnemonia,TEST
5906,person1635_virus_2831.jpeg,Pnemonia,TEST
5907,person1634_virus_2830.jpeg,Pnemonia,TEST
5908,person1633_virus_2829.jpeg,Pnemonia,TEST
5909,person1632_virus_2827.jpeg,Pnemonia,TEST


In [8]:
# tqdm is a libraby for progress bar in python... it's not compulsory to use this... I am just giving it a try
from tqdm import tqdm

# we will be introducing new column to our frame... This will store the image location for the respective images
df["File_Path"] = 0

# the base path to images
base_path = "D:/adity/Projects/Chest_X_Ray_Classifier/Coronahack-Chest-XRay-Dataset/"

for index in tqdm(df.index):
    # now we will decide if the image is in test dir or train dir
    if df.loc[index, "Dataset_type"] == "TRAIN":
        path = base_path + "train/"
    else:
        path = base_path + "test/"
    # now we shall write the path to img
    df.loc[index, "File_Path"] = path + df.loc[index, "X_ray_image_name"]

100%|████████████████████████████████████████████████████████████████████████████| 5910/5910 [00:05<00:00, 1154.89it/s]


In [9]:
df.head()

Unnamed: 0,X_ray_image_name,Label,Dataset_type,File_Path
0,IM-0128-0001.jpeg,Normal,TRAIN,D:/adity/Projects/Chest_X_Ray_Classifier/Coron...
1,IM-0127-0001.jpeg,Normal,TRAIN,D:/adity/Projects/Chest_X_Ray_Classifier/Coron...
2,IM-0125-0001.jpeg,Normal,TRAIN,D:/adity/Projects/Chest_X_Ray_Classifier/Coron...
3,IM-0122-0001.jpeg,Normal,TRAIN,D:/adity/Projects/Chest_X_Ray_Classifier/Coron...
4,IM-0119-0001.jpeg,Normal,TRAIN,D:/adity/Projects/Chest_X_Ray_Classifier/Coron...


In [30]:
# Let's begin preprocessing

# importing my CNN
from my_model.Doctor import Doctor

# importing required packages
import matplotlib
matplotlib.use(backend="Agg")

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

from keras.preprocessing.image import ImageDataGenerator
from keras.optimizers import Adam
from keras.utils import np_utils

import matplotlib.pyplot as plt
import pickle
import cv2
import os

In [13]:
# so they are using around 10% images for testing
# lets do what they want

trainX = []
trainY = []
testX = []
testY = []

for index in tqdm(df.index):
    
    image = cv2.imread(df.loc[index, "File_Path"])
    image = cv2.resize(image, (64, 64))
    
    if df.loc[index, "Dataset_type"] == "TRAIN":
        trainX.append(image)
        trainY.append(df.loc[index, "Label"])
    else:
        testX.append(image)
        testY.append(df.loc[index, "Label"])

print(len(trainX), len(trainY))
print(len(testX), len(testY))

100%|██████████████████████████████████████████████████████████████████████████████| 5910/5910 [01:30<00:00, 64.98it/s]


5286 5286
624 624


In [14]:
# converting my dataset into numpy array and preprocess by scaling pixel intensities to range [0, 1]

# train dataset
trainX = np.array(trainX, dtype="float") / 255.0

# test dataset
testX = np.array(testX, dtype="float") / 255.0

print(len(trainX), len(testX))

5286 624


In [15]:
# Encoding the train labels currently as strings, to integers and then one-hot encode them

train_le = LabelEncoder()
trainY = train_le.fit_transform(trainY)
trainY = np_utils.to_categorical(y=trainY, num_classes=len(set(trainY)))

# Encoding the test labels currently as strings, to integers and then one-hot encode them

test_le = LabelEncoder()
testY = test_le.fit_transform(testY)
testY = np_utils.to_categorical(y=testY, num_classes=len(set(testY)))

print(len(trainY), len(testY))

5286 624


In [16]:
# construct the training image generator for data augmentation
# aug will be used further to generate images from our data

aug = ImageDataGenerator(rotation_range=36, zoom_range=0.2, width_shift_range=0.25, height_shift_range=0.25, shear_range=0.2, horizontal_flip=True, fill_mode="nearest")

In [17]:
learn_rate = 1e-4
batch_size = 15
epochs = 75

In [18]:
# Initializing the optimizer and compiling model

print("[INFO] Compiling Model...")
opt = Adam(learning_rate=learn_rate, decay=learn_rate/epochs)
model = Doctor.classify(width=64, height=64, depth=3, classes=len(train_le.classes_))
model.compile(loss="binary_crossentropy", optimizer=opt, metrics=["accuracy"])

[INFO] Compiling Model...


In [19]:
# Let's finally train our network

print("[INFO] Training network for: {} epochs...".format(epochs))
res = model.fit_generator(aug.flow(trainX, trainY, batch_size=batch_size), validation_data=(testX, testY), steps_per_epoch=len(trainX) // batch_size, epochs=epochs)

[INFO] Training network for: 75 epochs...
Epoch 1/75
Epoch 2/75


Epoch 3/75


Epoch 4/75


Epoch 5/75


Epoch 6/75


Epoch 7/75


Epoch 8/75


Epoch 9/75


Epoch 10/75


Epoch 11/75


Epoch 12/75


Epoch 13/75


Epoch 14/75


Epoch 15/75


Epoch 16/75


Epoch 17/75


Epoch 18/75


Epoch 19/75


Epoch 20/75


Epoch 21/75


Epoch 22/75


Epoch 23/75


Epoch 24/75


Epoch 25/75


Epoch 26/75


Epoch 27/75


Epoch 28/75


Epoch 29/75


Epoch 30/75


Epoch 31/75


Epoch 32/75


Epoch 33/75


Epoch 34/75


Epoch 35/75


Epoch 36/75


Epoch 37/75


Epoch 38/75


Epoch 39/75


Epoch 40/75


Epoch 41/75


Epoch 42/75


Epoch 43/75


Epoch 44/75


Epoch 45/75


Epoch 46/75


Epoch 47/75


Epoch 48/75


Epoch 49/75


Epoch 50/75


Epoch 51/75


Epoch 52/75


Epoch 53/75


Epoch 54/75


Epoch 55/75


Epoch 56/75


Epoch 57/75


Epoch 58/75


Epoch 59/75


Epoch 60/75


Epoch 61/75


Epoch 62/75


Epoch 63/75


Epoch 64/75


Epoch 65/75


Epoch 66/75


Epoch 67/75


Epoch 68/75


Epoch 69/75


Epoch 70/75


Epoch 71/75


Epoch 72/75


Epoch 73/75


Epoch 74/75


Epoch 75/75




In [20]:
# Evaluating the Network

print("[INFO] Evaluating the Network...")
preds = model.predict(testX, batch_size=batch_size)
print(classification_report(testY.argmax(axis=1), preds.argmax(axis=1), target_names=test_le.classes_))

[INFO] Evaluating the Network...
              precision    recall  f1-score   support

      Normal       0.71      0.79      0.74       234
    Pnemonia       0.86      0.81      0.83       390

    accuracy                           0.80       624
   macro avg       0.79      0.80      0.79       624
weighted avg       0.80      0.80      0.80       624



In [22]:
# we have a score of around 80%
# Save the Network i.e Model to Disk as it is more efficient

print("Serializing Network to disk")
model.save("doctor.model")

Serializing Network to disk


In [23]:
# Saving the Label Encoder to disk as well

print("Serializing Label Encoder to disk")
f = open("le.pickle", "wb")
f.write(pickle.dumps(train_le))
f.close()

Serializing Label Encoder to disk


In [31]:
#lets plot the training loss and accuracy for re-trained model

print("[INFO] Serializing the plotted graph of trained model to disk...")
plt.style.use("ggplot")
plt.figure()
plt.plot(np.arange(0, epochs), res.history["loss"], label="train_loss")
plt.plot(np.arange(0, epochs), res.history["val_loss"], label="val_loss")
plt.plot(np.arange(0, epochs), res.history["accuracy"], label="train_accuracy")
plt.plot(np.arange(0, epochs), res.history["val_accuracy"], label="val_accuracy")
plt.title("Training Loss and Accuracy on Dataset")
plt.xlabel("Epoch #")
plt.ylabel("Loss/Accuracy")
plt.legend(loc="lower left")
plt.savefig("plot.png")
print("[INFO] Saved plot as plot.png...")

[INFO] Serializing the plotted graph of trained model to disk...
[INFO] Saved plot as plot.png...
