In [38]:
# !pip install numpy
# !pip install scikit-learn
# !pip install matplotlib
# !pip install imbalanced-learn
# !pip install pandas
# !pip install more-itertools
# !pip install scipy==1.7.1
# !pip install scikit-optimize
# !pip install opencv-python==4.5.5.62


In [1]:
import os
import cv2
import time
import numpy as np
import pandas as pd
from itertools import product
import pylatexenc
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.svm import LinearSVC, NuSVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
from skimage import exposure
import matplotlib.pyplot as plt
import scipy

In [2]:
# Print the Python, OS and library versions this notebook ran with, so that
# the results below can be tied to a concrete environment.
import sklearn
sklearn.show_versions() 


System:
    python: 3.7.13 (default, Apr 20 2022, 19:06:16)  [GCC 10.2.1 20210110]
executable: /usr/local/bin/python3
   machine: Linux-5.10.76-linuxkit-x86_64-with-debian-11.3

Python dependencies:
          pip: 22.0.4
   setuptools: 57.5.0
      sklearn: 1.0.2
        numpy: 1.21.5
        scipy: 1.7.1
       Cython: None
       pandas: 1.3.5
   matplotlib: 3.5.1
       joblib: 1.1.0
threadpoolctl: 3.1.0

Built with OpenMP: True


## Data loading and preprocessing

In [3]:
# Load the image metadata table (one row per X-ray image); the bare `data`
# expression on the last line renders the frame as the cell output.
data = pd.read_csv('corona_metadata.csv')
data

Unnamed: 0,index,X_ray_image_name,Label,Dataset_type,Label_2_Virus_category,Label_1_Virus_category
0,0,IM-0128-0001.jpeg,Normal,TRAIN,,
1,1,IM-0127-0001.jpeg,Normal,TRAIN,,
2,2,IM-0125-0001.jpeg,Normal,TRAIN,,
3,3,IM-0122-0001.jpeg,Normal,TRAIN,,
4,4,IM-0119-0001.jpeg,Normal,TRAIN,,
...,...,...,...,...,...,...
5905,5928,person1637_virus_2834.jpeg,Pnemonia,TEST,,Virus
5906,5929,person1635_virus_2831.jpeg,Pnemonia,TEST,,Virus
5907,5930,person1634_virus_2830.jpeg,Pnemonia,TEST,,Virus
5908,5931,person1633_virus_2829.jpeg,Pnemonia,TEST,,Virus


In [4]:
# Keep only the rows whose Dataset_type is "TRAIN" and display them.
data2 = data[data["Dataset_type"] =="TRAIN"]
data2

Unnamed: 0,index,X_ray_image_name,Label,Dataset_type,Label_2_Virus_category,Label_1_Virus_category
0,0,IM-0128-0001.jpeg,Normal,TRAIN,,
1,1,IM-0127-0001.jpeg,Normal,TRAIN,,
2,2,IM-0125-0001.jpeg,Normal,TRAIN,,
3,3,IM-0122-0001.jpeg,Normal,TRAIN,,
4,4,IM-0119-0001.jpeg,Normal,TRAIN,,
...,...,...,...,...,...,...
5281,5304,1-s2.0-S0929664620300449-gr2_lrg-c.jpg,Pnemonia,TRAIN,COVID-19,Virus
5282,5305,1-s2.0-S0929664620300449-gr2_lrg-b.jpg,Pnemonia,TRAIN,COVID-19,Virus
5283,5306,1-s2.0-S0929664620300449-gr2_lrg-a.jpg,Pnemonia,TRAIN,COVID-19,Virus
5284,5307,1-s2.0-S0140673620303706-fx1_lrg.jpg,Pnemonia,TRAIN,COVID-19,Virus


In [5]:
def get_data(assinglabels, metadata_path='corona_metadata.csv', root_path='corona_images',
             image_size=(320, 428)):
    """Load the TRAIN X-ray images as resized grayscale arrays with numeric labels.

    Every row of the metadata CSV whose ``Dataset_type`` is ``"TRAIN"`` is
    visited once. The image and its label are appended together, so the two
    returned lists always have the same length and stay aligned index by index.

    Parameters
    ----------
    assinglabels : sequence of length 2
        ``assinglabels[0]`` is the code given to rows labelled ``'Pnemonia'``
        and ``assinglabels[1]`` the code given to rows labelled ``'Normal'``.
    metadata_path : str, optional
        CSV file with the ``X_ray_image_name``, ``Label`` and ``Dataset_type``
        columns.
    root_path : str, optional
        Directory that contains the image files named in the CSV.
    image_size : tuple of int, optional
        ``dsize`` handed to ``cv2.resize``, i.e. ``(width, height)``.

    Returns
    -------
    resized_data : list
        One resized grayscale image per usable row.
    labels : list
        The numeric code of each image in ``resized_data``.

    Notes
    -----
    Rows are skipped (image and label together) when the image cannot be read
    or when the label is neither ``'Pnemonia'`` nor ``'Normal'``.
    """
    metadata = pd.read_csv(metadata_path)
    train_rows = metadata[metadata["Dataset_type"] == "TRAIN"]

    # 'Pnemonia' is spelled exactly as it appears in the CSV.
    label_codes = {'Pnemonia': assinglabels[0], 'Normal': assinglabels[1]}

    resized_data = []
    labels = []

    for name, label in zip(train_rows['X_ray_image_name'], train_rows['Label']):
        if label not in label_codes:
            continue  # Unknown label: skip the row so images and labels stay aligned

        image = cv2.imread(os.path.join(str(root_path), name), cv2.IMREAD_GRAYSCALE)
        if image is None:
            continue  # Unreadable or missing file: drop this row's label as well

        resized_data.append(cv2.resize(image, image_size))
        labels.append(label_codes[label])

    return resized_data, labels

In [6]:
# Label encodings in the order get_data expects:
# [code for 'Pnemonia', code for 'Normal'].
assinglabels1 = [0,1]  # Pnemonia -> 0, Normal -> 1 (so label 1 is the Normal class)
assinglabels2 = [1, -1]  # Pnemonia -> 1, Normal -> -1 (alternative encoding; not used in this cell)
image_data, image_labels = get_data(assinglabels1)

In [7]:
# Flatten every image into one feature row. The row width is derived from the
# image shape instead of being hard-coded (136960 = 428 * 320), so it stays
# correct if the resize dimensions change, and the `newshape=` keyword, which
# recent NumPy releases deprecate, is no longer needed.
image_data = np.asarray(image_data)
image_data = image_data.reshape(len(image_data), int(np.prod(image_data.shape[1:])))

In [8]:
# Hold out 20% of the samples for testing; random_state fixes the shuffle so
# the same split is produced on every run.
train_data, test_data, train_labels, test_label = train_test_split(image_data, image_labels, test_size=0.2,
                                                                    random_state=123)

Finally, let's standardize and normalize the data. Remember to apply dimensionality reduction.

In [9]:
# Standardize
# NOTE: this cell overwrites train_data / test_data, so it must be run exactly
# once after the train/test split; re-running it would rescale the already
# reduced features.

# Standardize: statistics are fitted on the training set only and reused for
# the test set.
ss = StandardScaler()
train_data = ss.fit_transform(train_data)
test_data = ss.transform(test_data)

# Reduce dimensions.
N_DIM = 10
# With this many features and so few components, PCA's default solver is the
# randomized SVD; seeding it makes the projection repeatable, in line with the
# seeded train/test split.
pca = PCA(n_components=N_DIM, random_state=123)
train_data = pca.fit_transform(train_data)
test_data = pca.transform(test_data)

# Normalize each component to [-1, 1] using the training-set range; test values
# outside that range are not clipped.
mms = MinMaxScaler((-1, 1))
train_data = mms.fit_transform(train_data)
test_data = mms.transform(test_data)

In [10]:
# SVM with a linear kernel
# SVM with a linear kernel
start_time = time.perf_counter()  # Monotonic clock, intended for measuring intervals
# The solver shuffles the data internally; seed it so the scores are repeatable.
linear_svc = LinearSVC(random_state=123)
linear_svc.fit(train_data, train_labels)
linear_svc_predictions = linear_svc.predict(test_data)
# The metrics use scikit-learn's default pos_label=1, so they describe whichever
# class was encoded as 1 when the labels were built.
print('Linear SVC precision: {}\n Linear SVC recall score: {}\n Linear SVC F1-score: {}'.
      format(precision_score(test_label, linear_svc_predictions), recall_score(test_label, linear_svc_predictions),
             f1_score(test_label, linear_svc_predictions)))
# Read the clock once so the seconds and minutes figures refer to the same interval.
elapsed = time.perf_counter() - start_time
print(
    "\nRunning time: {} seconds ({} minutes)".format(
        round(elapsed, 3), round(elapsed / 60, 1)
    )
)

Linear SVC precision: 0.899581589958159
 Linear SVC recall score: 0.7992565055762082
 Linear SVC F1-score: 0.8464566929133858

Running time: 0.27 seconds (0.0 minutes)


In [11]:
# AdaBoost ensemble classifier
# AdaBoost ensemble classifier
start_time = time.perf_counter()  # Monotonic clock, intended for measuring intervals
n_estimators = 50
print(f"Number of Decision Tree Classifiers: {n_estimators}")
# random_state seeds the base trees at every boosting round, making the fitted
# ensemble (and therefore the scores) repeatable.
adaboost = AdaBoostClassifier(DecisionTreeClassifier(max_depth=3), n_estimators=n_estimators,
                              random_state=123)
adaboost.fit(train_data, train_labels)
adaboost_predictions = adaboost.predict(test_data)
# The metrics use scikit-learn's default pos_label=1, so they describe whichever
# class was encoded as 1 when the labels were built.
print('AdaBoost precision: {}\n AdaBoost recall score: {}\n AdaBoost F1-score: {}'.
      format(precision_score(test_label, adaboost_predictions), recall_score(test_label, adaboost_predictions),
             f1_score(test_label, adaboost_predictions)))
# Read the clock once so the seconds and minutes figures refer to the same interval.
elapsed = time.perf_counter() - start_time
print(
    "\nRunning time: {} seconds ({} minutes)".format(
        round(elapsed, 3), round(elapsed / 60, 1)
    )
)

Number of Decision Tree Classifiers: 50
AdaBoost precision: 0.8524590163934426
 AdaBoost recall score: 0.7732342007434945
 AdaBoost F1-score: 0.810916179337232

Running time: 1.201 seconds (0.0 minutes)


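## Results labels 1/-1

With the `[1, -1]` encoding, Pneumonia is label 1 and Normal is label -1. scikit-learn's binary metrics score label 1 by default, so the figures below describe the **pneumonia** class.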

- Linear SVC precision: 0.9340659340659341
- Linear SVC recall score: 0.9695817490494296
- Linear SVC F1-score: 0.9514925373134328

- Running time: 0.27 seconds (0.0 minutes)

- AdaBoost precision: 0.9280487804878049
- AdaBoost recall score: 0.9645120405576679
- AdaBoost F1-score: 0.9459291485394654

- Running time: 0.969 seconds (0.0 minutes)

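## Results labels 0/1

With the `[0, 1]` encoding, Pneumonia is label 0 and Normal is label 1, so the figures below describe the **Normal** class. They are therefore not directly comparable with the 1/-1 results above, which score the pneumonia class.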

- Linear SVC precision: 0.899581589958159
- Linear SVC recall score: 0.7992565055762082
- Linear SVC F1-score: 0.8464566929133858

- Running time: 0.27 seconds (0.0 minutes)

- Number of Decision Tree Classifiers: 50
- AdaBoost precision: 0.8524590163934426
- AdaBoost recall score: 0.7732342007434945
- AdaBoost F1-score: 0.810916179337232

- Running time: 1.201 seconds (0.0 minutes)