In [1]:
from imutils import paths
import matplotlib.pyplot as plt
import numpy as np
import cv2
import os

In [2]:
from sklearn.metrics import accuracy_score,roc_auc_score
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn import preprocessing
import warnings
warnings.filterwarnings('ignore')

In [3]:
def getCovidData(img_COVID_paths,img_NonCOVID_paths):
    print("[INFO]  start loading images...")
    data = []
    labels = []
    for imagePath in img_COVID_paths:
        label = imagePath.split(os.path.sep)[-2]
        image = cv2.imread(imagePath) 
        image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        image = cv2.resize(image, (224, 224))
        data.append(image)
        labels.append(label)

    for imagePath in img_NonCOVID_paths:
        label = imagePath.split(os.path.sep)[-2]
        image = cv2.imread(imagePath) ## all the images are stored as a numpy array
        image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        image = np.array(cv2.resize(image, (224, 224)))
        data.append(image)
        labels.append(label)

    # trans the string to int
    lb = preprocessing.LabelBinarizer() 
    labels_num = lb.fit_transform(labels).reshape(-1)
    # trans the img data
    data_list = []
    for image in data:
        #image_data = image.reshape(image.shape[0]*image.shape[1], )
        image_data = image.flatten()
        data_list.append(image_data)
    data_np = np.array(data_list)
    print("loading images success!")
    return data_np,labels,labels_num

In [4]:
img_path = 'Images-processed'
img_COVID_paths = list(paths.list_images(os.path.join(img_path,'CT_COVID')))
img_NonCOVID_paths = list(paths.list_images(os.path.join(img_path,'CT_NonCOVID')))
data,labels,labels_num = getCovidData(img_COVID_paths,img_NonCOVID_paths)

[INFO]  start loading images...
loading images success!


In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data, labels_num, test_size=0.2)

In [6]:
224*224

50176

In [9]:
# use the unsupervised method:PCA  http://scikit-learn.org.cn/view/610.html
from sklearn.decomposition import PCA
# 建立PCA模型，使用k个主要成分
n_components = 150  
print("Extracting the top %d features from %d covid dataset" % (n_components, X_train.shape[0]))
pca = PCA(n_components=n_components, svd_solver='randomized', whiten=True).fit(X_train)

X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)

Extracting the top 150 features from 596 covid dataset


In [7]:
len(X_train[0])

50176

In [10]:
len(X_train_pca[0])

150

In [11]:
from sklearn import svm
svm_clf = svm.SVC().fit(X_train, y_train)
svm_acc = accuracy_score(y_test,svm_clf.predict(X_test))
svm_auc = roc_auc_score(y_test,svm_clf.predict(X_test))
svm_precision = precision_score(y_test,svm_clf.predict(X_test))
svm_recall = recall_score(y_test,svm_clf.predict(X_test))
svm_f1 = recall_score(y_test,svm_clf.predict(X_test))
print('SVM :recall: {:.4f}, precision: {:.4f},F1: {:.4f}, accuracy: {:.4f}, AUC: {:.4f}'.format(
        svm_recall, svm_precision, svm_f1, svm_acc, svm_auc))

SVM :recall: 0.8025, precision: 0.8025,F1: 0.8025, accuracy: 0.7867, AUC: 0.7853


In [12]:
svm_pac_clf = svm.SVC().fit(X_train_pca, y_train)
svm_acc = accuracy_score(y_test,svm_pac_clf.predict(X_test_pca))
svm_auc = roc_auc_score(y_test,svm_pac_clf.predict(X_test_pca))
svm_precision = precision_score(y_test,svm_pac_clf.predict(X_test_pca))
svm_recall = recall_score(y_test,svm_pac_clf.predict(X_test_pca))
svm_f1 = recall_score(y_test,svm_pac_clf.predict(X_test_pca))
print('PCA -> SVM :recall: {:.4f}, precision: {:.4f},F1: {:.4f}, accuracy: {:.4f}, AUC: {:.4f}'.format(
        svm_recall, svm_precision, svm_f1, svm_acc, svm_auc))

PCA -> SVM :recall: 0.9012, precision: 0.7087,F1: 0.9012, accuracy: 0.7467, AUC: 0.7332


In [13]:
from sklearn.linear_model import LogisticRegression
lg_clf = LogisticRegression().fit(X_train, y_train)
lg_acc = accuracy_score(y_test,lg_clf.predict(X_test))
lg_auc = roc_auc_score(y_test,lg_clf.predict(X_test))
lg_precision = precision_score(y_test,lg_clf.predict(X_test))
lg_recall = recall_score(y_test,lg_clf.predict(X_test))
lg_f1 = recall_score(y_test,lg_clf.predict(X_test))
print('LogisticRegression :recall: {:.4f}, precision: {:.4f},F1: {:.4f}, accuracy: {:.4f}, AUC: {:.4f}'.format(
        lg_recall, lg_precision, lg_f1, lg_acc, lg_auc))

LogisticRegression :recall: 0.7901, precision: 0.7273,F1: 0.7901, accuracy: 0.7267, AUC: 0.7211


In [14]:
lg_pca_clf = LogisticRegression().fit(X_train_pca, y_train)
lg_acc = accuracy_score(y_test,lg_pca_clf.predict(X_test_pca))
lg_auc = roc_auc_score(y_test,lg_pca_clf.predict(X_test_pca))
lg_precision = precision_score(y_test,lg_pca_clf.predict(X_test_pca))
lg_recall = recall_score(y_test,lg_pca_clf.predict(X_test_pca))
lg_f1 = recall_score(y_test,lg_pca_clf.predict(X_test_pca))
print('LogisticRegression :recall: {:.4f}, precision: {:.4f},F1: {:.4f}, accuracy: {:.4f}, AUC: {:.4f}'.format(
        lg_recall, lg_precision, lg_f1, lg_acc, lg_auc))

LogisticRegression :recall: 0.7531, precision: 0.7262,F1: 0.7531, accuracy: 0.7133, AUC: 0.7099


In [15]:
from sklearn.ensemble import RandomForestClassifier 
rftree_clf = RandomForestClassifier().fit(X_train, y_train)
rftree_acc = accuracy_score(y_test,rftree_clf.predict(X_test))
rftree_auc = roc_auc_score(y_test,rftree_clf.predict(X_test))
rftree_precision = precision_score(y_test,rftree_clf.predict(X_test))
rftree_recall = recall_score(y_test,rftree_clf.predict(X_test))
rftree_f1 = recall_score(y_test,rftree_clf.predict(X_test))
print('RandomForestClassifier :recall: {:.4f}, precision: {:.4f},F1: {:.4f}, accuracy: {:.4f}, AUC: {:.4f}'.format(
        rftree_recall, rftree_precision, rftree_f1, rftree_acc, rftree_auc))

RandomForestClassifier :recall: 0.8025, precision: 0.8333,F1: 0.8025, accuracy: 0.8067, AUC: 0.8070


In [17]:
rftree_pca_clf = RandomForestClassifier().fit(X_train_pca, y_train)
rftree_acc = accuracy_score(y_test,rftree_pca_clf.predict(X_test_pca))
rftree_auc = roc_auc_score(y_test,rftree_pca_clf.predict(X_test_pca))
rftree_precision = precision_score(y_test,rftree_pca_clf.predict(X_test_pca))
rftree_recall = recall_score(y_test,rftree_pca_clf.predict(X_test_pca))
rftree_f1 = recall_score(y_test,rftree_pca_clf.predict(X_test_pca))
print('PCA -> RandomForestClassifier :recall: {:.4f}, precision: {:.4f},F1: {:.4f}, accuracy: {:.4f}, AUC: {:.4f}'.format(
        rftree_recall, rftree_precision, rftree_f1, rftree_acc, rftree_auc))

PCA -> RandomForestClassifier :recall: 0.8519, precision: 0.7188,F1: 0.8519, accuracy: 0.7400, AUC: 0.7303
