In [1]:
# Required python libraries         
import numpy as np         
import os                  
from random import shuffle
import random
import matplotlib.pyplot as plt
import glob
from tqdm import tqdm 
import time
import pickle
from tqdm import tqdm
from statistics import mode


# OpenCV and scikit-learn
from sklearn.utils import resample
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics.pairwise import manhattan_distances, cosine_distances
from sklearn.metrics import silhouette_score
# from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.metrics import davies_bouldin_score
import cv2 

# Pandas 
# import pandas as pd

# Tensorflow
import tensorflow as tf
import pandas as pd
from tensorflow.keras.applications.densenet import DenseNet169
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.applications.resnet import ResNet101 
from tensorflow.keras.preprocessing.image import ImageDataGenerator

In [2]:
# Interactive configuration: numeric codes choosing the pretrained backbone and
# the distance metric. The backbone option is labelled Densenet169 because that
# is the network all_models() actually builds for code 3.
select_model = int(input("Enter the number for: \n 1) VGGNET16 \n 2) Resnet101  \n 3) Densenet169 "))
if select_model not in (1, 2, 3):
    # Fail here rather than much later, after the images have been loaded
    raise ValueError(f"select_model must be 1, 2 or 3, got {select_model}")

select_distance = int(input("Enter the number for: \n 1) Euclidean  \n 2) Manhattan \n 3) Cosine"))
if select_distance not in (1, 2, 3):
    raise ValueError(f"select_distance must be 1, 2 or 3, got {select_distance}")

In [3]:
# Read the space-separated annotation file (it has no header row) and then
# name its columns: image file, class label and the bounding-box corners.
data = pd.read_csv(
    "./ct_scan_dataset/all_image_paths.txt",
    sep=" ",
    header=None,
)
data.columns = ['filename', 'label', 'xmin','ymin','xmax','ymax']

In [4]:
# nor_img, pne_img, cov_img = [],[],[]
# Bucket every row index by its class label:
# label 0 -> nor_id, label 1 -> pne_id, any other label -> cov_id.
# (presumably normal / pneumonia / COVID - confirm against the dataset docs)
nor_id, pne_id, cov_id = [], [], []
id_count = 0
for i in range(len(data)):
    row_label = data["label"][i]
    if row_label == 0:
        target_ids = nor_id
    elif row_label == 1:
        target_ids = pne_id
    else:
        target_ids = cov_id
    target_ids.append(i)

In [7]:
# Seed once, then shuffle the three index lists in a fixed order
# (nor, pne, cov) so the same subsets are drawn on every run.
random.seed(42)
for class_ids in (nor_id, pne_id, cov_id):
    shuffle(class_ids)

# Keep a fixed-size prefix of each shuffled list.
nor_select = nor_id[:2500]
pne_select = pne_id[:2500]
cov_select = cov_id[:5000]

In [8]:
# count = 0     # Count to record the ids of files. Each file has a unique ID.
img_size = 224  # Side length (pixels) that every cropped image is resized to
def get_dataset(files,cov_select, nor_select, pne_select):
  """Load, crop and resize the selected images into a list of record dicts.

  Args:
    files: Table indexed by row id with "filename", "label", "xmin", "ymin",
      "xmax" and "ymax" columns.
    cov_select, nor_select, pne_select: Lists of row ids to load; they are
      processed in that order (cov, then nor, then pne).

  Returns:
    A list with one dict per image holding:
      "id"       - running counter starting at 0,
      "filepath" - path of the image under ./ct_scan_dataset/3A_images/,
      "image"    - bounding-box crop resized to (img_size, img_size),
      "label"    - 0 when the source label is 0 or 1, otherwise 1.

  Raises:
    FileNotFoundError: if an image cannot be read from disk.
  """
  image_dir = "./ct_scan_dataset/3A_images/"
  dataset=[]  # List to hold all the dataset. Each element is a dictionary
  for count, i in enumerate(tqdm(cov_select + nor_select + pne_select)):  # Loop over each file location
    filepath = os.path.join(image_dir, files["filename"][i])
    img = cv2.imread(filepath)
    if img is None:
      # cv2.imread reports a missing/unreadable file by returning None, which
      # would otherwise fail on the crop below with an unhelpful TypeError.
      raise FileNotFoundError(f"Could not read image: {filepath}")

    x_min, y_min, x_max, y_max = files["xmin"][i], files["ymin"][i], files["xmax"][i], files["ymax"][i]
    cropped_img = img[y_min:y_max,x_min:x_max,:]

    data_dict = {}
    data_dict["id"] = count
    data_dict["filepath"] = filepath
    data_dict["image"] = cv2.resize(cropped_img,(img_size,img_size))
    # Collapse the three source classes into a binary target
    if files["label"][i]== 0 or files["label"][i]==1:
      data_dict["label"]= 0
    else:
      data_dict["label"]=1
    dataset.append(data_dict)
  return dataset

In [None]:
# Build the list of image records for the selected rows (reads every image file from disk).
dataset = get_dataset(data, cov_select, nor_select, pne_select)

In [10]:
# Stack the image arrays in dataset order into one array for batched inference.
# A comprehension is used so that no loop variable leaks into the notebook
# namespace: a plain `for data in dataset` loop rebinds the global `data`
# (the annotation DataFrame) to a record dict and breaks re-running earlier cells.
image_only = np.array([sample["image"] for sample in dataset])
batch_size=2000  # Number of images handed to the network per predict call

In [11]:
# Wrap the image array in a Keras iterator that serves it in batches of
# batch_size; shuffling is disabled so batch order follows image_only.
img_datagen = ImageDataGenerator()
batch_img = img_datagen.flow(
    image_only,
    batch_size=batch_size,
    shuffle=False,
)

In [12]:
# Method to return the pretrained models
def all_models(img_size, model_sel):
  """Build the selected ImageNet-pretrained backbone without its classifier head.

  Args:
    img_size: Height and width of the square 3-channel input.
    model_sel: 1 for VGG16, 2 for ResNet101, 3 for DenseNet169.

  Returns:
    Tuple (model, feature_size) where feature_size is the number of values in
    one flattened output feature map of the model.

  Raises:
    ValueError: if model_sel is not 1, 2 or 3 (previously the function fell
      through and returned None, which failed on unpacking at the call site).
  """
  builders = {1: VGG16, 2: ResNet101, 3: DenseNet169}
  if model_sel not in builders:
    raise ValueError(f"model_sel must be 1, 2 or 3, got {model_sel!r}")

  pretrained = builders[model_sel](input_shape = (img_size, img_size, 3),include_top = False, weights ='imagenet')
  # Derive the flattened size from the model's output shape (batch, h, w, c)
  # instead of hard-coding it: the former constants 25088 / 100352 / 81536
  # only hold for img_size == 224.
  feature_size = int(np.prod(pretrained.output_shape[1:]))
  return pretrained, feature_size
  

In [None]:
# Run every batch through the backbone and collect one flat feature vector per image.
all_fea = []
model,feature_size= all_models(img_size, select_model)
for batch_idx in tqdm(range(len(batch_img))):
  batch = batch_img[batch_idx]
  # Size the reshape from the batch itself: the final batch may hold fewer than
  # batch_size images. This replaces a bare `except:` fallback that hid genuine
  # errors and ran the prediction a second time.
  features = model.predict(batch).reshape(len(batch), feature_size)
  all_fea.extend(features)

In [14]:
# Replace each record's pixel array with the feature vector at the same position in all_fea.
for idx, record in enumerate(dataset):
  record['image'] = all_fea[idx]

In [15]:
# Re-seed before shuffling so the record order is the same on every run.
random.seed(42)
random.shuffle(dataset)

In [17]:
def correct_mispredictions(query, fea_label,train_label, train_id, ind_data, decision,data_frame_1, count):
    """Add a labeled query to the reference set under its true label, logging it if mispredicted.

    Args:
        query: Record dict with "id", "label", "image" and "filepath" keys.
        fea_label, train_label, train_id: Dicts keyed by class label whose lists
            receive the query's feature, label and id respectively.
        ind_data: Position of the query, stored in the "Mistake index" column.
        decision: Labels of the nearest neighbours; their mode is the prediction.
        data_frame_1: Dict of column-name -> list used as the mistake log.
        count: Number of mispredictions seen so far.

    Returns:
        (count, data_frame_1, fea_label, train_label, train_id); count is
        incremented when the prediction disagreed with the true label, and the
        containers are the same objects that were passed in, updated in place.
    """
    actual = query["label"]
    predicted = mode(decision)

    if predicted != actual:
        # Prediction disagreed with the ground truth: record the mistake
        count += 1
        data_frame_1["Image name"].append(query["filepath"].split("/")[-1])
        data_frame_1["Mistake ID"].append(query['id'])
        data_frame_1["Original label"].append(actual)
        data_frame_1["Predicted label"].append(predicted)
        data_frame_1["Mistake index"].append(ind_data)

    # Right or wrong, the query always joins the reference set under its true label
    fea_label[actual].append(query["image"])
    train_label[actual].append(actual)
    train_id[actual].append(query['id'])
    return count,data_frame_1,fea_label,train_label,train_id

In [18]:
def _class_distances(query_vec, class_features, select_distance):
  """Return a 1-D array with the distance from query_vec to each vector in class_features."""
  if select_distance not in (1, 2, 3):
    raise ValueError(f"select_distance must be 1, 2 or 3, got {select_distance!r}")
  if len(class_features) == 0:
    # A class with no reference features contributes no neighbours
    return np.empty(0)

  if select_distance == 1:  # Euclidean distance, all rows at once
    return np.linalg.norm(query_vec - np.asarray(class_features), axis=1)

  exp_query = np.expand_dims(query_vec, axis=0)
  # ravel() (not squeeze()) keeps the result 1-D even for a single-row class;
  # squeeze() would produce a 0-d array that cannot be iterated.
  if select_distance == 2:  # Manhattan distance, (n, 1) -> (n,)
    return manhattan_distances(class_features, exp_query).ravel()
  return cosine_distances(exp_query, class_features).ravel()  # Cosine distance, (1, n) -> (n,)


def distance1(query, fea_label, select_distance, id_pred, label_pred, n_neighbours,count,train_label, train_id, ind_data, data_frame_1,supervised_data):
  """Classify one query by k-nearest-neighbour vote and add it to the reference set.

  Args:
    query: Record dict with "id", "label" and "image" (feature vector) keys.
    fea_label: Dict {0: [...], 1: [...]} of reference feature vectors per class.
    select_distance: 1 = Euclidean, 2 = Manhattan, 3 = Cosine.
    id_pred: Dict {0: [...], 1: [...]} collecting query ids per predicted class.
    label_pred: Dict {0: [...], 1: [...]} collecting (id, score) tuples, where
      score is the fraction of neighbours that belong to class 1.
    n_neighbours: Maximum number of nearest neighbours that vote.
    count, train_label, train_id, ind_data, data_frame_1: Passed through to
      correct_mispredictions; only used when supervised_data is true.
    supervised_data: When true the query's real label is used (and mistakes are
      logged); when false the query is filed under its predicted class.

  Returns:
    (fea_label, id_pred, label_pred, data_frame_1, count, train_label, train_id)

  Raises:
    ValueError: for an unknown select_distance, or when fea_label holds no
      reference features at all.
  """
  neg_dist = _class_distances(query['image'], fea_label[0], select_distance)
  pos_dist = _class_distances(query['image'], fea_label[1], select_distance)

  # Pair every distance with its class, then keep the closest n_neighbours.
  # Sorting tuples means equal distances are ordered by label (0 before 1).
  candidates = [(dist_single, 1) for dist_single in pos_dist]
  candidates.extend((dist_single, 0) for dist_single in neg_dist)
  tup_dist = sorted(candidates)[:n_neighbours]

  decision = [label for (_dist, label) in tup_dist]
  if not decision:
    raise ValueError("fea_label holds no reference features; cannot classify the query")

  if supervised_data:
    count,data_frame_1,fea_label,train_label,train_id=correct_mispredictions(query, fea_label,train_label,train_id, ind_data, decision,data_frame_1, count)

  else:
    # Normalise by the number of votes actually cast; this equals n_neighbours
    # whenever at least that many reference features exist.
    positive_fraction = decision.count(1)/len(decision)
    # Ties go to class 1, as a vote for class 0 needs a strict majority
    predicted = 0 if decision.count(0) > decision.count(1) else 1
    fea_label[predicted].append(query["image"])
    id_pred[predicted].append(query["id"])
    label_pred[predicted].append((query['id'],positive_fraction))

  return fea_label, id_pred, label_pred, data_frame_1, count, train_label, train_id

In [19]:
def classification_metrics(label_gt,id_pred):
  """Compute confusion-matrix counts and derived metrics from id lists.

  Args:
    label_gt: Dict {0: [...], 1: [...]} of sample ids per ground-truth class.
    id_pred: Dict {0: [...], 1: [...]} of sample ids per predicted class.

  Returns:
    (accuracy, specificity, sensitivity, TP, TN, FP, FN). A metric whose
    denominator is zero (e.g. specificity with no negative samples) is
    returned as NaN instead of raising ZeroDivisionError.
  """
  # Sets make each membership test O(1); the ids are scanned thousands of times
  gt_negative, gt_positive = set(label_gt[0]), set(label_gt[1])

  TP = sum(1 for sample_id in id_pred[1] if sample_id in gt_positive)  # class 1 predicted as 1
  TN = sum(1 for sample_id in id_pred[0] if sample_id in gt_negative)  # class 0 predicted as 0
  FP = sum(1 for sample_id in id_pred[1] if sample_id in gt_negative)  # class 0 predicted as 1
  FN = sum(1 for sample_id in id_pred[0] if sample_id in gt_positive)  # class 1 predicted as 0

  def _ratio(numerator, denominator):
    return numerator/denominator if denominator else float("nan")

  accuracy = _ratio(TP+TN, TP+TN+FP+FN)
  specificity = _ratio(TN, TN+FP)
  sensitivity = _ratio(TP, TP+FN)

  print("TP: ", TP)
  print("FP: ", FP)
  print("FN: ", FN)
  print("TN: ", TN)

  return accuracy, specificity, sensitivity,TP,TN,FP,FN

def roc_auc_curve(label_gt,label_pred):
  """Return the area under the ROC curve for the predicted scores.

  Args:
    label_gt: Dict {0: [...], 1: [...]} of (id, true label) tuples.
    label_pred: Dict {0: [...], 1: [...]} of (id, score) tuples.

  Both collections are sorted by tuple (id first) so that the labels and the
  scores line up element by element; this relies on both holding the same ids.
  """
  truth_pairs = label_gt[0] + label_gt[1]
  truth_pairs.sort()
  score_pairs = label_pred[0] + label_pred[1]
  score_pairs.sort()

  # Drop the ids, keeping only the aligned labels and scores
  y_test = [true_label for (_sample_id, true_label) in truth_pairs]
  y_scores = [score for (_sample_id, score) in score_pairs]

  fpr, tpr, _thresholds = roc_curve(y_test, y_scores)
  return auc(fpr, tpr)

def cluster_metrics(fea_label,train_label,id_pred):
  """Compute Dunn, Davies-Bouldin and silhouette scores for the two classes.

  Args:
    fea_label: Dict {0: [...], 1: [...]} of feature vectors per class.
    train_label: Dict {0: [...], 1: [...]}; only the list lengths are used.
    id_pred: Dict {0: [...], 1: [...]}; only the list lengths are used.

  Returns:
    (Dunn_index, davies_bouldin_index, silhouette_index)
  """
  print("Calculating Dunn's index...")
  # Dunn index: smallest between-class distance over largest within-class distance
  inter_dist = euclidean_distances(fea_label[0],fea_label[1]).min()
  intra_dist1 = euclidean_distances(fea_label[0]).max()
  intra_dist2 = euclidean_distances(fea_label[1]).max()
  max_intra_dist = intra_dist1 if intra_dist1>intra_dist2 else intra_dist2
  Dunn_index = inter_dist/max_intra_dist

  print("Calculating Davies Bouldin index...")

  # Davies Bouldin and Silhouette score from sklearn library.
  # One integer class label per feature row: all class-0 rows first, then class-1.
  n_class_0 = len(train_label[0]) + len(id_pred[0])
  n_class_1 = len(train_label[1]) + len(id_pred[1])
  class_all = np.concatenate((
      np.zeros(shape=(n_class_0), dtype=int),
      np.ones(shape=(n_class_1), dtype=int),
  ))
  feature_all = np.concatenate((fea_label[0],fea_label[1]))

  davies_bouldin_index = davies_bouldin_score(feature_all,class_all)
  silhouette_index = silhouette_score(feature_all,class_all)

  print("davies: ", davies_bouldin_index)
  print("silhouette_sklearn: ", silhouette_index)

  return Dunn_index,davies_bouldin_index, silhouette_index

In [20]:
# Labeled-set sizes swept by the experiment loop (one full run per size).
labeled_size = [200,400,800,1550]

In [21]:
def data_loader(dataset,n, offsets=(0, 3200, 6400)): # Method to return three sets of labeled dataset for experiment
  """Split dataset into several labeled/unlabeled partitions.

  For every start position in offsets, the n items beginning there form the
  labeled set and all remaining items (before and after that window) form the
  matching unlabeled set.

  Args:
    dataset: Sequence of records supporting slicing and "+" concatenation (e.g. a list).
    n: Number of labeled items per partition.
    offsets: Start index of each labeled window; the default reproduces the
      three partitions starting at 0, 3200 and 6400.

  Returns:
    (labeled_data, unlabeled_data): two lists with one entry per offset.
  """
  labeled_data, unlabeled_data = [], []
  for start in offsets:
    stop = start + n
    labeled_data.append(dataset[start:stop])
    unlabeled_data.append(dataset[:start] + dataset[stop:])
  return labeled_data, unlabeled_data

In [None]:
n_neighbours=31   # Number of nearest reference features that vote on each query
# Column-name -> list accumulator; one row is appended per (size, partition) run
data_frame = {"Labeled data": [],
              "Dataset": [],
              "Accuracy": [],
              "Specificity": [],
              "Sensitivity": [],
              "AUC":[],
              "Dunn index": [],
              "Davies Bouldin": [],
              "Silhouette index":[],
              "TP":[],
              "TN":[],
              "FP":[],
              "FN":[],
              "pos_labeled_img":[],
              "neg_labeled_img":[],
              "corrected_count":[]

}

# Name the mistake logs after the model and distance actually selected; the
# file name used to be fixed to "densenet121_manhattan" for every configuration.
model_names = {1: 'vgg16', 2: 'resnet101', 3: 'densenet169'}
distance_names = {1: 'euclidean', 2: 'manhattan', 3: 'cosine'}
run_tag = f"{model_names.get(select_model, select_model)}_{distance_names.get(select_distance, select_distance)}"

seed_size = 200   # Labeled samples that initialise the reference set; the rest are "supervised" queries
os.makedirs("./test", exist_ok=True)   # to_csv does not create missing directories


for size in labeled_size:
  labeled_data, unlabeled_data = data_loader(dataset, size)
  # Report the size of each partition (the outer lists always have one entry per partition)
  print(f"labeled data length {[len(split) for split in labeled_data]}")
  print(f"Unlabeled data length {[len(split) for split in unlabeled_data]}")

  # Run the experiment on every partition returned by data_loader: [d_0, d_1, d_2]
  for select in range(len(labeled_data)):
    # Log of supervised samples whose prediction disagreed with the true label
    data_frame_1 = {  "Image name": [],
                  "Mistake index": [],
                  "Mistake ID": [],
                  "Original label": [],
                  "Predicted label": []

}
    pos_img,neg_img=0,0

    label_gt = {0: [], 1: []}      # (id, true label) of every unlabeled sample
    train_label = {0: [], 1: []}   # Labels of the samples used as labeled references
    label_pred = {0: [], 1: []}    # (id, score) of every unlabeled sample, by predicted class
    id_gt = {0: [], 1: []}         # Ids of the unlabeled samples, by true class
    id_pred = {0: [], 1: []}       # Ids of the unlabeled samples, by predicted class
    fea_label = {0: [], 1: []}     # Reference feature vectors per class
    train_id = {0: [], 1: []}      # Ids of the samples used as labeled references

    training_data, supervised_data = labeled_data[select][:seed_size], labeled_data[select][seed_size:]

    # Phase 1: seed the reference set with the first labeled samples
    for sample in training_data:

      if sample["label"] == 1:
        fea_label[1].append(sample['image'])
        train_id[1].append(sample['id'])
        train_label[1].append((sample['id'],sample['label']))
        pos_img +=1

      else:
        fea_label[0].append(sample['image'])
        train_id[0].append(sample['id'])
        train_label[0].append((sample['id'],sample['label']))
        neg_img +=1

    print(f"Feature length neg: {neg_img}")
    print(f"Feature length pos: {pos_img} ")

    # Phase 2: predict the remaining labeled samples, count the mistakes and
    # add each sample to the reference set under its true label
    count,ind_data=0,seed_size
    for sample in supervised_data:
      fea_label, id_pred, label_pred, data_frame_1, count, train_label, train_id=distance1(sample,fea_label,select_distance,id_pred,label_pred,n_neighbours, count, train_label, train_id, ind_data, data_frame_1 ,supervised_data=True)

      ind_data +=1
    data_f_1 = pd.DataFrame.from_dict(data_frame_1)
    data_f_1.to_csv(f"./test/{run_tag}_mistake_{size}_{select}.csv",index=False)

    # Phase 3: classify the unlabeled samples; each one joins the reference set
    # under its predicted label
    for sample in tqdm(unlabeled_data[select]):
      if sample["label"]==1:
        id_gt[1].append(sample['id'])
        label_gt[1].append((sample['id'],sample['label']))

      else:
        id_gt[0].append(sample['id'])
        label_gt[0].append((sample['id'],sample['label']))

      fea_label,id_pred,label_pred,_,_,_,_ = distance1(sample,fea_label,select_distance,id_pred,label_pred,n_neighbours,count=None,train_label=None, train_id=None, ind_data=None, data_frame_1=None, supervised_data=False)
    accuracy, specificity, sensitivity,TP,TN,FP,FN= classification_metrics(id_gt,id_pred)
    dunn_index, davies_bouldin_index, silhouette_index = cluster_metrics(fea_label,train_label,id_pred)
    cl_auc = roc_auc_curve(label_gt,label_pred)
    data_frame["Labeled data"].append(size)
    data_frame["Dataset"].append(f"d_{select}")
    data_frame["Accuracy"].append(accuracy)
    data_frame["Specificity"].append(specificity)
    data_frame["Sensitivity"].append(sensitivity)
    data_frame["AUC"].append(cl_auc)
    data_frame["Dunn index"].append(dunn_index)
    data_frame["Davies Bouldin"].append(davies_bouldin_index)
    data_frame["Silhouette index"].append(silhouette_index)
    data_frame["TP"].append(TP)
    data_frame["TN"].append(TN)
    data_frame["FP"].append(FP)
    data_frame["FN"].append(FN)
    data_frame["pos_labeled_img"].append(pos_img)
    data_frame["neg_labeled_img"].append(neg_img)
    data_frame["corrected_count"].append(count)

    print(f"Labeled image: {size} \t Dataset: d_{select} \t Accuracy: {accuracy} \t Specificity: {specificity} \t Sensitivity: {sensitivity} \t Dunn index: {dunn_index}  \t Davies Bouldin: {davies_bouldin_index} \t Silhouette index: {silhouette_index} \t AUC: {cl_auc} \t Corrected count: {count}")

In [None]:
# Translate the numeric selections into the names used in the results file name.
# Explicit lookups replace the if/elif chains, which left s_model / s_distance
# unbound (or holding a stale value from an earlier run) for any other code.
model_names = {1: 'vgg16', 2: 'resnet101', 3: 'densenet169'}
distance_names = {1: 'euclidean', 2: 'manhattan', 3: 'cosine'}
if select_model not in model_names:
    raise ValueError(f"select_model must be 1, 2 or 3, got {select_model!r}")
if select_distance not in distance_names:
    raise ValueError(f"select_distance must be 1, 2 or 3, got {select_distance!r}")
s_model = model_names[select_model]
s_distance = distance_names[select_distance]

# Write the per-run metrics table; to_csv does not create missing directories.
os.makedirs("./test", exist_ok=True)
data_f=pd.DataFrame.from_dict(data_frame)
data_f.to_csv(f"./test/{s_model}_{s_distance}_dist.csv",index=False)