In [1]:
# Required python libraries         
import numpy as np         
import os                  
from random import shuffle
import random
import matplotlib.pyplot as plt
import glob
from tqdm import tqdm 
import time
import pickle
from tqdm import tqdm
from statistics import mode


# OpenCV and scikit-learn
from sklearn.utils import resample
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics.pairwise import manhattan_distances, cosine_distances
from sklearn.metrics import silhouette_score
# from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.metrics import davies_bouldin_score
from sklearn.cluster import KMeans
import cv2 

# Pandas 
# import pandas as pd

# Tensorflow
# import tensorflow as tf
import pandas as pd
# from tensorflow.keras.applications.densenet import DenseNet169
# from tensorflow.keras.applications.vgg16 import VGG16
# from tensorflow.keras.applications.resnet import ResNet101 
# from tensorflow.keras.preprocessing.image import ImageDataGenerator

In [2]:
select_model = int(input("Enter the number for: \n 1) VGGNET16 \n 2) Resnet101  \n 3) Densenet161 "))

select_distance = int(input("Enter the number for: \n 1) Euclidean  \n 2) Manhattan \n 3) Cosine"))

In [3]:
dataset_path = "./x-ray_dataset"

In [4]:
labels = [0,1] # 1 = Covid // 0 = Noncovid 

all_files=[]

for i in os.listdir(dataset_path):  # Get all the files from the directory in a two element list. First element is list of file location to covid images and second element is list of file location to non-covid images.
  file1 = glob.glob(os.path.join(dataset_path,i, "*.png"))
  file2 = glob.glob(os.path.join(dataset_path,i, "*.jpg")) # .jpg files are also present.
  file1.extend(file2)  # Only extends when there is .jpg file present
  all_files.append(file1)

In [5]:
count=0     # Count to record the ids of files. Each file has a unique ID.
img_size = 224
def get_dataset(files, label,count):        
  dataset=[]  # List to hold all the dataset. Each element is a dictionary
  
  for j in tqdm(files):  # Loop over each file location
    data_dict = {}  
    data_dict["id"] = count
    data_dict["filepath"] = j
    img=cv2.imread(j)
    img = cv2.resize(img,(img_size,img_size))
    data_dict["image"]= img
    data_dict["label"]= label
    count +=1
    dataset.append(data_dict)
  return dataset, count

In [None]:
c_dataset, nc_dataset, t_dataset = [], [], []  

for i,data in enumerate(all_files[1:]):
  dataset,count=get_dataset(data,labels[i],count)
  if labels[i]==1:
    c_dataset = dataset
  else:
    nc_dataset = dataset
t_dataset = c_dataset + nc_dataset

In [7]:
print(len(t_dataset))
batch_size=2000

4715


In [8]:
image_only, label_only, id_only, img_name = [], [], [], []
for data in t_dataset:
  image_only.append(data["image"])
  label_only.append(data["label"]) 
  id_only.append(data['id'])
  img_name.append(data["filepath"].split("/")[-1])
image_only=np.array(image_only)

In [9]:
img_datagen = ImageDataGenerator()
batch_img= img_datagen.flow(image_only, batch_size=batch_size, shuffle = False)

In [10]:
def all_models(img_size, model_sel):
 
  if model_sel == 1:
    vgg_pre_t = VGG16(input_shape = (img_size, img_size, 3),include_top = False, weights ='imagenet')
    return vgg_pre_t, 25088

  elif model_sel==2:
    resnet_pre_t= ResNet101(input_shape = (img_size, img_size, 3),include_top=False, weights='imagenet')
    return resnet_pre_t, 100352

  elif model_sel==3:
    densenet169_pre_t = DenseNet169(input_shape = (img_size, img_size, 3),include_top = False, weights ='imagenet' )
    return densenet169_pre_t, 81536
  

In [None]:
all_fea = []
model,feature_size= all_models(img_size, select_model)
for data in tqdm(range(len(batch_img))):
  try:
    features = model.predict(batch_img[data]).flatten().reshape(batch_size,feature_size)
  except:
    img_len=len(batch_img[data])
    features = model.predict(batch_img[data]).flatten().reshape(img_len,feature_size)
  all_fea.extend(features)

In [13]:
for i in range(len(t_dataset)):
  t_dataset[i]['image']= all_fea[i]

In [2]:
with open('./pickle_files/image_net/VGGNet16.pickle','rb') as handle:
   t_dataset  = pickle.load(handle)

In [3]:
len(t_dataset)

4715

In [4]:
random.seed(42)
shuffle(t_dataset)
t_dataset.pop(0)

{'id': 283,
 'filepath': '/content/gdrive/My Drive/Dataset/Covid/MIDRC-RICORD-1C-419639-003112-48985-0.png',
 'image_features': array([ 0.       ,  5.3459306,  2.563743 , ...,  0.       , 18.554409 ,
         0.       ], dtype=float32),
 'label': 1}

In [5]:
def sub_clusters(features):
    kmeans = KMeans(n_clusters=8, random_state=0, n_init="auto").fit(features)
    output= kmeans.labels_
    clusters = [np.squeeze(np.array(features)[[np.where(output==i)[0]]],axis=0) for i in range(len(np.unique(output)))]
    return kmeans.cluster_centers_, clusters

In [34]:
def data_separation(dataset,label):
    add_data= []
    i=0
    while len(add_data)!=20:
        if dataset[i]["label"]==label:
            add_data.append(dataset[i]['image_features'])
            del dataset[i]
        i+=1
    return add_data

In [147]:
def mean_features(positive, negative):
    # print(f"pure_pf: {positive}")
    # print(f"p_type: {type(positive)}")
    # print(f"len_p: {len(positive)}")
    mpos_features=np.array([np.mean(i,axis=0) for i in positive])  # Mean of all positive sub clusters 
    mneg_features=np.array([np.mean(i,axis=0) for i in negative])  # Mean of all negative sub clusters
    # print(mpos_features)
    return mpos_features, mneg_features

In [223]:
a=np.array([[[2],[5],[22],[90]], 
            [[52],[28],[1]],
            [[44],[28,27],[54],[97],[89]],
            [[42],[35],[82]]],dtype=object)
# np.argmax(np.array(a))
# b=np.concatenate((a[-1],np.array([[69]])),axis=0)
# b = np.append(a[0],np.array([[99,20,0,9]]),axis=0)
# b=np.array([99,20,0,9])
# np.insert(a,np.array([[99,20,0,9]]))
# c=np.stack((a,b))
# z=list(a)
# z.append(np.array([[43,67,86]]))
a[0]=np.concatenate((a[0],[[71068]]),axis=0)
a


ValueError: all the input arrays must have same number of dimensions, but the array at index 0 has 2 dimension(s) and the array at index 1 has 1 dimension(s)

In [224]:
def update_subclusters(all_dist, query, fea_label, id_pred, label_pred, features, decision, n_neighbours, cluster):
    max_ind=np.argmax(all_dist)
    features[max_ind]=np.concatenate((features[max_ind],np.expand_dims(query["image_features"], axis=0)),axis=0)
    # fea_label[cluster].append(np.expand_dims(query["image_features"], axis=0))  # Have doubt here
    id_pred[cluster].append(query["id"])
    label_pred[cluster].append((query['id'],decision.count(1)/n_neighbours))
    return features, fea_label, id_pred, label_pred

In [231]:
def correct_mispredictions(query, fea_label,train_label, train_id, ind_data, decision,data_frame_1, count, pos_dist, neg_dist, pos_features, neg_features):
    if mode(decision) != query["label"]:
        # print("here")
        count +=1 
        data_frame_1["Image name"].append(query["filepath"].split("/")[-1])
        data_frame_1["Mistake ID"].append(query['id'])
        data_frame_1["Original label"].append(query['label'])
        data_frame_1["Predicted label"].append(mode(decision))
        data_frame_1["Mistake index"].append(ind_data)
        if query["label"]==1:
            pos_features_list= list(pos_features)
            pos_features_list.append(np.expand_dims(query["image_features"], axis=0))
            pos_features = np.array(pos_features_list)
            # pos_features= np.concatenate((pos_features,np.expand_dims(query["image_features"], axis=0)),axis=0)
        else:
            neg_features_list= list(neg_features)
            neg_features_list.append(np.expand_dims(query["image_features"], axis=0))
            neg_features = np.array(neg_features_list)
            # neg_features = np.concatenate((neg_features,np.expand_dims(query["image_features"], axis=0)),axis=0)
        train_label[query['label']].append(query["label"])
        train_id[query['label']].append(query['id'])

    else:
        if query['label'] == 0:
            max_ind = np.argmax(neg_dist)
            neg_features[max_ind] = np.concatenate((neg_features[max_ind],np.expand_dims(query["image_features"], axis=0)),axis=0)
            # fea_label[query['label']].append(np.concatenate((fea_label[query['label']],np.expand_dims(query["image_features"], axis=0)),axis=0))
        else:
            max_ind = np.argmax(pos_dist)

            pos_features[max_ind] = np.concatenate((pos_features[max_ind],np.expand_dims(query["image_features"], axis=0)),axis=0)
            # fea_label[query['label']].append(np.concatenate((mpos_features,np.expand_dims(query["image_features"], axis=0)),axis=0))
        train_label[query['label']].append(query["label"])
        train_id[query['label']].append(query['id'])
    return count,data_frame_1,fea_label,train_label,train_id,pos_features,neg_features

In [232]:
def distance2(query, fea_label, select_distance, id_pred, label_pred, n_neighbours, count, train_label, train_id, ind_data, data_frame_1, pos_features, neg_features, supervised_data): # Query is the raw dictionary (from pickle file) // fea_label is dictionary of {0: [], 1:[]} (distance) // select distance is int
  exp_query = np.expand_dims(query['image_features'], axis=0)
  pos_tup, neg_tup = [], []

  if select_distance==1: # Euclidean distance
    # print(f"Type: {type(fea_label[0])}")
    # print(f"Shape: {fea_label[0].shape}")
    neg_dist = np.linalg.norm(query['image_features']- fea_label[0], axis=1)  # Calculating the Euclidean distance using numpy (axis=1) to calculate all at ones   
    pos_dist = np.linalg.norm(query['image_features']- fea_label[1], axis=1)

  # elif select_distance==2: # Manhattan distance
  #   neg_dist = np.squeeze(manhattan_distances(fea_label[0],exp_query))  # convert (1,n) to (,n)
  #   pos_dist=np.squeeze(manhattan_distances(fea_label[1],exp_query))

  # elif select_distance==3: # Cosine distance
  #   neg_dist = np.squeeze(cosine_distances(exp_query,fea_label[0]))  # convert (1,n) to (,n)
  #   pos_dist=np.squeeze(cosine_distances(exp_query,fea_label[1]))
  
  for dist_single in pos_dist:
    # print(dist_single)
    pos_tup.append((dist_single,1))

  for dist_single in neg_dist:
    neg_tup.append((dist_single,0))

  pos_tup.extend(neg_tup)
  tup_dist = sorted(pos_tup)[:n_neighbours]
  
  decision = [y for (x,y) in tup_dist]

  if supervised_data:
    count,data_frame_1,fea_label,train_label,train_id, pos_features,neg_features=correct_mispredictions(query, fea_label,train_label,train_id, ind_data, decision,data_frame_1, count, pos_dist, neg_dist, pos_features, neg_features)
    
  else:
    if decision.count(0) > decision.count(1):
      neg_features, fea_label, id_pred, label_pred = update_subclusters(neg_dist,query,fea_label,id_pred,label_pred,neg_features, decision, n_neighbours, cluster=0)
      
    else:
      pos_features, fea_label, id_pred, label_pred = update_subclusters(pos_dist,query,fea_label,id_pred,label_pred,pos_features, decision,n_neighbours, cluster=1)
  
  return id_pred, label_pred, data_frame_1, count, train_label, train_id, pos_features, neg_features

In [241]:
def classification_metrics(label_gt,id_pred):
  TP,FP,FN,TN = 0,0,0,0

  for tp in id_pred[1]:   # TP --> When correctly classified covid
    if tp in label_gt[1]:
      TP +=1

  for tn in id_pred[0]:  # TN --> When correctly classified healthy (non-covid)
    if tn in label_gt[0]:
      TN +=1

  for fp in id_pred[1]: # FP --> When incorrectly classified healthy (Classified healthy as covid)
    if fp in label_gt[0]:
      FP +=1

  for fn in id_pred[0]: # FN --> When missed covid classification (Covid cases missed)
    if fn in label_gt[1]:
      FN +=1

  accuracy= (TP+TN)/(TP+TN+FP+FN)
  specificity = TN/(TN+FP)
  sensitivity = (TP)/(TP+FN)
  # f1_score = (2*precision*recall)/(precision + recall)
  
  print("TP: ", TP)
  print("FP: ", FP)
  print("FN: ", FN)
  print("TN: ", TN)

  return accuracy, specificity, sensitivity,TP,TN,FP,FN

def roc_auc_curve(label_gt,label_pred):
  gt_labels= sorted(label_gt[0]+ label_gt[1])  # Contains (id,labels) tuple of binary class 
  pred_labels = sorted(label_pred[0]+label_pred[1]) # Contains (id,labels) tuple of binary class --> sorted to match each element in gt_labels and pred_labels
  y_test = [y for (x,y) in gt_labels]   # Get only the labels
  y_scores = [y for (x,y) in pred_labels]
  fpr, tpr, threshold = roc_curve(y_test, y_scores)
  roc_auc = auc(fpr, tpr)
  return roc_auc

def cluster_metrics(pos_features, neg_features, train_label,id_pred):
  print("Calculating Dunn's index...")
  intra_dist1 = euclidean_distances(neg_features).max()
  intra_dist2 = euclidean_distances(pos_features).max()
  inter_dist = euclidean_distances(neg_features,pos_features).min()

  if intra_dist1>intra_dist2:
    max_intra_dist= intra_dist1  
  else:
    max_intra_dist = intra_dist2 

  Dunn_index = inter_dist/max_intra_dist

  print("Calculating Davies Bouldin index...")

  # Davies Bouldin and Silhouette score from sklearn library.
  class_0 =np.concatenate((np.zeros(shape=(len(train_label[0])),dtype=int),np.zeros(shape=(len(id_pred[0])),dtype=int),np.zeros(shape=(20),dtype=int)))
  class_1 = np.concatenate((np.ones(shape=(len(train_label[1])),dtype=int),np.ones(shape=(len(id_pred[1])),dtype=int),np.zeros(shape=(20),dtype=int)))
  class_all = np.concatenate((class_0,class_1))
  feature_all = np.concatenate((neg_features,pos_features))

  davies_bouldin_index = davies_bouldin_score(feature_all,class_all)
  silhouette_index = silhouette_score(feature_all,class_all)

  print("davies: ", davies_bouldin_index)
  print("silhouette_sklearn: ", silhouette_index)
  
  return Dunn_index,davies_bouldin_index, silhouette_index

In [245]:
# labeled_size = [200,400,800,1100,1300,1550]
labeled_size = [1550]
def data_loader(dataset,n): # Method to return three sets of labeled dataset for experiment
  labeled_data, unlabeled_data = [], [] 

  l_data = dataset[:n]    # First dataset // labeled
  ul_data = dataset[n:]   # First dataset // unlabeled
  labeled_data.append(l_data)
  unlabeled_data.append(ul_data)

  l_data = dataset[1500:1500+n]    # second dataset // labeled
  ul_data = dataset[:1500]+dataset[1500+n:]
  labeled_data.append(l_data)
  unlabeled_data.append(ul_data)

  l_data = dataset[3000:3000+n]     # Third dataset // labeled
  ul_data = dataset[:3000]+dataset[3000+n:]
  labeled_data.append(l_data)
  unlabeled_data.append(ul_data)
  return labeled_data, unlabeled_data

In [246]:
def flatten_features(features):
    all_features = []
    for i in features:
        for j in i:
            all_features.append(j)
    return all_features

In [248]:
n_neighbours=15

data_frame = {"Labeled data": [],
              "Dataset": [],
              "Accuracy": [],
              "Specificity": [],
              "Sensitivity": [],
              "AUC":[],
              "Dunn index": [],
              "Davies Bouldin": [],
              "Silhouette index":[],
              "TP":[],
              "TN":[],
              "FP":[],
              "FN":[],
              "pos_labeled_img":[],
              "neg_labeled_img":[],
              "corrected_count":[]
    
}
# fea_label1={0: [],
#             1:[]}


for size in labeled_size:
  labeled_data, unlabeled_data = data_loader(t_dataset, size)
#   print(f"labeled data length {len(labeled_data)}")
#   print(f"Unlabeled data length {len(unlabeled_data)}")
  select=0         # To select the dataset out of three sets ==> three sets: [d11, d12, d13] ==> eg: [200,200,200]




  while(select < 3):
    data_frame_1 = {  "Image name": [],
                  "Mistake index": [],
                  "Mistake ID": [],
                  "Original label": [],
                  "Predicted label": []
                  
    }
    pos_img, neg_img=0, 0

    fpos, fneg= [], []

    label_gt = {0: [],    
        1 :[]}    
                            # Collect the ground truth (label) of all the predicting images
    train_label = {0: [],    
        1 :[]}    

    label_pred = {0: [],
        1 :[]}               # Collect the predicted label for all the images

    id_gt = {0: [], 
            1: [] }         # Collect the ground truth (id) of all the predicting images

    id_pred = {0: [],
            1: []}        # Collect the predicted id for all the images 

    fea_label = {0: [],
            1: []}

    train_id ={0: [],
            1:[]}
        
    # print(type(labeled_data[0][0]))
    # for data in labeled_data[select]:
    #     if data["label"] == 1:
    #         fpos.append(data['image_features'])
    #         train_id[1].append(data['id'])
    #         train_label[1].append((data['id'],data['label']))
    #         pos_img +=1

    #     else:
    #         fneg.append(data['image_features'])
    #         train_id[0].append(data['id'])
    #         train_label[0].append((data['id'],data['label']))
    #         neg_img +=1

    # print(f"Blen: {len(labeled_data[select])}")
    fpositive = data_separation(labeled_data[select],1)    # Get 20 features of each class

    
    fnegative = data_separation(labeled_data[select],0)


    mneg_features,neg_features= sub_clusters(fnegative)  # Get the subclusters (Using K-means algorithm)
    mpos_features,pos_features= sub_clusters(fpositive)    

        

    count, ind_data=0, 40
    for data in labeled_data[select]:
        fea_label={0: mneg_features,
            1: mpos_features}
        id_pred, label_pred, data_frame_1, count, train_label, train_id, pos_features, neg_features= distance2(data,fea_label,1,id_pred,label_pred,n_neighbours, count, train_label, train_id, ind_data, data_frame_1, pos_features, neg_features, supervised_data=True)
        mpos_features, mneg_features = mean_features(pos_features, neg_features)    # Get the mean of the features
        ind_data +=1

    data_f_1 = pd.DataFrame.from_dict(data_frame_1)
    data_f_1.to_csv(f"./csv_results_x-ray_counts/new/resnet101_euclidean_mistake_{size}_{select}.csv",index=False)

    for data in tqdm(unlabeled_data[select]):
      if data["label"]==1:
        id_gt[1].append(data['id'])
        label_gt[1].append((data['id'],data['label']))
      
      else:
        id_gt[0].append(data['id'])
        label_gt[0].append((data['id'],data['label']))
      
      fea_label={0: mneg_features,
            1: mpos_features}

      id_pred, label_pred, _, _, _, _, pos_features, neg_features = distance2(data,fea_label,1,id_pred,label_pred,n_neighbours, count, train_label, train_id, ind_data, data_frame_1, pos_features, neg_features,supervised_data=False) # ind_data is the index of misclassification
      mpos_features, mneg_features = mean_features(pos_features, neg_features)    # Get the mean of the features

    accuracy, specificity, sensitivity,TP,TN,FP,FN= classification_metrics(id_gt,id_pred)
    flattened_pos_features = flatten_features(pos_features) 
    flattened_neg_features = flatten_features(neg_features)
    dunn_index, davies_bouldin_index, silhouette_index = cluster_metrics(flattened_pos_features, flattened_neg_features, train_label,id_pred)
    cl_auc = roc_auc_curve(label_gt,label_pred)
    data_frame["Labeled data"].append(size)
    data_frame["Dataset"].append(f"d_{select}")
    data_frame["Accuracy"].append(accuracy)
    data_frame["Specificity"].append(specificity)
    data_frame["Sensitivity"].append(sensitivity)
    data_frame["AUC"].append(cl_auc)
    data_frame["Dunn index"].append(dunn_index)
    data_frame["Davies Bouldin"].append(davies_bouldin_index)
    data_frame["Silhouette index"].append(silhouette_index)
    data_frame["TP"].append(TP)
    data_frame["TN"].append(TN)
    data_frame["FP"].append(FP)
    data_frame["FN"].append(FN)
    data_frame["pos_labeled_img"].append(pos_img)
    data_frame["neg_labeled_img"].append(neg_img)
    data_frame["corrected_count"].append(count)

    print(f"Labeled image: {size} \t Dataset: d_{select} \t Accuracy: {accuracy} \t Specificity: {specificity} \t Sensitivity: {sensitivity} \t Dunn index: {dunn_index}  \t Davies Bouldin: {davies_bouldin_index} \t Silhouette index: {silhouette_index} \t AUC: {cl_auc} \t Corrected count: {count}")
    select +=1 
  

  pos_features = np.array(pos_features_list)
  neg_features = np.array(neg_features_list)
100%|██████████| 3164/3164 [00:50<00:00, 62.46it/s]


TP:  1573
FP:  190
FN:  8
TN:  1393
Calculating Dunn's index...
Calculating Davies Bouldin index...
davies:  2.716889398994854
silhouette_sklearn:  0.11548696
Labeled image: 1550 	 Dataset: d_0 	 Accuracy: 0.9374209860935525 	 Specificity: 0.8799747315224258 	 Sensitivity: 0.9949399114484504 	 Dunn index: 1.6634847952445853e-06  	 Davies Bouldin: 2.716889398994854 	 Silhouette index: 0.11548695713281631 	 AUC: 0.9450981990416039 	 Corrected count: 107


  neg_features = np.array(neg_features_list)
  pos_features = np.array(pos_features_list)
 24%|██▎       | 750/3164 [00:07<00:25, 94.83it/s] 


KeyboardInterrupt: 

In [251]:
len(mpos_features)

12

In [19]:
select_model, select_distance = 2,2
if select_model==1:
    s_model= 'vgg16'
elif select_model==2:
    s_model= 'resnet101'
elif select_model==3:
    s_model='densenet169'

if select_distance==1:
    s_distance='euclidean'
elif select_distance==2:
    s_distance='manhattan'
elif select_distance==3:
    s_distance='cosine'
data_f=pd.DataFrame.from_dict(data_frame)
data_f.to_csv(f"./csv_results_x-ray_counts/{s_model}_{s_distance}_dist_1100_1300.csv",index=False)

In [348]:
def xyz(a,*args, **kwargs):
    print(a)
    print(kwargs.get('dist'))
    print(args)

In [350]:
xyz(1,[1,2,3])

1
None
([1, 2, 3],)


In [346]:
config = {"dist":123,
          "ok":12}

In [347]:
xyz(1, **config)

1
123
()
