**Import Statements**

In [0]:
import cv2
import os 
#Data Manipulation
import pandas as pd
import numpy as np
import math
#Classifier
from sklearn import svm
from sklearn.cluster import KMeans
#Evaluation
from sklearn.model_selection import GridSearchCV 
from sklearn.metrics import roc_curve, auc,accuracy_score,silhouette_score,roc_auc_score
#Data Visualisation
import matplotlib.pyplot as plt

In [0]:
# !apt-get -qq install -y libsm6 libxext6 && pip install -q -U opencv-python==3.4.2.16
# !apt-get -qq install -y libsm6 libxext6 && pip install -q -U opencv-contrib-python==3.4.2.16

**File Paths**

In [0]:
# Folders holding the four image sets (presumably a Google Drive mounted in
# Colab -- confirm). The trailing '/' matters: create_df_set builds file paths
# by plain string concatenation.
train_pos_img_dir = '/content/drive/My Drive/Computer_Vision_Individual/Images/Train_Set_Positive/'
train_neg_img_dir = '/content/drive/My Drive/Computer_Vision_Individual/Images/Train_Set_Negative/'
test_pos_img_dir = '/content/drive/My Drive/Computer_Vision_Individual/Images/Test_Pos/'
test_neg_img_dir = '/content/drive/My Drive/Computer_Vision_Individual/Images/Test_Neg/'

# Report how many directory entries each folder contains.
for _caption, _folder in (
    ("Num of Positive training image :", train_pos_img_dir),
    ("Num of Negative training image :", train_neg_img_dir),
    ("Num of positive test image :", test_pos_img_dir),
    ("Num of negative test image :", test_neg_img_dir),
):
    print(_caption, len(os.listdir(_folder)))

**Feature Extractor Initialisation**

In [0]:
# HOG configuration. With these sizes a window holds 3x3 overlapping blocks of
# 2x2 cells, i.e. 3*3 * 4 * 9 bins = 324 values per window.
winSize = (80,64)
blockSize = (40,32)
blockStride = (20,16)
cellSize = (20,16)
nbins = 9
derivAperture = 1
winSigma = -1.
histogramNormType = 0
L2HysThreshold = 0.2
gammaCorrection = 1
nlevels = 64
signedGradients = True
hog = cv2.HOGDescriptor(winSize,blockSize,blockStride,cellSize,nbins,derivAperture,winSigma,histogramNormType,L2HysThreshold,gammaCorrection,nlevels, signedGradients)

# SIFT lives in the main module from OpenCV 4.4 on; older builds (such as the
# 3.4.2.16 pinned in the install cell) only expose it through the contrib
# package. Prefer the main-module factory and fall back to the contrib one.
if hasattr(cv2, 'SIFT_create'):
    sift = cv2.SIFT_create()
else:
    sift = cv2.xfeatures2d.SIFT_create()

# SURF is only available from the contrib package (xfeatures2d).
surf = cv2.xfeatures2d.SURF_create()

**Utility Functions**

In [0]:
def create_df_set(hog,sift,surf,data_category, dir_path,class_label):
  """Extract HOG/SIFT/SURF features for every image in a folder and add them to a frame.

  Parameters
  ----------
  hog, sift, surf : feature extractors; ``hog`` must offer ``compute(img)`` and
      ``sift``/``surf`` must offer ``detectAndCompute(img, None)``.
  data_category : pandas.DataFrame the new rows are added to (not modified).
  dir_path : folder path ending in '/'; file names are appended directly.
  class_label : value stored in the 'label' column of every new row.

  Returns
  -------
  pandas.DataFrame containing the existing rows plus one row per readable
  image, sorted by 'name' in descending order.
  """
  # Second-to-last path component, e.g. 'Train_Set_Positive' for '.../Train_Set_Positive/'.
  dir_name = dir_path.rsplit('/', 2)[1]

  # Collect plain records first and build the frame once: DataFrame.append was
  # removed in pandas 2.0 and growing a frame row by row is quadratic.
  records = []
  for img_file in os.listdir(dir_path):
    img_filepath = dir_path + img_file
    img_retrieved = cv2.imread(img_filepath,0)  # flag 0 -> load as grayscale
    if img_retrieved is None:
      # cv2.imread signals an unreadable/non-image file by returning None;
      # skip it instead of crashing inside hog.compute.
      print("Skipping unreadable image :", img_filepath)
      continue
    hog_fd = hog.compute(img_retrieved)
    sift_kp ,sift_desc = sift.detectAndCompute(img_retrieved,None)
    surf_kp ,surf_desc = surf.detectAndCompute(img_retrieved,None)
    records.append({'name':dir_name + img_file ,'hog':hog_fd,'sift_kp':sift_kp,'sift_desc':sift_desc,'surf_kp':surf_kp,'surf_desc':surf_desc,'label':class_label})

  if not records:
    combined = data_category
  else:
    # dtype=object keeps None descriptors as None (later code tests `is not None`).
    new_rows = pd.DataFrame(records, dtype=object)
    # Keep the caller's column order and any extra columns (e.g. histogram columns).
    extra_cols = [c for c in new_rows.columns if c not in data_category.columns]
    new_rows = new_rows.reindex(columns=list(data_category.columns) + extra_cols)
    if len(data_category) == 0:
      combined = new_rows
    else:
      combined = pd.concat([data_category, new_rows], ignore_index=True)

  # NOTE: the original called `fillna(0)` here and discarded the result, so it
  # had no effect. It is intentionally not applied: it would turn the None
  # descriptors into 0 and defeat the `is not None` checks downstream.
  return combined.sort_values(by=['name'], ascending=False)

def get_feature_target(data_record,feature,target):
  """Build the (X, y) arrays for a classifier from two columns of a record set.

  Parameters
  ----------
  data_record : mapping/DataFrame indexable by column name.
  feature : column holding one equally sized feature array per image.
  target : column holding one label per image.

  Returns
  -------
  X : 2-D array with one flattened feature vector per row.
  y : 1-D array of labels.
  """
  X = np.array(list(data_record[feature]))
  y = np.array(list(data_record[target]))
  n_images = len(X)
  if n_images == 0:
    # Nothing to flatten; keep the result 2-D so callers can rely on X.ndim.
    return X.reshape((0, 0)), y
  # Flatten whatever per-image shape the extractor produced. The original
  # required exactly (rows, cols) per image and failed on 1-D feature vectors.
  X = X.reshape((n_images, -1))
  return X,y

def svc_param_selection(X, y, C_values, nfolds, scoring=None):
    """Grid-search the LinearSVC regularisation parameter C with cross-validation.

    Parameters
    ----------
    X, y : training features and labels.
    C_values : candidate values for C.
    nfolds : passed to GridSearchCV as ``cv``.
    scoring : optional metric forwarded to GridSearchCV (e.g. 'roc_auc').
        The default None keeps the previous behaviour, i.e. GridSearchCV's
        default scoring for the estimator.

    Returns
    -------
    (best_params, results) : the best parameter dict and the mean
    cross-validated test score of every candidate, in grid order.
    """
    param_grid = {'C': C_values}
    estimator = svm.LinearSVC(random_state = 10 ,max_iter = 40000)
    grid_search = GridSearchCV(estimator, param_grid, cv=nfolds, scoring=scoring)
    grid_search.fit(X, y)
    best_params = grid_search.best_params_
    results = grid_search.cv_results_['mean_test_score']
    return best_params,results

def get_all_desc(data_records,feature_desc):
  """Stack the descriptors of every image into one float32 matrix.

  Parameters
  ----------
  data_records : mapping/DataFrame indexable by column name.
  feature_desc : column holding, per image, a 2-D descriptor array or None
      when no descriptor is available for that image.

  Returns
  -------
  float32 array of shape (total_descriptors, descriptor_dim); rows follow the
  image order of ``data_records`` and, within an image, the descriptor order.

  Raises
  ------
  ValueError if no image has any descriptors (the original code failed here
  with an unbound-variable error).
  """
  per_image = [desc for desc in data_records[feature_desc] if desc is not None]
  if not per_image:
    raise ValueError("No descriptors found in column %r" % (feature_desc,))

  desc_dim = per_image[0].shape[1] #64 for SURF, 128 for SIFT
  print( "Dimension of each descriptor",desc_dim )

  # One vectorised stack replaces the count-then-copy loops over every descriptor.
  all_desc = np.vstack(per_image)
  print( "Num of descriptors",all_desc.shape[0] )
  return all_desc.astype('float32')

def plot(x_values,y_values,x_label, y_label,x_range_value,y_range_value):
  """Draw a labelled line chart with explicit tick positions and a grid.

  ``x_range_value`` and ``y_range_value`` are (start, end, step) triples that
  are expanded with ``np.arange`` into the tick locations of each axis.
  Shows the figure and returns None.
  """
  plt.figure()
  plt.plot(x_values, y_values)
  plt.xlabel(x_label)
  plt.ylabel(y_label)

  # Expand the (start, end, step) triples into tick positions.
  x_lo, x_hi, x_inc = x_range_value
  y_lo, y_hi, y_inc = y_range_value
  x_ticks = np.arange(x_lo, x_hi, x_inc)
  y_ticks = np.arange(y_lo, y_hi, y_inc)
  plt.xticks(x_ticks)
  plt.yticks(y_ticks)

  plt.grid()
  plt.show()

def get_optimal_k_value(data_points, range_value):
  """Pick the cluster count with the highest silhouette score.

  Fits KMeans once per candidate in ``range_value``, scores the resulting
  assignment with ``silhouette_score`` and plots score against k.

  Parameters
  ----------
  data_points : 2-D array of samples to cluster.
  range_value : iterable of candidate cluster counts.

  Returns
  -------
  The candidate with the highest score (the first one on ties).

  Raises
  ------
  ValueError if ``range_value`` is empty.
  """
  # Silhouette scores can be zero or negative, so start below any possible
  # score; starting at 0 left `k` unassigned when no score was positive.
  best_score = float('-inf')
  best_k = None
  k_list =[]
  sil_score = []
  for n_clusters in range_value:
    clusterer = KMeans(n_clusters=n_clusters)
    preds = clusterer.fit_predict(data_points)
    score = silhouette_score(data_points, preds)
    k_list.append(n_clusters)
    sil_score.append(score)
    if score > best_score:
      best_score = score
      best_k = n_clusters
  if best_k is None:
    raise ValueError("range_value must contain at least one cluster count")
  # NOTE: the tick ranges below are fixed and do not follow `range_value`.
  plot(k_list,sil_score,"K values", "Silhouette score",(15,65,3),(0.01,0.09,0.005))
  return best_k

def build_histogram_train(num_cluster,train_record,label,feature_desc,desc_hist):
  """Store a cluster-count histogram for every training image.

  ``label`` holds one cluster id per descriptor, in the same order in which the
  descriptors appear when walking ``train_record[feature_desc]`` row by row and
  skipping None entries. Each image consumes as many ids as it has descriptors.

  Parameters
  ----------
  num_cluster : number of histogram bins.
  train_record : pandas.DataFrame; its ``desc_hist`` column is (re)written in place.
  label : 2-D integer array of shape (total_descriptors, 1); column 0 is used.
  feature_desc : column holding each image's descriptor array, or None.
  desc_hist : column that receives one float32 histogram of length
      ``num_cluster`` per image (all zeros for images without descriptors).

  Returns
  -------
  None.

  Raises
  ------
  ValueError if ``label`` has fewer entries than there are descriptors.
  """
  cluster_ids = np.asarray(label)[:, 0]
  # Object array so every cell can hold a whole histogram.
  hist_column = np.empty(len(train_record), dtype=object)
  cursor = 0  # position of the next unread cluster id
  for pos, desc_list in enumerate(train_record[feature_desc]):
    histogram = np.zeros(num_cluster,dtype=np.float32)
    if desc_list is not None:
      n_desc = len(desc_list)
      ids = cluster_ids[cursor:cursor + n_desc]
      if len(ids) != n_desc:
        raise ValueError("label has fewer entries than there are descriptors")
      # Unbuffered add: repeated ids are each counted.
      np.add.at(histogram, ids, 1)
      cursor += n_desc
    hist_column[pos] = histogram
  # Assign the whole column at once. Writing into the rows yielded by
  # iterrows() is not guaranteed to reach the DataFrame (rows may be copies).
  train_record[desc_hist] = hist_column

def build_histogram_test(num_cluster,test_record,feature_desc,desc_hist,knn):
  """Store a cluster-count histogram for every test image.

  Each descriptor of an image is assigned to a cluster through
  ``knn.findNearest(descriptors, k=1)``; the histogram counts how many
  descriptors fall into each cluster.

  Parameters
  ----------
  num_cluster : number of histogram bins.
  test_record : pandas.DataFrame; its ``desc_hist`` column is (re)written in place.
  feature_desc : column holding each image's descriptor array, or None.
  desc_hist : column that receives one float32 histogram of length
      ``num_cluster`` per image (all zeros for images without descriptors).
  knn : object with ``findNearest(samples, k)`` returning a 4-tuple whose
      second item holds one cluster id per sample in its first column.

  Returns
  -------
  None.
  """
  # Object array so every cell can hold a whole histogram.
  hist_column = np.empty(len(test_record), dtype=object)
  for pos, desc_list in enumerate(test_record[feature_desc]):
    histogram = np.zeros(num_cluster,dtype=np.float32)
    # Skip images with no descriptors (None or an empty array).
    if desc_list is not None and len(desc_list) > 0:
      ret,nearest,neighbours,dist = knn.findNearest(desc_list,k=1)
      # Ids may arrive as floats; truncate them to integer bin indices.
      ids = np.asarray(nearest)[:, 0].astype(np.intp)
      # Unbuffered add: repeated ids are each counted.
      np.add.at(histogram, ids, 1)
    hist_column[pos] = histogram
  # Assign the whole column at once. Writing into the rows yielded by
  # iterrows() is not guaranteed to reach the DataFrame (rows may be copies).
  test_record[desc_hist] = hist_column

def x_y_train_histogram(train_records,desc_hist,target,knn):
  """Train a fresh OpenCV k-nearest-neighbour model on the image histograms.

  Parameters
  ----------
  train_records : mapping/DataFrame indexable by column name.
  desc_hist : column holding one histogram per image (the samples).
  target : column holding one label per image (the responses).
  knn : ignored -- a new model is always created; kept for call compatibility.

  Returns
  -------
  The trained model created by ``cv2.ml.KNearest_create()``.
  """
  samples = np.array([hist for hist in train_records[desc_hist]])
  responses = np.array([lbl for lbl in train_records[target]])
  model = cv2.ml.KNearest_create()
  model.train(samples, cv2.ml.ROW_SAMPLE, responses)  # one sample per row
  return model

def model_prediction(model,test_records,desc_hist,target):
  """Predict a class for every test image with a nearest-neighbour model.

  Parameters
  ----------
  model : object with ``findNearest(samples, k)`` returning a 4-tuple whose
      second item is the prediction array.
  test_records : mapping/DataFrame indexable by column name.
  desc_hist : column holding one histogram per image.
  target : column holding the true label per image.

  Returns
  -------
  (y_test, result) : the true labels as an array and the model's predictions
  exactly as returned by ``findNearest``.
  """
  samples = np.array(list(test_records[desc_hist]))
  truth = np.array(list(test_records[target]))
  # k=1: each image takes the class of its single nearest training histogram.
  _, predicted, _, _ = model.findNearest(samples, k=1)
  return truth, predicted

def roc_plot(test, pred):
  """Plot the ROC curve of binary predictions and show its AUC in the legend.

  Parameters
  ----------
  test : true binary labels.
  pred : predicted scores or labels, one per sample; column vectors of shape
      (n, 1) are accepted and flattened.

  Returns
  -------
  None. The figure is shown with ``plt.show()``.
  """
  y_true = np.asarray(test).ravel()
  y_score = np.asarray(pred).ravel()
  # One curve is enough: the original computed the identical curve twice and
  # never used the resulting AUC.
  fpr, tpr, _ = roc_curve(y_true, y_score)
  roc_auc = auc(fpr, tpr)
  plt.figure()
  plt.plot(fpr, tpr, label='ROC curve (AUC = %0.3f)' % roc_auc)
  plt.xlim([0.0, 1.0])
  plt.ylim([0.0, 1.05])
  plt.xlabel('False Positive Rate')
  plt.ylabel('True Positive Rate')
  plt.title('Receiver operating characteristic')
  plt.legend(loc='lower right')
  plt.show()

def classifier(train_data,test_data,feature_val,feature_kp_val,feature_desc_val,desc_hist_val,target_val):
  """Bag-of-visual-words pipeline: cluster descriptors, build histograms, classify with KNN.

  Steps: gather all training descriptors, choose the number of clusters by
  silhouette score, cluster with ``cv2.kmeans``, turn every train/test image
  into a histogram of cluster memberships, train a KNN model on the training
  histograms, then print the test accuracy and plot the ROC curve.

  Parameters
  ----------
  train_data, test_data : DataFrames; their ``desc_hist_val`` column is filled in.
  feature_val, feature_kp_val : unused; kept so existing calls keep working.
  feature_desc_val : column holding each image's descriptor array (or None).
  desc_hist_val : column that receives each image's histogram.
  target_val : column holding the class label.

  Returns
  -------
  None.
  """
  feature_desc = feature_desc_val
  desc_hist = desc_hist_val
  target = target_val

  # All descriptors of all training images, stacked in row order.
  all_desc = get_all_desc(train_data,feature_desc)

  # Choose the number of clusters among 15, 20, ..., 60.
  criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 10, 1.0)
  range_value = list (range(15,65,5))
  num_cluster = get_optimal_k_value(all_desc, range_value)

  # Cluster the descriptors; `label` holds one cluster id per descriptor.
  ret,label,centers=cv2.kmeans(all_desc,num_cluster,None,criteria,4,cv2.KMEANS_RANDOM_CENTERS)

  # Histogram of cluster memberships for every training image.
  build_histogram_train(num_cluster,train_data,label,feature_desc,desc_hist)

  # A 1-NN model over the cluster centres assigns test descriptors to clusters;
  # centre i is given the label i.
  labels = np.arange(num_cluster,dtype=np.int32).reshape(-1,1)
  knn = cv2.ml.KNearest_create()
  knn.train(centers,cv2.ml.ROW_SAMPLE,labels)

  # Histogram of cluster memberships for every test image.
  build_histogram_test(num_cluster,test_data,feature_desc,desc_hist,knn)
  model = x_y_train_histogram(train_data,desc_hist,target,knn)

  # Predict the test set. Flatten the predictions so the metrics receive 1-D
  # arrays for both the truth and the prediction.
  y_test,y_prediction = model_prediction(model,test_data,desc_hist,target)
  y_prediction = np.asarray(y_prediction).ravel()
  print ("Test Accuracy : ", accuracy_score(y_test, y_prediction)*100)
  roc_plot(y_test, y_prediction)


**DataFrame Creation - Train and Test**

In [0]:
# Two independent, empty frames sharing one schema: image name, HOG vector,
# SIFT/SURF keypoints, descriptors and histograms, and the class label.
train_df, test_df = (
    pd.DataFrame(columns=['name','hog','sift_kp','sift_desc','sift_hist','surf_kp','surf_desc','surf_hist','label'])
    for _ in range(2)
)

**Data Retrieval and Storage**

In [0]:
#Loading training set
# Each set is rebuilt from an empty slice of its frame (same columns, no rows)
# so that re-running this cell does not append the same images a second time.
#Loading training set
train_df = create_df_set(hog,sift,surf,train_df.iloc[0:0],train_pos_img_dir,1)
train_df = create_df_set(hog,sift,surf,train_df,train_neg_img_dir,0)
#Loading test set
test_df = create_df_set(hog,sift,surf,test_df.iloc[0:0],test_pos_img_dir,1)
test_df = create_df_set(hog,sift,surf,test_df,test_neg_img_dir,0)

# **SVM Classifier on HOG Feature**

In [0]:
# Get Feature and targets for Test and Train
# HOG feature matrix and label vector for the train and the test frame.
target_val = 'label'
(SVM_X_train, SVM_y_train), (SVM_X_test, SVM_y_test) = (
    get_feature_target(frame, 'hog', target_val) for frame in (train_df, test_df)
)

In [0]:
# Replace NaN/inf feature values with finite numbers before fitting.
SVM_X_train = np.nan_to_num(SVM_X_train) 
SVM_X_test = np.nan_to_num(SVM_X_test) 
svm_linear = svm.LinearSVC(random_state=10,max_iter = 15000)
svm_linear.fit(SVM_X_train,SVM_y_train)
SVM_test_pred = svm_linear.predict(SVM_X_test) 
print ("Default Accuracy Test : ", accuracy_score(SVM_y_test, SVM_test_pred)*100)
# Build the ROC curve from the signed distance to the separating hyperplane:
# hard 0/1 predictions give only a single operating point.
SVM_test_score = svm_linear.decision_function(SVM_X_test)
roc_plot(SVM_y_test, SVM_test_score)

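**SVM Hyper-Parameter - Optimal value for C using Grid Search with Cross-Validation (candidates are ranked by GridSearchCV's default score, i.e. mean accuracy; pass an explicit `scoring='roc_auc'` to rank by ROC AUC)**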

In [0]:
# c_values = [0.1,0.3,0.6]
# nfolds = 10
# best_params,roc_auc_values = svc_param_selection(SVM_X_train,SVM_y_train, c_values, nfolds)
# print('[',end='')
# for i in range(len(c_values)):
#   print(c_values[i],':',roc_auc_values[i],' , ',end ='',sep='')
# print(']')
# best_c_value = best_params['C']
# print("Best C value :",best_c_value)

In [0]:
# svm_linear = svm.LinearSVC(random_state=10,C=best_c_value,max_iter = 20000)
# svm_linear.fit(SVM_X_train,SVM_y_train)
# SVM_test_pred = svm_linear.predict(SVM_X_test) 
# print ("Enhanced Accuracy Test : ", accuracy_score(SVM_y_test, SVM_test_pred)*100)
# roc_plot(SVM_y_test, SVM_test_pred)

# **KNN Classifier Using SIFT feature**

In [0]:
# Banner followed by the full bag-of-visual-words KNN run on SIFT descriptors.
print(
    "--------------------------------------------------------------",
    "KNN CLASSIFICATION USING SIFT FEATURE",
    "--------------------------------------------------------------",
    sep="\n",
)
classifier(train_df, test_df, sift, 'sift_kp', 'sift_desc', 'sift_hist', 'label')

# **KNN Classifier Using SURF feature**

In [0]:
# Banner followed by the full bag-of-visual-words KNN run on SURF descriptors.
print(
    "--------------------------------------------------------------",
    "KNN CLASSIFICATION USING SURF FEATURE",
    "--------------------------------------------------------------",
    sep="\n",
)
classifier(train_df, test_df, surf, 'surf_kp', 'surf_desc', 'surf_hist', 'label')