# Support Vector Machines & Logistic Regression with SIFT and SURF features extractors

Code inspired from <br> 
https://www.learnopencv.com <br>
https://github.com/hugos94 <br>
https://pysource.com/2018/03/21/feature-detection-sift-surf-obr-opencv-3-4-with-python-3-tutorial-25 <br>
https://opencv-python-tutroals.readthedocs.io/en/latest/py_tutorials/py_feature2d/py_surf_intro/py_surf_intro.html <br>
https://www.pyimagesearch.com/2015/07/16/where-did-sift-and-surf-go-in-opencv-3 <br>

In [1]:
from pathlib import Path
import numpy as np
import cv2
from scipy.cluster.vq import vq, kmeans
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score,classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.externals import joblib

In [2]:
# dataset, training set and validation set paths
root = Path('./')
data_path = Path('dataset') 
train_path = data_path.joinpath('train')
val_path = data_path.joinpath('validation')

In [3]:
# Function for detecting keypoints and descriptors for SURF & SIFT
# The function returns descriptors and the labels of the dataset.
# SURF descriptors is an array of 64 
# SIFT descriptors is an array of 128

def detectAndCompute(train_path,featureType):
    descriptors = []
    labels = []
    if featureType == 'SURF':
        detector = cv2.xfeatures2d.SURF_create()
    elif featureType == 'SIFT':
        detector = cv2.xfeatures2d.SIFT_create()
    else:
        raise ValueError('please select a valid feature type, options: SURF, SIFT')
    for child in train_path.iterdir():
        for image_path in child.glob('*'):
            student_id = str(child.absolute()).split('\\')[-1]
            image = cv2.imread(str(image_path.absolute()),0)
            image = cv2.resize(image,(224,224))
            kp, des  = detector.detectAndCompute(image, None)
            descriptors.append((str(image_path.absolute()),des))
            labels.append(student_id)
    return descriptors,labels   

In [4]:
# Vertical stack for descriptors

def stackDescriptors(descriptors,featureType):
    if featureType == 'SURF':
        elementsinarray = 64
    elif featureType == 'SIFT':
        elementsinarray = 128
    else:
        raise ValueError('please select a valid feature type, options: SURF, SIFT')
    
    VstackDescriptors = np.array([], dtype=np.float32).reshape(0,elementsinarray)
    for _, descriptor in descriptors:
        VstackDescriptors = np.vstack((VstackDescriptors,descriptor))
    return VstackDescriptors

In [5]:
# Creating vocabulary with kmeans, k=750 centroids which is voc size
# This will create our codebook or vocabulary from data set

def createVocabulary(VstackDescriptors,vocabularySize=750):
    vocabulary, _ = kmeans(VstackDescriptors, vocabularySize, 1)
    return vocabulary

In [6]:
# Creates image features which is called bag of features

def bagOfFeatures(descriptors,vocabulary,vocabularySize):
    set_size = len(descriptors)
    imageFeatures = np.zeros((set_size, vocabularySize), "float32")
    for i,(_ , descriptor) in enumerate(descriptors):
        words, _ = vq(descriptor, vocabulary)
        for word in words:
            imageFeatures[i][word] +=1          
    return imageFeatures

## SURF & SIFT feature extraction || SVM & LR trainings & evaluation steps:

Since we are trying 4 combinations of our feature extractors: SURF & SIFT and classifiers SVM & LR, the steps we will be repetitive. The main steps are explained below:

#### Training models
1- Extract image features and descriptors on the training set with featureType set to SURF or SIFT <br>
2- Stack the descriptors into NumPy array vertically  with featureType set to SURF or SIFT  <br>
3- Create a vocabulary of k=750 using kmeans clustering <br>
4- Create image features of the training set from the descriptors and vocabulary <br>
5- Standarize the features by scaling which happens independently on each feature by computing the relevant statistics on the samples in the training set. Mean and standard deviation are then stored to be used on later data using transform <br>

####  Evaluating models on validation Set
1- Create image features and descriptors on the validation set with featureType set to SURF or SIFT <br>
2- Create image features of the validation set from the descriptors and vocabulary <br> 
3- Standarize the features by scaling the validation data <br>
4- Predict <br>
5- Display accuracy <br>

Note: Training and validation accuracy is displayed below for every combination. A Classification report will be printed only for  best combo.

## *****************************************************     SURF    *****************************************************

In [7]:
featureType = 'SURF'
vocabularySize = 750
descriptors,labels = detectAndCompute(train_path,featureType)
VstackDescriptors =  stackDescriptors(descriptors,featureType)
vocabulary = createVocabulary(VstackDescriptors,vocabularySize)
imageFeatures =  bagOfFeatures(descriptors,vocabulary,vocabularySize)
imageFeaturesScaler = StandardScaler().fit(imageFeatures)
imageFeaturesTransformed = imageFeaturesScaler.transform(imageFeatures)

In [8]:
# Save SURF vocabulary
np.save('vocabulary/SURFvocabulary750.npy', vocabulary, allow_pickle=True)

### SURF & SVM

In [9]:
# Support vector machines grid search parameters

Cs = [0.1, 1, 10, 100, 1000]
gammas = [0.001, 0.005, 0.01, 0.05, 0.1]

param_grid = {'C': Cs, 'gamma' : gammas}

In [10]:
# Initiate classfier to support vector machines
svm = SVC(kernel='rbf', class_weight='balanced')

# Start training with grid search cv set to 5 and utlizing all workers
svmGrid = GridSearchCV(svm, param_grid, cv = 3, n_jobs = -1, iid=False)

# SVM Grid fit with images features
svmGrid.fit(imageFeaturesTransformed, np.array(labels))

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid=False, n_jobs=-1,
       param_grid={'C': [0.1, 1, 10, 100, 1000], 'gamma': [0.001]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [11]:
# Display best score and parameters

print('SVM best paramters from grid search:%s.' %svmGrid.best_params_)
print('SVM accuracy from grid search: %s.' %svmGrid.best_score_)

SVM best paramters from grid search:{'C': 100, 'gamma': 0.001}.
SVM accuracy from grid search: 0.866929056809575.


In [12]:
# Save best SVM model
svmBest = svmGrid.best_estimator_
# Set labels to list and store
labelList = list(set(labels))

# Persist an arbitrary Python object using joblib and saving all features, vocabulary and best classifier into one compressed pickle file for SURF & SVM 
joblib.dump((svmBest, labelList, imageFeaturesScaler, vocabularySize, vocabulary), 'models/SURF&SVM_K750.pkl', compress = 3)

['models/SURF&SVM_K750.pkl']

### Evaluating SURF & SVM model on validation Set

In [13]:
descriptorsValidation,labelsValidation = detectAndCompute(val_path,featureType)
imageFeaturesValidation =  bagOfFeatures(descriptorsValidation,vocabulary,vocabularySize)
imageFeaturesScalerValidation = StandardScaler().fit(imageFeaturesValidation)
imageFeaturesTransformedValidation = imageFeaturesScalerValidation.transform(imageFeaturesValidation)

In [14]:
prediction = svmBest.predict(imageFeaturesTransformedValidation)

In [15]:
# calculate accuracy 
accuracy_score(labelsValidation,list(prediction))

0.9517857142857142

In [16]:
# printing classification report for best performing combination (SURF & SVM)

print('\nClassification Report\n')
print(classification_report(labelsValidation, prediction, target_names=labelList))


Classification Report

              precision    recall  f1-score   support

   unknown 7       1.00      0.90      0.95        10
          42       1.00      1.00      1.00        10
           2       0.83      1.00      0.91        10
   unknown 3       0.82      0.90      0.86        10
   unknown 5       1.00      1.00      1.00        10
           4       1.00      1.00      1.00        10
           6       1.00      1.00      1.00        10
          17       1.00      1.00      1.00        10
          11       1.00      1.00      1.00        10
          40       1.00      0.90      0.95        10
           1       0.91      1.00      0.95        10
          22       1.00      1.00      1.00        10
          10       1.00      1.00      1.00        10
          15       0.91      1.00      0.95        10
          33       0.89      0.80      0.84        10
          28       1.00      1.00      1.00        10
          14       0.90      0.90      0.90        10
   

### SURF & LR

In [17]:
# Logisitc regression grid search parameters

solvers = ['lbfgs', 'liblinear']
pen  = ['l2']
Cs =  [0.001, 0.01, 0.1, 0.5, 1, 5, 10, 50, 100, 1000, 5000]
  
param_grid = [{'C':Cs, 'penalty':pen, 'solver':solvers, 'multi_class':['auto']}]

In [18]:
# Initiate classfier to support vector machines
logistic = LogisticRegression(max_iter=10000, tol=0.1)
# Start training with grid search cv set to 5 and workers 2
lrGrid = GridSearchCV(logistic, param_grid, cv = 3, n_jobs = -1, iid=False)
# LR Grid fit with images features
lrGrid.fit(imageFeaturesTransformed, np.array(labels))

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=10000, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.1, verbose=0, warm_start=False),
       fit_params=None, iid=False, n_jobs=-1,
       param_grid=[{'C': array([1.00000e-04, 4.64159e-02, 2.15443e+01, 1.00000e+04]), 'penalty': ['l2'], 'solver': ['lbfgs'], 'multi_class': ['auto']}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [19]:
# Display best score and parameters

print('LR best paramters from grid search:%s.' %lrGrid.best_params_)
print('LR accuracy from grid search: %s.' %lrGrid.best_score_)

LR best paramters from grid search:{'C': 0.046415888336127774, 'multi_class': 'auto', 'penalty': 'l2', 'solver': 'lbfgs'}.
LR accuracy from grid search: 0.8754980695224338.


In [20]:
# Save best Logistic regression model
lrBest = lrGrid.best_estimator_
# Set labels to list and store
labelList = list(set(labels))

# Persist an arbitrary Python object using joblib and saving all features, vocabulary and best classifier into one compressed pickle file for SURF & SVM 
joblib.dump((lrBest, labelList, imageFeaturesScaler, vocabularySize, vocabulary), 'models/SURF&LR_K750.pkl', compress = 3)

['models/SURF&LR_K750.pkl']

### Evaluating SURF & LR model on validation Set

In [21]:
descriptorsValidation,labelsValidation = detectAndCompute(val_path,featureType)
imageFeaturesValidation =  bagOfFeatures(descriptorsValidation,vocabulary,vocabularySize)
imageFeaturesScalerValidation = StandardScaler().fit(imageFeaturesValidation)
imageFeaturesTransformedValidation = imageFeaturesScalerValidation.transform(imageFeaturesValidation)

In [22]:
prediction = lrBest.predict(imageFeaturesTransformedValidation)

In [23]:
# calculate accuracy 
accuracy_score(labelsValidation,list(prediction))

0.9160714285714285

## *****************************************************     SIFT    *****************************************************

In [24]:
featureType = 'SIFT'
vocabularySize = 750
descriptors,labels = detectAndCompute(train_path,featureType)
VstackDescriptors =  stackDescriptors(descriptors,featureType)
vocabulary = createVocabulary(VstackDescriptors,vocabularySize)
imageFeatures =  bagOfFeatures(descriptors,vocabulary,vocabularySize)
imageFeaturesScaler = StandardScaler().fit(imageFeatures)
imageFeaturesTransformed = imageFeaturesScaler.transform(imageFeatures)

In [25]:
# Save SIFT vocabulary
np.save('vocabulary/SIFTvocabulary750.npy', vocabulary, allow_pickle=True)

### SIFT & SVM

In [40]:
# Support vector machines grid search parameters

Cs = [0.1, 1, 10, 100, 1000]
gammas = [0.001, 0.005, 0.01, 0.05, 0.1]

param_grid = {'C': Cs, 'gamma' : gammas}

In [41]:
# Initiate classfier to support vector machines with RBF kernel
svm = SVC(kernel='rbf', class_weight='balanced')

# Start training with grid search cv set to 5 and utlizing all workers
svmGrid = GridSearchCV(svm, param_grid, cv = 3, n_jobs = -1, iid=False)

# SVM Grid fit with images features
svmGrid.fit(imageFeaturesTransformed, np.array(labels))

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid=False, n_jobs=-1,
       param_grid={'C': [0.1, 1, 10, 100, 1000], 'gamma': [0.001, 0.005, 0.01, 0.05, 0.1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [42]:
# display best score and parameters 
print(svmGrid.best_params_)
print(svmGrid.best_score_)

{'C': 100, 'gamma': 0.001}
0.7825437392209477


In [43]:
# Save best SVM model
svmBest = svmGrid.best_estimator_
# Set labels to list and store
labelList = list(set(labels))

# Persist an arbitrary Python object using joblib and saving all features, vocabulary and best classifier into one compressed pickle file for SURF & SVM 
joblib.dump((svmBest, labelList, imageFeaturesScaler, vocabularySize, vocabulary), 'models/SIFT&SVM_K750.pkl', compress = 3)

['models/SIFT&SVM_K750.pkl']

### Evaluating SIFT & SVM model on validation Set

In [44]:
descriptorsValidation,labelsValidation = detectAndCompute(val_path,featureType)
imageFeaturesValidation =  bagOfFeatures(descriptorsValidation,vocabulary,vocabularySize)
imageFeaturesScalerValidation = StandardScaler().fit(imageFeaturesValidation)
imageFeaturesTransformedValidation = imageFeaturesScalerValidation.transform(imageFeaturesValidation)

In [45]:
prediction = svmBest.predict(imageFeaturesTransformedValidation)

In [46]:
# calculate accuracy 
accuracy_score(labelsValidation,list(prediction))

0.8803571428571428

### SIFT & LR

In [63]:
# Logisitc regression grid search parameters

solvers = ['lbfgs', 'liblinear']
pen  = ['l2']
Cs =  [0.001, 0.01, 0.1, 0.5, 1, 5, 10, 50, 100, 1000, 5000]
  
param_grid = [{'C':Cs, 'penalty':pen, 'solver':solvers, 'multi_class':['auto']}]

In [64]:
# Initiate classfier to support vector machines
logistic = LogisticRegression(max_iter=10000, tol=0.1)

# Start training with grid search cv set to 5 and workers 2
lrGrid = GridSearchCV(logistic, param_grid, cv = 3, n_jobs = -1, iid=False)

# LR Grid fit with images features
lrGrid.fit(imageFeaturesTransformed, np.array(labels))

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=10000, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.1, verbose=0, warm_start=False),
       fit_params=None, iid=False, n_jobs=-1,
       param_grid=[{'C': [0.001, 0.01, 0.1, 0.5, 1, 5, 10, 50, 100, 1000, 5000], 'penalty': ['l2'], 'solver': ['lbfgs', 'liblinear'], 'multi_class': ['auto']}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [65]:
# Display best score and parameters

print('LR best paramters from grid search:%s.' %lrGrid.best_params_)
print('LR accuracy from grid search: %s.' %lrGrid.best_score_)

LR best paramters from grid search:{'C': 5000, 'multi_class': 'auto', 'penalty': 'l2', 'solver': 'lbfgs'}.
LR accuracy from grid search: 0.8227237216059814.


In [66]:
# Save best Logistic regression model
lrBest = lrGrid.best_estimator_
# Set labels to list and store
labelList = list(set(labels))

# Persist an arbitrary Python object using joblib and saving all features, vocabulary and best classifier into one compressed pickle file for SURF & SVM 
joblib.dump((lrBest, labelList, imageFeaturesScaler, vocabularySize, vocabulary), 'models/SIFT&LR_K750.pkl', compress = 3)

['models/SIFT&LR_K750.pkl']

### Evaluating SIFT & LR model on validation Set

In [67]:
descriptorsValidation,labelsValidation = detectAndCompute(val_path,featureType)
imageFeaturesValidation =  bagOfFeatures(descriptorsValidation,vocabulary,vocabularySize)
imageFeaturesScalerValidation = StandardScaler().fit(imageFeaturesValidation)
imageFeaturesTransformedValidation = imageFeaturesScalerValidation.transform(imageFeaturesValidation)

In [68]:
prediction = lrBest.predict(imageFeaturesTransformedValidation)

In [69]:
# calculate accuracy 
accuracy_score(labelsValidation,list(prediction))

0.8625