# Train classifier notebook

Imports cell

In [1]:
import os
import numpy as np

from lib.data import get_dataset
from lib.classifier import SVM
from lib.bag_of_words import get_features
from lib.descriptors import get_keypoints_orb
from lib.mask import mask_apple_tomato

os.chdir('..')  # change to the root directory
print(f'Curent working directory: {os.getcwd()}')

Curent working directory: /home/valentingoldite/Documents/Projets_ML/BagOfWordsCV


Dataset, vocabulary and masking function

In [2]:
# Take dataset with same number of images on each class (apple and tomato)
infos = {'apple_a': (0, 500), 'apple_b': (1, 500), 'apple_c': (1, 0),
                'tomato': (1, 0)}
(data, y), (data_test, y_test) = get_dataset('dataset/fruits',
                                             infos, val_ratio=0.15)

# Get vocabulary
vocabulary_apple = np.load('vocabulary/fruits/apple_a_20_orb.npy')
vocabulary_tomato = np.load('vocabulary/fruits/apple_b_20_orb.npy')
vocabulary = np.concatenate((vocabulary_apple, vocabulary_tomato), axis=0)

# Get masking function
mask_func = mask_apple_tomato

print('Train & validation set:', len(data), 'samples.')
print('Test set:', len(data_test), 'samples.')

Train & validation set: 653 samples.
Test set: 114 samples.


Get descriptors and make bag of words features

In [3]:
dist = 'euclidian'
desc, _ = get_keypoints_orb(data, select=500,
                                mask_func=mask_func)
desc_test, _ = get_keypoints_orb(data_test, select=500,
                                mask_func=mask_func)
X = get_features(desc, vocabulary)
X_test = get_features(desc_test, vocabulary, dist=dist)
print('Features extracted.')

Features extracted.


Cross-validation and gridsearch for SVM

In [4]:
data_prop = 0.2
n_folds = 20
C_range = [1, 10, 100, 1000]
gamma_range = [1e-5, 1e-4, 1e-3, 1e-2, 0.1, 1]

for C in C_range:
    for gamma in gamma_range:
        mean_acc = 0.0
        for _ in range(n_folds):
            val_idx = np.random.choice(len(X), int(len(X) * data_prop),
                                       replace=False)
            train_idx = np.array([i for i in range(len(X)) if i not in val_idx])
            X_val, y_val = X[val_idx], y[val_idx]
            X_train, y_train = X[train_idx], y[train_idx]
            svm = SVM(C=C, kernel='rbf', gamma=gamma, verbose=False)
            svm.fit(X_train, y_train)
            preds = svm.predict(X_val)
            mean_acc += np.mean(preds == y_val) / n_folds
        print(f'C={C}, gamma={gamma:.6f} - val acc={100 * mean_acc:.2f}%')

C=1, gamma=0.000010 - val acc=65.35%
C=1, gamma=0.000100 - val acc=65.81%
C=1, gamma=0.001000 - val acc=65.58%
C=1, gamma=0.010000 - val acc=83.88%
C=1, gamma=0.100000 - val acc=87.85%
C=1, gamma=1.000000 - val acc=74.92%
C=10, gamma=0.000010 - val acc=66.69%
C=10, gamma=0.000100 - val acc=66.27%
C=10, gamma=0.001000 - val acc=83.85%
C=10, gamma=0.010000 - val acc=88.04%
C=10, gamma=0.100000 - val acc=90.04%
C=10, gamma=1.000000 - val acc=76.58%
C=100, gamma=0.000010 - val acc=65.81%
C=100, gamma=0.000100 - val acc=83.00%
C=100, gamma=0.001000 - val acc=87.96%
C=100, gamma=0.010000 - val acc=90.73%
C=100, gamma=0.100000 - val acc=91.88%
C=100, gamma=1.000000 - val acc=75.23%
C=1000, gamma=0.000010 - val acc=82.12%
C=1000, gamma=0.000100 - val acc=87.46%
C=1000, gamma=0.001000 - val acc=88.77%
C=1000, gamma=0.010000 - val acc=90.54%
C=1000, gamma=0.100000 - val acc=90.81%
C=1000, gamma=1.000000 - val acc=74.69%


Test SVM on test set

In [5]:
C = 100
gamma = 0.1
svm = SVM(C=C, kernel='rbf', gamma=gamma, verbose=False)
svm.fit(X_train, y_train)
preds = svm.predict(X_test)
accuracy = np.mean(preds == y_test)
print(f'Test accuracy: {100 * accuracy:.2f}%')

test accuracy: 90.35%
