Coding Image Classifier using Bag Of Visual Words

1. Import the required libraries

In [481]:
import os
import random
from skimage.io import imread_collection
from skimage.feature import hog
from scipy.spatial.distance import cdist
from sklearn.cluster import KMeans, MiniBatchKMeans
import numpy as np
import time
from skimage.feature import hog
from numpy.linalg import norm
from sklearn import svm
import glob
import cv2

2. Get the training and testing images path

In [482]:
#This function is to gets the path for each training and testing images, as well as the lists with the label of each training and testing image
def get_training_images_from_folder(img_path, img_categories, nums_training_per_cat):
    #The number of scene categories.
    nums_img_categories = len(img_categories)
    #The training data consists of 100 images for each of the 15 scene classes (15 categories * 100 training = 1500 training images)
    #Initialise the lists of image paths and labels for training and testing  
    training_image_paths = [None] * (nums_img_categories * (nums_training_per_cat - 30))
    testing_image_paths = [None] * (nums_img_categories * (nums_training_per_cat - 70))
    training_labels = [None] * (nums_img_categories * (nums_training_per_cat - 30))
    testing_labels  = [None] * (nums_img_categories * (nums_training_per_cat - 70))
    for img_inx_1,img_category in enumerate(img_categories):
        #Get the images path
        all_images = glob.glob(os.path.join(img_path, 'training', img_category, '*.jpg'))
        all_images = imread_collection(all_images)
        #print(len(all_images))
        #Split all image datasets into training (70%) and testing (30%) dataset from training image folder
        for img_inx_2 in range(len(all_images)):
            if img_inx_2 <= 69:
                training_image_paths[img_inx_1 * (nums_training_per_cat -30) + img_inx_2] = all_images[img_inx_2]
                training_labels[img_inx_1 * (nums_training_per_cat - 30) + img_inx_2] = img_category
            else:
                testing_image_paths[img_inx_1 * 30 + img_inx_2 - 70] = all_images[img_inx_2]
                testing_labels[img_inx_1 * 30 + img_inx_2 - 70] = img_category
    return (training_image_paths, training_labels, testing_image_paths, testing_labels)

3. Feature extraction and clustering

In [483]:
#This function is to extact features by sampling descriptors from the training images and perform K Means clustering on descriptors
def extract_and_cluster_visual_words(images, words_size):
    pixels_per_cell = (4, 4)
    cells_per_block = (2, 2)
    len_vector = cells_per_block[0]
    img_feature_vectors = []
    for img in images:
        #The hog() function takes 6 parameters as input: 
        #image: The target image you want to apply feature extraction
        #orientations: Number of bins in the histogram we want to create
        #pixels_per_cell: Determine the size of the cell
        #cells_per_block: Number of cells per block
        #visualize: A boolean whether to return the image
        #multichannel: We set it to True to tell the function that the last dimension is considered as a color channel, instead of spatial.
        feature_vector = hog(img, feature_vector = True, pixels_per_cell = pixels_per_cell, cells_per_block = cells_per_block, visualize = False).reshape(-1, len_vector*len_vector*9)
        img_feature_vectors.append(feature_vector)
    img_feature_vectors = np.vstack(img_feature_vectors)
    #Compared with KMeans, MiniBatchKMeans takes less time, consumes less resources, and is relatively inferior in quality, but the difference is not significant.
    #K_means = KMeans(n_clusters=words_size, max_iter=5).fit(images_feature_vectors)
    K_means = MiniBatchKMeans(n_clusters = words_size, max_iter = 200).fit(img_feature_vectors)
    words_lists = np.vstack(K_means.cluster_centers_)
    return words_lists

  

4. Vector quantisation

In [484]:
import matplotlib.pyplot as plt
def quantiseate_bags_of_visual_words(images):
    words = np.load('words.npy')
    images_histograms = []
    pixels_per_cell = (4, 4)
    cells_per_block = (2, 2)
    len_vector = cells_per_block[0]
    words_length = words.shape[0]
    #feature_vector = []
    #After getting the feature vectors for an image, you will build up a histogram 
    #that represents what words are contained within the image.
    #For each feature, find the closest vocab word, and add 1 to the histogram at the index of that word. 
    #For example, if the closest vector in the vocabulary is the 103rd word, then 1 should be added to the 103rd histogram bin. 
    #Your histogram should have as many bins as vocabulary words.
    for i, img in enumerate(images):
        feature_vector = hog(img, feature_vector = True, pixels_per_cell = pixels_per_cell, cells_per_block = cells_per_block, visualize = False).reshape(-1, len_vector*len_vector*9)
        #Calculate the distance between the feature of the current picture and the bag of words
        distance = cdist(words, feature_vector, metric='euclidean')
        #Select the shortest distance and calculate the histogram
        min_distance = np.argmin(distance, axis=0)
        #indices, counts = np.unique(closest_vocab.)
        img_histogram, bin_edges = np.histogram(min_distance, bins=len(words))
        #Nomalizate the histogram of bags of visual words
        img_histogram = img_histogram / norm(img_histogram)
        images_histograms.append(img_histogram)
        #histogram = np.zeros(words_length)
        #distances = cdist(feature_vector, words)
        #closest_word = np.argsort(distances, axis = 1)[:,0]
        #indices, counts = np.unique(closest_word, return_counts = True)
        #histogram[indices] += counts
        #histogram = histogram / norm(histogram)
        #images_histograms[i] = histogram   
    #plt.hist(images_histograms, bins = 10)
    #plt.show()
        #print(images_histograms)
    return np.array(images_histograms)

5. Classification

In [485]:
#This function will predict a category for every test image by training 15 many-versus-one linear SVM classifiers on the training data, then using those learned classifiers on the testing data.
def svm_classify(training_image_features, training_labels, testing_image_features):
    linear_svc = svm.LinearSVC(random_state=0, tol=1e-5)
    #Train Linear SVM classifier
    linear_svc.fit(training_image_features, training_labels)
    #Make prediction
    linear_svc_predictiors = linear_svc.predict(testing_image_features)
    return linear_svc_predictiors

6. Test

In [486]:
categories = ['kitchen', 'store', 'bedroom', 'livingRoom', 'Office',
       'industrial', 'Suburb', 'InsideCity', 'TallBuilding', 'Street',
       'Highway', 'OpenCountry', 'Coast', 'Mountain', 'Forest']

training_image_paths, training_labels, testing_image_paths, testing_labels = get_training_images_from_folder('./',categories,100)

print(testing_image_paths)
print(testing_labels)

[array([[255, 255, 254, ...,  75,  73,  72],
       [255, 255, 254, ...,  77,  75,  74],
       [255, 255, 254, ...,  80,  76,  75],
       ...,
       [247, 247, 246, ...,  20,  20,  20],
       [247, 248, 246, ...,  19,  19,  19],
       [247, 246, 244, ...,  19,  19,  18]], dtype=uint8), array([[73, 75, 79, ..., 63, 68, 66],
       [76, 76, 79, ..., 62, 65, 66],
       [81, 79, 77, ..., 62, 65, 65],
       ...,
       [15, 14, 11, ..., 21, 22, 27],
       [19, 16, 11, ..., 25, 28, 35],
       [20, 15, 12, ..., 28, 35, 40]], dtype=uint8), array([[ 76,  78,  76, ..., 113, 112, 112],
       [ 74,  78,  79, ..., 112, 112, 112],
       [ 75,  78,  80, ..., 113, 114, 114],
       ...,
       [ 33,  33,  30, ...,  49,  51,  49],
       [ 32,  30,  28, ...,  49,  49,  51],
       [ 11,  13,  17, ...,  49,  47,  50]], dtype=uint8), array([[214, 214, 214, ..., 251, 252, 253],
       [211, 214, 214, ..., 252, 252, 253],
       [215, 216, 215, ..., 251, 252, 253],
       ...,
       [103, 105, 

In [487]:
t0 = time.time()
words_size = 500
words = extract_and_cluster_visual_words(training_image_paths, words_size)
np.save('words.npy', words)
training_image_features = quantiseate_bags_of_visual_words(training_image_paths)
testing_image_features = quantiseate_bags_of_visual_words(testing_image_paths)

In [488]:
predict_scenes = svm_classify(training_image_features, training_labels, testing_image_features)
print(predict_scenes)

['kitchen' 'store' 'bedroom' 'kitchen' 'kitchen' 'InsideCity' 'kitchen'
 'kitchen' 'kitchen' 'kitchen' 'kitchen' 'OpenCountry' 'livingRoom'
 'Office' 'industrial' 'kitchen' 'industrial' 'kitchen' 'kitchen'
 'kitchen' 'kitchen' 'kitchen' 'kitchen' 'kitchen' 'kitchen' 'kitchen'
 'InsideCity' 'kitchen' 'kitchen' 'kitchen' 'store' 'industrial' 'store'
 'store' 'InsideCity' 'kitchen' 'store' 'store' 'store' 'store' 'Mountain'
 'store' 'store' 'kitchen' 'store' 'Street' 'Mountain' 'store'
 'industrial' 'industrial' 'store' 'store' 'InsideCity' 'store' 'store'
 'industrial' 'store' 'store' 'store' 'Mountain' 'industrial' 'livingRoom'
 'livingRoom' 'TallBuilding' 'livingRoom' 'livingRoom' 'Office' 'Office'
 'Highway' 'Office' 'livingRoom' 'Office' 'kitchen' 'TallBuilding'
 'bedroom' 'kitchen' 'bedroom' 'bedroom' 'livingRoom' 'kitchen' 'Office'
 'livingRoom' 'bedroom' 'kitchen' 'kitchen' 'Office' 'livingRoom'
 'livingRoom' 'livingRoom' 'bedroom' 'kitchen' 'livingRoom' 'kitchen'
 'Office' 'livin

In [489]:
label_counter = 0
for i in range(len(testing_labels)):
    if testing_labels[i] == predict_scenes[i]:
        label_counter += 1
print("correct rate：", label_counter/450*100)
print('time spend：', time.time() - t0)

correct rate： 65.11111111111111
time spend： 300.43082427978516


7. Validation 

In [337]:
testing_image_names = os.listdir('testing')
#Arrange the picture names in numerical order
testing_image_names.sort(key=lambda x: int(x.split('.')[0]))
print(len(testing_image_names))

2985


In [338]:
#This function is to gets the path for each training and testing images, as well as the lists with the label of each training and testing image
def get_testing_images_from_folder(img_path, img_categories, nums_training_per_cat):
    #The number of scene categories.
    nums_img_categories = len(img_categories)
    training_image_paths = [None] * (nums_img_categories * (nums_training_per_cat))
    testing_image_paths = [None] * (2985)
    training_labels = [None] * (nums_img_categories * (nums_training_per_cat))
    testing_labels  = [None] * (2985)
    for img_inx_1, img_category in enumerate(img_categories):
        all_training_images = glob.glob(os.path.join(img_path, 'training', img_category, '*.jpg'))
        all_training_images = imread_collection(all_training_images)
        for img_inx_2 in range(len(all_training_images)):
            training_image_paths[img_inx_1 * (nums_training_per_cat) + img_inx_2] = all_training_images[img_inx_2]
            training_labels[img_inx_1 * (nums_training_per_cat) + img_inx_2] = img_category
    all_testing_images = glob.glob(os.path.join(img_path, 'testing', '*.jpg'))
    
    all_testing_images = imread_collection(all_testing_images)
    for img_inx in range(len(all_testing_images)):
        testing_image_paths[img_inx] = all_testing_images[img_inx]
            
    return (training_image_paths, training_labels, testing_image_paths, testing_labels)

In [339]:
categories = ['kitchen', 'store', 'bedroom', 'livingRoom', 'Office',
       'industrial', 'Suburb', 'InsideCity', 'TallBuilding', 'Street',
       'Highway', 'OpenCountry', 'Coast', 'Mountain', 'Forest']

training_image_paths, training_labels, testing_image_paths, testing_labels = get_testing_images_from_folder('./',categories,100)

print(training_image_paths,  training_labels)
print(testing_image_paths, testing_labels)



[array([[119, 167, 221, ...,  29,  29,  29],
       [ 84, 122, 171, ...,  30,  30,  30],
       [ 68,  85, 118, ...,  32,  32,  32],
       ...,
       [ 54,  54,  54, ..., 122, 124, 133],
       [ 53,  53,  53, ..., 129, 129, 137],
       [ 52,  52,  52, ..., 138, 136, 144]], dtype=uint8), array([[161, 161, 161, ...,  44,  45,  45],
       [161, 161, 162, ...,  45,  45,  45],
       [162, 162, 163, ...,  46,  46,  46],
       ...,
       [183, 184, 184, ...,  72,  71,  71],
       [182, 182, 183, ...,  72,  71,  71],
       [181, 181, 182, ...,  72,  71,  71]], dtype=uint8), array([[ 65,  63,  61, ..., 117, 116, 116],
       [ 67,  65,  63, ..., 118, 117, 116],
       [ 69,  67,  64, ..., 118, 117, 117],
       ...,
       [ 85,  85,  86, ..., 141, 140, 140],
       [ 83,  84,  85, ..., 141, 140, 140],
       [ 81,  82,  84, ..., 141, 140, 140]], dtype=uint8), array([[176, 176, 177, ..., 155, 158, 162],
       [177, 177, 178, ..., 156, 158, 162],
       [178, 179, 179, ..., 156, 159, 

[array([[254, 254, 254, ..., 254, 254, 254],
       [254, 254, 254, ..., 254, 254, 254],
       [254, 254, 254, ..., 254, 254, 254],
       ...,
       [ 30,  33,  26, ...,  33,  44,  78],
       [ 25,  33,  32, ...,  34,  56, 101],
       [ 24,  34,  38, ...,  40,  70, 122]], dtype=uint8), array([[ 30,  35,  54, ...,  14,   4,   6],
       [ 52,  52,  50, ...,   9,   5,  20],
       [ 66,  80,  67, ...,   3,   0,   0],
       ...,
       [183, 160, 153, ...,  14,  13,   1],
       [217, 183, 171, ...,  15,  20,   0],
       [190, 189, 188, ...,   2,  17,  12]], dtype=uint8), array([[185, 184, 181, ...,  81,  86,  88],
       [185, 183, 181, ...,  98, 100, 100],
       [184, 183, 181, ...,  89,  90,  80],
       ...,
       [118, 122, 123, ...,  69,  69,  62],
       [128, 129, 129, ...,  49,  47,  45],
       [127, 118, 126, ...,  46,  35,  36]], dtype=uint8), array([[221, 222, 222, ..., 224, 223, 220],
       [222, 223, 222, ..., 224, 224, 223],
       [224, 223, 223, ..., 224, 223, 

In [341]:
words_size = 500
words = extract_and_cluster_visual_words(training_image_paths, words_size)
np.save('words.npy', words)
training_image_features = quantiseate_bags_of_visual_words(training_image_paths)
testing_image_features = quantiseate_bags_of_visual_words(testing_image_paths)

In [342]:
predict_scenes = svm_classify(training_image_features, training_labels, testing_image_features)
print(predict_scenes)

['Street' 'Forest' 'InsideCity' ... 'livingRoom' 'Office' 'store']


In [343]:
with open("run2.txt","w") as f:
    for i in range(len(testing_image_paths)):
        f.write(testing_image_names[i]+ ' ' + predict_scenes[i].lower() + '\n')