In [101]:
#Visual BoW - Clustering using MiniBatchKMeans

In [102]:
import cv2 as cv
import os
import numpy as np
import pickle
import time
from sys import getsizeof
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, confusion_matrix


In [103]:
# Working directory of the notebook (not used by the cells shown here; kept as-is).
path_dir = os.getcwd()

# Directories holding the pickled descriptor splits, relative to the working directory.
car_dir = "Car_Split"
noise_dir = "Noise_Split"

# os.listdir() returns entries in arbitrary order. The splits are addressed by
# index further down, so sort the listings to make "split 0..3" refer to the
# same files on every run and on every machine.
car_splits = sorted(os.listdir(car_dir))
noise_splits = sorted(os.listdir(noise_dir))

In [104]:
# Extracting all the training splits of the car dataset.
# NOTE: pickle.load can execute arbitrary code; only load split files from a trusted source.
car_pickles = []
for split_idx in range(4):
    # Paths are built with "/" exactly as before: <car_dir>/<split file name>.
    split_path = "/".join((car_dir, car_splits[split_idx]))
    with open(split_path, 'rb') as file_name:
        car_pickles.append(pickle.load(file_name))

# Expose the four splits under the names used by the rest of the notebook.
car_pickle_0, car_pickle_1, car_pickle_2, car_pickle_3 = car_pickles
del car_pickles  # do not keep an extra reference to the (large) splits alive

In [105]:
# Extracting all the training splits of the noise dataset.
# NOTE: pickle.load can execute arbitrary code; only load split files from a trusted source.
noise_pickles = []
for split_idx in range(4):
    # Paths are built with "/" exactly as before: <noise_dir>/<split file name>.
    split_path = "/".join((noise_dir, noise_splits[split_idx]))
    with open(split_path, 'rb') as file_name:
        noise_pickles.append(pickle.load(file_name))

# Expose the four splits under the names used by the rest of the notebook.
noise_pickle_0, noise_pickle_1, noise_pickle_2, noise_pickle_3 = noise_pickles
del noise_pickles  # do not keep an extra reference to the (large) splits alive

In [108]:
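# Each split is a list with one entry per image. Each entry holds that image's
# descriptors as an (n, d) collection, where
#   n = number of descriptor vectors found in the image (varies per image)
#   d = length of each descriptor vector (64 in the recorded outputs below)
#
# For clustering we need to flatten a split of m images into a single
# (n_1 + n_2 + ... + n_m, d) collection of descriptor vectors.
# np.shape(car_pickle_0[0][0]) 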

In [109]:
def reshape_descriptor(pickle_file):
    """Flatten per-entry descriptor sets into one flat list of descriptor vectors.

    Parameters
    ----------
    pickle_file : sequence
        One entry per image (presumably). Each entry is either ``None`` or
        empty, in which case it is skipped, or an iterable of descriptor vectors.

    Returns
    -------
    list
        Every descriptor vector from every non-empty entry, in input order.

    Side effects
    ------------
    Prints the shape of the input before flattening and the stacked shape of
    the result after flattening.
    """
    # The entries generally have different lengths, so the input is ragged.
    # np.shape() on a ragged list forces an array conversion, which raises
    # ValueError on NumPy >= 1.24, so report the length directly instead.
    if hasattr(pickle_file, "shape"):
        before_shape = pickle_file.shape
    else:
        before_shape = (len(pickle_file),)
    print("Before Extraction: \t", before_shape)

    desc_arr = []
    for image_descs in pickle_file:
        if image_descs is not None and len(image_descs) != 0:
            desc_arr.extend(image_descs)

    # Derive the stacked shape from the first vector rather than calling
    # np.shape(desc_arr), which would copy every vector into a temporary
    # array just to print its shape.
    if desc_arr:
        after_shape = (len(desc_arr),) + tuple(np.shape(desc_arr[0]))
    else:
        after_shape = (0,)
    print("After Extraction: \t", after_shape)

    return desc_arr


In [110]:
# Flatten every split into a flat list of descriptor vectors for clustering.
# Within each pair the car split is processed before the noise split, so the
# printed shapes appear in the order car 0, noise 0, car 1, noise 1, ...
car_desc_0, noise_desc_0 = reshape_descriptor(car_pickle_0), reshape_descriptor(noise_pickle_0)

car_desc_1, noise_desc_1 = reshape_descriptor(car_pickle_1), reshape_descriptor(noise_pickle_1)

car_desc_2, noise_desc_2 = reshape_descriptor(car_pickle_2), reshape_descriptor(noise_pickle_2)

car_desc_3, noise_desc_3 = reshape_descriptor(car_pickle_3), reshape_descriptor(noise_pickle_3)

Before Extraction: 	 (1639,)
After Extraction: 	 (1489715, 64)
Before Extraction: 	 (1861,)
After Extraction: 	 (2636113, 64)
Before Extraction: 	 (1638,)
After Extraction: 	 (1565370, 64)
Before Extraction: 	 (1638,)
After Extraction: 	 (2217862, 64)
Before Extraction: 	 (1639,)
After Extraction: 	 (1506032, 64)
Before Extraction: 	 (1639,)
After Extraction: 	 (2211980, 64)
Before Extraction: 	 (1638,)
After Extraction: 	 (1555876, 64)
Before Extraction: 	 (1638,)
After Extraction: 	 (2409413, 64)


In [112]:
# Sanity check: stacked shape of the flattened descriptors of car split 0.
print(np.asarray(car_desc_0).shape)

(1489715, 64)


In [117]:
# Hyperparameters for the MiniBatchKMeans visual vocabulary.
# NOTE(review): per the scikit-learn docs, partial_fit() treats each array it is
# given as a single mini-batch and max_iter applies to fit() — confirm whether
# `iters` and `bs` have any effect on how this notebook trains the model.
cluster_no = 1_500  # n_clusters: number of cluster centres
iters = 1_000       # max_iter passed to the estimator
bs = 32             # batch_size passed to the estimator (also part of the saved file name)
rs = 5              # random_state seed (also part of the saved file name)

In [118]:
# Create the clustering model from the hyperparameters defined in the cell above.
kmeans_params = {
    "n_clusters": cluster_no,
    "max_iter": iters,
    "batch_size": bs,
    "random_state": rs,
}
kmeans_batch = MiniBatchKMeans(**kmeans_params)

In [119]:
#MiniBatchKMeans Clustering using Partial Fits

In [120]:
# Time one partial_fit update on car split 0. perf_counter() is a monotonic,
# high-resolution clock, so the elapsed value is not distorted by system
# clock adjustments during a long run.
start_time = time.perf_counter()
kmeans_batch.partial_fit(car_desc_0)
print("Time: ", time.perf_counter() - start_time)

Time:  1022.6850101947784


In [121]:
# Time one partial_fit update on car split 1. perf_counter() is a monotonic,
# high-resolution clock, so the elapsed value is not distorted by system
# clock adjustments during a long run.
start_time = time.perf_counter()
kmeans_batch.partial_fit(car_desc_1)
print("Time: ", time.perf_counter() - start_time)

Time:  53.34075379371643


In [122]:
# Time one partial_fit update on car split 2. perf_counter() is a monotonic,
# high-resolution clock, so the elapsed value is not distorted by system
# clock adjustments during a long run.
start_time = time.perf_counter()
kmeans_batch.partial_fit(car_desc_2)
print("Time: ", time.perf_counter() - start_time)

Time:  69.83170676231384


In [123]:
# Time one partial_fit update on car split 3. perf_counter() is a monotonic,
# high-resolution clock, so the elapsed value is not distorted by system
# clock adjustments during a long run.
start_time = time.perf_counter()
kmeans_batch.partial_fit(car_desc_3)
print("Time: ", time.perf_counter() - start_time)

Time:  63.25464916229248


In [124]:
# Time one partial_fit update on noise split 0. perf_counter() is a monotonic,
# high-resolution clock, so the elapsed value is not distorted by system
# clock adjustments during a long run.
start_time = time.perf_counter()
kmeans_batch.partial_fit(noise_desc_0)
print("Time: ", time.perf_counter() - start_time)

Time:  108.87318110466003


In [125]:
# Time one partial_fit update on noise split 1. perf_counter() is a monotonic,
# high-resolution clock, so the elapsed value is not distorted by system
# clock adjustments during a long run.
start_time = time.perf_counter()
kmeans_batch.partial_fit(noise_desc_1)
print("Time: ", time.perf_counter() - start_time)

Time:  148.33403491973877


In [126]:
# Time one partial_fit update on noise split 2. perf_counter() is a monotonic,
# high-resolution clock, so the elapsed value is not distorted by system
# clock adjustments during a long run.
start_time = time.perf_counter()
kmeans_batch.partial_fit(noise_desc_2)
print("Time: ", time.perf_counter() - start_time)

Time:  101.79124593734741


In [127]:
# Time one partial_fit update on noise split 3. perf_counter() is a monotonic,
# high-resolution clock, so the elapsed value is not distorted by system
# clock adjustments during a long run.
start_time = time.perf_counter()
kmeans_batch.partial_fit(noise_desc_3)
print("Time: ", time.perf_counter() - start_time)

Time:  147.5459587574005


In [128]:
# File name encoding the clustering hyperparameters. The underscore before
# ".sav" is intentional here: it reproduces the name shown in the recorded output.
name = "KMeans_c_{}_b_{}_rs_{}_.sav".format(cluster_no, bs, rs)
print(name)

KMeans_c_1500_b_32_rs_5_.sav


In [129]:
# Dumping the trained model.
# The context manager guarantees the file is flushed and closed, even if
# pickling raises part-way through.
with open(name, 'wb') as model_file:
    pickle.dump(kmeans_batch, model_file)