In [267]:
import cv2 as cv
import os
import pandas as pd
import numpy as np
import json
import pickle
import time

In [268]:
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix

In [269]:
#Load the model
with open("KMeans_c_1500_b_32_rs_5_.sav", 'rb') as f_name:
    kmeans_batch = pickle.load(f_name)

In [270]:
# For each image, a histogram needs to be created
# For each descriptor, a cluster number or centroid label is given.
# Then for the entire image, a histogram counting cluster labels is generated.

In [271]:
#import all the feature descriptors
path_dir = os.getcwd()
train_car_dir = "Car_Split"
car_splits = os.listdir(train_car_dir)
train_noise_dir = "Noise_Split"
noise_splits = os.listdir(train_noise_dir)

In [272]:
car_splits

['car_desc_split_4.pkl',
 'car_desc_split_3.pkl',
 'car_desc_split_2.pkl',
 'car_desc_split_1.pkl']

In [273]:
#Extract all the different splits of the car train dataset
split_path = "/".join((train_car_dir, car_splits[0]))
with open(split_path, 'rb') as file_name:
    car_pickle_0 = pickle.load(file_name)

split_path = "/".join((train_car_dir, car_splits[1]))
with open(split_path, 'rb') as file_name:
    car_pickle_1 = pickle.load(file_name)

split_path = "/".join((train_car_dir, car_splits[2]))
with open(split_path, 'rb') as file_name:
    car_pickle_2 = pickle.load(file_name)

split_path = "/".join((train_car_dir, car_splits[3]))
with open(split_path, 'rb') as file_name:
    car_pickle_3 = pickle.load(file_name)

In [274]:
#Extracting all the different splits of the noise train dataset
split_path = "/".join((train_noise_dir, noise_splits[0]))
with open(split_path, 'rb') as file_name:
    noise_pickle_0 = pickle.load(file_name)

split_path = "/".join((train_noise_dir, noise_splits[1]))
with open(split_path, 'rb') as file_name:
    noise_pickle_1 = pickle.load(file_name)

split_path = "/".join((train_noise_dir, noise_splits[2]))
with open(split_path, 'rb') as file_name:
    noise_pickle_2 = pickle.load(file_name)

split_path = "/".join((train_noise_dir, noise_splits[3]))
with open(split_path, 'rb') as file_name:
    noise_pickle_3 = pickle.load(file_name)

In [275]:
#returns a list of cluster labels for each feature in pickle_file, using the clustering model model
def cluster_labels(pickle_file, model):
    start_time = time.time()
    #pickle_file = car_descriptors
    
    img_cluster = []
    for img_desc in pickle_file:
        cluster_desc = []
        if (not img_desc is None) and len(img_desc)> 0:
            #for desc in img_desc:
            cluster_desc = model.predict(img_desc)
            img_cluster.append(cluster_desc)
    print(time.time() - start_time)
    return(img_cluster)

In [276]:
#get cluster labels
car_desc_0 = cluster_labels(pickle_file = car_pickle_0, model = kmeans_batch)
noise_desc_0 = cluster_labels(noise_pickle_0, model = kmeans_batch)
print("\n")
car_desc_1 = cluster_labels(pickle_file = car_pickle_1, model = kmeans_batch)
noise_desc_1 = cluster_labels(pickle_file = noise_pickle_1, model = kmeans_batch)
print("\n")
car_desc_2 = cluster_labels(pickle_file = car_pickle_2, model = kmeans_batch)
noise_desc_2 = cluster_labels(pickle_file = noise_pickle_2, model = kmeans_batch)
print("\n")
car_desc_3 = cluster_labels(pickle_file = car_pickle_3, model = kmeans_batch)
noise_desc_3 = cluster_labels(pickle_file = noise_pickle_3, model = kmeans_batch)

111.20344305038452
133.01310920715332


66.71006321907043
92.39143300056458


67.10342812538147
131.9479386806488


121.54415702819824
185.01462602615356


In [277]:
car_clusters = np.concatenate([car_desc_0, car_desc_1, car_desc_2, car_desc_3])
noise_clusters = np.concatenate([noise_desc_0, noise_desc_1, noise_desc_2, noise_desc_3])



In [278]:
display(np.shape(car_clusters))
display(np.shape(noise_clusters))

(6554,)

(6749,)

In [279]:
#create frequency histogram for each list of cluster labels
def cluster_histogram(img_clusters, n_clusters = 500):
    start_time = time.time()
    hist_arr = []
    for img in img_clusters:
        hist = np.zeros(n_clusters)
        for cluster in img:
            hist[cluster] += 1
        hist_arr.append(hist)
    print(time.time() - start_time)
    return(hist_arr)

In [280]:
#create histogram
n_cluster = 1500
#Shouldn't take more than 3 seconds for each function
car_hist = cluster_histogram(img_clusters = car_clusters, n_clusters = n_cluster)
noise_hist = cluster_histogram(img_clusters = noise_clusters, n_clusters = n_cluster)

18.88273787498474
17.853725910186768


2.6755881309509277


In [281]:
#standardise the histogram data so that it can be used in a classifier
car_df_v1 = pd.DataFrame(car_hist)
sscaler = StandardScaler()
car_df_ss = pd.DataFrame(sscaler.fit_transform(car_df_v1))

In [282]:
print(np.max(car_df_v1[0]))

60.0


In [283]:
car_df = pd.DataFrame(car_hist)
mm_scalar = MinMaxScaler()
car_df_mm = pd.DataFrame(mm_scalar.fit_transform(car_df))

In [284]:
noise_df = pd.DataFrame(noise_hist)
sscaler = StandardScaler()
noise_df_ss = pd.DataFrame(sscaler.fit_transform(noise_df))

In [285]:
noise_df = pd.DataFrame(noise_hist)
mm_scalar = MinMaxScaler()
noise_df_mm = pd.DataFrame(mm_scalar.fit_transform(noise_df))

In [286]:
car_df_mm.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1490,1491,1492,1493,1494,1495,1496,1497,1498,1499
0,0.0,0.0,0.046512,0.0,0.0,0.0,0.015385,0.0,0.0,0.0,...,0.1,0.1875,0.0,0.0,0.02381,0.01087,0.0,0.0,0.0,0.0
1,0.0,0.0,0.116279,0.016949,0.0,0.041667,0.0,0.0,0.02439,0.044444,...,0.0,0.0,0.0,0.0,0.02381,0.0,0.027778,0.0,0.0625,0.0
2,0.0,0.047619,0.0,0.0,0.0,0.166667,0.0,0.0,0.0,0.022222,...,0.05,0.125,0.0,0.0,0.02381,0.0,0.0,0.0,0.0,0.0
3,0.016667,0.0,0.023256,0.016949,0.0,0.041667,0.0,0.022727,0.0,0.0,...,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.019048,0.0,0.0
4,0.0,0.047619,0.023256,0.016949,0.0,0.0,0.0,0.022727,0.0,0.0,...,0.05,0.0625,0.0,0.0,0.02381,0.032609,0.0,0.019048,0.0,0.0


In [288]:
sample_car_mm = car_df_mm
sample_noise_mm = noise_df_mm

sample_car_ss = car_df_ss
sample_noise_ss = noise_df_ss

In [289]:
# It happens that sometimes there are no freq found for that particular cluster
print(np.any(sample_car_mm.isna())) 
print(np.any(sample_noise_mm.isna()))

False
False


In [290]:
# sample_car = sample_car.fillna(value = -1)
# sample_noise = sample_noise.fillna(value = -1)

In [291]:
# print(np.any(sample_car_mm.isna())) 
# print(np.any(sample_noise_mm.isna()))

In [292]:
#save histogram data to csv files
sample_car_mm.to_csv("Car_Hist_MinMax_Max1500_v4.csv", sep = ',', index= False)
sample_car_ss.to_csv("Car_Hist_SS_Max1500_v4.csv", sep = ',', index= False)

In [293]:
sample_noise_mm.to_csv("Noise_Hist_MinMax_Max1500_v4.csv", sep = ',', index= False)
sample_noise_ss.to_csv("Noise_Hist_SS_Max1500_v4.csv", sep = ',', index= False)