# Grid Search for K-Means in Latent Space

In [None]:
import os

import threading
from threading import Lock

import numpy as np

from tensorflow.keras.models import load_model

from autoencoder import Autoencoder
from helper_funcs import *

import pandas
pandas.set_option('display.max_rows', None)

from params import kmeans_test

2024-01-03 12:17:52.036953: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-01-03 12:17:52.378524: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-01-03 12:17:52.380401: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Every directory entry under ./models/ is treated as a saved model file.
models = os.listdir('./models/')

# Raw input files shared by all models (paths are kept as bytes).
dataset = b'MNIST/input.dat'
query   = b'MNIST/query.dat'

# Map each model file name to the four per-model files, in this order:
# [normalized dataset, normalized query, encoded dataset, encoded query].
model_to_files = {}
for model in models:
    # Common stem: "MNIST/<model name without the .keras suffix>".
    prefix = b'MNIST/' + model.removesuffix('.keras').encode()
    model_to_files[model] = [
        prefix + b'_normalized_dataset.dat',
        prefix + b'_normalized_query.dat',
        prefix + b'_encoded_dataset.dat',
        prefix + b'_encoded_query.dat',
    ]

# NOTE(review): presumably the number of training images; not referenced in
# the cells visible here -- confirm before removing.
n = 60000

In [3]:
# For every saved autoencoder: encode the train/query sets, then write both
# the normalized originals and the normalized encodings to that model's files.

# The pixel data does not depend on the model, so it is read and scaled to
# [0, 1] once here; only the reshape inside the loop is model-specific.
raw_train = load_dataset(dataset).astype('float32') / 255.
raw_test = load_dataset(query).astype('float32') / 255.

for model_name, file_paths in model_to_files.items():
    normalized_dataset, normalized_query, encoded_dataset, encoded_query = file_paths

    # load model
    autoencoder = load_model('models/' + model_name)
    shape = autoencoder.layers[-2].output_shape[1:] # get shape of encoded layer

    if len(shape) == 3: # if model type is convolutional
        x_train = np.reshape(raw_train, (len(raw_train), 28, 28, 1))
        x_test = np.reshape(raw_test, (len(raw_test), 28, 28, 1))
    else:
        x_train = np.reshape(raw_train, (len(raw_train), 784))
        x_test = np.reshape(raw_test, (len(raw_test), 784))

    encoded_train = autoencoder.encode(x_train)
    encoded_test = autoencoder.encode(x_test)

    # deflatten encoded datasets
    encoded_train = deflatten_encoded(encoded_train, shape)
    encoded_test = deflatten_encoded(encoded_test, shape)

    # save original datasets normalized
    save_decoded_binary(x_train, normalized_dataset)
    save_decoded_binary(x_test, normalized_query)

    # normalize encoded datasets
    encoded_train = normalize(encoded_train)
    encoded_test = normalize(encoded_test)

    # save encoded datasets
    save_encoded_binary(encoded_train, encoded_dataset)
    save_encoded_binary(encoded_test, encoded_query)



In [4]:
# Parameter sets handed to the clustering runs. In each list the last entry
# is the window; the entries before it are passed on together as 'vals'.
best_params_lsh = [
    4,    # L
    7,    # k
    1,    # limit_queries
    0.6,  # window
]
best_params_hypercube = [
    67,    # M
    3,     # k
    1000,  # probes
    0.42,  # window
]

# One lock per shared resource used by the worker threads: stdout and the
# list that collects one result row per run.
print_lock, rows_lock = Lock(), Lock()
rows = list()

In [5]:
def print_results(clustering_time, stotal_latent, silhouette):
    """Print the metrics of one clustering run as a single uninterrupted group.

    Args:
        clustering_time: Value printed after "Clustering time :".
        stotal_latent: Value printed after "Total silhouette:".
        silhouette: Value printed after "Silhouette per cluster:".
    """
    # Hold the lock for the whole group so lines from concurrent runs do not
    # interleave; `with` also releases it if one of the prints raises.
    with print_lock:
        print("Clustering time :", clustering_time)
        print("Total silhouette:", stotal_latent)
        print("Silhouette per cluster:", silhouette)
        print("-------------------------")

def rows_append(row):
    """Append `row` to the shared `rows` list while holding `rows_lock`.

    Args:
        row: The result row to store; it is appended as-is.
    """
    # `with` guarantees the lock is released even if the append raises.
    with rows_lock:
        rows.append(row)

def _run_kmeans(model, method, params=None):
    """Run one k-means variant on a model's encoded dataset and record it.

    Builds the config for `kmeans_test`, runs it, prints the metrics and
    appends a `[model, method, time, total silhouette, per-cluster
    silhouette]` row to the shared results.

    Args:
        model: Key into `model_to_files`; index 2 of its entry is used as the
            dataset path.
        method: Method tag ('CLASSIC', 'LSH' or 'CUBE'); sent to
            `kmeans_test` as bytes and stored in the row as str.
        params: Optional parameter list whose last entry is the window and
            whose leading entries become 'vals'. When omitted, 'vals' is
            empty and no 'window' key is set.
    """
    config = {'model': method.encode(), 'vals': []}
    if params is not None:
        config['vals'] = params[:-1]
        config['window'] = params[-1]
    config['dataset'] = model_to_files[model][2]

    # NOTE(review): meaning of int_data=0 is defined by kmeans_test (params
    # module) -- not visible here.
    clustering_time, stotal_latent, silhouette = kmeans_test(conf=config, int_data=0)

    # Read the result fields once so the printed and stored values agree.
    elapsed = clustering_time.value
    total_silhouette = stotal_latent.value
    sil_val = silhouette.val

    print_results(elapsed, total_silhouette, sil_val)
    rows_append([model, method, elapsed, total_silhouette, sil_val])


def run_kmeans_classic(model):
    """Cluster `model`'s encoded dataset with the 'CLASSIC' method."""
    _run_kmeans(model, 'CLASSIC')


def run_kmeans_lsh(model):
    """Cluster `model`'s encoded dataset with 'LSH' and `best_params_lsh`."""
    _run_kmeans(model, 'LSH', best_params_lsh)


def run_kmeans_cube(model):
    """Cluster `model`'s encoded dataset with 'CUBE' and `best_params_hypercube`."""
    _run_kmeans(model, 'CUBE', best_params_hypercube)

In [6]:
# For each model, run the three clustering variants in parallel threads and
# wait for all three before moving on to the next model.
for model in models:
    workers = [
        threading.Thread(target=runner, args=(model,))
        for runner in (run_kmeans_classic, run_kmeans_lsh, run_kmeans_cube)
    ]

    for worker in workers:
        worker.start()

    for worker in workers:
        worker.join()

10 inner and 2 outer loops
32 iterations
69 inner and 13 outer loops
Clustering time : 2.207725
Total silhouette: 0.15273642994280354
Silhouette per cluster: [0.24574403556823302, 0.08079417370992151, 0.1368033325988184, 0.17758788425782418, 0.11568465456148772, 0.18743069725986283, 0.3005274267050865, 0.12717935696812493, 0.13097833459471633, 0.11887556556010243]
-------------------------
Clustering time : 1.104387
Total silhouette: 0.13564754823688918
Silhouette per cluster: [0.1545314060036698, 0.09576984904089328, 0.16802078248082467, 0.12049373711694306, 0.11909871695151265, 0.08381139113079664, 0.1864696214170033, 0.0835582612740332, 0.0666212260375827, 0.10523512929165242]
-------------------------
Clustering time : 4.438576
Total silhouette: 0.08659116301553049
Silhouette per cluster: [-0.02372875736467623, 0.04895615395190362, 0.2696109621402956, 0.05585206724619963, 0.020722599119138877, 0.06486370027361175, 0.030854966353112936, 0.03457951516911508, 0.02797603310842175, 0.05

In [7]:
# methods_to_functions = {'CLASSIC': run_kmeans_classic, 'LSH': run_kmeans_lsh, 'CUBE': run_kmeans_cube}

# threads = []
# for model in models:
#     for method in methods_to_functions:
#         t = threading.Thread(target=methods_to_functions[method], args=(model,))
#         t.start()
#         threads.append(t)

# for t in threads:
#     t.join()

In [8]:
# Build the summary table from the collected rows. Each row is
# [model, method, clustering time, total silhouette, per-cluster silhouette];
# the per-cluster values (index 4) are left out of the table.
col_models = [row[0] for row in rows]
col_methods = [row[1] for row in rows]
col_clustering_time = [row[2] for row in rows]
col_stotal_latent = [row[3] for row in rows]

col_dict = dict(zip(
    ('model', 'method', 'clustering time', 'silhouette total'),
    (col_models, col_methods, col_clustering_time, col_stotal_latent),
))

df = pandas.DataFrame(data=col_dict)
df

Unnamed: 0,model,method,clustering time,silhouette total
0,model_conv_46.keras,CLASSIC,2.207725,0.152736
1,model_conv_46.keras,CUBE,1.104387,0.135648
2,model_conv_46.keras,LSH,4.438576,0.086591
3,model_conv_12.keras,CLASSIC,2.760023,0.137733
4,model_conv_12.keras,LSH,2.755275,0.105477
5,model_conv_12.keras,CUBE,1.186346,0.125372
6,model_dense_43.keras,CLASSIC,2.96709,0.115513
7,model_dense_43.keras,CUBE,1.57011,0.084312
8,model_dense_43.keras,LSH,3.829148,0.06654
9,model_conv_19.keras,CLASSIC,4.409456,0.139195
