# Grid Search for K-Means in Latent Space (+ Comparison to Initial Space)

In [None]:
import os

from multiprocessing.pool import ThreadPool

import ctypes
from ctypes import *

import numpy as np

from tensorflow.keras.models import load_model

from autoencoder import Autoencoder

from helper_funcs import *

import pandas
pandas.set_option('display.max_rows', None)

from params import get_kmeans_eval_object, get_centroids, convert_to_2d_array, get_stotal, free_centroids, free_kmeans

2024-01-04 12:11:49.348022: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-01-04 12:11:49.410271: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-01-04 12:11:49.411229: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
models = os.listdir('./models/')

dataset = b'MNIST/input.dat'
query   = b'MNIST/query.dat'

model_to_files = {}
for i, model in enumerate(models):
    normalized_dataset = b'MNIST/' + models[i].removesuffix('.keras').encode() + b'_normalized_dataset.dat'
    normalized_query   = b'MNIST/' + models[i].removesuffix('.keras').encode() + b'_normalized_query.dat'
    encoded_dataset    = b'MNIST/' + models[i].removesuffix('.keras').encode() + b'_encoded_dataset.dat'
    encoded_query      = b'MNIST/' + models[i].removesuffix('.keras').encode() + b'_encoded_query.dat'
    decoded_dataset    = b'MNIST/' + models[i].removesuffix('.keras').encode() + b'_decoded_dataset.dat'
    decoded_query      = b'MNIST/' + models[i].removesuffix('.keras').encode() + b'_decoded_query.dat'

    model_to_files.update({models[i] : [normalized_dataset, normalized_query,
                                        encoded_dataset, encoded_query,
                                        decoded_dataset, decoded_query]})

n = 60000

In [3]:
for model in model_to_files:
    normalized_dataset, normalized_query, encoded_dataset, encoded_query, decoded_dataset, decoded_query = model_to_files[model]

    model = b'models/' + model.encode()

    # load model
    autoencoder = load_model(model.decode())
    shape = autoencoder.layers[-2].output_shape[1:] # get shape of encoded layer

    # load dataset
    x_train = load_dataset(dataset)
    x_train = x_train.astype('float32') / 255.
    x_test = load_dataset(query)
    x_test = x_test.astype('float32') / 255.
    if len(shape) == 3: # if model type is convolutional
        x_train = np.reshape(x_train, (len(x_train), 28, 28, 1))
        x_test = np.reshape(x_test, (len(x_test), 28, 28, 1))
    else:
        x_train = np.reshape(x_train, (len(x_train), 784))
        x_test = np.reshape(x_test, (len(x_test), 784))

    encoded_train = autoencoder.encode(x_train)
    encoded_test = autoencoder.encode(x_test)

    # deflatten encoded datasets
    encoded_train = deflatten_encoded(encoded_train, shape)
    encoded_test = deflatten_encoded(encoded_test, shape)

    # decode encoded datasets
    decoded_train = autoencoder.decode(encoded_train)
    decoded_test = autoencoder.decode(encoded_test)

    # save original datasets normalized
    save_decoded_binary(x_train, normalized_dataset)
    save_decoded_binary(x_test, normalized_query)

    # normalize encoded datasets
    encoded_train = normalize(encoded_train)
    encoded_test = normalize(encoded_test)

    # save encoded datasets
    save_encoded_binary(encoded_train, encoded_dataset)
    save_encoded_binary(encoded_test, encoded_query)

    # normalize decoded datasets
    decoded_train = normalize(decoded_train)
    decoded_test = normalize(decoded_test)

    # save decoded datasets
    save_decoded_binary(decoded_train, decoded_dataset)
    save_decoded_binary(decoded_test, decoded_query)



In [4]:
best_params_lsh = [4, 7, 1, 0.6]            # L, k, limit_queries, window
best_params_hypercube = [67, 3, 1000, 0.42] # M, k, probes, window

rows = []

In [5]:
def print_results(stotal_lat_init, silouette_lat_init):
    print("Total silhouette:", stotal_lat_init)
    print("Silhouette per cluster:", silouette_lat_init)
    print("-------------------------")

def convert_centroids_to_2d_array(model, centroids):
    model = b'models/' + model.encode()

    autoencoder = load_model(model.decode())
    shape = autoencoder.layers[-2].output_shape[1:] # shape of encoded layer

    centroids = deflatten_encoded(centroids, shape)
    decoded_centroids = autoencoder.decode(centroids)
    decoded_centroids = flatten_encoded(decoded_centroids)

    decoded_centroids = decoded_centroids.astype(np.float64)
    decoded_centroids = decoded_centroids.flatten()
    decoded_centroids = decoded_centroids.ctypes.data_as(ctypes.POINTER(ctypes.c_double))
    decoded_centroids = convert_to_2d_array(decoded_centroids, 784)

    return decoded_centroids

def run_kmeans_classic(model):
    normalized_dataset, encoded_dataset = model_to_files[model][0], model_to_files[model][2]

    config = {
        'model': b'CLASSIC',
        'vals': [],
        'dataset': normalized_dataset,
        'encoded_dataset': encoded_dataset,
    }
    
    kmeans = get_kmeans_eval_object(conf=config)
    centroids, dim = get_centroids(kmeans)

    decoded_centroids = convert_centroids_to_2d_array(model, centroids)

    # total silhouette and silhouette per cluster in latent space converted to initial space
    stotal_lat_init, silouette_lat_init = get_stotal(config, dim, kmeans, decoded_centroids)

    free_centroids(decoded_centroids)
    free_kmeans(kmeans)

    print_results(stotal_lat_init.value, silouette_lat_init.val)

    rows.append([model, 'CLASSIC', stotal_lat_init.value])

    del silouette_lat_init

def run_kmeans_lsh(model):
    normalized_dataset, encoded_dataset = model_to_files[model][0], model_to_files[model][2]

    config = {
        'model': b'LSH',
        'vals': best_params_lsh[:-1],
        'window': best_params_lsh[-1],
        'dataset': normalized_dataset,
        'encoded_dataset': encoded_dataset,
    }

    kmeans = get_kmeans_eval_object(conf=config)
    centroids, dim = get_centroids(kmeans)

    decoded_centroids = convert_centroids_to_2d_array(model, centroids)

    # total silhouette and silhouette per cluster in latent space converted to initial space
    stotal_lat_init, silouette_lat_init = get_stotal(config, dim, kmeans, decoded_centroids)

    free_centroids(decoded_centroids)
    free_kmeans(kmeans)

    print_results(stotal_lat_init.value, silouette_lat_init.val)

    rows.append([model, 'LSH', stotal_lat_init.value])

    del silouette_lat_init

def run_kmeans_cube(model):
    normalized_dataset, encoded_dataset = model_to_files[model][0], model_to_files[model][2]

    config = {
        'model': b'CUBE',
        'vals': best_params_hypercube[:-1],
        'window': best_params_hypercube[-1],
        'dataset': normalized_dataset,
        'encoded_dataset': encoded_dataset,
    }

    kmeans = get_kmeans_eval_object(conf=config)
    centroids, dim = get_centroids(kmeans)

    decoded_centroids = convert_centroids_to_2d_array(model, centroids)

    # total silhouette and silhouette per cluster in latent space converted to initial space
    stotal_lat_init, silouette_lat_init = get_stotal(config, dim, kmeans, decoded_centroids)

    free_centroids(decoded_centroids)
    free_kmeans(kmeans)

    print_results(stotal_lat_init.value, silouette_lat_init.val)

    rows.append([model, 'CUBE', stotal_lat_init.value])

    del silouette_lat_init

In [6]:
pool = ThreadPool(processes=4)

for model in models:
    pool.apply_async(run_kmeans_classic, (model,))
    pool.apply_async(run_kmeans_lsh, (model,))
    pool.apply_async(run_kmeans_cube, (model,))

pool.close()
pool.join()

10 inner and 2 outer loops
37 iterations
35 inner and 7 outer loops
62 iterations
Total silhouette: 0.09440222612147606
Silhouette per cluster: [0.14804095569455247, 0.05058881662168121, 0.008640928030901657, 0.0748170566329078, 0.03811378867382249, 0.09796761704554012, 0.23731423753168318, 0.08053473069981237, -0.03535705496137443, 0.11119362913233169]
-------------------------
35 inner and 7 outer loops
Total silhouette: 0.17263094127899897
Silhouette per cluster: [0.07213831135011047, 0.14870385359457214, 0.36175451686098864, 0.14818657406574007, 0.10516854526484454, 0.19455551091347262, 0.13836174511447222, 0.16753711490510634, 0.07354964673146142, 0.2380553042661853]
-------------------------
9 inner and 2 outer loops
Total silhouette: 0.08490649607195802
Silhouette per cluster: [0.07264655617607536, 0.09468925987599623, 0.051373747067596086, 0.022735180004488202, 0.04723501049195511, 0.2267749541776766, -0.007089719789733917, 0.06613178987270046, 0.05801696755657048, 0.0624378992

  return ops.EagerTensor(value, ctx.device_name, dtype)


Total silhouette: 0.06598782684847887
Silhouette per cluster: [0.3671613938983649, 0.04502183690845602, 0.022762353809171482, 0.012280346585255914, 0.034104530822866956, 0.012881639957545805, 0.02757694317276298, 0.06356456962586712, 0.042756985182794735, 0.09234013379846846]
-------------------------
Total silhouette: -0.01900470296787488
Silhouette per cluster: [0.3093517140973288, 0.002112894590953251, -0.06951735320514488, 0.01151488829600666, -0.04664717642325755, -0.004328341231388344, -0.035165163766627504, 0.020042789588437766, -0.0025613094313014255, 0.003577128749187453]
-------------------------
Total silhouette: 0.10088760862910588
Silhouette per cluster: [0.060292292955651014, 0.027235573524612706, 0.1638341794660782, 0.07847660325935131, 0.146503879784006, 0.10616753976179095, 0.05024588261655992, 0.022639902959237797, 0.0045657015759732, 0.04217948955650269]
-------------------------
Total silhouette: 0.12002735297267389
Silhouette per cluster: [0.09835591724948659, -0.0

In [7]:
col_models, col_methods, col_stotal_lat_init = [], [], []

for row in rows:
    model, method, stotal_lat_init = row

    col_models.append(model)
    col_methods.append(method)
    col_stotal_lat_init.append(stotal_lat_init)

col_dict = {'model': col_models, 'method': col_methods, 'silhouette total (latent to initial)': col_stotal_lat_init}

df = pandas.DataFrame(data=col_dict)
df

Unnamed: 0,model,method,silhouette total (latent to initial)
0,model_conv_46.keras,CUBE,0.094402
1,model_conv_12.keras,CLASSIC,0.172631
2,model_conv_46.keras,LSH,0.084906
3,model_conv_46.keras,CLASSIC,0.134756
4,model_conv_12.keras,CUBE,0.230499
5,model_conv_12.keras,LSH,0.069918
6,model_dense_43.keras,CLASSIC,0.114697
7,model_dense_43.keras,LSH,0.00321
8,model_conv_19.keras,CLASSIC,0.220935
9,model_dense_43.keras,CUBE,0.028575
