# Grid Search for K-Means in Initial and Latent Space (without projecting it back to Initial for evaluation)

In [1]:
import os

import threading
from threading import Lock

import numpy as np

from tensorflow.keras.models import load_model

from multiprocessing.pool import ThreadPool

from autoencoder import Autoencoder
from helper_funcs import *

import pandas
pandas.set_option('display.max_rows', None)

from params import kmeans_test

2024-01-07 01:46:26.705526: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-01-07 01:46:26.780300: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-01-07 01:46:26.781820: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Create files for later

In [2]:
# Trained autoencoders to evaluate. Only '.keras' entries are kept so that
# any other file in the directory is not mistaken for a model, and the list
# is sorted because os.listdir returns entries in arbitrary order.
models = sorted(name for name in os.listdir('./models/') if name.endswith('.keras'))

# Raw input files; file names are kept as bytes, the form in which they are
# handed to the helper functions further down.
dataset = b'MNIST/input.dat'
query   = b'MNIST/query.dat'

# model file name -> [normalized dataset, normalized query,
#                     encoded dataset,    encoded query]
model_to_files = {}
for model in models:
    prefix = b'MNIST/' + model.removesuffix('.keras').encode()

    model_to_files[model] = [
        prefix + b'_normalized_dataset.dat',
        prefix + b'_normalized_query.dat',
        prefix + b'_encoded_dataset.dat',
        prefix + b'_encoded_query.dat',
    ]

n = 60000

In [3]:
# The raw input files are identical for every model, so read them and divide
# by 255 once here instead of once per model inside the loop.
# NOTE(review): assumes load_dataset is deterministic and that the helpers
# called in the loop do not modify their array arguments in place — confirm
# against helper_funcs.
x_train_raw = load_dataset(dataset).astype('float32') / 255.
x_test_raw = load_dataset(query).astype('float32') / 255.

for model_name, output_files in model_to_files.items():
    normalized_dataset, normalized_query, encoded_dataset, encoded_query = output_files

    # load model
    autoencoder = load_model('models/' + model_name)
    shape = autoencoder.layers[-2].output_shape[1:] # get shape of encoded layer

    # a rank-3 encoded shape marks a convolutional model, which is fed
    # 28x28x1 images; every other model is fed flat 784-vectors
    if len(shape) == 3:
        sample_shape = (28, 28, 1)
    else:
        sample_shape = (784,)
    x_train = np.reshape(x_train_raw, (len(x_train_raw),) + sample_shape)
    x_test = np.reshape(x_test_raw, (len(x_test_raw),) + sample_shape)

    encoded_train = autoencoder.encode(x_train)
    encoded_test = autoencoder.encode(x_test)

    # deflatten encoded datasets
    encoded_train = deflatten_encoded(encoded_train, shape)
    encoded_test = deflatten_encoded(encoded_test, shape)

    # save original datasets normalized
    save_decoded_binary(x_train, normalized_dataset)
    save_decoded_binary(x_test, normalized_query)

    # normalize encoded datasets
    encoded_train = normalize(encoded_train)
    encoded_test = normalize(encoded_test)

    # save encoded datasets
    save_encoded_binary(encoded_train, encoded_dataset)
    save_encoded_binary(encoded_test, encoded_query)



# Grid Search for K-Means in Initial Space

In [4]:
# Dataset clustered by the initial-space runs below. This is the normalized
# dataset file name that the file-creation cell derives for
# 'model_conv_12.keras', so that model must be present in ./models/ and the
# encode/save cell must have been run first.
normalized_dataset = b'MNIST/model_conv_12_normalized_dataset.dat'

In [5]:
# Parameters used by the LSH and hypercube runs in the initial space.
# The run_kmeans_*_initial functions send the last element as 'window' and
# the preceding elements as 'vals'.
best_params_lsh = [5, 6, 1, 2222]          # L, k, limit_queries, window
best_params_hypercube = [2153, 13, 950, 2222] # M, k, probes, window

# Result accumulator: each run_kmeans_*_initial call appends one row
# [method, clustering time, total silhouette, per-cluster silhouette, objective].
rows = []

In [6]:
# Serializes the result blocks printed by concurrently running worker threads.
_PRINT_LOCK = Lock()

def print_results(clustering_time, stotal_latent, silhouette, obj_func):
    """Print the metrics of one clustering run as an uninterrupted block.

    The run_kmeans_* functions call this from ThreadPool workers. Holding a
    lock for the whole block keeps the lines of two runs from being mixed
    together. Only output written through this function is serialized;
    anything printed by other code can still appear between blocks.

    Args:
        clustering_time: Value shown on the "Clustering time" line.
        stotal_latent: Value shown on the "Total silhouette" line.
        silhouette: Value shown on the "Silhouette per cluster" line.
        obj_func: Value shown on the "Objective function" line.
    """
    with _PRINT_LOCK:
        print("Clustering time       :", clustering_time)
        print("Total silhouette      :", stotal_latent)
        print("Silhouette per cluster:", silhouette)
        print("Objective function    :", obj_func)
        print("-------------------------")

def _run_kmeans_initial(method, params=None):
    """Run one K-Means variant on `normalized_dataset` and record the outcome.

    Builds the configuration dict, calls `kmeans_test`, prints the metrics
    through `print_results` and appends
    `[method, clustering time, total silhouette, per-cluster silhouette,
    objective]` to the global `rows` list.

    Args:
        method: Variant name as str ('CLASSIC', 'LSH' or 'CUBE'). It is sent
            to `kmeans_test` encoded as bytes and stored in `rows` as str.
        params: None for a variant that takes no tuning parameters (no
            'window' key is sent then). Otherwise a list whose last element
            becomes 'window' and whose preceding elements become 'vals'.
    """
    # Keys are inserted in the order model, vals, [window], dataset.
    config = {
        'model': method.encode(),
        'vals': [] if params is None else params[:-1],
    }
    if params is not None:
        config['window'] = params[-1]
    config['dataset'] = normalized_dataset

    # NOTE(review): the meaning of int_data=0 is defined by params.kmeans_test.
    clustering_time, stotal, silhouette, obj = kmeans_test(conf=config, int_data=0)

    # Read the per-cluster values once so the printed and stored copies agree.
    sil_val = silhouette.val

    print_results(clustering_time.value, stotal.value, sil_val, obj.value)

    rows.append([method, clustering_time.value, stotal.value, sil_val, obj.value])


def run_kmeans_classic_initial():
    """Run the CLASSIC K-Means variant in the initial space and append to `rows`."""
    _run_kmeans_initial('CLASSIC')


def run_kmeans_lsh_initial():
    """Run the LSH variant in the initial space with the current `best_params_lsh`."""
    _run_kmeans_initial('LSH', best_params_lsh)


def run_kmeans_cube_initial():
    """Run the CUBE variant in the initial space with the current `best_params_hypercube`."""
    _run_kmeans_initial('CUBE', best_params_hypercube)

In [7]:
# Run the three variants concurrently, one worker thread each.
pool = ThreadPool(processes=3)

pending = [
    pool.apply_async(run_kmeans_classic_initial),
    pool.apply_async(run_kmeans_lsh_initial),
    pool.apply_async(run_kmeans_cube_initial),
]

pool.close()
pool.join()

# apply_async keeps an exception raised in a worker inside its AsyncResult;
# get() re-raises it here so a failed run is reported instead of just being
# absent from `rows`.
for job in pending:
    job.get()

9 inner and 2 outer loops
36 iterations
28 inner and 7 outer loops
Clustering time       : 74.809344
Total silhouette      : 0.08592976263543374
Silhouette per cluster: [0.1983131333229411, 0.09306968384503408, 0.08296453104567188, 0.03503438971780184, 0.10200916959619197, 0.09217179118413021, 0.026123027577865266, 0.06025725018063737, 0.024839090133952167, 0.052457763154282314]
Objective function    : 2368436.3376048254
-------------------------
Clustering time       : 280.661419
Total silhouette      : 0.041104934944983344
Silhouette per cluster: [0.07482437251162441, 0.04245913255659548, 0.01640818842095041, 0.00682412895904634, 0.08013389406670296, 0.0060228218939041645, 0.03458173458985561, 0.01933585973110541, 0.0032572657276000484, 0.021514952337834112]
Objective function    : 143204868.21116585
-------------------------
Clustering time       : 26.497587
Total silhouette      : 0.07154555982587808
Silhouette per cluster: [0.027925581958038452, 0.07931754609019022, 0.041093067112

In [8]:
# Column-wise view of the collected rows. Each row is
# [method, clustering time, total silhouette, per-cluster silhouette, objective];
# the per-cluster silhouette (index 3) is left out of the table.
col_methods         = [entry[0] for entry in rows]
col_clustering_time = [entry[1] for entry in rows]
col_stotal_latent   = [entry[2] for entry in rows]
obj_func            = [entry[4] for entry in rows]

col_dict = {
    'method': col_methods,
    'clustering time': col_clustering_time,
    'silhouette total': col_stotal_latent,
    'objective function': obj_func,
}

df = pandas.DataFrame(data=col_dict)
df

Unnamed: 0,method,clustering time,silhouette total,objective function
0,CLASSIC,74.809344,0.08593,2368436.0
1,LSH,280.661419,0.041105,143204900.0
2,CUBE,26.497587,0.071546,4.414002e+121


# Grid Search for K-Means in Latent Space

In [9]:
# Parameters used by the LSH and hypercube runs in the latent space.
# These rebind the same global names as the initial-space section, and the
# run_kmeans_*_initial functions read them at call time: re-running the
# initial-space pool after this cell would use these latent-space values.
best_params_lsh = [4, 7, 1, 0.6]            # L, k, limit_queries, window
best_params_hypercube = [67, 3, 1000, 0.42] # M, k, probes, window

# Fresh accumulator for the latent-space runs (the initial-space rows are dropped).
rows = []

In [10]:
# Serializes the result blocks printed by concurrently running worker threads.
_PRINT_LOCK = Lock()

def print_results(clustering_time, stotal_latent, silhouette, obj_func):
    """Print the metrics of one clustering run as an uninterrupted block.

    The run_kmeans_* functions call this from ThreadPool workers. Holding a
    lock for the whole block keeps the lines of two runs from being mixed
    together. Only output written through this function is serialized;
    anything printed by other code can still appear between blocks.

    Args:
        clustering_time: Value shown on the "Clustering time" line.
        stotal_latent: Value shown on the "Total silhouette" line.
        silhouette: Value shown on the "Silhouette per cluster" line.
        obj_func: Value shown on the "Objective function" line.
    """
    with _PRINT_LOCK:
        print("Clustering time       :", clustering_time)
        print("Total silhouette      :", stotal_latent)
        print("Silhouette per cluster:", silhouette)
        print("Objective function    :", obj_func)
        print("-------------------------")

def _run_kmeans_latent(model, method, params=None):
    """Run one K-Means variant on a model's encoded dataset and record the outcome.

    Looks up the encoded dataset file of `model` in `model_to_files` (index 2
    of its file list), calls `kmeans_test`, prints the metrics through
    `print_results` and appends
    `[model, method, clustering time, total silhouette, per-cluster
    silhouette, objective]` to the global `rows` list.

    Args:
        model: Key of `model_to_files` (the model's file name).
        method: Variant name as str ('CLASSIC', 'LSH' or 'CUBE'). It is sent
            to `kmeans_test` encoded as bytes and stored in `rows` as str.
        params: None for a variant that takes no tuning parameters (no
            'window' key is sent then). Otherwise a list whose last element
            becomes 'window' and whose preceding elements become 'vals'.

    Raises:
        KeyError: If `model` is not a key of `model_to_files`.
    """
    encoded_dataset = model_to_files[model][2]

    # Keys are inserted in the order model, vals, [window], dataset.
    config = {
        'model': method.encode(),
        'vals': [] if params is None else params[:-1],
    }
    if params is not None:
        config['window'] = params[-1]
    config['dataset'] = encoded_dataset

    # NOTE(review): the meaning of int_data=0 is defined by params.kmeans_test.
    clustering_time, stotal_latent, silhouette, obj = kmeans_test(conf=config, int_data=0)

    # Read the per-cluster values once so the printed and stored copies agree.
    sil_val = silhouette.val

    print_results(clustering_time.value, stotal_latent.value, sil_val, obj.value)

    rows.append([model, method, clustering_time.value, stotal_latent.value, sil_val, obj.value])


def run_kmeans_classic_latent(model):
    """Run the CLASSIC K-Means variant on the encoded dataset of `model`."""
    _run_kmeans_latent(model, 'CLASSIC')


def run_kmeans_lsh_latent(model):
    """Run the LSH variant on the encoded dataset of `model` with the current `best_params_lsh`."""
    _run_kmeans_latent(model, 'LSH', best_params_lsh)


def run_kmeans_cube_latent(model):
    """Run the CUBE variant on the encoded dataset of `model` with the current `best_params_hypercube`."""
    _run_kmeans_latent(model, 'CUBE', best_params_hypercube)

In [11]:
pool = ThreadPool(processes=4)

# Queue the three K-Means variants for every model, in the order
# classic, LSH, cube.
pending = []
for model in models:
    for runner in (run_kmeans_classic_latent, run_kmeans_lsh_latent, run_kmeans_cube_latent):
        pending.append(pool.apply_async(runner, (model,)))

pool.close()
pool.join()

# apply_async keeps an exception raised in a worker inside its AsyncResult;
# get() re-raises it here so a failed run is reported instead of just being
# absent from `rows`.
for job in pending:
    job.get()

10 inner and 2 outer loops
16 iterations
25 inner and 5 outer loops
129 iterations
Clustering time       : 1.677482
Total silhouette      : 0.15909130173574704
Silhouette per cluster: [0.3371529467194405, 0.08891376218960721, 0.09294598612176265, 0.18537552812824704, 0.19447939106472178, 0.1193459430965385, 0.08320774803612165, 0.11895262297531944, 0.19955684383172817, 0.13187936633485184]
Objective function    : 6842.633540583035
-------------------------
30 inner and 6 outer loops
Clustering time       : 1.48773
Total silhouette      : 0.11714581605509584
Silhouette per cluster: [0.06789285790094304, 0.12512752691288462, 0.0885741714492321, 0.0520836675496866, 0.07529046042306131, 0.17960507750028087, 0.14117835074982943, 0.0699568857004385, 0.0965757069824684, 0.07152993401681204]
Objective function    : 17468978160692.648
-------------------------
10 inner and 2 outer loops
Clustering time       : 3.397638
Total silhouette      : 0.10291066245205349
Silhouette per cluster: [0.02523

In [12]:
# Column-wise view of the collected rows. Each row is
# [model, method, clustering time, total silhouette, per-cluster silhouette, objective];
# the per-cluster silhouette (index 4) is left out of the table.
col_models          = [entry[0] for entry in rows]
col_methods         = [entry[1] for entry in rows]
col_clustering_time = [entry[2] for entry in rows]
col_stotal_latent   = [entry[3] for entry in rows]
obj_func            = [entry[5] for entry in rows]

col_dict = {
    'model': col_models,
    'method': col_methods,
    'clustering time': col_clustering_time,
    'silhouette total': col_stotal_latent,
    'objective function': obj_func,
}

df = pandas.DataFrame(data=col_dict)
df

Unnamed: 0,model,method,clustering time,silhouette total,objective function
0,model_conv_46.keras,CLASSIC,1.677482,0.159091,6842.634
1,model_conv_46.keras,CUBE,1.48773,0.117146,17468980000000.0
2,model_conv_46.keras,LSH,3.397638,0.102911,281998200000.0
3,model_conv_12.keras,CLASSIC,11.850349,0.140008,6783.894
4,model_dense_43.keras,CLASSIC,3.308557,0.115182,11469.64
5,model_conv_12.keras,LSH,4.155892,0.099618,1.764218e+25
6,model_conv_12.keras,CUBE,1.724643,0.103347,2.6701329999999996e+80
7,model_dense_43.keras,LSH,5.625478,0.073951,4604730000000.0
8,model_conv_19.keras,CLASSIC,6.123389,0.143006,7547.121
9,model_dense_43.keras,CUBE,2.045627,0.088174,6.376966e+17


# Results

## Best K-Means variation for each latent space

| model | latent dimension | best (total silhouette) | best (objective function) | best (clustering time) |
|:-----:|:----------------:|:-----------------------:|:-------------------------:|:----------------------:|
| `model_conv_12.keras`    | $25$ | CLASSIC | CLASSIC | CUBE |
| `model_conv_19.keras`    | $32$ | CLASSIC | CLASSIC | CUBE |
| `model_conv_46.keras`    | $18$ | CLASSIC | CLASSIC | CUBE |
| `model_dense_1.keras`    | $46$ | CLASSIC | CLASSIC | CUBE |
| `model_dense_26.keras`   | $23$ | CLASSIC | CLASSIC | CUBE |
| `model_dense_43.keras`   | $33$ | CLASSIC | CLASSIC | CUBE |

## Worst K-Means variation for each latent space

| model | latent dimension | worst (total silhouette) | worst (objective function) | worst (clustering time) |
|:-----:|:----------------:|:-----------------------:|:-------------------------:|:----------------------:|
| `model_conv_12.keras`    | $25$ | LSH  | CUBE | CLASSIC |
| `model_conv_19.keras`    | $32$ | LSH  | CUBE | CLASSIC |
| `model_conv_46.keras`    | $18$ | LSH  | CUBE | LSH     |
| `model_dense_1.keras`    | $46$ | LSH  | CUBE    | CLASSIC |
| `model_dense_26.keras`   | $23$ | LSH  | CUBE | LSH |
| `model_dense_43.keras`   | $33$ | LSH  | CUBE    | LSH |

# Conclusions

The clustering time has been reduced to under $10$ sec for all variations of the K-Means algorithm in all latent spaces, and the total silhouette of the clusters has almost doubled in all cases.

Looking at the tables given above, it is observed that the classic Lloyd's method for K-Means is the one that yields the best silhouette and objective function values in all latent spaces. However, that comes at the expense of increased clustering time in latent spaces of lower dimension.

The Reverse Hypercube variation of the K-Means algorithm is the one that converges the fastest in all latent spaces, but this choice comes with a great downside; the objective function values are $10$ to $10^{55}$ times higher than the ones recorded for the rest of the variations. For this reason, it should be avoided if the minimization of the objective function is a priority.

The Reverse LSH variation always yields the worst silhouette and has the slowest convergence in latent spaces of higher dimension, so it should generally be avoided.

Calculating silhouette and objective function values for the K-Means algorithms in the latent space, without projecting the result back to the original MNIST space for evaluation, is not indicative of the performance of the algorithms in the initial space. Nevertheless, the analysis in this notebook shows how the various K-Means algorithms generally perform in lower-dimensional spaces, especially with respect to the clustering time, which is significantly reduced.