In [1]:
import os

import numpy as np

from tensorflow.keras.models import load_model

from autoencoder import Autoencoder
from helper_funcs import *

import pandas
pandas.set_option('display.max_rows', None)

from params import kmeans_test

2024-01-02 00:38:11.197190: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-01-02 00:38:11.252580: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-01-02 00:38:11.253599: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
models = os.listdir('./models/')

dataset = b'MNIST/input.dat'
query   = b'MNIST/query.dat'

model_to_files = {}
for i, model in enumerate(models):
    normalized_dataset = b'MNIST/' + models[i].removesuffix('.keras').encode() + b'_normalized_dataset.dat'
    normalized_query   = b'MNIST/' + models[i].removesuffix('.keras').encode() + b'_normalized_query.dat'
    encoded_dataset    = b'MNIST/' + models[i].removesuffix('.keras').encode() + b'_encoded_dataset.dat'
    encoded_query      = b'MNIST/' + models[i].removesuffix('.keras').encode() + b'_encoded_query.dat'
    decoded_dataset    = b'MNIST/' + models[i].removesuffix('.keras').encode() + b'_decoded_dataset.dat'
    decoded_query      = b'MNIST/' + models[i].removesuffix('.keras').encode() + b'_decoded_query.dat'

    model_to_files.update({models[i] : [normalized_dataset, normalized_query,
                                        encoded_dataset, encoded_query,
                                        decoded_dataset, decoded_query]})

n = 60000

In [3]:
for model in model_to_files:
    normalized_dataset, normalized_query, encoded_dataset, encoded_query, decoded_dataset, decoded_query = model_to_files[model]

    model = b'models/' + model.encode()

    # load model
    autoencoder = load_model(model.decode())
    shape = autoencoder.layers[-2].output_shape[1:] # get shape of encoded layer

    # load dataset
    x_train = load_dataset(dataset)
    x_train = x_train.astype('float32') / 255.
    x_test = load_dataset(query)
    x_test = x_test.astype('float32') / 255.
    if len(shape) == 3: # if model type is convolutional
        x_train = np.reshape(x_train, (len(x_train), 28, 28, 1))
        x_test = np.reshape(x_test, (len(x_test), 28, 28, 1))
    else:
        x_train = np.reshape(x_train, (len(x_train), 784))
        x_test = np.reshape(x_test, (len(x_test), 784))

    encoded_train = autoencoder.encode(x_train)
    encoded_test = autoencoder.encode(x_test)

    # deflatten encoded datasets
    encoded_train = deflatten_encoded(encoded_train, shape)
    encoded_test = deflatten_encoded(encoded_test, shape)

    # decode encoded datasets
    decoded_train = autoencoder.decode(encoded_train)
    decoded_test = autoencoder.decode(encoded_test)

    # save original datasets normalized
    save_decoded_binary(x_train, normalized_dataset)
    save_decoded_binary(x_test, normalized_query)

    # normalize encoded datasets
    encoded_train = normalize(encoded_train)
    encoded_test = normalize(encoded_test)

    # save encoded datasets
    save_encoded_binary(encoded_train, encoded_dataset)
    save_encoded_binary(encoded_test, encoded_query)

    # normalize decoded datasets
    decoded_train = normalize(decoded_train)
    decoded_test = normalize(decoded_test)

    # save decoded datasets
    save_decoded_binary(decoded_train, decoded_dataset)
    save_decoded_binary(decoded_test, decoded_query)



In [4]:
best_params_lsh = [4, 7, 0.6]               # L, k, window
best_params_hypercube = [67, 3, 1000, 0.42] # M, k, probes, window
best_params = {'CLASSIC': [], 'LSH': best_params_lsh, 'CUBE': best_params_hypercube}
methods = ['CLASSIC', 'LSH', 'CUBE']

rows = []
for model in models:
    normalized_dataset, normalized_query, encoded_dataset, encoded_query, decoded_dataset, decoded_query = model_to_files[model]

    for method in methods:

        vals = best_params[method]
        if method != 'CLASSIC':
            vals = best_params[method][:-1]
            window = best_params[method][-1]

        config = {
            'model': bytes(method, encoding='ascii'),
            'vals': vals,
            'dataset': normalized_dataset,
            'query': normalized_query,
            'encoded_dataset': encoded_dataset,
            'decoded_dataset': decoded_dataset,
        }

        if method != 'CLASSIC':
            config.update({'window': window})

        print(model, method)
        
        clustering_time, stotal_latent, silhouette = kmeans_test(conf=config, int_data=0)

        print("Clustering time :", clustering_time.value)
        print("Total silhouette:", stotal_latent.value)
        print("Silhouette per cluster:", silhouette.val)
        print("-------------------------")

        rows.append([model, method, clustering_time.value, stotal_latent.value, silhouette.val])

model_conv_46.keras CLASSIC
22 iterations
Clustering time : 15.899307
Total silhouette: 0.07890865297981675
Silhouette per cluster: [0.05065689632094605, 0.19091071730086365, 0.03281045045532906, 0.0805780157516034, 0.09497994904937085, 0.08890440974906783, 0.022097532314671417, 0.12032924893138658, 0.09534568642091969, 0.04140982230393051]
-------------------------
model_conv_46.keras LSH
190 inner and 38 outer loops
Clustering time : 949.799609
Total silhouette: 0.06732741133455376
Silhouette per cluster: [0.04519220721134544, 0.09893339542827263, 0.03635526355437012, 0.025192514472264005, 0.06034095393093595, 0.03638561389343547, 0.07364804818046239, 0.1301007352926966, 0.06117189744717166, 0.045547942448266696]
-------------------------
model_conv_46.keras CUBE
9 inner and 2 outer loops
Clustering time : 5.159251
Total silhouette: 0.055229458979181634
Silhouette per cluster: [0.07925560092244639, 0.027911835197695224, 0.041896277247997125, 0.04453352115523977, 0.01340710284403069, 

In [5]:
col_models, col_methods, col_clustering_time, col_stotal_latent = [], [], [], []

for row in rows:
    model, method, clustering_time, stotal_latent, silhouette = row

    col_models.append(model)
    col_methods.append(method)
    col_clustering_time.append(clustering_time)
    col_stotal_latent.append(stotal_latent)

col_dict = {'model': col_models, 'method': col_methods,
            'clustering time': col_clustering_time, 'silhouette total': col_stotal_latent}

df = pandas.DataFrame(data=col_dict)
df

Unnamed: 0,model,method,clustering time,silhouette total
0,model_conv_46.keras,CLASSIC,15.899307,0.078909
1,model_conv_46.keras,LSH,949.799609,0.067327
2,model_conv_46.keras,CUBE,5.159251,0.055229
3,model_conv_12.keras,CLASSIC,35.740265,0.084699
4,model_conv_12.keras,LSH,1078.105773,0.07587
5,model_conv_12.keras,CUBE,5.42455,0.057079
6,model_dense_43.keras,CLASSIC,31.314821,0.077825
7,model_dense_43.keras,LSH,460.032422,0.078763
8,model_dense_43.keras,CUBE,5.304123,0.068384
9,model_conv_19.keras,CLASSIC,22.162195,0.08433
