# Comparison of results in initial and latent space

[1. Optimize LSH](#1-optimize-lsh)

[2. Optimize Hypercube](#2-optimize-hypercube)

[3. Optimize K-Means](#3-optimize-k-means)

[4. Optimize GNNS](#4-optimize-gnns)

[5. Optimize MRNG](#5-optimize-mrng)

[6. Optimize NSG](#6-optimize-nsg)

[7. Grid Search](#7-grid-search)

[8. Results](#8-results)

+ [8. a. Optimization Results](#8-a-optimization-results)

+ [8. b. Grid Search Results](#8-b-grid-search-results)

[9. Conclusions](#9-conclusions)

# Import libraries

In [None]:
import os

import numpy as np

from tensorflow.keras.models import load_model

from autoencoder import Autoencoder
from helper_funcs import *

import pandas
pandas.set_option('display.max_rows', None)

import optuna
from optuna.visualization import plot_pareto_front, plot_optimization_history, plot_slice

from params import lsh_test, hypercube_test, kmeans_test, gnn_test, mrng_test, nsg_test, get_aaf

In [None]:
# Saved autoencoder models. os.listdir returns entries in arbitrary order, so
# the listing is sorted to keep the model order stable between runs.
models = sorted(os.listdir('./models/'))

# Original MNIST dataset / query files (paths are kept as bytes).
dataset = b'MNIST/input.dat'
query   = b'MNIST/query.dat'

# Suffixes of the six files derived from every model. The order matters:
# later cells unpack the list positionally and slice [2:4] for the encoded pair.
file_suffixes = (b'_normalized_dataset.dat', b'_normalized_query.dat',
                 b'_encoded_dataset.dat', b'_encoded_query.dat',
                 b'_decoded_dataset.dat', b'_decoded_query.dat')

# Maps each model file name to its
# [normalized_dataset, normalized_query, encoded_dataset, encoded_query,
#  decoded_dataset, decoded_query] paths.
model_to_files = {}
for model in models:
    # e.g. 'foo.keras' -> b'MNIST/foo'
    prefix = b'MNIST/' + model.removesuffix('.keras').encode()
    model_to_files[model] = [prefix + suffix for suffix in file_suffixes]

# NOTE(review): presumably the number of images in the dataset — confirm; it is
# not referenced by the cells visible here.
n = 60000

In [None]:
# The raw images do not depend on the model, so both files are read and divided
# by 255 once instead of on every iteration.
# NOTE(review): assumes the helpers below do not modify their input arrays in
# place (np.reshape may return a view of the shared raw array) — confirm.
raw_train = load_dataset(dataset).astype('float32') / 255.
raw_test = load_dataset(query).astype('float32') / 255.

for model_name, file_paths in model_to_files.items():
    (normalized_dataset, normalized_query,
     encoded_dataset, encoded_query,
     decoded_dataset, decoded_query) = file_paths

    # load model
    autoencoder = load_model('models/' + model_name)
    shape = autoencoder.layers[-2].output_shape[1:] # get shape of encoded layer

    # give the images the layout this model expects
    if len(shape) == 3: # if model type is convolutional
        x_train = np.reshape(raw_train, (len(raw_train), 28, 28, 1))
        x_test = np.reshape(raw_test, (len(raw_test), 28, 28, 1))
    else:
        x_train = np.reshape(raw_train, (len(raw_train), 784))
        x_test = np.reshape(raw_test, (len(raw_test), 784))

    encoded_train = autoencoder.encode(x_train)
    encoded_test = autoencoder.encode(x_test)

    # deflatten encoded datasets
    encoded_train = deflatten_encoded(encoded_train, shape)
    encoded_test = deflatten_encoded(encoded_test, shape)

    # decode encoded datasets
    decoded_train = autoencoder.decode(encoded_train)
    decoded_test = autoencoder.decode(encoded_test)

    # save original datasets normalized
    save_decoded_binary(x_train, normalized_dataset)
    save_decoded_binary(x_test, normalized_query)

    # normalize encoded datasets
    encoded_train = normalize(encoded_train)
    encoded_test = normalize(encoded_test)

    # save encoded datasets
    save_encoded_binary(encoded_train, encoded_dataset)
    save_encoded_binary(encoded_test, encoded_query)

    # normalize decoded datasets
    decoded_train = normalize(decoded_train)
    decoded_test = normalize(decoded_test)

    # save decoded datasets
    save_decoded_binary(decoded_train, decoded_dataset)
    save_decoded_binary(decoded_test, decoded_query)

# 1. Optimize LSH

To skip logs, click [here](#visualize-lsh-study-results).

In [None]:
# to be copied from optimize_lsh.ipynb (3 cells)

## Visualize LSH study results

In [None]:
# to be copied from optimize_lsh.ipynb (5 cells)

# 2. Optimize Hypercube

## Optimize probes

To skip logs, click [here](#visualize-hypercube-study-results-probes).

In [None]:
def objective_hypercube(trial):
    """Optuna objective for the Hypercube search with a tunable number of probes.

    Samples a model plus ``k``, ``probes``, ``N`` and the window size, runs
    ``hypercube_test`` on that model's encoded dataset/query files with ``M``
    fixed to 60000, and returns the two objectives.

    Args:
        trial: the Optuna trial used to sample the parameters.

    Returns:
        Tuple ``(aaf_latent.value, average_time.value)`` taken from the objects
        returned by ``hypercube_test``.
    """
    model = trial.suggest_categorical('model', model_to_files.keys())

    # Sampled in the same order as the keyword arguments of the test.
    k = trial.suggest_int('k', 2, 30)
    probes = trial.suggest_int('probes', 1, 1000)
    N = trial.suggest_int('N', 1, 10)
    # The study records this parameter as 'window_size'; the test takes it as 'window'.
    window = trial.suggest_float('window_size', 0.01, 1)
    param_dict = {'k': k, 'probes': probes, 'N': N, 'window': window}

    print("Trial parameters:", param_dict)

    # Entries 2 and 3 of the per-model list are the encoded dataset and query files.
    files = model_to_files[model]
    encoded_dataset, encoded_query = files[2], files[3]

    average_time, aaf_latent = hypercube_test(
        encoded_dataset,
        encoded_query,
        queries_num=100,
        **param_dict,
        M=60000,
        int_data=0,
    )

    return aaf_latent.value, average_time.value

In [None]:
%%time
# Two-objective study: both values returned by objective_hypercube are minimized.
hypercube_study = optuna.create_study(
    study_name='hypercube',
    directions=['minimize', 'minimize'],
)
hypercube_study.optimize(objective_hypercube, n_trials=50, n_jobs=-1)
print("-----------------------------------------------------")

# Report the study's best trials, ordered by their objective values.
trials = sorted(hypercube_study.best_trials, key=lambda best: best.values)
for trial in trials:
    print("Trial no. {}".format(trial.number),
          " Values = {}".format(trial.values),
          " Params = {}".format(trial.params),
          sep="\n")

In [None]:
# Every trial of the study as a table.
df = hypercube_study.trials_dataframe()

# Drop trials missing either objective, then order by (values_0, values_1)
# ascending and renumber the rows.
df_sorted = (
    df.copy(deep=True)
    .dropna(subset=['values_0', 'values_1'])
    .sort_values(by=['values_0', 'values_1'], ascending=[True, True])
    .reset_index(drop=True)
)
df_sorted

## Visualize Hypercube study results (probes)

In [None]:
# Pareto front of the two objectives of hypercube_study, labelled 'aaf' and 'average_time'.
plot_pareto_front(
    hypercube_study,
    target_names=['aaf', 'average_time'],
)

In [None]:
# Optimization history of the first objective (values[0]), labelled 'aaf'.
plot_optimization_history(
    hypercube_study,
    target=lambda trial: trial.values[0],
    target_name='aaf',
)

In [None]:
# Optimization history of the second objective (values[1]), labelled 'average_time'.
plot_optimization_history(
    hypercube_study,
    target=lambda trial: trial.values[1],
    target_name='average_time',
)

In [None]:
# Slice plot of the sampled parameters against the first objective (values[0]), labelled 'aaf'.
plot_slice(
    hypercube_study,
    target=lambda trial: trial.values[0],
    target_name='aaf',
)

In [None]:
# Slice plot of the sampled parameters against the second objective (values[1]), labelled 'average_time'.
plot_slice(
    hypercube_study,
    target=lambda trial: trial.values[1],
    target_name='average_time',
)

## Optimize M

To skip logs, click [here](#visualize-hypercube-study-results-m).

In [None]:
def objective_hypercube(trial):
    """Optuna objective for the Hypercube search with a tunable ``M``.

    Samples a model plus ``k``, ``M`` and ``N``, runs ``hypercube_test`` on
    that model's encoded dataset/query files with ``probes`` fixed to 5000,
    and returns the two objectives. No ``window`` is passed, so the test's
    own default applies.

    Args:
        trial: the Optuna trial used to sample the parameters.

    Returns:
        Tuple ``(aaf_latent.value, average_time.value)`` taken from the objects
        returned by ``hypercube_test``.
    """
    model = trial.suggest_categorical('model', model_to_files.keys())
    param_dict = {'k': trial.suggest_int('k', 2, 30),
                  'M': trial.suggest_int('M', 10, 1000),
                  'N': trial.suggest_int('N', 1, 10)}

    print("Trial parameters:", param_dict)

    # Entries 2 and 3 of the per-model list are the encoded dataset and query files.
    encoded_dataset, encoded_query = model_to_files[model][2:4]

    # int_data=0 is passed explicitly, as every other search on the encoded
    # files in this notebook does; this call was the only one relying on the
    # test's default.
    # NOTE(review): presumably int_data=0 marks the files as non-integer data — confirm in params.hypercube_test.
    average_time, aaf_latent = hypercube_test(encoded_dataset, encoded_query, queries_num=100,
                                              **param_dict, probes=5000, int_data=0)

    return aaf_latent.value, average_time.value

In [None]:
%%time
# Two-objective study: both values returned by objective_hypercube are minimized.
hypercube_study = optuna.create_study(
    study_name='hypercube',
    directions=['minimize', 'minimize'],
)
hypercube_study.optimize(objective_hypercube, n_trials=50, n_jobs=-1)
print("-----------------------------------------------------")

# Report the study's best trials, ordered by their objective values.
trials = sorted(hypercube_study.best_trials, key=lambda best: best.values)
for trial in trials:
    print("Trial no. {}".format(trial.number),
          " Values = {}".format(trial.values),
          " Params = {}".format(trial.params),
          sep="\n")

In [None]:
# Every trial of the study as a table.
df = hypercube_study.trials_dataframe()

# Drop trials missing either objective, then order by (values_0, values_1)
# ascending and renumber the rows.
df_sorted = (
    df.copy(deep=True)
    .dropna(subset=['values_0', 'values_1'])
    .sort_values(by=['values_0', 'values_1'], ascending=[True, True])
    .reset_index(drop=True)
)
df_sorted

## Visualize Hypercube study results (M)

In [None]:
# Pareto front of the two objectives of hypercube_study, labelled 'aaf' and 'average_time'.
plot_pareto_front(
    hypercube_study,
    target_names=['aaf', 'average_time'],
)

In [None]:
# Optimization history of the first objective (values[0]), labelled 'aaf'.
plot_optimization_history(
    hypercube_study,
    target=lambda trial: trial.values[0],
    target_name='aaf',
)

In [None]:
# Optimization history of the second objective (values[1]), labelled 'average_time'.
plot_optimization_history(
    hypercube_study,
    target=lambda trial: trial.values[1],
    target_name='average_time',
)

In [None]:
# Slice plot of the sampled parameters against the first objective (values[0]), labelled 'aaf'.
plot_slice(
    hypercube_study,
    target=lambda trial: trial.values[0],
    target_name='aaf',
)

In [None]:
# Slice plot of the sampled parameters against the second objective (values[1]), labelled 'average_time'.
plot_slice(
    hypercube_study,
    target=lambda trial: trial.values[1],
    target_name='average_time',
)

# 3. Optimize K-Means

To skip logs, click [here](#visualize-k-means-study-results).

In [None]:
def objective_kmeans(trial):
    """Optuna objective for the clustering test.

    Samples a model and an assignment method ('CLASSIC', 'LSH' or 'CUBE'),
    then the parameters that belong to that method, builds the configuration
    dict for ``kmeans_test`` and returns the two objectives.

    Args:
        trial: the Optuna trial used to sample the parameters.

    Returns:
        Tuple ``(stotal_latent.value, average_time.value)`` taken from the
        objects returned by ``kmeans_test``.
    """
    model = trial.suggest_categorical('model', model_to_files.keys())
    method = trial.suggest_categorical('method', ['CLASSIC', 'LSH', 'CUBE'])
    param_dict = {'method': method}

    # Method-specific parameters; 'CLASSIC' has none, so enc_vals stays empty.
    if method == 'LSH':
        param_dict['k'] = trial.suggest_int('k', 1, 10)
        param_dict['L'] = trial.suggest_int('L', 1, 10)
        param_dict['window'] = trial.suggest_float('window', 0.01, 1)
        enc_vals = [param_dict['k'], param_dict['L']]
    elif method == 'CUBE':
        param_dict['M'] = trial.suggest_int('M', 10, 5000)
        param_dict['k'] = trial.suggest_int('k', 2, 30)
        param_dict['probes'] = trial.suggest_int('probes', 1, 1000)
        param_dict['window'] = trial.suggest_float('window', 0.01, 1)
        enc_vals = [param_dict['M'], param_dict['k'], param_dict['probes']]
    else:
        enc_vals = []

    # The two query files derived from the encoder are not used by this test.
    (normalized_dataset, normalized_query,
     encoded_dataset, _unused_encoded_query,
     decoded_dataset, _unused_decoded_query) = model_to_files[model]

    config = {
        'model': method.encode('ascii'),
        'enc_vals': enc_vals,
        'dataset': normalized_dataset,
        'query': normalized_query,
        'encoded_dataset': encoded_dataset,
        'decoded_dataset': decoded_dataset,
    }

    # Only the LSH and CUBE methods sample a window.
    if method != 'CLASSIC':
        config['window'] = param_dict['window']

    print("Trial parameters:", param_dict)

    average_time, stotal_latent, silhouette = kmeans_test(conf=config, int_data=0)

    print("Silhouette per cluster:", silhouette.val)

    # Drop the reference to the per-cluster result once it has been printed.
    del silhouette

    return stotal_latent.value, average_time.value

In [None]:
%%time
# Two-objective study over objective_kmeans, which returns (stotal, average_time).
# The first objective is the total silhouette, where larger means better
# clustering, so it is maximized rather than minimized; the average time is
# still minimized.
# NOTE(review): assumes kmeans_test reports stotal as a plain silhouette score
# (higher is better) — confirm against params.kmeans_test.
kmeans_study = optuna.create_study(study_name='kmeans', directions=['maximize', 'minimize'])
kmeans_study.optimize(objective_kmeans, n_trials=50, n_jobs=-1)
print("-------------------- Best trials --------------------")
# Report the study's best trials, ordered by their objective values.
trials = sorted(kmeans_study.best_trials, key=lambda x: x.values)
for trial in trials:
    print("Trial no. {}".format(trial.number))
    print(" Values = {}".format(trial.values))
    print(" Params = {}".format(trial.params))

In [None]:
# Every trial of the study as a table.
df = kmeans_study.trials_dataframe()

# Drop trials missing either objective, then order by (values_0, values_1)
# ascending and renumber the rows.
df_sorted = (
    df.copy(deep=True)
    .dropna(subset=['values_0', 'values_1'])
    .sort_values(by=['values_0', 'values_1'], ascending=[True, True])
    .reset_index(drop=True)
)
df_sorted

## Visualize K-Means study results

In [None]:
# Pareto front of the two objectives of kmeans_study, labelled 'stotal' and 'average_time'.
plot_pareto_front(
    kmeans_study,
    target_names=['stotal', 'average_time'],
)

In [None]:
# Optimization history of the first objective (values[0]), labelled 'stotal'.
plot_optimization_history(
    kmeans_study,
    target=lambda trial: trial.values[0],
    target_name='stotal',
)

In [None]:
# Optimization history of the second objective (values[1]), labelled 'average_time'.
plot_optimization_history(
    kmeans_study,
    target=lambda trial: trial.values[1],
    target_name='average_time',
)

In [None]:
# Slice plot of the sampled parameters against the first objective (values[0]), labelled 'stotal'.
plot_slice(
    kmeans_study,
    target=lambda trial: trial.values[0],
    target_name='stotal',
)

In [None]:
# Slice plot of the sampled parameters against the second objective (values[1]), labelled 'average_time'.
plot_slice(
    kmeans_study,
    target=lambda trial: trial.values[1],
    target_name='average_time',
)

# 4. Optimize GNNS

To skip logs, click [here](#visualize-gnns-study-results).

In [None]:
def objective_gnns(trial):
    """Optuna objective for the GNNS search.

    Samples a model plus ``k``, ``E`` (bounded above by the sampled ``k``) and
    ``R``, runs ``gnn_test`` on that model's encoded dataset/query files with
    ``N`` fixed to 5, and returns the two objectives.

    Args:
        trial: the Optuna trial used to sample the parameters.

    Returns:
        Tuple ``(aaf_latent.value, average_time.value)`` taken from the objects
        returned by ``gnn_test``.
    """
    model = trial.suggest_categorical('model', model_to_files.keys())

    k = trial.suggest_int('k', 40, 100)
    # E is never sampled above the k chosen for this trial.
    E = trial.suggest_int('E', 40, k)
    R = trial.suggest_int('R', 1, 10)
    param_dict = {'k': k, 'E': E, 'R': R}

    print("Trial params", param_dict)

    # Entries 2 and 3 of the per-model list are the encoded dataset and query files.
    files = model_to_files[model]
    encoded_dataset, encoded_query = files[2], files[3]

    average_time, aaf_latent = gnn_test(
        encoded_dataset,
        encoded_query,
        queries_num=100,
        **param_dict,
        N=5,
        int_data=0,
    )

    return aaf_latent.value, average_time.value

In [None]:
%%time
# Two-objective study: both values returned by objective_gnns are minimized.
gnns_study = optuna.create_study(
    study_name='gnns',
    directions=['minimize', 'minimize'],
)
gnns_study.optimize(objective_gnns, n_trials=100, n_jobs=-1)
print("-------------------- Best trials --------------------")
# Report the study's best trials, ordered by their objective values.
trials = sorted(gnns_study.best_trials, key=lambda best: best.values)
for trial in trials:
    print("Trial no. {}".format(trial.number),
          " Values = {}".format(trial.values),
          " Params = {}".format(trial.params),
          sep="\n")

In [None]:
# Every trial of the study as a table.
df = gnns_study.trials_dataframe()

# Drop trials missing either objective, then order by (values_0, values_1)
# ascending and renumber the rows.
df_sorted = (
    df.copy(deep=True)
    .dropna(subset=['values_0', 'values_1'])
    .sort_values(by=['values_0', 'values_1'], ascending=[True, True])
    .reset_index(drop=True)
)
df_sorted

## Visualize GNNS study results

In [None]:
# Pareto front of the two objectives of gnns_study, labelled 'aaf' and 'average_time'.
plot_pareto_front(
    gnns_study,
    target_names=['aaf', 'average_time'],
)

In [None]:
# Optimization history of the first objective (values[0]), labelled 'aaf'.
plot_optimization_history(
    gnns_study,
    target=lambda trial: trial.values[0],
    target_name='aaf',
)

In [None]:
# Optimization history of the second objective (values[1]), labelled 'average_time'.
plot_optimization_history(
    gnns_study,
    target=lambda trial: trial.values[1],
    target_name='average_time',
)

In [None]:
# Slice plot of the sampled parameters against the first objective (values[0]), labelled 'aaf'.
plot_slice(
    gnns_study,
    target=lambda trial: trial.values[0],
    target_name='aaf',
)

In [None]:
# Slice plot of the sampled parameters against the second objective (values[1]), labelled 'average_time'.
plot_slice(
    gnns_study,
    target=lambda trial: trial.values[1],
    target_name='average_time',
)

# 5. Optimize MRNG

To skip logs, click [here](#visualize-mrng-study-results).

In [None]:
def objective_mrng(trial):
    """Optuna objective for the MRNG search.

    Samples a model plus ``l`` and ``N`` (bounded above by the sampled ``l``),
    runs ``mrng_test`` on that model's encoded dataset/query files, and
    returns the two objectives.

    Args:
        trial: the Optuna trial used to sample the parameters.

    Returns:
        Tuple ``(aaf_latent.value, average_time.value)`` taken from the objects
        returned by ``mrng_test``.
    """
    model = trial.suggest_categorical('model', model_to_files.keys())

    l = trial.suggest_int('l', 1, 1000)
    # N is never sampled above the l chosen for this trial.
    N = trial.suggest_int('N', 1, l)
    param_dict = {'l': l, 'N': N}

    print("Trial parameters:", param_dict)

    # Entries 2 and 3 of the per-model list are the encoded dataset and query files.
    files = model_to_files[model]
    encoded_dataset, encoded_query = files[2], files[3]

    average_time, aaf_latent = mrng_test(
        encoded_dataset,
        encoded_query,
        queries_num=100,
        **param_dict,
        int_data=0,
    )

    return aaf_latent.value, average_time.value

In [None]:
%%time
# Two-objective study: both values returned by objective_mrng are minimized.
mrng_study = optuna.create_study(
    study_name='mrng',
    directions=['minimize', 'minimize'],
)
mrng_study.optimize(objective_mrng, n_trials=50, n_jobs=-1)
print("-------------------- Best trials --------------------")
# Report the study's best trials, ordered by their objective values.
trials = sorted(mrng_study.best_trials, key=lambda best: best.values)
for trial in trials:
    print("Trial no. {}".format(trial.number),
          " Values = {}".format(trial.values),
          " Params = {}".format(trial.params),
          sep="\n")

In [None]:
# Every trial of the study as a table.
df = mrng_study.trials_dataframe()

# Drop trials missing either objective, then order by (values_0, values_1)
# ascending and renumber the rows.
df_sorted = (
    df.copy(deep=True)
    .dropna(subset=['values_0', 'values_1'])
    .sort_values(by=['values_0', 'values_1'], ascending=[True, True])
    .reset_index(drop=True)
)
df_sorted

## Visualize MRNG study results

In [None]:
# Pareto front of the two objectives of mrng_study, labelled 'aaf' and 'average_time'.
plot_pareto_front(
    mrng_study,
    target_names=['aaf', 'average_time'],
)

In [None]:
# Optimization history of the first objective (values[0]), labelled 'aaf'.
plot_optimization_history(
    mrng_study,
    target=lambda trial: trial.values[0],
    target_name='aaf',
)

In [None]:
# Optimization history of the second objective (values[1]), labelled 'average_time'.
plot_optimization_history(
    mrng_study,
    target=lambda trial: trial.values[1],
    target_name='average_time',
)

In [None]:
# Slice plot of the sampled parameters against the first objective (values[0]), labelled 'aaf'.
plot_slice(
    mrng_study,
    target=lambda trial: trial.values[0],
    target_name='aaf',
)

In [None]:
# Slice plot of the sampled parameters against the second objective (values[1]), labelled 'average_time'.
plot_slice(
    mrng_study,
    target=lambda trial: trial.values[1],
    target_name='average_time',
)

# 6. Optimize NSG

To skip logs, click [here](#visualize-nsg-study-results).

In [None]:
def objective_nsg(trial):
    """Optuna objective for the NSG search.

    Samples a model plus ``m``, ``l``, ``lq`` and ``k``, runs ``nsg_test`` on
    that model's encoded dataset/query files with ``N`` fixed to 5, and
    returns the two objectives.

    Args:
        trial: the Optuna trial used to sample the parameters.

    Returns:
        Tuple ``(aaf_latent.value, average_time.value)`` taken from the objects
        returned by ``nsg_test``.
    """
    model = trial.suggest_categorical('model', model_to_files.keys())

    # Sampled in the same order as the keyword arguments of the test.
    m = trial.suggest_int('m', 3, 200)
    l = trial.suggest_int('l', 10, 1000)
    lq = trial.suggest_int('lq', 1, 1000)
    k = trial.suggest_int('k', 40, 100)
    param_dict = {'m': m, 'l': l, 'lq': lq, 'k': k}

    print("Trial parameters:", param_dict)

    # Entries 2 and 3 of the per-model list are the encoded dataset and query files.
    files = model_to_files[model]
    encoded_dataset, encoded_query = files[2], files[3]

    average_time, aaf_latent = nsg_test(
        encoded_dataset,
        encoded_query,
        queries_num=100,
        **param_dict,
        N=5,
        int_data=0,
    )

    return aaf_latent.value, average_time.value

In [None]:
%%time
# Two-objective study: both values returned by objective_nsg are minimized.
nsg_study = optuna.create_study(
    study_name='nsg',
    directions=['minimize', 'minimize'],
)
nsg_study.optimize(objective_nsg, n_trials=100, n_jobs=-1)
print("-------------------- Best trials --------------------")
# Report the study's best trials, ordered by their objective values.
trials = sorted(nsg_study.best_trials, key=lambda best: best.values)
for trial in trials:
    print("Trial no. {}".format(trial.number),
          " Values = {}".format(trial.values),
          " Params = {}".format(trial.params),
          sep="\n")

In [None]:
# Every trial of the study as a table.
df = nsg_study.trials_dataframe()

# Drop trials missing either objective, then order by (values_0, values_1)
# ascending and renumber the rows.
df_sorted = (
    df.copy(deep=True)
    .dropna(subset=['values_0', 'values_1'])
    .sort_values(by=['values_0', 'values_1'], ascending=[True, True])
    .reset_index(drop=True)
)
df_sorted

## Visualize NSG study results

In [None]:
# Pareto front of the two objectives of nsg_study, labelled 'aaf' and 'average_time'.
plot_pareto_front(
    nsg_study,
    target_names=['aaf', 'average_time'],
)

In [None]:
# Optimization history of the first objective (values[0]), labelled 'aaf'.
plot_optimization_history(
    nsg_study,
    target=lambda trial: trial.values[0],
    target_name='aaf',
)

In [None]:
# Optimization history of the second objective (values[1]), labelled 'average_time'.
plot_optimization_history(
    nsg_study,
    target=lambda trial: trial.values[1],
    target_name='average_time',
)

In [None]:
# Slice plot of the sampled parameters against the first objective (values[0]), labelled 'aaf'.
plot_slice(
    nsg_study,
    target=lambda trial: trial.values[0],
    target_name='aaf',
)

In [None]:
# Slice plot of the sampled parameters against the second objective (values[1]), labelled 'average_time'.
plot_slice(
    nsg_study,
    target=lambda trial: trial.values[1],
    target_name='average_time',
)

# 7. Grid Search

# 8. Results

## 8. a. Optimization Results

## 8. b. Grid Search Results

# 9. Conclusions