# Optimization - fine tuning in Initial Space

[1. Optimize LSH](#1-optimize-lsh)

[2. Optimize Hypercube](#2-optimize-hypercube)

[3. Optimize GNNS](#3-optimize-gnns)

[4. Optimize MRNG](#4-optimize-mrng)

[5. Optimize NSG](#5-optimize-nsg)

[6. Results](#6-results)

[7. Conclusions](#7-conclusions)

# Import libraries

In [None]:
import numpy as np

import pandas

import optuna
from optuna.visualization import plot_pareto_front, plot_optimization_history, plot_slice

from params import lsh_test, hypercube_test, gnn_test, mrng_test, nsg_test

In [None]:
# Dataset and query files, given as bytes literals and handed unchanged to
# the *_test helpers imported from `params`.
input_path, query_path = b'MNIST/input.dat', b'MNIST/query.dat'

# Used below to derive the candidate LSH table sizes (n/16, n/8, n/4).
# Presumably the number of points in the input set -- confirm.
n = 60000

# 1. Optimize LSH

To skip logs, click [here](#visualize-lsh-study-results).

In [None]:
def objective_lsh(trial):
    """Optuna objective for the LSH study.

    Samples ``k``, ``L``, ``table_size``, ``window_size`` and ``query_trick``
    from the trial, runs ``lsh_test`` on 100 queries with ``N=60`` and stores
    two constraint values in the trial's ``constraint`` user attribute.

    Returns:
        ``(aaf, average_time)`` taken from the ``.value`` attribute of the
        objects returned by ``lsh_test``.
    """
    # Sampling order is kept fixed so the parameter dict prints identically.
    k = trial.suggest_int('k', 2, 10)
    L = trial.suggest_int('L', 2, 10)
    table_size = trial.suggest_categorical(
        'table_size', [int(n / divisor) for divisor in (16, 8, 4)]
    )
    window_size = trial.suggest_int('window_size', 100, 5000)
    query_trick = trial.suggest_categorical('query_trick', [True, False])

    params = {
        'k': k,
        'L': L,
        'table_size': table_size,
        'window_size': window_size,
        'query_trick': query_trick,
    }
    print("Trial parameters:", params)

    timing, factor, fewest = lsh_test(input_path, query_path, queries_num=100, **params, N=60)

    # Optuna treats a constraint value <= 0 as satisfied:
    #  * the trial should yield at least 60 neighbors so it can be used in GNNS
    #  * the trial is penalized when slower than brute force (threshold 0.01)
    neighbors_shortfall = 60 - fewest.value
    time_excess = timing.value - 0.01
    trial.set_user_attr('constraint', (neighbors_shortfall, time_excess))

    return factor.value, timing.value

def constraints(trial):
    """Return the constraint values stored under the trial's ``constraint`` user attribute.

    Raises ``KeyError`` when the attribute has not been set on the trial.
    """
    stored_attrs = trial.user_attrs
    return stored_attrs['constraint']

In [None]:
%%time
# Run the LSH study, restarting from scratch (at most 10 attempts) whenever
# an attempt raises.  Only `Exception` is caught, so KeyboardInterrupt and
# SystemExit still stop the cell instead of silently starting a new attempt.
for i in range(10):
    try:
        # The sampler reads constraint values back through `constraints`.
        sampler = optuna.samplers.NSGAIISampler(constraints_func=constraints)
        lsh_study = optuna.create_study(study_name='lsh', directions=['minimize', 'minimize'], sampler=sampler)
        lsh_study.optimize(objective_lsh, n_trials=50)
    except Exception as exc:
        print("Trial failed, trying again...")
        # Show the cause so repeated failures can be diagnosed.
        print(" Reason: {!r}".format(exc))
    else:
        print("-------------------- Best trials --------------------")
        trials = sorted(lsh_study.best_trials, key=lambda x: x.values)
        # NOTE(review): every entry of best_trials is printed; whether that
        # list excludes constraint-violating trials depends on the Optuna
        # version -- confirm.
        for trial in trials:
            print("Trial no. {}".format(trial.number))
            print(" Values = {}, Constraints = {}".format(trial.values, trial.user_attrs["constraint"]))
            print(" Params = {}".format(trial.params))
        break

In [None]:
# Collect every trial of the LSH study into a DataFrame.
df = lsh_study.trials_dataframe()

# On a deep copy: drop trials missing either objective value, then order the
# rows by the first objective (values_0), using values_1 as tie-breaker.
df_sorted = (
    df.copy(deep=True)
    .dropna(subset=['values_0', 'values_1'])
    .sort_values(by=['values_0', 'values_1'], ascending=[True, True])
    .reset_index(drop=True)
)
df_sorted

## Visualize LSH study results

In [None]:
plot_pareto_front(lsh_study, target_names=['aaf', 'average_time'])

In [None]:
plot_optimization_history(lsh_study, target = lambda t: t.values[0], target_name = 'aaf')

In [None]:
plot_optimization_history(lsh_study, target = lambda t: t.values[1], target_name = 'average_time')

In [None]:
plot_slice(lsh_study, target = lambda t: t.values[0], target_name = 'aaf')

In [None]:
plot_slice(lsh_study, target = lambda t: t.values[1], target_name = 'average_time')

# 2. Optimize Hypercube

To skip logs, click [here](#visualize-hypercube-study-results).

In [None]:
def objective_hypercube(trial):
    """Optuna objective for the Hypercube study that tunes ``probes``.

    Samples ``k`` in [2, 30], ``probes`` in [1, 1000] and ``N`` in [1, 10],
    then runs ``hypercube_test`` on 100 queries with ``M`` fixed to 60000.

    Returns:
        ``(aaf, average_time)`` taken from the ``.value`` attribute of the
        objects returned by ``hypercube_test``.
    """
    # Sampling order is kept fixed so the parameter dict prints identically.
    k = trial.suggest_int('k', 2, 30)
    probes = trial.suggest_int('probes', 1, 1000)
    neighbors = trial.suggest_int('N', 1, 10)
    params = {'k': k, 'probes': probes, 'N': neighbors}

    print("Trial parameters:", params)

    timing, factor = hypercube_test(input_path, query_path, queries_num=100, **params, M=60000)

    return factor.value, timing.value

In [None]:
%%time
# Two-objective study: both returned values of the objective are minimized.
hypercube_study = optuna.create_study(
    study_name='hypercube',
    directions=['minimize', 'minimize'],
)
hypercube_study.optimize(objective_hypercube, n_trials=50)
print("-----------------------------------------------------")

# Report the study's best trials ordered by their objective values.
trials = sorted(hypercube_study.best_trials, key=lambda best: best.values)
for trial in trials:
    print(f"Trial no. {trial.number}")
    print(f" Values = {trial.values}")
    print(f" Params = {trial.params}")

In [None]:
# Collect every trial of the Hypercube study into a DataFrame.
df = hypercube_study.trials_dataframe()

# On a deep copy: drop trials missing either objective value, then order the
# rows by the first objective (values_0), using values_1 as tie-breaker.
df_sorted = (
    df.copy(deep=True)
    .dropna(subset=['values_0', 'values_1'])
    .sort_values(by=['values_0', 'values_1'], ascending=[True, True])
    .reset_index(drop=True)
)
df_sorted

## Visualize Hypercube study results

In [None]:
plot_pareto_front(hypercube_study, target_names=['aaf', 'average_time'])

In [None]:
plot_optimization_history(hypercube_study, target = lambda t: t.values[0], target_name = 'aaf')

In [None]:
plot_optimization_history(hypercube_study, target = lambda t: t.values[1], target_name = 'average_time')

In [None]:
plot_slice(hypercube_study, target = lambda t: t.values[0], target_name = 'aaf')

In [None]:
plot_slice(hypercube_study, target = lambda t: t.values[1], target_name = 'average_time')

## Optimize M

To skip logs, click [here](#visualize-hypercube-study-results-m).

In [None]:
def objective_hypercube(trial):
    """Optuna objective for the Hypercube study that tunes ``M``.

    Samples ``k`` in [2, 30], ``M`` in [10, 5000] and ``N`` in [1, 10], then
    runs ``hypercube_test`` on 100 queries with ``probes`` fixed to 5000.

    NOTE(review): this rebinds the name ``objective_hypercube`` used by the
    ``probes`` study earlier in the notebook.

    Returns:
        ``(aaf, average_time)`` taken from the ``.value`` attribute of the
        objects returned by ``hypercube_test``.
    """
    # Sampling order is kept fixed so the parameter dict prints identically.
    k = trial.suggest_int('k', 2, 30)
    max_points = trial.suggest_int('M', 10, 5000)
    neighbors = trial.suggest_int('N', 1, 10)
    params = {'k': k, 'M': max_points, 'N': neighbors}

    print("Trial parameters:", params)

    timing, factor = hypercube_test(input_path, query_path, queries_num=100, **params, probes=5000)

    return factor.value, timing.value

In [None]:
%%time
# Two-objective study: both returned values of the objective are minimized.
# NOTE(review): rebinds `hypercube_study`, which the earlier probes study
# also uses -- confirm the earlier results are no longer needed.
hypercube_study = optuna.create_study(
    study_name='hypercube',
    directions=['minimize', 'minimize'],
)
hypercube_study.optimize(objective_hypercube, n_trials=50)
print("-----------------------------------------------------")

# Report the study's best trials ordered by their objective values.
trials = sorted(hypercube_study.best_trials, key=lambda best: best.values)
for trial in trials:
    print(f"Trial no. {trial.number}")
    print(f" Values = {trial.values}")
    print(f" Params = {trial.params}")

In [None]:
# Collect every trial of the Hypercube (M) study into a DataFrame.
df = hypercube_study.trials_dataframe()

# On a deep copy: drop trials missing either objective value, then order the
# rows by the first objective (values_0), using values_1 as tie-breaker.
df_sorted = (
    df.copy(deep=True)
    .dropna(subset=['values_0', 'values_1'])
    .sort_values(by=['values_0', 'values_1'], ascending=[True, True])
    .reset_index(drop=True)
)
df_sorted

## Visualize Hypercube study results (M)

In [None]:
plot_pareto_front(hypercube_study, target_names=['aaf', 'average_time'])

In [None]:
plot_optimization_history(hypercube_study, target = lambda t: t.values[0], target_name = 'aaf')

In [None]:
plot_optimization_history(hypercube_study, target = lambda t: t.values[1], target_name = 'average_time')

In [None]:
plot_slice(hypercube_study, target = lambda t: t.values[0], target_name = 'aaf')

In [None]:
plot_slice(hypercube_study, target = lambda t: t.values[1], target_name = 'average_time')

# 3. Optimize GNNS

To skip logs, click [here](#visualize-gnns-study-results).

In [None]:
def objective_gnns(trial):
    """Optuna objective for the GNNS study.

    Samples ``k`` in [40, 100], ``E`` in [40, k] (its upper bound follows the
    sampled ``k``) and ``R`` in [1, 10], then runs ``gnn_test`` on 100 queries
    with ``N=5``.

    Returns:
        ``(aaf, average_time)`` taken from the ``.value`` attribute of the
        objects returned by ``gnn_test``.
    """
    # `k` is drawn first because it caps the range of `E`.
    k = trial.suggest_int('k', 40, 100)
    params = {
        'k': k,
        'E': trial.suggest_int('E', 40, k),
        'R': trial.suggest_int('R', 1, 10),
    }

    print("Trial params", params)

    timing, factor = gnn_test(input_path, query_path, queries_num=100, **params, N=5)

    return factor.value, timing.value

In [None]:
%%time
# Run the GNNS study, restarting from scratch (at most 10 attempts) whenever
# an attempt raises.  Only `Exception` is caught, so KeyboardInterrupt and
# SystemExit still stop the cell instead of silently starting a new attempt.
for i in range(10):
    try:
        gnns_study = optuna.create_study(study_name='gnns', directions=['minimize', 'minimize'])
        gnns_study.optimize(objective_gnns, n_trials=50)
    except Exception as exc:
        print("Trial failed, trying again...")
        # Show the cause so repeated failures can be diagnosed.
        print(" Reason: {!r}".format(exc))
    else:
        print("-------------------- Best trials --------------------")
        # Report the study's best trials ordered by their objective values.
        trials = sorted(gnns_study.best_trials, key=lambda x: x.values)
        for trial in trials:
            print("Trial no. {}".format(trial.number))
            print(" Values = {}".format(trial.values))
            print(" Params = {}".format(trial.params))
        break

In [None]:
# Collect every trial of the GNNS study into a DataFrame.
df = gnns_study.trials_dataframe()

# On a deep copy: drop trials missing either objective value, then order the
# rows by the first objective (values_0), using values_1 as tie-breaker.
df_sorted = (
    df.copy(deep=True)
    .dropna(subset=['values_0', 'values_1'])
    .sort_values(by=['values_0', 'values_1'], ascending=[True, True])
    .reset_index(drop=True)
)
df_sorted

## Visualize GNNS study results

In [None]:
plot_pareto_front(gnns_study, target_names=['aaf', 'average_time'])

In [None]:
plot_optimization_history(gnns_study, target = lambda t: t.values[0], target_name = 'aaf')

In [None]:
plot_optimization_history(gnns_study, target = lambda t: t.values[1], target_name = 'average_time')

In [None]:
plot_slice(gnns_study, target = lambda t: t.values[0], target_name = 'aaf')

In [None]:
plot_slice(gnns_study, target = lambda t: t.values[1], target_name = 'average_time')

# 4. Optimize MRNG

To skip logs, click [here](#visualize-mrng-study-results).

In [None]:
def objective_mrng(trial):
    """Optuna objective for the MRNG study.

    Samples ``l`` in [1, 1000] and ``N`` in [1, l] (its upper bound follows
    the sampled ``l``), then runs ``mrng_test`` on 100 queries.

    Returns:
        ``(aaf, average_time)`` taken from the ``.value`` attribute of the
        objects returned by ``mrng_test``.
    """
    # `l` is drawn first because it caps the range of `N`.
    candidates = trial.suggest_int('l', 1, 1000)
    params = {
        'l': candidates,
        'N': trial.suggest_int('N', 1, candidates),
    }

    print("Trial parameters:", params)

    timing, factor = mrng_test(input_path, query_path, queries_num=100, **params)

    return factor.value, timing.value

In [None]:
%%time
# Run the MRNG study, restarting from scratch (at most 10 attempts) whenever
# an attempt raises.  Only `Exception` is caught, so KeyboardInterrupt and
# SystemExit still stop the cell instead of silently starting a new attempt.
for i in range(10):
    try:
        mrng_study = optuna.create_study(study_name='mrng', directions=['minimize', 'minimize'])
        mrng_study.optimize(objective_mrng, n_trials=50)
    except Exception as exc:
        print("Trial failed, trying again...")
        # Show the cause so repeated failures can be diagnosed.
        print(" Reason: {!r}".format(exc))
    else:
        print("-------------------- Best trials --------------------")
        # Report the study's best trials ordered by their objective values.
        trials = sorted(mrng_study.best_trials, key=lambda x: x.values)
        for trial in trials:
            print("Trial no. {}".format(trial.number))
            print(" Values = {}".format(trial.values))
            print(" Params = {}".format(trial.params))
        break

In [None]:
# Collect every trial of the MRNG study into a DataFrame.
df = mrng_study.trials_dataframe()

# On a deep copy: drop trials missing either objective value, then order the
# rows by the first objective (values_0), using values_1 as tie-breaker.
df_sorted = (
    df.copy(deep=True)
    .dropna(subset=['values_0', 'values_1'])
    .sort_values(by=['values_0', 'values_1'], ascending=[True, True])
    .reset_index(drop=True)
)
df_sorted

## Visualize MRNG study results

In [None]:
plot_pareto_front(mrng_study, target_names=['aaf', 'average_time'])

In [None]:
plot_optimization_history(mrng_study, target = lambda t: t.values[0], target_name = 'aaf')

In [None]:
plot_optimization_history(mrng_study, target = lambda t: t.values[1], target_name = 'average_time')

In [None]:
plot_slice(mrng_study, target = lambda t: t.values[0], target_name = 'aaf')

In [None]:
plot_slice(mrng_study, target = lambda t: t.values[1], target_name = 'average_time')

# 5. Optimize NSG

To skip logs, click [here](#visualize-nsg-study-results).

In [None]:
def objective_nsg(trial):
    """Optuna objective for the NSG study.

    Samples ``m`` in [3, 500], ``l`` in [10, 4000], ``lq`` in [1, 2000] and
    ``k`` in [40, 100], then runs ``nsg_test`` on 100 queries with ``N=5``.

    Returns:
        ``(aaf, average_time)`` taken from the ``.value`` attribute of the
        objects returned by ``nsg_test``.
    """
    # Sampling order is kept fixed so the parameter dict prints identically.
    m = trial.suggest_int('m', 3, 500)
    l = trial.suggest_int('l', 10, 4000)
    lq = trial.suggest_int('lq', 1, 2000)
    k = trial.suggest_int('k', 40, 100)
    params = {'m': m, 'l': l, 'lq': lq, 'k': k}

    print("Trial parameters:", params)

    timing, factor = nsg_test(input_path, query_path, queries_num=100, **params, N=5)

    return factor.value, timing.value

In [None]:
%%time
# Run the NSG study, restarting from scratch (at most 10 attempts) whenever
# an attempt raises.  Only `Exception` is caught, so KeyboardInterrupt and
# SystemExit still stop the cell instead of silently starting a new attempt.
for i in range(10):
    try:
        nsg_study = optuna.create_study(study_name='nsg', directions=['minimize', 'minimize'])
        # NOTE(review): n_jobs=-1 runs trials concurrently; since one objective
        # is a measured average time, concurrent trials may inflate it --
        # confirm this is intended (the other studies run sequentially).
        nsg_study.optimize(objective_nsg, n_trials=50, n_jobs=-1)
    except Exception as exc:
        print("Trial failed, trying again...")
        # Show the cause so repeated failures can be diagnosed.
        print(" Reason: {!r}".format(exc))
    else:
        print("-------------------- Best trials --------------------")
        # Report the study's best trials ordered by their objective values.
        trials = sorted(nsg_study.best_trials, key=lambda x: x.values)
        for trial in trials:
            print("Trial no. {}".format(trial.number))
            print(" Values = {}".format(trial.values))
            print(" Params = {}".format(trial.params))
        break

In [None]:
# Collect every trial of the NSG study into a DataFrame.
df = nsg_study.trials_dataframe()

# On a deep copy: drop trials missing either objective value, then order the
# rows by the first objective (values_0), using values_1 as tie-breaker.
df_sorted = (
    df.copy(deep=True)
    .dropna(subset=['values_0', 'values_1'])
    .sort_values(by=['values_0', 'values_1'], ascending=[True, True])
    .reset_index(drop=True)
)
df_sorted

## Visualize NSG study results

In [None]:
plot_pareto_front(nsg_study, target_names=['aaf', 'average_time'])

In [None]:
plot_optimization_history(nsg_study, target = lambda t: t.values[0], target_name = 'aaf')

In [None]:
plot_optimization_history(nsg_study, target = lambda t: t.values[1], target_name = 'average_time')

In [None]:
plot_slice(nsg_study, target = lambda t: t.values[0], target_name = 'aaf')

In [None]:
plot_slice(nsg_study, target = lambda t: t.values[1], target_name = 'average_time')

# 6. Results

# 7. Conclusions