In [34]:
from src.config import Config
from src.utils import Utils
from src.engine import VectorEngine

from time import time
import random
import pandas as pd

def perform_search(db, queries, brute, symmetric=False, cached=True, top_k=5):
    """Run one ``db.search`` call and measure how long it takes.

    Args:
        db: Object exposing ``search(queries, brute=..., symmetric=...,
            cached=..., top_k=...)``.
        queries: Query batch, forwarded to ``db.search`` unchanged.
        brute: Forwarded as the ``brute`` keyword.
        symmetric: Forwarded as the ``symmetric`` keyword (default ``False``).
        cached: Forwarded as the ``cached`` keyword (default ``True``).
        top_k: Forwarded as the ``top_k`` keyword (default ``5``).

    Returns:
        tuple: ``(results, search_time)`` where ``results`` is whatever
        ``db.search`` returned and ``search_time`` is the elapsed time in
        seconds as a float.
    """
    # perf_counter is monotonic and uses the highest-resolution clock available,
    # unlike time.time(), which follows the (adjustable) wall clock.
    from time import perf_counter

    start_time = perf_counter()
    results = db.search(queries, brute=brute, symmetric=symmetric, cached=cached, top_k=top_k)
    search_time = perf_counter() - start_time
    return results, search_time

def run_experiment(embd_dim, num_queries, num_docs, codebook_size, num_subvectors, queries_topk, keys_topk):
    """Benchmark brute-force, VQ and PQ searches on one generated dataset.

    Data is produced by ``Utils.generate_data(embd_dim, num_queries, num_docs)``
    and loaded into a fresh ``VectorEngine(embd_dim)``. The brute-force search
    (``top_k=queries_topk``) serves as the reference; every quantized search
    (``top_k=keys_topk``) is scored against it with ``Utils.get_recall``.

    Args:
        embd_dim: Passed to ``Utils.generate_data`` and ``VectorEngine``.
        num_queries: Passed to ``Utils.generate_data``.
        num_docs: Passed to ``Utils.generate_data``.
        codebook_size: ``codebook_size`` used for both index builds.
        num_subvectors: ``num_subvectors`` used for the ``'producted'`` index.
        queries_topk: ``top_k`` of the brute-force reference search.
        keys_topk: ``top_k`` of every quantized search.

    Returns:
        pandas.DataFrame: Six rows (one per search variant) with columns
        ``Algorithm``, ``Symmetric``, ``Cached``, ``Search Time``, ``Recall``.
    """
    queries, docs = Utils.generate_data(embd_dim, num_queries, num_docs)

    engine = VectorEngine(embd_dim)
    engine.add(docs)
    engine.indexing(method='vectored', codebook_size=codebook_size)

    # Reference run: its recall is recorded as 1.0 by definition.
    ground_truths, brute_seconds = perform_search(engine, queries, brute=True, top_k=queries_topk)
    rows = [("brute", "Asymmetric", "Not Cached", brute_seconds, 1.0)]

    def quantized_row(algorithm, symmetry_label, cache_label, **search_flags):
        """Time one non-brute search and score it against the reference."""
        hits, seconds = perform_search(engine, queries, brute=False, top_k=keys_topk, **search_flags)
        recall = Utils.get_recall(ground_truths, hits)
        return (algorithm, symmetry_label, cache_label, seconds, recall)

    # Vector quantization (index built above with method='vectored').
    # NOTE(review): the first VQ row is labelled "Not Cached" but does not pass
    # `cached`, so perform_search's default cached=True is used — confirm intent.
    rows.append(quantized_row("VQ", "Asymmetric", "Not Cached", symmetric=False))
    rows.append(quantized_row("VQ", "Symmetric", "Cached", symmetric=True))

    # Product quantization: re-index the same engine before the PQ searches.
    engine.indexing(method='producted', codebook_size=codebook_size, num_subvectors=num_subvectors)
    rows.append(quantized_row("PQ", "Asymmetric", "Not Cached", symmetric=False, cached=False))
    rows.append(quantized_row("PQ", "Asymmetric", "Cached", symmetric=False, cached=True))
    rows.append(quantized_row("PQ", "Symmetric", "Cached", symmetric=True))

    return pd.DataFrame(
        rows,
        columns=["Algorithm", "Symmetric", "Cached", "Search Time", "Recall"],
    )

def random_search(num_experiments, seed=None):
    """Run ``num_experiments`` experiments with randomly drawn hyperparameters.

    Each trial draws ``codebook_size``, ``num_subvectors`` and ``keys_topk``
    from fixed candidate lists and calls ``run_experiment`` with a constant
    problem size (``embd_dim=120``, ``num_queries=100``, ``num_docs=50_000``,
    ``queries_topk=5``).

    Args:
        num_experiments: Number of trials to run. Zero or a negative value
            runs nothing and yields an empty DataFrame.
        seed: Optional seed for the hyperparameter draws. When ``None``
            (default) the module-level ``random`` generator is used, exactly
            as before. The seed only controls the draws made here; whether the
            data produced inside ``run_experiment`` is reproducible is not
            affected by it.

    Returns:
        pandas.DataFrame: The per-trial frames returned by ``run_experiment``
        stacked in trial order with a fresh 0..n-1 index.
    """
    # Problem size is identical for every trial, so set it once.
    embd_dim = 120
    num_queries = 100
    num_docs = 50_000
    queries_topk = 5

    # A private generator keeps a seeded run from touching global random state.
    rng = random.Random(seed) if seed is not None else random

    experiment_frames = []
    for _ in range(num_experiments):
        # Set random hyperparameters
        codebook_size = rng.choice([10, 20, 40, 80, 160])
        num_subvectors = rng.choice([2, 3, 4, 5, 8])
        keys_topk = rng.choice([5, 10, 20, 30, 40])

        experiment_frames.append(
            run_experiment(embd_dim, num_queries, num_docs, codebook_size, num_subvectors, queries_topk, keys_topk)
        )

    # pd.concat raises on an empty list; keep returning an empty frame instead.
    if not experiment_frames:
        return pd.DataFrame()

    # One concat at the end avoids re-copying the accumulated rows every trial.
    return pd.concat(experiment_frames, ignore_index=True)

# Run the random search over 50 experiments (run_experiment yields 6 rows per experiment).
num_experiments = 50
results_df = random_search(num_experiments)
print(results_df)

    Algorithm   Symmetric      Cached  Search Time  Recall
0       brute  Asymmetric  Not Cached     0.109103   1.000
1          VQ  Asymmetric  Not Cached     0.058601   1.000
2          VQ   Symmetric      Cached     0.019934   0.076
3          PQ  Asymmetric  Not Cached     0.309252   0.026
4          PQ  Asymmetric      Cached     0.163474   0.026
..        ...         ...         ...          ...     ...
295        VQ  Asymmetric  Not Cached     0.012042   0.388
296        VQ   Symmetric      Cached     0.053888   0.124
297        PQ  Asymmetric  Not Cached     0.677206   0.312
298        PQ  Asymmetric      Cached     0.401837   0.312
299        PQ   Symmetric      Cached     0.486327   0.146

[300 rows x 5 columns]


In [35]:
import os

# Persist the per-experiment results as CSV (index column omitted) under Config.RESOURCE_DIR.
results_df.to_csv(os.path.join(Config.RESOURCE_DIR, 'results_11.csv'), index=False)
# Author's note: Config.RESOURCE_DIR = 'resources' (not verifiable from this notebook — confirm in src.config)

In [38]:
# Average the numeric columns (search time, recall) for each
# (Algorithm, Symmetric, Cached) combination across all experiments.
grouped_df = results_df.groupby(['Algorithm', 'Symmetric', 'Cached'], as_index=False).mean(numeric_only=True)
# Order the averaged rows from fastest to slowest mean search time.
sorted_df = grouped_df.sort_values(by='Search Time', ascending=True)

sorted_df

Unnamed: 0,Algorithm,Symmetric,Cached,Search Time,Recall
4,VQ,Symmetric,Cached,0.01979,0.0892
3,VQ,Asymmetric,Not Cached,0.046968,0.84088
5,brute,Asymmetric,Not Cached,0.184452,1.0
0,PQ,Asymmetric,Cached,0.266374,0.10428
2,PQ,Symmetric,Cached,0.307186,0.05836
1,PQ,Asymmetric,Not Cached,0.519876,0.10428


In [22]:
# NOTE(review): this cell's execution count (22) is lower than the cells above (34-38),
# so the table shown below appears to come from an earlier run — re-run top to bottom to confirm.
results_df

Unnamed: 0,Algorithm,Symmetric,Cached,Search Time,Recall
0,brute,Asymmetric,Not Cached,0.024248,1.0
1,VQ,Asymmetric,Not Cached,0.008708,0.266
2,VQ,Symmetric,Cached,0.006685,0.042
3,PQ,Asymmetric,Not Cached,0.061801,0.038
4,PQ,Asymmetric,Cached,0.021183,0.038
5,PQ,Symmetric,Cached,0.028741,0.02
6,brute,Asymmetric,Not Cached,0.012938,1.0
7,VQ,Asymmetric,Not Cached,0.009891,0.37
8,VQ,Symmetric,Cached,0.005294,0.096
9,PQ,Asymmetric,Not Cached,0.028389,0.082
