In [None]:
import faiss
import numpy as np
import matplotlib.pyplot as plt
from vasili_helpers import *
from tqdm import tqdm

In [None]:
# Set the number of OpenMP threads faiss uses to 64.
# NOTE(review): 64 is hard-coded — presumably sized for the benchmark machine; confirm before running elsewhere.
faiss.omp_set_num_threads(64)

In [None]:
# xb,xq,gt,metric = load_dataset("SIFT1M")
# nq, d = xq.shape
# k = 10

In [None]:
# Datasets to benchmark; trailing notes record the state of earlier runs.
ANN_DATASETS = [
    "deep-image-96-angular", # need to run again
    "glove-25-angular", # not ok
    "glove-50-angular", # not ok
    "glove-100-angular", # not ok
    "sift-128-euclidean", # ok
    "lastfm-64-dot", # ?
]

# Sweep grid.
# nlist: 10 log-spaced values between 100 and 16000, truncated to int
#        (the range is the faiss recommendation cited by the original author).
# Earlier hand-picked grid: [100, 500, 1000, 2000, 5000, 10000, 15000]
nlist_values = [int(x) for x in np.geomspace(100, 16000, num=10)]
nprobe_values = [1, 2, 4, 8, 16, 32, 64, 128, 256]
k = 10  # neighbours requested per query; also passed to compute_recall

for dataset_name in ANN_DATASETS:
    xb, xq, gt, metric = load_dataset(dataset_name)

    # nlist -> recalls aligned with nprobe_values (None where the combination is skipped)
    results = {}

    for nlist in nlist_values:
        # km_n_iter=0: per the original note this is the untrained index, "worst case nprobe"
        index_ivf, _, _ = train_ivfflat(xb, nlist=nlist, metric=metric, km_n_iter=0)

        recalls = []
        for nprobe in nprobe_values:
            # Skip nprobe values larger than 0.8 * nlist; None keeps the list aligned
            if nprobe > 0.8 * nlist:
                recalls.append(None)
                continue

            index_ivf.nprobe = nprobe
            _, ivf_results = index_ivf.search(xq, k)
            recalls.append(compute_recall(ivf_results, gt, k))

        # Store results for the current nlist
        results[nlist] = recalls

    # One figure per dataset: recall vs. nprobe, one line per nlist
    plt.figure(figsize=(10, 7))

    for nlist, recalls in results.items():
        plt.plot(nprobe_values, recalls, marker='o', label=f'nlist={nlist}')

    plt.xlabel('nprobe')
    plt.ylabel('Recall')
    plt.title(f'Recall vs. nprobe for {dataset_name} (Different nlist Sizes)')
    plt.ylim(0, 1)
    plt.xscale('log', base=2)
    # Explicit labels are required: on a log axis, xticks(ticks) alone keeps the
    # log formatter, so the ticks would read 2^0, 2^1, ... instead of 1, 2, 4, ...
    plt.xticks(nprobe_values, [str(v) for v in nprobe_values])
    plt.legend(title="nlist", loc='lower right')
    plt.grid(True)
    plt.yticks([i * 0.1 for i in range(11)])  # y ticks every 0.1 from 0 to 1
    plt.show()

In [None]:

# Load the "lastfm-64-dot" dataset on its own; this rebinds the global xb, xq, gt and metric.
# NOTE(review): the next sweep cell reloads every dataset itself, so this load looks like a leftover check — confirm.
xb, xq, gt, metric = load_dataset("lastfm-64-dot")

In [None]:
# NOTE(review): this cell repeats the sweep from the cell that defines ANN_DATASETS
# (same grid, same datasets); running both rebuilds every index twice.

# Sweep grid.
# nlist: 10 log-spaced values between 100 and 16000, truncated to int
#        (the range is the faiss recommendation cited by the original author).
# Earlier hand-picked grid: [100, 500, 1000, 2000, 5000, 10000, 15000]
nlist_values = [int(x) for x in np.geomspace(100, 16000, num=10)]
nprobe_values = [1, 2, 4, 8, 16, 32, 64, 128, 256]
k = 10  # neighbours requested per query; also passed to compute_recall

for dataset_name in ANN_DATASETS:
    xb, xq, gt, metric = load_dataset(dataset_name)

    # nlist -> recalls aligned with nprobe_values (None where the combination is skipped)
    results = {}

    for nlist in nlist_values:
        # km_n_iter=0: per the original note this is the untrained index, "worst case nprobe"
        index_ivf, _, _ = train_ivfflat(xb, nlist=nlist, metric=metric, km_n_iter=0)

        recalls = []
        for nprobe in nprobe_values:
            # Skip nprobe values larger than 0.8 * nlist; None keeps the list aligned
            if nprobe > 0.8 * nlist:
                recalls.append(None)
                continue

            index_ivf.nprobe = nprobe
            _, ivf_results = index_ivf.search(xq, k)
            recalls.append(compute_recall(ivf_results, gt, k))

        # Store results for the current nlist
        results[nlist] = recalls

    # One figure per dataset: recall vs. nprobe, one line per nlist
    plt.figure(figsize=(10, 7))

    for nlist, recalls in results.items():
        plt.plot(nprobe_values, recalls, marker='o', label=f'nlist={nlist}')

    plt.xlabel('nprobe')
    plt.ylabel('Recall')
    plt.title(f'Recall vs. nprobe for {dataset_name} (Different nlist Sizes)')
    plt.ylim(0, 1)
    plt.xscale('log', base=2)
    # Explicit labels are required: on a log axis, xticks(ticks) alone keeps the
    # log formatter, so the ticks would read 2^0, 2^1, ... instead of 1, 2, 4, ...
    plt.xticks(nprobe_values, [str(v) for v in nprobe_values])
    plt.legend(title="nlist", loc='lower right')
    plt.grid(True)
    plt.yticks([i * 0.1 for i in range(11)])  # y ticks every 0.1 from 0 to 1
    plt.show()

In [None]:
# NOTE(review): this cell repeats the sweep from the cell that defines ANN_DATASETS
# (same grid, same datasets); every extra copy rebuilds all indexes again.

# Sweep grid.
# nlist: 10 log-spaced values between 100 and 16000, truncated to int
#        (the range is the faiss recommendation cited by the original author).
# Earlier hand-picked grid: [100, 500, 1000, 2000, 5000, 10000, 15000]
nlist_values = [int(x) for x in np.geomspace(100, 16000, num=10)]
nprobe_values = [1, 2, 4, 8, 16, 32, 64, 128, 256]
k = 10  # neighbours requested per query; also passed to compute_recall

for dataset_name in ANN_DATASETS:
    xb, xq, gt, metric = load_dataset(dataset_name)

    # nlist -> recalls aligned with nprobe_values (None where the combination is skipped)
    results = {}

    for nlist in nlist_values:
        # km_n_iter=0: per the original note this is the untrained index, "worst case nprobe"
        index_ivf, _, _ = train_ivfflat(xb, nlist=nlist, metric=metric, km_n_iter=0)

        recalls = []
        for nprobe in nprobe_values:
            # Skip nprobe values larger than 0.8 * nlist; None keeps the list aligned
            if nprobe > 0.8 * nlist:
                recalls.append(None)
                continue

            index_ivf.nprobe = nprobe
            _, ivf_results = index_ivf.search(xq, k)
            recalls.append(compute_recall(ivf_results, gt, k))

        # Store results for the current nlist
        results[nlist] = recalls

    # One figure per dataset: recall vs. nprobe, one line per nlist
    plt.figure(figsize=(10, 7))

    for nlist, recalls in results.items():
        plt.plot(nprobe_values, recalls, marker='o', label=f'nlist={nlist}')

    plt.xlabel('nprobe')
    plt.ylabel('Recall')
    plt.title(f'Recall vs. nprobe for {dataset_name} (Different nlist Sizes)')
    plt.ylim(0, 1)
    plt.xscale('log', base=2)
    # Explicit labels are required: on a log axis, xticks(ticks) alone keeps the
    # log formatter, so the ticks would read 2^0, 2^1, ... instead of 1, 2, 4, ...
    plt.xticks(nprobe_values, [str(v) for v in nprobe_values])
    plt.legend(title="nlist", loc='lower right')
    plt.grid(True)
    plt.yticks([i * 0.1 for i in range(11)])  # y ticks every 0.1 from 0 to 1
    plt.show()

In [None]:
# Quick-look setup: one dataset, restricted to its first 10 queries for fast iteration.
k = 10
dataset_name = "glove-25-angular"
xb, xq, gt, metric = load_dataset(dataset_name)
# Slice queries and ground truth together so their rows stay aligned.
xq, gt = xq[:10, :], gt[:10, :]
metric  # last expression of the cell: displays the metric returned by load_dataset

In [None]:
# Sanity check: shape of xb, the array later passed to train_ivfflat.
xb.shape

In [None]:
# Sanity check: shape of the query array; the setup cell above keeps only the first 10 rows.
xq.shape

In [None]:
# Recall-vs-nprobe sweep for the dataset prepared in the setup cell above
# (relies on dataset_name, xb, xq, gt, metric and k defined there).
nlist_values = [100, 500, 1000, 2000, 3000, 4000, 5000, 8000, 10000, 13000, 16000]  # nlist sizes from 100 to 16000
nprobe_values = [1, 2, 4, 8, 16, 32, 64, 128, 256]

# The metric does not change across nlist values, so report it once rather than per iteration.
print(metric)

# nlist -> recalls aligned with nprobe_values (None where the combination is skipped)
results = {}

for nlist in nlist_values:
    # km_n_iter=0: per the original note this is the untrained index, "worst case nprobe"
    index_ivf, _, _ = train_ivfflat(xb, nlist=nlist, metric=metric, km_n_iter=0)

    recalls = []
    for nprobe in nprobe_values:
        # Skip combinations where nprobe exceeds nlist; None keeps the list aligned
        if nprobe > nlist:
            recalls.append(None)
            continue

        index_ivf.nprobe = nprobe
        _, ivf_results = index_ivf.search(xq, k)
        recalls.append(compute_recall(ivf_results, gt, k))

    # Store results for the current nlist
    results[nlist] = recalls

# Recall vs. nprobe, one line per nlist
plt.figure(figsize=(10, 7))

for nlist, recalls in results.items():
    plt.plot(nprobe_values, recalls, marker='o', label=f'nlist={nlist}')

plt.xlabel('nprobe')
plt.ylabel('Recall')
plt.title(f'Recall vs. nprobe for {dataset_name} (Different nlist Sizes)')
plt.ylim(0, 1)
plt.xscale('log', base=2)
# Explicit labels are required: on a log axis, xticks(ticks) alone keeps the
# log formatter, so the ticks would read 2^0, 2^1, ... instead of 1, 2, 4, ...
plt.xticks(nprobe_values, [str(v) for v in nprobe_values])
plt.legend(title="nlist", loc='lower right')
plt.grid(True)
plt.yticks([i * 0.1 for i in range(11)])  # y ticks every 0.1 from 0 to 1
plt.show()


In [None]:
# Recall-vs-nprobe sweep on glove-25-angular using every query returned by load_dataset
# (no subsampling of xq / gt in this cell).
dataset_name = "glove-25-angular"
xb, xq, gt, metric = load_dataset(dataset_name)
k = 10  # neighbours requested per query; also passed to compute_recall

nlist_values = [100, 500, 1000, 2000, 3000, 4000, 5000, 8000, 10000, 13000, 16000]  # nlist sizes from 100 to 16000
nprobe_values = [1, 2, 4, 8, 16, 32, 64, 128, 256]

# The metric does not change across nlist values, so report it once rather than per iteration.
print(metric)

# nlist -> recalls aligned with nprobe_values (None where the combination is skipped)
results = {}

for nlist in nlist_values:
    # km_n_iter=0: per the original note this is the untrained index, "worst case nprobe"
    index_ivf, _, _ = train_ivfflat(xb, nlist=nlist, metric=metric, km_n_iter=0)

    recalls = []
    for nprobe in nprobe_values:
        # Skip combinations where nprobe exceeds nlist; None keeps the list aligned
        if nprobe > nlist:
            recalls.append(None)
            continue

        index_ivf.nprobe = nprobe
        _, ivf_results = index_ivf.search(xq, k)
        recalls.append(compute_recall(ivf_results, gt, k))

    # Store results for the current nlist
    results[nlist] = recalls

# Recall vs. nprobe, one line per nlist
plt.figure(figsize=(10, 7))

for nlist, recalls in results.items():
    plt.plot(nprobe_values, recalls, marker='o', label=f'nlist={nlist}')

plt.xlabel('nprobe')
plt.ylabel('Recall')
plt.title(f'Recall vs. nprobe for {dataset_name} (Different nlist Sizes)')
plt.ylim(0, 1)
plt.xscale('log', base=2)
# Explicit labels are required: on a log axis, xticks(ticks) alone keeps the
# log formatter, so the ticks would read 2^0, 2^1, ... instead of 1, 2, 4, ...
plt.xticks(nprobe_values, [str(v) for v in nprobe_values])
plt.legend(title="nlist", loc='lower right')
plt.grid(True)
plt.yticks([i * 0.1 for i in range(11)])  # y ticks every 0.1 from 0 to 1
plt.show()


In [None]:
# Recall-vs-nprobe sweep on glove-25-angular using every query returned by load_dataset
# (the 10-query subsampling is disabled in this cell).
# NOTE(review): the preceding cell runs this same sweep; keep only one of them if the repeat is unintentional.
dataset_name = "glove-25-angular"
xb, xq, gt, metric = load_dataset(dataset_name)
k = 10  # neighbours requested per query; also passed to compute_recall

nlist_values = [100, 500, 1000, 2000, 3000, 4000, 5000, 8000, 10000, 13000, 16000]  # nlist sizes from 100 to 16000
nprobe_values = [1, 2, 4, 8, 16, 32, 64, 128, 256]

# The metric does not change across nlist values, so report it once rather than per iteration.
print(metric)

# nlist -> recalls aligned with nprobe_values (None where the combination is skipped)
results = {}

for nlist in nlist_values:
    # km_n_iter=0: per the original note this is the untrained index, "worst case nprobe"
    index_ivf, _, _ = train_ivfflat(xb, nlist=nlist, metric=metric, km_n_iter=0)

    recalls = []
    for nprobe in nprobe_values:
        # Skip combinations where nprobe exceeds nlist; None keeps the list aligned
        if nprobe > nlist:
            recalls.append(None)
            continue

        index_ivf.nprobe = nprobe
        _, ivf_results = index_ivf.search(xq, k)
        recalls.append(compute_recall(ivf_results, gt, k))

    # Store results for the current nlist
    results[nlist] = recalls

# Recall vs. nprobe, one line per nlist
plt.figure(figsize=(10, 7))

for nlist, recalls in results.items():
    plt.plot(nprobe_values, recalls, marker='o', label=f'nlist={nlist}')

plt.xlabel('nprobe')
plt.ylabel('Recall')
plt.title(f'Recall vs. nprobe for {dataset_name} (Different nlist Sizes)')
plt.ylim(0, 1)
plt.xscale('log', base=2)
# Explicit labels are required: on a log axis, xticks(ticks) alone keeps the
# log formatter, so the ticks would read 2^0, 2^1, ... instead of 1, 2, 4, ...
plt.xticks(nprobe_values, [str(v) for v in nprobe_values])
plt.legend(title="nlist", loc='lower right')
plt.grid(True)
plt.yticks([i * 0.1 for i in range(11)])  # y ticks every 0.1 from 0 to 1
plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Recall per dataset at nprobe = int(sqrt(nlist)), drawn as grouped bars (one bar per nlist).
# NOTE: this rebinds `results` to nlist -> [one recall per dataset, in ANN_DATASETS order],
# so it is no longer aligned with nprobe_values.
nlist_values = [100, 500, 1000, 2000, 5000, 10000, 15000]  # Different nlist sizes
results = {nlist: [] for nlist in nlist_values}
k = 10  # neighbours requested per query; also passed to compute_recall

for dataset_name in ANN_DATASETS:
    xb, xq, gt, metric = load_dataset(dataset_name)
    # Only the first 10 queries are evaluated; gt is sliced identically to stay aligned.
    xq = xq[:10, :]
    gt = gt[:10, :]

    for nlist in nlist_values:
        # km_n_iter=0 is the same setting the sweep cells use
        index_ivf, _, _ = train_ivfflat(xb, nlist=nlist, metric=metric, km_n_iter=0)

        # nprobe = sqrt(nlist), truncated to an integer
        nprobe = int(np.sqrt(nlist))
        index_ivf.nprobe = nprobe

        _, ivf_results = index_ivf.search(xq, k)
        results[nlist].append(compute_recall(ivf_results, gt, k))

# Grouped bar chart: one group per dataset, one bar per nlist
plt.figure(figsize=(14, 8))

width = 0.12  # Width of each bar
x_indices = np.arange(len(ANN_DATASETS))  # x positions (group centres) for datasets

# Bars are centre-aligned, so the middle of a group of n bars sits at index (n - 1) / 2.
# Using n / 2 here would shift every group half a bar-width to the left of its tick.
group_centre = (len(nlist_values) - 1) / 2

for i, nlist in enumerate(nlist_values):
    x_positions = x_indices + (i - group_centre) * width
    plt.bar(x_positions, results[nlist], width, label=f'nlist={nlist}')

# Labeling and aesthetics
plt.xticks(x_indices, ANN_DATASETS, rotation=45, ha="right")
plt.xlabel('Dataset')
plt.ylabel('Recall')
plt.title('Recall for Different Datasets (nprobe = sqrt(nlist))')
plt.ylim(0, 1)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.yticks([i * 0.1 for i in range(11)])  # y-axis ticks at 0.1 intervals
plt.legend(title="nlist", loc='lower right')
plt.tight_layout()
plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt
# Recall per dataset at nprobe = int(sqrt(nlist)), drawn as grouped bars (one bar per nlist).
# NOTE(review): the preceding cell computes and draws the same chart; keep only one if the repeat is unintentional.
# NOTE: this rebinds `results` to nlist -> [one recall per dataset, in ANN_DATASETS order],
# so it is no longer aligned with nprobe_values.
nlist_values = [100, 500, 1000, 2000, 5000, 10000, 15000]  # Different nlist sizes
results = {nlist: [] for nlist in nlist_values}
k = 10  # neighbours requested per query; also passed to compute_recall

for dataset_name in ANN_DATASETS:
    xb, xq, gt, metric = load_dataset(dataset_name)
    # Only the first 10 queries are evaluated; gt is sliced identically to stay aligned.
    xq = xq[:10, :]
    gt = gt[:10, :]

    for nlist in nlist_values:
        # km_n_iter=0 is the same setting the sweep cells use
        index_ivf, _, _ = train_ivfflat(xb, nlist=nlist, metric=metric, km_n_iter=0)

        # nprobe = sqrt(nlist), truncated to an integer
        nprobe = int(np.sqrt(nlist))
        index_ivf.nprobe = nprobe

        _, ivf_results = index_ivf.search(xq, k)
        results[nlist].append(compute_recall(ivf_results, gt, k))

# Grouped bar chart: one group per dataset, one bar per nlist
plt.figure(figsize=(14, 8))

width = 0.12  # Width of each bar
x_indices = np.arange(len(ANN_DATASETS))  # x positions (group centres) for datasets

# Bars are centre-aligned, so the middle of a group of n bars sits at index (n - 1) / 2.
# Using n / 2 here would shift every group half a bar-width to the left of its tick.
group_centre = (len(nlist_values) - 1) / 2

for i, nlist in enumerate(nlist_values):
    x_positions = x_indices + (i - group_centre) * width
    plt.bar(x_positions, results[nlist], width, label=f'nlist={nlist}')

# Labeling and aesthetics
plt.xticks(x_indices, ANN_DATASETS, rotation=45, ha="right")
plt.xlabel('Dataset')
plt.ylabel('Recall')
plt.title('Recall for Different Datasets (nprobe = sqrt(nlist))')
plt.ylim(0, 1)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.yticks([i * 0.1 for i in range(11)])  # y-axis ticks at 0.1 intervals
plt.legend(title="nlist", loc='lower right')
plt.tight_layout()
plt.show()

In [None]:
# Display the dataset list that the sweep and bar-chart cells iterate over.
ANN_DATASETS

In [None]:

# Re-draw the recall-vs-nprobe figure from whatever sweep is currently held in `results`.
# `results` is rebound by several cells (the bar-chart cells store one recall per dataset
# instead of one per nprobe), so keep only series that line up with nprobe_values.
# Without this guard, plt.plot raises a shape-mismatch ValueError when the notebook is run top to bottom.
sweep_results = {
    nlist: recalls
    for nlist, recalls in results.items()
    if len(recalls) == len(nprobe_values)
}

if not sweep_results:
    print(
        "Nothing to plot: `results` does not hold a recall-vs-nprobe sweep "
        "(re-run one of the sweep cells first)."
    )
else:
    plt.figure(figsize=(10, 7))

    for nlist, recalls in sweep_results.items():
        plt.plot(nprobe_values, recalls, marker='o', label=f'nlist={nlist}')

    plt.xlabel('nprobe')
    plt.ylabel('Recall')
    plt.title(f'Recall vs. nprobe for {dataset_name} (Different nlist Sizes)')
    plt.ylim(0, 1)
    plt.xscale('log', base=2)
    # Explicit labels are required: on a log axis, xticks(ticks) alone keeps the
    # log formatter, so the ticks would read 2^0, 2^1, ... instead of 1, 2, 4, ...
    plt.xticks(nprobe_values, [str(v) for v in nprobe_values])
    plt.legend(title="nlist", loc='lower right')
    plt.grid(True)
    plt.yticks([i * 0.1 for i in range(11)])  # y ticks every 0.1 from 0 to 1
    plt.show()
