In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import sys 
import numpy as np

Load the SIFT1M dataset

In [2]:
# Directory holding the SIFT1M .fvecs/.ivecs files read by load_sift1M.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
PATH_SIFT = "/pub/scratch/vmageirakos/datasets/parlay-ann/sift"

In [3]:
def ivecs_read(fname):
    """Read an .ivecs file into a 2-D int32 array.

    The file is loaded as a flat run of int32 words. The very first word is
    taken as the vector dimension ``d``; the buffer is then viewed as rows of
    ``d + 1`` words and the leading word of every row is dropped.

    Parameters
    ----------
    fname : str or path-like
        File to read.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n, d)`` and dtype int32, copied so that it does not
        keep the full file buffer (header words included) alive.
    """
    words = np.fromfile(fname, dtype='int32')
    dim = words[0]
    # Each record is one header word followed by `dim` payload words.
    records = words.reshape(-1, dim + 1)
    payload = records[:, 1:]
    return payload.copy()

def fvecs_read(fname):
    """Read an .fvecs file into a 2-D float32 array.

    The file is parsed with ``ivecs_read`` and the resulting int32 buffer is
    reinterpreted as float32 (a view on the same bytes, no numeric conversion).

    Parameters
    ----------
    fname : str or path-like
        File to read.

    Returns
    -------
    numpy.ndarray
        float32 array with the same shape as the ``ivecs_read`` result.
    """
    as_int32 = ivecs_read(fname)
    return as_int32.view('float32')

def load_sift1M(PATH):
    """Load the four SIFT1M files found under directory ``PATH``.

    Progress ("Loading sift1M..." / "done") is written to stderr.

    Parameters
    ----------
    PATH : str
        Directory containing ``sift_learn.fvecs``, ``sift_base.fvecs``,
        ``sift_query.fvecs`` and ``sift_groundtruth.ivecs``.

    Returns
    -------
    tuple
        ``(xb, xq, xt, gt)``: base vectors, query vectors, learn vectors and
        ground truth, in that order.
    """
    print("Loading sift1M...", end='', file=sys.stderr)

    learn_vectors = fvecs_read(f"{PATH}/sift_learn.fvecs")
    base_vectors = fvecs_read(f"{PATH}/sift_base.fvecs")
    query_vectors = fvecs_read(f"{PATH}/sift_query.fvecs")
    ground_truth = ivecs_read(f"{PATH}/sift_groundtruth.ivecs")

    print("done", file=sys.stderr)

    # Return order differs from load order: base, query, learn, ground truth.
    return base_vectors, query_vectors, learn_vectors, ground_truth

In [4]:
# xb = base vectors, xq = query vectors, xt = learn vectors, gt = ground truth
# (order fixed by load_sift1M's return statement).
xb, xq, xt, gt = load_sift1M(PATH_SIFT)

Loading sift1M...done


In [5]:
# All four shapes are displayed because ast_node_interactivity is set to "all"
# in the first cell; by default only the last expression would be shown.
xb.shape
xq.shape
xt.shape
gt.shape

(1000000, 128)

(10000, 128)

(100000, 128)

(10000, 100)

Convert it into the format iRangeGraph expects:

Data:
- The data points over which the index is built, in .bin format. The first 4 bytes give the number of points as an integer. The next 4 bytes give the dimension of the data as an integer. The following `n*d*sizeof(float)` bytes contain the data itself, one data point at a time.
There is no need to pre-sort the data points by any attribute. Just make sure the data points, attribute1, and attribute2 correspond one-to-one, in the same order.

Query:
- The query vectors, in .bin format. The first 4 bytes give the number of points as an integer. The next 4 bytes give the dimension of the data as an integer. The following `n*d*sizeof(float)` bytes contain the query vectors, one query point at a time.


In [30]:
import numpy as np
import struct

def convert_to_bin_format(data):
    """Serialize a 2-D array of vectors into the ``.bin`` byte layout.

    Layout: a 4-byte int32 point count ``n``, a 4-byte int32 dimension ``d``,
    then ``n * d`` float32 values in row-major order (one point after
    another). Everything is written in the host's native byte order.

    This single definition replaces two same-named definitions, the second of
    which shadowed the first and built the payload with
    ``struct.pack(fmt, *values)``. That created one Python float per element;
    dumping the array buffer directly yields the same bytes without it.

    Parameters
    ----------
    data : array_like, shape (n, d)
        Vectors to serialize. Anything ``np.asarray`` accepts; values are
        cast to float32 if they are not already.

    Returns
    -------
    bytes
        The 8-byte header followed by the raw float32 payload.

    Raises
    ------
    ValueError
        If ``data`` is not two-dimensional.
    """
    vectors = np.asarray(data)
    if vectors.ndim != 2:
        raise ValueError(
            f"expected a 2-D array of shape (n, d), got shape {vectors.shape}"
        )
    num_points, dim = vectors.shape

    # Two native-endian int32 values: number of points, then dimension.
    header = np.array([num_points, dim], dtype=np.int32).tobytes()

    # copy=False skips the cast when the input is already float32;
    # tobytes() always emits C (row-major) order, whatever the memory layout.
    payload = vectors.astype(np.float32, copy=False).tobytes()

    return header + payload

def save_data(data, filename):
    """Write ``data`` to ``filename`` in binary mode.

    The file is created if missing and truncated if it already exists.

    Parameters
    ----------
    data : bytes-like
        Payload to write.
    filename : str or path-like
        Destination path.
    """
    with open(filename, mode='wb') as sink:
        sink.write(data)

In [26]:
# Sanity check of the 8-byte .bin header: pack (num_points, dim) as two ints
# and confirm that unpacking returns the same values.
num_points, dim = xb.shape
num_points
dim
header = struct.pack('ii', num_points, dim)
header
struct.unpack('ii', header)

1000000

128

b'@B\x0f\x00\x80\x00\x00\x00'

(1000000, 128)

In [17]:
# Serialize xb and xq into in-memory .bin payloads (bytes objects).
# Nothing is written to disk here; the save_data calls do that.
xb_bin = convert_to_bin_format(xb)
xq_bin = convert_to_bin_format(xq)

In [17]:
# Write the payloads to disk; the relative file names resolve against the
# kernel's current working directory.
save_data(xb_bin, 'sift_base.bin')
save_data(xq_bin, 'sift_query.bin')