In [16]:
import faiss
import numpy as np
import time

# Number of vectors and dimensions (binary vector size)
n_vectors = 20000000
d = 64  # Binary vector size in bits

# Create random binary vectors
xb = np.random.randint(2, size=(n_vectors, d)).astype('uint8')
print(xb.shape)

# Convert to packed binary format (FAISS stores binary vectors as uint8 arrays)
xb_packed = np.packbits(xb, axis=1)
print(xb_packed.shape)


# Create the FAISS binary flat index
start_time = time.time()
index = faiss.IndexBinaryFlat(d)
print("Time to create index:", time.time() - start_time)

# Add the binary vectors to the index
start_time = time.time()
index.add(xb_packed)
print("Time to add vectors:", time.time() - start_time)

# Query vector
xq = np.random.randint(2, size=(1, d)).astype('uint8')
print(xq.shape)
xq_packed = np.packbits(xq, axis=1)

# Perform a search for the top 5 nearest neighbors
start_time = time.time()
k = 5
distances, indices = index.search(xq_packed, k)
print("Time to generate query vector:", time.time() - start_time)

print("Nearest neighbors:", indices)
print("Hamming distances:", distances)

(20000000, 64)
(20000000, 8)
Time to create index: 0.004430532455444336
Time to add vectors: 0.0381777286529541
(1, 64)
Time to generate query vector: 0.07126069068908691
Nearest neighbors: [[19240477 14689759 19053508  5805317  5010410]]
Hamming distances: [[11 12 12 12 12]]


In [None]:
from sklearn_extra.cluster import KMedoids
from sklearn.metrics import pairwise_distances
import numpy as np

# Simulate binary dataset (60M points, 64 bits)
num_points = 60000000
num_features = 64
binary_data = np.random.randint(2, size=(num_points, num_features), dtype=np.uint8)

# Use K-medoids with Hamming distance
# Reduce the dataset size for demonstration purposes
sample_data = binary_data[:100000]  # Use a smaller sample to fit into memory
kmedoids = KMedoids(n_clusters=20000, metric="hamming", random_state=42)
kmedoids.fit(sample_data)

print("Cluster labels:", kmedoids.labels_)
print("Cluster medoids:", kmedoids.cluster_centers_)

In [3]:
import faiss
import numpy as np
from tqdm import tqdm

# Simulate binary dataset
binary_data = np.random.randint(2, size=(60000000, 64), dtype=np.uint8)

# Convert binary data to uint8 format for FAISS
binary_data_packed = np.packbits(binary_data, axis=1)

# Create a FAISS index for Hamming distance
index = faiss.IndexBinaryFlat(64)  # 64-bit binary vectors
index.add(binary_data_packed)
print("Number of indexed vectors:", index.ntotal)

# Perform clustering using approximate nearest neighbors
n_clusters = 20000
cluster_centers = binary_data_packed[:n_clusters]  # Use first N points as initial centers
assignments = []

# for vector in binary_data_packed:
for vector in tqdm(binary_data_packed, total=len(binary_data_packed)):
    _, nearest_center = index.search(np.expand_dims(vector, axis=0), k=1)
    assignments.append(nearest_center[0][0])

print("Cluster assignments:", assignments)

Number of indexed vectors: 60000000


  0%|          | 91/60000000 [00:13<2451:46:03,  6.80it/s]


KeyboardInterrupt: 