## General Plan

Ideally:
1. We use outdoors dataset for all examples w/ https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1 model
2. We have a general "test_recall" method that takes in an original ranked list or results plus any number of other lists and compares recall of the top-N
3. We have the following code examples, each of which output the recall for each set of results passed in. Possibly also the first few results where it makes sense to show the output.

----1. Scalar quantization (Int8 , Int4) **NOTE: can leave Int4 off if unsupported by SentenceTransformers (I think this may be the case)**

--------- SentenceTransformers as library

--------- code using it tests original vs. Int 8 vs. Int8 w/rescoring

--------- second listing tests original vs. Int4 vs. Int4 w/rescoring

----2. Binary quantization

--------- Sentence transformers as library

--------- code using it tests original vs. bq vs. bq w/ rescoring

----3. Matroyoshka Learned Representations

-------- Sentence transformers as library

--------- code using it tests original vs. MRL @ 1/2 vs MRL @ 1/4 vs. MRL @ 1/8 vs. MRL @ 1/2 w/rescoring vs. MRL @ 1/4 w/rescoring vs. MRL @ 1/8 w/rescoring

----3. Product quantization

-------- nanopq as library (I don't think sentence tranformers does product quantization)


# Daniel's Original Code

In [214]:
import sys
sys.path.append("../..")
from aips import get_engine
from pyspark.sql import SparkSession
import pickle 
import numpy 
import torch
import clip
import time
import pandas
import random
import tqdm

device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

engine = get_engine()
spark = SparkSession.builder.appName("AIPS").getOrCreate()

In [215]:
![ ! -d 'tmdb' ] && git clone --depth 1 https://github.com/ai-powered-search/tmdb.git
! cd tmdb && git pull
! cd tmdb && mkdir -p '../data/tmdb/' && tar -xvf movies_with_image_embeddings.tgz -C '../data/tmdb/'

Already up to date.
movies_with_image_embeddings.pickle


In [216]:
def normalize_embedding(embedding):
    return numpy.divide(embedding,
      numpy.linalg.norm(embedding,axis=0)).tolist()

def read(cache_name):
    cache_file_name = f"data/tmdb/{cache_name}.pickle"
    with open(cache_file_name, "rb") as fd:
        return pickle.load(fd)

def quantize(embeddings):
    embeddings = numpy.array(embeddings)
    quantized_embeddings = numpy.zeros_like(embeddings, dtype=numpy.int8)
    quantized_embeddings[embeddings > 0] = 1
    return quantized_embeddings.tolist()

def tmdb_with_embeddings_dataframe():
    movies = read("movies_with_image_embeddings")
    embeddings = movies["image_embeddings"]
    normalized_embeddings = [normalize_embedding(e) for e in embeddings]
    quantized_embeddings = [quantize(e) for e in normalized_embeddings]
    movie_dataframe = spark.createDataFrame(
        zip(movies["movie_ids"], movies["titles"], 
            movies["image_ids"], normalized_embeddings,
            quantized_embeddings),
        schema=["movie_id", "title", "image_id", "image_embedding",
                "image_binary_embedding"])
    return movie_dataframe
    
def encode_text(text):
    text = clip.tokenize([text]).to(device)
    text_features = model.encode_text(text).tolist()[0]
    return numpy.array(normalize_embedding(text_features))

In [217]:
movie_dataframe = tmdb_with_embeddings_dataframe()
embeddings_collection = engine.create_collection("tmdb_with_embeddings")
embeddings_collection.write(movie_dataframe)

Wiping "tmdb_with_embeddings" collection
Creating "tmdb_with_embeddings" collection
Status: Success
Successfully written 7549 documents


In [218]:
def column_list(dataframe, column):
    return numpy.array(dataframe.select(column).rdd.flatMap(lambda x: x).collect())

def sort_titles(scores, movies, limit=25):
    titles = column_list(movies, "title").tolist()
    binary_results = numpy.argsort(scores)[-limit:][::-1]
    ranked = [titles[id] for id in binary_results]
    return list(dict.fromkeys(ranked))

def numpy_rankings(query, limit=20):
    start_dotprod = time.time()

    embeddings = column_list(movie_dataframe, "image_embedding")
    query_embedding = encode_text(query)
    dot_prod_scores = numpy.dot(embeddings, query_embedding)

    stop_dotprod = time.time(); start_binary = time.time()

    quantized_embeddings = column_list(movie_dataframe, "image_binary_embedding")
    quantized_query = numpy.array(quantize(query_embedding)) 
    binary_scores = 1536 - numpy.logical_xor(quantized_embeddings,
                                             quantized_query).sum(axis=1)
    
    stop_binary = time.time()
    
    binary_results = sort_titles(binary_scores, movie_dataframe)
    full_results = sort_titles(dot_prod_scores, movie_dataframe)
    return {"binary_query_time": stop_binary - start_binary,
            "full_query_time": stop_dotprod - start_dotprod,
            "recall": len(set(full_results).intersection(set(binary_results))) / len(set(binary_results)),
            "binary_results": binary_results,
            "full_results": full_results}

In [219]:
def only_titles(response):
    return [d["title"] for d in response["docs"]]

def base_search_request(query_vector, field, quantization_size):
    return {"query": query_vector,
            "query_fields": [field],
            "return_fields": ["movie_id", "title", "score"],
            "limit": 25,
            "k": 1000,
            "quantization_size": quantization_size}

def engine_rankings(query, log=False):
    collection = engine.get_collection("tmdb_with_embeddings")
    query_embedding = encode_text(query)    
    quantized_query = numpy.zeros_like(query_embedding, dtype=numpy.int8)
    quantized_query[query_embedding > 0] = 1

    binary_request = base_search_request(quantized_query.tolist(),
                                         "image_binary_embedding",
                                         "BINARY")
    start_dotprod = time.time()    
    binary_results = only_titles(collection.search(**binary_request))
    stop_dotprod = time.time()

    reranked_request = binary_request
    reranked_request["rerank_query"] = {
        "query": query_embedding.tolist(),
        "query_fields": ["image_embedding"],
        "k": 100,
        "rerank_count": 100,
        "quantization_size": "FLOAT32"}
    
    if log: print(json.dumps(reranked_request, indent=2))        
    start_reranked = time.time()    
    full_results = only_titles(collection.search(**reranked_request))
    stop_reranked = time.time()
    return {"binary_query_time": stop_reranked - start_reranked,
            "full_query_time": stop_dotprod - start_dotprod,
            "recall": len(set(full_results).intersection(set(binary_results))) / len(set(binary_results)),
            "binary_results": binary_results,
            "full_results": full_results}

In [220]:
query = "The Hobbit"
engine_scores = engine_rankings(query)
numpy_scores = numpy_rankings(query)
results = pandas.DataFrame(zip(engine_scores["binary_results"], numpy_scores["binary_results"],
                          engine_scores["full_results"], numpy_scores["full_results"]),
                          columns=["quantized solr", "quantized numpy",
                                   "dotprod solr", "dotprod numpy"])
print(f"Search engine binary search time: {engine_scores['binary_query_time']}")
print(f"Search engine full search time: {engine_scores['full_query_time']}")
print(f"Numpy binary search time: {numpy_scores['binary_query_time']}")
print(f"Numpy full search time: {numpy_scores['full_query_time']}")
results

Search engine binary search time: 0.030432939529418945
Search engine full search time: 0.0288541316986084
Numpy binary search time: 1.6173162460327148
Numpy full search time: 2.3195929527282715


Unnamed: 0,quantized solr,quantized numpy,dotprod solr,dotprod numpy
0,The Hobbit: The Desolation of Smaug,The Hobbit: The Desolation of Smaug,The Lord of the Rings: The Fellowship of the Ring,The Lord of the Rings: The Fellowship of the Ring
1,The Lord of the Rings: The Fellowship of the Ring,The Lord of the Rings: The Fellowship of the Ring,The Hobbit: An Unexpected Journey,The Hobbit: An Unexpected Journey
2,Klaus,The Hobbit: The Battle of the Five Armies,The Princess Bride,The Princess Bride
3,The Hobbit: The Desolation of Smaug,Klaus,The Hobbit: The Battle of the Five Armies,The Hobbit: The Battle of the Five Armies
4,The Hobbit: The Battle of the Five Armies,The Goonies,The Hobbit: The Battle of the Five Armies,The Hobbit: The Desolation of Smaug
5,The Goonies,The Hobbit: An Unexpected Journey,The Hobbit: An Unexpected Journey,The Lord of the Rings: The Two Towers
6,The Hobbit: The Battle of the Five Armies,Labyrinth,The Lord of the Rings: The Fellowship of the Ring,The Lord of the Rings: The Return of the King
7,Labyrinth,The Lord of the Rings: The Return of the King,The Hobbit: The Desolation of Smaug,Guardians of the Galaxy Vol. 2
8,The Hobbit: The Desolation of Smaug,Frozen II,The Lord of the Rings: The Fellowship of the Ring,The Last Samurai


In [221]:
random.seed(1234)

titles = column_list(movie_dataframe, "title")
random.shuffle(titles)

def mean_accuracy(f):
    return numpy.mean([f(q)["recall"] for q in tqdm.tqdm(titles[:25])])

print(f"Average quantized recall for numpy: {mean_accuracy(numpy_rankings)}")
print(f"Average quantized recall for engine: {mean_accuracy(engine_rankings)}")

100%|██████████| 25/25 [02:21<00:00,  5.66s/it]


Average quantized recall for numpy: 0.3250562238049114


100%|██████████| 25/25 [00:04<00:00,  5.25it/s]

Average quantized recall for engine: 0.5726311496493216





# Useful code

In [227]:
original_embeddings = numpy.array(column_list(movie_dataframe, "image_embedding")[0:1000]) #100 movie image embeddings

## Using the mxabi-embed-large-v1 model with SentenceTransfomer

In [222]:
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim

matryoshka_dim = 64
#model = SentenceTransformer("tomaarsen/mpnet-base-nli-matryoshka", truncate_dim=matryoshka_dim)
model = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1", truncate_dim=matryoshka_dim)

embeddings = model.encode(
    [
        "The weather is so nice!",
        "It's so sunny outside!",
        "He drove to the stadium.",
    ]
)
print(embeddings.shape)
# => (3, 64)

# Similarity of the first sentence to the other two:
similarities = cos_sim(embeddings[0], embeddings[1:])
print(similarities)

TypeError: SentenceTransformer.__init__() got an unexpected keyword argument 'truncate_dim'

## Scalar Quantization

In [None]:
#TODO: use SentenceTransformers instead of showing the internal code
from sentence_transformers import SentenceTransformer
from sentence_transformers.quantization import quantize_embeddings

int8_embeddings = quantize_embeddings(embeddings, precision="int8") #float32”, “int8”, “uint8”, “binary”, “ubinary”

## Binary Quantization

In [None]:
#TODO: use SentenceTransformers instead of showing the internal code
binary_embeddings = quantize_embeddings(embeddings, precision="binary") # Note: "binary" is np.packedbits and "ubinary" maps into uInt8. Note sure what we need here, but be aware we have both types available. #See: https://sbert.net/examples/applications/embedding-quantization/README.html

## Matroyoshka Learned Representations

In [None]:
#TODO: just divide the embedding by 1/2, 1/4, 1/8. Should'nt need anything fancy other than embeddings trained on the MRL-compatible model

## Product quantization

In [223]:
! pip install nanopq 
#move to requirements.txt w/ version

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.1.2[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [233]:
import nanopq
import numpy as np

embeddings = numpy.array(original_embeddings, dtype=np.float32) #convert to float32 from float64. May not be needed depending on if they're already in that format.

N = embeddings.shape[0] #documents
D = embeddings.shape[1] #dimensions/features
M=8 #number of subvectors 
print(embeddings.dtype)

# Instantiate with M=8 sub-spaces
pq = nanopq.PQ(M=M)

# Train codewords
pq.fit(embeddings)  #NOTE: this can be trained on a training set or a subset of the embeddings if this is too slow. It's doing Kmeans to generate clusters.

float32
M: 8, Ks: 256, metric : <class 'numpy.uint8'>, code_dtype: l2
iter: 20, seed: 123
Training the subspace: 0 / 8
Training the subspace: 1 / 8
Training the subspace: 2 / 8
Training the subspace: 3 / 8
Training the subspace: 4 / 8
Training the subspace: 5 / 8
Training the subspace: 6 / 8
Training the subspace: 7 / 8


<nanopq.pq.PQ at 0x7f648d4543d0>

In [234]:
# "Index documents": Encode to PQ-codes
quantized_documents = pq.encode(embeddings)  # with dtype=np.uint8

Encoding the subspace: 0 / 8
Encoding the subspace: 1 / 8
Encoding the subspace: 2 / 8
Encoding the subspace: 3 / 8
Encoding the subspace: 4 / 8
Encoding the subspace: 5 / 8
Encoding the subspace: 6 / 8
Encoding the subspace: 7 / 8


In [235]:

quantized_query = pq.encode(embeddings[0]) #grab any document that makes for a good example, but let's use the same example throughout all quantization types for consistency

# Results: create a distance table online, and compute Asymmetric Distance to each PQ-code 
dists = pq.dtable(quantized_query).adist(quantized_documents)

#TODO: sort results by dists

AssertionError: 

# IGNORE BELOW: Throw-away code / not necessarily functional

# Scalar Quantization

## Uniform Scalar Quantization

In [169]:
original_embeddings = numpy.array(column_list(movie_dataframe, "image_embedding")[0:1000]) #100 movie image embeddings

In [179]:
def print_comparison(original_embeddings, quantized_embeddings, dequantized_embeddings):
    print("Original:", f"[ {', '.join([str(emb) for emb in original_embeddings[0][0:4]]) + ' ... ' + ', '.join([str(emb) for emb in original_embeddings[0][-4:]])}]", "Memory Usage: ", sys.getsizeof(original_embeddings[0]))
    print("Quantized:", f"[ {', '.join([str(emb) for emb in quantized_embeddings[0][0:4]]) + ' ... ' + ', '.join([str(emb) for emb in quantized_embeddings[0][-4:]])}]", "Memory Usage: ", sys.getsizeof(quantized_embeddings[0]))
    print("Dequantized:", f"[ {', '.join([str(emb) for emb in dequantized_embeddings[0][0:4]]) + ' ... ' + ', '.join([str(emb) for emb in dequantized_embeddings[0][-4:]])}]", "Memory Usage: ", sys.getsizeof(dequantized_embeddings[0]))

    #    print("Quantized:", quantized_embeddings[0][0:4, "Memory Usage: ", sys.getsizeof(quantized[0]))
#    print("Dequantized:", dequantized[0])
    print("\nSimilarity, Original vs. Dequantized:", np.cos(original_embeddings[1],dequantized_embeddings[1]))

In [180]:
import numpy as np

def uniform_scalar_quantize(embeddings, bits):
    # Determine the range of the embeddings
    min_val, max_val = embeddings.min(), embeddings.max()
    
    # Calculate the step size
    step = (max_val - min_val) / (2**bits - 1)
    
    # Quantize the embeddings
    quantized = np.round((embeddings - min_val) / step)
    
    # Clip to ensure values are within the valid range
    quantized = np.clip(quantized, 0, 2**bits - 1)
    
    return quantized.astype(np.uint8 if bits <= 8 else np.uint16)

def uniform_scalar_dequantize(quantized, bits, original_min, original_max):
    # Calculate the step size
    step = (original_max - original_min) / (2**bits - 1)
    
    # Dequantize the embeddings
    dequantized = quantized * step + original_min
    
    return dequantized

# Example usage
#original_embeddings = np.random.rand(500, 10)  # 5 embeddings of dimension 10
#original_embeddings = column_list(movie_dataframe, "image_embedding")

#bits = 8  # Quantize to 8 bits

#quantized = uniform_scalar_quantize(original_embeddings, bits)
#dequantized = uniform_scalar_dequantize(quantized, bits, original_embeddings.min(), original_embeddings.max())


In [181]:
bits = 16  # Quantize to 8 bits

quantized = uniform_scalar_quantize(original_embeddings, bits)
dequantized = uniform_scalar_dequantize(quantized, bits, original_embeddings.min(), original_embeddings.max())

print_comparison(original_embeddings, quantized, dequantized)
#print("Original:", original_embeddings[0], "Memory Usage: ", sys.getsizeof(original_embeddings[0]))
#print("Quantized:", quantized[0], "Memory Usage: ", sys.getsizeof(quantized[0]))
#print("Dequantized:", dequantized[0])

#print("\nSimilarity, Original vs. Dequantized:", np.mean(np.cos(numpy.array(original_embeddings), numpy.array(dequantized))))

Original: [ 0.00393834233885761, -0.035241456162213435, -0.035581467904314476, -0.00575613596354904 ... -0.004695540596792066, 0.040802999896525756, 0.0049649511614913875, 0.008843208585659732] Memory Usage:  112
Quantized: [ 47159, 44945, 44925, 46611 ... 46671, 49243, 47217, 47436] Memory Usage:  112
Dequantized: [ 0.003936196123636382, -0.03523501888379843, -0.03558886906453407, -0.005759298828519999 ... -0.004697748286313197, 0.04080738495628955, 0.004962361647769642, 0.00883702112682494] Memory Usage:  112

Similarity, Original vs. Dequantized: [0.99999999 0.99991281 0.99960331 0.99997372 0.99928661 0.9999972
 0.9994595  0.99616831 0.99829246 0.99985146 0.99995043 0.99936638
 0.99795224 0.99997967 0.99992438 0.99947543 0.99995011 0.99997784
 0.99865064 0.99996072 0.99300141 1.         0.9997298  0.999988
 0.99993678 0.99969668 0.9992162  0.99984406 0.99996161 0.99928171
 0.99998916 0.99904102 0.99870128 0.99965787 0.9999936  0.99992066
 0.99980722 0.9973062  0.99999533 0.99474695 

## IGNORE Non-uniform Scalar Quantization

In [182]:
from sklearn.cluster import KMeans

def kmeans_scalar_quantize(embeddings, n_clusters):
    # Flatten the embeddings
    flat_embeddings = embeddings.reshape(-1, 1)
    
    # Perform k-means clustering
    kmeans = KMeans(n_clusters=n_clusters, n_init=10)
    kmeans.fit(flat_embeddings)
    
    # Quantize the embeddings
    quantized = kmeans.predict(flat_embeddings).reshape(embeddings.shape)
    
    return quantized, kmeans.cluster_centers_

def kmeans_scalar_dequantize(quantized, cluster_centers):
    return cluster_centers[quantized].reshape(quantized.shape)

# Example usage
n_clusters = 256  # 8-bit quantization

quantized, cluster_centers = kmeans_scalar_quantize(original_embeddings, n_clusters)
dequantized = kmeans_scalar_dequantize(quantized, cluster_centers)


In [183]:
print_comparison(original_embeddings, quantized, dequantized)

Original: [ 0.00393834233885761, -0.035241456162213435, -0.035581467904314476, -0.00575613596354904 ... -0.004695540596792066, 0.040802999896525756, 0.0049649511614913875, 0.008843208585659732] Memory Usage:  112
Quantized: [ 4, 103, 10, 228 ... 97, 64, 214, 27] Memory Usage:  112
Dequantized: [ 0.0039102465223891745, -0.03498032376666863, -0.036069270532962446, -0.005824379986281493 ... -0.004485301835488328, 0.04063635490584795, 0.004711785236143582, 0.008579395170028216] Memory Usage:  112

Similarity, Original vs. Dequantized: [0.99999999 0.99991281 0.99960331 0.99997372 0.99928661 0.9999972
 0.9994595  0.99616831 0.99829246 0.99985146 0.99995043 0.99936638
 0.99795224 0.99997967 0.99992438 0.99947543 0.99995011 0.99997784
 0.99865064 0.99996072 0.99300141 1.         0.9997298  0.999988
 0.99993678 0.99969668 0.9992162  0.99984406 0.99996161 0.99928171
 0.99998916 0.99904102 0.99870128 0.99965787 0.9999936  0.99992066
 0.99980722 0.9973062  0.99999533 0.99474695 0.99999291 0.999999

# Binary Quantization

## Just use 0 or support passing in threshold so median can be passed in - Simple Thresholding

In [184]:
import numpy as np

def binary_quantize_threshold(embeddings, threshold=None):
    if threshold is None:
        threshold = np.median(embeddings)
    return (embeddings > threshold).astype(np.uint8)

def binary_dequantize_threshold(binary_embeddings, original_min, original_max):
    return np.where(binary_embeddings, original_max, original_min)

# Example usage

#1: Midpoint = 0


#2 Midpoint = median
binary_quantized = binary_quantize(original_embeddings)
binary_dequantized = binary_dequantize(binary_quantized, 
                                       original_embeddings.min(), 
                                       original_embeddings.max())

In [185]:
print_comparison(original_embeddings, binary_quantized, binary_dequantized)

Original: [ 0.00393834233885761, -0.035241456162213435, -0.035581467904314476, -0.00575613596354904 ... -0.004695540596792066, 0.040802999896525756, 0.0049649511614913875, 0.008843208585659732] Memory Usage:  112
Quantized: [ 1, 0, 0, 0 ... 0, 1, 1, 1] Memory Usage:  112
Dequantized: [ 0.329053742183538, -0.8304248375419548, -0.8304248375419548, -0.8304248375419548 ... -0.8304248375419548, 0.329053742183538, 0.329053742183538, 0.329053742183538] Memory Usage:  112

Similarity, Original vs. Dequantized: [0.99999999 0.99991281 0.99960331 0.99997372 0.99928661 0.9999972
 0.9994595  0.99616831 0.99829246 0.99985146 0.99995043 0.99936638
 0.99795224 0.99997967 0.99992438 0.99947543 0.99995011 0.99997784
 0.99865064 0.99996072 0.99300141 1.         0.9997298  0.999988
 0.99993678 0.99969668 0.9992162  0.99984406 0.99996161 0.99928171
 0.99998916 0.99904102 0.99870128 0.99965787 0.9999936  0.99992066
 0.99980722 0.9973062  0.99999533 0.99474695 0.99999291 0.99999911
 0.99984901 0.99977668 0.9

## IGNORE - Iterative Quantization (ITQ)

In [186]:
from sklearn.decomposition import PCA
import numpy as np

def itq(data, num_iterations=50):
    # Center the data
    data = data - np.mean(data, axis=0)
    
    # Perform PCA
    pca = PCA(n_components=data.shape[1])
    data = pca.fit_transform(data)
    
    # Initialize random rotation
    R = np.random.randn(data.shape[1], data.shape[1])
    U, _, Vt = np.linalg.svd(R)
    R = U.dot(Vt)
    
    for i in range(num_iterations):
        # Fix R and update B
        Z = np.dot(data, R)
        B = np.sign(Z)
        
        # Fix B and update R
        UB, _, UAT = np.linalg.svd(np.dot(data.T, B))
        R = np.dot(UB, UAT)
    
    # Final binary codes
    Z = np.dot(data, R)
    B = np.sign(Z)
    
    return (B + 1) / 2  # Convert to 0 and 1

# Example usage
itq_quantized = itq(original_embeddings)
print("ITQ Quantized:", itq_quantized[0])

ITQ Quantized: [1. 1. 1. 1. 0. 0. 0. 1. 1. 0. 1. 1. 1. 1. 0. 0. 0. 1. 1. 1. 1. 1. 0. 1.
 1. 1. 1. 0. 0. 0. 1. 0. 0. 1. 1. 0. 0. 0. 0. 1. 1. 0. 1. 1. 1. 1. 0. 1.
 1. 0. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 0. 0. 1. 1. 0. 0. 0. 0. 1. 1. 1.
 1. 1. 0. 1. 0. 1. 0. 1. 1. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 1. 1. 1. 1.
 0. 1. 1. 1. 1. 1. 1. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 1. 1. 0. 1. 0. 1. 1.
 1. 1. 1. 0. 1. 1. 1. 0. 0. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 1.
 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 1. 1. 0. 1. 1. 1. 0. 0. 1. 1. 0.
 1. 0. 1. 0. 0. 1. 0. 0. 1. 1. 1. 0. 0. 1. 0. 0. 1. 1. 1. 1. 0. 1. 1. 0.
 1. 0. 1. 0. 1. 0. 1. 0. 1. 0. 0. 1. 1. 1. 0. 0. 1. 0. 1. 0. 1. 0. 0. 1.
 1. 1. 1. 1. 0. 1. 0. 1. 0. 0. 1. 0. 0. 1. 1. 1. 0. 1. 1. 0. 0. 1. 1. 1.
 0. 1. 0. 1. 0. 1. 0. 0. 1. 1. 1. 0. 0. 1. 0. 1. 1. 1. 1. 1. 0. 1. 1. 1.
 0. 1. 1. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 1. 1. 0. 0. 1. 1. 1. 0. 0. 1.
 0. 1. 0. 1. 0. 1. 0. 0. 0. 1. 1. 0. 1. 1. 1. 1. 0. 1. 0. 0. 1. 0. 0. 1.
 1. 0. 0. 1. 0. 1. 0. 0. 1. 0. 1. 1.

# Product Quantization

In [190]:
! pip install nanopq

[0mCollecting nanopq
  Downloading nanopq-0.2.1-py3-none-any.whl.metadata (4.2 kB)
Downloading nanopq-0.2.1-py3-none-any.whl (10 kB)
Installing collected packages: nanopq
Successfully installed nanopq-0.2.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.1.2[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [236]:
original_embeddings.shape[0]

1000

In [237]:
import nanopq
import numpy as np

embeddings = numpy.array(original_embeddings, dtype=np.float32)

N = embeddings.shape[0]
D = embeddings.shape[1]
M=8
X = numpy.array(original_embeddings, dtype=np.float32)
print(X.dtype)

#N, Nt, D = 10000, 2000, 128
#X = np.random.random((N, D)).astype(np.float32)  # 10,000 128-dim vectors to be indexed
#Xt = np.random.random((Nt, D)).astype(np.float32)  # 2,000 128-dim vectors for training

#query = np.random.random((D,)).astype(np.float32)  # a 128-dim query vector

# Instantiate with M=8 sub-spaces
pq = nanopq.PQ(M=M)

# Train codewords
pq.fit(X, seed=0) #seed for data consistency

float32
M: 8, Ks: 256, metric : <class 'numpy.uint8'>, code_dtype: l2
iter: 20, seed: 0
Training the subspace: 0 / 8
Training the subspace: 1 / 8
Training the subspace: 2 / 8
Training the subspace: 3 / 8
Training the subspace: 4 / 8
Training the subspace: 5 / 8
Training the subspace: 6 / 8
Training the subspace: 7 / 8


<nanopq.pq.PQ at 0x7f6549a06020>

In [249]:
# Encode to PQ-codes
quantized_documents = pq.encode(X)  # (10000, 8) with dtype=np.uint8
#print(quantized)

#query = pq.encode(embeddings[0]) #just grab the first movie embedding
query = embeddings[0]

# Results: create a distance table online, and compute Asymmetric Distance to each PQ-code 
dists = pq.dtable(query).adist(quantized_documents)  # (10000, )

#TODO: sort results
print(dists[:10]) #not currently sorted. I think these may be in order of cluster id. If so, will need a map of id:score and then to sort by score

Encoding the subspace: 0 / 8
Encoding the subspace: 1 / 8
Encoding the subspace: 2 / 8
Encoding the subspace: 3 / 8
Encoding the subspace: 4 / 8
Encoding the subspace: 5 / 8
Encoding the subspace: 6 / 8
Encoding the subspace: 7 / 8
[0.14254893 0.70412374 0.7292688  0.23975845 0.57965636 0.7138767
 0.56706476 0.7499395  0.74030113 0.6424299 ]


## Matroyoshka

In [250]:
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim

matryoshka_dim = 64
#model = SentenceTransformer("tomaarsen/mpnet-base-nli-matryoshka", truncate_dim=matryoshka_dim)
model = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1", truncate_dim=matryoshka_dim)

embeddings = model.encode(
    [
        "The weather is so nice!",
        "It's so sunny outside!",
        "He drove to the stadium.",
    ]
)
print(embeddings.shape)
# => (3, 64)

# Similarity of the first sentence to the other two:
similarities = cos_sim(embeddings[0], embeddings[1:])
print(similarities)

TypeError: SentenceTransformer.__init__() got an unexpected keyword argument 'truncate_dim'

## IGNORE

In [189]:
import numpy as np
from sklearn.cluster import KMeans

def product_quantize(embeddings, num_subvectors, bits_per_subvector):
    d = embeddings.shape[1]
    subvector_size = d // num_subvectors
    num_centroids = 2**bits_per_subvector
    
    quantized = np.zeros_like(embeddings, dtype=np.uint8)
    codebooks = []
    
    for i in range(num_subvectors):
        start = i * subvector_size
        end = (i + 1) * subvector_size
        subvector = embeddings[:, start:end]
        
        kmeans = KMeans(n_clusters=num_centroids, n_init=10)
        kmeans.fit(subvector)
        
        quantized[:, start:end] = kmeans.predict(subvector).reshape(-1, 1)
        codebooks.append(kmeans.cluster_centers_)
    
    return quantized, codebooks

def product_dequantize(quantized, codebooks):
    num_subvectors = len(codebooks)
    subvector_size = codebooks[0].shape[1]
    
    dequantized = np.zeros((quantized.shape[0], num_subvectors * subvector_size))
    
    for i in range(num_subvectors):
        start = i * subvector_size
        end = (i + 1) * subvector_size
        dequantized[:, start:end] = codebooks[i][quantized[:, start:end]]
    
    return dequantized

# Example usage
num_subvectors = 5
bits_per_subvector = 8

pq_quantized, codebooks = product_quantize(original_embeddings, num_subvectors, bits_per_subvector)
pq_dequantized = product_dequantize(pq_quantized, codebooks)


ValueError: could not broadcast input array from shape (1000,102,102) into shape (1000,102)

In [None]:
#print("Original:", original_embeddings[0])
#print("PQ Quantized:", pq_quantized[0])
#print("PQ Dequantized:", pq_dequantized[0])

#print("Similarity, Original vs. Dequantized:", np.dot(original_embeddings[0], pq_dequantized[0]))
print_comparisons(original_embeddings, pq_quantized, pq_dequantized)

In [None]:
import numpy as np
from sklearn.decomposition import PCA

def fit_itq(data, num_iterations=50):
    # Center the data
    mean = np.mean(data, axis=0)
    centered_data = data - mean
    
    # Perform PCA
    pca = PCA(n_components=data.shape[1])
    data_pca = pca.fit_transform(centered_data)
    
    # Initialize random rotation
    R = np.random.randn(data_pca.shape[1], data_pca.shape[1])
    U, _, Vt = np.linalg.svd(R)
    R = U.dot(Vt)
    
    for i in range(num_iterations):
        # Fix R and update B
        Z = np.dot(data_pca, R)
        B = np.sign(Z)
        
        # Fix B and update R
        UB, _, UAT = np.linalg.svd(np.dot(data_pca.T, B))
        R = np.dot(UB, UAT)
    
    return R, mean, pca

def itq_quantize(data, R, mean, pca):
    # Center and project the data
    centered_data = data - mean
    data_pca = pca.transform(centered_data)
    
    # Apply rotation and binarize
    Z = np.dot(data_pca, R)
    B = np.sign(Z)
    
    return (B + 1) / 2  # Convert to 0 and 1

def itq_dequantize(binary_codes, R, mean, pca):
    # Convert back to -1 and 1
    B = 2 * binary_codes - 1
    
    # Inverse rotation
    Z_approx = np.dot(B, R.T)
    
    # Inverse PCA
    data_approx = pca.inverse_transform(Z_approx)
    
    # Add back the mean
    return data_approx + mean

# Example usage
#original_embeddings = np.random.rand(100, 64)  # 100 embeddings of dimension 64

# Fit ITQ
R, mean, pca = fit_itq(original_embeddings, num_iterations=50)

In [None]:
# Quantize
itq_quantized = itq_quantize(original_embeddings, R, mean, pca)

# Dequantize
itq_dequantized = itq_dequantize(itq_quantized, R, mean, pca)

# Print results for the first embedding
#print("Original:", original_embeddings[0])
#print("ITQ Quantized:", itq_quantized[0])
#print("ITQ Dequantized:", itq_dequantized[0])

# Compute and print mean squared error
#mse = np.mean((original_embeddings - itq_dequantized) ** 2)
#print(f"Mean Squared Error: {mse}")

print_comparison(original_embeddings, itq_quantized, itq_dequantized)

In [207]:
!pip install sentence-transformers==2.7.0

[0mCollecting sentence-transformers==2.7.0
  Downloading sentence_transformers-2.7.0-py3-none-any.whl.metadata (11 kB)
Collecting transformers<5.0.0,>=4.34.0 (from sentence-transformers==2.7.0)
  Downloading transformers-4.44.2-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub>=0.15.1 (from sentence-transformers==2.7.0)
  Downloading huggingface_hub-0.24.6-py3-none-any.whl.metadata (13 kB)
Collecting tokenizers<0.20,>=0.19 (from transformers<5.0.0,>=4.34.0->sentence-transformers==2.7.0)
  Downloading tokenizers-0.19.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading sentence_transformers-2.7.0-py3-none-any.whl (171 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m171.5/171.5 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading transformers-4.44.2-py3-none-any.whl (9.5 MB)
[2K   [90m━━━━━

In [208]:
from sentence_transformers import SentenceTransformer
from sentence_transformers.quantization import quantize_embeddings

binary_embeddings = quantize_embeddings(embeddings, precision="binary")

ImportError: cannot import name 'quantization' from 'sentence_transformers' (/opt/conda/lib/python3.10/site-packages/sentence_transformers/__init__.py)

## IGNORE: Locally-adaptive quantization

In [209]:
import numpy as np

class LAVQ:
    def __init__(self, codebook_size, learning_rate=0.1):
        self.codebook_size = codebook_size
        self.learning_rate = learning_rate
        self.codebook = None
    
    def fit(self, data):
        # Initialize codebook with random samples from the data
        self.codebook = data[np.random.choice(data.shape[0], self.codebook_size, replace=False)]
    
    def quantize(self, data):
        quantized = np.zeros(data.shape[0], dtype=int)
        for i, vector in enumerate(data):
            # Find the closest codebook vector
            distances = np.sum((self.codebook - vector) ** 2, axis=1)
            closest_index = np.argmin(distances)
            quantized[i] = closest_index
            
            # Update the closest codebook vector
            self.codebook[closest_index] += self.learning_rate * (vector - self.codebook[closest_index])
        
        return quantized
    
    def dequantize(self, quantized):
        return self.codebook[quantized]

# Example usage
np.random.seed(42)  # for reproducibility
original_embeddings = np.random.rand(1000, 10)  # 1000 embeddings of dimension 10

lavq = LAVQ(codebook_size=256)  # 8-bit quantization
lavq.fit(original_embeddings)

In [210]:
# Quantize the embeddings
quantized = lavq.quantize(original_embeddings)

# Dequantize
dequantized = lavq.dequantize(quantized)

# Print results for the first embedding
print("Original:", original_embeddings[0])
print("Quantized Index:", quantized[0])
print("Dequantized:", dequantized[0])

# Compute and print mean squared error
mse = np.mean((original_embeddings - dequantized) ** 2)
print(f"Mean Squared Error: {mse}")

Original: [0.37454012 0.95071431 0.73199394 0.59865848 0.15601864 0.15599452
 0.05808361 0.86617615 0.60111501 0.70807258]
Quantized Index: 80
Dequantized: [0.32761169 0.87076841 0.76142373 0.62190297 0.17016318 0.14367201
 0.10737511 0.85391109 0.53493335 0.713197  ]
Mean Squared Error: 0.021941625251251977
