In [1]:
import os
import pickle

from pyspark import SparkConf
from aips.data_loaders.outdoors import load_dataframe
from aips import get_engine
from sentence_transformers import SentenceTransformer, SimilarityFunction
from sentence_transformers.util import cos_sim
from pyspark.sql import SparkSession
from aips import get_engine
import time
import numpy
import math

engine = get_engine()
#Recommended for making ALS run faster, if you have enough memory / cores allocated to docker
conf = SparkConf()
conf.set("spark.driver.memory", "8g")
conf.set("spark.executor.memory", "8g")
conf.set("spark.dynamicAllocation.enabled", "true")
conf.set("spark.dynamicAllocation.executorMemoryOverhead", "8g")
spark = SparkSession.builder.appName("AIPS-ch13").config(conf=conf).getOrCreate()

model = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1",
                            similarity_fn_name=SimilarityFunction.DOT_PRODUCT,
                            truncate_dim=1024)

#https://github.com/facebookresearch/faiss/wiki/Pre--and-post-processing
#https://github.com/facebookresearch/faiss/wiki
#https://huggingface.co/spaces/sentence-transformers/quantized-retrieval/blob/main/app.py

  from tqdm.autonotebook import tqdm, trange
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [2]:
![ ! -d 'outdoors' ] && git clone --depth=1 https://github.com/ai-powered-search/outdoors.git
! cd outdoors && git pull
! cd outdoors && cat outdoors.tgz.part* > outdoors.tgz
! cd outdoors && mkdir -p '../data/outdoors/' && tar -xvf outdoors.tgz -C '../data/outdoors/'

#outdoors_collection = engine.create_collection("outdoors")

Already up to date.
README.md
concepts.pickle
._guesses.csv
guesses.csv
._guesses_all.json
guesses_all.json
outdoors_concepts.pickle
outdoors_embeddings.pickle
._outdoors_golden_answers.csv
outdoors_golden_answers.csv
._outdoors_golden_answers.xlsx
outdoors_golden_answers.xlsx
._outdoors_golden_answers_20210130.csv
outdoors_golden_answers_20210130.csv
outdoors_labels.pickle
outdoors_question_answering_contexts.json
outdoors_questionanswering_test_set.json
outdoors_questionanswering_train_set.json
._posts.csv
posts.csv
predicates.pickle
pull_aips_dependency.py
._question-answer-seed-contexts.csv
question-answer-seed-contexts.csv
question-answer-squad2-guesses.csv
._roberta-base-squad2-outdoors
roberta-base-squad2-outdoors/
roberta-base-squad2-outdoors/._tokenizer_config.json
roberta-base-squad2-outdoors/tokenizer_config.json
roberta-base-squad2-outdoors/._special_tokens_map.json
roberta-base-squad2-outdoors/special_tokens_map.json
roberta-base-squad2-outdoors/._config.json
roberta-base-

### Note: I updated the get_embeddings method earlier in the chapter to have the same method signature, so no need to duplicate it in the manuscript. Ideally we'd load this in from a python file for reuse.

In [3]:
#model.stop_multi_process_pool(pool)
#pool = model.start_multi_process_pool()
#embeddings = model.encode(texts, convert_to_tensor=False).tolist()

def get_embeddings(texts, model, cache_name, ignore_cache=False):
    cache_file_name = f"data/embeddings/{cache_name}.pickle"
    if ignore_cache or not os.path.isfile(cache_file_name):        
        embeddings = model.encode(texts, normalize_embeddings=True)
        os.makedirs(os.path.dirname(cache_file_name), exist_ok=True)
        with open(cache_file_name, "wb") as fd:
            pickle.dump(embeddings, fd)
    else:
        with open(cache_file_name, "rb") as fd:
            embeddings = pickle.load(fd)
    return embeddings

## Boilerplate code for Quantization listings
### Generating embeddings and benchmark data

In [4]:
import faiss
from aips.data_loaders.outdoors import load_dataframe

def display_results(scores, ids, data):
    results = generate_search_results(scores, ids, data)
    display(results)
    return results

def get_outdoors_data():
    outdoors_dataframe = load_dataframe("data/outdoors/posts.csv")
    outdoors_data = list(outdoors_dataframe.rdd.map(lambda r: r.asDict()).collect())
    return outdoors_data

def calculate_outdoors_embeddings(model):
    outdoors_dataframe = load_dataframe("data/outdoors/posts.csv")
    post_texts = [post["title"] + " " + post["body"]
                  for post in outdoors_dataframe.collect()]
    return numpy.array(get_embeddings(post_texts, model, "outdoors_mrl_normed"))

def display_statistics(full_search_results, quantized_search_results, start_message="Recall"):
    index_name = quantized_search_results["index_name"]
    full_search_time = full_search_results["time_taken"]
    time_taken = quantized_search_results["time_taken"]
    time_imp = round((full_search_time - time_taken) * 100 / full_search_time, 2)
    quantized_size = quantized_search_results["size"]
    improvement_ms = f"({time_imp}% improvement)"
    improvement_size = f"({round((full_search_results['size'] - quantized_size) * 100 / full_search_results['size'], 2)}% improvement)"
    print(f"{index_name} search took: {time_taken:.3f} ms {improvement_ms}")
    print(f"{index_name} index size: {round(quantized_size / 1000000, 2)} MB {improvement_size}")
    recall = calculate_recall(full_search_results["results"], quantized_search_results["results"])
    print(f"{recall}: {str(round(sum(recall) / len(recall), 4))}")

def calculate_recall(scored_full_results, scored_quantized_results):
    recall = []
    for i in range(len(scored_full_results)):
        full_ids = [r["id"] for r in scored_full_results[i]]
        quantized_ids = [r["id"] for r in scored_quantized_results[i]]
        recall.append((len(set(full_ids).intersection(set(quantized_ids))) /
                       len(set(quantized_ids))))
    return recall

def generate_search_results(faiss_scores, faiss_ids):
    outdoors_data = get_outdoors_data()
    faiss_results = []
    for i in range(len(faiss_scores)):
        results = []
        for j, id in enumerate(faiss_ids[i]):
            id = int(id)
            result = {"score": faiss_scores[i][j],
                      "title": outdoors_data[id]["title"],
                      "body": outdoors_data[id]["body"],
                      "id": id}
            results.append(result)
        faiss_results.append(results)
    return faiss_results

In [5]:
#This will generate and cache the embeddings. Takes 2-3 hours typically
embeddings = calculate_outdoors_embeddings(model) 
print(embeddings.shape) #     => (18456, 1024)

outdoors_data = get_outdoors_data() 
print(len(outdoors_data)) #   => 18456

(18456, 1024)
18456


In [23]:
from sentence_transformers.quantization import quantize_embeddings
#from usearch.index import Index

def get_test_queries():
    return ["tent poles", "hiking trails", "mountain forests",
            "white water", "best waterfalls", "mountain biking",
            "snowboarding slopes", "bungee jumping", "public parks"]

def index_embeddings(embeddings, name, print_shape=True):
    if print_shape:
        print(f"{name} embeddings shape:", embeddings.shape)
    index = faiss.IndexFlatIP(embeddings.shape[1])
    index.add(embeddings)
    faiss.write_index(index, name)
    return index

def time_and_execute_search(index, index_name, query_embeddings, k=25):
    start_time = time.time()
    faiss_scores, faiss_ids = index.search(query_embeddings, k=k)
    time_taken = ((time.time() - start_time) * 1000)
    
    results = {"results": generate_search_results(faiss_scores, faiss_ids),
               "time_taken": time_taken, 
               "faiss_scores": faiss_scores, "faiss_ids": faiss_ids}
    index_stats = {}
    if index_name:
        index_stats ={"index_name": index_name,
                      "size": os.path.getsize(index_name)}
    return results | index_stats


def execute_full_search(embeddings, query_embeddings, k=25,
                        index_name="full_out_embs"):      
    full_index = index_embeddings(embeddings, index_name, print_shape=False)
    return time_and_execute_search(full_index, index_name, query_embeddings, k=k)

def evaluate_search(index, index_name, query_embeddings, quantized_query_embeddings,
                    k=25, display=True, log=False):
    embeddings = calculate_outdoors_embeddings(model)
    full_results = execute_full_search(embeddings, query_embeddings, k=k)
    quantized_results = time_and_execute_search(index, index_name,
                                                quantized_query_embeddings, k=k)
    if display:
        display_statistics(full_results, quantized_results)
    return quantized_results, full_results

def evaluate_rerank_search(index, query_embeddings,
                           quantized_embeddings, k=100, limit=25):
    results, full_results = evaluate_search(index, None, query_embeddings,
                                            quantized_embeddings, display=False, k=k)
    
    embeddings = calculate_outdoors_embeddings(model)
    rescore_scores, rescore_ids = [], []
    for i in range(len(results["results"])):
        embedding_ids = results["faiss_ids"][i]
        top_k_embeddings = [embeddings[id] for id in embedding_ids]
        query_embedding = query_embeddings[i]
        scores = query_embedding @ numpy.array(top_k_embeddings).T
        indices = scores.argsort()[::-1][:limit]
        top_k_indices = embedding_ids[indices]
        top_k_scores = scores[indices]
        rescore_scores.append(top_k_scores)
        rescore_ids.append(top_k_indices)

    results = generate_search_results(rescore_scores, rescore_ids)
    recall = calculate_recall(full_results["results"], results)
    print("\n", f"Reranked recall; {recall}")

In [24]:
model = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1",
                            similarity_fn_name=SimilarityFunction.DOT_PRODUCT,
                            truncate_dim=1024)
embeddings = calculate_outdoors_embeddings(model)
queries = get_test_queries()
query_embeddings = model.encode(queries, convert_to_numpy=True,
                                normalize_embeddings=True)

## Listing 13.21
### int8 quantization

In [25]:
def index_int8_embeddings(embeddings, name):
    embeddings = quantize_embeddings(embeddings, precision="int8")
    print("Int8 embeddings shape:", embeddings.shape)
    index = faiss.IndexScalarQuantizer(embeddings.shape[1],
                                       faiss.ScalarQuantizer.QT_8bit, )
    index.train(embeddings)
    index.add(embeddings)
    faiss.write_index(index, name)
    return index

int8_index_name = "int8_out_embs"
int8_index = index_int8_embeddings(embeddings, int8_index_name)

quantized_queries = quantize_embeddings(query_embeddings,
                                        calibration_embeddings=embeddings,
                                        precision="int8")
evaluate_search(int8_index, "int8_out_embs", query_embeddings, quantized_queries)
evaluate_rerank_search(int8_index, query_embeddings, quantized_queries)

Int8 embeddings shape: (18456, 1024)


## Listing 13.22
### Binary Quantization

In [9]:
def index_binary_embeddings(embeddings, binary_index_name):
    binary_embeddings = quantize_embeddings(embeddings,
                                    precision="binary").astype(numpy.uint8)
    print("Binary embeddings shape:", binary_embeddings.shape)
    index = faiss.IndexBinaryFlat(binary_embeddings.shape[1] * 8)
    index.add(binary_embeddings)
    faiss.write_index_binary(index, binary_index_name)
    return index

#queries = get_test_queries()
#embeddings = calculate_outdoors_embeddings(model)

query_embeddings = model.encode(queries, convert_to_numpy=True, normalize_embeddings=True)
quantized_queries = quantize_embeddings(query_embeddings,
                        calibration_embeddings=embeddings,
                                       precision="binary").astype(numpy.uint8)

binary_index = index_binary_embeddings(embeddings, "binary_out_embs")
evaluate_search(binary_index, "binary_out_embs", query_embeddings, quantized_queries)
evaluate_rerank_search(binary_index, "binary_out_embs", query_embeddings, quantized_queries)

Binary embeddings shape: (18456, 128)
binary_out_embs search took: 0.807 ms (87.11% improvement)
binary_out_embs index size: 2.36 MB (96.87% improvement)
Recall: 0.6044
binary_out_embs search took: 0.951 ms (84.75% improvement)
binary_out_embs index size: 2.36 MB (96.87% improvement)
Recall: 0.6644
9 [{'score': 0.7794986, 'title': '', 'body': "Okay, it is a bit hard to tell the model, so here is the process you'll need. Stake the tent out. Any old stakes will do. With a semi-rigid pole (see below) push this pole through the loops/guides until it is taut. Mark and measure. Once you measure you can order poles from any of a number of sites that sell custom poles. Semi-rigid poles that can work (depends on tent and diameter of pole) Ratan/Bamboo Fiberglass - You can buy very inexpensive fiberglass rods, just google around.", 'id': 9908}, {'score': 0.77647805, 'title': '', 'body': 'Contact Tent Pole Technologies . They make custom tent poles and can do so based off of the pole specs you ha

## Listing 13.23
### Matroyoshka Learned Representations

In [10]:
#queries = get_test_queries()
#embeddings = calculate_outdoors_embeddings(model)
#query_embeddings = model.encode(queries, convert_to_numpy=True, normalize_embeddings=True)
for slice in [512, 256, 128]:    
    scaled_embeddings = numpy.array(list(map(lambda e: e[:slice], embeddings)))
    quantized_queries = numpy.array(list(map(lambda qe: qe[:slice], query_embeddings)))
    index_name = f"mrl_out_embs_{slice}"
    index = index_embeddings(scaled_embeddings, index_name)
    evaluate_search(index, index_name, query_embeddings, quantized_queries)
    evaluate_rerank_search(index, index_name, query_embeddings, quantized_queries)
    print("\n")

mrl_out_embs_512 embeddings shape: (18456, 512)
mrl_out_embs_512 search took: 3.467 ms (37.91% improvement)
mrl_out_embs_512 index size: 37.8 MB (50.0% improvement)
Recall: 0.7022
mrl_out_embs_512 search took: 3.138 ms (43.51% improvement)
mrl_out_embs_512 index size: 37.8 MB (50.0% improvement)
Recall: 0.7267
9 [{'score': 0.7794986, 'title': '', 'body': "Okay, it is a bit hard to tell the model, so here is the process you'll need. Stake the tent out. Any old stakes will do. With a semi-rigid pole (see below) push this pole through the loops/guides until it is taut. Mark and measure. Once you measure you can order poles from any of a number of sites that sell custom poles. Semi-rigid poles that can work (depends on tent and diameter of pole) Ratan/Bamboo Fiberglass - You can buy very inexpensive fiberglass rods, just google around.", 'id': 9908}, {'score': 0.77647805, 'title': '', 'body': 'Contact Tent Pole Technologies . They make custom tent poles and can do so based off of the pole 

## Listing 13.24
### Product quantizationQ

In [11]:
def index_pq_embeddings(embeddings, index_name):
    dimensions = embeddings.shape[1]
    num_subquantizers = 16
    bits_per_subvector = 8
    index = faiss.IndexPQ(dimensions, num_subquantizers, bits_per_subvector)
    index.train(embeddings)
    index.add(embeddings)   
    faiss.write_index(index, index_name) # Commit the index to disk
    return index


queries = get_test_queries()
embeddings = calculate_outdoors_embeddings(model)
query_embeddings = model.encode(queries, convert_to_numpy=True, normalize_embeddings=True)

index_name = "pq_out_embs"
index = index_pq_embeddings(embeddings, index_name)
evaluate_search(index, index_name, query_embeddings, query_embeddings)
evaluate_rerank_search(index, index_name, query_embeddings, query_embeddings)

pq_out_embs search took: 1.081 ms (87.01% improvement)
pq_out_embs index size: 1.34 MB (98.22% improvement)
Recall: 0.3333
pq_out_embs search took: 1.194 ms (86.86% improvement)
pq_out_embs index size: 1.34 MB (98.22% improvement)
Recall: 0.4278
9 [{'score': 0.7794986, 'title': '', 'body': "Okay, it is a bit hard to tell the model, so here is the process you'll need. Stake the tent out. Any old stakes will do. With a semi-rigid pole (see below) push this pole through the loops/guides until it is taut. Mark and measure. Once you measure you can order poles from any of a number of sites that sell custom poles. Semi-rigid poles that can work (depends on tent and diameter of pole) Ratan/Bamboo Fiberglass - You can buy very inexpensive fiberglass rods, just google around.", 'id': 9908}, {'score': 0.77647805, 'title': '', 'body': 'Contact Tent Pole Technologies . They make custom tent poles and can do so based off of the pole specs you have already listed. Many in the outdoor industry (EMS, 

## Listing 13.25
### Quantization and reranking: collection/engine implementation

In [13]:
from pyspark.sql.types import Row, ArrayType, FloatType, StructField, StructType, StringType, ByteType
import faiss
from aips.data_loaders.outdoors import load_dataframe

def calculate_outdoors_embeddingss(model):
    outdoors_dataframe = load_dataframe("data/outdoors/posts.csv")
    post_texts = [post["title"] + " " + post["body"]
                  for post in outdoors_dataframe.collect()]
    embeddings = get_embeddings(post_texts, model, "outdoors_mrl_normed")
    outdoors_data = list(outdoors_dataframe.rdd.map(lambda r: r.asDict()).collect())
    quantized_embeddings = quantize_embeddings(embeddings,
                        calibration_embeddings=embeddings,
                                       precision="binary").astype(numpy.uint8)
    for i in range(len(outdoors_data)):
        outdoors_data[i]["text_embedding"] = embeddings[i].tolist()
        print(quantized_embeddings[i].shape)
        outdoors_data[i]["binary_text_embedding"] = quantized_embeddings[i].tolist()
    return outdoors_data

def build_engine_quantization_index():
    outdoors_data = calculate_outdoors_embeddingss(model)
    schema = StructType([StructField("title", StringType()),
                         StructField("body", StringType()),
                         StructField("text_embedding", ArrayType(FloatType())),
                         StructField("binary_text_embedding", ArrayType(ByteType()))])
    outdoors_dataframe = spark.createDataFrame(
        [Row(title=x["title"], body=x["body"],
             text_embedding=x["text_embedding"],
             binary_text_embedding=x["binary_text_embedding"])
             for x in outdoors_data], schema=schema)
    embeddings_collection = engine.create_collection("outdoors_quantization")
    embeddings_collection.write(outdoors_dataframe)
    return embeddings_collection

def calculate_search_engine_rerank_recall(collection, full_request, binary_request):
    #full_start_time = time.time()
    #full_results = collection.search(**full_request)
    #full_time_taken = ((time.time() - full_start_time) * 1000)

    #binary_start_time = time.time()
    #print(binary_start_time)
    #binary_results = collection.search(**binary_request)
    #binary_time_taken = ((time.time() - binary_start_time) * 1000)
    #print(time.time())

    full_results = collection.search(**full_request)["docs"]
    binary_results = collection.search(**binary_request)["docs"]
    full_ids = [r["id"] for r in full_results]
    quantized_ids = [r["id"] for r in binary_results]
    recall = (len(set(full_ids).intersection(set(quantized_ids))) /
              len(set(quantized_ids)))
    print(recall)
    return recall
    #print(full_time_taken, binary_time_taken)

def search_request(query_vector, query_field,
                   rerank_vector=None, rerank_query_field=None,
                   quantization_size=None, k=1000, limit=25):
    request = {"query": query_vector,
               "query_fields": [query_field],
               "return_fields": ["title", "body", "id","score"],
               "limit": limit,
               "k": k,
               "quantization_type": quantization_size,}
    if rerank_vector is not None and rerank_query_field:
        request["rerank_query"] = {"query": rerank_vector,
                                   "query_fields": [rerank_query_field],
                                   "k": k}
    return request

collection = build_engine_quantization_index()
#collection = engine.get_collection("outdoors_quantization")
query_embeddings = model.encode(queries, convert_to_numpy=True, normalize_embeddings=True)
quantized_queries = quantize_embeddings(query_embeddings,
                        calibration_embeddings=embeddings,
                                        precision="binary")
total_recall = 0
for i, query in enumerate(query_embeddings):
    full_request = search_request(query.tolist(), "text_embedding")
    binary_request = search_request(query_vector=quantized_queries[i].tolist(),
                                    query_field="binary_text_embedding",
                                    rerank_vector=query.tolist(),
                                    rerank_query_field="text_embedding",
                                    quantization_size="BINARY", limit=250)
    total_recall += calculate_search_engine_rerank_recall(
                  collection, full_request, binary_request)

print(f"Search engine rerank recall: {total_recall / len(query_embeddings)}")

(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)
(128,)

ValueError: element in array field binary_text_embedding: object of ByteType out of range, got: 232