In [1]:
import os
import pickle
import time
import numpy
from pyspark import SparkConf
from sentence_transformers import SentenceTransformer, SimilarityFunction
from pyspark.sql import SparkSession
from aips import get_engine

engine = get_engine()

#Recommended for making processing faster, if you have enough memory / cores allocated to docker
conf = SparkConf()
conf.set("spark.driver.memory", "8g")
conf.set("spark.executor.memory", "8g")
conf.set("spark.dynamicAllocation.enabled", "true")
conf.set("spark.dynamicAllocation.executorMemoryOverhead", "8g")
spark = SparkSession.builder.appName("AIPS-ch13").config(conf=conf).getOrCreate()

model = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1",
                            similarity_fn_name=SimilarityFunction.DOT_PRODUCT,
                            truncate_dim=1024)

  from tqdm.autonotebook import tqdm, trange


In [2]:
#Some helpful documentation and reference material

#https://github.com/facebookresearch/faiss/wiki/Pre--and-post-processing
#https://github.com/facebookresearch/faiss/wiki
#https://huggingface.co/spaces/sentence-transformers/quantized-retrieval/blob/main/app.py

#multi threading sentence_transformers_model
#model.stop_multi_process_pool(pool)
#pool = model.start_multi_process_pool()
#embeddings = model.encode(texts, convert_to_tensor=False).tolis

In [3]:
![ ! -d 'outdoors' ] && git clone --depth=1 https://github.com/ai-powered-search/outdoors.git
! cd outdoors && git pull
! cd outdoors && cat outdoors.tgz.part* > outdoors.tgz
! cd outdoors && mkdir -p '../data/outdoors/' && tar -xvf outdoors.tgz -C '../data/outdoors/'

Already up to date.
README.md
concepts.pickle
._guesses.csv
guesses.csv
._guesses_all.json
guesses_all.json
outdoors_concepts.pickle
outdoors_embeddings.pickle
._outdoors_golden_answers.csv
outdoors_golden_answers.csv
._outdoors_golden_answers.xlsx
outdoors_golden_answers.xlsx
._outdoors_golden_answers_20210130.csv
outdoors_golden_answers_20210130.csv
outdoors_labels.pickle
outdoors_question_answering_contexts.json
outdoors_questionanswering_test_set.json
outdoors_questionanswering_train_set.json
._posts.csv
posts.csv
predicates.pickle
pull_aips_dependency.py
._question-answer-seed-contexts.csv
question-answer-seed-contexts.csv
question-answer-squad2-guesses.csv
._roberta-base-squad2-outdoors
roberta-base-squad2-outdoors/
roberta-base-squad2-outdoors/._tokenizer_config.json
roberta-base-squad2-outdoors/tokenizer_config.json
roberta-base-squad2-outdoors/._special_tokens_map.json
roberta-base-squad2-outdoors/special_tokens_map.json
roberta-base-squad2-outdoors/._config.json
roberta-base-

In [4]:
def get_embeddings(texts, model, cache_name, ignore_cache=False):
    cache_file_name = f"data/embeddings/{cache_name}.pickle"
    if ignore_cache or not os.path.isfile(cache_file_name):        
        embeddings = model.encode(texts, normalize_embeddings=True)
        os.makedirs(os.path.dirname(cache_file_name), exist_ok=True)
        with open(cache_file_name, "wb") as fd:
            pickle.dump(embeddings, fd)
    else:
        with open(cache_file_name, "rb") as fd:
            embeddings = pickle.load(fd)
    return embeddings

## Boilerplate code for Quantization listings
### Generating embeddings and benchmark data

In [5]:
import faiss
from aips.data_loaders.outdoors import load_dataframe

def display_results(scores, ids, data):
    results = generate_search_results(scores, ids, data)
    display(results)
    return results

def get_outdoors_data():
    outdoors_dataframe = load_dataframe("data/outdoors/posts.csv")
    outdoors_data = list(outdoors_dataframe.rdd.map(lambda r: r.asDict()).collect())
    return outdoors_data

def display_statistics(search_results, baseline_search_results=None, start_message="Recall"):
    index_name = search_results["index_name"]
    time_taken = search_results["time_taken"]
    index_size = search_results["size"]      
    improvement_ms = ""
    improvement_size = ""
    recall = 1.0
    if baseline_search_results:
        full_search_time = baseline_search_results["time_taken"]
        time_imp = round((full_search_time - time_taken) * 100 / full_search_time, 2)
        improvement_ms = f" ({time_imp}% improvement)"
        improvement_size = f" ({round((baseline_search_results['size'] - index_size) * 100 / baseline_search_results['size'], 2)}% improvement)"
        recall = calculate_recall(baseline_search_results["results"], search_results["results"])
            
    print(f"{index_name} search took: {time_taken:.3f} ms{improvement_ms}")
    print(f"{index_name} index size: {round(index_size / 1000000, 2)} MB{improvement_size}")
    print(f"{start_message}: {round(recall, 4)}")

def calculate_recall(scored_full_results, scored_quantized_results):
    recalls = []
    for i in range(len(scored_full_results)):
        full_ids = [r["id"] for r in scored_full_results[i]]
        quantized_ids = [r["id"] for r in scored_quantized_results[i]]
        recalls.append((len(set(full_ids).intersection(set(quantized_ids))) /
                       len(set(quantized_ids))))
    return sum(recalls) / len(recalls)

def generate_search_results(faiss_scores, faiss_ids):
    outdoors_data = get_outdoors_data()
    faiss_results = []
    for i in range(len(faiss_scores)):
        results = []
        for j, id in enumerate(faiss_ids[i]):
            id = int(id)
            result = {"score": faiss_scores[i][j],
                      "title": outdoors_data[id]["title"],
                      "body": outdoors_data[id]["body"],
                      "id": id}
            results.append(result)
        faiss_results.append(results)
    return faiss_results

def time_and_execute_search(index, index_name, query_embeddings, k=25, num_runs=100):
    search_times = []
    faiss_scores = None 
    faiss_ids = None
    
    for i in range(num_runs):
        start_time = time.time()
        faiss_scores, faiss_ids = index.search(query_embeddings, k=k)
        time_taken = ((time.time() - start_time) * 1000)
        search_times.append(time_taken)

    results = {"results": generate_search_results(faiss_scores, faiss_ids),
               "time_taken": numpy.average(search_times), 
               "faiss_scores": faiss_scores, "faiss_ids": faiss_ids}
    index_stats = {}
    if index_name:
        index_stats ={"index_name": index_name,
                      "size": os.path.getsize(index_name)}
    return results | index_stats

## Listing 13.20

### Indexing full-precision embeddings using FAISS

In [6]:
from sentence_transformers.quantization import quantize_embeddings

model = SentenceTransformer(
          "mixedbread-ai/mxbai-embed-large-v1",
          similarity_fn_name=SimilarityFunction.DOT_PRODUCT,
          truncate_dim=1024)

def index_full_precision_embeddings(doc_embeddings, name):
    index = faiss.IndexFlatIP(doc_embeddings.shape[1])
    index.add(doc_embeddings)
    faiss.write_index(index, name)
    return index

def get_outdoors_embeddings(model):
    outdoors_dataframe = load_dataframe("data/outdoors/posts.csv")
    post_texts = [post["title"] + " " + post["body"]
                  for post in outdoors_dataframe.collect()]
    #TODO: This will take 2-5 hours to run the first time if not cached. Upload to github to save readers hassle.
    return numpy.array(
        get_embeddings(post_texts, model, "outdoors_mrl_normed"))

doc_embeddings = get_outdoors_embeddings(model) #takes 2-3 hours if not cached
full_index = index_full_precision_embeddings(doc_embeddings,
                                          "full_embeddings")

## Listing 13.21

### Generating full-precision query embeddings

In [7]:
def get_test_queries():
    return["tent poles", "hiking trails", "mountain forests",
           "white water", "best waterfalls", "mountain biking",
           "snowboarding slopes", "bungee jumping", "public parks"]

queries = get_test_queries()
query_embeddings = model.encode(queries,
                  convert_to_numpy=True,
              normalize_embeddings=True)

full_results = time_and_execute_search(
    full_index, "full_embeddings",
    query_embeddings, k=25)
display_statistics(full_results)

full_embeddings search took: 6.279 ms
full_embeddings index size: 75.6 MB
Recall: 1.0


## Listing 13.22

### Functions to benchmark quantized search approaches

In [8]:
def evaluate_search(full_index, optimized_index, optimized_index_name,
                    query_embeddings, optimized_query_embeddings,
                    k=25, display=True, log=False):
    full_results = time_and_execute_search(full_index, "full_embeddings",
                                           query_embeddings, k=k)
    optimized_results = time_and_execute_search(optimized_index, optimized_index_name,
                                                optimized_query_embeddings, k=k)
    if display:
        display_statistics(optimized_results, full_results)
    return optimized_results, full_results

def evaluate_rerank_search(full_index, optimized_index,
                           query_embeddings,
                           optimized_embeddings,
                           k=50, limit=25):
    results, full_results = evaluate_search(full_index, optimized_index, None, query_embeddings,
                                            optimized_embeddings, display=False, k=k)
    
    doc_embeddings = get_outdoors_embeddings(model) #This can point to a cheap on-disk data source containing the original full-precision embeddings
    rescore_scores, rescore_ids = [], []
    for i in range(len(results["results"])):
        embedding_ids = results["faiss_ids"][i]
        top_k_embeddings = [doc_embeddings[id] for id in embedding_ids]
        query_embedding = query_embeddings[i]
        scores = query_embedding @ numpy.array(top_k_embeddings).T
        indices = scores.argsort()[::-1][:limit]
        top_k_indices = embedding_ids[indices]
        top_k_scores = scores[indices]
        rescore_scores.append(top_k_scores)
        rescore_ids.append(top_k_indices)

    results = generate_search_results(rescore_scores, rescore_ids)
    recall = calculate_recall(full_results["results"], results)
    print(f"Reranked recall: {round(recall, 4)}")

## Listing 13.23
### int8 quantization

In [9]:
def index_int8_embeddings(doc_embeddings, name):
    int8_embeddings = quantize_embeddings(doc_embeddings,
                                          precision="int8")
    print("Int8 embeddings shape:", int8_embeddings.shape)
    index = faiss.IndexScalarQuantizer(int8_embeddings.shape[1],
                                       faiss.ScalarQuantizer.QT_8bit)
    index.train(int8_embeddings)
    index.add(int8_embeddings)
    faiss.write_index(index, name)
    return index

int8_index_name = "int8_embeddings"
int8_index = index_int8_embeddings(doc_embeddings, int8_index_name)

quantized_queries = quantize_embeddings(query_embeddings,
                   calibration_embeddings=doc_embeddings,
                                        precision="int8")

evaluate_search(full_index, int8_index, int8_index_name,
                query_embeddings, quantized_queries)
evaluate_rerank_search(full_index, int8_index,
                       query_embeddings, quantized_queries)

Int8 embeddings shape: (18456, 1024)
int8_embeddings search took: 8.827 ms (-47.13% improvement)
int8_embeddings index size: 18.91 MB (74.99% improvement)
Recall: 0.9289
Reranked recall: 1.0


## Listing 13.24
### Binary Quantization

In [10]:
def index_binary_embeddings(doc_embeddings, binary_index_name):
    binary_embeddings = quantize_embeddings(doc_embeddings,
                                    precision="binary").astype(numpy.uint8)
    print("Binary embeddings shape:", binary_embeddings.shape)
    index = faiss.IndexBinaryFlat(binary_embeddings.shape[1] * 8)
    index.add(binary_embeddings)
    faiss.write_index_binary(index, binary_index_name)
    return index

binary_index_name = "binary_embeddings"
binary_index = index_binary_embeddings(doc_embeddings, binary_index_name)

quantized_queries = quantize_embeddings(query_embeddings,
                   calibration_embeddings=doc_embeddings,
                                      precision="binary").astype(numpy.uint8)

evaluate_search(full_index, binary_index, binary_index_name,
                query_embeddings, quantized_queries)
evaluate_rerank_search(full_index, binary_index,
                       query_embeddings, quantized_queries)

Binary embeddings shape: (18456, 128)
binary_embeddings search took: 0.395 ms (93.38% improvement)
binary_embeddings index size: 2.36 MB (96.87% improvement)
Recall: 0.6044
Reranked recall: 1.0


## Listing 13.25
### Product Quantization

In [11]:
def index_pq_embeddings(doc_embeddings, index_name, num_subvectors=16):
    dimensions = doc_embeddings.shape[1]
    M = num_subvectors
    num_bits = 8
    index = faiss.IndexPQ(dimensions, M, num_bits)
    index.train(doc_embeddings)
    index.add(doc_embeddings)   
    faiss.write_index(index, index_name) # Commit the index to disk
    return index

pq_index_name = "pq_embeddings"
pq_index = index_pq_embeddings(doc_embeddings, pq_index_name)

evaluate_search(full_index, pq_index, pq_index_name, query_embeddings, query_embeddings)
evaluate_rerank_search(full_index, pq_index, query_embeddings, query_embeddings)

#TODO: consider adding IVFFlatPQ as optimization for speed to show at end

pq_embeddings search took: 0.542 ms (90.95% improvement)
pq_embeddings index size: 1.34 MB (98.22% improvement)
Recall: 0.3333
Reranked recall: 0.68


### Run again with 64 subvectors instead of 16

In [12]:
pq_index = index_pq_embeddings(doc_embeddings, pq_index_name, num_subvectors=64)
evaluate_search(full_index, pq_index, pq_index_name, query_embeddings, query_embeddings)
evaluate_rerank_search(full_index, pq_index, query_embeddings, query_embeddings)

pq_embeddings search took: 1.916 ms (69.19% improvement)
pq_embeddings index size: 2.23 MB (97.05% improvement)
Recall: 0.5778
Reranked recall: 0.9911


## Listing 13.26
### Matryoshka Representations Learning

In [13]:
def get_mrl_embeddings(embeddings, num_dimensions):
    mrl_embeddings = numpy.array(
        list(map(lambda e: e[:num_dimensions], embeddings)))
    return mrl_embeddings

def index_mrl_embeddings(doc_embeddings, num_dimensions, mrl_index_name):
    mrl_doc_embeddings = get_mrl_embeddings(doc_embeddings, num_dimensions)
    print(f"{mrl_index_name} embeddings shape:", mrl_doc_embeddings.shape)
    mrl_index = index_full_precision_embeddings(mrl_doc_embeddings,
                                                    mrl_index_name)
    return mrl_index

print(f"Original embeddings shape: {doc_embeddings.shape}")
original_dimensions = doc_embeddings.shape[1] #1024

for num_dimensions in [original_dimensions//2, #512
                       original_dimensions//4, #256
                       original_dimensions//8]: #128

    mrl_index_name = f"mrl_embeddings_{num_dimensions}"
    mrl_index = index_mrl_embeddings(doc_embeddings, num_dimensions, mrl_index_name)
    mrl_queries = get_mrl_embeddings(query_embeddings, num_dimensions)
    print("\n", end="")
    
    evaluate_search(full_index, mrl_index, mrl_index_name,
                            query_embeddings, mrl_queries)
    evaluate_rerank_search(full_index, mrl_index,
                   query_embeddings, mrl_queries)

Original embeddings shape: (18456, 1024)
mrl_embeddings_512 embeddings shape: (18456, 512)

mrl_embeddings_512 search took: 2.992 ms (51.56% improvement)
mrl_embeddings_512 index size: 37.8 MB (50.0% improvement)
Recall: 0.7022
Reranked recall: 1.0
mrl_embeddings_256 embeddings shape: (18456, 256)

mrl_embeddings_256 search took: 1.309 ms (79.63% improvement)
mrl_embeddings_256 index size: 18.9 MB (75.0% improvement)
Recall: 0.4756
Reranked recall: 0.9689
mrl_embeddings_128 embeddings shape: (18456, 128)

mrl_embeddings_128 search took: 0.572 ms (90.4% improvement)
mrl_embeddings_128 index size: 9.45 MB (87.5% improvement)
Recall: 0.2489
Reranked recall: 0.64


## Listing 13.27

### Combining techniques

In [14]:
def index_binary_ivf_mrl_embeddings(reduced_mrl_doc_embeddings, binary_index_name):    
    #Binary quantization
    binary_embeddings = quantize_embeddings(reduced_mrl_doc_embeddings,
                        calibration_embeddings=reduced_mrl_doc_embeddings,
                        precision="binary").astype(numpy.uint8)
    dimensions = reduced_mrl_doc_embeddings.shape[1]
    quantizer = faiss.IndexBinaryFlat(dimensions)

    #ANN: IVF Flat Algorithm
    num_clusters = 256
    index = faiss.IndexBinaryIVF(
        quantizer, dimensions, num_clusters)
    index.nprobe = 4

    index.train(binary_embeddings)
    index.add(binary_embeddings)
    faiss.write_index_binary(index, binary_index_name)
    return index

mrl_dimensions = doc_embeddings.shape[1] // 2  #MRL 1024 => 512 dimensions
reduced_mrl_doc_embeddings =  get_mrl_embeddings(
    doc_embeddings, mrl_dimensions)

binary_ivf_mrl_index_name = "binary_ivf_mrl_embeddings"
binary_ivf_mrl_index = index_binary_ivf_mrl_embeddings(
    reduced_mrl_doc_embeddings, binary_ivf_mrl_index_name)

mrl_queries = get_mrl_embeddings(query_embeddings, 
                                  mrl_dimensions)
quantized_queries = quantize_embeddings(mrl_queries,
      calibration_embeddings=reduced_mrl_doc_embeddings,
      precision="binary").astype(numpy.uint8)

evaluate_search(full_index, binary_ivf_mrl_index,
    binary_ivf_mrl_index_name, 
    query_embeddings, quantized_queries)
evaluate_rerank_search(
    full_index, binary_ivf_mrl_index,
    query_embeddings, quantized_queries)

binary_ivf_mrl_embeddings search took: 0.122 ms (98.04% improvement)
binary_ivf_mrl_embeddings index size: 1.35 MB (98.22% improvement)
Recall: 0.3511
Reranked recall: 0.7244


## Listing 13.28
Rerank search results with cross-encoder 

#### Located in the [Chapter 13 Semantic Search notebook](4.semantic-search.ipynb#listing-13.28) notebook


Up next: [Chapter 14: Question Answering with a Fine-tuned Large Language Model](../ch14/1.question-answering-visualizer.ipynb)