In [1]:
import os
import pickle

from pyspark import SparkConf
from aips.data_loaders.outdoors import load_dataframe
from aips import get_engine
from sentence_transformers import SentenceTransformer, SimilarityFunction
from sentence_transformers.util import cos_sim
from pyspark.sql import SparkSession
from aips import get_engine
import time
import numpy
import math

engine = get_engine()
#Recommended for making ALS run faster, if you have enough memory / cores allocated to docker
conf = SparkConf()
conf.set("spark.driver.memory", "8g")
conf.set("spark.executor.memory", "8g")
conf.set("spark.dynamicAllocation.enabled", "true")
conf.set("spark.dynamicAllocation.executorMemoryOverhead", "8g")
spark = SparkSession.builder.appName("AIPS-ch13").config(conf=conf).getOrCreate()

#https://github.com/facebookresearch/faiss/wiki/Pre--and-post-processing
#https://github.com/facebookresearch/faiss/wiki

  from tqdm.autonotebook import tqdm, trange
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [2]:
![ ! -d 'outdoors' ] && git clone --depth=1 https://github.com/ai-powered-search/outdoors.git
! cd outdoors && git pull
! cd outdoors && cat outdoors.tgz.part* > outdoors.tgz
! cd outdoors && mkdir -p '../data/outdoors/' && tar -xvf outdoors.tgz -C '../data/outdoors/'

#outdoors_collection = engine.create_collection("outdoors")

Already up to date.
README.md
concepts.pickle
._guesses.csv
guesses.csv
._guesses_all.json
guesses_all.json
outdoors_concepts.pickle
outdoors_embeddings.pickle
._outdoors_golden_answers.csv
outdoors_golden_answers.csv
._outdoors_golden_answers.xlsx
outdoors_golden_answers.xlsx
._outdoors_golden_answers_20210130.csv
outdoors_golden_answers_20210130.csv
outdoors_labels.pickle
outdoors_question_answering_contexts.json
outdoors_questionanswering_test_set.json
outdoors_questionanswering_train_set.json
._posts.csv
posts.csv
predicates.pickle
pull_aips_dependency.py
._question-answer-seed-contexts.csv
question-answer-seed-contexts.csv
question-answer-squad2-guesses.csv
._roberta-base-squad2-outdoors
roberta-base-squad2-outdoors/
roberta-base-squad2-outdoors/._tokenizer_config.json
roberta-base-squad2-outdoors/tokenizer_config.json
roberta-base-squad2-outdoors/._special_tokens_map.json
roberta-base-squad2-outdoors/special_tokens_map.json
roberta-base-squad2-outdoors/._config.json
roberta-base-

In [3]:
dimensions = 1024
model = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1",
                            similarity_fn_name=SimilarityFunction.DOT_PRODUCT,
                            truncate_dim=dimensions)

def get_embeddings(texts, cache_name, ignore_cache=False):
    cache_file_name = f"data/embeddings/{cache_name}.pickle"
    if ignore_cache or not os.path.isfile(cache_file_name):        
        #pool = model.start_multi_process_pool()
        embeddings = model.encode(texts, normalize_embeddings=True)
        #model.stop_multi_process_pool(pool)
        #embeddings = model.encode(texts, convert_to_tensor=False).tolist()
        os.makedirs(os.path.dirname(cache_file_name), exist_ok=True)
        with open(cache_file_name, "wb") as fd:
            pickle.dump(embeddings, fd)
    else:
        with open(cache_file_name, "rb") as fd:
            embeddings = pickle.load(fd)
    return embeddings

## Listing 13.21
### Generating embeddings and benchmark data

In [4]:
import faiss
from aips.data_loaders.outdoors import load_dataframe

def calculate_outdoors_embeddings():
    outdoors_dataframe = load_dataframe("data/outdoors/posts.csv")
    post_texts = [post["title"] + " " + post["body"]
                  for post in outdoors_dataframe.collect()]
    embeddings = get_embeddings(post_texts, "outdoors_mrl_normed")
    outdoors_data = list(outdoors_dataframe.rdd.map(lambda r: r.asDict()).collect())
    return numpy.array(embeddings), outdoors_data

def display_results(scores, ids, data):
    results = generate_search_results(scores, ids, data)
    display(results)
    return results

def calculate_stat_messages(full_search_time, time_taken, full_index_size, size):
    if full_search_time:
        t = round((full_search_time - time_taken) * 100 / full_search_time, 2)
    improvement_ms = f"({t}% improvement)" if full_search_time else ""
    improvement_size = f"({round((full_index_size - size) * 100 / full_index_size, 2)}% improvement)" if full_index_size else ""
    return improvement_ms, improvement_size

def generate_search_results(faiss_scores, faiss_ids, data):
    scores = list(list(faiss_scores)[0])
    ids = list(faiss_ids)
    results = []
    for i, id in enumerate(ids[0]):
        id = int(id)
        result = {"score": scores[i],
                  "title": data[id]["title"],
                  "body": data[id]["body"],
                  "id": id}
        results.append(result)
    return results

In [18]:
#This will generate and cache the embeddings. Takes 2-3 hours typically
embeddings, outdoors_dataframe = calculate_outdoors_embeddings() 

## Listing 13.22
### int8 quantization

In [30]:
from sentence_transformers.quantization import quantize_embeddings

def index_embeddings(embeddings, name):
    index = faiss.IndexFlatIP(embeddings.shape[1])
    index.add(embeddings)
    faiss.write_index(index, name)
    return index

def time_and_execute_search(index, index_name, embedded_query, k=25, log=False):
    _, outdoors_data = calculate_outdoors_embeddings()
    start_time = time.time()
    faiss_scores, faiss_ids = index.search(embedded_query, k=k) 
    if log:
        display(f"Execute search scores: {faiss_scores}")
        display(f"Search ids: {faiss_ids}")
    time_taken = ((time.time() - start_time) * 1000)
    scored_results = generate_search_results(faiss_scores, faiss_ids, outdoors_data)
    size = os.path.getsize(index_name)
    return scored_results, time_taken, size

def execute_full_search(embeddings, embedded_query, k=25):  
    index_name = "full_out_embs"
    full_index = index_embeddings(embeddings, index_name)
    return time_and_execute_search(full_index, index_name, embedded_query, k=k)

def evaluate_search(index, index_name, embedded_query, k=25, log=False):
    embeddings, outdoors_data = calculate_outdoors_embeddings()
    scored_results, time_taken, size = time_and_execute_search(index, index_name, embedded_query, k=k)
    full_search_results, full_search_time, full_index_size = execute_full_search(embeddings, query_embeddings, outdoors_data)
    improvement_ms, improvement_size = calculate_stat_messages(full_search_time, time_taken,
                                                               full_index_size, size)

    calculate_recall(full_search_results, scored_int8_results)
    print(f"{index_name} search took: {time_taken:.3f} ms {improvement_ms}")
    print(f"{index_name} index size: {round(size / 1000000, 2)} MB {improvement_size}")
    return scored_results, time_taken, size

def calculate_recall(scored_full_results, scored_quantized_results):
    full_ids = [r["id"] for r in scored_full_results]
    quantized_ids = [r["id"] for r in scored_quantized_results]
    recall =  (len(set(full_ids).intersection(set(quantized_ids))) /
               len(set(quantized_ids)))
    print("Recall: " + str(recall))

def index_int_embeddings(embeddings, name):
    embeddings = quantize_embeddings(embeddings, precision="int8")
    print("Int8 embeddings shape:", embeddings.shape)
    index = faiss.IndexScalarQuantizer(embeddings.shape[1], faiss.ScalarQuantizer.QT_8bit)
    index.train(embeddings)
    index.add(embeddings)
    faiss.write_index(index, name)
    return index

def get_test_queries():
    return ["tent poles"]
    
embeddings, _ = calculate_outdoors_embeddings()
int8_index_name = "int8_out_embs"
queries = get_test_queries()
query_embeddings = model.encode(queries, convert_to_numpy=True, normalize_embeddings=True)
int8_index = index_int_embeddings(embeddings, int8_index_name)
quantized_queries = quantize_embeddings(query_embeddings,
                                        calibration_embeddings=embeddings,
                                        precision="int8")
scored_int8_results, _, _ = evaluate_search(int8_index, int8_index_name,
                                           quantized_queries)


Int8 embeddings shape: (18456, 1024)


ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/usr/local/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/opt/conda/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

In [31]:
from sentence_transformers.quantization import quantize_embeddings

def index_embeddings(embeddings, name):
    index = faiss.IndexFlatIP(embeddings.shape[1])
    index.add(embeddings)
    faiss.write_index(index, name)
    return index

def execute_full_search(embeddings, embedded_query, outdoors_data):    
    embeddings, outdoors_data = calculate_outdoors_embeddings()
    index_name = "full_out_embs"
    full_index = index_embeddings(embeddings, index_name)
    scored_full_results, time, size = evaluate_search(full_index, index_name,
                                                     embedded_query, outdoors_data)
    return scored_full_results, time, size

def evaluate_search(index, index_name, embedded_query,
                    k=25, log=False, full_search_time=None, full_index_size=None):
    
    embeddings, outdoors_data = calculate_outdoors_embeddings()
    full_search_results, full_search_time, full_index_size = \
        execute_full_search(embeddings, embedded_query, outdoors_data)

    start_time = time.time()
    faiss_scores, faiss_ids = index.search(embedded_query, k) 
    time_taken = ((time.time() - start_time) * 1000)
    size = os.path.getsize(index_name)
    improvement_ms, improvement_size = calculate_stat_messages(full_search_time, time_taken,
                                                               full_index_size, size)
    print(f"{index_name} search took: {time_taken:.3f} ms {improvement_ms}")
    print(f"{index_name} index size: {round(size / 1000000, 2)} MB {improvement_size}")
    if log:
        display(f"Execute search scores: {faiss_scores}")
        display(f"Search ids: {faiss_ids}")
        
    scored_results = generate_search_results(faiss_scores, faiss_ids, outdoors_data)

    calculate_recall(full_search_results, scored_int8_results)
    return scored_results, time_taken, size

def calculate_recall(scored_full_results, scored_quantized_results):
    full_ids = [r["id"] for r in scored_full_results]
    quantized_ids = [r["id"] for r in scored_quantized_results]
    recall =  (len(set(full_ids).intersection(set(quantized_ids))) /
               len(set(quantized_ids)))
    print("Recall: " + str(recall))

def index_int_embeddings(embeddings, name):
    embeddings = quantize_embeddings(embeddings, precision="int8")
    print("Int8 embeddings shape:", embeddings.shape)
    index = faiss.IndexScalarQuantizer(embeddings.shape[1], faiss.ScalarQuantizer.QT_8bit)
    index.train(embeddings)
    index.add(embeddings)
    faiss.write_index(index, name)
    return index

def get_test_queries():
    return ["tent poles"]

int8_index_name = "int8_out_embs"
embeddings, _ = calculate_outdoors_embeddings()
int8_index = index_int_embeddings(embeddings, int8_index_name)
queries = get_test_queries()
query_embeddings = model.encode(queries, convert_to_numpy=True, normalize_embeddings=True)
quantized_queries = quantize_embeddings(query_embeddings,
                                        calibration_embeddings=embeddings,
                                        precision="int8")
scored_int8_results, _, _ = evaluate_search(int8_index, int8_index_name,
                                            query_embeddings,
                                            quantized_queries)

Int8 embeddings shape: (18456, 1024)


ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/usr/local/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/opt/conda/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

## Listing 13.23
### Binary Quantization

In [7]:
def binary_quantize_embeddings(embeddings):
    #quantized_embs = numpy.where(embeddings < 0, 0, 1).astype(numpy.uint8) # binarize
    return numpy.packbits(embeddings > 0).reshape(embeddings.shape[0], -1)
    
def index_binary_embeddings(embeddings):
    binary_embeddings = numpy.packbits(embeddings > 0).reshape(embeddings.shape[0], -1)
    print("Binary embeddings shape:", binary_embeddings.shape)
    index = faiss.IndexBinaryFlat(binary_embeddings.shape[1] * 8)
    index.add(binary_embeddings)
    faiss.write_index_binary(index, "binary_out_embs")
    return index

binary_index = index_binary_embeddings(embeddings)
quantized_query = numpy.packbits(embeddings > 0).reshape(embeddings.shape[0], -1)
scored_binary_results, _, _ = execute_search(binary_index, "binary_out_embs",
                                             quantized_query, outdoors_data,
                                             log=False,
                                             full_search_time=full_search_time,
                                             full_index_size=full_index_size)
calculate_recall(scored_full_results, scored_binary_results)
# (vs. 5.000 ms, 90% reduction)

Binary embeddings shape: (18456, 128)
binary_out_embs search took: 754.192 ms (-12400.24% improvement)
binary_out_embs index size: 2.36 MB (96.87% improvement)
Recall: 0.0


## Listing 13.24
### Matroyoshka Learned Representations

In [8]:
def evaluate_mrl_quantized_searches(query):
    embeddings, outdoors_data = calculate_outdoors_embeddings()
    query_embeddings = model.encode([query], convert_to_numpy=True, normalize_embeddings=True)
    scored_full_results, full_search_time, full_index_size = \
        execute_full_search(embeddings, query_embeddings, outdoors_data)
    for slice in [512, 256, 128]:
        index_name = f"mrl_out_embs_{slice}"
        scaled_embeddings = numpy.array(list(map(lambda e: e[:slice], embeddings)))
        print(f"MRL{slice} embeddings shape:", scaled_embeddings.shape)
        index = index_embeddings(scaled_embeddings, index_name)
        quantized_query = numpy.array(list(map(lambda qe: qe[:slice], query_embeddings)))
        scored_results, _, _ = execute_search(index, index_name,
                                              quantized_query, outdoors_data,
                                              full_search_time=full_search_time,
                                              full_index_size=full_index_size)        
        calculate_recall(scored_full_results, scored_results)

evaluate_mrl_quantized_searches("tent poles")

#full_out_embs search took: 3.900 ms
#full_out_embs index size: 75,595,821 bytes

full_out_embs search took: 3.799 ms 
full_out_embs index size: 75.6 MB 
MRL512 embeddings shape: (18456, 512)
mrl_out_embs_512 search took: 1.930 ms (49.21% improvement)
mrl_out_embs_512 index size: 37.8 MB (50.0% improvement)
Recall: 0.84
MRL256 embeddings shape: (18456, 256)
mrl_out_embs_256 search took: 1.014 ms (73.31% improvement)
mrl_out_embs_256 index size: 18.9 MB (75.0% improvement)
Recall: 0.64
MRL128 embeddings shape: (18456, 128)
mrl_out_embs_128 search took: 0.582 ms (84.69% improvement)
mrl_out_embs_128 index size: 9.45 MB (87.5% improvement)
Recall: 0.52


## Listing 13.25
### Product quantizationQ

In [9]:
def index_pq_embeddings(embeddings, name="pq_out_embs"):    
    dimensions = embeddings.shape[1]
    sub_vectors = 8
    subquantizer_bits = 8
    #faiss::IndexIVFPQ, IndexIVFPQR
    index = faiss.IndexPQ(dimensions, sub_vectors, subquantizer_bits)
    index.train(embeddings)
    index.add(embeddings)
    faiss.write_index(index, name)
    return index

def index_pq_reranked_embeddings(embeddings, name="pq_out_embs"):    
    dimensions = embeddings.shape[1]
    sub_vectors = 8
    subquantizer_bits = 8
    #faiss::IndexIVFPQ, IndexIVFPQR
    index = faiss.IndexPQ(dimensions, sub_vectors, subquantizer_bits)
    rereanking_pq_index = faiss.IndexRefineFlat(index)
    rereanking_pq_index.train(embeddings)
    rereanking_pq_index.add(embeddings)
    faiss.write_index(rereanking_pq_index, name)
    return index

embeddings, outdoors_data = calculate_outdoors_embeddings()
query_embeddings = model.encode(["Hiking trails"], convert_to_numpy=True, normalize_embeddings=True)
scored_full_results, full_search_time, full_index_size = \
    execute_full_search(embeddings, query_embeddings, outdoors_data)
embeddings, outdoors_data = calculate_outdoors_embeddings()
index = index_pq_reranked_embeddings(embeddings)
scored_results, _, _ = execute_search(index, "pq_out_embs", query_embeddings, outdoors_data,
                                   full_search_time=full_search_time,
                                   full_index_size=full_index_size)
calculate_recall(scored_full_results, scored_results)

full_out_embs search took: 4.253 ms 
full_out_embs index size: 75.6 MB 
pq_out_embs search took: 0.538 ms (87.35% improvement)
pq_out_embs index size: 76.79 MB (-1.58% improvement)
Recall: 0.28


## Listing 13.26
### Quantization and reranking

In [15]:
from pyspark.sql.functions import col, udf, monotonically_increasing_id
from pyspark.sql.types import Row, ArrayType, FloatType, StructField, StructType, StringType, ByteType
import faiss
from aips.data_loaders.outdoors import load_dataframe

def calculate_outdoors_embeddingsss():
    outdoors_dataframe = load_dataframe("data/outdoors/posts.csv")
    post_texts = [post["title"] + " " + post["body"]
                  for post in outdoors_dataframe.collect()]
    embeddings = get_embeddings(post_texts, "outdoors_mrl_normed")
    outdoors_data = list(outdoors_dataframe.rdd.map(lambda r: r.asDict()).collect())
    for i in range(len(outdoors_data)):
        embs = [float(e) for e in embeddings[i]]
        outdoors_data[i]["text_embedding"] = embs
        asdf = numpy.array(embs)
        outdoors_data[i]["binary_text_embedding"] = numpy.packbits(embs > 0) \
            .reshape(asdf, -1).tolist()
        [float(e) for e in embeddings[i]]
    return outdoors_data

def build_engine_quantization_index():
    outdoors_data = calculate_outdoors_embeddingsss()
    schema = StructType([StructField("title", StringType()),
                         StructField("body", StringType()),
                         StructField("text_embedding", ArrayType(FloatType())),
                         StructField("binary_text_embedding", ArrayType(ByteType()))])
    outdoors_dataframe = spark.createDataFrame(
        [Row(title=x["title"], body=x["body"],
             text_embedding=x["text_embedding"],
             binary_text_embedding=x["binary_text_embedding"])
             for x in outdoors_data], schema=schema)
    #embeddings = list(embeddings)
    #outdoors_data = load_dataframe("data/outdoors/posts.csv")
    #quantized_embeddings = [quantize(e) for e in normalized_embeddings]
    embeddings_collection = engine.create_collection("outdoors_quantization")
    #outdoors_data = outdoors_data.withColumn("id", monotonically_increasing_id())
    #outdoors_data = outdoors_data.withColumn("text_embedding",
                                             #udf(lambda id: [float(e) for e in embeddings.pop(0)],
                                                 #ArrayType(FloatType()))("id"))
    #outdoors_data = outdoors_data.withColumn("binary_text_embedding")
    print(type(outdoors_dataframe))
    embeddings_collection.write(outdoors_dataframe)

def search_request(query_vector, field, rerank_vector, rerank_field,
                          quantization_size):
    return {"query": query_vector,
            "query_fields": [field],
            "return_fields": ["title", "body"],
            "limit": 25,
            "k": 1000,
            #"rerank_query": {"rerank_count": 250,
            #                 "query": rerank_vector,
            #                 "query_fields": [field],
            #                 "k": 25},
            "quantization_size": quantization_size,
            "log": True}

def engine_rankings(query, log=False):
    collection = engine.get_collection("outdoors_quantization")
    query_embedding = model.encode([query], convert_to_numpy=True, normalize_embeddings=True)
    #quantized_query = numpy.zeros_like(query_embedding, dtype=numpy.int8)
    #quantized_query[query_embedding > 0] = 1
    quantized_query = numpy.packbits(query_embedding > 0).reshape(embeddings.query_embedding[0], -1)

    binary_request = search_request(quantized_query[0].tolist(), "binary_text_embedding",
                                    query_embedding[0].tolist(), "text_embedding",
                                    "BINARY")
    results = collection.search(**binary_request)
    display(results)

build_engine_quantization_index()
collection = engine.get_collection("outdoors_quantization")
engine_rankings("hiking trails")

TypeError: only integer scalar arrays can be converted to a scalar index