In [1]:
import os
import pickle
from aips.data_loaders.outdoors import load_dataframe
from aips import get_engine
from sentence_transformers import SentenceTransformer, SimilarityFunction
from sentence_transformers.quantization import quantize_embeddings
from sentence_transformers.util import cos_sim
from pyspark.sql import SparkSession
from aips import get_engine
import time
import numpy
import math

engine = get_engine()
spark = SparkSession.builder.appName("AIPS").getOrCreate()

  from tqdm.autonotebook import tqdm, trange
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [2]:

![ ! -d 'outdoors' ] && git clone --depth=1 https://github.com/ai-powered-search/outdoors.git
! cd outdoors && git pull
! cd outdoors && cat outdoors.tgz.part* > outdoors.tgz
! cd outdoors && mkdir -p '../data/outdoors/' && tar -xvf outdoors.tgz -C '../data/outdoors/'

#outdoors_collection = engine.create_collection("outdoors")

Cloning into 'outdoors'...
remote: Enumerating objects: 25, done.[K
remote: Counting objects: 100% (25/25), done.[K
remote: Compressing objects: 100% (24/24), done.[K
remote: Total 25 (delta 0), reused 22 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (25/25), 491.39 MiB | 8.83 MiB/s, done.
Already up to date.
README.md
concepts.pickle
._guesses.csv
guesses.csv
._guesses_all.json
guesses_all.json
outdoors_concepts.pickle
outdoors_embeddings.pickle
._outdoors_golden_answers.csv
outdoors_golden_answers.csv
._outdoors_golden_answers.xlsx
outdoors_golden_answers.xlsx
._outdoors_golden_answers_20210130.csv
outdoors_golden_answers_20210130.csv
outdoors_labels.pickle
outdoors_question_answering_contexts.json
outdoors_questionanswering_test_set.json
outdoors_questionanswering_train_set.json
._posts.csv
posts.csv
predicates.pickle
pull_aips_dependency.py
._question-answer-seed-contexts.csv
question-answer-seed-contexts.csv
question-answer-squad2-guesses.csv
._roberta-base-squad

In [3]:
dimensions = 1024
#model = SentenceTransformer("tomaarsen/mpnet-base-nli-matryoshka", matryoshka_dims=64)
model = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1",
                            similarity_fn_name=SimilarityFunction.DOT_PRODUCT,
                            truncate_dim=dimensions)
#_ = model.half()
def get_embeddings(texts, cache_name, ignore_cache=False):
    cache_file_name = f"data/embeddings/{cache_name}.pickle"
    if ignore_cache or not os.path.isfile(cache_file_name):        
        #pool = model.start_multi_process_pool()
        embeddings = model.encode(texts, normalize_embeddings=True)
        #model.stop_multi_process_pool(pool)
        #embeddings = model.encode(texts, convert_to_tensor=False).tolist()
        os.makedirs(os.path.dirname(cache_file_name), exist_ok=True)
        with open(cache_file_name, "wb") as fd:
            pickle.dump(embeddings, fd)
    else:
        with open(cache_file_name, "rb") as fd:
            embeddings = pickle.load(fd)
    return embeddings

Downloading modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/171 [00:00<?, ?B/s]

Downloading README.md:   0%|          | 0.00/114k [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/677 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/670M [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

Downloading 1_Pooling/config.json:   0%|          | 0.00/297 [00:00<?, ?B/s]

## Listing 13.21
### Generating embeddings and benchmark data

In [26]:
import faiss
from datasets import load_dataset
from aips.data_loaders.outdoors import load_dataframe

def calculate_outdoors_embeddings():
    outdoors_dataframe = load_dataframe("data/outdoors/posts.csv")
    post_texts = [post["title"] + " " + post["body"]
                  for post in outdoors_dataframe.collect()]
    embeddings = get_embeddings(post_texts, "outdoors_mrl_normed")
    return numpy.array(embeddings), outdoors_dataframe

In [6]:
outdoors_data = calculate_outdoors_embeddings()

In [7]:
display(outdoors_data[0].size)

1024

## Listing 13.22
### int8 quantization

In [87]:
def execute_search(index, index_name, embedded_query, k=25):
    start_time = time.time()
    faiss_scores, faiss_doc_ids = index.search(embedded_query, k)
    print(f"{index_name} search took: {(time.time() - start_time):.3f} sec")
    print(f"{index_name} index size: {os.path.getsize(index_name)} bytes")
    return faiss_scores, faiss_doc_ids

def index_embeddings(embeddings, name):
    index = faiss.IndexFlatIP(embeddings.shape[1])
    index.add(embeddings)
    faiss.write_index(index, name)
    return index

def display_results(scores, ids, data):
    results = apply_scores(scores, ids, data)
    display(results)
    return results

def apply_scores(scores, ids, data):
    scores = list(list(scores)[0])
    ids = list(ids)
    results = []
    for i, id in enumerate(ids[0]):
        id = int(id)
        result = {"score": scores[i],
                  "title": data[id]["title"],
                  "body": data[id]["body"],
                  "id": id}
        results.append(result)
    return results

def calculate_recall(scored_full_results, scored_quantized_results):
    full_ids = [r["id"] for r in scored_full_results]
    quantized_ids = [r["id"] for r in scored_quantized_results]
    recall =  (len(set(full_ids).intersection(set(quantized_ids))) /
                len(set(quantized_ids)))
    print("Recall: " + str(recall))

embeddings, dataframe = calculate_outdoors_embeddings()
outdoors_data = list(dataframe.rdd.map(lambda r: r.asDict()).collect())
full_index = index_embeddings(embeddings, "full_out_embs")
query_embeddings = model.encode(["tent poles"], convert_to_numpy=True, normalize_embeddings=True)
full_search_scores, ids = execute_search(full_index, "full_out_embs", query_embeddings)
scored_full_results = apply_scores(full_search_scores, ids, outdoors_data)

int8_embeddings = quantize_embeddings(embeddings, precision="int8")
int8_index = index_embeddings(int8_embeddings, "int8_out_embs")
int8_query_embeddings = quantize_embeddings(query_embeddings, precision="int8")
int8_search_scores, int8_ids = execute_search(int8_index, "int8_out_embs", int8_query_embeddings)
scored_int8_results = apply_scores(int8_search_scores, int8_ids, outdoors_data)

calculate_recall(scored_full_results, scored_int8_results)



full_out_embs search took: 0.005 sec
full_out_embs index size: 75595821 bytes
int8_out_embs search took: 0.005 sec
int8_out_embs index size: 75595821 bytes
Recall: 0.0


  return ((embeddings - starts) / steps - 128).astype(np.int8)


## Listing 13.23
### Binary Quantization

In [89]:
def binary_quantize_embeddings(embeddings):    
    return numpy.packbits(embeddings > 0).reshape(embeddings.shape[0], -1)

def index_binary_embeddings(embeddings):
    binary_embeddings = binary_quantize_embeddings(list(embeddings))
    index = faiss.IndexBinaryFlat(binary_embeddings.shape[1])
    index.add(binary_embeddings)
    faiss.write_index_binary(binary_embeddings, "binary_out_embs")
    return index

binary_index = index_binary_embeddings(embeddings)
binary_query_embeddings = binary_quantize_embeddings(query_embeddings)
binary_search_scores, binary_ids = execute_search(binary_index, "binary_out_embs", binary_query_embeddings)
scored_bintary_results = apply_scores(binary_search_scores, binary_ids, outdoors_data)

calculate_recall(scored_full_results, scored_int8_results)

ValueError: cannot reshape array of size 128 into shape (1024,newaxis)

## Listing 13.24
### Matroyoshka Learned Representations

In [None]:
def evaluate_mrl_quantized_searches(embeddings, queries):
    query_embeddings = model.encode(queries, convert_to_numpy=True, normalize_embeddings=True)
    for slice in [512, 256, 128]:
        scaled_embeddings = map(lambda e: e[:slice], embeddings)
        index = index_embeddings(scaled_embeddings)
        scaled_query_embeddings = map(lambda qe: qe[:slice], query_embeddings)
        yield evaluate_search(index, scaled_query_embeddings)

queries = get_evaluation_queries()
embeddings = calculate_outdoors_embeddings()
evaluations = evaluate_mrl_quantized_searches(embeddings, queries)
display(*evaluations)

## Listing 13.25
### Product quantization

In [None]:
def index_pq_embeddings(embeddings):    
    dimensions = embeddings.shape[1]
    sub_vectors = 8
    subquantizer_bits = 8
    index = faiss.IndexPQ(dimensions, sub_vectors, subquantizer_bits)
    index.train(embeddings)
    return index

embeddings = calculate_outdoors_embeddings()
index = index_pq_embeddings()
queries = get_evaluation_queries()
query_embeddings = model.encode(queries, convert_to_numpy=True, normalize_embeddings=True)
evaluation = evaluate_search(index, query_embeddings)
display(evaluation)