Based on https://github.com/neuml/txtai/blob/master/examples/02_Build_an_Embeddings_index_with_Hugging_Face_Datasets.ipynb

# Setup 

In [1]:
from datasets import load_dataset

from txtai.embeddings import Embeddings
from txtai.pipeline import Similarity

from IPython import display


In [2]:
def stream(dataset, field, limit):
    """Yield at most ``limit`` rows of ``dataset`` as ``(index, value, None)`` tuples.

    Args:
        dataset: iterable of rows; each row must support ``row[field]``.
        field: key of the value to emit from each row.
        limit: maximum number of rows to emit (an integer). Zero or a
            negative value emits nothing.

    Yields:
        ``(index, row[field], None)``, with ``index`` counting up from 0.
    """
    # range() comes first in zip(), so once the cap is reached no further row
    # is pulled from ``dataset``. It also makes limit <= 0 yield nothing,
    # whereas a check placed after the yield always emits at least one row.
    for index, row in zip(range(limit), dataset):
        yield (index, row[field], None)


def search(query, limit=50):
    """Run ``query`` against the notebook-level ``embeddings`` index.

    Args:
        query: query text passed straight to ``embeddings.search``.
        limit: maximum number of results to request. Defaults to 50, the
            value that was previously hard-coded.

    Returns:
        Whatever ``embeddings.search`` returns for this query.
    """
    return embeddings.search(query, limit=limit)


# Reranking helper (disabled): similarity takes a query and a list of texts, so it needs search() to return (id, text) pairs.
# def ranksearch(query):
#     results = [text for _, text in search(query)]
#     return [(score, results[x]) for x, score in similarity(query, results)]


# Dataset

In [3]:
# Training split of the "ag_news" dataset, loaded via Hugging Face `datasets`.
dataset = load_dataset(
    path="ag_news",
    split="train",
)


Found cached dataset ag_news (/root/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548)


# Indexing

In [4]:
# Only the first `num_docs` rows of the dataset are streamed into the index.
num_docs = 100

# "path" names the sentence-transformers model; "backend" selects the Qdrant
# ANN class from qdrant_txtai, configured to reach the host named "qdrant".
embeddings = Embeddings({
    "path": "sentence-transformers/all-MiniLM-L6-v2",
    "backend": "qdrant_txtai.ann.qdrant.Qdrant",
    "qdrant": {"host": "qdrant"},
})
embeddings.index(stream(dataset, "text", num_docs))


# Reranking

In [5]:
# Reranking is not applicable with the Qdrant backend: search returns (id, score) tuples with no "text" for the Similarity pipeline to rerank.

# similarity = Similarity("valhalla/distilbart-mnli-12-3")


# Search

In [6]:
# Example queries for the search demo; several are positive/negative
# phrasings of the same topic.
queries = [
    "Positive Apple reports",
    "Negative Apple reports",
    "Best planets to explore for life",
    "LA Dodgers good news",
    "LA Dodgers bad news",
]


In [7]:
# Run only the first two queries and print the raw search results.
# Each result is an (id, score) tuple; the ids are presumably the row
# indexes that stream() yielded at indexing time.
for query in queries[:2]:
    print(query)
    print(search(query))


Positive Apple reports
[(73, 0.35510775), (51, 0.35448885), (7, 0.33826345), (85, 0.33634022), (49, 0.30569416), (83, 0.303743), (88, 0.3023945), (86, 0.29291403), (82, 0.27456617), (78, 0.26397276), (5, 0.24335706), (81, 0.24180745), (27, 0.23923783), (36, 0.22436482), (1, 0.21463366), (79, 0.20670545), (39, 0.20071876), (71, 0.19698218), (41, 0.19376302), (31, 0.1936928), (2, 0.18801743), (67, 0.17827374), (70, 0.1714298), (38, 0.17018963), (6, 0.16622469), (0, 0.16589355), (72, 0.16167642), (10, 0.16092366), (14, 0.15391365), (30, 0.15381524), (84, 0.15273768), (99, 0.15183601), (37, 0.14511038), (52, 0.14319141), (42, 0.14136688), (74, 0.13916151), (13, 0.13913164), (80, 0.13793717), (18, 0.13407096), (33, 0.13330953), (4, 0.124894366), (40, 0.12336382), (87, 0.12323387), (68, 0.11771364), (92, 0.11617284), (47, 0.11434842), (60, 0.107100256), (57, 0.10663518), (77, 0.10631414), (76, 0.10312337)]
Negative Apple reports
[(7, 0.3375751), (73, 0.32395706), (51, 0.30255818), (82, 0.293