Based on https://github.com/neuml/txtai/blob/master/examples/02_Build_an_Embeddings_index_with_Hugging_Face_Datasets.ipynb

# Setup 

In [1]:
from datasets import load_dataset

from txtai.embeddings import Embeddings
from txtai.pipeline import Similarity

from IPython import display


In [2]:
def stream(dataset, field, limit):
    """Yield at most `limit` rows of `dataset` as (index, value, None) tuples.

    The tuples are what this notebook passes to `embeddings.index`.

    Args:
        dataset: iterable of rows; each row is subscripted with `field`.
        field: key of the value to emit from each row.
        limit: maximum number of rows to emit. Zero or a negative value
            emits nothing.

    Yields:
        (index, row[field], None), where index counts from 0 in iteration order.
    """
    # Checking the limit only after yielding would still emit one row for limit <= 0.
    if limit <= 0:
        return

    for index, row in enumerate(dataset):
        yield (index, row[field], None)

        # Stop right after the last wanted row so no extra row is pulled from `dataset`.
        if index + 1 >= limit:
            break


def search(query, limit=50):
    """Query the notebook-level `embeddings` index.

    Args:
        query: query text passed to `embeddings.search`.
        limit: maximum number of results to request. Defaults to 50, the
            value that used to be hard-coded here.

    Returns:
        List of (score, text) tuples, one per result, in the order returned
        by `embeddings.search`.
    """
    # Each result is read through its "score" and "text" keys.
    return [
        (result["score"], result["text"])
        for result in embeddings.search(query, limit=limit)
    ]


def ranksearch(query):
    """Search the index, then rescore the hits with the `similarity` pipeline.

    Args:
        query: query text used for both the search and the rescoring.

    Returns:
        List of (score, text) tuples in the order produced by `similarity`,
        where score is the value reported by the pipeline. Empty list when
        the search returns no hits.
    """
    results = [text for _, text in search(query)]

    # Nothing to rerank: avoid calling the pipeline with no candidate texts.
    if not results:
        return []

    # `similarity` yields (position, score) pairs; position indexes into `results`.
    return [(score, results[x]) for x, score in similarity(query, results)]


# Dataset

In [3]:
# Load the "train" split of the ag_news dataset through Hugging Face `datasets`.
dataset = load_dataset("ag_news", split="train")


Found cached dataset ag_news (/root/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548)


# Indexing

In [4]:
# Number of dataset rows to index; kept small so the cell runs quickly.
num_docs = 100

# "content": True is set so that search results carry the text alongside id and score.
embeddings = Embeddings(
    dict(path="sentence-transformers/paraphrase-MiniLM-L3-v2", content=True)
)

# Feed the first `num_docs` values of the "text" field into the index.
embeddings.index(stream(dataset, field="text", limit=num_docs))


# Reranking

In [5]:
# Pipeline that `ranksearch` calls to rescore search hits against the query.
similarity = Similarity("valhalla/distilbart-mnli-12-3")


# Search

In [6]:
# Example queries; the loop in the next cell runs only the first two.
queries = [
    "Positive Apple reports",
    "Negative Apple reports",
    "Best planets to explore for life",
    "LA Dodgers good news",
    "LA Dodgers bad news",
]


In [7]:
# Print each of the first two queries followed by its two top reranked hits.
for query in queries[:2]:
    print(query)
    ranked = ranksearch(query)
    print(ranked[:2])


Positive Apple reports
[(0.8435829877853394, 'Apple to open second Japanese retail store this month (MacCentral) MacCentral - Apple Computer Inc. will open its second Japanese retail store later this month in the western Japanese city of Osaka, it said Thursday.'), (0.396741658449173, "Researchers seek to untangle the e-mail thread E-mail is a victim of its own success. That's the conclusion of IBM Corp. researchers in Cambridge, who have spent nearly a decade conducting field tests at IBM and other companies about how employees work and use electronic mail. It's clear to them that e-mail has become the Internet's killer application.")]
Negative Apple reports
[(0.5658772587776184, "Hacker Cracks Apple's Streaming Technology (AP) AP - The Norwegian hacker famed for developing DVD encryption-cracking software has apparently struck again  #151; this time breaking the locks on Apple Computer Inc.'s wireless music streaming technology."), (0.5205283761024475, "Dutch Firm Beats Apple to Punc

In [9]:

# Toy sanity check on its own index. Calling `embeddings.index(...)` here would
# replace the ag_news index held by `embeddings`, so every later search against
# `embeddings` would only see these two documents.
toy_embeddings = Embeddings(
    {"path": "sentence-transformers/paraphrase-MiniLM-L3-v2", "content": True}
)
toy_embeddings.index([(0, "Correct", None), (1, "Not what we hoped", None)])

# Ask for the single best match for the query "positive".
result = toy_embeddings.search("positive", 1)
print(result)

[{'id': '0', 'text': 'Correct', 'score': 0.5179595947265625}]


In [8]:
# Raw embeddings search without the reranking step; left as the cell's last
# expression so the notebook displays the returned result list.
embeddings.search("positive apple", limit=50)

[{'id': '88',
  'text': 'Apple to open second Japanese retail store this month (MacCentral) MacCentral - Apple Computer Inc. will open its second Japanese retail store later this month in the western Japanese city of Osaka, it said Thursday.',
  'score': 0.39304929971694946},
 {'id': '39',
  'text': 'Microsoft Corp. 2.0: a kinder corporate culture Even a genius can mess up. Bill Gates was a brilliant technologist when he cofounded Microsoft , but as he guided it to greatness in both size and historical consequence, he blundered. He terrorized underlings with his temper and parceled out praise like Scrooge gave to charity. Only the lash inspired the necessary aggressiveness to beat the competition, he thought.',
  'score': 0.3433215618133545},
 {'id': '83',
  'text': "Hacker Cracks Apple's Streaming Technology (AP) AP - The Norwegian hacker famed for developing DVD encryption-cracking software has apparently struck again  #151; this time breaking the locks on Apple Computer Inc.'s wirel