### Qdrant, datasets

In-memory vector search over GitHub issue comments.

In [3]:
import rich
from datasets import Dataset, load_dataset
from fastembed import TextEmbedding
from qdrant_client import QdrantClient, models

In [4]:
# Pull the "train" split of the GitHub issues dataset.
issues_dataset = load_dataset(path="lewtun/github-issues", split="train")

Repo card metadata block was not found. Setting CardData to empty.


In [5]:
# Keep only records that are not pull requests and that have at least one comment.
issues_dataset = issues_dataset.filter(
    lambda x: (x["is_pull_request"] is False and len(x["comments"]) > 0)
)

columns = issues_dataset.column_names
columns_to_keep = ["title", "body", "html_url", "comments"]

# Fail early, with a readable message, if an expected column is absent.
missing_columns = [name for name in columns_to_keep if name not in columns]
assert not missing_columns, f"dataset is missing expected columns: {missing_columns}"

# Drop every column that is not explicitly kept. A plain difference (rather than
# a symmetric difference) can only ever name columns that exist in the dataset,
# and building a list keeps the removal order deterministic.
columns_to_remove = [name for name in columns if name not in columns_to_keep]
issues_dataset = issues_dataset.remove_columns(columns_to_remove)


In [6]:
# Materialize the dataset as a DataFrame without changing the output format of
# `issues_dataset` itself, so the dataset behaves the same if it is used again.
df = issues_dataset.to_pandas()
# Split each comment into an individual record
comments_df = df.explode("comments", ignore_index=True)

In [7]:
# Wrap the exploded frame in a Dataset, record each comment's whitespace-separated
# word count, then keep only the comments with more than 15 words.
comments_dataset = (
    Dataset.from_pandas(comments_df)
    .map(lambda row: {"comment_length": len(row["comments"].split())})
    .filter(lambda row: row["comment_length"] > 15)
)

Map: 100%|██████████| 2964/2964 [00:00<00:00, 25067.63 examples/s]
Filter: 100%|██████████| 2964/2964 [00:00<00:00, 264689.09 examples/s]


In [8]:
def concatenate_text(examples):
    """Join a record's title, body and comment into a single "text" field.

    Args:
        examples: Mapping with "title", "body" and "comments" entries. Each
            value is expected to be a string or None.

    Returns:
        A dict with one key, "text", holding the three fields joined by " \\n ".
        A field that is None contributes an empty string, so a record with a
        missing value does not raise a TypeError.
    """
    fields = (examples["title"], examples["body"], examples["comments"])
    return {"text": " \n ".join("" if field is None else field for field in fields)}


# Add the combined "text" column to every record.
comments_dataset = comments_dataset.map(function=concatenate_text)

Map: 100%|██████████| 2175/2175 [00:00<00:00, 22389.69 examples/s]


In [9]:
# Qdrant client backed by in-process memory.
client = QdrantClient(location=":memory:")
# FastEmbed text embedder constructed with its default settings.
embedding_model = TextEmbedding()

Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 33770.56it/s]


In [10]:
# The vector size must match the output dimension of the embedding model in use;
# similarity is measured with cosine distance.
vector_params = models.VectorParams(size=384, distance=models.Distance.COSINE)
client.create_collection(collection_name="github_issues", vectors_config=vector_params)

True

In [11]:
# Embed all documents in one pass so the model can batch its work, instead of
# issuing a separate embed() call per record.
# NOTE(review): relies on embed() yielding one vector per input, in input order.
texts = list(comments_dataset["text"])
vectors = embedding_model.embed(texts)

client.upload_points(
    collection_name="github_issues",
    points=[
        # .tolist() turns the embedding array into a plain list of Python floats.
        models.PointStruct(id=idx, vector=vector.tolist(), payload=doc)
        for idx, (doc, vector) in enumerate(zip(comments_dataset, vectors))
    ],
)

In [12]:
# Embed the question with the same model used for the stored documents.
query = "How can I load a dataset offline?"
query_vector = next(iter(embedding_model.embed(query)))

# Fetch the three closest records and print each payload with its score.
hits = client.search(
    collection_name="github_issues",
    query_vector=query_vector,
    limit=3,
)
for hit in hits:
    rich.print(hit.payload, "score:", hit.score)