In [None]:
# imports
import pandas as pd
from qdrant_client import models, QdrantClient
from sentence_transformers import SentenceTransformer

In [None]:
# load data
# Read the wine CSV and drop rows whose "variety" is missing
# (a NaN there blows up serialization later on).
df = pd.read_csv("top_rated_wines.csv").dropna(subset=["variety"])
# One plain dict per remaining row, keyed by column name.
data = df.to_dict(orient="records")

In [None]:
# create the embedding model
# Load the sentence-transformers model that later cells use to turn text into vectors.
encoder = SentenceTransformer(model_name_or_path="all-MiniLM-L6-v2")

In [None]:
# create the vector database client
# ":memory:" asks for an in-memory Qdrant instance rather than a remote server.
vdb = QdrantClient(location=":memory:")

In [None]:
# create the collection
# The vector size is dictated by the embedding model, so ask the encoder for it.
embedding_dim = encoder.get_sentence_embedding_dimension()
# Set up the "top_wines" collection, compared by cosine distance.
vdb.recreate_collection(
    collection_name="top_wines",
    vectors_config=models.VectorParams(size=embedding_dim, distance=models.Distance.COSINE),
)

In [None]:
# vectorize
# Note: for Coursera we use Qdrant's older upload API, which takes Records instead of Points.
# Encode every tasting note in ONE batched call instead of one model call per wine;
# the encoder returns one embedding per input, in input order.
note_vectors = encoder.encode([doc["notes"] for doc in data])  # data holds all the wines
vdb.upload_records(
    collection_name="top_wines",
    records=[
        # id is the wine's position in `data`; the full wine dict is stored as payload
        models.Record(id=idx, vector=vector.tolist(), payload=doc)
        for idx, (doc, vector) in enumerate(zip(data, note_vectors))
    ],
)

In [None]:
# search locally
# Embed the free-text query with the same encoder used for the stored notes.
query_vector = encoder.encode("A wine from Mendoza Argentina").tolist()
# Ask the in-memory collection for at most three hits for that vector.
hits = vdb.search(collection_name="top_wines", query_vector=query_vector, limit=3)
for hit in hits:
    # show each hit's stored payload alongside its score
    print(hit.payload, "score:", hit.score)

In [None]:
# naive check if embeddings are stored
# Only consider points whose payload "variety" equals "Red Wine".
red_wine_filter = models.Filter(
    must=[models.FieldCondition(key="variety", match=models.MatchValue(value="Red Wine"))]
)
# Bare last expression so the notebook displays the result:
# up to three points, with their vectors and without payloads.
vdb.scroll(
    collection_name="top_wines",
    scroll_filter=red_wine_filter,
    limit=3,
    with_payload=False,
    with_vectors=True,
)