In [26]:
# Install the third-party packages this notebook imports: sentence-transformers,
# qdrant-client and polars (import cell below) and folium (map cell at the end).
# TODO(review): versions are unpinned, so a later re-run may pull different releases.
%pip install sentence-transformers qdrant-client polars folium

Note: you may need to restart the kernel to use updated packages.


In [27]:
from qdrant_client import models, QdrantClient
from sentence_transformers import SentenceTransformer
import polars as pl
import numpy as np

In [28]:
# Load the all-MiniLM-L6-v2 sentence-embedding model on the CPU. It is used
# further down to embed every POI row and to size the Qdrant collection's vectors.
encoder = SentenceTransformer("all-MiniLM-L6-v2", device="cpu")

In [29]:
def convert(row):
    """Pick the fields of a POI row that describe it for embedding.

    Args:
        row: Mapping with at least the keys ``lat``, ``lon``, ``category``,
            ``sub_category`` and ``name``.

    Returns:
        A five-element list: ``lat`` and ``lon`` converted to strings,
        followed by the ``category``, ``sub_category`` and ``name`` values
        exactly as stored in the row (they are not converted).

    Raises:
        KeyError: If one of the five keys is missing from ``row``.
    """
    coordinates = [str(row[axis]) for axis in ("lat", "lon")]
    labels = [row[field] for field in ("category", "sub_category", "name")]
    return coordinates + labels

In [30]:
# Read the cleaned POI table, then drop the `type` and `geometry` columns;
# `df` (everything else) is what gets embedded and stored as payload.
poi_parquet_path = "../../data/transformed/poi_clean_category_geo.parquet"
df_geo = pl.read_parquet(poi_parquet_path)
df = df_geo.drop(["type", "geometry"])
df.head()

id,lat,lon,addr:city,addr:country,addr:housenumber,addr:postcode,addr:street,brand,brand:wikidata,brand:wikipedia,cuisine,description,drive_through,name,note,operator,takeaway,wheelchair,bicycle,information,access,wikidata,wikimedia_commons,wikipedia,website,toilets:wheelchair,alt_name,check_date,opening_hours,changing_table,contact:phone,contact:website,delivery,diet:vegetarian,indoor_seating,internet_access,…,phone,heritage,heritage:operator,image,man_made,level,operator:wikidata,wifi,old_name,seamark:harbour:category,seamark:name,seamark:type,harbour,highway,name:fr,direction,capacity,reservation,smoking,toilets:access,backrest,colour,material,seats,delivery:covid19,opening_hours:covid19,takeaway:covid19,description:covid19,payment:cash,payment:maestro,payment:mastercard,contact:mobile,url,contact:email,addr:suburb,sub_category,category
i64,f64,f64,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,…,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
23656136,49.858471,6.3649698,,,,,,,,,,,,"""Waldsportplatz…",,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"""fitness_statio…","""leisure"""
26860223,49.743478,6.0898523,,,,,,,,,,,,"""Restaurant Cam…",,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"""restaurant""","""amenity"""
30432808,50.856197,5.8256972,,,,,,,,,,,,"""Steenkolenmijn…",,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"""attraction""","""tourism"""
31425173,50.844029,5.6890789,,,,,,,,,,,,"""Nieuwenhofpoor…",,,,,,,,"""Q19630871""","""Category:Nieuw…","""nl:Nieuwenhofp…",,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"""city_gate""","""historic"""
31575884,50.844551,5.6902818,,,,,,,,,,,,"""Universiteitsb…",,,,,,,,"""Q15734302""",,"""nl:Universitei…","""https://librar…",,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"""library""","""amenity"""


In [31]:
# Build ONE text per POI. encode() takes a list of strings; when an item is
# itself a list it is handled as a text pair and only its first two entries
# (here the lat/lon strings) are tokenized, so category, sub_category and name
# never reached the model.
# NOTE(review): behaviour taken from sentence-transformers' Transformer.tokenize;
# confirm against the installed version.
# Fields that are None are skipped so they cannot break the join.
poi_texts = [
    " ".join(str(part) for part in convert(row) if part is not None)
    for row in df.iter_rows(named=True)
]
vectors = encoder.encode(
    poi_texts,
    batch_size=256,
    show_progress_bar=True,
)

Batches:   0%|          | 0/248 [00:00<?, ?it/s]

In [32]:
# Sanity check on the embedding matrix returned by encoder.encode above.
vectors.shape

(63330, 384)

In [33]:
# Persist the embeddings to poi_vectors.npy (relative to the working directory)
# as a plain array file; pickling is explicitly disabled.
np.save("poi_vectors.npy", vectors, allow_pickle=False)

In [34]:
# Qdrant client in local ":memory:" mode. NOTE(review): per the qdrant-client docs
# nothing is persisted in this mode, so the collection is rebuilt on every kernel restart.
qdrant = QdrantClient(":memory:")

In [35]:
# Describe the vector space first: the dimension is taken from the encoder in
# use, and similarity is measured with cosine distance.
poi_vectors_config = models.VectorParams(
    size=encoder.get_sentence_embedding_dimension(),
    distance=models.Distance.COSINE,
)
# (Re)create the "poi" collection with that configuration.
# NOTE(review): recreate_collection appears to be deprecated in newer
# qdrant-client releases; confirm before upgrading.
qdrant.recreate_collection(collection_name="poi", vectors_config=poi_vectors_config)

True

In [36]:
def add_loc(row):
    """Return a copy of ``row`` extended with a ``location`` geo point.

    The ``location`` entry is a ``{"lat": ..., "lon": ...}`` dict built from the
    row's own ``lat`` and ``lon`` values; it is the payload key that the
    geo-radius filter in ``NeuralSearcher.search`` matches on.

    The input mapping is left untouched (the previous version wrote the new key
    into the caller's dict), so the function can be re-run on the same rows
    without side effects.

    Args:
        row: Mapping with at least ``lat`` and ``lon`` keys.

    Returns:
        A new dict with every item of ``row`` plus the ``location`` entry.

    Raises:
        KeyError: If ``lat`` or ``lon`` is missing from ``row``.
    """
    return {**row, "location": {"lat": row["lat"], "lon": row["lon"]}}

In [37]:
# Assemble point ids and payloads up front, then push everything to the "poi"
# collection in batches of 256. Each payload is the full row plus the
# `location` entry added by add_loc.
poi_ids = df["id"].to_list()
poi_payloads = [add_loc(record) for record in df.to_dicts()]
qdrant.upload_collection(
    collection_name="poi",
    vectors=vectors,
    payload=poi_payloads,
    ids=poi_ids,
    batch_size=256,
)

In [38]:
class NeuralSearcher:
    """Recommendation-style POI search on top of a Qdrant collection.

    ``search`` asks Qdrant for points that resemble a set of positive example
    ids and differ from a set of negative example ids, restricted to a circle
    around a given coordinate.
    """

    def __init__(self, collection_name, client=None, device=None):
        """Bind the searcher to a collection.

        Args:
            collection_name: Name of the Qdrant collection to query.
            client: Qdrant client to use. Defaults to the notebook-level
                ``qdrant`` client, as before.
            device: Device passed to ``SentenceTransformer``. The default
                ``None`` leaves the choice to sentence-transformers instead of
                forcing ``"cuda"``, which failed on machines without a GPU and
                disagreed with the CPU encoder used to build the vectors.
                NOTE(review): auto-selection on ``None`` is per the
                sentence-transformers docs; confirm for the installed version.
        """
        self.collection_name = collection_name
        # Encoder model. `search` works on stored point ids and does not encode
        # anything itself; the attribute is kept for code that reads `self.model`.
        self.model = SentenceTransformer("all-MiniLM-L6-v2", device=device)
        # Qdrant client used for all queries.
        self.qdrant_client = qdrant if client is None else client

    def search(self, pos, neg, around, radius=1000.0, limit=5):
        """Recommend POIs near ``around`` based on example point ids.

        Args:
            pos: Ids of points the results should be similar to.
            neg: Ids of points the results should be dissimilar to.
            around: ``[lat, lon]`` centre of the search circle.
            radius: Radius of the search circle, passed to ``models.GeoRadius``
                (presumably metres, per Qdrant's geo-radius filter; confirm).
                Defaults to the previously hard-coded ``1000.0``.
            limit: Maximum number of hits to return. Defaults to the previously
                hard-coded ``5``.

        Returns:
            A list with the stored payload of each hit, in the order Qdrant
            returned them.
        """
        lat, lon = around[0], around[1]
        # Only points whose `location` payload lies inside the circle qualify.
        within_radius = models.FieldCondition(
            key="location",
            geo_radius=models.GeoRadius(
                center=models.GeoPoint(lon=lon, lat=lat),
                radius=radius,
            ),
        )
        # Recommend from the example ids; their vectors are combined with the
        # AVERAGE_VECTOR strategy.
        search_result = self.qdrant_client.recommend(
            collection_name=self.collection_name,
            positive=pos,
            negative=neg,
            strategy=models.RecommendStrategy.AVERAGE_VECTOR,
            query_filter=models.Filter(must=[within_radius]),
            limit=limit,
        )
        # Hits carry ids, scores and payloads; callers only need the payloads.
        return [hit.payload for hit in search_result]

In [39]:
# Searcher bound to the "poi" collection populated above.
neural_searcher = NeuralSearcher(collection_name="poi")

In [40]:
# Ids of the POIs the recommendations should resemble; the lookup below shows
# which places they are.
pos = [409005527, 488390305, 491571270, 490992842]
df.filter(pl.col("id").is_in(pos)).select(["id", "name", "category", "sub_category"])

id,name,category,sub_category
i64,str,str,str
409005527,"""Le Gros Vélo""","""amenity""","""cafe"""
488390305,"""La croix St-Cl…","""historic""","""wayside_cross"""
490992842,"""La Roche qui T…","""tourism""","""attraction"""
491571270,"""Château du Lav…","""historic""","""castle"""


In [41]:
# Ids of the POIs the recommendations should steer away from; the lookup below
# shows which places they are.
neg = [494911197, 498293266, 494911196, 516617417]
df.filter(pl.col("id").is_in(neg)).select(["id", "name", "category", "sub_category"])

id,name,category,sub_category
i64,str,str,str
494911196,"""FrietJess""","""amenity""","""fast_food"""
494911197,"""Pizza Plaza""","""amenity""","""restaurant"""
498293266,"""Frit city juni…","""amenity""","""fast_food"""
516617417,"""Pizza Hut""","""amenity""","""restaurant"""


In [42]:
# Recommend POIs similar to `pos` and unlike `neg` around the given [lat, lon] point.
items = neural_searcher.search(
    pos=pos,
    neg=neg,
    around=[50.467388, 4.871985],
)
# The summary rows used to be a bare mid-cell expression, which Jupyter never
# renders (only a cell's last expression is shown), so display them explicitly.
summary_fields = ("id", "name", "lat", "lon", "category", "sub_category")
display([[item[field] for field in summary_fields] for item in items])
len(items)

5

In [44]:
import folium

# Without at least one hit there is nothing to centre the map on; fail with a
# clear message instead of an IndexError on items[0].
if not items:
    raise ValueError("search returned no items; nothing to plot")

# create a map centered on the first item's location
m = folium.Map(location=[items[0]["lat"], items[0]["lon"]], zoom_start=15)

# add a marker for each item's location
for item in items:
    # A POI whose name is None would make `str + name` raise TypeError,
    # so fall back to showing the id alone.
    if item["name"] is None:
        label = str(item["id"])
    else:
        label = str(item["id"]) + " " + item["name"]
    folium.Marker(location=[item["lat"], item["lon"]], popup=label).add_to(m)

# display the map
m