# Hybrid search for the metadata catalogue of Berlin

In this notebook we prepare the data for our search application and set up the search index.

- We load the data from the API.
- We prepare the data and lemmatize the text data (for lexical search).
- We embed via the OpenAI embedding API.
- We test the embeddings with cosine similarity.
- We set up the Weaviate index.
- We create a collection with our data.
- We test the index with lexical search, vector search, and the combination of both – hybrid search. The latter is what we use in the app.


# Imports


In [1]:
import pandas as pd
from pandarallel import pandarallel
import numpy as np

# Notebook-wide pandas settings: silence the chained-assignment warning and
# allow up to 500 rows / sequence items to be displayed.
pd.options.mode.chained_assignment = None
pd.options.display.max_rows = 500
pd.options.display.max_seq_items = 500
pandarallel.initialize(progress_bar=False)

# NOTE(review): `import time` on the next-but-one line rebinds the name `time`
# to the module, so `from time import time` has no lasting effect.
from time import time
import time
import os
import re
import requests
from dotenv import load_dotenv

import pyarrow.parquet as pq

import warnings
from bs4 import MarkupResemblesLocatorWarning
from sklearn.metrics.pairwise import cosine_similarity
import weaviate
from weaviate.classes.config import Property, DataType
import weaviate.classes as wvc
import weaviate.classes.config as wc

import spacy
from openai import OpenAI
import json

# The first filter ignores every warning, which makes the category-specific
# filter below redundant.
warnings.filterwarnings("ignore")
warnings.filterwarnings("ignore", category=MarkupResemblesLocatorWarning)

# Load variables from a .env file into the environment and build the OpenAI
# client from OPENAI_API_KEY (the key is never hard-coded in the notebook).
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
client_openai = OpenAI(api_key=OPENAI_API_KEY)

INFO: Pandarallel will run on 2 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


**Constants**


In [2]:
# Local data files: raw datasets and the datasets enriched with embeddings.
DATA_FOLDER = "_data/"
DATASETS = DATA_FOLDER + "01_data.parq"
DATA_WITH_EMBEDDINGS = DATA_FOLDER + "02_data_embedded.parq"

# NOTE(review): the two names below look swapped. BASELINK_API holds the
# human-facing portal URL, while BASELINK_DATASHOP holds an API action URL.
# The "dataset links" remark presumably belongs to the portal URL — confirm
# against the code that builds the `link` column.
BASELINK_API = "https://daten.berlin.de/datensaetze/"

# Dataset links are composed of this baselink and the identifier for each dataset.
BASELINK_DATASHOP = (
    "https://datenregister.berlin.de/api/3/action/current_package_list_with_resources"
)

# Implement search


Now we create the actual search index for lexical and semantic search.

Note that [Weaviate's](https://weaviate.io/developers/weaviate) default location for the index is `~/.local/share/weaviate`. If you want to use the app on a different machine, simply copy the files from this location to the same location on the other machine. You can also set a specific path with the parameter `persistence_data_path` – see the [documentation here](https://weaviate.io/developers/weaviate/installation/embedded).


In [3]:
# Load the prepared datasets; the cells below read the text columns, `is_study`
# and the `embedding_openai` vector column from this frame.
df = pd.read_parquet(DATA_WITH_EMBEDDINGS)

In [4]:
# Start (or attach to) an embedded Weaviate instance.
# As written, this cell can only be executed once per kernel session; restart
# the kernel before running it again.
# More info regarding the Embedded Weaviate client:
# https://weaviate.io/developers/weaviate/installation/embedded
# By default the index data is stored in ~/.local/share/weaviate/
# NOTE(review): no `client.close()` is visible in this part of the notebook —
# consider closing the client in a final cell.

client = weaviate.connect_to_embedded()

# # Alternative: attach to a Weaviate instance that is already running on this machine
# # (another application, or another notebook/Streamlit app that started the embedded
# # instance). Two applications cannot both start Weaviate on the same port.
# client = weaviate.connect_to_local(port=8079, grpc_port=50050)

{"action":"startup","default_vectorizer_module":"none","level":"info","msg":"the default vectorizer modules is set to \"none\", as a result all new schema classes without an explicit vectorizer setting, will use this vectorizer","time":"2024-08-18T15:55:54+02:00"}
{"action":"startup","auto_schema_enabled":true,"level":"info","msg":"auto schema enabled setting is set to \"true\"","time":"2024-08-18T15:55:54+02:00"}
{"level":"info","msg":"No resource limits set, weaviate will use all available memory and CPU. To limit resources, set LIMIT_RESOURCES=true","time":"2024-08-18T15:55:54+02:00"}
{"level":"info","msg":"module offload-s3 is enabled","time":"2024-08-18T15:55:54+02:00"}
{"level":"info","msg":"open cluster service","servers":{"Embedded_at_8079":34201},"time":"2024-08-18T15:55:54+02:00"}
{"address":"192.168.178.36:34202","level":"info","msg":"starting cloud rpc server ...","time":"2024-08-18T15:55:54+02:00"}
{"level":"info","msg":"starting raft sub-system ...","time":"2024-08-18T15:

In [5]:
# Show the instance's meta information (hostname, version, available modules).
meta = client.get_meta()
display(meta)

# Two boolean probes, printed in this order:
#   is_live  - basic health check, the instance is up
#   is_ready - the instance is ready to handle requests
for probe in (client.is_live, client.is_ready):
    print(probe())

{"action":"hnsw_prefill_cache_async","level":"info","msg":"not waiting for vector cache prefill, running in background","time":"2024-08-18T15:55:57+02:00","wait_for_cache_prefill":false}
{"level":"info","msg":"Completed loading shard mdv_ctHqTXnNa0cS in 68.838086ms","time":"2024-08-18T15:55:57+02:00"}


{'hostname': 'http://127.0.0.1:8079',
 'modules': {'generative-openai': {'documentationHref': 'https://platform.openai.com/docs/api-reference/completions',
   'name': 'Generative Search - OpenAI'},
  'qna-openai': {'documentationHref': 'https://platform.openai.com/docs/api-reference/completions',
   'name': 'OpenAI Question & Answering Module'},
  'ref2vec-centroid': {},
  'reranker-cohere': {'documentationHref': 'https://txt.cohere.com/rerank/',
   'name': 'Reranker - Cohere'},
  'text2vec-cohere': {'documentationHref': 'https://docs.cohere.ai/embedding-wiki/',
   'name': 'Cohere Module'},
  'text2vec-huggingface': {'documentationHref': 'https://huggingface.co/docs/api-inference/detailed_parameters#feature-extraction-task',
   'name': 'Hugging Face Module'},
  'text2vec-openai': {'documentationHref': 'https://platform.openai.com/docs/guides/embeddings/what-are-embeddings',
   'name': 'OpenAI Module'}},
 'version': '1.26.1'}

True
True


In [6]:
# Reset: delete the "MDV" collection so the following cells can re-create it
# and ingest the data into an empty collection.
client.collections.delete("MDV")

{"action":"hnsw_vector_cache_prefill","count":7000,"index_id":"main","level":"info","limit":1000000000000,"msg":"prefilled vector cache","time":"2024-08-18T15:55:57+02:00","took":323516344}


In [7]:
# A Weaviate "collection" is the schema that defines how objects are stored,
# organized and retrieved - comparable to a table in a relational database.
# Any number of collections can be defined. A collection:
# - groups related objects that share one schema
# - stores vector representations for semantic search
# - is defined by
#   * properties (fields) with data types and index settings
#   * the vector index configuration (e.g., HNSW)
#   * sharding and replication settings
# - supports traditional filtering as well as vector-based operations
# Here we define the 'MDV' collection. We bring our own vectors, so no
# vectorizer module is configured.

# All text fields, in schema order; `is_study` is the only non-text field.
TEXT_PROPERTY_NAMES = (
    "identifier",
    "link",
    "title",
    "title_lemma",
    "description",
    "description_lemma",
    "keyword",
    "distribution",
    "distribution_lemma",
)

mdv_properties = [
    Property(name=property_name, data_type=DataType.TEXT)
    for property_name in TEXT_PROPERTY_NAMES
]
mdv_properties.append(Property(name="is_study", data_type=DataType.BOOL))

client.collections.create(
    "MDV",
    vectorizer_config=wc.Configure.Vectorizer.none(),
    inverted_index_config=wc.Configure.inverted_index(
        bm25_b=0.75,  # default 0.75
        bm25_k1=1.2,  # default 1.2
        stopwords_additions=None,
        stopwords_preset=None,
        stopwords_removals=None,
    ),
    properties=mdv_properties,
)

<weaviate.collections.collection.sync.Collection at 0x759a276d2ec0>

```python
MDV
{
    'MDV': _CollectionConfig(
        name='MDV',
        description=None,
        generative_config=None,
        inverted_index_config=_InvertedIndexConfig(
            bm25=_BM25Config(b=0.75, k1=1.2),
            cleanup_interval_seconds=60,
            index_null_state=False,
            index_property_length=False,
            index_timestamps=False,
            stopwords=_StopwordsConfig(
                preset=<StopwordsPreset.EN: 'en'>,
                additions=None,
                removals=None
            )
        ),
        multi_tenancy_config=_MultiTenancyConfig(
            enabled=False,
            auto_tenant_creation=False,
            auto_tenant_activation=False
        ),
        properties=[
            _Property(
                name='identifier',
                description=None,
                data_type=<DataType.TEXT: 'text'>,
                index_filterable=True,
                index_range_filters=False,
                index_searchable=True,
                nested_properties=None,
                tokenization=<Tokenization.WORD: 'word'>,
                vectorizer_config=None,
                vectorizer='none'
            ),
            _Property(
                name='link',
                description=None,
                data_type=<DataType.TEXT: 'text'>,
                index_filterable=True,
                index_range_filters=False,
                index_searchable=True,
                nested_properties=None,
                tokenization=<Tokenization.WORD: 'word'>,
                vectorizer_config=None,
                vectorizer='none'
            ),
            _Property(
                name='title',
                description=None,
                data_type=<DataType.TEXT: 'text'>,
                index_filterable=True,
                index_range_filters=False,
                index_searchable=True,
                nested_properties=None,
                tokenization=<Tokenization.WORD: 'word'>,
                vectorizer_config=None,
                vectorizer='none'
            ),
            _Property(
                name='title_lemma',
                description=None,
                data_type=<DataType.TEXT: 'text'>,
                index_filterable=True,
                index_range_filters=False,
                index_searchable=True,
                nested_properties=None,
                tokenization=<Tokenization.WORD: 'word'>,
                vectorizer_config=None,
                vectorizer='none'
            ),
            _Property(
                name='description',
                description=None,
                data_type=<DataType.TEXT: 'text'>,
                index_filterable=True,
                index_range_filters=False,
                index_searchable=True,
                nested_properties=None,
                tokenization=<Tokenization.WORD: 'word'>,
                vectorizer_config=None,
                vectorizer='none'
            ),
            _Property(
                name='description_lemma',
                description=None,
                data_type=<DataType.TEXT: 'text'>,
                index_filterable=True,
                index_range_filters=False,
                index_searchable=True,
                nested_properties=None,
                tokenization=<Tokenization.WORD: 'word'>,
                vectorizer_config=None,
                vectorizer='none'
            ),
            _Property(
                name='keyword',
                description=None,
                data_type=<DataType.TEXT: 'text'>,
                index_filterable=True,
                index_range_filters=False,
                index_searchable=True,
                nested_properties=None,
                tokenization=<Tokenization.WORD: 'word'>,
                vectorizer_config=None,
                vectorizer='none'
            ),
            _Property(
                name='distribution',
                description=None,
                data_type=<DataType.TEXT: 'text'>,
                index_filterable=True,
                index_range_filters=False,
                index_searchable=True,
                nested_properties=None,
                tokenization=<Tokenization.WORD: 'word'>,
                vectorizer_config=None,
                vectorizer='none'
            ),
            _Property(
                name='distribution_lemma',
                description=None,
                data_type=<DataType.TEXT: 'text'>,
                index_filterable=True,
                index_range_filters=False,
                index_searchable=True,
                nested_properties=None,
                tokenization=<Tokenization.WORD: 'word'>,
                vectorizer_config=None,
                vectorizer='none'
            ),
            _Property(
                name='is_study',
                description=None,
                data_type=<DataType.BOOL: 'boolean'>,
                index_filterable=True,
                index_range_filters=False,
                index_searchable=False,
                nested_properties=None,
                tokenization=None,
                vectorizer_config=None,
                vectorizer='none'
            )
        ],
        references=[],
        replication_config=_ReplicationConfig(
            factor=1,
            async_enabled=False
        ),
        reranker_config=None,
        sharding_config=_ShardingConfig(
            virtual_per_physical=128,
            desired_count=1,
            actual_count=1,
            desired_virtual_count=128,
            actual_virtual_count=128,
            key='_id',
            strategy='hash',
            function='murmur3'
        ),
        vector_index_config=_VectorIndexConfigHNSW(
            quantizer=None,
            cleanup_interval_seconds=300,
            distance_metric=<VectorDistances.COSINE: 'cosine'>,
            dynamic_ef_min=100,
            dynamic_ef_max=500,
            dynamic_ef_factor=8,
            ef=-1,
            ef_construction=128,
            flat_search_cutoff=40000,
            max_connections=32,
            skip=False,
            vector_cache_max_objects=1000000000000
        ),
        vector_index_type=<VectorIndexType.HNSW: 'hnsw'>,
        vectorizer_config=None,
        vectorizer=<Vectorizers.NONE: 'none'>,
        vector_config=None
    )
}
```


In [19]:
# 'MDV' is the name of a single collection: a group of related objects that
# share one schema, similar to a table in a relational database or a
# collection in a document-oriented database.

# Print the name of every collection known to this instance.
for collection_config in client.collections.list_all().values():
    print(collection_config.name)

# Fetch and print the detailed (non-simplified) configuration of all collections.
schema = client.collections.list_all(simple=False)
print(schema)

MDV
{'MDV': _CollectionConfig(name='MDV', description=None, generative_config=None, inverted_index_config=_InvertedIndexConfig(bm25=_BM25Config(b=0.75, k1=1.2), cleanup_interval_seconds=60, index_null_state=False, index_property_length=False, index_timestamps=False, stopwords=_StopwordsConfig(preset=<StopwordsPreset.EN: 'en'>, additions=None, removals=None)), multi_tenancy_config=_MultiTenancyConfig(enabled=False, auto_tenant_creation=False, auto_tenant_activation=False), properties=[_Property(name='identifier', description=None, data_type=<DataType.TEXT: 'text'>, index_filterable=True, index_range_filters=False, index_searchable=True, nested_properties=None, tokenization=<Tokenization.WORD: 'word'>, vectorizer_config=None, vectorizer='none'), _Property(name='link', description=None, data_type=<DataType.TEXT: 'text'>, index_filterable=True, index_range_filters=False, index_searchable=True, nested_properties=None, tokenization=<Tokenization.WORD: 'word'>, vectorizer_config=None, vectori

In [9]:
# Ingest every row of `df` into the collection as one object ("item").
# Each object carries the text/boolean properties of the row and the
# pre-computed `embedding_openai` vector.

collection = client.collections.get("MDV")

# DataFrame columns that are copied 1:1 into object properties; the names
# match the properties declared when the collection was created.
PROPERTY_COLUMNS = [
    "identifier",
    "link",
    "title",
    "title_lemma",
    "description",
    "description_lemma",
    "keyword",
    "distribution",
    "distribution_lemma",
    "is_study",
]

with collection.batch.dynamic() as batch:
    for data in df.to_dict(orient="records"):
        properties = {column: data[column] for column in PROPERTY_COLUMNS}
        batch.add_object(properties=properties, vector=data["embedding_openai"])

# Objects that could not be imported are collected on the batch rather than
# raised inside the loop. Surface them here instead of failing later with a
# bare count mismatch.
failed_objects = collection.batch.failed_objects
if failed_objects:
    raise RuntimeError(
        f"{len(failed_objects)} of {len(df)} objects could not be imported. "
        f"First failure: {failed_objects[0]}"
    )

{"action":"hnsw_prefill_cache_async","level":"info","msg":"not waiting for vector cache prefill, running in background","time":"2024-08-18T15:55:58+02:00","wait_for_cache_prefill":false}
{"level":"info","msg":"Created shard mdv_rcw3bLauOVXL in 13.907936ms","time":"2024-08-18T15:55:58+02:00"}
{"action":"hnsw_vector_cache_prefill","count":1000,"index_id":"main","level":"info","limit":1000000000000,"msg":"prefilled vector cache","time":"2024-08-18T15:55:58+02:00","took":268696}


In [10]:
# Peek at a single stored object: take the first one the collection iterator
# yields and print it (nothing is printed for an empty collection).
collection = client.collections.get("MDV")
first_item = next(iter(collection.iterator()), None)
if first_item is not None:
    print(first_item)

Object(uuid=_WeaviateUUIDInt('0001dc52-93dc-4144-8fb7-92e898a3e1f2'), metadata=MetadataReturn(creation_time=None, last_update_time=None, distance=None, certainty=None, score=None, explain_score=None, is_consistent=None, rerank_score=None), properties={'description': "Sachdaten des Gesundheits- und Sozialindex (GESIx) des Gesundheits- und Sozialstrukturatlas Berlin 2022 auf der Ebene der Planungsräume. Datengrundlage bilden 20 Indikatoren, die überwiegend auf amtlichen Statistiken beruhen, aus den Dimensionen Erwerbsleben, soziale Lage und Gesundheit. Für diese drei Dimensionen werden Subindizes berechnet, die zu einem Index - GESIx - zusammengeführt werden.\n\n![Vorschaugrafik zu Datensatz 'Gesundheits- und Sozialstrukturatlas: Gesundheits- und Sozialindex 2022 (GESIx)'](https://fbinter.stadt-berlin.de/fb_daten/vorschau/sachdaten/svor_default.gif)", 'identifier': '93c2cf18-77ad-40ff-b83a-c7ca30f4e129', 'is_study': False, 'link': 'https://datenregister.berlin.de/api/3/action/current_pac

In [11]:
# Count the objects stored in the collection and compare with the DataFrame.
collection = client.collections.get("MDV")
total_count = collection.aggregate.over_all(total_count=True).total_count
expected_count = len(df)

# Every DataFrame row must have become exactly one object.
# If the assertion fails, try to re-run the previous cells.
assert total_count == expected_count

print(f"The collection contains {total_count} items.")

The collection contains 3311 items.


## Lexical search (using BM25)


[Weaviate documentation for lexical search](https://weaviate.io/developers/weaviate/search/bm25)


In [12]:
# Handle to the "MDV" collection; the BM25 query below runs against it.
collection = client.collections.get("MDV")

In [13]:
# Fields that BM25 searches, in query order. `identifier` and `link` are left
# out on purpose.
_SEARCH_FIELDS = (
    "title",
    "description",
    "title_lemma",
    "description_lemma",
    "keyword",
    "distribution",
    "distribution_lemma",
)

# Per-field boost factors (Weaviate syntax: "<field>^<weight>"). The title
# fields count twice as much as the other fields.
_FIELD_BOOSTS = {"title": 2, "title_lemma": 2}

query_properties = [
    f"{field}^{_FIELD_BOOSTS[field]}" if field in _FIELD_BOOSTS else field
    for field in _SEARCH_FIELDS
]

In [14]:
# German spaCy pipeline used for lemmatization. The small model is loaded with
# the `ner` and `parser` components disabled; the large model is kept below as
# a commented-out alternative.
# nlp = spacy.load("de_core_news_lg")
nlp = spacy.load("de_core_news_sm", disable=["ner", "parser"])

# Despite its name, this pattern matches every character that is NOT an ASCII
# letter, a German umlaut (ä/ö/ü/Ä/Ö/Ü), a digit or a dot.
# NOTE(review): "ß" is not part of the character class, so it is matched too
# (e.g. "Straße" -> "Stra e" after substitution). Confirm whether this is
# intended, and keep it consistent with how the indexed lemma fields were built.
LETTERS_AND_DIGITS = re.compile(r"[^a-zäüöA-ZÜÄÖ0-9.]")
# Any run of whitespace.
MULTIPLE_SPACES = re.compile(r"\s+")


def prepare_for_lexical_search(text, lower=False, remove_umlauts=False):
    """Lemmatize text and normalize it for lexical (BM25) search.

    Steps, in order:
      1. Run the module-level spaCy pipeline `nlp` and replace every purely
         alphabetic token by its lemma (other tokens keep their text).
      2. Replace every character matched by `LETTERS_AND_DIGITS` with a space.
      3. Collapse whitespace runs into a single space.
      4. Optionally lower-case the text.
      5. Optionally transliterate umlauts (ä->ae, ö->oe, ü->ue and the
         upper-case forms Ä->Ae, Ö->Oe, Ü->Ue).

    Args:
        text (str): Text to process.

    Keyword Arguments:
        lower (bool): Lower case text (default: {False}).
        remove_umlauts (bool): Replace umlauts in text (default: {False}).

    Returns:
        str: Lemmatized text, optionally lower cased, and without umlauts.
    """
    doc = nlp(text)
    lemmatized = " ".join(
        token.lemma_ if token.is_alpha else token.text for token in doc
    )
    cleaned = LETTERS_AND_DIGITS.sub(" ", lemmatized)
    cleaned = MULTIPLE_SPACES.sub(" ", cleaned)
    if lower:
        cleaned = cleaned.lower()
    if remove_umlauts:
        # Upper-case umlauts are still present when `lower` is False, so they
        # are transliterated as well.
        umlaut_replacements = (
            ("ä", "ae"),
            ("ö", "oe"),
            ("ü", "ue"),
            ("Ä", "Ae"),
            ("Ö", "Oe"),
            ("Ü", "Ue"),
        )
        for umlaut, replacement in umlaut_replacements:
            cleaned = cleaned.replace(umlaut, replacement)
    return cleaned

In [15]:
# Lemmatize the raw user query the same way as the indexed `*_lemma` fields,
# then show the result.
raw_query = "Löhne in Berlin"
query = prepare_for_lexical_search(raw_query)
query

'Lohn in Berlin'

In [16]:
# Run a BM25 (lexical) query over the weighted fields in `query_properties`.
# Parameter reference: https://weaviate.io/developers/weaviate/search/bm25
# (`auto_limit` is another available option, not used here.)
score_metadata = wvc.query.MetadataQuery(score=True, explain_score=True)

response = collection.query.bm25(
    query=query,
    query_properties=query_properties,
    limit=20,
    offset=0,
    return_metadata=score_metadata,
)

# Print the titles only. The description (hit.properties["description"]) and
# the requested metadata (hit.metadata.score, hit.metadata.explain_score) are
# available on each hit as well.
for hit in response.objects:
    print(hit.properties["title"])

Laufende Steuereinnahmen des Landes Berlin
Laufende Steuereinnahmen des Landes Berlin
Breitbandversorgung Berlin 2017-2018 Privatverfügbarkeiten
Senatsvorlagen der Senatsverwaltung für Finanzen
Differenzbilanzierung 2016 (Berlin)
Differenzbilanzierung 2012 (Berlin)
Kitas in Berlin
Standardlastprofil Speicherheizung (Berlin)
Differenzbilanzierung 2013 (Berlin)
Differenzbilanzierung 2011 (Berlin)
Differenzbilanzierung 2015 (Berlin)
Differenzbilanzierung 2014 (Berlin)
Radzähldaten in Berlin
Los_9_2024 (Berlin)
Standardlastprofil Bandlastkunden 2024 (Berlin)
Los_6_2024 (Berlin)
Los_7_2024 (Berlin)
Los_3_2025 (Berlin)
Los_4_2024 (Berlin)
Los_2_2024 (Berlin)


## Semantic search


[Weaviate documentation for semantic search](https://weaviate.io/developers/weaviate/search/similarity)


In [24]:
from typing import List
from openai import OpenAI
import weaviate
import os

# Initialize the OpenAI client from the OPENAI_API_KEY environment variable.
# NOTE(review): the imports cell at the top of the notebook already creates an
# equivalent client named `client_openai`; consider reusing it.
openai_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))


def embed_with_openai(
    texts: List[str], model: str = "text-embedding-ada-002"
) -> List[List[float]]:
    """Generate embeddings for the given texts using OpenAI's embeddings API.

    Args:
        texts (List[str]): Texts to embed.
        model (str): Name of the OpenAI embedding model.
            NOTE(review): query vectors are only comparable with the stored
            document vectors if both were created with the same model. Verify
            that this default matches the model used for `embedding_openai`.

    Returns:
        List[List[float]]: One embedding per input text, in the order in which
        the API returns them. An empty input yields an empty list without
        calling the API.
    """
    if len(texts) == 0:
        return []
    response = openai_client.embeddings.create(input=texts, model=model)
    return [item.embedding for item in response.data]


def search_similar_documents(
    query: str, client: weaviate.Client, collection_name: str, n_results: int = 20
) -> List[dict]:
    """
    Search for documents similar to the query using Weaviate's near_vector search.

    The query is embedded with `embed_with_openai` and sent as a GraphQL `Get`
    request through the query-builder API of `weaviate.Client`.

    Note: a later cell of this notebook defines a function with the same name
    (the hybrid variant); once that cell has run, it replaces this definition.

    Args:
    query (str): The search query.
    client (weaviate.Client): Weaviate client instance.
    collection_name (str): Name of the Weaviate collection to search.
    n_results (int): Number of results to return.

    Returns:
    List[dict]: List of dictionaries containing title, distance, and certainty for each result.

    Raises:
    RuntimeError: If the GraphQL response contains an "errors" entry.
    """
    # Generate embedding for the query
    query_embedding = embed_with_openai([query])[0]

    # Perform near_vector search
    response = (
        client.query.get(collection_name, ["title"])
        .with_near_vector({"vector": query_embedding})
        .with_limit(n_results)
        .with_additional(["distance", "certainty"])
        .do()
    )

    # A failed GraphQL query is reported inside the response body. Raise a
    # readable error instead of failing on the missing "data" entry below.
    if "errors" in response:
        raise RuntimeError(
            f"Weaviate query on {collection_name!r} failed: {response['errors']}"
        )

    # Treat a null result list as "no hits".
    hits = response["data"]["Get"][collection_name] or []
    return [
        {
            "title": hit["title"],
            "distance": hit["_additional"]["distance"],
            "certainty": hit["_additional"]["certainty"],
        }
        for hit in hits
    ]


# Usage
def main():
    """Run a demo semantic search and print a ranked list of titles with distance and certainty."""
    # Address reported as `hostname` by the running embedded instance.
    rest_client = weaviate.Client("http://127.0.0.1:8079")
    hits = search_similar_documents("fahrräder in berlin", rest_client, "MDV", 20)

    # One numbered entry per hit, followed by a blank line.
    for rank, hit in enumerate(hits, start=1):
        print(f"{rank}. Title: {hit['title']}")
        print(
            f"   Distance: {hit['distance']:.4f}, Certainty: {hit['certainty']:.4f}",
            end="\n\n",
        )


# Run the demo when the cell is executed.
if __name__ == "__main__":
    main()

1. Title: Summenlast der Netzverluste 2009 (Berlin)
   Distance: 0.9334, Certainty: 0.5333

2. Title: Jahresabschluss 2015
   Distance: 0.9355, Certainty: 0.5322

3. Title: Summenlast der Netzverluste 2015 (Berlin)
   Distance: 0.9394, Certainty: 0.5303

4. Title: Bezirkshaushaltsplan Friedrichshain-Kreuzberg 2012/2013 fortgeschrieben mit Ergänzungsplan 2013
   Distance: 0.9397, Certainty: 0.5302

5. Title: Preisblatt Anschluss Niederspannung Berlin
   Distance: 0.9427, Certainty: 0.5287

6. Title: Reparaturführer Charlottenburg-Wilmersdorf
   Distance: 0.9430, Certainty: 0.5285

7. Title: Langjährige Entwicklung Luftqualität PM10-Emissionen Kfz-Verkehr NN 2009 (Umweltatlas) - [WMS]
   Distance: 0.9432, Certainty: 0.5284

8. Title: Finanzamt-Suche
   Distance: 0.9433, Certainty: 0.5284

9. Title: Langjährige Entwicklung Luftqualität PM10-Emissionen Kfz-Verkehr GN 2009 (Umweltatlas) - [WMS]
   Distance: 0.9439, Certainty: 0.5281

10. Title: Energetischer Sanierungsfahrplan
   Distance: 

In [28]:
# Usage
def main():
    """Run a demo search for "Straßen in Berlin" and print title, distance and certainty per hit."""
    rest_client = weaviate.Client("http://127.0.0.1:8079")
    hits = search_similar_documents("Straßen in Berlin", rest_client, "MDV", 20)

    # One line per hit; the three values are separated by single spaces.
    for hit in hits:
        print(*(hit[field] for field in ("title", "distance", "certainty")))


# Run the demo when the cell is executed.
if __name__ == "__main__":
    main()

Reparaturführer Charlottenburg-Wilmersdorf 0.93730605 0.5313469767570496
Summenlast der Netzverluste 2009 (Berlin) 0.9450978 0.5274510979652405
Jahresabschluss 2015 0.9493318 0.5253340899944305
Energetischer Sanierungsfahrplan 0.9515328 0.5242336094379425
Schulbausanierung Sommer 2022 0.9515732 0.5242134034633636
Summenlast der Netzverluste 2015 (Berlin) 0.951823 0.5240885019302368
Bezirkshaushaltsplan Friedrichshain-Kreuzberg 2012/2013 fortgeschrieben mit Ergänzungsplan 2013 0.95460397 0.5226980149745941
Summenlast der Netzverluste 2010 (Berlin) 0.95591736 0.5220413208007812
Summenlast der Netzverluste 2006 (Berlin) 0.9560822 0.5219588875770569
Summenlast der Netzverluste 2007 (Berlin) 0.9565257 0.5217371582984924
Jahresabschluss 2014 0.9570526 0.5214737057685852
14 Radrouten und Radverkehrsanlagen - GPS-Tracks für die Radrouten durch Berlin 0.95749694 0.5212515294551849
Finanzamt-Suche 0.95777977 0.5211101174354553
Summenlast der Netzverluste 2008 (Berlin) 0.9581924 0.520903795957565

## Hybrid search


[Weaviate documentation for hybrid search - the combination of lexical and vector search](https://weaviate.io/developers/weaviate/search/hybrid)


In [30]:
def search_similar_documents(
    query: str,
    client: weaviate.Client,
    collection_name: str,
    n_results: int = 20,
    alpha: float = 0.7,
    properties: "List[str] | None" = None,
) -> List[dict]:
    """
    Search for documents similar to the query using Weaviate's hybrid search.

    The raw query is embedded with `embed_with_openai` for the vector part.
    For the keyword (BM25) part it is lemmatized with
    `prepare_for_lexical_search`.

    Args:
    query (str): The search query.
    client (weaviate.Client): Weaviate client instance.
    collection_name (str): Name of the Weaviate collection to search.
    n_results (int): Number of results to return.
    alpha (float): Weighting between keyword and vector search that is passed
        to Weaviate (per the Weaviate docs 0 = pure keyword, 1 = pure vector).
    properties (List[str] | None): Optional properties for the keyword part,
        e.g. the weighted `query_properties` list of this notebook. When None,
        no restriction is sent and Weaviate's default applies.

    Returns:
    List[dict]: List of dictionaries containing title, score, distance, and certainty for each result.
        In the output recorded in this notebook, distance and certainty are None for hybrid queries.

    Raises:
    RuntimeError: If the GraphQL response contains an "errors" entry.
    """
    query_embedding = embed_with_openai([query])[0]
    prepared_query = prepare_for_lexical_search(query)

    # Only pass `properties` when the caller asked for it, so the request is
    # unchanged for existing callers.
    hybrid_kwargs = {"query": prepared_query, "vector": query_embedding, "alpha": alpha}
    if properties is not None:
        hybrid_kwargs["properties"] = properties

    response = (
        client.query.get(collection_name, ["title"])
        .with_hybrid(**hybrid_kwargs)
        .with_limit(n_results)
        .with_additional(["score", "distance", "certainty"])
        .do()
    )

    # A failed GraphQL query is reported inside the response body. Raise a
    # readable error instead of failing on the missing "data" entry below.
    if "errors" in response:
        raise RuntimeError(
            f"Weaviate query on {collection_name!r} failed: {response['errors']}"
        )

    # Treat a null result list as "no hits".
    hits = response["data"]["Get"][collection_name] or []
    return [
        {
            "title": hit["title"],
            "score": hit["_additional"]["score"],
            "distance": hit["_additional"].get("distance"),
            "certainty": hit["_additional"].get("certainty"),
        }
        for hit in hits
    ]


# Usage
def main():
    """Run a demo hybrid search for "Löhne in Berlin" and print title, score, distance and certainty per hit."""
    rest_client = weaviate.Client("http://127.0.0.1:8079")
    hits = search_similar_documents("Löhne in Berlin", rest_client, "MDV", 40)

    # One line per hit. "N/A" is only used if a key is missing from the result dict.
    for hit in hits:
        distance = hit.get("distance", "N/A")
        certainty = hit.get("certainty", "N/A")
        print(hit["title"], hit["score"], distance, certainty)


# Run the demo when the cell is executed.
if __name__ == "__main__":
    main()

Gesundheitsberichterstattung Berlin: Kosten -> Aufwendungen für Rehabilitation und Pflege 0.7 None None
Gesundheitsberichterstattung Berlin: Einrichtungen des Gesundheitswesens -> Stationäre/teilstationäre medizinische Einrichtungen -> Vorsorge- oder Rehabilitationseinrichtungen 0.589394 None None
Radzähldaten in Berlin 0.54055935 None None
Daten Ausfall Halbring für Energyhack 2015 0.5266549 None None
Gesundheitsberichterstattung Berlin: Kosten -> Krankheitskosten 0.51893425 None None
Förderungen / Finanzen 0.4898429 None None
Breitband-Ausbau der Berliner Schulen 0.4680342 None None
Gesundheitsberichterstattung Berlin: Inanspruchnahme von Leistungen der Gesundheitsförderung und der Gesundheitsversorgung -> Inanspruchnahme/Leistungen der Prävention, Gesundheitsförderung und Früherkennung von Krankheiten -> Kariesprävalenz und Kar... 0.4657743 None None
Gesundheitsberichterstattung Berlin: Kosten -> Kostenstruktur von Krankenhäusern 0.44931677 None None
Gesundheitsberichterstattung Ber