In [20]:
from haystack_integrations.components.retrievers.qdrant import QdrantHybridRetriever
from haystack_integrations.components.embedders.fastembed import FastembedTextEmbedder, FastembedSparseTextEmbedder
from haystack import Pipeline, Document
from haystack_integrations.document_stores.qdrant import QdrantDocumentStore
import json

#### Document Store

In [21]:
from haystack_integrations.document_stores.qdrant import QdrantDocumentStore

document_store = QdrantDocumentStore(
    url="http://localhost:6333",
    index="hybrid",
    recreate_index=True,
    embedding_dim=512,
    return_embedding=True,
    use_sparse_embeddings=True,
    sparse_idf=True
)

### Indexing

In [22]:
import json

In [23]:
with open("../docs_with_q_4o-mini.json", "rt") as f_in:
    ds_gpt = json.load(f_in)


len(ds_gpt)

661

In [24]:
ds_gpt[0]

{'source': 'https://www.reddit.com/r/germany/wiki/autobahn_safety',
 'content': 'The Autobahn is a [network of interstate highways in Germany](https://en.m.wikipedia.org/wiki/Autobahn#/media/File%3AAutobahnen_in_Deutschland.svg) with a total length of more than 8,000 miles. [65%](https://en.wikipedia.org/wiki/Autobahn#Speed_limits) of the Autobahn has no speed limit. How safe can that be?\nVehicles traveled 147 billion miles on the Autobahn in 2015. 322 people died = 2.19 deaths per billion miles.\nIn the US, vehicles travelled 757 billion miles on interstate highways. 3,837 people died = 5.07 deaths per billion miles.\nThat means: If you drive on the interstate, your likelihood to die is 131% higher than for the same distance on the Autobahn.\n*sources:*\nStatistisches Bundesamt: [Unfallentwicklung auf deutschen Stra√üen 2015](https://www.destatis.de/DE/PresseService/Presse/Pressekonferenzen/2016/Unfallentwicklung_2015/Pressebroschuere_unfallentwicklung.pdf?__blob=publicationFile)\nNa

In [25]:
docs = []

for doc in ds_gpt:
    docs.append([Document(content=doc["question"] + ' '+doc["content"], meta={"question": doc["question"], "content": doc["content"],
                "headline": doc["headline"], "source": doc["source"], "length": doc["length"], "id": doc["id"]})])

In [26]:
from haystack.components.writers import DocumentWriter
from haystack.document_stores.types import DuplicatePolicy
from haystack import Pipeline
from haystack_integrations.components.embedders.fastembed import FastembedSparseDocumentEmbedder, FastembedDocumentEmbedder

In [27]:
hybrid_indexing = Pipeline()
hybrid_indexing.add_component("sparse_doc_embedder", FastembedSparseDocumentEmbedder(
    model="Qdrant/bm25", meta_fields_to_embed=["source", "headline"]))
hybrid_indexing.add_component("dense_doc_embedder", FastembedDocumentEmbedder(
    model="jinaai/jina-embeddings-v2-small-en", meta_fields_to_embed=["source", "headline"]))
hybrid_indexing.add_component("writer", DocumentWriter(
    document_store=document_store, policy=DuplicatePolicy.OVERWRITE))

hybrid_indexing.connect("sparse_doc_embedder", "dense_doc_embedder")
hybrid_indexing.connect("dense_doc_embedder", "writer")

<haystack.core.pipeline.pipeline.Pipeline object at 0x7f9ef478f110>
üöÖ Components
  - sparse_doc_embedder: FastembedSparseDocumentEmbedder
  - dense_doc_embedder: FastembedDocumentEmbedder
  - writer: DocumentWriter
üõ§Ô∏è Connections
  - sparse_doc_embedder.documents -> dense_doc_embedder.documents (list[Document])
  - dense_doc_embedder.documents -> writer.documents (list[Document])

In [28]:
docs[0]

[Document(id=22af165baa4b0bd92e1a2d4c5309e8fdf30ec8231cf3434873d333255a66dd27, content: 'What are the safety statistics comparing the Autobahn to US interstate highways? The Autobahn is a [...', meta: {'question': 'What are the safety statistics comparing the Autobahn to US interstate highways?', 'content': 'The Autobahn is a [network of interstate highways in Germany](https://en.m.wikipedia.org/wiki/Autobahn#/media/File%3AAutobahnen_in_Deutschland.svg) with a total length of more than 8,000 miles. [65%](https://en.wikipedia.org/wiki/Autobahn#Speed_limits) of the Autobahn has no speed limit. How safe can that be?\nVehicles traveled 147 billion miles on the Autobahn in 2015. 322 people died = 2.19 deaths per billion miles.\nIn the US, vehicles travelled 757 billion miles on interstate highways. 3,837 people died = 5.07 deaths per billion miles.\nThat means: If you drive on the interstate, your likelihood to die is 131% higher than for the same distance on the Autobahn.\n*sources:*\nStat

In [29]:
from tqdm.auto import tqdm

In [30]:
for doc in tqdm(docs):
    hybrid_indexing.run({"documents": doc})

  0%|          | 0/661 [00:00<?, ?it/s]

Calculating sparse embeddings: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 842.06it/s]
Calculating embeddings: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00,  5.13it/s]
100it [00:00, 5073.55it/s]           
Calculating sparse embeddings: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 1510.92it/s]
Calculating embeddings: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00,  5.94it/s]
100it [00:00, 9385.75it/s]           
Calculating sparse embeddings: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 913.79it/s]
Calculating embeddings: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00,  4.65it/s]
100it [00:00, 15965.53it/s]          
Calculating sparse embeddings: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 1426.63it/s]
Calculating embeddings: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00,  4.22it/s]
100it [00:00, 16439.23it/s]          
Calculating sparse embeddings: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 2449.94i

In [12]:
docs[0][0].meta

{'question': 'What are the safety statistics comparing the Autobahn to US interstate highways?',
 'content': 'The Autobahn is a [network of interstate highways in Germany](https://en.m.wikipedia.org/wiki/Autobahn#/media/File%3AAutobahnen_in_Deutschland.svg) with a total length of more than 8,000 miles. [65%](https://en.wikipedia.org/wiki/Autobahn#Speed_limits) of the Autobahn has no speed limit. How safe can that be?\nVehicles traveled 147 billion miles on the Autobahn in 2015. 322 people died = 2.19 deaths per billion miles.\nIn the US, vehicles travelled 757 billion miles on interstate highways. 3,837 people died = 5.07 deaths per billion miles.\nThat means: If you drive on the interstate, your likelihood to die is 131% higher than for the same distance on the Autobahn.\n*sources:*\nStatistisches Bundesamt: [Unfallentwicklung auf deutschen Stra√üen 2015](https://www.destatis.de/DE/PresseService/Presse/Pressekonferenzen/2016/Unfallentwicklung_2015/Pressebroschuere_unfallentwicklung.pd

In [31]:
from haystack_integrations.components.retrievers.qdrant import QdrantHybridRetriever
from haystack_integrations.components.embedders.fastembed import FastembedTextEmbedder, FastembedSparseTextEmbedder

from haystack_integrations.components.retrievers.qdrant import QdrantHybridRetriever
from haystack_integrations.components.embedders.fastembed import FastembedTextEmbedder, FastembedSparseTextEmbedder
from haystack_integrations.components.rankers.fastembed import FastembedRanker


hybrid_query = Pipeline()
hybrid_query.add_component("sparse_text_embedder",FastembedSparseTextEmbedder(model="Qdrant/bm25"))
hybrid_query.add_component("dense_text_embedder", FastembedTextEmbedder(model="jinaai/jina-embeddings-v2-small-en"))
hybrid_query.add_component("retriever", QdrantHybridRetriever(document_store=document_store, top_k=5))

hybrid_query.connect("sparse_text_embedder.sparse_embedding","retriever.query_sparse_embedding")
hybrid_query.connect("dense_text_embedder.embedding","retriever.query_embedding")

<haystack.core.pipeline.pipeline.Pipeline object at 0x7f9ef45c2490>
üöÖ Components
  - sparse_text_embedder: FastembedSparseTextEmbedder
  - dense_text_embedder: FastembedTextEmbedder
  - retriever: QdrantHybridRetriever
üõ§Ô∏è Connections
  - sparse_text_embedder.sparse_embedding -> retriever.query_sparse_embedding (SparseEmbedding)
  - dense_text_embedder.embedding -> retriever.query_embedding (list[float])

In [11]:
question = "Is it possible to hitchhike through Germany?"

results = hybrid_query.run(
    {"dense_text_embedder": {"text": question},
     "sparse_text_embedder": {"text": question}})

Calculating embeddings: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 107.33it/s]
Calculating sparse embeddings: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 3572.66it/s]


In [15]:
results["retriever"]["documents"][2].meta["content"]

"This one is easy. If you're a German or EU citizen (i.e. you have a German or EU passport), then you can come to Germany whenever you like, stay as long as you like, and do whatever you like (work, study, or just sit by the sea watching the waves come in)."

In [16]:
import pandas as pd

In [17]:
qs = pd.read_csv("./gp4o-mini-questions.csv", sep=",")

In [18]:
q_list = qs["question"].to_list()
id_list = qs["content"].to_list()

In [19]:
import copy

grund_truth = [copy.deepcopy(item) for item in ds_gpt for _ in range(5)]

In [20]:
grund_truth_documents = []
for docs in grund_truth:
    grund_truth_documents.append([Document(content=docs["content"])])

In [21]:
retrieved_documents = []
for q in tqdm(q_list):
    results = hybrid_query.run(
        {"dense_text_embedder": {"text": q},
         "sparse_text_embedder": {"text": q}}
    )

    retrieved_documents.append([Document(content=results["retriever"]["documents"][0].meta["content"]),
                                Document(
                                    content=results["retriever"]["documents"][1].meta["content"]),
                                Document(
                                    content=results["retriever"]["documents"][2].meta["content"]),
                                Document(
                                    content=results["retriever"]["documents"][3].meta["content"]),
                                Document(content=results["retriever"]["documents"][4].meta["content"])])

  0%|          | 0/3305 [00:00<?, ?it/s]

Calculating embeddings: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 50.31it/s]
Calculating sparse embeddings: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 1331.95it/s]
Calculating embeddings: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 101.85it/s]
Calculating sparse embeddings: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 5482.75it/s]
Calculating embeddings: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 82.01it/s]
Calculating sparse embeddings: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 5706.54it/s]
Calculating embeddings: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 96.39it/s]
Calculating sparse embeddings: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 5722.11it/s]
Calculating embeddings: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 116.98it/s]
Calculating sparse embeddings: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 6393.76it/s]
Calculating embeddings: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà

In [22]:
retrieved_documents[0]

[Document(id=fc12cd232ca4a0d8446820546b3c02e2ce6b4a8b3a8d271f66ae599b7416188c, content: 'The Autobahn is a [network of interstate highways in Germany](https://en.m.wikipedia.org/wiki/Autoba...'),
 Document(id=dde240165a39044c4ca8a4a40a04a2f91b14c5c0e01bc4dfa37acf76b57e63ef, content: 'Enter your area code (PLZ) on [this website](http://www.verivox.de/internet-vergleich/internetundtel...'),
 Document(id=e1ff4a2ca7962cc5f90ee550dd56440832cd39697c9e4cb3545938c04980776d, content: '[Amanda:](https://web.archive.org/web/20160316041117/http://www.amiexpat.com/2009/08/20/more-real-ex...'),
 Document(id=78091b22119fd20c676a31a782300900f6b79839e87052fd2d0e17550973ff9e, content: 'While hitchhiking isn't that common any more, it should still be possible to do it. Hitchhike from A...'),
 Document(id=290e7940e47da8016a01597ea448473f6ffe668b61f8a68ccec4099bb0934b22, content: 'Also note that the law refers to the length of your stay in Germany, not the length of your stay at ...')]

In [23]:
from haystack import Pipeline
from haystack.components.evaluators import DocumentMRREvaluator, DocumentRecallEvaluator

In [24]:
eval_pipeline = Pipeline()
eval_pipeline.add_component("doc_mrr_evaluator", DocumentMRREvaluator())
eval_pipeline.add_component("doc_rec_evaluator", DocumentRecallEvaluator())

In [25]:
retrieved_documents[0]

[Document(id=fc12cd232ca4a0d8446820546b3c02e2ce6b4a8b3a8d271f66ae599b7416188c, content: 'The Autobahn is a [network of interstate highways in Germany](https://en.m.wikipedia.org/wiki/Autoba...'),
 Document(id=dde240165a39044c4ca8a4a40a04a2f91b14c5c0e01bc4dfa37acf76b57e63ef, content: 'Enter your area code (PLZ) on [this website](http://www.verivox.de/internet-vergleich/internetundtel...'),
 Document(id=e1ff4a2ca7962cc5f90ee550dd56440832cd39697c9e4cb3545938c04980776d, content: '[Amanda:](https://web.archive.org/web/20160316041117/http://www.amiexpat.com/2009/08/20/more-real-ex...'),
 Document(id=78091b22119fd20c676a31a782300900f6b79839e87052fd2d0e17550973ff9e, content: 'While hitchhiking isn't that common any more, it should still be possible to do it. Hitchhike from A...'),
 Document(id=290e7940e47da8016a01597ea448473f6ffe668b61f8a68ccec4099bb0934b22, content: 'Also note that the law refers to the length of your stay in Germany, not the length of your stay at ...')]

In [26]:
grund_truth_documents[0]

[Document(id=fc12cd232ca4a0d8446820546b3c02e2ce6b4a8b3a8d271f66ae599b7416188c, content: 'The Autobahn is a [network of interstate highways in Germany](https://en.m.wikipedia.org/wiki/Autoba...')]

#### Retrivel Eval

In [27]:
results = eval_pipeline.run(
    {
        "doc_mrr_evaluator": {
            "ground_truth_documents": grund_truth_documents,
            "retrieved_documents": retrieved_documents,
        },
        "doc_rec_evaluator": {
            "ground_truth_documents": grund_truth_documents,
            "retrieved_documents": retrieved_documents,
        },

    }
)

In [28]:
results["doc_rec_evaluator"]["score"], results["doc_mrr_evaluator"]["score"]

(0.9531013615733737, 0.8307513867876953)

(0.9521936459909228, 0.8366717095310136)




#### With Rank

In [9]:
from haystack_integrations.components.retrievers.qdrant import QdrantHybridRetriever
from haystack_integrations.components.embedders.fastembed import FastembedTextEmbedder, FastembedSparseTextEmbedder
from haystack import Pipeline, Document
from haystack_integrations.components.retrievers.qdrant import QdrantHybridRetriever
from haystack_integrations.components.embedders.fastembed import FastembedTextEmbedder, FastembedSparseTextEmbedder
from haystack_integrations.components.rankers.fastembed import FastembedRanker

hybrid_query = Pipeline()
hybrid_query.add_component("sparse_text_embedder",FastembedSparseTextEmbedder(model="Qdrant/bm25"))
hybrid_query.add_component("dense_text_embedder", FastembedTextEmbedder(model="jinaai/jina-embeddings-v2-small-en"))
hybrid_query.add_component("retriever", QdrantHybridRetriever(document_store=document_store, top_k=5))
hybrid_query.add_component("ranker", FastembedRanker(top_k=5))

hybrid_query.connect("sparse_text_embedder.sparse_embedding","retriever.query_sparse_embedding")
hybrid_query.connect("dense_text_embedder.embedding","retriever.query_embedding")
hybrid_query.connect("retriever.documents", "ranker.documents")

<haystack.core.pipeline.pipeline.Pipeline object at 0x7f90348987d0>
üöÖ Components
  - sparse_text_embedder: FastembedSparseTextEmbedder
  - dense_text_embedder: FastembedTextEmbedder
  - retriever: QdrantHybridRetriever
  - ranker: FastembedRanker
üõ§Ô∏è Connections
  - sparse_text_embedder.sparse_embedding -> retriever.query_sparse_embedding (SparseEmbedding)
  - dense_text_embedder.embedding -> retriever.query_embedding (list[float])
  - retriever.documents -> ranker.documents (list[Document])

##### Rag Pipeline

In [30]:
from tqdm.auto import tqdm

In [31]:
retrieved_documents = []
for q in tqdm(q_list):
    results = hybrid_query.run(
        {"dense_text_embedder": {"text": q},
         "sparse_text_embedder": {"text": q},
         "ranker": { "query" : q }})
    

    retrieved_documents.append([Document(content=results["ranker"]["documents"][0].meta["content"]),
                                Document(
                                    content=results["ranker"]["documents"][1].meta["content"]),
                                Document(
                                    content=results["ranker"]["documents"][2].meta["content"]),
                                Document(
                                    content=results["ranker"]["documents"][3].meta["content"]),
                                Document(content=results["ranker"]["documents"][4].meta["content"])])

  0%|          | 0/3305 [00:00<?, ?it/s]

Calculating embeddings: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 50.73it/s]
Calculating sparse embeddings: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 3930.93it/s]
Calculating embeddings: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 59.16it/s]
Calculating sparse embeddings: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 4534.38it/s]
Calculating embeddings: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 70.80it/s]
Calculating sparse embeddings: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 3675.99it/s]
Calculating embeddings: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 66.34it/s]
Calculating sparse embeddings: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 5899.16it/s]
Calculating embeddings: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 55.33it/s]
Calculating sparse embeddings: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 6413.31it/s]
Calculating embeddings: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñ

In [32]:
eval_pipeline = Pipeline()
eval_pipeline.add_component("doc_mrr_evaluator", DocumentMRREvaluator())
eval_pipeline.add_component("doc_rec_evaluator", DocumentRecallEvaluator())

In [33]:
results = eval_pipeline.run(
    {
        "doc_mrr_evaluator": {
            "ground_truth_documents": grund_truth_documents,
            "retrieved_documents": retrieved_documents,
        },
        "doc_rec_evaluator": {
            "ground_truth_documents": grund_truth_documents,
            "retrieved_documents": retrieved_documents,
        },

    }
)

In [35]:
results["doc_rec_evaluator"]["score"], results["doc_mrr_evaluator"]["score"]

(0.9515885022692889, 0.8626122037317197)

In [32]:
from haystack.components.builders import PromptBuilder
from haystack_integrations.components.generators.google_genai import GoogleGenAIChatGenerator
from haystack.components.builders.answer_builder import AnswerBuilder
from haystack import Pipeline, Document

In [33]:
from dotenv import load_dotenv
import os

load_dotenv()

True

In [34]:
GOOGLE_API_KEY = os.environ["GOOGLE_API_KEY"]

In [35]:
template = """
You're a FAQ database assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

Context:
{% for document in documents %}
    {{ document.content }}
{% endfor %}

Question: {{question}}
Answer:
"""

prompt_builder = PromptBuilder(template=template, required_variables=[
                               "question", "documents"])

In [36]:
generator = GoogleGenAIChatGenerator(model="gemini-2.5-flash")

In [37]:
from haystack_integrations.components.retrievers.qdrant import QdrantHybridRetriever
from haystack_integrations.components.embedders.fastembed import FastembedTextEmbedder, FastembedSparseTextEmbedder
from haystack import Pipeline, Document
from haystack_integrations.components.retrievers.qdrant import QdrantHybridRetriever
from haystack_integrations.components.embedders.fastembed import FastembedTextEmbedder, FastembedSparseTextEmbedder
from haystack_integrations.components.rankers.fastembed import FastembedRanker

hybrid_query = Pipeline()
hybrid_query.add_component("sparse_text_embedder",FastembedSparseTextEmbedder(model="Qdrant/bm25"))
hybrid_query.add_component("dense_text_embedder", FastembedTextEmbedder(model="jinaai/jina-embeddings-v2-small-en"))
hybrid_query.add_component("retriever", QdrantHybridRetriever(document_store=document_store, top_k=5))
hybrid_query.add_component("ranker", FastembedRanker(top_k=5))
hybrid_query.add_component("prompt_builder", prompt_builder)
hybrid_query.add_component("llm", generator)
hybrid_query.add_component(instance=AnswerBuilder(), name="answer_builder")
hybrid_query.connect("sparse_text_embedder.sparse_embedding","retriever.query_sparse_embedding")
hybrid_query.connect("dense_text_embedder.embedding","retriever.query_embedding")
hybrid_query.connect("retriever.documents", "ranker.documents")
hybrid_query.connect("ranker.documents", "prompt_builder")
hybrid_query.connect("prompt_builder", "llm")
hybrid_query.connect("llm.replies", "answer_builder.replies")
hybrid_query.connect("retriever", "answer_builder.documents")


<haystack.core.pipeline.pipeline.Pipeline object at 0x7f9ed3c94510>
üöÖ Components
  - sparse_text_embedder: FastembedSparseTextEmbedder
  - dense_text_embedder: FastembedTextEmbedder
  - retriever: QdrantHybridRetriever
  - ranker: FastembedRanker
  - prompt_builder: PromptBuilder
  - llm: GoogleGenAIChatGenerator
  - answer_builder: AnswerBuilder
üõ§Ô∏è Connections
  - sparse_text_embedder.sparse_embedding -> retriever.query_sparse_embedding (SparseEmbedding)
  - dense_text_embedder.embedding -> retriever.query_embedding (list[float])
  - retriever.documents -> ranker.documents (list[Document])
  - retriever.documents -> answer_builder.documents (list[Document])
  - ranker.documents -> prompt_builder.documents (list[Document])
  - prompt_builder.prompt -> llm.messages (str)
  - llm.replies -> answer_builder.replies (list[ChatMessage])

In [38]:
#rag22 = Pipeline()
#rag22.add_component("sparse_text_embedder",FastembedSparseTextEmbedder(model="Qdrant/bm25"))
#rag22.add_component("dense_text_embedder", FastembedTextEmbedder(model="jinaai/jina-embeddings-v2-small-en"))
#rag22.add_component("retriever", QdrantHybridRetriever(document_store=document_store, top_k=5))
#rag22.add_component("ranker", FastembedRanker(top_k=5))
#rag22.add_component("prompt_builder", prompt_builder)
#rag22.add_component("llm", generator)
#rag22.add_component(instance=AnswerBuilder(), name="answer_builder")



#rag22.connect("sparse_text_embedder.sparse_embedding","retriever.query_sparse_embedding")
#rag22.connect("dense_text_embedder.embedding", "retriever.query_embedding")
#rag22.connect("retriever.documents", "ranker.documents")
#rag22.connect("ranker.documents", "prompt_builder.documents")
#rag22.connect("prompt_builder", "llm")
#rag22.connect("llm.replies", "answer_builder.replies")
#rag22.connect("retriever", "answer_builder.documents")

In [43]:


question = "How is German identity defined apart from legal citizenship requirements? and how many context provided?"

response = hybrid_query.run(
    {
        "sparse_text_embedder": {"text": question},
        "dense_text_embedder": {"text": question},
        "ranker": {"query": question},
        "prompt_builder": {"question": question},
        "answer_builder": {"query": question},
    }
)
response["answer_builder"]["answers"][0].data

Calculating embeddings: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 50.92it/s]
Calculating sparse embeddings: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 632.53it/s]


'German identity, beyond legal citizenship, is defined as strongly cultural rather than ethnic. To be culturally German, one essentially has to grow up in Germany, or, if they moved there as an adult, have lived there for several decades at least. Appearance or ethnicity is generally not relevant to this definition, with examples like Philipp R√∂sler, Cem √ñzdemir, and J√©r√¥me Boateng being considered 100% German.\n\nThere are 5 contexts provided.'

In [37]:
import pandas as pd

In [38]:
df_ground_truth = pd.read_csv("gp4o-mini-questions.csv")
ground_truth = df_ground_truth.to_dict(orient="records")

In [39]:
combined_list = list(map(lambda gt: {**gt, **next((ds for ds in ds_gpt if ds['id'] == gt['content']), {

    "test": gt["content"]
})}, ground_truth))


combined_list = [
    {**gt, **next((ds for ds in ds_gpt if ds['id'] == gt['content']), {})}
    for gt in ground_truth
]


new_ds = [
    {
        'question': gt['question'],
        'id': gt['content'],
        'content': ds['content']
    }
    for gt in ground_truth
    for ds in ds_gpt
    if gt['content'] == ds['id']
]

In [40]:
for gt in ground_truth[:6]:
    print(gt["question"])
    print(gt["content"])

What is the total length of the Autobahn network in Germany?
9d8370cf-a2c8-4c54-9f9c-476b9c09a933
How many deaths occurred on the Autobahn per billion miles in 2015?
9d8370cf-a2c8-4c54-9f9c-476b9c09a933
What were the safety statistics for US interstate highways in comparison to the Autobahn?
9d8370cf-a2c8-4c54-9f9c-476b9c09a933
How many miles did vehicles travel on US interstate highways in 2015?
9d8370cf-a2c8-4c54-9f9c-476b9c09a933
What percentage of the Autobahn has no speed limit?
9d8370cf-a2c8-4c54-9f9c-476b9c09a933
What are the primary methods for obtaining German citizenship if you have ancestors from Germany?
6ef3b8e4-f20b-4893-bf9e-f58f800afb82


In [41]:
new_ds[:6]

[{'question': 'What is the total length of the Autobahn network in Germany?',
  'id': '9d8370cf-a2c8-4c54-9f9c-476b9c09a933',
  'content': 'The Autobahn is a [network of interstate highways in Germany](https://en.m.wikipedia.org/wiki/Autobahn#/media/File%3AAutobahnen_in_Deutschland.svg) with a total length of more than 8,000 miles. [65%](https://en.wikipedia.org/wiki/Autobahn#Speed_limits) of the Autobahn has no speed limit. How safe can that be?\nVehicles traveled 147 billion miles on the Autobahn in 2015. 322 people died = 2.19 deaths per billion miles.\nIn the US, vehicles travelled 757 billion miles on interstate highways. 3,837 people died = 5.07 deaths per billion miles.\nThat means: If you drive on the interstate, your likelihood to die is 131% higher than for the same distance on the Autobahn.\n*sources:*\nStatistisches Bundesamt: [Unfallentwicklung auf deutschen Stra√üen 2015](https://www.destatis.de/DE/PresseService/Presse/Pressekonferenzen/2016/Unfallentwicklung_2015/Presseb

In [42]:
from tqdm.auto import tqdm

In [49]:
contexts = []
responses = []
questions = []
for run in tqdm(new_ds):
  question = run["question"]
  context = run["content"]
  questions.append(question)
  contexts.append(context)
  response = rag.run(
  {
  "sparse_text_embedder": {"text": question},
  "dense_text_embedder": {"text": question},
  "prompt_builder": {"question": question},
  "answer_builder": {"query": question},
  }
  )
  responses.append(response["answer_builder"]["answers"][0].data)

  0%|          | 0/3305 [00:00<?, ?it/s]

Calculating embeddings: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 56.22it/s]
Calculating sparse embeddings: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 3469.23it/s]
Calculating embeddings: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 69.11it/s]
Calculating sparse embeddings: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 6482.70it/s]
Calculating embeddings: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 67.63it/s]
Calculating sparse embeddings: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 6403.52it/s]
Calculating embeddings: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 60.47it/s]
Calculating sparse embeddings: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 3063.77it/s]
Calculating embeddings: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 41.21it/s]
Calculating sparse embeddings: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 2683.50it/s]
Calculating embeddings: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñ

In [50]:
df = pd.DataFrame()

In [52]:
df["contexts"] = contexts
df["responses"] = responses
df["questions"] = questions

In [53]:
df.head()

Unnamed: 0,contexts,responses,questions
0,The Autobahn is a [network of interstate highw...,The total length of the Autobahn network in Ge...,What is the total length of the Autobahn netwo...
1,The Autobahn is a [network of interstate highw...,"In 2015, there were 2.19 deaths per billion mi...",How many deaths occurred on the Autobahn per b...
2,The Autobahn is a [network of interstate highw...,"In 2015, vehicles traveled 147 billion miles o...",What were the safety statistics for US interst...
3,The Autobahn is a [network of interstate highw...,"In the US, vehicles traveled 757 billion miles...",How many miles did vehicles travel on US inter...
4,The Autobahn is a [network of interstate highw...,65% of the Autobahn has no speed limit.,What percentage of the Autobahn has no speed l...


In [58]:
df.to_csv("./finals.csv", header=["context", "responses", "questions"], index=False)

In [59]:
test = pd.read_csv("./finals.csv")
test.head()

Unnamed: 0,context,responses,questions
0,The Autobahn is a [network of interstate highw...,The total length of the Autobahn network in Ge...,What is the total length of the Autobahn netwo...
1,The Autobahn is a [network of interstate highw...,"In 2015, there were 2.19 deaths per billion mi...",How many deaths occurred on the Autobahn per b...
2,The Autobahn is a [network of interstate highw...,"In 2015, vehicles traveled 147 billion miles o...",What were the safety statistics for US interst...
3,The Autobahn is a [network of interstate highw...,"In the US, vehicles traveled 757 billion miles...",How many miles did vehicles travel on US inter...
4,The Autobahn is a [network of interstate highw...,65% of the Autobahn has no speed limit.,What percentage of the Autobahn has no speed l...
