Based on https://haystack.deepset.ai/tutorials/01_basic_qa_pipeline

# Set up Document Store

In [1]:
from haystack.document_stores import WeaviateDocumentStore

# Connect to the Weaviate instance at http://weaviate, using the "document"
# index and dot-product similarity for vector comparison.
document_store = WeaviateDocumentStore(
    similarity="dot_product",
    index="document",
    host="http://weaviate",
)


# Preprocess Documents

In [2]:
from haystack.utils import (
    clean_wiki_text,
    convert_files_to_docs,
    fetch_archive_from_http,
)


# Demo corpus: a zip archive of Game of Thrones Wikipedia articles
# (517 articles, per the original tutorial), unpacked into a local folder.
doc_dir = "data/tutorial1"
s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt1.zip"
fetch_archive_from_http(output_dir=doc_dir, url=s3_url)

# Convert the downloaded files into documents, cleaning each one with
# clean_wiki_text and splitting it into paragraphs.
docs = convert_files_to_docs(
    split_paragraphs=True,
    clean_func=clean_wiki_text,
    dir_path=doc_dir,
)

# Write the resulting documents into the document store.
document_store.write_documents(docs)




  0%|          | 0/2497 [00:00<?, ?it/s]



# Initialize Retriever, Reader and Pipeline

Retriever:

In [3]:
from haystack.nodes import EmbeddingRetriever

# Embedding-based retriever backed by the document store, using a
# sentence-transformers model for the embeddings.
retriever = EmbeddingRetriever(
    model_format="sentence_transformers",
    embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1",
    document_store=document_store,
)

# Compute embeddings for the stored documents with this retriever.
document_store.update_embeddings(retriever)




Batches:   0%|          | 0/74 [00:00<?, ?it/s]

Reader:

In [4]:
from haystack.nodes import TransformersReader

# Extractive reader built on a DistilBERT model distilled on SQuAD.
#
# `use_gpu` is a boolean in Haystack 1.x. The previous value `-1` came from
# the older integer device convention (negative meant CPU), but as a boolean
# it is truthy and would select a GPU whenever one is available. `False`
# states the CPU-only intent explicitly.
reader = TransformersReader(
    model_name_or_path="distilbert-base-uncased-distilled-squad",
    tokenizer="distilbert-base-uncased",
    use_gpu=False,
)


Pipeline:

In [5]:
from haystack.pipelines import ExtractiveQAPipeline

# Combine the retriever and the reader into a single extractive QA pipeline.
pipe = ExtractiveQAPipeline(reader=reader, retriever=retriever)


# Ask a question

In [6]:
from haystack.utils import print_answers

# Run the question through the pipeline: the retriever is asked for its
# top 10 results and the reader for its top 5.
prediction = pipe.run(
    params={
        "Retriever": {"top_k": 10},
        "Reader": {"top_k": 5},
    },
    query="Who is the father of Arya Stark?",
)

# Show the answers at the "minimum" detail level.
print_answers(prediction, details="minimum")


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Query: Who is the father of Arya Stark?
Answers:
[   {   'answer': 'Maisie Williams',
        'context': 'f sword fighting by Syrio Forel.\n'
                   'Arya is portrayed by English actress Maisie Williams in '
                   "HBO's Emmy-winning television adaptation of the novel "
                   "series, ''Ga"},
    {   'answer': 'Eddard and Catelyn Stark',
        'context': '\n'
                   '=== Robb Stark ===\n'
                   'Robb Stark is the oldest child of Eddard and Catelyn '
                   'Stark, and the heir to Winterfell. He is not a POV '
                   'character, but features '},
    {   'answer': 'Eddard and Catelyn Stark',
        'context': ' Arya Stark ===\n'
                   'Arya Stark is the third child and younger daughter of '
                   'Eddard and Catelyn Stark. She serves as a POV character '
                   "for 33 chapters throughout ''A Game of"},
    {   'answer': 'Eddard and Catelyn Stark',
        'con