In [None]:
# While extractive QA highlights the span of text that answers a query, generative QA can return a novel text answer that it has composed.
# In this tutorial, you will learn how to set up a generative system using the RAG model, which conditions the answer generator on a set of retrieved documents.
from typing import List

import requests
import pandas as pd
from haystack import Document
from haystack.document_stores import FAISSDocumentStore
from haystack.nodes import RAGenerator, DensePassageRetriever
from haystack.pipelines import GenerativeQAPipeline
from haystack.utils import fetch_archive_from_http
from haystack.utils import print_answers

In [None]:
# Fetch the sample dataset archive into `doc_dir`.
# NOTE(review): this is an absolute, machine-specific path; adjust it for your environment.
doc_dir = "/mnt/sda/haystack/rag_generative_qa"
s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/small_generator_dataset.csv.zip"
fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

# Create dataframe with columns "title" and "text".
# The CSV path is derived from `doc_dir` so the download location and the
# read location cannot drift apart when the directory is changed.
df = pd.read_csv(f"{doc_dir}/small_generator_dataset.csv", sep=",")
# Minimal cleaning: replace missing values with empty strings.
# Reassigning (instead of `inplace=True`) keeps the step explicit and chainable.
df = df.fillna(value="")

print(df.head())


In [None]:
# Turn every dataframe row into a Document: the "text" column becomes the
# content and the "title" column is stored in the metadata under "name"
# (an empty string is used when the title is falsy).
titles = list(df["title"].values)
texts = list(df["text"].values)
documents: List[Document] = [
    Document(content=body, meta={"name": heading or ""})
    for heading, body in zip(titles, texts)
]

In [None]:
# FAISS-backed document store using a "Flat" index.
# `return_embedding=True` makes the store hand back embeddings with the
# documents, so the generator doesn't have to perform re-embedding.
document_store = FAISSDocumentStore(
    faiss_index_factory_str="Flat",
    return_embedding=True,
)

# DPR retriever: encodes the documents and the question, then queries the store.
retriever = DensePassageRetriever(
    document_store=document_store,
    query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
    passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
    use_gpu=True,
    embed_title=True,
)

# RAG generator that composes the answer text.
generator = RAGenerator(
    model_name_or_path="facebook/rag-token-nq",
    use_gpu=True,
    top_k=1,
    max_length=200,
    min_length=2,
    embed_title=True,
    num_beams=2,
)

# Populate the store. Order matters: clear what is already there, write the
# new documents, then compute their embeddings with the retriever.
document_store.delete_documents()
document_store.write_documents(documents)
document_store.update_embeddings(retriever=retriever)


In [None]:
# Sample questions that are sent through the generative QA pipeline below.
QUESTIONS = [
    "who got the first nobel prize in physics",
    "when is the next deadpool movie being released",
    "which mode is used for short wave broadcast service",
    "who is the owner of reading football club",
    "when is the next scandal episode coming out",
    "when is the last time the philadelphia won the superbowl",
    "what is the most current adobe flash player version",
    "how many episodes are there in dragon ball z",
    "what is the first step in the evolution of the eye",
    "where is gall bladder situated in human body",
    "what is the main mineral in lithium batteries",
    "who is the president of usa right now",
    "where do the greasers live in the outsiders",
    "panda is a national animal of which country",
    "what is the name of manchester united stadium",
]

# `GenerativeQAPipeline` and `print_answers` are imported in the import cell at
# the top of the notebook so that all dependencies are visible in one place.
# Combine the retriever and the generator into a single pipeline.
pipe = GenerativeQAPipeline(generator=generator, retriever=retriever)

# For each question: the retriever is asked for 5 documents (top_k=5) and the
# generator for a single answer (top_k=1); the answer is printed with minimal details.
for question in QUESTIONS:
    res = pipe.run(
        query=question,
        params={"Generator": {"top_k": 1}, "Retriever": {"top_k": 5}},
    )
    print_answers(res, details="minimum")