In [None]:
!pip install --upgrade pip
!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab,faiss,ocr]


In [None]:
!cp ../input/pgt-qa-eval/* .
!mkdir pgt
!mv pgthb.txt pgt
!rm PGT.db

In [None]:
from haystack.nodes import PreProcessor
from haystack.utils import convert_files_to_docs

In [None]:
# Load every file found under the "pgt" directory as Haystack documents.
pgt_doc_txt = convert_files_to_docs(
    dir_path="pgt",
)

In [None]:
# Sanity check: list the working directory to confirm the staged files are present.
!ls

In [None]:
# Preprocessor shared by the corpus and the evaluation dataset: it only splits
# the text into 100-word chunks and performs no cleaning at all.
eval_preprocessor = PreProcessor(
    # --- splitting ---
    split_by="word",
    split_length=100,
    split_respect_sentence_boundary=False,
    # --- cleaning (all switched off) ---
    clean_empty_lines=False,
    clean_whitespace=False,
    clean_header_footer=False,
)

In [None]:
# Run the loaded handbook documents through the evaluation preprocessor
# (100-word splits, no cleaning) to obtain the passages that will be indexed.
eval_docs = eval_preprocessor.process(pgt_doc_txt)


In [None]:
from haystack.utils import clean_wiki_text, convert_files_to_docs, fetch_archive_from_http, print_answers
from haystack.nodes import FARMReader, TransformersReader
from haystack.document_stores import FAISSDocumentStore

In [None]:
!rm PGT.db
pgt_document_store = FAISSDocumentStore(embedding_dim=768, faiss_index_factory_str="Flat", sql_url="sqlite:///PGT.db")


In [None]:
# Alias: the remaining cells refer to the store by the generic name `document_store`.
document_store = pgt_document_store

In [None]:
# Store the preprocessed handbook passages in the document store.
document_store.write_documents(eval_docs)

In [None]:
from haystack.nodes import DensePassageRetriever

# Dense retriever over the document store, using separate encoders for
# queries and for passages.
# NOTE(review): these are the sentence-transformers ports of the DPR multiset
# encoders rather than the "facebook/dpr-*" checkpoints - confirm that
# DensePassageRetriever loads them as intended.
retriever = DensePassageRetriever(
    query_embedding_model="sentence-transformers/facebook-dpr-question_encoder-multiset-base",
    passage_embedding_model="sentence-transformers/facebook-dpr-ctx_encoder-multiset-base",
    document_store=document_store,
)

In [None]:
# Have the retriever embed the stored documents so they can be searched by vector similarity.
document_store.update_embeddings(retriever)

In [None]:
# Extractive reader: a RoBERTa-base model fine-tuned on SQuAD 2.0, configured
# with a context window size of 500.
reader = FARMReader(
    model_name_or_path="deepset/roberta-base-squad2",
    context_window_size=500,
)

In [None]:
from haystack.pipelines import DocumentSearchPipeline, ExtractiveQAPipeline

In [None]:
# Retriever -> Reader question-answering pipeline evaluated in the cells below.
pipe = ExtractiveQAPipeline(reader=reader, retriever=retriever)

In [None]:
# Load the evaluation dataset from eval_ds.json into the document store, passing
# the same preprocessor that was used for the corpus.
document_store.add_eval_data(filename="eval_ds.json", preprocessor=eval_preprocessor)

In [None]:
from haystack.schema import EvaluationResult, MultiLabel


In [None]:
# Fetch the aggregated evaluation labels, with negative labels and no-answer
# labels dropped as requested by the two flags.
eval_labels = document_store.get_all_labels_aggregated(
    drop_negative_labels=True,
    drop_no_answers=True,
)

# Baseline evaluation run of the whole pipeline with Retriever top_k=10.
eval_result = pipe.eval(
    labels=eval_labels,
    params={"Retriever": {"top_k": 10}},
)

In [None]:
%%time
# Pull the per-node results out of the baseline evaluation, keyed by node name.
retriever_result = eval_result["Retriever"]
reader_result = eval_result["Reader"]


In [None]:
%%time
eval_result_with_upper_bounds = pipe.eval(
    labels=eval_labels, params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 4}}, add_isolated_node_eval=True
)

In [None]:
# Print the evaluation report for the run that includes the isolated node evaluation.
pipe.print_eval_report(eval_result_with_upper_bounds)

In [None]:
# Evaluation run with Retriever top_k=5 that additionally passes a
# cross-encoder model for the "sas" metric.
advanced_eval_result = pipe.eval(
    labels=eval_labels,
    params={"Retriever": {"top_k": 5}},
    sas_model_name_or_path="cross-encoder/stsb-roberta-large",
)

# Show the Reader's "sas" score from this run.
metrics = advanced_eval_result.calculate_metrics()
print(metrics["Reader"]["sas"])

In [None]:
# Persist the upper-bound evaluation result under a name describing the setup
# (DPR retriever, FAISS store, RoBERTa SQuAD2 reader).
eval_result_with_upper_bounds.save("DPR-SENTENCEMULTIENCODER-FAISS-RoBERTaSQUAD2")

In [None]:
# Bundle the saved evaluation result into a single zip archive.
!zip -r DPR-SENTENCEMULTIENCODER-FAISS-RoBERTaSQUAD2.zip DPR-SENTENCEMULTIENCODER-FAISS-RoBERTaSQUAD2


In [None]:
%%time
# Print the evaluation report for the baseline run (Retriever top_k=10).
pipe.print_eval_report(eval_result)

In [None]:
# Print the headline metrics of the baseline run, one line per metric.
metrics = eval_result.calculate_metrics()

# (node, human-readable caption, key in the metrics dict), in display order.
report_rows = [
    ("Retriever", "Recall (single relevant document)", "recall_single_hit"),
    ("Retriever", "Recall (multiple relevant documents)", "recall_multi_hit"),
    ("Retriever", "Mean Reciprocal Rank", "mrr"),
    ("Retriever", "Precision", "precision"),
    ("Retriever", "Mean Average Precision", "map"),
    ("Reader", "F1-Score", "f1"),
    ("Reader", "Exact Match", "exact_match"),
]
for node_name, caption, metric_key in report_rows:
    print(f"{node_name} - {caption}: {metrics[node_name][metric_key]}")

In [None]:
# Evaluate the Reader on its own, directly on the SQuAD-formatted file
# eval_ds.json in the current directory (no document store involved).
reader_eval_results = reader.eval_on_file("./", "eval_ds.json")
top_n = reader_eval_results["top_n"]

# (caption, result key) pairs, printed in this order.
reader_report_rows = [
    # Proportion of predicted answers that match their correct answer, including no_answers.
    (f"Reader Top-{top_n}-Accuracy", "top_n_accuracy"),
    # Proportion of questions whose first predicted answer equals the correct answer, including no_answers.
    ("Reader Top-1-Exact Match", "EM"),
    # Average overlap between the first predicted answers and the correct answers, including no_answers.
    ("Reader Top-1-F1-Score", "f1"),
    # Proportion of predicted answers that match their correct answer, excluding no_answers.
    (f"Reader Top-{top_n}-Accuracy (without no_answers)", "top_n_accuracy_text_answer"),
    # Proportion of questions with an exactly matching answer within the first n results,
    # excluding no_answers (no_answers are always present within top n).
    (f"Reader Top-{top_n}-Exact Match (without no_answers)", "top_n_EM_text_answer"),
    # Average overlap between the top n predicted answers and the correct answers,
    # excluding no_answers (no_answers are always present within top n).
    (f"Reader Top-{top_n}-F1-Score (without no_answers)", "top_n_f1_text_answer"),
]
for caption, result_key in reader_report_rows:
    print(f"{caption}:", reader_eval_results[result_key])

In [None]:
# Reader scores from the isolated evaluation mode of the upper-bound run:
# exact match first, then F1.
metrics = eval_result_with_upper_bounds.calculate_metrics(eval_mode="isolated")
for metric_key in ("exact_match", "f1"):
    print(metrics["Reader"][metric_key])

In [None]:
%%time
prediction = pipe.run(query="What happens if I want to resit an exam?", params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 4}})