In [None]:
%%bash
# Stop at the first failing command. Without this the cell's exit status is that
# of the last command only, so a failed Haystack install would be hidden by a
# successful Pillow reinstall.
set -e

pip install --upgrade pip
# Quote the requirement so the shell never treats the [colab] extra as a glob pattern.
pip install "farm-haystack[colab]"
# NOTE(review): presumably works around a Pillow version conflict in the hosted
# runtime -- confirm it is still needed. Versions are unpinned; consider pinning.
pip install --upgrade --force-reinstall Pillow

In [None]:
import logging

# Root logging: only warnings and above, rendered as "LEVEL - logger -  message".
log_format = "%(levelname)s - %(name)s -  %(message)s"
logging.basicConfig(level=logging.WARNING, format=log_format)

# The "haystack" logger is more verbose so its INFO progress messages are shown.
haystack_logger = logging.getLogger("haystack")
haystack_logger.setLevel(logging.INFO)

In [None]:
from haystack.document_stores import InMemoryDocumentStore

# Document store used by the rest of the notebook. It is created with
# `use_bm25=True` because a BM25Retriever is later built on top of it.
document_store = InMemoryDocumentStore(use_bm25=True)

INFO:haystack.telemetry:Haystack sends anonymous usage data to understand the actual usage and steer dev efforts towards features that are most meaningful to users. You can opt-out at anytime by manually setting the environment variable HAYSTACK_TELEMETRY_ENABLED as described for different operating systems in the [documentation page](https://docs.haystack.deepset.ai/docs/telemetry#how-can-i-opt-out). More information at [Telemetry](https://docs.haystack.deepset.ai/docs/telemetry).
INFO:haystack.modeling.utils:Using devices: CPU - Number of GPUs: 0


In [None]:
from haystack.utils import fetch_archive_from_http

# Local folder that receives the archive contents; the indexing cell lists it.
doc_dir = "data/build_your_first_question_answering_system"
archive_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt1.zip"

# Fetch the Game of Thrones text archive into `doc_dir`.
# The call is the last expression, so its return value is shown as the cell output.
fetch_archive_from_http(url=archive_url, output_dir=doc_dir)

INFO:haystack.utils.import_utils:Fetching from https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt1.zip to 'data/build_your_first_question_answering_system'


True

In [None]:
import os
from haystack.pipelines.standard_pipelines import TextIndexingPipeline

# Collect every regular file in the download directory.
# - os.path.join builds the paths instead of manual "/" concatenation.
# - The isfile filter skips sub-directories, which cannot be indexed as text files.
# - sorted() makes the indexing order reproducible; os.listdir returns entries
#   in arbitrary order.
files_to_index = sorted(
    path
    for path in (os.path.join(doc_dir, name) for name in os.listdir(doc_dir))
    if os.path.isfile(path)
)

# Index the files into `document_store` (per the progress output: convert,
# preprocess, then update the BM25 representation).
indexing_pipeline = TextIndexingPipeline(document_store)

# The trailing semicolon suppresses the display of the return value, which would
# otherwise dump every indexed Document into the cell output.
indexing_pipeline.run_batch(file_paths=files_to_index);

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
INFO:haystack.pipelines.base:It seems that an indexing Pipeline is run, so using the nodes' run method instead of run_batch.
Converting files: 100%|██████████| 183/183 [00:02<00:00, 91.08it/s] 
Preprocessing: 100%|██████████| 183/183 [00:06<00:00, 27.40docs/s]
Updating BM25 representation...: 100%|██████████| 2359/2359 [00:00<00:00, 11609.93 docs/s]


{'documents': [<Document: {'content': '\n\n"\'\'\'Fire and Blood\'\'\'" is the tenth and final episode of the first season of the HBO medieval fantasy television series \'\'Game of Thrones\'\'. First aired on June 19, 2011, it was written by the show\'s creators and executive producers David Benioff and D. B. Weiss, and directed by Alan Taylor.\n\nThe title of the episode is the motto of House Targaryen, and alludes to the aftermath of the previous episode\'s climactic events. The episode\'s action revolves around the Starks\' reactions to Eddard Stark\'s execution: Sansa is taken hostage, Arya flees in disguise, Robb and Catelyn lead an army against the Lannisters, and Jon Snow struggles with his divided loyalty. Across the narrow sea, Daenerys must deal with the blood magic that has robbed her of her husband, her son, and her army.\n\nThe episode was well received by critics, who singled out the closing scene as a particularly strong way to end the first season. In the United States,

In [None]:
from haystack.nodes import BM25Retriever

# Retriever bound to `document_store`, which was created with `use_bm25=True`.
retriever = BM25Retriever(document_store=document_store)

In [None]:
from haystack.nodes import FARMReader

# Reader backed by the "deepset/roberta-base-squad2" model; the recorded log
# shows it being loaded from the model hub.
# NOTE(review): `use_gpu=True` is passed, yet the recorded log reports
# "Using devices: CPU - Number of GPUs: 0" -- presumably it falls back to CPU
# when no GPU is available; confirm, and expect slow inference on CPU.
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True)

INFO:haystack.modeling.utils:Using devices: CPU - Number of GPUs: 0
INFO:haystack.modeling.utils:Using devices: CPU - Number of GPUs: 0
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

INFO:haystack.modeling.model.language_model: * LOADING MODEL: 'deepset/roberta-base-squad2' (Roberta)


model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

INFO:haystack.modeling.model.language_model:Auto-detected model language: english
INFO:haystack.modeling.model.language_model:Loaded 'deepset/roberta-base-squad2' (Roberta model) from model hub.


tokenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

INFO:haystack.modeling.utils:Using devices: CPU - Number of GPUs: 0


In [None]:
from haystack.pipelines import ExtractiveQAPipeline

# Build the question-answering pipeline from the reader and the retriever.
# The query cell addresses its nodes by the names "Retriever" and "Reader".
pipe = ExtractiveQAPipeline(reader, retriever)

In [None]:
# Per-node settings, keyed by node name: `top_k` of 10 for the retriever and
# `top_k` of 5 for the reader.
node_params = {"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}}

# Ask the pipeline a question; the result is inspected in the following cells.
prediction = pipe.run(query="Who is Littlefinger?", params=node_params)

Inferencing Samples: 100%|██████████| 1/1 [00:16<00:00, 16.80s/ Batches]


In [None]:
from pprint import pprint

# Pretty-print the raw prediction. Per the recorded output it is a dict whose
# 'answers' entry lists Answer objects with the answer text, score, context,
# offsets, and source document ids.
pprint(prediction)

{'answers': [<Answer {'answer': 'Lord Petyr "Littlefinger" Baelish', 'type': 'extractive', 'score': 0.836114764213562, 'context': 'brother Lord Renly, Lord Varys, Grand Maester Pycelle, and Lord Petyr "Littlefinger" Baelish. Renly announces Robert\'s plans for a great tourney in ho', 'offsets_in_document': [{'start': 617, 'end': 650}], 'offsets_in_context': [{'start': 59, 'end': 92}], 'document_ids': ['2144153690395772a8173ceb3e2df787'], 'meta': {'_split_id': 2}}>,
             <Answer {'answer': 'the only person Littlefinger serves is Littlefinger', 'type': 'extractive', 'score': 0.43817535042762756, 'context': " are skeptical of this, with Sansa Stark stating 'the only person Littlefinger serves is Littlefinger.'His methods are totally unscrupulous, including", 'offsets_in_document': [{'start': 695, 'end': 746}], 'offsets_in_context': [{'start': 50, 'end': 101}], 'document_ids': ['2da0ecd72ba0e00c3c43e9043ced2bc1'], 'meta': {'_split_id': 3}}>,
             <Answer {'answer': 'confida

In [None]:
from haystack.utils import print_answers

# Condensed view of the same prediction.
# `details` sets the verbosity: choose from `minimum`, `medium`, and `all`.
print_answers(prediction, details="minimum")

'Query: Who is Littlefinger?'
'Answers:'
[   {   'answer': 'Lord Petyr "Littlefinger" Baelish',
        'context': 'brother Lord Renly, Lord Varys, Grand Maester Pycelle, and '
                   'Lord Petyr "Littlefinger" Baelish. Renly announces '
                   "Robert's plans for a great tourney in ho"},
    {   'answer': 'the only person Littlefinger serves is Littlefinger',
        'context': " are skeptical of this, with Sansa Stark stating 'the only "
                   "person Littlefinger serves is Littlefinger.'His methods "
                   'are totally unscrupulous, including'},
    {   'answer': 'confidante Ros',
        'context': '====Season 3====\n'
                   "After Littlefinger's confidante Ros is severely beaten by "
                   "Joffrey's guards and Littlefinger fails to intervene, "
                   'Varys takes Ros into '},
    {   'answer': "short stature and his family's lands on the smallest of the "
                  'Fingers',
        