## 1. Download GoT data

In [2]:
# Download and unpack the Game of Thrones wiki articles, unless the data directory
# already exists. The archive is fetched *before* the directory is created, and a
# failed download/extraction removes the partial directory, so a later run retries
# instead of reporting "already exists" for data that never arrived.
# (Comments live up here because a '#' inside the continued shell line would
# swallow the remainder of the command.)
! DIR="../data/article_txt_got/"; \
if [ -d "$DIR" ]; then \
  echo "Data dir: '$DIR' already exists, skipping download"; \
elif wget -O wiki_gameofthrones_txt.zip https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip && \
  mkdir -p "$DIR" && \
  unzip -o wiki_gameofthrones_txt.zip -d "$DIR"; then \
  rm -f wiki_gameofthrones_txt.zip; \
else \
  echo "Download or extraction failed for '$DIR'; cleaning up so the next run retries" >&2; \
  rm -rf "$DIR" wiki_gameofthrones_txt.zip; \
fi

--2020-09-02 18:04:55--  https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip
Resolving s3.eu-central-1.amazonaws.com (s3.eu-central-1.amazonaws.com)... 52.219.140.7
Connecting to s3.eu-central-1.amazonaws.com (s3.eu-central-1.amazonaws.com)|52.219.140.7|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1095120 (1.0M) [application/zip]
Saving to: ‘wiki_gameofthrones_txt.zip’


2020-09-02 18:04:55 (7.02 MB/s) - ‘wiki_gameofthrones_txt.zip’ saved [1095120/1095120]

Archive:  wiki_gameofthrones_txt.zip
  inflating: ../data/article_txt_got/299_Rani_Mahal__TV_series_.txt  
  inflating: ../data/article_txt_got/133_Game_of_Thrones__Season_5__soundtrack_.txt  
  inflating: ../data/article_txt_got/135_Game_of_Thrones__Season_7__soundtrack_.txt  
  inflating: ../data/article_txt_got/26_Game_of_Thrones__Season_3__soundtrack_.txt  
  inflating: ../data/article_txt_got/399_For_the_Throne__Music_Inspired_by_the_HBO_Series_Game

## 2. Index GoT data

In [2]:
# Connection settings for the Elasticsearch instance and the name of the index
# this notebook works with.
INDEX_NAME = "game_of_thrones"
HOST = "localhost"
PORT = 9200

from haystack import Finder
from haystack.indexing.cleaning import clean_wiki_text
from haystack.indexing.utils import convert_files_to_dicts, fetch_archive_from_http
from haystack.reader.farm import FARMReader
from haystack.reader.transformers import TransformersReader
from haystack.utils import print_answers
from haystack.database.elasticsearch import ElasticsearchDocumentStore
# Document store backed by Elasticsearch at HOST:PORT, using empty credentials
# and the INDEX_NAME index.
document_store = ElasticsearchDocumentStore(
    host=HOST,
    port=PORT,
    username="",
    password="",
    index=INDEX_NAME,
)

09/18/2020 17:06:31 - INFO - elasticsearch -   PUT http://localhost:9200/game_of_thrones [status:200 request:0.519s]


In [2]:
import random

# Directory the download cell unpacks the article text files into.
doc_dir = "../data/article_txt_got"

# Convert the text files into a list of dicts, cleaning each one with
# clean_wiki_text and splitting it into paragraphs.
dicts = convert_files_to_dicts(
    dir_path=doc_dir,
    clean_func=clean_wiki_text,
    split_paragraphs=True,
)

# Fail with an actionable message instead of the bare IndexError that
# random.choice raises on an empty sequence; an empty result would also leave
# the index empty and make every later query come back without answers.
if not dicts:
    raise ValueError(
        "No documents were produced from {!r}; run the download cell first".format(doc_dir)
    )

# Quick sanity check: number of entries plus one randomly chosen entry.
print('#entries: ', len(dicts))
print('A sample: ', random.choice(dicts))

#entries:  2497
A sample:  {'text': '\n====\'\'A Storm of Swords\'\'====\nCatelyn has Jaime sent to King\'s Landing to exchange him for her captive daughters Sansa and Arya, escorted by Brienne and Jaime\'s cousin Ser Cleos Frey. They are attacked by outlaws who kill Cleos, and Jaime tries to escape in the commotion. Brienne restrains Jaime, but they are captured by the mercenary company the Brave Companions, allied to House Bolton. When their leader, Vargo Hoat, cuts off Jaime\'s hand, Brienne convinces Jaime to live to have revenge on Hoat. In return, Jaime stops the Brave Companions from raping Brienne. The two prisoners are taken to Harrenhal, where Jaime reveals that he killed the Mad King Aerys II Targaryen to stop him from burning King\'s Landing. Roose Bolton allows Jaime to return to King\'s Landing but allows Hoat to keep Brienne as his prize. Hoat attempts to rape Brienne, but she bites off his ear, and Hoat throws her into a bear pit before Jaime returns to have her release

In [3]:
# Write the converted entries in `dicts` into the document store.
# NOTE(review): in the saved outputs the index-creating PUT is logged later than
# these bulk writes, and the search further down returns 0 candidates — it looks
# like the index was re-created without re-running this cell. Confirm, and re-run
# this cell whenever the document store cell has been re-executed.
document_store.write_documents(dicts)

09/02/2020 18:40:45 - INFO - elasticsearch -   POST http://localhost:9200/_bulk [status:200 request:0.162s]
09/02/2020 18:40:45 - INFO - elasticsearch -   POST http://localhost:9200/_bulk [status:200 request:0.087s]
09/02/2020 18:40:45 - INFO - elasticsearch -   POST http://localhost:9200/_bulk [status:200 request:0.059s]
09/02/2020 18:40:45 - INFO - elasticsearch -   POST http://localhost:9200/_bulk [status:200 request:0.056s]
09/02/2020 18:40:45 - INFO - elasticsearch -   POST http://localhost:9200/_bulk [status:200 request:0.059s]


## 3. QA on GoT

In [3]:
from haystack.retriever.sparse import ElasticsearchRetriever
# Retriever that looks up candidate documents in the Elasticsearch-backed
# `document_store` created earlier in this notebook.
retriever = ElasticsearchRetriever(document_store=document_store)

In [None]:
# Download the roberta-base-squad2 model files (config, vocab, merges, weights,
# tokenizer config) into the local model directory, unless it already exists.
# If any step fails, the partially filled directory is removed so that a later
# run retries instead of reporting "already exists" for an incomplete model.
# (Comments live up here because a '#' inside the continued shell line would
# swallow the remainder of the command.)
! DIR="../models/roberta-base-squad2"; \
if [ -d "$DIR" ]; then \
  echo "Data dir: '$DIR' already exists, skipping download"; \
elif mkdir -p "$DIR" && \
  wget https://s3.amazonaws.com/models.huggingface.co/bert/deepset/roberta-base-squad2/config.json -P "$DIR" && \
  wget https://cdn.huggingface.co/deepset/roberta-base-squad2/vocab.json -P "$DIR" && \
  wget https://cdn.huggingface.co/deepset/roberta-base-squad2/merges.txt -P "$DIR" && \
  wget https://cdn.huggingface.co/deepset/roberta-base-squad2/pytorch_model.bin -P "$DIR" && \
  wget https://cdn.huggingface.co/deepset/roberta-base-squad2/tokenizer_config.json -P "$DIR"; then \
  echo "Model files downloaded to '$DIR'"; \
else \
  echo "Model download failed; removing '$DIR' so the next run retries" >&2; \
  rm -rf "$DIR"; \
fi

In [5]:
# Local directory holding the reader's model and tokenizer files. It must match
# the directory the model download cell fills (../models/roberta-base-squad2);
# the previous electra path is never downloaded anywhere in this notebook, so a
# fresh run could not load it.
READER_DiR = "../models/roberta-base-squad2"

# Model and tokenizer are both loaded from the same local directory.
# NOTE(review): use_gpu=0 presumably selects GPU device 0 rather than disabling
# the GPU — confirm against the TransformersReader documentation.
reader = TransformersReader(model=READER_DiR, tokenizer=READER_DiR, use_gpu=0)

In [6]:
# Combine the reader and the retriever into a single question-answering entry point.
finder = Finder(reader, retriever)

# Ask one question. The two top_k values presumably cap how many documents the
# retriever hands over and how many answers the reader returns — verify against
# the Finder documentation.
prediction = finder.get_answers(
    question="who is the father of Arya Stark",
    top_k_retriever=10,
    top_k_reader=3,
)

09/18/2020 17:28:29 - INFO - elasticsearch -   POST http://localhost:9200/game_of_thrones/_search [status:200 request:0.004s]
09/18/2020 17:28:29 - INFO - haystack.retriever.sparse -   Got 0 candidates from retriever
09/18/2020 17:28:29 - INFO - haystack.finder -   Retriever did not return any documents. Skipping reader ...


In [7]:
# Print the result held in `prediction` (the answers found for the question asked above).
print_answers(prediction)

{'answers': [], 'question': 'who is the father of Arya Stark'}
