## 1. Prepare the data

In [2]:
# Get latest Wikipedia English dump (this will take more than 4 hours).
# The dump is saved into data/, which is where the extraction step below reads it from.
# ! wget -P data/ "http://download.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2"

In [3]:
# Extract text using WikiExtractor (this will take about 3 hours)
# ! python -m wikiextractor.WikiExtractor -o "data/wikipedia/" --json \
# --filter_disambig_page \
# --processes 8 \
# "data/enwiki-latest-pages-articles.xml.bz2"

## 2. Index Wikipedia

In [1]:
# Haystack components used by this notebook.
from haystack import Finder
from haystack.indexing.cleaning import clean_wiki_text
from haystack.indexing.utils import convert_files_to_dicts, fetch_archive_from_http
from haystack.reader.farm import FARMReader
from haystack.reader.transformers import TransformersReader
from haystack.utils import print_answers
from haystack.database.elasticsearch import ElasticsearchDocumentStore

# Connection settings of the Elasticsearch server and the name of the index
# that will hold the Wikipedia articles.
HOST = 'localhost'
PORT = 9200
INDEX_NAME = 'wikipedia_en'

# Document store handle used by the following cells to manage the index and
# to write documents into it.
document_store = ElasticsearchDocumentStore(
    host=HOST,
    port=PORT,
    username="",
    password="",
    index=INDEX_NAME,
)

09/05/2020 10:27:07 - INFO - elasticsearch -   PUT http://localhost:9200/wikipedia_en [status:400 request:0.003s]


In [None]:
# Clear existing index (optional). Skip this cell to keep the documents that
# are already indexed.
if document_store.client.indices.exists(index=document_store.index):
    print('clear existing inddex')
    document_store.client.indices.delete(index=document_store.index)
    # Re-create the document store after the delete: building the store issues
    # the index-creation request (the PUT logged when it was first built), so a
    # fresh instance hands the indexing cell a newly created, empty index
    # instead of relying on Elasticsearch auto-creating one on the first write.
    # NOTE(review): assumes the constructor also applies Haystack's index
    # mapping — confirm against the installed Haystack version.
    document_store = ElasticsearchDocumentStore(host=HOST, port=PORT, username="", password="", index=INDEX_NAME)

In [None]:
# Get all dirs in wikipedia folder
from os import listdir
from os.path import isfile, join
import json
from tqdm import tqdm

# Root folder that is scanned for sub-folders of extracted article files.
# NOTE(review): the extraction command writes to "data/wikipedia/" — make sure
# this relative path resolves to the same folder from where the notebook runs.
wikidata_path = "../data/wikipedia"
onlydirs = [f for f in listdir(wikidata_path) if not isfile(join(wikidata_path, f))]

dicts = []         # documents collected for the next bulk write
bulk_size = 5000   # number of documents sent per write_documents() call
skipped_docs = 0   # documents dropped because their bulk could not be written

pbar = tqdm(onlydirs)
for directory in pbar:
    dir_path = join(wikidata_path, directory)
    # Keep regular files only. The check has to look at each entry of the
    # folder; testing the folder itself would let every entry through.
    article_files = [f for f in listdir(dir_path) if isfile(join(dir_path, f))]
    pbar.set_description(f"Processing wikipedia folder {directory}")

    for file in article_files:
        # Each text file contains json structures separated by EOL. The context
        # manager closes the handle even if parsing fails, and the explicit
        # encoding keeps the result independent of the platform default.
        with open(join(dir_path, file), "r", encoding="utf-8") as f:
            for line in f:
                article = line.strip()
                if not article:
                    continue

                # Article in json format
                json_formatted_article = json.loads(article)

                # Rename keys
                document = {"id": json_formatted_article["id"],
                            "name": json_formatted_article["title"],
                            "url": json_formatted_article["url"],
                            "text": json_formatted_article["text"]}

                # Add document to bulk
                dicts.append(document)

                if len(dicts) >= bulk_size:
                    # Index bulk (best effort: a failing bulk is reported and
                    # skipped so the remaining articles are still indexed).
                    try:
                        document_store.write_documents(dicts)
                    except Exception as exc:
                        skipped_docs += len(dicts)
                        print(f"Bulk not indexed ({len(dicts)} documents skipped): {exc!r}")
                    finally:
                        # Always start a new bulk. Keeping a failed bulk would
                        # let it grow without limit and re-send it after every
                        # single appended document.
                        dicts.clear()


# Write the documents left over after the last full bulk.
if len(dicts) > 0:
    print('final round')
    document_store.write_documents(dicts)

if skipped_docs > 0:
    print(f"{skipped_docs} documents were skipped because their bulk failed")

print('finished')

## 3. QA on Wikipedia

In [2]:
# Haystack components needed for question answering.
from haystack import Finder
from haystack.reader.transformers import TransformersReader
from haystack.utils import print_answers
from haystack.database.elasticsearch import ElasticsearchDocumentStore

# Connection settings of the Elasticsearch server and the name of the index
# that is queried.
HOST = 'localhost'
PORT = 9200
INDEX_NAME = 'wikipedia_en'

# Document store handle that the retriever below is built on.
document_store = ElasticsearchDocumentStore(
    host=HOST,
    port=PORT,
    username="",
    password="",
    index=INDEX_NAME,
)

09/05/2020 10:27:35 - INFO - elasticsearch -   PUT http://localhost:9200/wikipedia_en [status:400 request:0.003s]


In [3]:
from haystack.retriever.sparse import ElasticsearchRetriever
# Retriever that works on top of the Elasticsearch document store.
retriever = ElasticsearchRetriever(document_store=document_store)

# Local folder of the reader model; the same folder is used for the model and
# its tokenizer. Alternative model folder that can be swapped in:
#   "../models/roberta-base-squad2"
READER_DiR = "../models/electra-base-squad2"
# NOTE(review): use_gpu=0 is passed through unchanged — presumably a device
# index rather than an on/off flag; confirm against the TransformersReader docs.
reader = TransformersReader(model=READER_DiR, tokenizer=READER_DiR,  use_gpu=0)

In [6]:
import logging
# Silence every log record of severity WARNING and below for the rest of the
# session, so the answer output is not buried in library messages.
logging.disable(logging.WARNING)

# The finder combines the reader and the retriever created above.
finder = Finder(reader, retriever)

# Ask one question; the two top_k arguments are handed to get_answers() as-is.
prediction = finder.get_answers(
    question="who is the father of Arya Stark",
    top_k_retriever=10,
    top_k_reader=3,
)

In [7]:
# Pretty-print the prediction returned by finder.get_answers() in the previous cell.
print_answers(prediction)

{   'answers': [   {   'answer': 'Robert Baratheon.',
                       'context': ' her he is the bastard son of Robert '
                                  'Baratheon. Aware of their chances of dyi',
                       'document_id': 'ErxhUHQBdihk5qAljEng',
                       'meta': {   'id': '41374178',
                                   'name': 'Arya Stark',
                                   'url': 'https://en.wikipedia.org/wiki?curid=41374178'},
                       'offset_end': 20868,
                       'offset_start': 20851,
                       'probability': 0.9908403659626074,
                       'score': None},
                   {   'answer': 'Ned',
                       'context': '.\n'
                                  'Arya accompanies her father Ned and her '
                                  "sister Sansa to King'",
                       'document_id': 'ErxhUHQBdihk5qAljEng',
                       'meta': {   'id': '41374178',
             