## Load wiki

In [3]:
import logging
from utils.data_utils import load_corpus

# Raise the log threshold of the HTTP / Elasticsearch client loggers so that
# only WARNING and above are emitted while the notebook runs.
noisy_loggers = ("elasticsearch", "urllib3")
for logger_name in noisy_loggers:
    logging.getLogger(logger_name).setLevel(logging.WARNING)

# `load_corpus` returns a pair; only the first element (the corpus, later
# looked up by paragraph id) is kept, the second is discarded.
hotpot_corpus, _ = load_corpus(
    'data/hotpot-paragraph.tsv',
    for_hotpot=True,
    require_hyperlinks=False,
)

In [4]:
# Sanity check: report how many entries `hotpot_corpus` holds after loading.
print(len(hotpot_corpus))

5232077


## Create index

In [15]:
from elasticsearch import Elasticsearch

# Version tag shared by the target index name and the exported TSV file names.
data_version = '3.1'
# Name of the index that this notebook (re)creates and fills.
index_name = f'enwiki-20171001-paragraph-{data_version}'

# Client used by every Elasticsearch call below; 30 s timeout.
es = Elasticsearch(hosts=['10.60.0.59:9200'], timeout=30)
def _title_field_mapping():
    """Return the mapping shared by the `title` and `title_unescaped` fields.

    The field is BM25-scored text analysed with the `simple` analyzer and has
    two sub-fields: `exact` (keyword) and `bigram` (text analysed with
    `simple_bigram_analyzer`). A fresh dict is returned on every call so the
    two fields never share one object.
    """
    return {
        "type": "text",
        "similarity": "BM25",
        "analyzer": "simple",
        # Disabled: "copy_to": "full_text",
        "fields": {
            "exact": {"type": "keyword"},
            "bigram": {
                "type": "text",
                "similarity": "BM25",
                "analyzer": "simple_bigram_analyzer",
            },
        },
    }


# Custom analyzers registered on the index.
_custom_analyzers = {
    # TODO (original note, verbatim): "for case i'm it's"
    # NOTE(review): presumably about how contractions are analysed — confirm.
    "en_analyzer": {
        "type": "standard",
        "stopwords": "_english_",
    },
    # Lowercase -> shingle -> asciifolding; no stop-word removal.
    "simple_bigram_analyzer": {
        "tokenizer": "standard",
        "filter": ["lowercase", "shingle", "asciifolding"],
    },
    # Same chain as above with a "stop" filter inserted before "shingle".
    "bigram_analyzer": {
        "tokenizer": "standard",
        "filter": ["lowercase", "stop", "shingle", "asciifolding"],
    },
}

# Per-field mappings of a paragraph document.
_field_mappings = {
    "doc_id": {"type": "keyword"},
    "url": {"type": "keyword"},
    "title": _title_field_mapping(),
    "title_unescaped": _title_field_mapping(),
    "para_id": {"type": "keyword"},
    "para_idx": {"type": "integer"},
    "para_num": {"type": "integer"},
    "text": {
        "type": "text",
        "similarity": "BM25",
        "analyzer": "en_analyzer",
        # Disabled: "copy_to": "full_text",
        "fields": {
            "bigram": {
                "type": "text",
                "analyzer": "bigram_analyzer",
            },
        },
    },
    # Disabled combined field (target of the disabled copy_to entries):
    # "full_text": {
    #     "type": "text",
    #     "similarity": "BM25",
    #     "analyzer": "en_analyzer"
    # },
    "for_hotpot": {"type": "boolean"},
    # Stored as an opaque object: "enabled": False turns off parsing/indexing
    # of its contents.
    "hyperlinks": {
        "type": "object",
        "enabled": False,
    },
}

# Request body for index creation: analysis settings plus field mappings.
settings = {
    "settings": {"analysis": {"analyzer": _custom_analyzers}},
    "mappings": {"properties": _field_mappings},
}

# Recreate the target index from scratch. If an index with this name already
# exists it is deleted first — this is destructive: every document previously
# indexed under `index_name` is lost.
# The index name is passed as `index=` in all three calls so they agree with
# each other and do not rely on positional arguments, which newer
# elasticsearch-py clients no longer accept.
if es.indices.exists(index=index_name):
    es.indices.delete(index=index_name)
# Left as the final expression so the cell displays the creation response.
es.indices.create(index=index_name, body=settings)

{'acknowledged': True,
 'shards_acknowledged': True,
 'index': 'enwiki-20171001-paragraph-3.1'}

## Index passages

In [16]:
from collections import defaultdict
import json
from html import unescape
import time
from tqdm.auto import tqdm

from elasticsearch.helpers import bulk, scan


def act_batch(batch, verbose=False):
    """Send one batch of index actions to Elasticsearch via the bulk helper.

    Uses the module-level `es` client.

    Args:
        batch: list of bulk action dicts (the indexing loop builds them with
            `_index`, `_id` and `_source` keys).
        verbose: when True, print the batch size and the wall-clock time the
            bulk request took. Defaults to False, which keeps the function
            silent as before.

    Returns:
        Whatever `elasticsearch.helpers.bulk` returns (documented by
        elasticsearch-py as a `(success_count, errors)` tuple). Existing
        callers that ignore the return value are unaffected.
    """
    time_start = time.time()
    result = bulk(es, batch)
    if verbose:
        time_end = time.time()
        # The previously commented-out message referenced an undefined name
        # (`action_batch`); it now reports the size of `batch`.
        print(f'Indexed {len(batch)} items, time cost: {time_end - time_start}s')
    return result


# Copy every paragraph from an existing source index into `index_name`
# (adding an HTML-unescaped title), and export the paragraphs to two TSV files.
# Relies on `es`, `index_name`, `data_version` and `hotpot_corpus` from the
# cells above.

batch_size = 1000  # documents per bulk request; also the scan page size

# Existing index the paragraphs are read from.
source_index = 'enwiki-20171001-paragraph-3'

# All paragraphs; includes some title paragraphs for compatibility with HotpotQA.
wiki_path = f'data/enwiki-20171001-paragraph-{data_version}.tsv'
# Only non-empty abstract paragraphs for HotpotQA.
hotpot_path = f'data/hotpot-paragraph-{data_version}.tsv'

total_num_para = 0
actions = []
query = {"query": {"match_all": {}}}
# Document count of the source index, used only as the progress-bar total.
para_num = es.count(index=source_index, body=query)['count']

# `with` guarantees both files are closed even if scanning or indexing raises.
# The encoding is pinned to UTF-8 so the export does not depend on the
# platform's default encoding.
with open(wiki_path, 'w', encoding='utf-8') as wiki_out, \
        open(hotpot_path, 'w', encoding='utf-8') as hotpot_out:
    wiki_out.write("id\ttext\ttitle\thyperlinks\n")
    hotpot_out.write("id\ttext\ttitle\thyperlinks\tsentence_spans\n")

    for hit in tqdm(scan(es, query=query, index=source_index, size=batch_size), total=para_num):
        para = hit['_source']
        new_para = {
            "doc_id": para['doc_id'],
            "url": para['url'],
            "title": para['title'],
            # Title with HTML entities decoded, indexed as a separate field.
            "title_unescaped": unescape(para['title']),
            "para_id": para['para_id'],
            "para_idx": para['para_idx'],
            "para_num": para['para_num'],
            "text": para['text'],
            "hyperlinks": para['hyperlinks'],
            "for_hotpot": para['for_hotpot']
        }
        p_id = new_para['para_id']
        actions.append({
            "_index": index_name,
            "_id": p_id,
            "_source": new_para
        })

        # Every paragraph is exported; a filter on para['para_idx'] >= 0
        # ("not the title paragraph") was considered but is disabled.
        # NOTE(review): text/title are written unescaped — a tab or newline
        # inside them would break the TSV row; confirm upstream data is clean.
        # No per-row flush: the buffered writes are flushed when the files close.
        wiki_out.write(f"{p_id}\t{para['text']}\t{para['title']}\t{json.dumps(para['hyperlinks'])}\n")
        if p_id in hotpot_corpus:  # FIXME: miss some lines
            assert para['for_hotpot']
            hotpot_out.write(f"{p_id}\t{para['text']}\t{para['title']}\t{json.dumps(para['hyperlinks'])}\t{hotpot_corpus[p_id]['sentence_spans']}\n")

        if len(actions) == batch_size:
            act_batch(actions)
            # Counted only after the bulk call returned without raising.
            total_num_para += len(actions)
            actions = []

# Index whatever is left over after the last full batch.
if actions:
    act_batch(actions)
    total_num_para += len(actions)
    actions = []

print(f'Total {total_num_para} paragraphs indexed in ES')

HBox(children=(FloatProgress(value=0.0, max=18231338.0), HTML(value='')))


Total 18231338 paragraphs indexed in ES
