**Index the DBpedia entities into Elasticsearch using the entity-centric method**

In [1]:
import json
import pickle
from helper_function import preprocess, Indexer
from elasticsearch import Elasticsearch
from typing import Dict, List, Optional

In [4]:
# Index mapping: both document fields are analysed English text that also
# stores term vectors, so their (identical) mapping is generated per field.
INDEX_SETTINGS = {
    "mappings": {
        "properties": {
            field_name: {
                "type": "text",
                "term_vector": "yes",
                "analyzer": "english",
            }
            for field_name in ("entity", "abstract")
        }
    }
}


In [28]:
def index_dbpdiea_entity_centric(index: str, index_settings: Dict, reset: bool = False,
                                 filepath: str = "../data/short_abstracts_en.ttl",
                                 es: "Optional[Elasticsearch]" = None) -> bool:
    """Parse DBpedia entities with their abstracts and index them into Elasticsearch.

    Each usable line of the file becomes one document with the fields
    ``entity`` and ``abstract``; document ids are consecutive integers
    (as strings) starting at "0".

    Args:
        index: index name for Elasticsearch.
        index_settings: body passed to ``es.indices.create`` when the index
            has to be created.
        reset: set to True to delete an existing index and re-index from
            the file.
        filepath: file which contains the entity name and short abstract,
            one entry per line.
        es: Elasticsearch client. When omitted, a new default client is
            created for this call.

    Returns:
        True if the index already existed (nothing is re-indexed) or if the
        file was indexed; False if ``filepath`` does not exist, in which case
        no index is created.
    """
    if es is None:
        # Build the client per call: a client in the signature default would be
        # created once at definition time and shared by every later call.
        es = Elasticsearch()
    es.info()

    if reset and es.indices.exists(index=index):
        es.indices.delete(index=index)
        print(f'"{index}" is reset.')

    if es.indices.exists(index=index):
        count = es.cat.count(index=index, params={"format": "json"})
        print(f'"{index}" already exists in ElasticSearch, and there are {int(count[0]["count"])} items.') 
        return True

    # Open the file before creating the index so that a missing file leaves
    # Elasticsearch untouched. The handle is streamed below and closed by the
    # ``with`` block instead of being read into memory in one piece.
    try:
        source = open(filepath, "r", encoding="utf-8")
    except FileNotFoundError:
        print(f'"{filepath}" is not found.')
        return False

    with source:
        print(f'"{filepath}" is loaded.') 

        # process file and index into Elasticsearch
        es.indices.create(index=index, body=index_settings)
        i = 0  # number of documents indexed so far; doubles as the document id
        j = 0  # number of lines read, including skipped ones
        for line in source:
            j += 1
            if j % 100000 == 0:
                print("-----------j", j)
            if line.startswith('#'):
                continue
            tokens = line.split(' ')
            if len(tokens) < 3:
                continue
            # First token is the subject in angle brackets; keep only the last
            # path segment of it as the entity name.
            entity = preprocess(tokens[0][1:-1].split('/')[-1])
            # Drop the last four characters of the final text token --
            # presumably the closing quote plus language tag (`"@en`); confirm
            # against the dump format.
            tokens[-2] = tokens[-2][:-4]
            # Tokens between the second token and the trailing one form the
            # abstract text; backslashes are removed.
            abstract = preprocess(' '.join(tokens[2:-1]).replace('\\', ''))
            doc = {"entity": entity, "abstract": abstract}
            es.index(index=index, doc_type="_doc", id=str(i), body=doc)
            i += 1

    # Refresh so the count below reflects the documents just written.
    es.indices.refresh(index=index)
    count = es.cat.count(index=index, params={"format": "json"})
    print(f'{j} entities are processed, \n{count[0]["count"]} entities are indexed into elastic search')
    return True


In [1]:
# #test with small file
# index="test_entity_centric"
# index_settings=INDEX_SETTINGS
# filepath="../data/xaa_small_abs"
# index_dbpdiea_entity_centric(index, index_settings,
#                            reset=True,
#                            filepath=filepath,
#                            es=Elasticsearch())
# es = Elasticsearch()
# es.info()
# es.get(index=index,id="15")

In [5]:
# Connect once, verify the cluster is reachable, and reuse that same client
# for indexing instead of constructing a second one for the call.
es = Elasticsearch()
es.info()
INDEX_NAME = "dbpedia_entity_centric"
filepath = "../data/short_abstracts_en.ttl"
# reset=False: if the index already exists it is left as-is and only reported.
index_dbpdiea_entity_centric(index=INDEX_NAME, index_settings=INDEX_SETTINGS,
                             reset=False,
                             filepath=filepath,
                             es=es)

  es.index(index=INDEX_NAME, doc_type="_doc", id=str(i), body=doc)


-----------i 0
-----------j 100000
-----------i 100000
-----------j 200000
-----------i 200000
-----------j 300000
-----------i 300000
-----------j 400000
-----------i 400000
-----------j 500000
-----------i 500000
-----------j 600000
-----------i 600000
-----------j 700000
-----------i 700000
-----------j 800000
-----------i 800000
-----------j 900000
-----------i 900000
-----------j 1000000
-----------i 1000000
-----------j 1100000
-----------i 1100000
-----------j 1200000
-----------i 1200000
-----------j 1300000
-----------i 1300000
-----------j 1400000
-----------i 1400000
-----------j 1500000
-----------i 1500000
-----------j 1600000
-----------i 1600000
-----------j 1700000
-----------i 1700000
-----------j 1800000
-----------i 1800000
-----------j 1900000
-----------i 1900000
-----------j 2000000
-----------i 2000000
-----------j 2100000
-----------i 2100000
-----------j 2200000
-----------i 2200000
-----------j 2300000
-----------i 2300000
-----------j 2400000
-----------i 240

In [2]:
# term="appellate" 
# field="abstract"
# hits = (
#         es.search(
#             index=INDEX_NAME,
#             query={"match": {field: term}},
#             _source=False,
#             size=1,
#         )
#         .get("hits", {})
#         .get("hits", {})
#     )
# hits