**Index DBpedia entities into Elasticsearch using the type-centric method**

In [1]:
from elasticsearch import Elasticsearch
from helper_function import preprocess, load_dict_from_json
import json
from typing import Dict, List, Optional

In [2]:
# Name of the Elasticsearch index that receives the type-centric documents.
INDEX_NAME = "dbpdiea_type_centric"

# Mapping shared by every field below: English-analysed text with
# term vectors enabled.
_ENGLISH_TEXT_FIELD = {
    "type": "text",
    "term_vector": "yes",
    "analyzer": "english",
}

# Index body passed to `es.indices.create`; both fields use the same mapping,
# each with its own copy so they can be edited independently.
INDEX_SETTINGS = {
    "mappings": {
        "properties": {
            field_name: dict(_ENGLISH_TEXT_FIELD)
            for field_name in ("type", "abstract")
        }
    }
}

In [17]:
def _count_indexed_docs(es, index: str) -> int:
    """Return the document count that the cat-count API reports for *index*."""
    count = es.cat.count(index=index, params={"format": "json"})
    return int(count[0]["count"])


def index_dbpdiea_type_centric(index: str, index_settings: Dict,
                               reset=False, testmode=False,
                               filepath="../data/DBpedia_map_type_abstract.json",
                               es: "Optional[Elasticsearch]" = None) -> bool:
    """Index the type/abstract records stored in *filepath* into Elasticsearch.

    The file is parsed as JSON and treated as a sequence of records, each
    providing the keys ``"id"``, ``"type"`` and ``"abstract"``. Every record
    becomes one document ``{"type": ..., "abstract": ...}`` stored under the
    record's ``"id"``.

    Args:
        index: Name of the Elasticsearch index.
        index_settings: Body passed to ``es.indices.create`` (mappings etc.).
        reset: If True and the index exists, delete it first so that the data
            is indexed again from scratch.
        testmode: Currently unused; kept so existing callers keep working.
        filepath: Path of the JSON file with the records to index.
        es: Elasticsearch client. When omitted, a default ``Elasticsearch()``
            client is created at call time.

    Returns:
        True if the index already existed (nothing is re-indexed) or was
        created and filled; False if *filepath* does not exist.
    """
    # Build the default client lazily: a client in the signature would be
    # created once, when the function is defined, and shared by every call.
    if es is None:
        es = Elasticsearch()

    # Result is discarded; presumably an up-front connectivity check.
    es.info()

    if reset and es.indices.exists(index=index):
        es.indices.delete(index=index)
        print(f'"{index}" is reset.')

    if es.indices.exists(index=index):
        print(f'"{index}" already exists in ElasticSearch, and there are {_count_indexed_docs(es, index)} items.') 
        return True

    try:
        # The context manager closes the file even if JSON parsing fails.
        with open(filepath, "r", encoding="utf8") as file_object:
            collections = json.load(file_object)
    except FileNotFoundError:
        print(f'"{filepath}" is not loaded,please check.')
        return False
    print(f'"{filepath}" is loaded, and the length is {len(collections)}.') 

    # the reading for the 7th item is really slow, do some processing
    # (guarded so that files with fewer than 8 records can still be indexed)
    if len(collections) > 7:
        collections[7]["abstract"] = preprocess(collections[7]["abstract"])

    es.indices.create(index=index, body=index_settings)
    for item in collections:
        doc = {"type": item["type"], "abstract": item["abstract"]}
        es.index(index=index, doc_type="_doc", id=item["id"], body=doc)

    # Refresh so the count below includes the documents just indexed.
    es.indices.refresh(index=index)
    print(f'"{index}" are created in ElasticSearch, and {_count_indexed_docs(es, index)} items indexed.') 
    return True

In [20]:
# Build the index (or report the existing one) using the module-level settings.
index_dbpdiea_type_centric(index=INDEX_NAME, index_settings=INDEX_SETTINGS)

"dbpdiea_type_centric" already exists in ElasticSearch, and there are 421 items.


  if es.indices.exists(index):
  count = es.cat.count(index, params={"format": "json"})


True

In [3]:
# Spot check: create a client with timeout=120 and fetch the document stored
# under id "419" from the index, so the notebook displays what was indexed.
es=Elasticsearch(timeout=120)
es.get(index=INDEX_NAME,id="419")



{'_index': 'dbpdiea_type_centric',
 '_type': '_doc',
 '_id': '419',
 '_version': 1,
 '_seq_no': 419,
 '_primary_term': 1,
 'found': True,
 '_source': {'type': 'dbo:BoxingLeague',
  'abstract': ' Super is a boxing tournament that is held in New Zealand  Super is owned by John McRae  who is the Managing director of VSLive  VS Live was formally known as Sky Arena  Sky ARENA previously owned by SKY Television New Zealand and VADR Media  In VADR Media acquired  of SKY ARENA shares and relaunched as VS LIVE '}}