**Index the DBpedia entities into elasticSearch based on TYPE CENTRIC method**

In [1]:
from elasticsearch import Elasticsearch
from helper_function import preprocess, load_dict_from_json
import json
from typing import Dict, List, Optional

In [2]:
!which python

/c/Users/junec/AppData/Local/Programs/Python/Python38/python


In [3]:
if True:
   %run test.ipynb 

----for test


In [4]:
#%run DBpedia_map_type_entities.ipynb

In [5]:
INDEX_NAME = "dbpdiea_type_centric"
INDEX_SETTINGS = {
    "mappings": {
        "properties": {
            "type": {
                "type": "text",
                "term_vector": "yes",
                "analyzer": "english",
            },
            "abstract": {
                "type": "text",
                "term_vector": "yes",
                "analyzer": "english",
            },

        }
    }
}

In [6]:
def produce_map_type_abstract(load_filepath1="../data/DBpedia_map_type_entities.json",
                              load_filepath2="../data/DBpedia_map_entity_abstract.json",
                              save_path="../data/DBpedia_map_type_abstract.json")->bool:
    
    try:
        with open(load_filepath1, 'r',encoding='utf-8') as f:
            data_type = json.load(f)
        print(f'"{load_filepath1}" is loaded, and the length is {len(data_type)}.') 
    except:
        print(f'"{load_filepath1}" is not loaded.')
        return False
    
    try:
        with open(load_filepath2, 'r', encoding='utf-8') as file:
            data_abstract = json.load(file)
        print(f'"{load_filepath2}" is loaded, and the length is {len(data_abstract)}.') 
    except:
        print(f'"{load_filepath2}" is not loaded.')
        return False

    #parse type with the abstract for all the entities belong to this type
    map_type_abstract={}
    for dp_type,entities in data_type.items():
        abstract=""
        for entity in entities:
            abstract=abstract+data_abstract.get(entity,"")
        map_type_abstract[dp_type]=abstract
        
    collections=[{"id":str(i),"type":DBOtype,"abstract":map_type_abstract[DBOtype]} for i,DBOtype in enumerate(map_type_abstract)]
   
    with open(save_path, 'w',encoding='utf-8') as f:
        json.dump(collections, f, ensure_ascii=False)
    print(f'"{save_path}" is saved. There are {len(collections)} in this file')
        
    return True

In [17]:
def index_dbpdiea_type_centric(index:str, index_settings:Dict,reset=False,testmode=False,filepath="../data/DBpedia_map_type_abstract.json",es=Elasticsearch())->bool:
    """
    set reset to be True, if want to reset the index in the elasticSearch
    set testmode to be True, if want to use small test files
    """
    es.info()
    if reset and es.indices.exists(index):
        es.indices.delete(index=index)
        print(f'"{index}" is reset.')
        
    if es.indices.exists(index):
        count = es.cat.count(index, params={"format": "json"})
        print(f'"{index}" already exists in ElasticSearch, and there are {int(count[0]["count"])} items.') 
        return True
    try:
        file_object = open(filepath,"r",encoding="utf8") 
        collections = json.loads(file_object.read())
        print(f'"{filepath}" is loaded, and the length is {len(collections)}.') 
 
    except:
        print(f'"{filepath}" is not loaded,try to produce it')
        if testmode:
            load_filepath1="../data/test_DBpedia_map_type_entities.json"
            load_filepath2="../data/test_DBpedia_map_entity_abstract.json"
            save_path="../data/test_DBpedia_map_type_abstract.json"
        else:
            load_filepath1="../data/DBpedia_map_type_entities.json"
            load_filepath2="../data/DBpedia_map_entity_abstract.json"
            save_path="../data/DBpedia_map_type_abstract.json"
        produced=produce_map_type_abstract(load_filepath1,load_filepath2,save_path)
    
        if not produced:
            print(f'"{filepath}" is not successfully produced.')
            return False
        else:
            file_object = open(filepath,"r",encoding="utf8") 
            collections = json.loads(file_object.read())
            print(f'"{filepath}" is successfully produced and loaded, and the length is {len(collections)}.') 
        
    
    #the reading for the 7th item is really slow, do some processing
    collections[7]["abstract"]=preprocess(collections[7]["abstract"])    
    es.indices.create(index=index, body=index_settings)
    for item in collections: 
        doc={"type":item["type"],"abstract":item["abstract"]}
        es.index(index=index, doc_type="_doc", id=item["id"], body=doc)
        
    es.indices.refresh(index)
    count = es.cat.count(index, params={"format": "json"})
    print(f'"{index}" are created in ElasticSearch, and there are {int(count[0]["count"])} items indexed.') 
    return True

In [19]:
#small file for test
index="test_type_centric"
index_settings=INDEX_SETTINGS
filepath="../data/test_DBpedia_map_type_abstract.json"
index_dbpdiea_type_centric(index,index_settings,reset=True,testmode=True,filepath=filepath)

  if reset and es.indices.exists(index):


"test_type_centric" is reset.
"../data/test_DBpedia_map_type_abstract.json" is not loaded,try to produce it
"../data/test_DBpedia_map_type_entities.json" is loaded, and the length is 266.
"../data/test_DBpedia_map_entity_abstract.json" is loaded, and the length is 5999.
"../data/test_DBpedia_map_type_abstract.json" is saved. There are 266 in this file
"../data/test_DBpedia_map_type_abstract.json" is successfully produced and loaded, and the length is 266.


  if es.indices.exists(index):
  es.indices.create(index=index, body=index_settings)
  es.index(index=index, doc_type="_doc", id=item["id"], body=doc)
  es.indices.refresh(index)


"test_type_centric" are created in ElasticSearch, and there are 266 items indexed.


  count = es.cat.count(index, params={"format": "json"})


True

In [20]:
filepath="../data/DBpedia_map_type_abstract.json"
index_dbpdiea_type_centric(INDEX_NAME,INDEX_SETTINGS)

"dbpdiea_type_centric" already exists in ElasticSearch, and there are 421 items.


  if es.indices.exists(index):
  count = es.cat.count(index, params={"format": "json"})


True

In [21]:
es=Elasticsearch()
es.get(index=INDEX_NAME,id="419")

{'_index': 'dbpdiea_type_centric',
 '_type': '_doc',
 '_id': '419',
 '_version': 1,
 '_seq_no': 419,
 '_primary_term': 1,
 'found': True,
 '_source': {'type': 'dbo:BoxingLeague',
  'abstract': ' Super is a boxing tournament that is held in New Zealand  Super is owned by John McRae  who is the Managing director of VSLive  VS Live was formally known as Sky Arena  Sky ARENA previously owned by SKY Television New Zealand and VADR Media  In VADR Media acquired  of SKY ARENA shares and relaunched as VS LIVE '}}