In [1]:
#help(helper_function)

In [2]:
from elasticsearch import Elasticsearch
#from helper_function import preprocess, load_dict_from_json
#import helper_function as helper

In [3]:
# %load helper_function.py
import json
import string
import re
from elasticsearch import Elasticsearch

def load_dict_from_json(filename):
    try:
        with open(filename, 'r') as f:
            data = json.load(f)
        return data
    except:
        print(f'File \'{filename}\' not found.')
        return None

def save_dict_to_json(doc, filename):
    with open(filename, 'w') as f:
        json.dump(doc, f)


def reset_index(es: Elasticsearch, index_name: str, index_settings) -> None:
    """Clears index"""
    if es.indices.exists(index_name):
        es.indices.delete(index=index_name)

    es.indices.create(index=index_name, body=index_settings)

def preprocess(doc: str) -> str:
    """Preprocesses text to prepare it for feature extraction.

    Args:
        doc: String comprising the unprocessed contents of some email file.

    Returns:
        String comprising the corresponding preprocessed text.
    """
    re_html = re.compile("<[^>]+>")
    doc = re_html.sub(" ", doc)
    #remove pure digits 
    doc=re.sub(r"(\b|\s+\-?|^\-?)(\d+|\d*\.\d+)\b","",doc)
    # Replace punctuation marks (including hyphens) with spaces.
    for c in string.punctuation:
        doc = doc.replace(c, " ")
    return doc.lower()


class Indexer:
    def __init__(self,index: str,index_settings:dict, reset=True):
        #self._filepath = filepath
        self.dictionary = {}     
        self.index = index
        self.index_settings= index_settings
        es = Elasticsearch()
        es.info()
        self.es = es
        if reset:
            self.reset_index()

    
    def preprocess(self, doc: str) -> str:
        """Preprocesses text to prepare it for feature extraction.

    Args:
        doc: String comprising the unprocessed contents of some email file.

    Returns:
        String comprising the corresponding preprocessed text.
    """
        re_html = re.compile("<[^>]+>")
        doc = re_html.sub(" ", doc)
        #remove pure digits 
        doc=re.sub(r"(\b|\s+\-?|^\-?)(\d+|\d*\.\d+)\b","",doc)
        # Replace punctuation marks (including hyphens) with spaces.
        for c in string.punctuation:
            doc = doc.replace(c, " ")
        return doc.lower()

    def reset_index(self) -> None:
        """Clears index"""
        if self.es.indices.exists(self.index):
            self.es.indices.delete(index=self.index)

        self.es.indices.create(index=self.index, body=self.index_settings)
    
    def bulk_index(self,data) -> None:
        """Indexes documents from JSONL file."""
        bulk_data = []
        for item in data:
            bulk_data.append(
                {"index": {"_index": self.index, "_id": item.pop("id")}}
            )
            bulk_data.append(item)
        self.es.bulk(index=self.index, body=bulk_data, refresh=True)
    
    def check_esIndex_count(self)->int:
        self.es.indices.refresh(self.index)
        count = self.es.cat.count(self.index, params={"format": "json"})
        return int(count[0]["count"])

    def check_esIndex_content(self, id:str):
        return self.es.get(index=self.index, id=id)

In [4]:
class IndexerTrainingType(Indexer):
    def __init__(
        self,
        index: str,
        index_settings:dict
    ) -> None:
        super(IndexerTrainingType, self).__init__(index, index_settings)
    
    def prepare_bulk_data(self,data):
        dic={}
        for entry in data:
            if entry['category']=='resource':
                processed_question=preprocess(entry['question'])
                for item in entry['type']:   
                    #print(processed_question)
                    dic[item]=dic.get(item,"")+processed_question
        collections=[{"id":str(i),"type":type,"questions":dic[type]} for i,type in enumerate(dic)]
        return collections

In [9]:
def prepare_bulk_data(data):
    dic={}
    i=0
    for entry in data:
        i+=1
        #print("-----entry:",entry)
        if entry['question']==None:
            continue
        if entry['category']=='resource':
            processed_question=preprocess(entry['question'])
            for item in entry['type']:      
                #print(processed_question)
                dic[item]=dic.get(item,"")+processed_question
    collections=[{"id":str(i),"type":type,"questions":dic[type]} for i,type in enumerate(dic)]
    print("----i",i)
    return collections


----i 17571


In [8]:
INDEX_NAME = "trainning_type_questions"
INDEX_SETTINGS = {
    "mappings": {
        "properties": {
            "type": {
                "type": "text",
                "term_vector": "yes",
                "analyzer": "english",
            },
            "questions": {
                "type": "text",
                "term_vector": "yes",
                "analyzer": "english",
            },

        }
    }
}
training_data=load_dict_from_json("../../smart-dataset/datasets/DBpedia/smarttask_dbpedia_train.json")
collections=prepare_bulk_data(training_data)
index_trainning_type=Indexer(INDEX_NAME,INDEX_SETTINGS)
index_trainning_type.bulk_index(collections)
index_trainning_type.check_esIndex_count()
#index_trainning_type.check_esIndex_content("10")

  if self.es.indices.exists(self.index):
  self.es.indices.create(index=self.index, body=self.index_settings)
  self.es.indices.refresh(self.index)
  count = self.es.cat.count(self.index, params={"format": "json"})


306

In [15]:
with open("../data/training_types.json", 'w',encoding='utf-8') as f:
  json.dump(collections, f, ensure_ascii=False)