In [12]:
import json
import requests
import os
from data_normalizer import normalize
from dotenv import load_dotenv
load_dotenv()

False

In [13]:
def get_concepts(text):
    with open('data/intent_alias_data.json', encoding="utf8") as f:
        dictionary = json.load(f)
    
    out = {}
    for concept in dictionary:
        for alias in sorted(dictionary[concept], key=len, reverse=1):
            if alias in text:
                out[concept]=alias
                break
    return out

In [14]:
NER_MODEL_BERT = "phobert_large"
NER_MODEL_BILSTM = "BiLSTM"
NER_MODEL_BILSTM_CRF = "BiLSTM+CRF"

INTENT_MODEL_ONE_VS_REST ="onevsrest"

def extract_ner(text, model=NER_MODEL_BERT):
    """
    Input Arguments:
        - text : the sentence which will be extracted NER
    """
    ner_service_url = os.getenv("NER_SERVICE_URL", default="http://localhost:8001/api/v1/ner")
    data = {'model': model, 'text': text}
    headers = {'Content-type': 'application/json', 'Accept': 'text/plain'}
    r = requests.post(ner_service_url, data=json.dumps(data), headers=headers)
    return r.json()

In [15]:
def ner_postprocess(entities_raw):
    """Postprocess for NER: 
    1. normalize values
    2. only accept the related information with previous question

    Args:
        entities_raw (Dict): result of NER service

    Returns:
        Dict, Dict: raw and normalized entities
    """
    entities_normed = dict()
    entities_raw_out = dict()
    for entity in entities_raw:
        key = entity['label']
        if key == 'O': continue
        value_raw = entity['content']
        if key not in entities_raw_out.keys():
            entities_raw_out[key] = [value_raw]
        else:
            entities_raw_out[key].append(value_raw)
        
        value_normed = normalize(value_raw, key)
        if key not in entities_normed.keys():
            entities_normed[key] = [value_normed]
        else:
            entities_normed[key].append(value_normed)
    
    return entities_raw_out, entities_normed

In [16]:
def get_entities(text):
    entities_raw = extract_ner(text)
    _, entities = ner_postprocess(entities_raw)
    return entities

In [22]:
import constants
def gen_query(targets, evidences):
    out = """
    MATCH ({}) 
    WHERE {}
    RETURN {}
    LIMIT 100
        """
    conditions = []

    attrs = [constants.LABEL_REAL_ESTATE_TYPE, constants.LABEL_REAL_ESTATE_SUB_TYPE, \
            constants.LABEL_POSITION, constants.LABEL_DIRECTION, \
            constants.LABEL_FRONT_LENGTH, constants.LABEL_ROAD_WIDTH, \
            constants.LABEL_FLOOR, constants.LABEL_BED_ROOM, constants.LABEL_LIVING_ROOM, constants.LABEL_BATH_ROOM, \
            constants.LABEL_SURROUNDING, constants.LABEL_PROJECT_NAME, \
            constants.LABEL_LEGAL, constants.LABEL_TRANSACTION]
    key2dbcol = {
                'tang': constants.LABEL_FLOOR,
                'ban cong': constants.LABEL_FLOOR_BAN_CONG,
                'gac': constants.LABEL_FLOOR_GAC,
                'ham': constants.LABEL_FLOOR_HAM,
                'lung': constants.LABEL_FLOOR_LUNG,
                'san thuong': constants.LABEL_FLOOR_SAN_THUONG,
                'tret': constants.LABEL_FLOOR_TRET
            }

    for attr in attrs:
        if attr in evidences:
            if attr == constants.LABEL_FLOOR:
                for val in evidences[attr]:
                    target_k = key2dbcol[val['type']]
                    target_v = val['value']
                    conditions.append(f"{target_k} = '{target_v}'")
            else:
                conditions.append(f"{attr} = '{evidences[attr][0]}'")
    
    loc_attrs = [constants.LABEL_DISTRICT, constants.LABEL_CITY, constants.LABEL_WARD, constants.LABEL_STREET]
    for attr in loc_attrs:
        if attr in evidences:
            conditions.append(f"{attr}.name = '{evidences[attr][0]}'")
    
    PRICE_OFFSET_CONST = 0.1
    if constants.LABEL_PRICE in evidences:
        for ele in evidences[constants.LABEL_PRICE][:1]:
            low, high = ele
            if high is None:
                high = low + low*PRICE_OFFSET_CONST
                low = low - low*PRICE_OFFSET_CONST
            
            conditions.append(f"{constants.LABEL_PRICE} BETWEEN {low} AND {high}")
    
    AREA_OFFSET_CONST = 0.1
    if constants.LABEL_AREA in evidences:
        for ele in evidences[constants.LABEL_AREA][:1]:
            low, high = ele
            if high is None:
                high = low + low*AREA_OFFSET_CONST
                low = low - low*AREA_OFFSET_CONST
            
            conditions.append(f"{constants.LABEL_AREA} BETWEEN {low} AND {high}")
    
    if constants.LABEL_USAGE in evidences:
        conditions.append("({})".format(" OR ".join([f"{constants.LABEL_USAGE} LIKE '%, {x},%' OR {constants.LABEL_USAGE} LIKE '{x}, %' OR {constants.LABEL_USAGE} LIKE '%, {x}'" for x in evidences[constants.LABEL_USAGE]])))

    if not targets:
        targets = ['*']

    return out.format(
        ', '.join(targets),
        ' AND '.join([f"{x}" for x in conditions]),
        ', '.join(targets),
        )

In [18]:
import prettytable

def gen_query_ontology(text):
    table = prettytable.PrettyTable(["Step", "Result"])
    table.add_row(["Input", text])
    
    concepts = get_concepts(text)
    evidences = get_entities(text)
    
    table.add_rows([
        ["Match alias", concepts],
        ["Find individuals", evidences]
    ])
    
    targets = list(set(concepts.keys()).difference(set(evidences.keys())))
    table.add_row(["Referents", targets])

    query = gen_query(targets,evidences)
    table.add_row(["Query", query])
    
    print(table)
    return query

In [27]:
text = "nhà ở q8 thường có giá khoảng bao nhiêu ạ"
# text = "mình đang có 2 tỷ, nên mua nhà hoặc căn hộ ở quận nào nhỉ"
# text = "nhà hoặc căn hộ giá khoảng 2 tỷ thì mua ở quận nào nhỉ"

gen_query_ontology(text)
i = 0

evidences:  {'district': ['8']}
attr:  district
-----
+------------------+-------------------------------------------------------------------------------+
|       Step       |                                     Result                                    |
+------------------+-------------------------------------------------------------------------------+
|      Input       |                   nhà ở q8 thường có giá khoảng bao nhiêu ạ                   |
|   Match alias    | {'Price': 'giá khoảng bao nhiêu', 'House': 'nhà', 'Yes': 'có', 'Hello': 'hi'} |
| Find individuals |                              {'district': ['8']}                              |
|    Referents     |                       ['Price', 'Yes', 'House', 'Hello']                      |
|      Query       |                                                                               |
|                  |                         MATCH (Price, Yes, House, Hello)                      |
|                  |                 