In [23]:
import json
import requests
import os
import pandas as pd
from neo4j import GraphDatabase
from data_normalizer import normalize
from dotenv import load_dotenv
# Load environment variables (e.g. NER_SERVICE_URL, read by extract_ner) from a .env file, if one is found.
load_dotenv()

False

In [2]:
def get_concepts(text):
    """Find which concepts of the alias dictionary are mentioned in ``text``.

    The alias dictionary is read from ``data/intent_alias_data.json`` on every
    call. For each concept its aliases are tried from longest to shortest, and
    the first alias that occurs as a substring of ``text`` is recorded.

    Args:
        text (str): the sentence to scan.

    Returns:
        dict: concept name -> matched alias, containing only the concepts for
        which some alias was found in ``text``.
    """
    with open('data/intent_alias_data.json', encoding="utf8") as alias_file:
        alias_table = json.load(alias_file)

    matches = {}
    for concept, aliases in alias_table.items():
        # Longest alias first so the most specific wording wins.
        longest_first = sorted(aliases, key=len, reverse=True)
        hit = next((candidate for candidate in longest_first if candidate in text), None)
        if hit is not None:
            matches[concept] = hit
    return matches

In [9]:
# Model identifiers that can be passed as ``model`` to extract_ner
# (presumably the names the NER service recognizes -- confirm against the service).
NER_MODEL_BERT = "phobert_large"  # default model of extract_ner
NER_MODEL_BILSTM = "BiLSTM"
NER_MODEL_BILSTM_CRF = "BiLSTM+CRF"

# Intent-model identifier; not referenced by any code visible in this notebook.
INTENT_MODEL_ONE_VS_REST ="onevsrest"

def extract_ner(text, model=NER_MODEL_BERT, timeout=30):
    """Send ``text`` to the NER service and return its decoded JSON response.

    The service URL is taken from the ``NER_SERVICE_URL`` environment variable
    and falls back to ``http://localhost:8001/api/v1/ner``.

    Args:
        text (str): the sentence to extract named entities from.
        model (str): model identifier sent in the request body.
        timeout (float | tuple | None): seconds to wait for the service, passed
            to ``requests.post``. Without a timeout an unresponsive service
            would block the notebook indefinitely; pass None to wait forever.

    Returns:
        The JSON-decoded body of the response.

    Raises:
        requests.exceptions.Timeout: the service did not answer in time.
        requests.exceptions.HTTPError: the service answered with a 4xx/5xx status.
    """
    ner_service_url = os.getenv("NER_SERVICE_URL", default="http://localhost:8001/api/v1/ner")
    payload = {'model': model, 'text': text}
    # NOTE(review): 'Accept: text/plain' is kept as-is although the body is
    # decoded as JSON below -- confirm what the service expects.
    headers = {'Content-type': 'application/json', 'Accept': 'text/plain'}
    response = requests.post(
        ner_service_url,
        data=json.dumps(payload),
        headers=headers,
        timeout=timeout,
    )
    # Fail here with a clear HTTP error instead of handing an error payload
    # to the downstream post-processing.
    response.raise_for_status()
    return response.json()

In [4]:
def ner_postprocess(entities_raw):
    """Group NER results by label, keeping both raw and normalized values.

    Each entity's label is title-cased to form the grouping key; entities whose
    key is ``'O'`` are dropped. Every remaining entity contributes its raw
    content to one mapping and ``normalize(content, key)`` to the other, in
    input order.

    Args:
        entities_raw (Iterable[Dict]): result of the NER service; each item is
            read through its ``'label'`` and ``'content'`` entries.

    Returns:
        Dict, Dict: raw values by label, normalized values by label.
    """
    raw_by_label = {}
    normed_by_label = {}
    for item in entities_raw:
        label = item['label'].title()
        if label != 'O':
            content = item['content']
            raw_by_label.setdefault(label, []).append(content)
            normed_by_label.setdefault(label, []).append(normalize(content, label))
    return raw_by_label, normed_by_label

In [36]:
def get_entities(text):
    """Run NER on ``text`` and return only the normalized entities, grouped by label."""
    _raw, normalized = ner_postprocess(extract_ner(text))
    return normalized

In [129]:
import constants
def _cypher_escape(value):
    """Return ``value`` as text safe to place inside a single-quoted Cypher string literal."""
    return str(value).replace("\\", "\\\\").replace("'", "\\'")


def _range_condition(field, bounds, tolerance):
    """Build ``<field> BETWEEN <low> AND <high>`` from a ``(low, high)`` pair.

    A ``high`` of None means a single value was given; it is widened by
    ``tolerance`` (a fraction of the value) on both sides.
    """
    low, high = bounds
    if high is None:
        high = low + low * tolerance
        low = low - low * tolerance
    return f"{field} BETWEEN {low} AND {high}"


def gen_query(concepts_keys, evidences_keys, targets, evidences):
    """Build a Cypher query string from matched concepts and extracted evidences.

    Every concept becomes a node pattern ``(n<i>:<Concept>)`` in ``MATCH``.
    Evidences become ``WHERE`` conditions. A concept that is filtered by a
    location evidence (district / city / ward / street) is not returned; all
    other concepts are returned as ``n<i>.individual as <Concept>``.

    Args:
        concepts_keys (Iterable[str]): concept names, used as node labels.
            They are interpolated into the query unescaped.
        evidences_keys: unused; kept for interface compatibility.
        targets: unused; kept for interface compatibility.
        evidences (dict): label -> list of normalized values. Floor values are
            dicts with ``'type'`` and ``'value'``; price and area values are
            ``(low, high)`` pairs where ``high`` may be None.

    Returns:
        str: the query text, limited to 30 rows. The ``WHERE`` line is omitted
        when there is no condition.

    Raises:
        KeyError: a floor evidence has a ``'type'`` with no known column.
    """
    # NOTE(review): BETWEEN and LIKE below are not Cypher operators (Cypher
    # uses chained comparisons and STARTS WITH / ENDS WITH / CONTAINS / =~),
    # and the non-location conditions reference the label name rather than a
    # MATCH variable. Left untouched because the target schema is not visible
    # here -- confirm against the database before relying on those filters.
    PRICE_OFFSET_CONST = 0.1
    AREA_OFFSET_CONST = 0.1

    # (variable, label) for every concept; empty when no concept was matched.
    nodes = [(f"n{idx}", label) for idx, label in enumerate(concepts_keys or ())]
    if nodes:
        matched_labels = [f"({var}:{label})" for var, label in nodes]
    else:
        # No concept: match any node. Parenthesized so the pattern is valid Cypher.
        matched_labels = ['(n)']
    returned_nodes = list(nodes)
    conditions = []

    # Conditions on plain attributes.
    attrs = [constants.LABEL_REAL_ESTATE_TYPE, constants.LABEL_REAL_ESTATE_SUB_TYPE,
             constants.LABEL_POSITION, constants.LABEL_DIRECTION,
             constants.LABEL_FRONT_LENGTH, constants.LABEL_ROAD_WIDTH,
             constants.LABEL_FLOOR, constants.LABEL_BED_ROOM, constants.LABEL_LIVING_ROOM,
             constants.LABEL_BATH_ROOM,
             constants.LABEL_SURROUNDING, constants.LABEL_PROJECT_NAME,
             constants.LABEL_LEGAL, constants.LABEL_TRANSACTION]
    key2dbcol = {
        'tang': constants.LABEL_FLOOR,
        'ban cong': constants.LABEL_FLOOR_BAN_CONG,
        'gac': constants.LABEL_FLOOR_GAC,
        'ham': constants.LABEL_FLOOR_HAM,
        'lung': constants.LABEL_FLOOR_LUNG,
        'san thuong': constants.LABEL_FLOOR_SAN_THUONG,
        'tret': constants.LABEL_FLOOR_TRET,
    }
    for attr in attrs:
        values = evidences.get(attr)
        if not values:
            continue
        if attr == constants.LABEL_FLOOR:
            # Each floor evidence names its own column through its 'type'.
            for val in values:
                target_k = key2dbcol[val['type']]
                conditions.append(f"{target_k} = '{_cypher_escape(val['value'])}'")
        else:
            conditions.append(f"{attr}.individual = '{_cypher_escape(values[0])}'")

    # Conditions on location nodes: filter on the matching concept node and
    # stop returning it.
    loc_attrs = [constants.LABEL_DISTRICT, constants.LABEL_CITY,
                 constants.LABEL_WARD, constants.LABEL_STREET]
    for attr in loc_attrs:
        attr = attr.title()
        values = evidences.get(attr)
        if not values:
            continue
        # Reset per attribute so a variable found for a previous location
        # attribute is never reused for this one.
        condition_var = None
        for node in nodes:
            var, label = node
            if attr in label:
                condition_var = var
                if node in returned_nodes:
                    returned_nodes.remove(node)
        if condition_var is None:
            # No matched concept carries this location, so there is no node
            # variable to put the condition on.
            continue
        conditions.append(f"{condition_var}.individual = '{_cypher_escape(values[0])}'")

    # Range conditions; only the first price / area evidence is used.
    price_values = evidences.get(constants.LABEL_PRICE)
    if price_values:
        conditions.append(_range_condition(constants.LABEL_PRICE, price_values[0], PRICE_OFFSET_CONST))

    area_values = evidences.get(constants.LABEL_AREA)
    if area_values:
        conditions.append(_range_condition(constants.LABEL_AREA, area_values[0], AREA_OFFSET_CONST))

    # Usage is matched as an item of a ", "-separated list.
    usage_values = evidences.get(constants.LABEL_USAGE)
    if usage_values:
        usage = constants.LABEL_USAGE
        alternatives = []
        for raw in usage_values:
            x = _cypher_escape(raw)
            alternatives.append(
                f"{usage} LIKE '%, {x},%' OR {usage} LIKE '{x}, %' OR {usage} LIKE '%, {x}'"
            )
        conditions.append("({})".format(" OR ".join(alternatives)))

    # RETURN items.
    if not nodes:
        return_items = ['n']
    else:
        if not returned_nodes:
            # Every concept was used as a filter; return them all rather than
            # emitting an empty RETURN clause.
            returned_nodes = list(nodes)
        return_items = [f"{var}.individual as {label}" for var, label in returned_nodes]

    # Assemble with the same layout as before; WHERE only when needed.
    lines = ["", f"    MATCH {', '.join(matched_labels)} "]
    if conditions:
        lines.append(f"    WHERE {' AND '.join(conditions)}")
    lines.append(f"    RETURN {', '.join(return_items)}")
    lines.append("    LIMIT 30")
    lines.append("        ")
    return "\n".join(lines)

In [130]:
import prettytable

def gen_query_ontology(text):
    """Turn a question into a query and print a trace of the intermediate steps.

    Concepts are found by alias matching and evidences by NER. The concepts
    that have no evidence are listed as targets. A table with the input, the
    concepts, the evidences, the targets and the generated query is printed.

    Args:
        text (str): the question to translate.

    Returns:
        The query produced by ``gen_query``.
    """
    concepts = get_concepts(text)
    evidences = get_entities(text)

    # Concepts mentioned in the question for which no value was extracted.
    targets = list(set(concepts.keys()).difference(set(evidences.keys())))
    query = gen_query(concepts.keys(), evidences.keys(), targets, evidences)

    trace = (
        ("Input", text),
        ("Match alias", concepts),
        ("Find individuals", evidences),
        ("Target concepts", targets),
        ("Query", query),
    )
    table = prettytable.PrettyTable(["Step", "Result"])
    for step, result in trace:
        table.add_row([step, result])

    print(table)
    return query

In [131]:
# Sample question (Vietnamese), roughly: "how much do houses in District 8 usually cost?"
text = "nhà ở quận 8 thường có giá khoảng bao nhiêu"

# Build the query for the sample question; gen_query_ontology also prints a step-by-step trace table.
cqlNodeQuery = gen_query_ontology(text)
i = 0  # NOTE(review): not used by any code visible in this notebook

+------------------+-----------------------------------------------------------------------+
|       Step       |                                 Result                                |
+------------------+-----------------------------------------------------------------------+
|      Input       |              nhà ở quận 8 thường có giá khoảng bao nhiêu              |
|   Match alias    | {'Price': 'giá khoảng bao nhiêu', 'District': 'quận', 'House': 'nhà'} |
| Find individuals |                          {'District': ['8']}                          |
| Target concepts  |                           ['House', 'Price']                          |
|      Query       |                                                                       |
|                  |                MATCH (n0:Price), (n1:District), (n2:House)            |
|                  |                         WHERE n1.individual = '8'                     |
|                  |           RETURN n0.individual as Price, n2.indiv

In [24]:
# Connect neo4j desktop.
# Connection settings are read from the environment (NEO4J_URI, NEO4J_USER,
# NEO4J_PASSWORD) so credentials need not live in the notebook; the defaults
# are the previous hard-coded local values.
driver = GraphDatabase.driver(
    os.getenv("NEO4J_URI", "bolt://localhost:7687"),
    auth=(os.getenv("NEO4J_USER", "neo4j"), os.getenv("NEO4J_PASSWORD", "password")),
)

In [132]:
def run_query(query, database="htdb"):
    """Run ``query`` through the module-level ``driver`` and tabulate the rows.

    Args:
        query (str): the query text to execute.
        database (str): name of the database to open the session on.
            Defaults to ``"htdb"``, the database used before this became
            configurable.

    Returns:
        prettytable.PrettyTable: one column per result key, one row per record.
    """
    with driver.session(database=database) as session:
        results = session.run(query)

        table_results = prettytable.PrettyTable(results.keys())
        # Consume the records while the session is still open.
        for record in results:
            table_results.add_row(record.values())

        return table_results


In [133]:
# Execute the generated query and print the returned rows as a text table.
result_data = run_query(cqlNodeQuery)

print(result_data)

+--------------+----------------------+
|    Price     |        House         |
+--------------+----------------------+
| 1000000000.0 |      shophouse       |
| 1000000000.0 |       chung cu       |
| 1000000000.0 |         lien         |
| 1000000000.0 |      officetel       |
| 1000000000.0 |    toa vinaconex     |
| 1000000000.0 |        cap 4         |
| 1000000000.0 |        studio        |
| 1000000000.0 |    shophouse bien    |
| 1000000000.0 |       biet thu       |
| 1000000000.0 |      khach san       |
| 1000000000.0 |       bo kinh        |
| 1000000000.0 |          ch          |
| 1000000000.0 |      hang sang       |
| 1000000000.0 |       song lap       |
| 1000000000.0 |       lien ke        |
| 1000000000.0 |     cong nghiep      |
| 1000000000.0 |  shophouse mat bien  |
| 1000000000.0 |        duplex        |
| 1000000000.0 | shophouse thuong mai |
| 1000000000.0 |      smarthome       |
| 1000000000.0 |     villas bien      |
| 1000000000.0 |      sill park       |
