In [38]:
import json
import requests
import os
import pandas as pd
from neo4j import GraphDatabase
from data_normalizer import normalize
from dotenv import load_dotenv
import constants
import gen_conditions
import prettytable

# Load variables from a local .env file (if one exists) into the process
# environment, so that later os.getenv(...) lookups such as NER_SERVICE_URL
# can pick them up.
load_dotenv()


False

In [14]:
def get_concepts(text):
    """Find which known concepts are mentioned in ``text``.

    The alias dictionary is read from ``data/intent_alias_data.json`` on every
    call. For each concept its aliases are tried from longest to shortest, and
    the first alias that occurs in ``text`` as a substring is recorded.

    Args:
        text (str): the sentence to scan.

    Returns:
        dict: concept name -> matched alias, only for concepts that had a hit.
    """
    with open('data/intent_alias_data.json', encoding="utf8") as alias_file:
        aliases_by_concept = json.load(alias_file)

    # Sentinel distinguishes "no alias matched" from any real alias value.
    no_hit = object()
    found = {}
    for concept in aliases_by_concept:
        longest_first = sorted(aliases_by_concept[concept],
                               key=len,
                               reverse=True)
        hit = next((alias for alias in longest_first if alias in text), no_hit)
        if hit is not no_hit:
            found[concept] = hit
    return found

In [15]:
def get_match_relation(concept_key_list):
    """Collect the relations that link pairs of detected concepts.

    ``data/match_relation_dict.json`` is read on every call and is indexed as
    ``relations[start_node][end_node]``. A relation is kept when both its start
    node and its end node are members of ``concept_key_list``.

    Args:
        concept_key_list: container of concept names (anything supporting
            the ``in`` operator, e.g. a list or ``dict.keys()``).

    Returns:
        list: the selected relations, in the order the JSON file lists them.
    """
    with open('data/match_relation_dict.json', encoding="utf8") as relation_file:
        relations_by_start = json.load(relation_file)

    return [
        relations_by_start[start][end]
        for start in relations_by_start
        if start in concept_key_list
        for end in relations_by_start[start]
        if end in concept_key_list
    ]

In [16]:
# NER model identifiers. NER_MODEL_BERT is the default `model` of extract_ner.
# NOTE(review): the two BiLSTM values are presumably alternative models the
# NER service accepts -- confirm against the service.
NER_MODEL_BERT = "phobert_large"
NER_MODEL_BILSTM = "BiLSTM"
NER_MODEL_BILSTM_CRF = "BiLSTM+CRF"

# NOTE(review): not referenced in this part of the notebook; presumably the
# identifier of an intent-classification model -- confirm against its consumer.
INTENT_MODEL_ONE_VS_REST = "onevsrest"


def extract_ner(text, model=NER_MODEL_BERT, timeout=30):
    """Send a sentence to the NER service and return its decoded JSON reply.

    The endpoint is read from the ``NER_SERVICE_URL`` environment variable and
    falls back to ``http://localhost:8001/api/v1/ner``.

    Args:
        text (str): the sentence to extract entities from.
        model (str): model identifier forwarded in the request payload.
        timeout (float | tuple | None): seconds to wait for the service,
            passed straight to ``requests.post``. Pass ``None`` to wait
            indefinitely, which is what happened before this parameter existed.

    Returns:
        The JSON-decoded response body.

    Raises:
        requests.HTTPError: the service answered with a 4xx/5xx status.
        requests.Timeout: the service did not answer within ``timeout``.
    """
    ner_service_url = os.getenv("NER_SERVICE_URL",
                                default="http://localhost:8001/api/v1/ner")

    payload = {'model': model, 'text': text}

    headers = {'Content-type': 'application/json', 'Accept': 'text/plain'}

    # Without a timeout an unresponsive service blocks the notebook forever.
    response = requests.post(ner_service_url,
                             data=json.dumps(payload),
                             headers=headers,
                             timeout=timeout)

    # Fail loudly on an error status instead of handing an error body (or a
    # JSON decoding failure) to the post-processing step.
    response.raise_for_status()

    return response.json()

In [17]:
def ner_postprocess(entities_raw):
    """Group NER results by label, in raw and in normalized form.

    Entities labelled ``'O'`` are dropped. For every other entity its
    ``content`` is appended to the list stored under its ``label``: once
    unchanged, and once after ``normalize(content, label)``.

    Args:
        entities_raw (Iterable[Dict]): result of NER service; every kept item
            must provide the keys ``'label'`` and ``'content'``.

    Returns:
        Dict, Dict: raw and normalized entities, each mapping label -> list
        of values in the order they were encountered.
    """
    grouped_raw = {}
    grouped_normed = {}

    for item in entities_raw:
        label = item['label']
        if label != 'O':
            content = item['content']
            grouped_raw.setdefault(label, []).append(content)

            normed = normalize(content, label)
            grouped_normed.setdefault(label, []).append(normed)

    return grouped_raw, grouped_normed

In [18]:
def get_entities(text):
    """Return the normalized entities found in ``text``.

    Chains ``extract_ner`` and ``ner_postprocess`` and keeps only the second
    (normalized) dictionary of the returned pair.
    """
    _raw_groups, normed_groups = ner_postprocess(extract_ner(text))
    return normed_groups

In [39]:
def _return_expression(label):
    """Turn one ``(alias:Label)`` pattern into its RETURN expression."""
    type_title = (constants.LABEL_REAL_ESTATE_TYPE).title()
    sub_type_title = (constants.LABEL_REAL_ESTATE_SUB_TYPE).title()
    house_title = (constants.LABEL_HOUSE).title()

    # Both real-estate type labels are reported under the house label.
    if type_title in label:
        label = label.replace(type_title, house_title)
    elif sub_type_title in label:
        label = label.replace(sub_type_title, house_title)

    # A bare alias such as the no-concept fallback 'n' has no "(alias:Label)"
    # shape; splitting it used to raise IndexError, so return it unchanged.
    if '(' not in label or ':' not in label:
        return label

    alias = label.split('(')[1].split(':')[0]
    column = (label.split(':')[1].split(')')[0]).title()
    return f"collect(distinct {alias}.individual) AS {column}"


def gen_query(concepts_keys, evidences_keys, targets, evidences,
              match_relation):
    """Build a Cypher query string from detected concepts and evidences.

    Args:
        concepts_keys (Iterable[str]): concept names; each one becomes a
            ``(name:Name)`` label using ``str.title()``. When empty, the single
            bare label ``n`` is used.
        evidences_keys: not used by this function; kept for compatibility.
        targets: not used by this function; kept for compatibility.
        evidences: forwarded unchanged to the ``gen_conditions`` helpers.
        match_relation (Iterable[str]): patterns joined with ``", "`` to form
            the MATCH clause.

    Returns:
        str: ``MATCH ... [WHERE ...] RETURN ... LIMIT 50``. The WHERE line is
        emitted only when the ``gen_conditions`` helpers produced at least one
        condition (an empty ``WHERE`` is not valid Cypher).
    """
    # Matched labels, format: (alias:Node_name)
    matched_labels = [
        '(' + concept_key + ':' + concept_key.title() + ')'
        for concept_key in concepts_keys
    ] or ['n']

    # Returned labels start as a copy of the matched ones; the helpers below
    # receive both lists plus the running condition list and hand back the
    # (possibly updated) returned labels and conditions. Call order is kept.
    returned_labels = matched_labels.copy()
    conditions = []

    # Condition common.
    [returned_labels,
     conditions] = gen_conditions.condition_common(evidences, matched_labels,
                                                   returned_labels, conditions)

    # Condition for location.
    [returned_labels,
     conditions] = gen_conditions.condition_location(evidences, matched_labels,
                                                     returned_labels,
                                                     conditions)

    # Condition for price (first argument True).
    [returned_labels, conditions
     ] = gen_conditions.condition_price_n_area(True, evidences, matched_labels,
                                               returned_labels, conditions)

    # Condition for area (first argument False).
    [returned_labels, conditions
     ] = gen_conditions.condition_price_n_area(False, evidences,
                                               matched_labels, returned_labels,
                                               conditions)

    # Condition for usage.
    [returned_labels,
     conditions] = gen_conditions.condition_usage(evidences, matched_labels,
                                                  returned_labels, conditions)

    return_clause = ',\n '.join(
        _return_expression(label) for label in returned_labels)

    # Emit the WHERE line only when there is something to filter on.
    where_clause = ""
    if conditions:
        where_clause = ("    WHERE " +
                        ' \nAND '.join([f"{x}" for x in conditions]) + "\n")

    # NOTE(review): an empty match_relation still yields an empty MATCH clause;
    # callers are expected to supply at least one pattern.
    return ("\n"
            "    MATCH {} \n"
            "{}"
            "    RETURN {}\n"
            "    LIMIT 50\n"
            "        ").format(', '.join(match_relation), where_clause,
                               return_clause)


In [40]:
def gen_query_ontology(text):
    """Translate a question into a query and print a trace of every step.

    Steps, in order: alias matching (``get_concepts``), relation lookup
    (``get_match_relation``), entity extraction (``get_entities``) and query
    assembly (``gen_query``). Each intermediate result is added to a
    two-column PrettyTable which is printed before returning.

    Args:
        text (str): the question to translate.

    Returns:
        The query produced by ``gen_query``.
    """
    trace = prettytable.PrettyTable(["Step", "Result"])
    trace.max_width = 80
    trace.add_row(["Input", text])

    concepts = get_concepts(text)
    concept_names = concepts.keys()
    relations = get_match_relation(concept_names)

    evidences = get_entities(text)
    evidence_names = evidences.keys()

    trace.add_rows([
        ["Match alias", concepts],
        ["Find individuals", evidences],
    ])

    # Concepts named in the question for which no entity value was extracted.
    targets = list(set(concept_names).difference(set(evidence_names)))
    trace.add_row(["Target concepts", targets])

    query = gen_query(concept_names, evidence_names, targets, evidences,
                      relations)
    trace.add_row(["Query", query])

    print(trace)
    return query

In [41]:
# Sample question used to exercise the whole pipeline.
text = "nhà hoặc căn hộ giá khoảng 2 tỷ thì mua ở quận nào"

# Build the query for it; gen_query_ontology also prints its step-by-step table.
cqlNodeQuery = gen_query_ontology(text)

price
area
+------------------+----------------------------------------------------------------------------------+
|       Step       |                                      Result                                      |
+------------------+----------------------------------------------------------------------------------+
|      Input       |                nhà hoặc căn hộ giá khoảng 2 tỷ thì mua ở quận nào                |
|   Match alias    | {'real_estate_type': 'căn hộ', 'transaction': 'mua', 'price': 'giá', 'district': |
|                  |                                  'ở quận nào'}                                   |
| Find individuals |     {'real_estate_type': ['nha', 'can ho'], 'price': [(2000000000.0, None)],     |
|                  |                             'transaction': ['mua']}                              |
| Target concepts  |                                   ['district']                                   |
|      Query       |                                 

In [22]:
# Connect neo4j DB.
# Connection settings are read from the environment (which load_dotenv() fills
# from a .env file when one exists) so real credentials need not be committed
# with the notebook. The fallbacks reproduce the previous hard-coded local
# setup and are only suitable for a local development database.
driver = GraphDatabase.driver(
    os.getenv("NEO4J_URI", default="bolt://localhost:7687"),
    auth=(os.getenv("NEO4J_USER", default="neo4j"),
          os.getenv("NEO4J_PASSWORD", default="password")))


In [23]:
def run_query(query, database="htdb"):
    """Run a query on the module-level ``driver`` and tabulate the records.

    Args:
        query (str): query text passed to ``session.run``.
        database (str): database the session is opened on. Defaults to
            ``"htdb"``, the value that used to be hard-coded here.

    Returns:
        prettytable.PrettyTable: one column per result key and one row per
        record, with ``max_width`` set to 60.
    """
    with driver.session(database=database) as session:
        results = session.run(query)

        table_results = prettytable.PrettyTable(results.keys())
        table_results.max_width = 60
        # Iterate inside the `with` block: the records are read from the
        # result while the session is still open.
        for record in results:
            table_results.add_row(record.values())

        return table_results


In [24]:
# Execute the generated query and show the table that run_query builds.
result_data = run_query(cqlNodeQuery)

print(result_data)

+----------------------------------------------------+
|                      District                      |
+----------------------------------------------------+
| ['hoang mai', '5 tu liem', 'long bien', 'kien an'] |
+----------------------------------------------------+


In [26]:
# Load data/question_dict.json; the `with` block closes the file on exit, so
# no explicit close() call is needed.
with open('data/question_dict.json', 'r', encoding="utf-8") as fp:
    question_dict = json.load(fp)

# One module-level constant per entry of question_dict.
# NOTE(review): assumes the JSON root is an object containing every key below;
# a missing key fails at this point with KeyError.
REAL_ESTATE_TYPE = question_dict["real_estate_type"]
PRICE = question_dict["price"]
AREA = question_dict["area"]
DISTRICT = question_dict["district"]
BED_ROOM = question_dict["bed_room"]
FLOOR = question_dict["floor"]
LEGAL = question_dict["legal"]
POSITION = question_dict["position"]
LOCATION = question_dict["location"]
POTENTIAL = question_dict["potential"]