In [1]:
import json
import os
import requests
import prettytable
from neo4j import GraphDatabase
from dotenv import load_dotenv
import util
import constants
from data_normalizer import normalize

load_dotenv()




False

In [2]:
# Connect to the Neo4j database.
# Connection settings are read from the environment (populated by load_dotenv()
# in the import cell) so credentials do not have to live in the notebook; the
# previous hard-coded values remain as fallbacks for local development.
driver = GraphDatabase.driver(
    os.getenv("NEO4J_URI", "bolt://localhost:7687"),
    auth=(
        os.getenv("NEO4J_USER", "neo4j"),
        os.getenv("NEO4J_PASSWORD", "password"),
    ),
)


In [3]:
NER_MODEL_BERT = "phobert_large"

# Call the NER service to get the entities contained in the user input text.
def extract_ner(text, model=NER_MODEL_BERT, timeout=30):
    """
    POST the text to the NER service and return the decoded JSON response.

    Input Arguments:
        - text    : the sentence from which named entities are extracted
        - model   : model name sent to the service (defaults to NER_MODEL_BERT)
        - timeout : seconds to wait for the service before giving up

    Returns:
        The JSON-decoded body of the service response.

    Raises:
        requests.exceptions.RequestException: on connection failure, timeout,
            or a 4xx/5xx response.
    """
    ner_service_url = os.getenv("NER_SERVICE_URL", default="http://localhost:8001/api/v1/ner")

    data = {'model': model, 'text': text}

    # NOTE(review): 'Accept' asks for text/plain although the body is parsed as
    # JSON below -- kept unchanged; confirm what the service expects.
    headers = {'Content-type': 'application/json', 'Accept': 'text/plain'}

    # Without a timeout, requests waits indefinitely on an unresponsive service.
    r = requests.post(ner_service_url, data=json.dumps(data), headers=headers,
                      timeout=timeout)

    # Fail loudly on HTTP errors instead of trying to parse an error page.
    r.raise_for_status()

    return r.json()

In [4]:
def ner_postprocess(entities_raw):
    """Group NER results by label, in both raw and normalized form.

    Items whose label is 'O' are skipped. For every other item the raw
    ``content`` and the result of ``normalize(content, label)`` are appended
    to per-label lists, in input order.

    Args:
        entities_raw (Iterable[Dict]): result of the NER service; each item
            is read through its 'label' and 'content' keys.

    Returns:
        Dict, Dict: label -> list of raw values, label -> list of normalized
        values.
    """
    raw_by_label = {}
    normed_by_label = {}

    for item in entities_raw:
        label = item['label']

        # 'O' marks text that is not part of any entity.
        if label != 'O':
            content = item['content']
            raw_by_label.setdefault(label, []).append(content)
            normed_by_label.setdefault(label, []).append(
                normalize(content, label))

    return raw_by_label, normed_by_label

In [5]:
# Extract and normalize the entities of a text.
def handle_get_entities(text):
    """
    Run the NER service on the text and keep only the normalized entities.

    :param text: The text to be processed
    :return: The second element returned by ner_postprocess, i.e. the mapping
        from entity label to the list of normalized values
    """
    normalized_entities = ner_postprocess(extract_ner(text))[1]
    return normalized_entities

In [6]:
# Identify ontological concepts.
def get_concepts(text, alias_path='data/intent_alias_data.json'):
    """
    Find which concepts are mentioned in the text, based on predefined aliases.

    The alias file is a JSON object mapping each concept to a list of alias
    strings. For every concept, the longest alias that occurs as a substring
    of the text is recorded; concepts with no matching alias are omitted.

    :param text: The text to be analyzed
    :param alias_path: Path of the JSON alias file (defaults to the project's
        data/intent_alias_data.json)
    :return: A dictionary mapping each detected concept to the alias that
        matched.
    """
    with open(alias_path, encoding="utf8") as f:
        intent_alias_data = json.load(f)

    out = {}
    for concept, aliases in intent_alias_data.items():
        # Try longer aliases first so the most specific wording wins.
        for alias in sorted(aliases, key=len, reverse=True):
            if alias in text:
                out[concept] = alias
                break
    return out

In [7]:
# Remove the labels whose respective individuals are found in the natural query.
def recheck_returned_labels(matched_labels, returned_labels, attr):
    """
    Drop from returned_labels every matched label that mentions attr.

    A label is dropped when attr occurs as a substring of it. The list is
    modified in place and also returned.

    Argument:
        matched_labels: The labels which were matched.
        returned_labels: The labels still to be returned (mutated in place).
        attr: The attribute that now has a condition.
    Return:
        returned_labels.
    """
    for match_label in matched_labels:
        # The membership check makes the call idempotent: the same attribute
        # can be rechecked more than once (e.g. several values for one label)
        # and list.remove would raise ValueError on the second pass.
        if attr in match_label and match_label in returned_labels:
            returned_labels.remove(match_label)

    return returned_labels

In [8]:
# Build the WHERE conditions for the common labels.
def condition_common(evidences, matched_labels, returned_labels, conditions):
    """
    Add conditions for the common labels: real estate type, position,
    direction, floor, rooms, surrounding, project name, legal, transaction...

    For every common label present in evidences, the label is removed from
    returned_labels (via recheck_returned_labels) and a condition on its
    `.individual` property is appended to conditions:
      - floor: one equality per evidence item, on the column selected by the
        item's 'type' key, compared with the item's 'value';
      - real estate type: an IN test against the whole evidence list;
      - any other label: an equality with the first evidence value.

    Return:
          [returned_labels, conditions]
    """
    common_labels = (
        constants.LABEL_REAL_ESTATE_TYPE,
        constants.LABEL_REAL_ESTATE_SUB_TYPE,
        constants.LABEL_POSITION,
        constants.LABEL_DIRECTION,
        constants.LABEL_FRONT_LENGTH,
        constants.LABEL_ROAD_WIDTH,
        constants.LABEL_FLOOR,
        constants.LABEL_BED_ROOM,
        constants.LABEL_LIVING_ROOM,
        constants.LABEL_BATH_ROOM,
        constants.LABEL_SURROUNDING,
        constants.LABEL_PROJECT_NAME,
        constants.LABEL_LEGAL,
        constants.LABEL_TRANSACTION,
    )

    # Floor evidence 'type' -> label of the column that stores it.
    floor_columns = dict([
        ('tang', constants.LABEL_FLOOR),
        ('ban cong', constants.LABEL_FLOOR_BAN_CONG),
        ('gac', constants.LABEL_FLOOR_GAC),
        ('ham', constants.LABEL_FLOOR_HAM),
        ('lung', constants.LABEL_FLOOR_LUNG),
        ('san thuong', constants.LABEL_FLOOR_SAN_THUONG),
        ('tret', constants.LABEL_FLOOR_TRET),
    ])

    for label in common_labels:
        if label not in evidences:
            continue

        if label == constants.LABEL_FLOOR:
            # Each floor evidence targets its own column.
            for floor in evidences[label]:
                column = floor_columns[floor['type']]
                floor_value = floor['value']

                returned_labels = recheck_returned_labels(
                    matched_labels, returned_labels, column)

                conditions.append(f"{column}.individual = '{floor_value}'")
            continue

        returned_labels = recheck_returned_labels(
            matched_labels, returned_labels, label)

        if label == constants.LABEL_REAL_ESTATE_TYPE:
            # Any of the detected real estate types is acceptable.
            clause = f"{label}.individual IN {evidences[label]}"
        else:
            # Only the first detected value is used for the other labels.
            clause = f"{label}.individual = '{evidences[label][0]}'"
        conditions.append(clause)

    return [returned_labels, conditions]


In [9]:
def condition_location(evidences, matched_labels, returned_labels, conditions):
    """
    Add conditions for the location labels: district, city, ward, street.

    For every location label present in evidences, the label is removed from
    returned_labels (via recheck_returned_labels) and an equality on its
    `.individual` property with the first evidence value is appended to
    conditions.

    Return:
          [returned_labels, conditions]
    """
    location_labels = (
        constants.LABEL_DISTRICT,
        constants.LABEL_CITY,
        constants.LABEL_WARD,
        constants.LABEL_STREET,
    )

    for label in location_labels:
        if label not in evidences:
            continue

        returned_labels = recheck_returned_labels(
            matched_labels, returned_labels, label)

        first_value = evidences[label][0]
        conditions.append(f"{label}.individual = '{first_value}'")

    return [returned_labels, conditions]


In [10]:
def condition_price_n_area(is_price, evidences, matched_labels,
                           returned_labels, conditions):
    """
    Add a range condition for price (is_price truthy) or area (otherwise).

    Only the first evidence item is used. It is unpacked as (low, high); when
    high is None the single value is widened into a +/- 10% interval around
    it. The label is removed from returned_labels (via
    recheck_returned_labels) and a `low <= x AND x <= high` condition on the
    label's `.individual` property is appended to conditions.

    Return:
          [returned_labels, conditions]
    """

    OFFSET_CONST = 0.1  # relative tolerance applied around a single value

    # A real conditional expression: the old `a and b or c` form silently
    # picks the area label whenever the price label is falsy.
    condition_item = constants.LABEL_PRICE if is_price else constants.LABEL_AREA

    if condition_item in evidences:
        for ele in evidences[condition_item][:1]:
            low, high = ele

            if high is None:
                # Compute high first: it must use the original low.
                high = low + low * OFFSET_CONST
                low = low - low * OFFSET_CONST

            returned_labels = recheck_returned_labels(matched_labels,
                                                      returned_labels,
                                                      condition_item)

            # Both bounds must hold; joined with OR the predicate would accept
            # practically every value.
            # NOTE(review): bounds are emitted as quoted strings, as before --
            # confirm that `.individual` is stored as a string in the graph.
            conditions.append(
                f"('{low}' <= {condition_item}.individual AND {condition_item}.individual <= '{high}')"
            )

    return [returned_labels, conditions]


In [11]:
def condition_usage(evidences, matched_labels, returned_labels, conditions):
    """
    Add a condition for the usage label.

    The stored `.individual` value is treated as a ", "-separated list. For
    each usage evidence a predicate is built that is true when the evidence
    is the whole value, the first item, a middle item or the last item of
    that list. The predicates are OR-ed together inside one parenthesized
    condition, and the usage label is removed from returned_labels (via
    recheck_returned_labels).

    Return:
          [returned_labels, conditions]
    """
    usage = constants.LABEL_USAGE

    if usage in evidences:
        returned_labels = recheck_returned_labels(matched_labels,
                                                  returned_labels,
                                                  usage)

        # Cypher has no LIKE operator; STARTS WITH / CONTAINS / ENDS WITH are
        # the equivalents of the 'x, %', '%, x,%' and '%, x' patterns. The
        # equality covers a value that consists of this single usage only.
        clauses = [
            f"{usage}.individual = '{x}'"
            f" OR {usage}.individual STARTS WITH '{x}, '"
            f" OR {usage}.individual CONTAINS ', {x},'"
            f" OR {usage}.individual ENDS WITH ', {x}'"
            for x in evidences[usage]
        ]

        # An empty evidence list would otherwise produce the invalid "()".
        if clauses:
            conditions.append("({})".format(" OR ".join(clauses)))

    return [returned_labels, conditions]


In [12]:
def handle_gen_query(concepts_keys, evidences, match_relation):
    """
    Build the Cypher query (MATCH / WHERE / RETURN / LIMIT 50) for a question.

    :param concepts_keys: iterable of concept names; each becomes a matched
        label of the form "(name:Name)". When empty, the single label 'n' is
        used instead.
    :param evidences: mapping from label to the values found in the question;
        used by the condition_* helpers to build the WHERE conditions and to
        drop the corresponding labels from the RETURN clause.
    :param match_relation: iterable of pattern strings, joined with commas to
        form the MATCH clause.
    :return: The query text. The WHERE clause is omitted when no condition
        was produced.
    """
    out = """
    MATCH {}{}
    RETURN {}
    LIMIT 50
        """
    conditions = []

    # Define matched labels. Format: (alias:Node_name)
    if not concepts_keys:
        matched_labels = ['n']
    else:
        matched_labels = [
            '(' + concept_key + ':' + concept_key.title() + ')'
            for concept_key in concepts_keys
        ]

    # Every matched label is returned unless a condition is found for it.
    returned_labels = matched_labels.copy()

    # ----- Define condition labels. -----
    # Condition common.
    returned_labels, conditions = condition_common(
        evidences, matched_labels, returned_labels, conditions)

    # Condition for location.
    returned_labels, conditions = condition_location(
        evidences, matched_labels, returned_labels, conditions)

    # Condition for price.
    returned_labels, conditions = condition_price_n_area(
        True, evidences, matched_labels, returned_labels, conditions)

    # Condition for area.
    returned_labels, conditions = condition_price_n_area(
        False, evidences, matched_labels, returned_labels, conditions)

    # Condition for usage.
    returned_labels, conditions = condition_usage(
        evidences, matched_labels, returned_labels, conditions)

    # Adjust returned labels.
    # NOTE(review): if every matched label received a condition this list is
    # empty and the RETURN clause comes out empty -- behaviour unchanged here.
    adjusted_return_labels = []

    for return_label in returned_labels:
        if '(' not in return_label or ':' not in return_label:
            # Bare alias such as the 'n' fallback: there is no "(alias:Node)"
            # structure to parse (it used to raise IndexError), return as is.
            adjusted_return_labels.append(return_label)
            continue

        # Real estate (sub) types are reported under the house node name.
        if (constants.LABEL_REAL_ESTATE_TYPE).title() in return_label:
            return_label = return_label.replace(
                (constants.LABEL_REAL_ESTATE_TYPE).title(),
                (constants.LABEL_HOUSE).title())
        elif (constants.LABEL_REAL_ESTATE_SUB_TYPE).title() in return_label:
            return_label = return_label.replace(
                (constants.LABEL_REAL_ESTATE_SUB_TYPE).title(),
                (constants.LABEL_HOUSE).title())

        # Create a Return clause for the label to be queried.
        return_label = f"collect(distinct {return_label.split('(')[1].split(':')[0]}.individual) AS {(return_label.split(':')[1].split(')')[0]).title()}"

        adjusted_return_labels.append(return_label)

    # An empty condition list must not leave a dangling "WHERE" behind.
    where_clause = ""
    if conditions:
        where_clause = "\n    WHERE " + ' \nAND '.join(
            [f"{x}" for x in conditions])

    # Create a complete query to query in Neo4j.
    return out.format(
        ',\n'.join(match_relation),
        where_clause,
        ',\n '.join(adjusted_return_labels),
    )


In [13]:
# Gen query ontology.
def gen_query_ontology(text):
    """
    Turn a natural-language question into a Cypher query and print the steps.

    Steps: match concept aliases in the text, derive the MATCH relations
    between the concepts, extract the entities through the NER service,
    re-key some entity labels, then generate the query. A table with the
    input, the intermediate results and the query is printed.

    :param text: The question to translate
    :return: The generated query text
    """
    LABEL_POTENTIAL = "potential"

    # Create table to show data.
    table = prettytable.PrettyTable(["Step", "Result"])
    table.max_width = 80
    table.add_row(["Input", text])

    # Get concepts.
    concepts = get_concepts(text)

    # Identify relationships between concepts.
    match_relation = util.get_match_relation(concepts.keys())

    # Get entities by NER service.
    evidences = handle_get_entities(text)

    # The labels below are tested with exact key membership: a substring test
    # over the keys could succeed while the exact key is absent, and the
    # lookup that follows would then raise KeyError.

    # Remove label house number.
    # NOTE(review): the house-number values are stored under the real estate
    # type label, overwriting any existing values -- confirm this is intended.
    if constants.LABEL_HOUSE_NUMBER in evidences:
        evidences[constants.LABEL_REAL_ESTATE_TYPE] = evidences[constants.LABEL_HOUSE_NUMBER]
        del evidences[constants.LABEL_HOUSE_NUMBER]

    # Remove label real estate sub type (its values replace the real estate type).
    if constants.LABEL_REAL_ESTATE_SUB_TYPE in evidences:
        evidences[constants.LABEL_REAL_ESTATE_TYPE] = evidences[constants.LABEL_REAL_ESTATE_SUB_TYPE]
        del evidences[constants.LABEL_REAL_ESTATE_SUB_TYPE]

    # Remove label usage (its values are re-keyed under "potential").
    if constants.LABEL_USAGE in evidences:
        evidences[LABEL_POTENTIAL] = evidences[constants.LABEL_USAGE]
        del evidences[constants.LABEL_USAGE]

    table.add_rows([["Match alias", concepts], ["Find individuals", evidences]])

    # Identify target concepts: concepts mentioned without a matching evidence.
    targets = list(set(concepts.keys()).difference(set(evidences.keys())))
    table.add_row(["Target concepts", targets])

    # Handle gen query.
    query = handle_gen_query(concepts.keys(), evidences, match_relation)

    table.add_row(["Query", query])
    print(table)

    return query

In [19]:
# Run query in Neo4j.
def run_query(query, database="htdb"):
    """
    Execute a query through the module-level Neo4j driver and print the rows.

    :param query: The query text to run
    :param database: Name of the database to open the session on (defaults
        to "htdb", the value that used to be hard-coded)
    :return: None; the result is printed as a table whose columns are the
        result keys
    """
    with driver.session(database=database) as session:
        results = session.run(query)

        table_results = prettytable.PrettyTable(results.keys())
        table_results.max_width = 60

        # Rows are consumed while the session is still open.
        for r in results:
            table_results.add_row(r.values())

        print(table_results)

In [18]:
# Demo: translate one sample question into a query and run it.
# The question is in Vietnamese; roughly: "for a house priced around 2 billion,
# which district should one buy in".
question = "nhà giá khoảng 2 tỷ thì mua ở quận nào"

# Build the query; this also prints the table of intermediate steps.
cqlNodeQuery = gen_query_ontology(question)

# Execute the query against Neo4j and print the result table.
run_query(cqlNodeQuery)

In [16]:
# Load the lists of sample questions from question_dict.json.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open('data/question_dict.json', 'r', encoding="utf-8") as fp:
    question_dict = json.load(fp)

# One entry of question_dict per topic.
PRICE = question_dict["price"]
AREA = question_dict["area"]
DISTRICT = question_dict["district"]
BED_ROOM = question_dict["bed_room"]
FLOOR = question_dict["floor"]
LEGAL = question_dict["legal"]
POSITION = question_dict["position"]
DIRECTION = question_dict["direction"]
FRONT_LENGTH = question_dict["front_length"]
POTENTIAL = question_dict["potential"]

In [17]:
# for question in PRICE:
#     cqlNodeQuery = gen_query_ontology(question)
#     run_query(cqlNodeQuery)
#     print("----------------##########################################----------------")
