# AMR graphs => Knowledge Graphs

In [52]:
import re
import amrlib
import spacy
import warnings
from igraph import Graph
import requests
import penman
import logging
import time
from nltk.corpus import propbank

In [None]:
# One-time model setup for the notebook.
# Register amrlib's spaCy extension, then load the sentence-to-graph model;
# `stog` is the parser that `sentence_to_kg` calls via `stog.parse_sents`.
amrlib.setup_spacy_extension()
stog = amrlib.load_stog_model()
# spaCy English pipeline. NOTE(review): `nlp` is not referenced by any other
# cell visible in this notebook export — confirm it is still needed.
nlp = spacy.load("en_core_web_sm")

In [15]:
# Demo: decode a hand-written AMR graph (PENMAN notation) into triples.
# The second ":ARG1 b" re-uses the variable `b` introduced for "boy".
s = """
(w / want-01
   :ARG0 (b / boy)
   :ARG1 (b2 / believe-01
             :ARG0 (g / girl)
             :ARG1 b))
"""
# `.triples` yields (source, role, target) tuples, including the ':instance'
# triples that bind each variable to its concept (see the output below).
triples = penman.decode(s).triples
print(*triples, sep="\n")

('w', ':instance', 'want-01')
('w', ':ARG0', 'b')
('b', ':instance', 'boy')
('w', ':ARG1', 'b2')
('b2', ':instance', 'believe-01')
('b2', ':ARG0', 'g')
('g', ':instance', 'girl')
('b2', ':ARG1', 'b')


In [16]:
# Demo: list the argument roles PropBank defines for the roleset 'turn.01'.
print("Roles for 'turn.01':")
rs = propbank.roleset('turn.01')
# Each <role> element carries the argument number/letter in 'n' and a
# human-readable description in 'descr'.
for r in rs.findall("roles/role"):
    print(f"ARG{r.attrib['n']}:", r.attrib['descr'])

Roles for 'turn.01':
ARG0: turner
ARG1: thing turning
ARGm: direction, location


In [12]:
def run_query(query, max_attempts=3, retry_delay=3, timeout=30):
    """
    Run a SPARQL query against the Wikidata Query Service
    (https://www.wikidata.org/wiki/Wikidata:Data_access#Wikidata_Query_Service).

    Any failure (network error, timeout, non-JSON body such as a rate-limit
    page, or an unexpected JSON shape) is logged and retried.

    :param query: SPARQL query string
    :param max_attempts: total number of attempts before giving up (default: 3)
    :param retry_delay: seconds to wait after a failed attempt (default: 3)
    :param timeout: per-request timeout in seconds, so a stalled connection
        cannot block the caller indefinitely (default: 30)
    :return: the list stored under ``results.bindings`` in the JSON response,
        or None if every attempt failed
    """
    url = "https://query.wikidata.org/sparql"
    headers = {"Accept": "application/json"}

    for attempt in range(1, max_attempts + 1):
        # Reset per attempt so a failure is never logged together with the
        # payload of an earlier attempt.
        data = ""
        try:
            response = requests.get(
                url,
                headers=headers,
                params={"query": query},
                timeout=timeout,
            )
            data = response.json()
            return data["results"]["bindings"]

        # Deliberately broad: callers rely on the "None on failure" contract.
        except Exception as e:
            logging.error(f"Attempt {attempt} failed: {e}\ndata: {data}")
            if attempt == max_attempts:
                logging.error(f"Querying failed! {e}")
                return None
            # In case of a rate limit error, wait before trying again
            time.sleep(retry_delay)

    # Only reached when max_attempts < 1
    return None

In [13]:
def find_q_id(entity_name):
    """
    Find the Wikidata ID of an entity by its (lower-cased) English label.

    :param entity_name: label to look up; it is lower-cased before matching
    :return: the last path segment of the first matching item URI
        (e.g. 'Q1140878'), or None if nothing matched or the query failed
    """
    # Escape characters that would otherwise terminate or corrupt the SPARQL
    # string literal the label is embedded in (backslash first, so the
    # escapes added afterwards are not escaped again).
    label = (
        entity_name.lower()
        .replace("\\", "\\\\")
        .replace('"', '\\"')
        .replace("\n", "\\n")
        .replace("\r", "\\r")
    )
    query = f"""
    SELECT ?item WHERE {{
      ?item rdfs:label "{label}"@en .
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }}
    }}
    """
    print("Querying for entity:", entity_name)
    results = run_query(query)
    if not results:
        # Covers both "no match" (empty list) and "query failed" (None)
        return None
    return results[0]["item"]["value"].split("/")[-1]

In [14]:
# Demo: look up the Wikidata ID for the term "dry weight" (result shown below).
find_q_id("dry weight")

Querying for entity: dry weight


'Q1140878'

In [15]:
def query_conceptnet(word, lang="en", timeout=10) -> dict:
    """
    Query ConceptNet for a given word and language.

    Spaces in the word are replaced with underscores and the word is
    lower-cased before it is placed in the request path.

    :param word: word to query
    :param lang: language code (default: "en")
    :param timeout: request timeout in seconds, so an unresponsive API cannot
        block the caller indefinitely (default: 10)
    :return: decoded JSON response from the ConceptNet API
    """
    term = word.replace(' ', '_').lower()
    url = f"https://api.conceptnet.io/c/{lang}/{term}"
    return requests.get(url, timeout=timeout).json()

In [16]:
def in_conceptnet(response) -> bool:
    """
    Tell whether a ConceptNet response describes a known word.

    :param response: decoded ConceptNet API response
    :return: True if the response has an "edges" entry with at least one edge
    """
    if "edges" not in response:
        return False
    edge_count = len(response["edges"])
    return edge_count > 0

In [34]:
# Demo: raw ConceptNet response for the word "boy" (truncated output below).
query_conceptnet("boy")

{'@context': ['http://api.conceptnet.io/ld/conceptnet5.7/context.ld.json'],
 '@id': '/c/en/boy',
 'edges': [{'@id': '/a/[/r/RelatedTo/,/c/en/boy/,/c/en/male/]',
   '@type': 'Edge',
   'dataset': '/d/verbosity',
   'end': {'@id': '/c/en/male',
    '@type': 'Node',
    'label': 'male',
    'language': 'en',
    'term': '/c/en/male'},
   'license': 'cc:by/4.0',
   'rel': {'@id': '/r/RelatedTo', '@type': 'Relation', 'label': 'RelatedTo'},
   'sources': [{'@id': '/and/[/s/process/split_words/,/s/resource/verbosity/]',
     '@type': 'Source',
     'contributor': '/s/resource/verbosity',
     'process': '/s/process/split_words'},
    {'@id': '/s/resource/verbosity',
     '@type': 'Source',
     'contributor': '/s/resource/verbosity'}],
   'start': {'@id': '/c/en/boy',
    '@type': 'Node',
    'label': 'boy',
    'language': 'en',
    'term': '/c/en/boy'},
   'surfaceText': '[[boy]] is related to [[male]]',
   'weight': 6.479814812168632},
  {'@id': '/a/[/r/RelatedTo/,/c/en/boy/,/c/en/child/]'

In [17]:
def is_concept(word: str) -> bool:
    """
    Tell whether a word looks like a sense-tagged concept, i.e. it ends with
    a dash followed by one or more digits (e.g. "want-01").
    """
    sense_suffix = re.search(r"-\d+$", word)
    return sense_suffix is not None

In [18]:
def is_repeated_var(word: str) -> bool:
    """
    Tell whether a word is a repeated variable, i.e. it ends with a letter
    followed by a single disambiguating digit (e.g. "b2").
    """
    if re.search(r"[-a-zA-Z]*[a-zA-Z]\d$", word) is None:
        return False
    return True

In [19]:
def is_decimal_fraction(word: str) -> bool:
    """
    Tell whether a word consists of two digit groups joined by a single
    comma, slash or dot (e.g. "3.14", "1,5", "1/2").
    """
    matched = re.match(r"^\d+[,/\.]\d+$", word)
    return matched is not None

In [22]:
def get_args(concept: str) -> dict:
    """
    Look up the argument roles PropBank defines for a concept.

    The concept is turned into a roleset id by replacing every hyphen with a
    dot, so "play-01" becomes "play.01" (and a multi-word concept such as
    "have-rel-role-91" becomes "have.rel.role.91").

    :param concept: sense-tagged concept, e.g. "play-01"
    :return: mapping such as {"ARG0": "player", ...} with spaces in the role
        descriptions replaced by underscores; an empty dict (plus a warning)
        when the lookup raises ValueError
    """
    roleset_id = concept.replace("-", ".")
    print("Looking up roles for concept:", roleset_id)
    try:
        roleset = propbank.roleset(roleset_id)
        roles = {}
        for role in roleset.findall("roles/role"):
            description = role.attrib['descr'].replace(' ', '_')
            roles[f"ARG{role.attrib['n']}"] = description
        return roles
    except ValueError as e:
        warnings.warn(f"PropBank lookup failed: {e}")
        return {}

In [23]:
def postprocess_penman(triples: list[tuple]) -> tuple[list[tuple], list[tuple]]:
    """
    Postprocess AMR triples to convert them to a knowledge graph.

    Steps:
      1. Map every variable to its concept (from the ':instance' triples),
         appending the variable's trailing digit to non-PropBank concepts that
         occur more than once (e.g. 'person2', 'person4').
      2. Prefix concepts with 'pb:' (sense-tagged, PropBank style) or 'cn:'
         (found in ConceptNet).
      3. Collapse the ':opN' parts of a name node into one 'amr:name' triple.
      4. Try to recognise compound terms (mod/source/poss/ARG1 pairs) on
         Wikidata and replace their parts with the term.
      5. Replace ARGn relations of PropBank concepts with their role
         descriptions and add the 'amr:' prefix to everything left unprefixed.

    :param triples: (source, role, target) triples, e.g. ``penman.decode(...).triples``
    :return: ``(source_triples, processed_triples)`` — two index-aligned lists;
        an empty tuple on either side means "no counterpart for this row"
    """
    from collections import Counter  # local import: keeps this cell self-contained

    source_triples = []
    processed_triples = []
    name_triple = ()
    name = ""
    # Index in processed_triples of the placeholder left by the latest name
    # part; None while no name is being collected.
    name_slot = None

    # Map variables to concepts (from :instance relations)
    concept_map = {t[0]: t[2] for t in triples if t[1].strip(":") == "instance"}
    # Variables of name nodes. Name parts are recognised by this rather than
    # by the variable's spelling, so the ':opN' edges of any other node whose
    # variable happens to start with "n" are not swallowed into a name.
    name_vars = {var for var, concept in concept_map.items() if concept == "name"}
    # Disambiguate repeated variables if they are not concepts and appear multiple times
    concept_counts = Counter(concept_map.values())
    concept_map = {
        k: (v + k[-1]) if is_repeated_var(k) and not is_concept(v) and concept_counts[v] > 1 else v
        for k, v in concept_map.items()
    }
    # Get argument roles for concepts from PropBank
    concept_args = {word: get_args(word) for word in set(concept_map.values()) if is_concept(word)}
    # Add pb (PropBank) or cn (ConceptNet) prefix to concepts
    for k, v in concept_map.items():
        if is_concept(v):
            concept_map[k] = f"pb:{v}"
        elif in_conceptnet(query_conceptnet(v)):
            concept_map[k] = f"cn:{v}"

    term_map = {}
    for triple in triples:
        subject, relation, obj = triple
        is_name_part = subject in name_vars and relation.startswith(":op")

        # A pending name is complete as soon as a triple that is not one of
        # its parts shows up. The joined name goes into the slot of its last
        # part, so no unrelated entry is ever removed.
        if name_slot is not None and not is_name_part:
            processed_triples[name_slot] = (*name_triple, name.strip())
            name_slot = None
            name = ""

        # Skip instance relations (the variables will be replaced with concepts)
        if relation == ":instance":
            source_triples.append(triple)
            processed_triples.append(())
            continue

        # Start name extraction
        if relation == ":name":
            subject = concept_map[subject] if subject in concept_map else subject
            subject = subject if subject.startswith("cn:") else "amr:" + subject
            name_triple = (subject, "amr:name")
            processed_triples.append(())
            source_triples.append(triple)
            continue
        # Continue name extraction
        if is_name_part:
            name += " " + obj.strip('"')
            name_slot = len(processed_triples)
            processed_triples.append(())
            source_triples.append(triple)
            continue

        # Replace variables with concepts
        if subject in concept_map:
            subject = concept_map[subject]
        if obj in concept_map:
            obj = concept_map[obj]

        # If the object is a literal, like part of a name
        if '"' in obj:
            if not relation.startswith(":op"):
                warnings.warn("Literal object without :op relation: " + str(triple))
            relation = "rdfs:label"
            obj = obj.strip('"')
        else:
            relation = "amr" + relation

        # Identify compound nouns / terms
        if not any([obj.endswith("-"), is_concept(subject), is_concept(obj), obj.isnumeric(), is_decimal_fraction(obj)]):
            # Values without trailing letters (e.g. a clock time "16:30")
            # cannot be part of a term; skip them instead of failing on None.
            subj_match = re.search(r"([a-zA-Z]+)\d?$", subject)
            obj_match = re.search(r"([a-zA-Z]+)\d?$", obj)
            possible_term = ""
            if subj_match and obj_match:
                # NOTE(review): the whole match keeps a disambiguating digit
                # ("person2"); group 1 would drop it — confirm which is intended.
                clean_subj = subj_match[0]
                clean_obj = obj_match[0]
                if relation in ["amr:mod", "amr:source"]:
                    possible_term = f"{clean_obj} {clean_subj}"
                if relation in ["amr:poss"]:
                    possible_term = f"{clean_subj} of {clean_obj}"
                if relation.endswith("ARG1"):
                    possible_term = f"{clean_subj} {clean_obj}"
                    print("ARG1 relation, possible term:", possible_term, "(check the necessity)")
            if possible_term:
                q_id = find_q_id(possible_term)
                if q_id:
                    print(f"Found Q-id for term '{possible_term}': {q_id}")
                    # Avoid mapping concepts from PropBank (pb:) or ConceptNet (cn:)
                    print("Subjects and objects:", subject, obj)
                    # Query ConceptNet once and reuse the answer for both the
                    # debug print and the branch below.
                    term_in_conceptnet = in_conceptnet(query_conceptnet(possible_term))
                    print("ConceptNet check: ", possible_term, term_in_conceptnet)
                    if term_in_conceptnet:
                        composite_term = f"cn:{possible_term.replace(' ', '_')}"
                        source_triples.append(())  # Add empty triple for ConceptNet concepts
                        # TODO: fix if not Q-id but P-id
                        # Add equivalence relation as an extra triple
                        processed_triples.append((composite_term, "owl:sameAs", f"wd:{q_id}"))
                        term_map[subject] = composite_term
                        term_map[obj] = composite_term
                    else:
                        subject = subject if subject.startswith("cn:") else f"amr:{subject}"
                        obj = obj if obj.startswith("cn:") else f"amr:{obj}"
                        term_map[subject] = f"wd:{q_id}"
                        term_map[obj] = f"wd:{q_id}"

                    source_triples.append(())
                    processed_triples.append((f"wd:{q_id}", "rdfs:label", possible_term))
                    continue

        # Map ARGs to their roles if the subject is a concept from PropBank
        if is_concept(subject) and "ARG" in relation:
            args = concept_args[subject[3:]]  # Remove pb: prefix
            role = relation[4:]  # Remove amr: prefix
            if role not in args:  # Also for empty args in case of missing PropBank entry
                warnings.warn(f"Argument {relation} not found for concept {subject}")
            else:
                descriptor = args[role].lower()
                if "," in descriptor:
                    # TODO: handle multiple roles properly: secondary_attribute_or_described-as
                    descriptor = descriptor.replace(",", "_or")
                    # Record the case in the module-level COMMAS_IN_DESCR list,
                    # creating the list first if no cell has defined it yet.
                    globals().setdefault("COMMAS_IN_DESCR", []).append(
                        (subject, f"=>{descriptor}<=", obj)
                    )
                relation = f"pb:{descriptor}"

        # Add the amr prefix
        if ":" not in subject:
            subject = "amr:" + subject
        if ":" not in obj and not obj.isnumeric() and not is_decimal_fraction(obj):
            obj = "amr:" + obj

        source_triples.append(triple)
        processed_triples.append((subject, relation, obj))

    if name_slot is not None:
        # In case the last triple was a name part, finalize it
        processed_triples[name_slot] = (*name_triple, name.strip())

    if term_map:
        print("Identified terms to replace:", term_map)
        # Replace terms in the triples with their identified concepts
        for i, processed in enumerate(processed_triples):
            if not processed:
                continue
            subject, relation, obj = processed
            if obj in term_map:
                obj = term_map[obj]
            if subject in term_map:
                subject = term_map[subject]
            processed_triples[i] = (subject, relation, obj)
    return source_triples, processed_triples


In [24]:
def create_graph(triples: list[tuple]):
    """
    Creates a directed graph from given triples.

    Empty tuples (placeholders for triples that were dropped or merged) are
    ignored. Every distinct subject and object becomes a vertex and every
    triple an edge whose ``label`` attribute holds the predicate.

    :param triples: preprocessed (subject, predicate, object) triples
    :return: an igraph instance
    """
    g = Graph(directed=True)
    edges = [t for t in triples if t]  # Remove empty tuples
    # dict.fromkeys de-duplicates while keeping first-appearance order, so
    # vertex ids are the same on every run (iterating a set of strings is
    # subject to hash randomization).
    nodes = dict.fromkeys(node for subj, _, obj in edges for node in (subj, obj))
    g.add_vertices(list(nodes))
    for subj, pred, obj in edges:
        g.add_edge(subj, obj, label=pred)
    return g

In [25]:
def sentence_to_kg(sentence: str, verbose: bool = True):
    """
    Parse a sentence into AMR triples and try to turn them into a valid
    knowledge graph (work in progress).

    :param sentence: sentence to convert (surrounding whitespace is stripped)
    :param verbose: when True, print the parser's first output line, the AMR
        graph and every source triple next to its processed counterpart
    :return: the graph built by ``create_graph`` from the processed triples
    """
    parsed = stog.parse_sents([sentence.strip()])[0]
    # The first line of the parser output is printed as the sentence; the
    # remainder is decoded as the graph.
    header, graph = parsed.split("\n", 1)
    if verbose:
        print("Sentence:", header)
        print("Graph:\n", graph)

    decoded = penman.decode(graph)
    source_triples, processed_triples = postprocess_penman(decoded.triples)
    if verbose:
        print("Triples:")
        for original, converted in zip(source_triples, processed_triples):
            print(original, "=>", converted)

    return create_graph(processed_triples)

In [26]:
# Test sentences for the AMR -> knowledge-graph conversion.
# Trailing comments record known issues or TODOs for a sentence; a bare number
# in a comment is the sentence's 1-based position in this list.
# NOTE(review): the meaning of the "lookup" marker is not visible here — confirm.
amr_examples = [
    "The boy desires the girl to believe him.",
    "The girl made an adjustment to the machine.",
    "It is obligatory that the boy not go.",
    "The boy does not have permission to go.",
    "The regulatory documents were changed.",
    "He described the mission as a failure.",
    "The boy would rather go.",
    "It’s impossible for the boy to go.",
    "Where did the girl find the boy?",
    "Whose toy did the girl find?",
    "I know the person you saw.",
    "Do you want tea or coffee?",
    "The man is a lawyer.",  # (l / lawyer :domain (m / man)) - weird
    "The boy destroyed the room.",
    "The boy is responsible for the results of his work.",
    "I observed the army moving quickly.",  # 16
    "I hardly know her.",  # lookup
    "He drove west, from Houston to Austin.",
    "I drove to Indianapolis on I-65.",
    "I drove through the tunnel.",
    "The soldier hummed a tune for the girl as he walked with her to town.",  # hum.01 instead of hum.02 (as in PropBank) -- problems with both concepts
    "There is no information about the case.",  # lookup 22
    "He worked for two hours.",
    "I ate pasta with a fork.",
    "She talked to him in French.",  # ('amr:language', 'amr:name', 'French')
    "The boy sang very beautifully.",  # lookup 26
    "The mayor proposed to lower crime by hiring more police officers.", # TODO: relation disambiguation
    "Nicole went to England by train.",
    "He went to the store to buy wood for a new fence.", # lookup 29
    "The boy murmured softly to soothe the girl, because he was worried about her.",
    "The game continued despite the rain.",
    "The boy will sing if he is given money.",
    "The boy will sing unless he is given money.",
    "The torpedo struck, causing damage to the ship.",
    "The boy provided chocolate to the girl.",
    "They built the bridge in Maryland in December.",
    "The engine of the car got rusty from the rain.",
    "The boy won the race in the Olympics.",
    "We met three times.",  # 39: ('pb:meet-03', 'pb:one_party', 'amr:we')
    "We play bridge every Wednesday afternoon.", # TODO: pb:rate-entity-91 (40)
    "The girl left because the boy arrived.",
    "The nation defaulted in June.",
    "The man died in his house between the field and the river.",
    "The Shanghai legal system.",
    "There was shouting, and the boy left.",  # lookup 45
    "The boy arrived and was promptly killed.",  # the causal connection is lost
    "The boy arrived and left on Tuesday.",
    "The brightest boy.",  # have.degree.91 instead of :degree
    "Nine of the twenty soldiers died.",  # ('pb:include-91', 'amr:ARG1', 'amr:soldier'), ('pb:include-91', 'amr:ARG2', 'amr:soldier2') — direction of rel, weird args
    "Four of the five survivors had the disease, including three who were diagnosed.",  # duplicated info about included patients
    "Marie Skłodowska-Curie received the Nobel Prize in 1911.",
    "During the past 30 years, 70% of the glaciers in the Alps have retreated.",
    "20 Canadian dollars",
    "The aircraft's velocity reached three times the speed of sound.",  # TODO: speed of sound is a single entity:
    # ('amr:product-of', 'amr:op1', 'amr:3') ('amr:product-of', 'amr:op2', 'amr:speed') ('amr:speed', 'amr:poss', 'amr:sound')
    "Patrick Makau finished the marathon in 2 hours, 3 minutes and 38 seconds.",
    "February 40, 2012",
    "Mary was playing chess while her sister was playing with toys.",  # relation disambiguation: ('p', ':time', 'p3') => ('pb:play-01', 'amr:time', 'pb:play-01')
]

In [18]:
# Number of example sentences; the largest valid 1-based example id.
len(amr_examples)

57

In [21]:
# Running a specific example
lst = amr_examples
check_id = 57

print("Example", check_id)
kg = sentence_to_kg(lst[check_id-1], verbose=True)
print(kg)
print("Vertices:", kg.vcount(), "Edges:", kg.ecount())

Example 57
Sentence: # ::snt Mary was playing chess while her sister was playing with toys.
Graph:
 (p / play-01
      :ARG0 (p2 / person
            :name (n / name
                  :op1 "Mary"))
      :ARG1 (c / chess)
      :time (p3 / play-01
            :ARG0 (p4 / person
                  :ARG0-of (h / have-rel-role-91
                        :ARG1 p2
                        :ARG2 (s / sister)))
            :ARG2 (t / toy)))
Looking up roles for concept: play.01
Looking up roles for concept: have.rel.role.91




Triples:
('p', ':instance', 'play-01') => ()
('p', ':ARG0', 'p2') => ('pb:play-01', 'pb:player', 'amr:person2')
('p2', ':instance', 'person') => ()
('p2', ':name', 'n') => ()
('n', ':instance', 'name') => ()
('n', ':op1', '"Mary"') => ('amr:person2', 'amr:name', 'Mary')
('p', ':ARG1', 'c') => ('pb:play-01', 'pb:game/music', 'cn:chess')
('c', ':instance', 'chess') => ()
('p', ':time', 'p3') => ('pb:play-01', 'amr:time', 'pb:play-01')
('p3', ':instance', 'play-01') => ()
('p3', ':ARG0', 'p4') => ('pb:play-01', 'pb:player', 'amr:person4')
('p4', ':instance', 'person') => ()
('h', ':ARG0', 'p4') => ('pb:have-rel-role-91', 'amr:ARG0', 'amr:person4')
('h', ':instance', 'have-rel-role-91') => ()
('h', ':ARG1', 'p2') => ('pb:have-rel-role-91', 'amr:ARG1', 'amr:person2')
('h', ':ARG2', 's') => ('pb:have-rel-role-91', 'amr:ARG2', 'cn:sister')
('s', ':instance', 'sister') => ()
('p3', ':ARG2', 't') => ('pb:play-01', 'pb:instrument', 'cn:toy')
('t', ':instance', 'toy') => ()
IGRAPH DN-- 8 9 --
+ a

