From c15b0f577ecb6c27cf16580dcb91c7391aed866d Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Fri, 26 Feb 2021 00:15:54 +0100 Subject: [PATCH 1/8] If MentionSpan is not provided, suppose only the head is in the span Otherwise, the mention is not reachable via `head.coref_mentions` and `len(mention.words) == 0`, which is both misleading. --- udapi/core/coref.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/udapi/core/coref.py b/udapi/core/coref.py index be8f6883..1e2df00b 100644 --- a/udapi/core/coref.py +++ b/udapi/core/coref.py @@ -194,6 +194,8 @@ def load_coref_from_misc(doc): mention = CorefMention(node, cluster) if node.misc["MentionSpan" + index_str]: mention.span = node.misc["MentionSpan" + index_str] + else: + mentions.words = [node] cluster_type = node.misc["ClusterType" + index_str] if cluster_type is not None: if cluster.cluster_type is not None and cluster_type != cluster.cluster_type: From fd9120b3fc8531c52d7dfcecafa4b1c95418647a Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Fri, 26 Feb 2021 00:26:31 +0100 Subject: [PATCH 2/8] MentionMisc --- udapi/core/coref.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/udapi/core/coref.py b/udapi/core/coref.py index 1e2df00b..793dd4d0 100644 --- a/udapi/core/coref.py +++ b/udapi/core/coref.py @@ -6,7 +6,7 @@ @functools.total_ordering class CorefMention(object): """Class for representing a mention (instance of an entity).""" - __slots__ = ['_head', '_cluster', '_bridging', '_words'] + __slots__ = ['_head', '_cluster', '_bridging', '_words', 'misc'] def __init__(self, head, cluster=None): self._head = head @@ -15,6 +15,7 @@ def __init__(self, head, cluster=None): cluster._mentions.append(self) self._bridging = None self._words = [] + self.misc = None def __lt__(self, other): """Does this mention precedes (word-order wise) the `other` mention? @@ -195,7 +196,7 @@ def load_coref_from_misc(doc): if node.misc["MentionSpan" + index_str]: mention.span = node.misc["MentionSpan" + index_str] else: - mentions.words = [node] + mention.words = [node] cluster_type = node.misc["ClusterType" + index_str] if cluster_type is not None: if cluster.cluster_type is not None and cluster_type != cluster.cluster_type: @@ -204,6 +205,7 @@ def load_coref_from_misc(doc): # TODO deserialize Bridging and SplitAnte mention._bridging = node.misc["Bridging" + index_str] cluster._split_ante = node.misc["SplitAnte" + index_str] + mention.misc = node.misc["MentionMisc" + index_str] index += 1 index_str = f"[{index}]" cluster_id = node.misc["ClusterId" + index_str] @@ -239,6 +241,8 @@ def store_coref_to_misc(doc): head.misc["ClusterType" + index_str] = cluster.cluster_type head.misc["Bridging" + index_str] = mention.bridging head.misc["SplitAnte" + index_str] = cluster.split_ante + if mention.misc: + head.misc["MentionMisc" + index_str] = mention.misc def span_to_nodes(root, span): From b4987f091c1912e262b4043cddd2ebb5f0f722dc Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Fri, 26 Feb 2021 18:43:18 +0100 Subject: [PATCH 3/8] support for Bridging --- udapi/core/coref.py | 84 ++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 80 insertions(+), 4 deletions(-) diff --git a/udapi/core/coref.py b/udapi/core/coref.py index 793dd4d0..46d42e95 100644 --- a/udapi/core/coref.py +++ b/udapi/core/coref.py @@ -1,6 +1,7 @@ """Classes for handling coreference.""" import re import functools +import collections import logging @functools.total_ordering @@ -59,6 +60,9 @@ def cluster(self, new_cluster): @property def bridging(self): + if self._bridging: + return self._bridging + self._bridging = BridgingLinks(self) return self._bridging # TODO add/edit bridging @@ -156,7 +160,7 @@ def split_ante(self): # TODO add/edit split_ante - # TODO adapt depending on how mention.bridging is implemented (callable list subclass) + # TODO or should we create a BridgingLinks instance with a fake src_mention? def all_bridging(self): for m in self._mentions: if m._bridging: @@ -164,6 +168,75 @@ def all_bridging(self): yield b +BridgingLink = collections.namedtuple('BridgingLink', 'target relation') + + +class BridgingLinks(collections.abc.MutableSequence): + """BridgingLinks class serves as a list of BridgingLinks tuples with additional methods. + + Example usage: + >>> bl = BridgingLinks(src_mention) # empty links + >>> bl = BridgingLinks(src_mention, [(c12, 'Part'), (c56, 'Subset')]) # from a list of tuples + >>> bl = BridgingLinks(src_mention, 'c12:Part,c56:Subset') # from a string + >>> for cluster, relation in bl: + >>> print(f"{bl.src_mention} ->{relation}-> {cluster.cluster_id}") + >>> print(str(bl)) # c12:Part,c56:Subset + >>> bl('Part').targets == [c12] + >>> bl('Part|Subset').targets == [c12, c56] + >>> bl.append((c89, 'Funct')) + """ + def __init__(self, src_mention, value=None): + self.src_mention = src_mention + self._data = [] + if value is not None: + if isinstance(value, str): + self._from_string(string) + elif isinstance(value, collections.abc.Sequence): + for v in value: + self._data.append(BridgingLink(v[0], v[1])) + super().__init__() + + def __getitem__(self, key): + return self._data[key] + + def __len__(self): + return len(self._data) + + # TODO delete backlinks of old links + def __setitem__(self, key, new_value): + self._data[key] = BridgingLink(new_value[0], new_value[1]) + + def __delitem__(self, key): + del self._data[key] + + def insert(self, key, new_value): + self._data.insert(key, BridgingLink(new_value[0], new_value[1])) + + def __str__(self): + return ','.join(f'{l.target._cluster_id}:{l.relation}' for l in self) + + def _from_string(self, string): + self._data.clear() + clusters = self.src_mention.head.root.coref_clusters + for link_str in string.split(','): + target, relation = link_str.split(':') + self._data.append(BridgingLink(clusters[target], relation)) + + def __call__(self, relations_re=None): + """Return a subset of links contained in this list as specified by the args. + Args: + relations: only links with a relation matching this regular expression will be returned + """ + if relations_re is None: + return self + return Links(self.src_mention, [l for l in self._data if re.match(relations_re, l.relation)]) + + @property + def targets(self): + """Return a list of the target clusters (without relations).""" + return [link.target for link in self._data] + + def create_coref_cluster(head, cluster_id=None, cluster_type=None, **kwargs): clusters = head.root.bundle.document.coref_clusters if not cluster_id: @@ -202,8 +275,10 @@ def load_coref_from_misc(doc): if cluster.cluster_type is not None and cluster_type != cluster.cluster_type: logging.warning(f"cluster_type mismatch in {node}: {cluster.cluster_type} != {cluster_type}") cluster.cluster_type = cluster_type - # TODO deserialize Bridging and SplitAnte - mention._bridging = node.misc["Bridging" + index_str] + bridging_str = node.misc["Bridging" + index_str] + if bridging_str: + mention._bridging = BridgingLinks(mention, bridging_str) + # TODO deserialize SplitAnte cluster._split_ante = node.misc["SplitAnte" + index_str] mention.misc = node.misc["MentionMisc" + index_str] index += 1 @@ -239,7 +314,8 @@ def store_coref_to_misc(doc): head.misc["ClusterId" + index_str] = cluster.cluster_id head.misc["MentionSpan" + index_str] = mention.span head.misc["ClusterType" + index_str] = cluster.cluster_type - head.misc["Bridging" + index_str] = mention.bridging + if mention._bridging: + head.misc["Bridging" + index_str] = str(mention.bridging) head.misc["SplitAnte" + index_str] = cluster.split_ante if mention.misc: head.misc["MentionMisc" + index_str] = mention.misc From c83debdb596692e7006f3303a6faec644ca7df9c Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Fri, 26 Feb 2021 21:24:18 +0100 Subject: [PATCH 4/8] handle SplitAnte --- udapi/core/coref.py | 33 +++++++++++++++++++++------------ 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/udapi/core/coref.py b/udapi/core/coref.py index 46d42e95..5c8f5b89 100644 --- a/udapi/core/coref.py +++ b/udapi/core/coref.py @@ -102,13 +102,13 @@ def span(self, new_span): class CorefCluster(object): """Class for representing all mentions of a given entity.""" - __slots__ = ['_cluster_id', '_mentions', 'cluster_type', '_split_ante'] + __slots__ = ['_cluster_id', '_mentions', 'cluster_type', 'split_ante'] def __init__(self, cluster_id, cluster_type=None): self._cluster_id = cluster_id self._mentions = [] self.cluster_type = cluster_type - self._split_ante = None + self.split_ante = [] @property def cluster_id(self): @@ -154,12 +154,6 @@ def create_mention(self, head=None, mention_words=None, mention_span=None): mention.span = mention_span return mention - @property - def split_ante(self): - return self._split_ante - - # TODO add/edit split_ante - # TODO or should we create a BridgingLinks instance with a fake src_mention? def all_bridging(self): for m in self._mentions: @@ -202,7 +196,7 @@ def __getitem__(self, key): def __len__(self): return len(self._data) - # TODO delete backlinks of old links + # TODO delete backlinks of old links, dtto for SplitAnte def __setitem__(self, key, new_value): self._data[key] = BridgingLink(new_value[0], new_value[1]) @@ -275,11 +269,24 @@ def load_coref_from_misc(doc): if cluster.cluster_type is not None and cluster_type != cluster.cluster_type: logging.warning(f"cluster_type mismatch in {node}: {cluster.cluster_type} != {cluster_type}") cluster.cluster_type = cluster_type + bridging_str = node.misc["Bridging" + index_str] if bridging_str: mention._bridging = BridgingLinks(mention, bridging_str) - # TODO deserialize SplitAnte - cluster._split_ante = node.misc["SplitAnte" + index_str] + + split_ante_str = node.misc["SplitAnte" + index_str] + if split_ante_str: + split_antes = [] + for ante_str in split_ante_str.split('+'): + if ante_str in clusters: + split_antes.append(clusters[ante_str]) + else: + # split cataphora, e.g. "We, that is you and me..." + cluster = CorefCluster(ante_str) + clusters[ante_str] = cluster + split_antes.append(cluster) + cluster.split_ante = split_antes + mention.misc = node.misc["MentionMisc" + index_str] index += 1 index_str = f"[{index}]" @@ -316,7 +323,9 @@ def store_coref_to_misc(doc): head.misc["ClusterType" + index_str] = cluster.cluster_type if mention._bridging: head.misc["Bridging" + index_str] = str(mention.bridging) - head.misc["SplitAnte" + index_str] = cluster.split_ante + if cluster.split_ante: + serialized = '+'.join((c.cluster_id for c in cluster.split_ante)) + head.misc["SplitAnte" + index_str] = serialized if mention.misc: head.misc["MentionMisc" + index_str] = mention.misc From 2ffdb2a2a605691855d8ac958fd0f4b8908030bb Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Sat, 27 Feb 2021 13:09:24 +0100 Subject: [PATCH 5/8] bugfix: prevent infinite recursion when loading coref_clusters from MISC We cannot access src_mention.head.root.coref_clusters when loading Bridging because these are not ready yet. --- udapi/core/coref.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/udapi/core/coref.py b/udapi/core/coref.py index 5c8f5b89..818bb248 100644 --- a/udapi/core/coref.py +++ b/udapi/core/coref.py @@ -171,7 +171,7 @@ class BridgingLinks(collections.abc.MutableSequence): Example usage: >>> bl = BridgingLinks(src_mention) # empty links >>> bl = BridgingLinks(src_mention, [(c12, 'Part'), (c56, 'Subset')]) # from a list of tuples - >>> bl = BridgingLinks(src_mention, 'c12:Part,c56:Subset') # from a string + >>> bl = BridgingLinks(src_mention, 'c12:Part,c56:Subset', clusters) # from a string >>> for cluster, relation in bl: >>> print(f"{bl.src_mention} ->{relation}-> {cluster.cluster_id}") >>> print(str(bl)) # c12:Part,c56:Subset @@ -179,12 +179,18 @@ class BridgingLinks(collections.abc.MutableSequence): >>> bl('Part|Subset').targets == [c12, c56] >>> bl.append((c89, 'Funct')) """ - def __init__(self, src_mention, value=None): + def __init__(self, src_mention, value=None, clusters=None): self.src_mention = src_mention self._data = [] if value is not None: if isinstance(value, str): - self._from_string(string) + if clusters is None: + raise ValueError('BridgingClusters: clusters must be provided if initializing with a string') + try: + self._from_string(value, clusters) + except ValueError: + logging.error(f"Problem when parsing {value} in {src_mention.words[0]}:\n") + raise elif isinstance(value, collections.abc.Sequence): for v in value: self._data.append(BridgingLink(v[0], v[1])) @@ -209,9 +215,8 @@ def insert(self, key, new_value): def __str__(self): return ','.join(f'{l.target._cluster_id}:{l.relation}' for l in self) - def _from_string(self, string): + def _from_string(self, string, clusters): self._data.clear() - clusters = self.src_mention.head.root.coref_clusters for link_str in string.split(','): target, relation = link_str.split(':') self._data.append(BridgingLink(clusters[target], relation)) @@ -272,7 +277,7 @@ def load_coref_from_misc(doc): bridging_str = node.misc["Bridging" + index_str] if bridging_str: - mention._bridging = BridgingLinks(mention, bridging_str) + mention._bridging = BridgingLinks(mention, bridging_str, clusters) split_ante_str = node.misc["SplitAnte" + index_str] if split_ante_str: From 399322d82b3fe8246460bf7f9a0c54eefadca23c Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Sat, 27 Feb 2021 15:18:27 +0100 Subject: [PATCH 6/8] corefud.PrintCluster for revealing cluster_id used in several documents --- udapi/block/corefud/printcluster.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 udapi/block/corefud/printcluster.py diff --git a/udapi/block/corefud/printcluster.py b/udapi/block/corefud/printcluster.py new file mode 100644 index 00000000..5f44fab2 --- /dev/null +++ b/udapi/block/corefud/printcluster.py @@ -0,0 +1,19 @@ +from udapi.core.block import Block +from collections import Counter + +class PrintCluster(Block): + """Block corefud.PrintCluster prints all mentions of a given cluster.""" + + def __init__(self, cluster_id, **kwargs): + super().__init__(**kwargs) + self.cluster_id = cluster_id + + def process_document(self, doc): + cluster = doc.coref_clusters.get(self.cluster_id) + if cluster and cluster.mentions: + print(f"Coref cluster {self.cluster_id} has {len(cluster.mentions)} mentions in document {doc.meta['docname']}:") + counter = Counter() + for mention in cluster.mentions: + counter[' '.join([w.form for w in mention.words])] += 1 + for form, count in counter.most_common(): + print(f"{count:4}: {form}") From 30ebb49f514dad17322111037057c1299f07031d Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Sun, 28 Feb 2021 13:50:27 +0100 Subject: [PATCH 7/8] fixing typo found in the review --- udapi/core/coref.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/udapi/core/coref.py b/udapi/core/coref.py index 818bb248..41ba5a41 100644 --- a/udapi/core/coref.py +++ b/udapi/core/coref.py @@ -60,9 +60,8 @@ def cluster(self, new_cluster): @property def bridging(self): - if self._bridging: - return self._bridging - self._bridging = BridgingLinks(self) + if not self._bridging: + self._bridging = BridgingLinks(self) return self._bridging # TODO add/edit bridging @@ -166,7 +165,7 @@ def all_bridging(self): class BridgingLinks(collections.abc.MutableSequence): - """BridgingLinks class serves as a list of BridgingLinks tuples with additional methods. + """BridgingLinks class serves as a list of BridgingLink tuples with additional methods. Example usage: >>> bl = BridgingLinks(src_mention) # empty links From 44926b440436440be4a35f0f511d6937e69e5c59 Mon Sep 17 00:00:00 2001 From: Martin Popel Date: Sun, 28 Feb 2021 13:56:43 +0100 Subject: [PATCH 8/8] coref must be loaded also from empty nodes An alternative to #77 --- udapi/core/coref.py | 4 ++-- udapi/core/document.py | 10 +++++++++- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/udapi/core/coref.py b/udapi/core/coref.py index 41ba5a41..143e50ff 100644 --- a/udapi/core/coref.py +++ b/udapi/core/coref.py @@ -252,7 +252,7 @@ def create_coref_cluster(head, cluster_id=None, cluster_type=None, **kwargs): def load_coref_from_misc(doc): clusters = {} - for node in doc.nodes: + for node in doc.nodes_and_empty: index, index_str = 0, "" cluster_id = node.misc["ClusterId"] if not cluster_id: @@ -302,7 +302,7 @@ def store_coref_to_misc(doc): if not doc._coref_clusters: return attrs = ("ClusterId", "MentionSpan", "ClusterType", "Bridging", "SplitAnte") - for node in doc.nodes: + for node in doc.nodes_and_empty: for key in list(node.misc): if any(re.match(attr + r'(\[\d+\])?$', key) for attr in attrs): del node.misc[key] diff --git a/udapi/core/document.py b/udapi/core/document.py index 58b4b34e..f02f831e 100644 --- a/udapi/core/document.py +++ b/udapi/core/document.py @@ -88,12 +88,20 @@ def trees(self): @property def nodes(self): - """An iterator over all nodes in the document.""" + """An iterator over all nodes (excluding empty nodes) in the document.""" for bundle in self: for tree in bundle: for node in tree._descendants: yield node + @property + def nodes_and_empty(self): + """An iterator over all nodes and empty nodes in the document.""" + for bundle in self: + for tree in bundle: + for node in tree.descendants_and_empty: + yield node + def draw(self, **kwargs): """Pretty print the trees using TextModeTrees.""" TextModeTrees(**kwargs).run(self)