From c15b0f577ecb6c27cf16580dcb91c7391aed866d Mon Sep 17 00:00:00 2001
From: Martin Popel <popel@ufal.mff.cuni.cz>
Date: Fri, 26 Feb 2021 00:15:54 +0100
Subject: [PATCH 1/8] If MentionSpan is not provided, suppose only the head is
 in the span

Otherwise, the mention is not reachable via `head.coref_mentions`
and `len(mention.words) == 0`, which is both misleading.
---
 udapi/core/coref.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/udapi/core/coref.py b/udapi/core/coref.py
index be8f6883..1e2df00b 100644
--- a/udapi/core/coref.py
+++ b/udapi/core/coref.py
@@ -194,6 +194,8 @@ def load_coref_from_misc(doc):
             mention = CorefMention(node, cluster)
             if node.misc["MentionSpan" + index_str]:
                 mention.span = node.misc["MentionSpan" + index_str]
+            else:
+                mentions.words = [node]
             cluster_type = node.misc["ClusterType" + index_str]
             if cluster_type is not None:
                 if cluster.cluster_type is not None and cluster_type != cluster.cluster_type:

From fd9120b3fc8531c52d7dfcecafa4b1c95418647a Mon Sep 17 00:00:00 2001
From: Martin Popel <popel@ufal.mff.cuni.cz>
Date: Fri, 26 Feb 2021 00:26:31 +0100
Subject: [PATCH 2/8] MentionMisc

---
 udapi/core/coref.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/udapi/core/coref.py b/udapi/core/coref.py
index 1e2df00b..793dd4d0 100644
--- a/udapi/core/coref.py
+++ b/udapi/core/coref.py
@@ -6,7 +6,7 @@
 @functools.total_ordering
 class CorefMention(object):
     """Class for representing a mention (instance of an entity)."""
-    __slots__ = ['_head', '_cluster', '_bridging', '_words']
+    __slots__ = ['_head', '_cluster', '_bridging', '_words', 'misc']
 
     def __init__(self, head, cluster=None):
         self._head = head
@@ -15,6 +15,7 @@ def __init__(self, head, cluster=None):
             cluster._mentions.append(self)
         self._bridging = None
         self._words = []
+        self.misc = None
 
     def __lt__(self, other):
         """Does this mention precedes (word-order wise) the `other` mention?
@@ -195,7 +196,7 @@ def load_coref_from_misc(doc):
             if node.misc["MentionSpan" + index_str]:
                 mention.span = node.misc["MentionSpan" + index_str]
             else:
-                mentions.words = [node]
+                mention.words = [node]
             cluster_type = node.misc["ClusterType" + index_str]
             if cluster_type is not None:
                 if cluster.cluster_type is not None and cluster_type != cluster.cluster_type:
@@ -204,6 +205,7 @@ def load_coref_from_misc(doc):
             # TODO deserialize Bridging and SplitAnte
             mention._bridging = node.misc["Bridging" + index_str]
             cluster._split_ante = node.misc["SplitAnte" + index_str]
+            mention.misc = node.misc["MentionMisc" + index_str]
             index += 1
             index_str = f"[{index}]"
             cluster_id = node.misc["ClusterId" + index_str]
@@ -239,6 +241,8 @@ def store_coref_to_misc(doc):
             head.misc["ClusterType" + index_str] = cluster.cluster_type
             head.misc["Bridging" + index_str] = mention.bridging
             head.misc["SplitAnte" + index_str] = cluster.split_ante
+            if mention.misc:
+                head.misc["MentionMisc" + index_str] = mention.misc
 
 
 def span_to_nodes(root, span):

From b4987f091c1912e262b4043cddd2ebb5f0f722dc Mon Sep 17 00:00:00 2001
From: Martin Popel <popel@ufal.mff.cuni.cz>
Date: Fri, 26 Feb 2021 18:43:18 +0100
Subject: [PATCH 3/8] support for Bridging

---
 udapi/core/coref.py | 84 ++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 80 insertions(+), 4 deletions(-)

diff --git a/udapi/core/coref.py b/udapi/core/coref.py
index 793dd4d0..46d42e95 100644
--- a/udapi/core/coref.py
+++ b/udapi/core/coref.py
@@ -1,6 +1,7 @@
 """Classes for handling coreference."""
 import re
 import functools
+import collections
 import logging
 
 @functools.total_ordering
@@ -59,6 +60,9 @@ def cluster(self, new_cluster):
 
     @property
     def bridging(self):
+        if self._bridging:
+            return self._bridging
+        self._bridging = BridgingLinks(self)
         return self._bridging
 
     # TODO add/edit bridging
@@ -156,7 +160,7 @@ def split_ante(self):
 
     # TODO add/edit split_ante
 
-    # TODO adapt depending on how mention.bridging is implemented (callable list subclass)
+    # TODO or should we create a BridgingLinks instance with a fake src_mention?
     def all_bridging(self):
         for m in self._mentions:
             if m._bridging:
@@ -164,6 +168,75 @@ def all_bridging(self):
                     yield b
 
 
+BridgingLink = collections.namedtuple('BridgingLink', 'target relation')
+
+
+class BridgingLinks(collections.abc.MutableSequence):
+    """BridgingLinks class serves as a list of BridgingLinks tuples with additional methods.
+
+    Example usage:
+    >>> bl = BridgingLinks(src_mention)                                   # empty links
+    >>> bl = BridgingLinks(src_mention, [(c12, 'Part'), (c56, 'Subset')]) # from a list of tuples
+    >>> bl = BridgingLinks(src_mention, 'c12:Part,c56:Subset')            # from a string
+    >>> for cluster, relation in bl:
+    >>>     print(f"{bl.src_mention} ->{relation}-> {cluster.cluster_id}")
+    >>> print(str(bl)) # c12:Part,c56:Subset
+    >>> bl('Part').targets == [c12]
+    >>> bl('Part|Subset').targets == [c12, c56]
+    >>> bl.append((c89, 'Funct'))
+    """
+    def __init__(self, src_mention, value=None):
+        self.src_mention = src_mention
+        self._data = []
+        if value is not None:
+            if isinstance(value, str):
+                self._from_string(string)
+            elif isinstance(value, collections.abc.Sequence):
+                for v in value:
+                    self._data.append(BridgingLink(v[0], v[1]))
+        super().__init__()
+
+    def __getitem__(self, key):
+        return self._data[key]
+
+    def __len__(self):
+        return len(self._data)
+
+    # TODO delete backlinks of old links
+    def __setitem__(self, key, new_value):
+        self._data[key] = BridgingLink(new_value[0], new_value[1])
+
+    def __delitem__(self, key):
+        del self._data[key]
+
+    def insert(self, key, new_value):
+        self._data.insert(key, BridgingLink(new_value[0], new_value[1]))
+
+    def __str__(self):
+        return ','.join(f'{l.target._cluster_id}:{l.relation}' for l in self)
+
+    def _from_string(self, string):
+        self._data.clear()
+        clusters = self.src_mention.head.root.coref_clusters
+        for link_str in string.split(','):
+            target, relation = link_str.split(':')
+            self._data.append(BridgingLink(clusters[target], relation))
+
+    def __call__(self, relations_re=None):
+        """Return a subset of links contained in this list as specified by the args.
+        Args:
+        relations: only links with a relation matching this regular expression will be returned
+        """
+        if relations_re is None:
+            return self
+        return Links(self.src_mention, [l for l in self._data if re.match(relations_re, l.relation)])
+
+    @property
+    def targets(self):
+        """Return a list of the target clusters (without relations)."""
+        return [link.target for link in self._data]
+
+
 def create_coref_cluster(head, cluster_id=None, cluster_type=None, **kwargs):
     clusters = head.root.bundle.document.coref_clusters
     if not cluster_id:
@@ -202,8 +275,10 @@ def load_coref_from_misc(doc):
                 if cluster.cluster_type is not None and cluster_type != cluster.cluster_type:
                     logging.warning(f"cluster_type mismatch in {node}: {cluster.cluster_type} != {cluster_type}")
                 cluster.cluster_type = cluster_type
-            # TODO deserialize Bridging and SplitAnte
-            mention._bridging = node.misc["Bridging" + index_str]
+            bridging_str = node.misc["Bridging" + index_str]
+            if bridging_str:
+                mention._bridging = BridgingLinks(mention, bridging_str)
+            # TODO deserialize SplitAnte
             cluster._split_ante = node.misc["SplitAnte" + index_str]
             mention.misc = node.misc["MentionMisc" + index_str]
             index += 1
@@ -239,7 +314,8 @@ def store_coref_to_misc(doc):
             head.misc["ClusterId" + index_str] = cluster.cluster_id
             head.misc["MentionSpan" + index_str] = mention.span
             head.misc["ClusterType" + index_str] = cluster.cluster_type
-            head.misc["Bridging" + index_str] = mention.bridging
+            if mention._bridging:
+                head.misc["Bridging" + index_str] = str(mention.bridging)
             head.misc["SplitAnte" + index_str] = cluster.split_ante
             if mention.misc:
                 head.misc["MentionMisc" + index_str] = mention.misc

From c83debdb596692e7006f3303a6faec644ca7df9c Mon Sep 17 00:00:00 2001
From: Martin Popel <popel@ufal.mff.cuni.cz>
Date: Fri, 26 Feb 2021 21:24:18 +0100
Subject: [PATCH 4/8] handle SplitAnte

---
 udapi/core/coref.py | 33 +++++++++++++++++++++------------
 1 file changed, 21 insertions(+), 12 deletions(-)

diff --git a/udapi/core/coref.py b/udapi/core/coref.py
index 46d42e95..5c8f5b89 100644
--- a/udapi/core/coref.py
+++ b/udapi/core/coref.py
@@ -102,13 +102,13 @@ def span(self, new_span):
 
 class CorefCluster(object):
     """Class for representing all mentions of a given entity."""
-    __slots__ = ['_cluster_id', '_mentions', 'cluster_type', '_split_ante']
+    __slots__ = ['_cluster_id', '_mentions', 'cluster_type', 'split_ante']
 
     def __init__(self, cluster_id, cluster_type=None):
         self._cluster_id = cluster_id
         self._mentions = []
         self.cluster_type = cluster_type
-        self._split_ante = None
+        self.split_ante = []
 
     @property
     def cluster_id(self):
@@ -154,12 +154,6 @@ def create_mention(self, head=None, mention_words=None, mention_span=None):
             mention.span = mention_span
         return mention
 
-    @property
-    def split_ante(self):
-        return self._split_ante
-
-    # TODO add/edit split_ante
-
     # TODO or should we create a BridgingLinks instance with a fake src_mention?
     def all_bridging(self):
         for m in self._mentions:
@@ -202,7 +196,7 @@ def __getitem__(self, key):
     def __len__(self):
         return len(self._data)
 
-    # TODO delete backlinks of old links
+    # TODO delete backlinks of old links, dtto for SplitAnte
     def __setitem__(self, key, new_value):
         self._data[key] = BridgingLink(new_value[0], new_value[1])
 
@@ -275,11 +269,24 @@ def load_coref_from_misc(doc):
                 if cluster.cluster_type is not None and cluster_type != cluster.cluster_type:
                     logging.warning(f"cluster_type mismatch in {node}: {cluster.cluster_type} != {cluster_type}")
                 cluster.cluster_type = cluster_type
+
             bridging_str = node.misc["Bridging" + index_str]
             if bridging_str:
                 mention._bridging = BridgingLinks(mention, bridging_str)
-            # TODO deserialize SplitAnte
-            cluster._split_ante = node.misc["SplitAnte" + index_str]
+
+            split_ante_str = node.misc["SplitAnte" + index_str]
+            if split_ante_str:
+                split_antes = []
+                for ante_str in split_ante_str.split('+'):
+                    if ante_str in clusters:
+                        split_antes.append(clusters[ante_str])
+                    else:
+                        # split cataphora, e.g. "We, that is you and me..."
+                        cluster = CorefCluster(ante_str)
+                        clusters[ante_str] = cluster
+                        split_antes.append(cluster)
+                cluster.split_ante = split_antes
+
             mention.misc = node.misc["MentionMisc" + index_str]
             index += 1
             index_str = f"[{index}]"
@@ -316,7 +323,9 @@ def store_coref_to_misc(doc):
             head.misc["ClusterType" + index_str] = cluster.cluster_type
             if mention._bridging:
                 head.misc["Bridging" + index_str] = str(mention.bridging)
-            head.misc["SplitAnte" + index_str] = cluster.split_ante
+            if cluster.split_ante:
+                serialized = '+'.join((c.cluster_id for c in cluster.split_ante))
+                head.misc["SplitAnte" + index_str] = serialized
             if mention.misc:
                 head.misc["MentionMisc" + index_str] = mention.misc
 

From 2ffdb2a2a605691855d8ac958fd0f4b8908030bb Mon Sep 17 00:00:00 2001
From: Martin Popel <popel@ufal.mff.cuni.cz>
Date: Sat, 27 Feb 2021 13:09:24 +0100
Subject: [PATCH 5/8] bugfix: prevent infinite recursion when loading
 coref_clusters from MISC

We cannot access src_mention.head.root.coref_clusters when loading Bridging
because these are not ready yet.
---
 udapi/core/coref.py | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/udapi/core/coref.py b/udapi/core/coref.py
index 5c8f5b89..818bb248 100644
--- a/udapi/core/coref.py
+++ b/udapi/core/coref.py
@@ -171,7 +171,7 @@ class BridgingLinks(collections.abc.MutableSequence):
     Example usage:
     >>> bl = BridgingLinks(src_mention)                                   # empty links
     >>> bl = BridgingLinks(src_mention, [(c12, 'Part'), (c56, 'Subset')]) # from a list of tuples
-    >>> bl = BridgingLinks(src_mention, 'c12:Part,c56:Subset')            # from a string
+    >>> bl = BridgingLinks(src_mention, 'c12:Part,c56:Subset', clusters)  # from a string
     >>> for cluster, relation in bl:
     >>>     print(f"{bl.src_mention} ->{relation}-> {cluster.cluster_id}")
     >>> print(str(bl)) # c12:Part,c56:Subset
@@ -179,12 +179,18 @@ class BridgingLinks(collections.abc.MutableSequence):
     >>> bl('Part|Subset').targets == [c12, c56]
     >>> bl.append((c89, 'Funct'))
     """
-    def __init__(self, src_mention, value=None):
+    def __init__(self, src_mention, value=None, clusters=None):
         self.src_mention = src_mention
         self._data = []
         if value is not None:
             if isinstance(value, str):
-                self._from_string(string)
+                if clusters is None:
+                    raise ValueError('BridgingClusters: clusters must be provided if initializing with a string')
+                try:
+                    self._from_string(value, clusters)
+                except ValueError:
+                    logging.error(f"Problem when parsing {value} in {src_mention.words[0]}:\n")
+                    raise
             elif isinstance(value, collections.abc.Sequence):
                 for v in value:
                     self._data.append(BridgingLink(v[0], v[1]))
@@ -209,9 +215,8 @@ def insert(self, key, new_value):
     def __str__(self):
         return ','.join(f'{l.target._cluster_id}:{l.relation}' for l in self)
 
-    def _from_string(self, string):
+    def _from_string(self, string, clusters):
         self._data.clear()
-        clusters = self.src_mention.head.root.coref_clusters
         for link_str in string.split(','):
             target, relation = link_str.split(':')
             self._data.append(BridgingLink(clusters[target], relation))
@@ -272,7 +277,7 @@ def load_coref_from_misc(doc):
 
             bridging_str = node.misc["Bridging" + index_str]
             if bridging_str:
-                mention._bridging = BridgingLinks(mention, bridging_str)
+                mention._bridging = BridgingLinks(mention, bridging_str, clusters)
 
             split_ante_str = node.misc["SplitAnte" + index_str]
             if split_ante_str:

From 399322d82b3fe8246460bf7f9a0c54eefadca23c Mon Sep 17 00:00:00 2001
From: Martin Popel <popel@ufal.mff.cuni.cz>
Date: Sat, 27 Feb 2021 15:18:27 +0100
Subject: [PATCH 6/8] corefud.PrintCluster for revealing cluster_id used in
 several documents

---
 udapi/block/corefud/printcluster.py | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)
 create mode 100644 udapi/block/corefud/printcluster.py

diff --git a/udapi/block/corefud/printcluster.py b/udapi/block/corefud/printcluster.py
new file mode 100644
index 00000000..5f44fab2
--- /dev/null
+++ b/udapi/block/corefud/printcluster.py
@@ -0,0 +1,19 @@
+from udapi.core.block import Block
+from collections import Counter
+
+class PrintCluster(Block):
+    """Block corefud.PrintCluster prints all mentions of a given cluster."""
+
+    def __init__(self, cluster_id, **kwargs):
+        super().__init__(**kwargs)
+        self.cluster_id = cluster_id
+
+    def process_document(self, doc):
+        cluster = doc.coref_clusters.get(self.cluster_id)
+        if cluster and cluster.mentions:
+            print(f"Coref cluster {self.cluster_id} has {len(cluster.mentions)} mentions in document {doc.meta['docname']}:")
+            counter = Counter()
+            for mention in cluster.mentions:
+                counter[' '.join([w.form for w in mention.words])] += 1
+            for form, count in counter.most_common():
+                print(f"{count:4}: {form}")

From 30ebb49f514dad17322111037057c1299f07031d Mon Sep 17 00:00:00 2001
From: Martin Popel <popel@ufal.mff.cuni.cz>
Date: Sun, 28 Feb 2021 13:50:27 +0100
Subject: [PATCH 7/8] fixing typo found in the review

---
 udapi/core/coref.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/udapi/core/coref.py b/udapi/core/coref.py
index 818bb248..41ba5a41 100644
--- a/udapi/core/coref.py
+++ b/udapi/core/coref.py
@@ -60,9 +60,8 @@ def cluster(self, new_cluster):
 
     @property
     def bridging(self):
-        if self._bridging:
-            return self._bridging
-        self._bridging = BridgingLinks(self)
+        if not self._bridging:
+            self._bridging = BridgingLinks(self)
         return self._bridging
 
     # TODO add/edit bridging
@@ -166,7 +165,7 @@ def all_bridging(self):
 
 
 class BridgingLinks(collections.abc.MutableSequence):
-    """BridgingLinks class serves as a list of BridgingLinks tuples with additional methods.
+    """BridgingLinks class serves as a list of BridgingLink tuples with additional methods.
 
     Example usage:
     >>> bl = BridgingLinks(src_mention)                                   # empty links

From 44926b440436440be4a35f0f511d6937e69e5c59 Mon Sep 17 00:00:00 2001
From: Martin Popel <popel@ufal.mff.cuni.cz>
Date: Sun, 28 Feb 2021 13:56:43 +0100
Subject: [PATCH 8/8] coref must be loaded also from empty nodes

An alternative to #77
---
 udapi/core/coref.py    |  4 ++--
 udapi/core/document.py | 10 +++++++++-
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/udapi/core/coref.py b/udapi/core/coref.py
index 41ba5a41..143e50ff 100644
--- a/udapi/core/coref.py
+++ b/udapi/core/coref.py
@@ -252,7 +252,7 @@ def create_coref_cluster(head, cluster_id=None, cluster_type=None, **kwargs):
 
 def load_coref_from_misc(doc):
     clusters = {}
-    for node in doc.nodes:
+    for node in doc.nodes_and_empty:
         index, index_str = 0, ""
         cluster_id = node.misc["ClusterId"]
         if not cluster_id:
@@ -302,7 +302,7 @@ def store_coref_to_misc(doc):
     if not doc._coref_clusters:
         return
     attrs = ("ClusterId", "MentionSpan", "ClusterType", "Bridging", "SplitAnte")
-    for node in doc.nodes:
+    for node in doc.nodes_and_empty:
         for key in list(node.misc):
             if any(re.match(attr + r'(\[\d+\])?$', key) for attr in attrs):
                 del node.misc[key]
diff --git a/udapi/core/document.py b/udapi/core/document.py
index 58b4b34e..f02f831e 100644
--- a/udapi/core/document.py
+++ b/udapi/core/document.py
@@ -88,12 +88,20 @@ def trees(self):
 
     @property
     def nodes(self):
-        """An iterator over all nodes in the document."""
+        """An iterator over all nodes (excluding empty nodes) in the document."""
         for bundle in self:
             for tree in bundle:
                 for node in tree._descendants:
                     yield node
 
+    @property
+    def nodes_and_empty(self):
+        """An iterator over all nodes and empty nodes in the document."""
+        for bundle in self:
+            for tree in bundle:
+                for node in tree.descendants_and_empty:
+                    yield node
+
     def draw(self, **kwargs):
         """Pretty print the trees using TextModeTrees."""
         TextModeTrees(**kwargs).run(self)