From a76d15bad7406fe1a79d2f9d1d95ae7a5324dd82 Mon Sep 17 00:00:00 2001 From: Michal Novak Date: Sat, 13 Apr 2024 15:43:37 +0200 Subject: [PATCH 1/2] deps to empty tokens must be recursively redirected to the non-empty ancestors --- udapi/block/corefud/delete.py | 34 +++++++++++++++++++++++++++++----- 1 file changed, 29 insertions(+), 5 deletions(-) diff --git a/udapi/block/corefud/delete.py b/udapi/block/corefud/delete.py index 31d71eca..d246a942 100644 --- a/udapi/block/corefud/delete.py +++ b/udapi/block/corefud/delete.py @@ -10,19 +10,43 @@ def __init__(self, empty=False, **kwargs): super().__init__(**kwargs) self.empty = empty + + def _deps_ignore_nodes(self, node, parents_to_ignore): + """ Retrieve deps from the node, recursively ignoring specified parents. + """ + newdeps = [] + stack = [(node, [])] + while stack: + proc_node, skipped_nodes = stack.pop() + # if there is a cycle of skipped nodes, ground the subtree to the root + if proc_node in skipped_nodes: + newdeps.append({'parent': node.root, 'deprel': 'root'}) + continue + for dep in proc_node.deps: + # keep deps with a parent that shouldn't be ignored + if not dep['parent'] in parents_to_ignore: + newdeps.append(dep) + continue + # process the ignored parent recursively + stack.append((dep['parent'], skipped_nodes + [proc_node])) + return newdeps + def process_document(self, doc): # This block should work both with coreference loaded (deserialized) and not. doc._eid_to_entity = None for root in doc.trees: if self.empty: - root.empty_nodes = [] for node in root.descendants: - if node.raw_deps != '_': - node.raw_deps = '|'.join(d for d in node.raw_deps.split('|') if not '.' in d) - if node.raw_deps == '': - node.raw_deps = '0:root' + # process only the nodes dependent on empty nodes + if not '.' in node.raw_deps: + continue + newdeps = self._deps_ignore_nodes(node, root.empty_nodes) + newdeps_sorted = sorted(set((dep['parent'].ord, dep['deprel']) for dep in newdeps)) + node.raw_deps = '|'.join(f"{p}:{r}" for p, r in newdeps_sorted) + if '.' in node.misc['Functor'].split(':')[0]: del node.misc['Functor'] + root.empty_nodes = [] for node in root.descendants + root.empty_nodes: node._mentions = [] From 8ce75d53f7fa3b0c72c83546839f6957c4816713 Mon Sep 17 00:00:00 2001 From: Michal Novak Date: Sun, 14 Apr 2024 13:21:47 +0200 Subject: [PATCH 2/2] propagate to non-empty ancestors only if the root is unreachable otherwise --- udapi/block/corefud/delete.py | 32 +++++++++++++++++++++++++++++--- 1 file changed, 29 insertions(+), 3 deletions(-) diff --git a/udapi/block/corefud/delete.py b/udapi/block/corefud/delete.py index d246a942..6a4e1388 100644 --- a/udapi/block/corefud/delete.py +++ b/udapi/block/corefud/delete.py @@ -10,6 +10,27 @@ def __init__(self, empty=False, **kwargs): super().__init__(**kwargs) self.empty = empty + def is_root_reachable_by_deps(self, node, parents_to_ignore=None): + """ Check if the root node is reachable from node, possibly after deleting the parents_to_ignore nodes. + """ + stack = [(node, [])] + while stack: + proc_node, path = stack.pop() + # root is reachable + if proc_node == node.root: + break + # path forms a cycle, the root cannot be reached through this branch + if proc_node in path: + continue + for dep in proc_node.deps: + # the root cannot be reached through ignored nodes + if dep['parent'] in parents_to_ignore: + continue + # process the parent recursively + stack.append((dep['parent'], path + [proc_node])) + else: + return False + return True def _deps_ignore_nodes(self, node, parents_to_ignore): """ Retrieve deps from the node, recursively ignoring specified parents. @@ -40,9 +61,14 @@ def process_document(self, doc): # process only the nodes dependent on empty nodes if not '.' in node.raw_deps: continue - newdeps = self._deps_ignore_nodes(node, root.empty_nodes) - newdeps_sorted = sorted(set((dep['parent'].ord, dep['deprel']) for dep in newdeps)) - node.raw_deps = '|'.join(f"{p}:{r}" for p, r in newdeps_sorted) + # just remove empty parents if the root remains reachable + if self.is_root_reachable_by_deps(node, root.empty_nodes): + node.deps = [dep for dep in node.deps if not dep['parent'] in root.empty_nodes] + # otherwise propagate to non-empty ancestors + else: + newdeps = self._deps_ignore_nodes(node, root.empty_nodes) + newdeps_sorted = sorted(set((dep['parent'].ord, dep['deprel']) for dep in newdeps)) + node.raw_deps = '|'.join(f"{p}:{r}" for p, r in newdeps_sorted) if '.' in node.misc['Functor'].split(':')[0]: del node.misc['Functor']