Skip to content

Commit 4b0c5cd

Browse files
committed
A new block to split a token marked as erroneous.
1 parent 7d531c0 commit 4b0c5cd

File tree

1 file changed

+107
-0
lines changed

1 file changed

+107
-0
lines changed

udapi/block/ud/splittoken.py

Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
"""
2+
Block ud.SplitToken will split a given token into multiple tokens.
3+
"""
4+
from udapi.core.block import Block
5+
import re
6+
import logging
7+
8+
9+
class SplitToken(Block):
    """
    Split a token into two or more. A MISC attribute is used to mark the tokens
    that should be split. (The attribute may have been set by an annotator or
    by a previous block that tests the specific conditions under which splitting
    is desired.) Multiword tokens are currently not supported: The node to be
    split cannot belong to a MWT. Note that the result will not be a MWT either
    (use the block ud.AddMwt if that is desired). There will be simply a new
    attribute SpaceAfter=No, accompanied by CorrectSpaceAfter=Yes
    (indicating that this was an error in the source text).
    """

    def __init__(self, misc_name='SplitToken', **kwargs):
        """
        Args:
        misc_name: name of the MISC attribute that can trigger the splitting
            default: SplitToken
            The value of the attribute should indicate where to split the token.
            It should be a string that is identical to node.form except that
            there is exactly one space at each place where the token should be
            split (leading, trailing and consecutive spaces are rejected).
        """
        super().__init__(**kwargs)
        self.misc_name = misc_name

    @staticmethod
    def _parse_morpho(morpho, lemma, upos, xpos, feats):
        """
        Apply one SplitTokenMorpho block to the default annotation of a new token.

        Args:
        morpho: columns separated by a literal backslash + 't', each column
            being NAME=value with NAME one of LEMMA, UPOS, XPOS, FEATS; inside
            FEATS, a literal backslash + 'p' stands for the '|' separator.
            An empty string leaves all defaults unchanged.
        lemma, upos, xpos, feats: default values used for columns that the
            block does not mention.

        Returns:
        tuple (lemma, upos, xpos, feats) with the overrides applied.
        """
        if morpho == '':
            return lemma, upos, xpos, feats
        for column in morpho.split('\\t'):
            # partition() never raises; a column without '=' ends up with a name
            # that matches none of the known columns and is reported below
            # instead of crashing with ValueError on tuple unpacking.
            colname, _, colvalue = column.partition('=')
            if colname == 'LEMMA':
                lemma = colvalue
            elif colname == 'UPOS':
                upos = colvalue
            elif colname == 'FEATS':
                feats = re.sub(r'\\p', '|', colvalue)
            elif colname == 'XPOS':
                xpos = colvalue
            else:
                c = column
                logging.fatal(f"c = {c}")
        return lemma, upos, xpos, feats

    def process_node(self, node):
        """
        The SplitToken (or equivalent) attribute in MISC will trigger action.
        Either the current node will be split to multiple nodes and the
        attribute will be removed from MISC, or a warning will be issued that
        the splitting cannot be done and the attribute will stay in MISC
        (together with a Bug=... attribute describing the problem). Note
        that multiword token lines and empty nodes are not even scanned for
        the attribute, so if it is there, it will stay there but no warning
        will be printed.

        The new nodes are created as children of the original node with the
        relation 'dep' and placed right after it in the word order. Their
        lemma defaults to the new form; UPOS, XPOS and FEATS default to those
        of the original node. An optional MISC attribute SplitTokenMorpho can
        override these defaults: it holds one block per new token (i.e., for
        the second and each following part), blocks being separated by spaces.
        """
        value = node.misc[self.misc_name]
        if value == '':
            return
        if node.multiword_token:
            logging.warning(f"MISC {self.misc_name} cannot be used if the node belongs to a multiword token.")
            node.misc['Bug'] = 'SplittingTokenNotSupportedHere'
            return
        ###!!! This block currently must not be applied on data containing
        ###!!! enhanced dependencies. We must first implement adjustments of
        ###!!! the enhanced structure.
        if node.deps:
            logging.fatal('At present this block cannot be applied to data with enhanced dependencies.')
            # logging.fatal() on its own only emits a CRITICAL record. Return
            # explicitly so that the node is left untouched even when nothing
            # in the logging setup aborts the run.
            return
        # Verify that the value of the MISC attribute can be used as specification
        # of the split.
        if re.match(r'^\s', value) or re.search(r'\s$', value) or re.search(r'\s\s', value):
            logging.warning(f"MISC {self.misc_name} is '{value}'; leading spaces, trailing spaces or multiple consecutive spaces are not allowed.")
            node.misc['Bug'] = f'{self.misc_name}BadValue'
            return
        if re.search(r'\s', node.form):
            logging.warning(f"MISC {self.misc_name} cannot be used with nodes whose forms contain a space (here '{node.form}').")
            node.misc['Bug'] = 'SplittingTokenNotSupportedHere'
            return
        if re.sub(r' ', '', value) != node.form:
            logging.warning(f"MISC {self.misc_name} value '{value}' does not match the word form '{node.form}'.")
            node.misc['Bug'] = f'{self.misc_name}BadValue'
            return
        # Do the split.
        space_after = node.misc['SpaceAfter']
        forms = value.split(' ')
        new_forms = forms[1:]
        # Optionally, SplitTokenMorpho in MISC can have the morphological annotation
        # of the new tokens. For example:
        # SplitTokenMorpho=LEMMA=popisovat\tUPOS=VERB\tFEATS=Aspect=Imp\\pMood=Ind\\pNumber=Sing\\pPerson=3\\pPolarity=Pos\\pTense=Pres\\pVerbForm=Fin\\pVoice=Act
        morpho_spec = node.misc['SplitTokenMorpho']
        if morpho_spec != '':
            morphoblocks = morpho_spec.split(' ')
            del node.misc['SplitTokenMorpho']
            if len(morphoblocks) != len(new_forms):
                logging.warning(f"MISC SplitTokenMorpho has {len(morphoblocks)} block(s) but {len(new_forms)} new token(s) will be created from '{value}'; missing blocks are treated as empty and extra blocks are ignored.")
        else:
            morphoblocks = []
        # zip() stops at the shorter sequence. Pad the morphological blocks so
        # that every new form gets its node and no part of the text is lost.
        morphoblocks += [''] * (len(new_forms) - len(morphoblocks))
        # Defaults inherited from the original node; they do not change while
        # the new nodes are being created, so compute them once.
        default_upos = node.upos
        default_xpos = node.xpos
        default_feats = str(node.feats)
        node.form = forms[0]
        last_node = node
        for form, morpho in zip(new_forms, morphoblocks):
            # The parts were written together in the source text.
            last_node.misc['SpaceAfter'] = 'No'
            last_node.misc['CorrectSpaceAfter'] = 'Yes'
            lemma, upos, xpos, feats = self._parse_morpho(morpho, form, default_upos, default_xpos, default_feats)
            new_node = node.create_child(form=form, lemma=lemma, upos=upos, feats=feats, xpos=xpos, deprel='dep')
            new_node.shift_after_node(last_node)
            last_node = new_node
        # Whatever followed the original token now follows its last part.
        last_node.misc['SpaceAfter'] = space_after
        del node.misc[self.misc_name]

0 commit comments

Comments
 (0)