More compound rules
zverok committed May 28, 2020
1 parent 9b3e112 commit 42063d4
Showing 5 changed files with 86 additions and 61 deletions.
.flake8 (2 changes: 1 addition & 1 deletion)
@@ -1,4 +1,4 @@
[flake8]
max-line-length = 100
max-line-length = 120
show-source = True
count = True
examples/e_lookup.py (6 changes: 4 additions & 2 deletions)
@@ -4,11 +4,13 @@
from spyll.hunspell.dictionary import Dictionary
from spyll.hunspell.algo.lookup import analyze

dic = Dictionary('tests/fixtures/hunspell-orig/checkcompoundcase')
dic = Dictionary('tests/fixtures/hunspell-orig/affixes')

# print(dic.aff.pfx)
# print(list(analyze(dic.aff, dic.dic, 'foo-bar')))
# print(dic.lookup('foo-baz'))
# print(list(analyze(dic.aff, dic.dic, 'foo-baz')))
print(dic.lookup('BAZBar'))
print(dic.lookup('reworked'))

# print(list(analyze(dic.aff, dic.dic, 'implied')))

spyll/hunspell/algo/lookup.py (100 changes: 78 additions & 22 deletions)
@@ -1,21 +1,66 @@
import itertools
import collections
import re
from enum import Enum
from typing import List, Iterator, Union, Optional
import dataclasses
from dataclasses import dataclass

from spyll.hunspell import data
import spyll.hunspell.algo.capitalization as cap
import spyll.hunspell.algo.permutations as pmt


CompoundPos = Enum('CompoundPos', 'BEGIN MIDDLE END')

Paradigm = collections.namedtuple('Paradigm',
['stem', 'prefix', 'suffix', 'prefix2', 'suffix2'],
defaults=[None, None, None, None]
)

@dataclass
class Paradigm:
text: str
stem: str
prefix: Optional[data.aff.Prefix] = None
suffix: Optional[data.aff.Suffix] = None
prefix2: Optional[data.aff.Prefix] = None
suffix2: Optional[data.aff.Suffix] = None

def replace(self, **changes):
return dataclasses.replace(self, **changes)


Compound = List[Paradigm]


def lookup(aff: data.Aff, dic: data.Dic, word: str, *, capitalization=True, allow_nosuggest=True) -> bool:
if aff.forbiddenword and \
any(aff.forbiddenword in w.flags for w in dic.homonyms(word)):
return False

def is_found(variant):
return any(
analyze(aff, dic, variant, capitalization=capitalization, allow_nosuggest=allow_nosuggest)
)

def try_break(text, depth=0):
if depth > 10:
return

yield [text]
for pat in aff.breakpatterns:
for m in re.finditer(pat, text):
start = text[:m.start(1)]
rest = text[m.end(1):]
for breaking in try_break(rest, depth=depth+1):
yield [start, *breaking]

if is_found(word):
return True

for parts in try_break(word):
if all(is_found(part) for part in parts if part):
return True

return False


def analyze(aff: data.Aff, dic: data.Dic, word: str, *,
capitalization=True,
allow_nosuggest=True) -> Iterator[Union[Paradigm, Compound]]:
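
Aside (not part of the diff above): a minimal standalone sketch of the break logic in lookup(). The real helper is nested inside lookup() and takes its patterns from aff.breakpatterns; the single compiled pattern below, a capture group around '-', is a hypothetical stand-in.

import re

def try_break(text, patterns, depth=0):
    # Same shape as the nested helper: yield the unbroken text first, then
    # recursively split at every pattern occurrence (recursion capped at 10).
    if depth > 10:
        return
    yield [text]
    for pat in patterns:
        for m in re.finditer(pat, text):
            rest = text[m.end(1):]
            for breaking in try_break(rest, patterns, depth=depth + 1):
                yield [text[:m.start(1)], *breaking]

print(list(try_break('foo-bar', [re.compile('(-)')])))
# [['foo-bar'], ['foo', 'bar']] -- lookup() accepts a word if every non-empty
# part of some split is found on its own.
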
@@ -66,6 +111,7 @@ def analyze_affixed(

def analyze_compound(aff: data.Aff, dic: data.Dic, word: str,
allow_nosuggest=True) -> Iterator[Compound]:

if aff.compoundbegin or aff.compoundflag:
by_flags = split_compound_by_flags(aff, dic, word, allow_nosuggest=allow_nosuggest)
else:
@@ -78,24 +124,34 @@ def analyze_compound(aff: data.Aff, dic: data.Dic, word: str,
by_rules = iter(())

def bad_compound(compound):

for left_paradigm in compound[:-1]:
left = left_paradigm.text

if aff.compoundforbidflag:
# We don't check the right part: compoundforbid prohibits words at the beginning and in the middle
for dword in dic.homonyms(left):
if aff.compoundforbidflag in dword.flags:
return True

for right_paradigm in compound[1:]:
# FIXME: In fact, full words, not stems!
left = left_paradigm.stem
right = right_paradigm.stem
right = right_paradigm.text
if aff.checkcompoundrep:
for candidate in pmt.replchars(left + right, aff.rep):
if isinstance(candidate, str) and any(analyze_affixed(aff, dic, candidate)):
return True
if aff.checkcompoundtriple:
if len(set(left[-2:] + right[:1])) == 1 or len(set(left[-1:] + right[:2])) == 1:
return True
if aff.checkcompoundcase:
r = right[0]
l = left[-1]
if (r == r.upper() or l == l.upper()) and r != '-' and l != '-':
return True
return False


yield from (compound
for compound in itertools.chain(by_flags, by_rules)
if not bad_compound(compound)
)
for compound in itertools.chain(by_flags, by_rules)
if not bad_compound(compound))


def have_compatible_flags(
Expand Down Expand Up @@ -178,7 +234,7 @@ def _split_affixes(
word: str,
compoundpos: Optional[CompoundPos] = None) -> Iterator[Paradigm]:

yield Paradigm(word) # "Whole word" is always an existing option
yield Paradigm(word, word) # "Whole word" is always an existing option

if compoundpos:
suffix_allowed = compoundpos == CompoundPos.END or aff.compoundpermitflag
@@ -202,7 +258,7 @@

if suffix_allowed and form.prefix.crossproduct:
yield from (
form2._replace(prefix=form.prefix)
form2.replace(prefix=form.prefix)
for form2 in desuffix(aff, form.stem, required_flags=suffix_required_flags, forbidden_flags=forbidden_flags, crossproduct=True)
)

@@ -216,7 +272,7 @@ def desuffix(
crossproduct: bool = False) -> Iterator[Paradigm]:

def good_suffix(suffix):
return crossproduct == suffix.crossproduct and \
return (not crossproduct or suffix.crossproduct) and \
all(f in suffix.flags for f in required_flags) and \
all(f not in suffix.flags for f in forbidden_flags)

@@ -229,15 +285,15 @@ def good_suffix(suffix):
for suffix in possible_suffixes:
stem = suffix.regexp.sub(suffix.strip, word)

yield Paradigm(stem, suffix=suffix)
yield Paradigm(word, stem, suffix=suffix)

if not nested: # only one level depth
for form2 in desuffix(aff, stem,
required_flags=[suffix.flag, *required_flags],
forbidden_flags=forbidden_flags,
nested=True,
crossproduct=crossproduct):
yield form2._replace(suffix2=suffix)
yield form2.replace(suffix2=suffix, text=word)


def deprefix(
@@ -260,15 +316,15 @@ def good_prefix(prefix):
for prefix in possible_prefixes:
stem = prefix.regexp.sub(prefix.strip, word)

yield Paradigm(stem, prefix=prefix)
yield Paradigm(word, stem, prefix=prefix)

# TODO: Only if compound prefixes are allowed in *.aff
if not nested: # only one level depth
for form2 in deprefix(aff, stem,
required_flags=[prefix.flag, *required_flags],
forbidden_flags=forbidden_flags,
nested=True):
yield form2._replace(prefix2=prefix)
yield form2.replace(prefix2=prefix, text=word)


# Compounding details
@@ -324,7 +380,7 @@ def split_compound_by_rules(
parts = [*prev_parts, homonym]
flag_sets = [w.flags for w in parts]
if any(r.fullmatch(flag_sets) for r in compoundrules):
yield [Paradigm(homonym)]
yield [Paradigm(word_rest, word_rest)]

if len(word_rest) < aff.compoundmin * 2 or \
(aff.compoundwordsmax and len(prev_parts) >= aff.compoundwordsmax):
@@ -342,4 +398,4 @@
compoundrules=compoundrules, prev_parts=parts
)
for rest in by_rules:
yield [Paradigm(beg), *rest]
yield [Paradigm(beg, beg), *rest]
spyll/hunspell/data/aff.py (2 changes: 2 additions & 0 deletions)
@@ -125,6 +125,8 @@ class Aff:

checkcompoundcase: bool = False
checkcompounddup: bool = False
checkcompoundrep: bool = False
checkcompoundtriple: bool = False

# IO:
oconv: List[Tuple[str, str]] = field(default_factory=list)
spyll/hunspell/dictionary.py (37 changes: 1 addition & 36 deletions)
@@ -1,4 +1,3 @@
import re
from typing import Iterator

from spyll.hunspell import data, readers
@@ -23,41 +22,7 @@ def roots(self, *,
yield word

def lookup(self, word: str, *, capitalization=True, allow_nosuggest=True) -> bool:
if self.aff.forbiddenword and \
any(self.aff.forbiddenword in w.flags for w in self.dic.homonyms(word)):
return False

def is_found(variant):
return any(
lookup.analyze(
self.aff,
self.dic,
variant,
capitalization=capitalization,
allow_nosuggest=allow_nosuggest
)
)

def try_break(text, depth=0):
if depth > 10:
return

yield [text]
for pat in self.aff.breakpatterns:
for m in re.finditer(pat, text):
start = text[:m.start(1)]
rest = text[m.end(1):]
for breaking in try_break(rest, depth=depth+1):
yield [start, *breaking]

if is_found(word):
return True

for parts in try_break(word):
if all(is_found(part) for part in parts if part):
return True

return False
return lookup.lookup(self.aff, self.dic, word, capitalization=capitalization, allow_nosuggest=allow_nosuggest)

def is_forbidden(self, word: str) -> bool:
if not self.aff.forbiddenword:
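
Aside (not part of the diff above): after this change Dictionary.lookup is a thin wrapper over lookup.lookup, so the public API is unchanged. A usage sketch, reusing the fixture path from examples/e_lookup.py:

from spyll.hunspell.dictionary import Dictionary

dic = Dictionary('tests/fixtures/hunspell-orig/affixes')
print(dic.lookup('reworked'))   # boolean result, same as before the refactoring
print(dic.lookup('reworked', capitalization=False, allow_nosuggest=False))
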
