Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Parser bugfixes #35

Merged
merged 9 commits into from
Dec 19, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 0 additions & 6 deletions .pre-commit-config.yaml

This file was deleted.

21 changes: 21 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1 +1,22 @@
0.3.2 (2019-12-19)
***********************

Features
--------

- Introduced `towncrier` to track changes
- Added linter `flake8`
- Refactored code to match `pep8`
#32

Fix
---

- Fix WORD split by `-`

- Split by ` ` (empty space) as well

- Coverage score increase
#35


3 changes: 0 additions & 3 deletions changes/32.feature.md

This file was deleted.

16 changes: 11 additions & 5 deletions rita/engine/translate_spacy.py
Original file line number Diff line number Diff line change
def phrase_parse(value, config, op=None):
    """Parse a multi-word phrase into a sequence of spaCy ORTH patterns.

    TODO: Does not support operators

    The first splitter character found in ``value`` ("-" or " ") decides
    how the phrase is broken up.  For "-" the dash itself is kept as a
    token between the words ("a-b" => "a", "-", "b"); for " " only the
    words are yielded.  A value with no splitter is parsed whole.
    """
    splitter = next((s for s in ["-", " "]
                     if s in value), None)
    if splitter:
        buff = value.split(splitter)
        yield next(generic_parse("ORTH", buff[0], config=config, op=None))
        for b in buff[1:]:
            if splitter != " ":
                # Keep the dash as its own token; spaces produce no token.
                yield next(generic_parse("ORTH", splitter, config=config, op=None))
            yield next(generic_parse("ORTH", b, config=config, op=None))
    else:
        # BUGFIX: previously yielded the raw generator object instead of the
        # parsed pattern, inconsistent with every other yield above (and with
        # the pre-refactor behavior, which always yielded next(...)).
        yield next(generic_parse("ORTH", value, config=config, op=None))


PARSERS = {
Expand Down
7 changes: 2 additions & 5 deletions rita/macros.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ def explode(v):
return chain(*new_lst)


def resolve_value(obj, config=None):
def resolve_value(obj, config):
context = []

logger.debug("Resolving value: {0}".format(obj))
Expand All @@ -39,10 +39,7 @@ def resolve_value(obj, config=None):
elif isinstance(obj, types.GeneratorType):
return ("either", list(obj), None)

if config:
return obj(config=config)
else:
return obj()
return obj(config=config)


def ANY(config, op=None):
Expand Down
12 changes: 10 additions & 2 deletions rita/preprocess.py
Original file line number Diff line number Diff line change
def handle_multi_word(rules, config):
    """Rewrite complex (multi-word) values into phrase patterns.

    WORD("Knee-length") => WORD("Knee"), WORD("-"), WORD("length")

    Each ("value", args, op) element whose argument contains a splitter
    character (see `is_complex`) becomes a ("phrase", args, op) element;
    all other elements pass through unchanged.  `config` is accepted for
    signature consistency with the other preprocessing steps.
    """
    for group_label, pattern in rules:
        # A comprehension replaces the per-iteration inner generator
        # function that was defined and immediately materialized before.
        rewritten = [
            ("phrase", args, op)
            if name == "value" and is_complex(args)
            else (name, args, op)
            for (name, args, op) in pattern
        ]
        yield (group_label, rewritten)


def is_complex(arg):
    """Return True if `arg` contains a splitter character ("-" or " ").

    Such values represent multi-word tokens that preprocessing must break
    up into a phrase pattern.
    """
    splitters = ["-", " "]
    # Generator form: no throwaway list inside any().
    return any(s in arg for s in splitters)

Expand Down
29 changes: 25 additions & 4 deletions tests/test_lexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,10 +75,14 @@ def test_tokenize_exec_macro():
def test_tokenize_two_exec_macros():
lex = RitaLexer()
lex.build()
tokens = list(lex.tokenize("""
!CONFIG("setting.1", "1")
!CONFIG("setting.2", "0")
"""))
tokens = list(
lex.tokenize(
"""
!CONFIG("setting.1", "1")
!CONFIG("setting.2", "0")
"""
)
)
assert len(tokens) == 14
assert tokens[0].type == "EXEC"
assert tokens[1].type == "KEYWORD"
Expand All @@ -89,3 +93,20 @@ def test_tokenize_two_exec_macros():
assert tokens[8].type == "KEYWORD"
assert tokens[10].type == "LITERAL"
assert tokens[12].type == "LITERAL"


def test_tokenize_list_w_one_item():
    """A one-item list assignment lexes to NAME, ASSIGN and LITERAL tokens."""
    lexer = RitaLexer()
    lexer.build()

    token_stream = lexer.tokenize(
        """
        members = { "first" }
        """
    )
    tokens = list(token_stream)

    assert tokens[0].type == "NAME"
    assert tokens[1].type == "ASSIGN"
    assert tokens[3].type == "LITERAL"
75 changes: 52 additions & 23 deletions tests/test_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,10 @@ def test_parser_assign_literal_and_ignore_it(config):

results = p.parse(
"""
my_variable = "Test"
my_variable = "Test"

{WORD("something")} -> MARK("TEST")
"""
{WORD("something")} -> MARK("TEST")
"""
)
assert len(results) == 2

Expand All @@ -61,10 +61,10 @@ def test_parser_assign_literal_and_use_it(config):

results = p.parse(
"""
my_variable = "Test"
my_variable = "Test"

{WORD(my_variable)} -> MARK("TEST")
"""
{WORD(my_variable)} -> MARK("TEST")
"""
)
assert len(results) == 2

Expand All @@ -80,8 +80,8 @@ def test_parser_just_assign_macro(config):

results = p.parse(
"""
x = WORD("Test")
"""
x = WORD("Test")
"""
)
assert len(results) == 1

Expand All @@ -92,9 +92,9 @@ def test_parser_assign_two_variables(config):

results = p.parse(
"""
a = "A"
b = "B"
"""
a = "A"
b = "B"
"""
)
assert len(results) == 2

Expand All @@ -105,10 +105,10 @@ def test_parser_assign_macro_and_use_it(config):

results = p.parse(
"""
my_variable = WORD("Test")
my_variable = WORD("Test")

{my_variable} -> MARK("TEST")
"""
{my_variable} -> MARK("TEST")
"""
)
assert len(results) == 2

Expand All @@ -124,10 +124,10 @@ def test_parser_import_module(config):

results = p.parse(
"""
IMPORT("rita.modules.fuzzy") -> EXEC
IMPORT("rita.modules.fuzzy") -> EXEC

FUZZY("test") -> MARK("FUZZY_MATCH")
"""
FUZZY("test") -> MARK("FUZZY_MATCH")
"""
)

assert len(results) == 2
Expand All @@ -140,10 +140,10 @@ def test_parser_import_module_shortcut(config, caplog):

results = p.parse(
"""
!IMPORT("rita.modules.fuzzy")
!IMPORT("rita.modules.fuzzy")

FUZZY("test") -> MARK("FUZZY_MATCH")
"""
FUZZY("test") -> MARK("FUZZY_MATCH")
"""
)

assert len(results) == 2
Expand All @@ -155,10 +155,39 @@ def test_parser_config(config):

p.parse(
"""
!CONFIG("foo", "bar")
!CONFIG("testing", "1")
"""
!CONFIG("foo", "bar")
!CONFIG("testing", "1")
"""
)

assert config.foo == "bar"
assert config.testing


def test_parser_list_w_one_item(config):
    """A single-item list assignment plus one rule parses to two results."""
    parser = RitaParser(config)
    parser.build(debug=True)

    rules = """
    members = { "one" }

    IN_LIST(members) -> MARK("MEMBER")
    """
    results = parser.parse(rules)

    assert len(results) == 2

def test_parser_list_w_two_items(config):
    """A two-item list assignment plus one rule parses to two results."""
    parser = RitaParser(config)
    parser.build(debug=True)

    rules = """
    members = {"one", "two"}

    IN_LIST(members) -> MARK("MEMBER")
    """
    results = parser.parse(rules)

    assert len(results) == 2
64 changes: 64 additions & 0 deletions tests/test_rules.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,33 @@ def compiler(self, rules):
pytest.importorskip("spacy", minversion="2.1")
return rita.compile_string(rules, use_engine="spacy")

def test_punct(self):
    """PUNCT compiles to a single IS_PUNCT spaCy pattern."""
    compiled = self.compiler('PUNCT->MARK("SOME_PUNCT")')
    print(compiled)
    assert len(compiled) == 1
    expected = {
        "pattern": [{"IS_PUNCT": True}],
        "label": "SOME_PUNCT",
    }
    assert compiled[0] == expected

def test_number(self):
    """NUM("42") compiles to a LOWER match on the literal digits."""
    compiled = self.compiler('NUM("42")->MARK("SOME_NUMBER")')
    print(compiled)
    assert len(compiled) == 1
    expected = {
        "pattern": [{"LOWER": "42"}],
        "label": "SOME_NUMBER",
    }
    assert compiled[0] == expected

def test_pos(self):
    """POS("VERB") compiles to a part-of-speech spaCy pattern."""
    compiled = self.compiler('POS("VERB")->MARK("SOME_POS")')
    print(compiled)
    assert len(compiled) == 1
    expected = {
        "pattern": [{"POS": "VERB"}],
        "label": "SOME_POS",
    }
    assert compiled[0] == expected

def test_single_word(self):
rules = self.compiler('WORD("Test")->MARK("SOME_LABEL")')
print(rules)
Expand Down Expand Up @@ -142,6 +169,31 @@ def test_double_branching_list(self):
"pattern": [{"LOWER": "test"}, {"LOWER": "-"}, {"LOWER": "5"}]
}

def test_word_with_spaces(self):
    """A WORD containing a space is split into two word tokens."""
    compiled = self.compiler('''
    WORD("test1 test2")->MARK("SPLIT_WORD")
    ''')
    print(compiled)
    # It should be split into two: WORD("test1"), WORD("test2")
    assert len(compiled) == 1
    expected = {
        "label": "SPLIT_WORD",
        "pattern": [{"LOWER": "test1"}, {"LOWER": "test2"}],
    }
    assert compiled[0] == expected

def test_word_with_dash(self):
    """A WORD containing a dash is split, keeping the dash as a token."""
    compiled = self.compiler('''
    WORD("test1-test2")->MARK("SPLIT_WORD")
    ''')
    print(compiled)
    # It should be split into two: WORD("test1"), WORD("test2")
    assert len(compiled) == 1
    expected = {
        "label": "SPLIT_WORD",
        "pattern": [{"LOWER": "test1"}, {"LOWER": "-"}, {"LOWER": "test2"}],
    }
    assert compiled[0] == expected



class TestStandalone(object):
@property
Expand All @@ -155,6 +207,18 @@ def flags(self):
def compiler(self, rules):
return rita.compile_string(rules, use_engine="standalone").patterns

def test_punct(self):
    """Standalone engine compiles PUNCT to a punctuation regex group."""
    compiled = self.compiler('PUNCT->MARK("SOME_PUNCT")')
    print(compiled)
    assert len(compiled) == 1
    expected = re.compile(r"(?P<SOME_PUNCT>[.,!;?:])", self.flags)
    assert compiled[0] == expected

def test_number(self):
    """Standalone engine compiles NUM("42") to a literal regex group."""
    compiled = self.compiler('NUM("42")->MARK("SOME_NUMBER")')
    print(compiled)
    assert len(compiled) == 1
    expected = re.compile(r"(?P<SOME_NUMBER>(42))", self.flags)
    assert compiled[0] == expected

def test_single_word(self):
rules = self.compiler('WORD("Test")->MARK("SOME_LABEL")')
print(rules)
Expand Down