diff --git a/CHANGELOG.md b/CHANGELOG.md index e30aa74..28f0911 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,42 @@ +0.5.0 (2020-06-18) +**************************** + +Features +-------- + +- Added `PREFIX` macro which allows attaching a word in front of list items or words + #47 +- Allow passing variables directly when calling `compile` and `compile_string` + #51 +- Allow compiling (and later loading) rules using the rita CLI while using the standalone engine (spacy is already supported) + #53 +- Added the ability to import rule files into a rule file. Recursive import is supported as well. + #55 +- Added the possibility to define a pattern as a variable and reuse it in other patterns: + + Example: + .. code-block:: RITA + + ComplexNumber = {NUM+, WORD("/")?, NUM?} + + {PATTERN(ComplexNumber), WORD("inches"), WORD("Height")}->MARK("HEIGHT") + + {PATTERN(ComplexNumber), WORD("inches"), WORD("Width")}->MARK("WIDTH") + #64 + +Fix +--- + +- Fix issue with multiple wildcard words when using the standalone engine + #46 +- Don't crash when no rules are provided + #50 +- Fix Number and ANY-OF parsing + #59 +- Allow escape characters inside LITERAL + #62 + + 0.4.0 (2020-01-25) **************************** diff --git a/changes/46.fix.rst b/changes/46.fix.rst deleted file mode 100644 index 8b154f2..0000000 --- a/changes/46.fix.rst +++ /dev/null @@ -1 +0,0 @@ -Fix issue with multiple wildcard words using standalone engine diff --git a/changes/47.feature.rst b/changes/47.feature.rst deleted file mode 100644 index 9cf8eba..0000000 --- a/changes/47.feature.rst +++ /dev/null @@ -1 +0,0 @@ -Added `PREFIX` macro which allows to attach word in front of list items or words diff --git a/changes/50.fix.rst b/changes/50.fix.rst deleted file mode 100644 index 5092be6..0000000 --- a/changes/50.fix.rst +++ /dev/null @@ -1 +0,0 @@ -Don't crash when no rules are provided diff --git a/changes/51.feature.rst b/changes/51.feature.rst deleted file mode 100644 index 826c187..0000000 --- a/changes/51.feature.rst +++ 
/dev/null @@ -1 +0,0 @@ -Allow to pass variables directly when doing `compile` and `compile_string` diff --git a/changes/53.feature.rst b/changes/53.feature.rst deleted file mode 100644 index b686611..0000000 --- a/changes/53.feature.rst +++ /dev/null @@ -1 +0,0 @@ -Allow to compile (and later load) rules using rita CLI while using standalone engine (spacy is already supported) diff --git a/changes/55.feature.rst b/changes/55.feature.rst deleted file mode 100644 index eaa8a56..0000000 --- a/changes/55.feature.rst +++ /dev/null @@ -1 +0,0 @@ -Added ability to import rule files into rule file. Recursive import is supported as well. diff --git a/changes/59.fix.rst b/changes/59.fix.rst deleted file mode 100644 index 27a6570..0000000 --- a/changes/59.fix.rst +++ /dev/null @@ -1 +0,0 @@ -Fix Number and ANY-OF parsing diff --git a/changes/62.fix.rst b/changes/62.fix.rst deleted file mode 100644 index 63b6f2b..0000000 --- a/changes/62.fix.rst +++ /dev/null @@ -1 +0,0 @@ -Allow escape characters inside LITERAL \ No newline at end of file diff --git a/docs/advanced.md b/docs/advanced.md index 8816c24..e73ddaf 100644 --- a/docs/advanced.md +++ b/docs/advanced.md @@ -11,3 +11,14 @@ Eg.: ``` @import "examples/simple-match.rita" ``` + +# Reusing patterns + +Since version 0.5.0, you can define a pattern as a variable and reuse it in other patterns: + +``` +ComplexNumber = {NUM+, WORD("/")?, NUM?} + +{PATTERN(ComplexNumber), WORD("inches"), WORD("Height")}->MARK("HEIGHT") +{PATTERN(ComplexNumber), WORD("inches"), WORD("Width")}->MARK("WIDTH") +``` \ No newline at end of file diff --git a/examples/complex-number.rita b/examples/complex-number.rita new file mode 100644 index 0000000..6ca3ba6 --- /dev/null +++ b/examples/complex-number.rita @@ -0,0 +1,3 @@ +Complex_Number = { NUM+, WORD("/")?, NUM? 
} +{PATTERN(Complex_Number), WORD("inches"), WORD("Width")}->MARK("WIDTH") +{PATTERN(Complex_Number), WORD("inches"), WORD("Height")}->MARK("HEIGHT") \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 8cb0cbe..54fb0a1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "rita-dsl" -version = "0.4.7" +version = "0.5.0" description = "DSL for building language rules" authors = [ "Šarūnas Navickas " diff --git a/rita/__init__.py b/rita/__init__.py index 3b06adc..de87184 100644 --- a/rita/__init__.py +++ b/rita/__init__.py @@ -10,7 +10,7 @@ logger = logging.getLogger(__name__) -__version__ = (0, 4, 7, os.getenv("VERSION_PATCH")) +__version__ = (0, 5, 0, os.getenv("VERSION_PATCH")) def get_version(): diff --git a/rita/engine/translate_spacy.py b/rita/engine/translate_spacy.py index b9c14b4..856998d 100644 --- a/rita/engine/translate_spacy.py +++ b/rita/engine/translate_spacy.py @@ -88,7 +88,7 @@ def phrase_parse(value, config, op=None): def rules_to_patterns(label, data, config): - print(data) + logger.debug(data) return { "label": label, "pattern": [p diff --git a/rita/engine/translate_standalone.py b/rita/engine/translate_standalone.py index 35344d0..fe627af 100644 --- a/rita/engine/translate_standalone.py +++ b/rita/engine/translate_standalone.py @@ -91,7 +91,7 @@ def gen(): yield data[0] for (t, d, op) in data[1:]: - yield (t, d, op) + yield t, d, op return ( label, diff --git a/rita/macros.py b/rita/macros.py index b68d4e8..d624594 100644 --- a/rita/macros.py +++ b/rita/macros.py @@ -1,28 +1,12 @@ import logging import types -from itertools import chain +from rita.utils import flatten logger = logging.getLogger(__name__) -def flatten(lst): - if len(lst) > 1: - return lst - - def explode(v): - if callable(v): - return v() - else: - return v - - new_lst = map(explode, lst) - return chain(*new_lst) - - def resolve_value(obj, config): - context = [] - logger.debug("Resolving value: {0}".format(obj)) if 
isinstance(obj, str): @@ -32,9 +16,7 @@ def resolve_value(obj, config): return obj elif isinstance(obj, list): - for item in obj: - context.append(item) - return context + return obj elif isinstance(obj, types.GeneratorType): return "either", list(obj), None @@ -69,10 +51,8 @@ def ASSIGN(k, v, config, op=None): def IN_LIST(*args, config, op=None): - variants = [] - for arg in flatten(args): - variants.append(resolve_value(arg, config=config)) - return "any_of", variants, None + return "any_of", [resolve_value(arg, config=config) + for arg in flatten(args)], None def PATTERN(*args, config, op=None): diff --git a/rita/parser.py b/rita/parser.py index 326f63c..81da302 100644 --- a/rita/parser.py +++ b/rita/parser.py @@ -195,7 +195,5 @@ def build(self, **kwargs): def parse(self, data): if data.strip() == "": return [] - print(data) - print(r"{}".format(data)) return self.parser.parse(r"{}".format(data), lexer=self.lexer, debug=logger) diff --git a/rita/preprocess.py b/rita/preprocess.py index 9eedb7a..5a7556e 100644 --- a/rita/preprocess.py +++ b/rita/preprocess.py @@ -3,6 +3,7 @@ from functools import reduce from rita.utils import Node, deaccent +from rita.macros import resolve_value logger = logging.getLogger(__name__) @@ -17,7 +18,7 @@ def apply_prefix(pattern, prefix): return (name, list(["{0}{1}".format(prefix, item) for item in args]), op) elif name == "value": - return (name, "{0}{1}".format(prefix, args), op) + return name, "{0}{1}".format(prefix, args), op else: logger.warning("Don't know how to apply prefix on: {}".format(name)) return pattern @@ -35,7 +36,7 @@ def gen(): else: yield p for group_label, pattern in rules: - yield (group_label, list(gen())) + yield group_label, list(gen()) def handle_deaccent(rules, config): @@ -51,7 +52,7 @@ def gen(): if name == "value": (v1, v2) = (args, deaccent(args)) if v1 != v2: - yield ("any_of", (v1, v2,), op) + yield "any_of", (v1, v2,), op else: yield p elif name == "any_of": @@ -64,11 +65,11 @@ def items(): else: 
yield v1 - yield ("any_of", list(items()), op) + yield "any_of", list(items()), op else: yield p - yield (group_label, list(gen())) + yield group_label, list(gen()) def add_implicit_punct(rules, config): @@ -81,12 +82,12 @@ def add_implicit_punct(rules, config): def gen(): for p in pattern: yield p - yield ("punct", None, "?") + yield "punct", None, "?" if len(pattern) == 1: - yield (group_label, pattern) + yield group_label, pattern else: - yield (group_label, list(gen())[:-1]) + yield group_label, list(gen())[:-1] def handle_multi_word(rules, config): @@ -104,11 +105,11 @@ def gen(): for p in pattern: (name, args, op) = p if name == "value" and is_complex(args): - yield ("phrase", args, op) + yield "phrase", args, op else: yield p - yield (group_label, list(gen())) + yield group_label, list(gen()) def is_complex(arg): @@ -175,15 +176,15 @@ def handle_rule_branching(rules, config): if any([p == "either" for (p, _, _) in pattern]): for p in branch_pattern(pattern, config): - yield (group_label, p) + yield group_label, p # Covering case when there are complex items in list elif any([p == "any_of" and has_complex(o) for (p, o, _) in pattern]): for p in branch_pattern(pattern, config): - yield (group_label, p) + yield group_label, p else: - yield (group_label, pattern) + yield group_label, pattern def dummy(rules, config): @@ -195,7 +196,23 @@ def dummy(rules, config): def rule_tuple(d): - return (d["label"], d["data"]) + return d["label"], d["data"] + + +def expand_patterns(rules, config): + """ + We can have situations where inside pattern we have another pattern (via Variable). 
+ We want to expand this inner pattern and prepend to outer pattern + """ + for group_label, pattern in rules: + def gen(): + for p in pattern: + if callable(p): + yield resolve_value(p, config=config) + else: + yield p + + yield group_label, list(gen()) def preprocess_rules(root, config): @@ -205,7 +222,7 @@ def preprocess_rules(root, config): for doc in root if doc and doc()] - pipeline = [dummy, handle_deaccent, handle_rule_branching, handle_multi_word, handle_prefix] + pipeline = [dummy, expand_patterns, handle_deaccent, handle_rule_branching, handle_multi_word, handle_prefix] if config.implicit_punct: logger.info("Adding implicit Punctuations") diff --git a/rita/utils.py b/rita/utils.py index c6b0524..8d1277e 100644 --- a/rita/utils.py +++ b/rita/utils.py @@ -1,7 +1,7 @@ import logging from unicodedata import normalize, category -from itertools import cycle +from itertools import cycle, chain logger = logging.getLogger(__name__) @@ -100,3 +100,20 @@ def deaccent(text): "".join(c for c in normalize("NFD", text) if category(c) != "Mn")) + + +def flatten(lst, shallow=False): + def explode(v): + if callable(v): + return v() + else: + return v + + if len(lst) > 1 and not shallow: + return lst + + new_lst = map(explode, lst) + if shallow: + return new_lst + else: + return chain(*new_lst) diff --git a/tests/test_examples.py b/tests/test_examples.py index e2b8b4f..4125637 100644 --- a/tests/test_examples.py +++ b/tests/test_examples.py @@ -197,3 +197,14 @@ def parse_rows(parser, rows): iterations=3, rounds=3 ) + + +@pytest.mark.parametrize('engine', [spacy_engine, standalone_engine]) +def test_variable_pattern(engine): + parser = engine(load_rules("examples/complex-number.rita")) + text = """ + It is 17 1/2 inches width and 10 inches height + """ + + results = parser(text) + assert len(results) == 2 diff --git a/tests/test_lexer.py b/tests/test_lexer.py index 9da8333..57d4293 100644 --- a/tests/test_lexer.py +++ b/tests/test_lexer.py @@ -126,3 +126,14 @@ def 
test_tokenize_variable_w_escape(): assert tokens[2].type == "LITERAL" assert tokens[4].type == "ARROW" assert tokens[5].type == "KEYWORD" + + +def test_pattern_in_variable(): + lex = RitaLexer() + lex.build() + + tokens = list( + lex.tokenize(r'COMPLEX_NUMBER = {NUM+, WORD("/")?, NUM}') + ) + + assert len(tokens) == 14 diff --git a/tests/test_parser.py b/tests/test_parser.py index 6907dd5..fcc33ca 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -210,3 +210,18 @@ def test_parser_literal_w_escape(config): ) assert len(results) == 1 + + +def test_parser_pattern_in_variable(config): + p = RitaParser(config) + p.build(debug=True) + + results = p.parse( + ''' + Complex_Number = { NUM+, WORD("/")?, NUM? } + {PATTERN(Complex_Number), WORD("inch")}->MARK("WIDTH") + ''' + ) + + print(results) + assert len(results) == 2