From 610ae9c5d8bf9ea8994f6de316aa02444c04812a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Navickas?= Date: Fri, 28 Aug 2020 15:43:45 +0300 Subject: [PATCH 01/14] Wireup new engine --- rita/config.py | 2 + rita/engine/translate_rust.py | 88 +++++++++++++++++++++++++++++++++++ tests/test_examples.py | 7 +-- tests/utils.py | 14 ++++++ 4 files changed, 108 insertions(+), 3 deletions(-) create mode 100644 rita/engine/translate_rust.py diff --git a/rita/config.py b/rita/config.py index 2c3f169..e012010 100644 --- a/rita/config.py +++ b/rita/config.py @@ -8,6 +8,7 @@ pass from rita.engine.translate_standalone import compile_rules as standalone_engine +from rita.engine.translate_rust import compile_rules as rust_engine from rita.utils import SingletonMixin @@ -27,6 +28,7 @@ def __init__(self): # spacy_engine is not imported pass self.register_engine(2, "standalone", standalone_engine) + self.register_engine(3, "rust", rust_engine) def register_engine(self, priority, key, compile_fn): self.available_engines.append((priority, key, compile_fn)) diff --git a/rita/engine/translate_rust.py b/rita/engine/translate_rust.py new file mode 100644 index 0000000..bb55ed7 --- /dev/null +++ b/rita/engine/translate_rust.py @@ -0,0 +1,88 @@ +import os +import logging + +from ctypes import * + +from rita.engine.translate_standalone import rules_to_patterns, RuleExecutor + +logger = logging.getLogger(__name__) + + +class ResultEntity(Structure): + _fields_ = [ + ("label", c_char_p), + ("text", c_char_p), + ("start", c_size_t), + ("end", c_size_t), + ] + + +class ResultsWrapper(Structure): + _fields_ = [ + ("count", c_uint), + ("results", (ResultEntity * 32)) + ] + + +class Context(Structure): + _fields_ = [] + + +def load_lib(): + try: + if "nt" in os.name: + lib = cdll.LoadLibrary("rita_rust.dll") + elif os.name == "posix": + lib = cdll.LoadLibrary("librita_rust.dylib") + else: + lib = cdll.LoadLibrary("librita_rust.so") + lib.compile.restype = POINTER(Context) + 
lib.execute.argtypes = [POINTER(Context), c_char_p] + lib.execute.restype = ResultsWrapper + lib.clean_env.argtypes = [POINTER(Context)] + return lib + except Exception as ex: + logger.error("Failed to load rita-rust library, reason: {}\n\n" + "Most likely you don't have required shared library to use it".format(ex)) + + +class RustRuleExecutor(RuleExecutor): + def __init__(self, patterns, config): + self.config = config + self.context = None + + self.lib = load_lib() + self.patterns = [self._build_regex_str(label, rules) + for label, rules in patterns] + + self.compile() + + @staticmethod + def _build_regex_str(label, rules): + return r"(?P<{0}>{1})".format(label, "".join(rules)) + + def compile(self): + c_array = (c_char_p * len(self.patterns))(*list([p.encode("UTF-8") for p in self.patterns])) + self.context = self.lib.compile(c_array, len(c_array)) + return self.context + + def _results(self, text): + raw = self.lib.execute(self.context, text.encode("UTF-8")) + for i in range(0, raw.count): + match = raw.results[i] + yield { + "start": match.start, + "end": match.end, + "text": match.text.decode("UTF-8").strip(), + "label": match.label, + } + + def clean_context(self): + self.lib.clean_env(self.context) + + +def compile_rules(rules, config, **kwargs): + logger.info("Using rita-rust rule implementation") + patterns = [rules_to_patterns(*group) for group in rules] + executor = RustRuleExecutor(patterns, config) + return executor diff --git a/tests/test_examples.py b/tests/test_examples.py index 6f1373e..339cf89 100644 --- a/tests/test_examples.py +++ b/tests/test_examples.py @@ -3,7 +3,7 @@ import rita -from utils import spacy_engine, standalone_engine, load_rules +from utils import spacy_engine, standalone_engine, rust_engine, load_rules @pytest.fixture(scope="session") @@ -215,7 +215,7 @@ def test_compile_context(): } -@pytest.mark.parametrize('engine', [spacy_engine, standalone_engine]) +@pytest.mark.parametrize('engine', [spacy_engine, standalone_engine, 
rust_engine]) def test_benchmark(benchmark, engine, bench_text): """ These tests will only run if parameters: @@ -248,7 +248,7 @@ def test_variable_pattern(engine): assert len(results) == 2 -@pytest.mark.parametrize('engine', [spacy_engine, standalone_engine]) +@pytest.mark.parametrize('engine', [spacy_engine, standalone_engine, rust_engine]) def test_inlist_longest(engine): parser = engine(""" units = {"m", "mm", "cm"} @@ -285,6 +285,7 @@ def test_inlist_word_based(engine): @pytest.mark.parametrize('engine', [standalone_engine, spacy_engine]) def test_pluralize(engine): + pytest.importorskip("inflect") parser = engine(""" !IMPORT("rita.modules.pluralize") diff --git a/tests/utils.py b/tests/utils.py index 69612d4..ff2f3b0 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -34,6 +34,20 @@ def parse(text): return parse +def rust_engine(rules, **kwargs): + from rita.engine.translate_rust import load_lib + l = load_lib() + if l is None: + pytest.skip("Missing rita-rust dynamic lib, skipping related tests") + parser = rita.compile_string(rules, use_engine="rust", **kwargs) + print(parser.patterns) + + def parse(text): + results = list(parser.execute(text)) + return list([(r["text"], r["label"]) for r in results]) + return parse + + def normalize_output(r): return re.sub(r"\s+", " ", r.strip().replace("\n", "")) From 5f6c4bd32bd36842c6f9ae85d3abbe1247f348dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Navickas?= Date: Fri, 28 Aug 2020 16:00:00 +0300 Subject: [PATCH 02/14] Include Rust engine in more tests --- tests/test_examples.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/tests/test_examples.py b/tests/test_examples.py index 339cf89..46f6e3a 100644 --- a/tests/test_examples.py +++ b/tests/test_examples.py @@ -30,7 +30,7 @@ def test_color_car(engine): assert entities.issuperset(expected) -@pytest.mark.parametrize('engine', [spacy_engine, standalone_engine]) +@pytest.mark.parametrize('engine', 
[spacy_engine, standalone_engine, rust_engine]) def test_fuzzy_matching(engine): parser = engine(load_rules("examples/fuzzy-matching.rita")) @@ -55,7 +55,7 @@ def test_fuzzy_matching(engine): assert entities[0] == ("SQUIRREL", "CRITTER") -@pytest.mark.parametrize('engine', [spacy_engine, standalone_engine]) +@pytest.mark.parametrize('engine', [spacy_engine, standalone_engine, rust_engine]) def test_election(engine): parser = engine( """ @@ -75,7 +75,7 @@ def test_election(engine): assert entities.issuperset(expected) -@pytest.mark.parametrize('engine', [spacy_engine, standalone_engine]) +@pytest.mark.parametrize('engine', [spacy_engine, standalone_engine, rust_engine]) def test_dash_case(engine): parser = engine(load_rules("examples/dress-match.rita")) text = """ @@ -92,7 +92,7 @@ def test_dash_case(engine): assert entities.issuperset(expected) -@pytest.mark.parametrize('engine', [spacy_engine, standalone_engine]) +@pytest.mark.parametrize('engine', [spacy_engine, standalone_engine, rust_engine]) def test_exclude_word(engine): parser = engine(load_rules("examples/excluding-word.rita")) @@ -106,13 +106,13 @@ def test_exclude_word(engine): assert len(r2) == 0 -@pytest.mark.parametrize('engine', [spacy_engine, standalone_engine]) +@pytest.mark.parametrize('engine', [spacy_engine, standalone_engine, rust_engine]) def test_escape_string(engine): # If it compiles - good enough engine(load_rules("examples/match-with-escaped-string.rita")) -@pytest.mark.parametrize('engine', [spacy_engine, standalone_engine]) +@pytest.mark.parametrize('engine', [spacy_engine, standalone_engine, rust_engine]) def test_case_sensitive(engine): parser = engine( """ @@ -140,7 +140,7 @@ def test_case_sensitive(engine): assert filtered[0] == ("Bitcoin Cash", "CRYPTO") -@pytest.mark.parametrize('engine', [spacy_engine, standalone_engine]) +@pytest.mark.parametrize('engine', [spacy_engine, standalone_engine, rust_engine]) def test_with_implicit_hyphon(engine): parser = engine( """ @@ -159,7 +159,7 
@@ def test_with_implicit_hyphon(engine): assert results[0] == ("Hello - world", "HYPHON_LABEL") -@pytest.mark.parametrize('engine', [spacy_engine, standalone_engine]) +@pytest.mark.parametrize('engine', [spacy_engine, standalone_engine, rust_engine]) def test_without_implicit_hyphon(engine): parser = engine( """ @@ -178,7 +178,7 @@ def test_without_implicit_hyphon(engine): assert results[0] == ("Hello", "HELLO_LABEL") -@pytest.mark.parametrize('engine', [spacy_engine, standalone_engine]) +@pytest.mark.parametrize('engine', [spacy_engine, standalone_engine, rust_engine]) def test_prefix(engine): parser = engine( """ @@ -237,7 +237,7 @@ def parse_rows(parser, rows): ) -@pytest.mark.parametrize('engine', [spacy_engine, standalone_engine]) +@pytest.mark.parametrize('engine', [spacy_engine, standalone_engine, rust_engine]) def test_variable_pattern(engine): parser = engine(load_rules("examples/complex-number.rita")) text = """ @@ -267,7 +267,7 @@ def test_inlist_longest(engine): assert result == "width 10 mm" -@pytest.mark.parametrize('engine', [standalone_engine]) +@pytest.mark.parametrize('engine', [standalone_engine, rust_engine]) def test_inlist_word_based(engine): parser = engine(""" units = {"m", "mm", "cm", "inches", "in"} @@ -283,7 +283,7 @@ def test_inlist_word_based(engine): assert len(results) == 0 -@pytest.mark.parametrize('engine', [standalone_engine, spacy_engine]) +@pytest.mark.parametrize('engine', [standalone_engine, spacy_engine, rust_engine]) def test_pluralize(engine): pytest.importorskip("inflect") parser = engine(""" From 5455b85b25370fc2c190cbf03e4a2a4a0f56e9ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Navickas?= Date: Fri, 28 Aug 2020 16:00:42 +0300 Subject: [PATCH 03/14] Include new engine --- tests/test_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_config.py b/tests/test_config.py index 52e42c1..59f3296 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -20,7 +20,7 @@ def 
test_registered_engines(cfg): def test_registered_engines_has_spacy(cfg): pytest.importorskip("spacy", minversion="2.1") from rita.engine.translate_spacy import compile_rules - assert len(cfg.available_engines) == 2 + assert len(cfg.available_engines) == 3 assert cfg.default_engine == compile_rules From 8df3c774ccc20557135ca3b1ed5416bb68571764 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Navickas?= Date: Fri, 28 Aug 2020 16:06:42 +0300 Subject: [PATCH 04/14] Few tweaks --- rita/engine/translate_rust.py | 2 +- tests/test_examples.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/rita/engine/translate_rust.py b/rita/engine/translate_rust.py index bb55ed7..ab98431 100644 --- a/rita/engine/translate_rust.py +++ b/rita/engine/translate_rust.py @@ -74,7 +74,7 @@ def _results(self, text): "start": match.start, "end": match.end, "text": match.text.decode("UTF-8").strip(), - "label": match.label, + "label": match.label.decode("UTF-8"), } def clean_context(self): diff --git a/tests/test_examples.py b/tests/test_examples.py index 46f6e3a..8108bac 100644 --- a/tests/test_examples.py +++ b/tests/test_examples.py @@ -92,7 +92,7 @@ def test_dash_case(engine): assert entities.issuperset(expected) -@pytest.mark.parametrize('engine', [spacy_engine, standalone_engine, rust_engine]) +@pytest.mark.parametrize('engine', [spacy_engine, standalone_engine]) def test_exclude_word(engine): parser = engine(load_rules("examples/excluding-word.rita")) @@ -106,7 +106,7 @@ def test_exclude_word(engine): assert len(r2) == 0 -@pytest.mark.parametrize('engine', [spacy_engine, standalone_engine, rust_engine]) +@pytest.mark.parametrize('engine', [spacy_engine, standalone_engine]) def test_escape_string(engine): # If it compiles - good enough engine(load_rules("examples/match-with-escaped-string.rita")) From a9175a96b6eb18d6639a45653f018ead80380059 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Navickas?= Date: Fri, 28 Aug 2020 19:58:31 +0300 
Subject: [PATCH 05/14] Deal with raw strings --- rita/engine/translate_rust.py | 2 +- tests/test_examples.py | 2 +- tests/utils.py | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/rita/engine/translate_rust.py b/rita/engine/translate_rust.py index ab98431..65ced1f 100644 --- a/rita/engine/translate_rust.py +++ b/rita/engine/translate_rust.py @@ -59,7 +59,7 @@ def __init__(self, patterns, config): @staticmethod def _build_regex_str(label, rules): - return r"(?P<{0}>{1})".format(label, "".join(rules)) + return r"(?P<{0}>{1})".format(label, "".join(rules)).encode().decode("unicode-escape") def compile(self): c_array = (c_char_p * len(self.patterns))(*list([p.encode("UTF-8") for p in self.patterns])) diff --git a/tests/test_examples.py b/tests/test_examples.py index 8108bac..73a76f6 100644 --- a/tests/test_examples.py +++ b/tests/test_examples.py @@ -106,7 +106,7 @@ def test_exclude_word(engine): assert len(r2) == 0 -@pytest.mark.parametrize('engine', [spacy_engine, standalone_engine]) +@pytest.mark.parametrize('engine', [spacy_engine, standalone_engine, rust_engine]) def test_escape_string(engine): # If it compiles - good enough engine(load_rules("examples/match-with-escaped-string.rita")) diff --git a/tests/utils.py b/tests/utils.py index ff2f3b0..4135a5a 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -39,6 +39,7 @@ def rust_engine(rules, **kwargs): l = load_lib() if l is None: pytest.skip("Missing rita-rust dynamic lib, skipping related tests") + print("Trying to run: {}".format(rules)) parser = rita.compile_string(rules, use_engine="rust", **kwargs) print(parser.patterns) From e6e40a71b955c95bb44a8b4bf8c00a3edeab293c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Navickas?= Date: Sat, 29 Aug 2020 10:01:45 +0300 Subject: [PATCH 06/14] Few minor tweaks --- rita/engine/translate_rust.py | 6 ++++-- tests/test_examples.py | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/rita/engine/translate_rust.py 
b/rita/engine/translate_rust.py index 65ced1f..98c0e48 100644 --- a/rita/engine/translate_rust.py +++ b/rita/engine/translate_rust.py @@ -59,11 +59,13 @@ def __init__(self, patterns, config): @staticmethod def _build_regex_str(label, rules): - return r"(?P<{0}>{1})".format(label, "".join(rules)).encode().decode("unicode-escape") + # return r"(?P<{0}>{1})".format(label, "".join(rules)).encode().decode("unicode-escape") + return r"(?P<{0}>{1})".format(label, "".join(rules)) def compile(self): + flag = 0 if self.config.ignore_case else 1 c_array = (c_char_p * len(self.patterns))(*list([p.encode("UTF-8") for p in self.patterns])) - self.context = self.lib.compile(c_array, len(c_array)) + self.context = self.lib.compile(c_array, len(c_array), flag) return self.context def _results(self, text): diff --git a/tests/test_examples.py b/tests/test_examples.py index 73a76f6..bd53be2 100644 --- a/tests/test_examples.py +++ b/tests/test_examples.py @@ -30,7 +30,7 @@ def test_color_car(engine): assert entities.issuperset(expected) -@pytest.mark.parametrize('engine', [spacy_engine, standalone_engine, rust_engine]) +@pytest.mark.parametrize('engine', [spacy_engine, standalone_engine]) def test_fuzzy_matching(engine): parser = engine(load_rules("examples/fuzzy-matching.rita")) From 08caceba4c929d07680b8daa2c3d53e57559e8c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Navickas?= Date: Sat, 29 Aug 2020 10:02:20 +0300 Subject: [PATCH 07/14] Version bump --- pyproject.toml | 2 +- rita/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 5aa1dbf..75e30d6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "rita-dsl" -version = "0.5.10" +version = "0.6.0" description = "DSL for building language rules" authors = [ "Šarūnas Navickas " diff --git a/rita/__init__.py b/rita/__init__.py index 16ad15f..3541ac4 100644 --- a/rita/__init__.py +++ b/rita/__init__.py @@ -10,7 +10,7 @@ 
logger = logging.getLogger(__name__) -__version__ = (0, 5, 10, os.getenv("VERSION_PATCH")) +__version__ = (0, 6, 0, os.getenv("VERSION_PATCH")) def get_version(): From c2a4e6a42be9bc86170d5f7961f517d9535a19b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Navickas?= Date: Sat, 29 Aug 2020 10:04:18 +0300 Subject: [PATCH 08/14] style fix --- tests/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/utils.py b/tests/utils.py index 4135a5a..6678d93 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -36,8 +36,8 @@ def parse(text): def rust_engine(rules, **kwargs): from rita.engine.translate_rust import load_lib - l = load_lib() - if l is None: + lib = load_lib() + if lib is None: pytest.skip("Missing rita-rust dynamic lib, skipping related tests") print("Trying to run: {}".format(rules)) parser = rita.compile_string(rules, use_engine="rust", **kwargs) From ff09e99172edc5add38add467027683321890b5b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Navickas?= Date: Sat, 29 Aug 2020 10:05:38 +0300 Subject: [PATCH 09/14] Another style fix --- rita/engine/translate_rust.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/rita/engine/translate_rust.py b/rita/engine/translate_rust.py index 98c0e48..7c97719 100644 --- a/rita/engine/translate_rust.py +++ b/rita/engine/translate_rust.py @@ -1,7 +1,7 @@ import os import logging -from ctypes import * +from ctypes import (c_char_p, c_size_t, c_uint, Structure, cdll, POINTER) from rita.engine.translate_standalone import rules_to_patterns, RuleExecutor @@ -59,7 +59,6 @@ def __init__(self, patterns, config): @staticmethod def _build_regex_str(label, rules): - # return r"(?P<{0}>{1})".format(label, "".join(rules)).encode().decode("unicode-escape") return r"(?P<{0}>{1})".format(label, "".join(rules)) def compile(self): From 61a77dbae46bb9fe3f4dbee88ea04c6236dff494 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Navickas?= Date: Sat, 29 Aug 2020 
10:31:47 +0300 Subject: [PATCH 10/14] Basic docs --- docs/engines.md | 36 +++++++++++++++++++++++++++++++ docs/modules.md | 56 +++++++++++++++++++++++++++++++++++++++++++++++++ mkdocs.yml | 2 ++ 3 files changed, 94 insertions(+) create mode 100644 docs/engines.md create mode 100644 docs/modules.md diff --git a/docs/engines.md b/docs/engines.md new file mode 100644 index 0000000..381ae07 --- /dev/null +++ b/docs/engines.md @@ -0,0 +1,36 @@ +# Engines + +In RITA what we call `engine` is a system we will compile rules to, and which will do the heavy lifting after that. + +Currently there are three engines: + +## spaCy + +Activated by using `rita.compile(, use_engine="spacy")` + +Using this engine, all of the RITA rules will be compiled into spaCy patterns, which can be natively used by spaCy in various scenarios. +Most often - to improve NER (Named Entity Recognition), by adding additional entities derived from your given rules + +It requires to have spaCy package installed (`pip install spacy`) and to actually use it later, language model needs to be downloaded (`python -m spacy download `) + +## Standalone + +Activated by using `rita.compile(, use_engine="standalone")`. It compiles into pure regex and can be used with zero dependencies. +By default, it uses Python `re` library. Since `0.5.10` version, you can give a custom regex implementation to use: +eg. regex package: `rita.compile(, use_engine="standalone", regex_impl=regex)` + +It is very lightweight, very fast (compared to spaCy), however lacking in some functionality which only proper language model can bring: +- Patterns by entity (PERSON, ORGANIZATION, etc) +- Patterns by Lemmas +- Patterns by POS (Part Of Speech) + +Only generic things, like WORD, NUMBER can be matched. + + +## Rust (new in `0.6.0`) + +There's only an interface inside the code, engine itself is proprietary. 
+ +In general it's identical to `standalone`, but differs in one crucial part - all of the rules are compiled into actual binary code and that provides large performance boost. +It is proprietary, because there are various caveats, engine itself is a bit more fragile and needs to be tinkered to be optimized to very specific case +(eg. few long texts with many matches vs a lot of short texts with few matches). \ No newline at end of file diff --git a/docs/modules.md b/docs/modules.md new file mode 100644 index 0000000..dffcaa1 --- /dev/null +++ b/docs/modules.md @@ -0,0 +1,56 @@ +# Modules + +Modules are like plugins to the system, usually providing additional functionality at some cost - needs additional dependencies, supports only specific language etc. +That's why they are not included into the core system, but can be easily included into your rules. + +eg. +``` +!IMPORT("rita.modules.fuzzy") + +FUZZY("squirrel") -> MARK("CRITTER") +``` + +**NOTE**: the import path can be any proper Python import. So this actually allows you to add extra functionality without modifying RITA's source code. +More on that in [Extending section](./extend.md) + +## Fuzzy + +This is more of an example than a proper module. The main goal is to generate possible misspelled variants of given word, so that match matches more cases. +Very useful when dealing with actual natural language, eg. comments, social media posts. Word `you` can be automatically matched by proper `you` and `u`, `for` as `for` and `4` etc. + +Usage: +``` +!IMPORT("rita.modules.fuzzy") + +FUZZY("squirrel") -> MARK("CRITTER") +``` + +## Pluralize + +Takes a list of words (or a single word), and creates plural version of each of these. + +Requires: `inflect` library (`pip install inflect`) before using. Works only on English words. 
+ +Usage: + +``` +!IMPORT("rita.modules.pluralize") + +vehicles={"car", "motorbike", "bicycle", "ship", "plane"} +{NUM, PLURALIZE(vehicles)}->MARK("VEHICLES") +``` + +## Tag + +Is used for generating POS/TAG patterns based on a Regex +e.g. TAG("^NN|^JJ") for nouns or adjectives. + +Works only with spaCy engine + +Usage: + +``` +!IMPORT("rita.modules.tag") + +{WORD*, TAG("^NN|^JJ")}->MARK("TAGGED_MATCH") +``` \ No newline at end of file diff --git a/mkdocs.yml b/mkdocs.yml index 9d032bb..22b40cc 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -7,6 +7,8 @@ nav: - Quickstart: quickstart.md - Syntax: syntax.md - Macros: macros.md + - Engines: engines.md + - Modules: modules.md - Extending: extend.md - Config: config.md - Advanced: advanced.md From 0e7c4d7d5add1da19d486db0d12096fcd85cd4ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Navickas?= Date: Sat, 29 Aug 2020 10:38:50 +0300 Subject: [PATCH 11/14] Ignore rust engine file --- .coveragerc | 2 ++ changes/87.feature.rst | 5 +++++ 2 files changed, 7 insertions(+) create mode 100644 changes/87.feature.rst diff --git a/.coveragerc b/.coveragerc index b0b6c42..66d7a86 100644 --- a/.coveragerc +++ b/.coveragerc @@ -3,5 +3,7 @@ branch = True source = rita +omit = rita/engines/translate_rust.py + [report] show_missing = True diff --git a/changes/87.feature.rst b/changes/87.feature.rst new file mode 100644 index 0000000..2993edb --- /dev/null +++ b/changes/87.feature.rst @@ -0,0 +1,5 @@ +An interface to be able to use rust engine. + +In general it's identical to `standalone`, but differs in one crucial part - all of the rules are compiled into actual binary code and that provides large performance boost. +It is proprietary, because there are various caveats, engine itself is a bit more fragile and needs to be tinkered to be optimized to very specific case +(eg. few long texts with many matches vs a lot short texts with few matches). 
\ No newline at end of file From ab16635274dc02a7dd9f17865dd7196bd7de4bd0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Navickas?= Date: Sat, 29 Aug 2020 10:41:30 +0300 Subject: [PATCH 12/14] Full changelog entry --- CHANGELOG.md | 58 ++++++++++++++++++++++++++++++++++++++++++ changes/66.feature.rst | 10 -------- changes/71.fix.rst | 1 - changes/72.fix.rst | 1 - changes/75.fix.rst | 1 - changes/77.fix.rst | 1 - changes/80.fix.rst | 1 - changes/81.feature.rst | 5 ---- changes/82.feature.rst | 5 ---- changes/84.feature.rst | 4 --- changes/86.feature.rst | 1 - changes/87.feature.rst | 5 ---- 12 files changed, 58 insertions(+), 35 deletions(-) delete mode 100644 changes/66.feature.rst delete mode 100644 changes/71.fix.rst delete mode 100644 changes/72.fix.rst delete mode 100644 changes/75.fix.rst delete mode 100644 changes/77.fix.rst delete mode 100644 changes/80.fix.rst delete mode 100644 changes/81.feature.rst delete mode 100644 changes/82.feature.rst delete mode 100644 changes/84.feature.rst delete mode 100644 changes/86.feature.rst delete mode 100644 changes/87.feature.rst diff --git a/CHANGELOG.md b/CHANGELOG.md index 28f0911..0b81b4a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,61 @@ +0.6.0 (2020-08-29) +**************************** + +Features +-------- + +- Implemented ability to alias macros, eg.: + + .. code-block:: + + numbers = {"one", "two", "three"} + @alias IN_LIST IL + + IL(numbers) -> MARK("NUMBER") + + Now using "IL" will actually call "IN_LIST" macro. + #66 +- introduce the TAG element as a module. Needs a new parser for the SpaCy translate. + Would allow more flexible matching of detailed part-of-speech tag, like all adjectives or nouns: TAG("^NN|^JJ"). + + Implemented by: + Roland M. Mueller (https://github.com/rolandmueller) + #81 +- Add a new module for a PLURALIZE tag + For a noun or a list of nouns, it will match any singular or plural word. + + Implemented by: + Roland M. 
Mueller (https://github.com/rolandmueller) + #82 +- Add a new Configuration implicit_hyphon (default false) for automatically adding hyphon characters - to the rules. + + Implemented by: + Roland M. Mueller (https://github.com/rolandmueller) + #84 +- Allow to give custom regex impl. By default `re` is used + #86 +- An interface to be able to use rust engine. + + In general it's identical to `standalone`, but differs in one crucial part - all of the rules are compiled into actual binary code and that provides large performance boost. + It is proprietary, because there are various caveats, engine itself is a bit more fragile and needs to be tinkered to be optimized to very specific case + (eg. few long texts with many matches vs a lot short texts with few matches). + #87 + +Fix +--- + +- Fix `-` bug when it is used as stand alone word + #71 +- Fix regex matching, when shortest word is selected from IN_LIST + #72 +- Fix IN_LIST regex so that it wouldn't take part of word + #75 +- Fix IN_LIST operation bug - it was ignoring them + #77 +- Use list branching only when using spaCy Engine + #80 + + 0.5.0 (2020-06-18) **************************** diff --git a/changes/66.feature.rst b/changes/66.feature.rst deleted file mode 100644 index a285de5..0000000 --- a/changes/66.feature.rst +++ /dev/null @@ -1,10 +0,0 @@ -Implemented ability to alias macros, eg.: - -.. code-block:: - - numbers = {"one", "two", "three"} - @alias IN_LIST IL - - IL(numbers) -> MARK("NUMBER") - -Now using "IL" will actually call "IN_LIST" macro. 
\ No newline at end of file diff --git a/changes/71.fix.rst b/changes/71.fix.rst deleted file mode 100644 index a003bc4..0000000 --- a/changes/71.fix.rst +++ /dev/null @@ -1 +0,0 @@ -Fix `-` bug when it is used as stand alone word \ No newline at end of file diff --git a/changes/72.fix.rst b/changes/72.fix.rst deleted file mode 100644 index ec2fbb8..0000000 --- a/changes/72.fix.rst +++ /dev/null @@ -1 +0,0 @@ -Fix regex matching, when shortest word is selected from IN_LIST \ No newline at end of file diff --git a/changes/75.fix.rst b/changes/75.fix.rst deleted file mode 100644 index 8f3096d..0000000 --- a/changes/75.fix.rst +++ /dev/null @@ -1 +0,0 @@ -Fix IN_LIST regex so that it wouldn't take part of word \ No newline at end of file diff --git a/changes/77.fix.rst b/changes/77.fix.rst deleted file mode 100644 index 62132f3..0000000 --- a/changes/77.fix.rst +++ /dev/null @@ -1 +0,0 @@ -Fix IN_LIST operation bug - it was ignoring them \ No newline at end of file diff --git a/changes/80.fix.rst b/changes/80.fix.rst deleted file mode 100644 index f1f0e32..0000000 --- a/changes/80.fix.rst +++ /dev/null @@ -1 +0,0 @@ -Use list branching only when using spaCy Engine \ No newline at end of file diff --git a/changes/81.feature.rst b/changes/81.feature.rst deleted file mode 100644 index 5d933b2..0000000 --- a/changes/81.feature.rst +++ /dev/null @@ -1,5 +0,0 @@ -introduce the TAG element as a module. Needs a new parser for the SpaCy translate. -Would allow more flexible matching of detailed part-of-speech tag, like all adjectives or nouns: TAG("^NN|^JJ"). - -Implemented by: -Roland M. Mueller (https://github.com/rolandmueller) diff --git a/changes/82.feature.rst b/changes/82.feature.rst deleted file mode 100644 index 700f258..0000000 --- a/changes/82.feature.rst +++ /dev/null @@ -1,5 +0,0 @@ -Add a new module for a PLURALIZE tag -For a noun or a list of nouns, it will match any singular or plural word. - -Implemented by: -Roland M. 
Mueller (https://github.com/rolandmueller) \ No newline at end of file diff --git a/changes/84.feature.rst b/changes/84.feature.rst deleted file mode 100644 index fd43b8a..0000000 --- a/changes/84.feature.rst +++ /dev/null @@ -1,4 +0,0 @@ -Add a new Configuration implicit_hyphon (default false) for automatically adding hyphon characters - to the rules. - -Implemented by: -Roland M. Mueller (https://github.com/rolandmueller) diff --git a/changes/86.feature.rst b/changes/86.feature.rst deleted file mode 100644 index 695af9a..0000000 --- a/changes/86.feature.rst +++ /dev/null @@ -1 +0,0 @@ -Allow to give custom regex impl. By default `re` is used \ No newline at end of file diff --git a/changes/87.feature.rst b/changes/87.feature.rst deleted file mode 100644 index 2993edb..0000000 --- a/changes/87.feature.rst +++ /dev/null @@ -1,5 +0,0 @@ -An interface to be able to use rust engine. - -In general it's identical to `standalone`, but differs in one crucial part - all of the rules are compiled into actual binary code and that provides large performance boost. -It is proprietary, because there are various caveats, engine itself is a bit more fragile and needs to be tinkered to be optimized to very specific case -(eg. few long texts with many matches vs a lot short texts with few matches). 
\ No newline at end of file From 9a866c5f76c3631d0ba7c92cf9bb4defc09a0b7e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Navickas?= Date: Sat, 29 Aug 2020 10:47:14 +0300 Subject: [PATCH 13/14] Ignore file in report too --- .coveragerc | 1 + 1 file changed, 1 insertion(+) diff --git a/.coveragerc b/.coveragerc index 66d7a86..bf3d254 100644 --- a/.coveragerc +++ b/.coveragerc @@ -7,3 +7,4 @@ omit = rita/engines/translate_rust.py [report] show_missing = True +omit = rita/engines/translate_rust.py From 4b5387c061fe3365557aab9168290f3d4e1fa07b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Navickas?= Date: Sat, 29 Aug 2020 10:51:13 +0300 Subject: [PATCH 14/14] Oops, fix path --- .coveragerc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.coveragerc b/.coveragerc index bf3d254..d4a703b 100644 --- a/.coveragerc +++ b/.coveragerc @@ -3,8 +3,8 @@ branch = True source = rita -omit = rita/engines/translate_rust.py +omit = rita/engine/translate_rust.py [report] show_missing = True -omit = rita/engines/translate_rust.py +omit = rita/engine/translate_rust.py