Skip to content

Commit

Permalink
v0.5.3 targeting lunr.js v2.3.3
Browse files Browse the repository at this point in the history
Fix catastrophic backtracking bug on leading wildcards
  • Loading branch information
Yeray Diaz Diaz committed Sep 8, 2018
1 parent e9b03ba commit 03c3280
Show file tree
Hide file tree
Showing 6 changed files with 80 additions and 26 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
# Changelog

## 0.5.3

- Performance improvements on indexing
- Compatibility with Lunr.js 2.3.3:
- Fixes catastrophic backtracking on leading wildcards

## 0.5.2

- Fix Python 2.7 support
Expand Down
4 changes: 2 additions & 2 deletions lunr/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,5 @@

logging.basicConfig(format="%(levelname)-7s - %(message)s")

__VERSION__ = '0.5.2'
__TARGET_JS_VERSION__ = '2.3.2'
__VERSION__ = '0.5.3'
__TARGET_JS_VERSION__ = '2.3.3'
14 changes: 9 additions & 5 deletions lunr/token_set.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,14 +61,21 @@ def __repr__(self):

@classmethod
def from_string(self, string):
"""Creates a TokenSet from a string.
The string may contain one or more wildcard characters (*) that will
allow wildcard matching when intersecting with another TokenSet
"""
node = TokenSet()
root = node
wildcard_found = False

# Iterates through all characters in the passed string, appending
# a node for each character.
# When a wildcard character is found then a self referencing edge
# is introduced to continually match any number of characters
for i, char in enumerate(string):
final = i == len(string) - 1
if char == '*':
wildcard_found = True
node.edges[char] = node
node.final = final
else:
Expand All @@ -77,9 +84,6 @@ def from_string(self, string):
node.edges[char] = next_
node = next_

if wildcard_found:
node.edges['*'] = root

return root

@classmethod
Expand Down
11 changes: 6 additions & 5 deletions requirements/test.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
-e .[languages]
pytest==3.7.3
mock==2.0.0
tox==3.2.1
flake8==3.5.0
coverage==4.5.1
pytest
pytest-timeout
mock
tox
flake8
coverage
2 changes: 1 addition & 1 deletion tests/acceptance_tests/javascript/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
"author": "Yeray Diaz Diaz",
"license": "MIT",
"dependencies": {
"lunr": "2.3.2",
"lunr": "2.3.3",
"lunr-languages": "1.0.0",
"tmp": "0.0.33"
},
Expand Down
69 changes: 56 additions & 13 deletions tests/test_token_set.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,70 +98,70 @@ def test_to_list_includes_single_words(self):

class TestTokenSetIntersect:

def test_intersect_no_intersection(self):
def test_no_intersection(self):
x = TokenSet.from_string('cat')
y = TokenSet.from_string('bar')
z = x.intersect(y)

assert len(z.to_list()) == 0

def test_intersect_simple_intersection(self):
def test_simple_intersection(self):
x = TokenSet.from_string('cat')
y = TokenSet.from_string('cat')
z = x.intersect(y)

assert {'cat'} == set(z.to_list())

def test_intersect_trailing_wildcard_intersection(self):
def test_trailing_wildcard_intersection(self):
x = TokenSet.from_string('cat')
y = TokenSet.from_string('c*')
z = x.intersect(y)

assert {'cat'} == set(z.to_list())

def test_intersect_trailing_wildcard_no_intersection(self):
def test_trailing_wildcard_no_intersection(self):
x = TokenSet.from_string('cat')
y = TokenSet.from_string('b*')
z = x.intersect(y)

assert len(z.to_list()) == 0

def test_intersect_leading_wildcard_intersection(self):
def test_leading_wildcard_intersection(self):
x = TokenSet.from_string('cat')
y = TokenSet.from_string('*t')
z = x.intersect(y)

assert {'cat'} == set(z.to_list())

def test_intersect_leading_wildcard_no_intersection(self):
def test_leading_wildcard_no_intersection(self):
x = TokenSet.from_string('cat')
y = TokenSet.from_string('*r')
z = x.intersect(y)

assert len(z.to_list()) == 0

def test_intersect_contained_wildcard_intersection(self):
def test_contained_wildcard_intersection(self):
x = TokenSet.from_string('foo')
y = TokenSet.from_string('f*o')
z = x.intersect(y)

assert {'foo'} == set(z.to_list())

def test_intersect_contained_wildcard_no_intersection(self):
def test_contained_wildcard_no_intersection(self):
x = TokenSet.from_string('foo')
y = TokenSet.from_string('b*r')
z = x.intersect(y)

assert len(z.to_list()) == 0

def test_intersect_wildcard_zero_or_more_characters(self):
def test_wildcard_zero_or_more_characters(self):
x = TokenSet.from_string('foo')
y = TokenSet.from_string('foo*')
z = x.intersect(y)

assert {'foo'} == set(z.to_list())

def test_intersect_with_fuzzy_string_subtitution(self):
def test_with_fuzzy_string_subtitution(self):
x1 = TokenSet.from_string('bar')
x2 = TokenSet.from_string('cur')
x3 = TokenSet.from_string('cat')
Expand All @@ -175,7 +175,7 @@ def test_intersect_with_fuzzy_string_subtitution(self):
assert x4.intersect(y).to_list() == ['car']
assert x5.intersect(y).to_list() == []

def test_intersect_with_fuzzy_string_deletion(self):
def test_with_fuzzy_string_deletion(self):
x1 = TokenSet.from_string('ar')
x2 = TokenSet.from_string('br')
x3 = TokenSet.from_string('ba')
Expand All @@ -189,7 +189,7 @@ def test_intersect_with_fuzzy_string_deletion(self):
assert x4.intersect(y).to_list() == ['bar']
assert x5.intersect(y).to_list() == []

def test_intersect_with_fuzzy_string_insertion(self):
def test_with_fuzzy_string_insertion(self):
x1 = TokenSet.from_string('bbar')
x2 = TokenSet.from_string('baar')
x3 = TokenSet.from_string('barr')
Expand All @@ -205,7 +205,7 @@ def test_intersect_with_fuzzy_string_insertion(self):
assert x5.intersect(y).to_list() == ['ba']
assert x6.intersect(y).to_list() == []

def test_intersect_with_fuzzy_string_transpose(self):
def test_with_fuzzy_string_transpose(self):
x1 = TokenSet.from_string('abr')
x2 = TokenSet.from_string('bra')
x3 = TokenSet.from_string('foo')
Expand All @@ -214,3 +214,46 @@ def test_intersect_with_fuzzy_string_transpose(self):
assert x1.intersect(y).to_list() == ['abr']
assert x2.intersect(y).to_list() == ['bra']
assert x3.intersect(y).to_list() == []

def test_leading_wildcard_backtracking_intersection(self):
    """A leading wildcard pattern must still match when the literal
    suffix occurs several times as a near-miss in the token."""
    token = TokenSet.from_string('aaacbab')
    pattern = TokenSet.from_string('*ab')

    assert token.intersect(pattern).to_list() == ['aaacbab']

def test_leading_wildcard_backtracking_no_intersection(self):
    """A leading wildcard pattern whose literal suffix never occurs
    yields an empty intersection."""
    token = TokenSet.from_string('aaacbab')
    pattern = TokenSet.from_string('*abc')

    assert token.intersect(pattern).to_list() == []

def test_contained_wildcard_backtracking_intersection(self):
    """A wildcard in the middle of the pattern matches across a
    repeated prefix ('ab' appears twice before the final 'c')."""
    token = TokenSet.from_string('ababc')
    pattern = TokenSet.from_string('a*bc')

    assert token.intersect(pattern).to_list() == ['ababc']

def test_contained_wildcard_backtracking_no_intersection(self):
    """A contained-wildcard pattern with a non-matching tail produces
    no results even though the prefix matches."""
    token = TokenSet.from_string('ababc')
    pattern = TokenSet.from_string('a*ac')

    assert token.intersect(pattern).to_list() == []

@pytest.mark.timeout(2)
def test_catastrophic_backtracking_with_leading_characters(self):
    """Regression test: intersecting a long run of identical characters
    with a leading-wildcard pattern must finish quickly (the timeout
    fails the test if catastrophic backtracking reappears)."""
    token = TokenSet.from_string('f' * 100)
    pattern = TokenSet.from_string('*f')

    assert len(token.intersect(pattern).to_list()) == 1

def test_leading_and_trailing_wildcard_backtracking_intersection(self):
    """A pattern with both a leading and a trailing wildcard ('*ab*')
    matches a token containing the literal core.

    Renamed: the original name had a typo ('atrailing') and collided
    with the following test method, so pytest silently skipped this one
    (the later definition shadowed it in the class body).
    """
    x = TokenSet.from_string('acbaabab')
    y = TokenSet.from_string('*ab*')

    assert x.intersect(y).to_list() == ['acbaabab']

def test_multiple_contained_wildcards_backtracking_intersection(self):
    """A pattern with several contained wildcards ('a*ba*b') matches a
    token requiring the wildcards to absorb intermediate characters.

    Renamed: the original name duplicated the previous test method,
    shadowing it so only one of the two ever ran, and misdescribed the
    pattern (these are contained wildcards, not leading/trailing ones).
    """
    x = TokenSet.from_string('acbaabab')
    y = TokenSet.from_string('a*ba*b')

    assert x.intersect(y).to_list() == ['acbaabab']

0 comments on commit 03c3280

Please sign in to comment.