Skip to content

Commit

Permalink
v0.5.3 targeting lunr.js v2.3.3
Browse files Browse the repository at this point in the history
Fix catastrophic backtracking bug on leading wildcards
  • Loading branch information
Yeray Diaz Diaz committed Sep 8, 2018
1 parent e9b03ba commit 03c3280
Show file tree
Hide file tree
Showing 6 changed files with 80 additions and 26 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
# Changelog

## 0.5.3

- Performance improvements on indexing
- Compatibility with Lunr.js 2.3.3:
- Fixes catastrophic backtracking on leading wildcards

## 0.5.2

- Fix Python 2.7 support
Expand Down
4 changes: 2 additions & 2 deletions lunr/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,5 @@

logging.basicConfig(format="%(levelname)-7s - %(message)s")

__VERSION__ = '0.5.2'
__TARGET_JS_VERSION__ = '2.3.2'
__VERSION__ = '0.5.3'
__TARGET_JS_VERSION__ = '2.3.3'
14 changes: 9 additions & 5 deletions lunr/token_set.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,14 +61,21 @@ def __repr__(self):

@classmethod
def from_string(self, string):
"""Creates a TokenSet from a string.
The string may contain one or more wildcard characters (*) that will
allow wildcard matching when intersecting with another TokenSet
"""
node = TokenSet()
root = node
wildcard_found = False

# Iterates through all characters in the passed string, appending
# a node for each character.
# When a wildcard character is found then a self referencing edge
# is introduced to continually match any number of characters
for i, char in enumerate(string):
final = i == len(string) - 1
if char == '*':
wildcard_found = True
node.edges[char] = node
node.final = final
else:
Expand All @@ -77,9 +84,6 @@ def from_string(self, string):
node.edges[char] = next_
node = next_

if wildcard_found:
node.edges['*'] = root

return root

@classmethod
Expand Down
11 changes: 6 additions & 5 deletions requirements/test.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
-e .[languages]
pytest==3.7.3
mock==2.0.0
tox==3.2.1
flake8==3.5.0
coverage==4.5.1
pytest
pytest-timeout
mock
tox
flake8
coverage
2 changes: 1 addition & 1 deletion tests/acceptance_tests/javascript/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
"author": "Yeray Diaz Diaz",
"license": "MIT",
"dependencies": {
"lunr": "2.3.2",
"lunr": "2.3.3",
"lunr-languages": "1.0.0",
"tmp": "0.0.33"
},
Expand Down
69 changes: 56 additions & 13 deletions tests/test_token_set.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,70 +98,70 @@ def test_to_list_includes_single_words(self):

class TestTokenSetIntersect:

def test_intersect_no_intersection(self):
def test_no_intersection(self):
x = TokenSet.from_string('cat')
y = TokenSet.from_string('bar')
z = x.intersect(y)

assert len(z.to_list()) == 0

def test_intersect_simple_intersection(self):
def test_simple_intersection(self):
x = TokenSet.from_string('cat')
y = TokenSet.from_string('cat')
z = x.intersect(y)

assert {'cat'} == set(z.to_list())

def test_intersect_trailing_wildcard_intersection(self):
def test_trailing_wildcard_intersection(self):
x = TokenSet.from_string('cat')
y = TokenSet.from_string('c*')
z = x.intersect(y)

assert {'cat'} == set(z.to_list())

def test_intersect_trailing_wildcard_no_intersection(self):
def test_trailing_wildcard_no_intersection(self):
x = TokenSet.from_string('cat')
y = TokenSet.from_string('b*')
z = x.intersect(y)

assert len(z.to_list()) == 0

def test_intersect_leading_wildcard_intersection(self):
def test_leading_wildcard_intersection(self):
x = TokenSet.from_string('cat')
y = TokenSet.from_string('*t')
z = x.intersect(y)

assert {'cat'} == set(z.to_list())

def test_intersect_leading_wildcard_no_intersection(self):
def test_leading_wildcard_no_intersection(self):
x = TokenSet.from_string('cat')
y = TokenSet.from_string('*r')
z = x.intersect(y)

assert len(z.to_list()) == 0

def test_intersect_contained_wildcard_intersection(self):
def test_contained_wildcard_intersection(self):
x = TokenSet.from_string('foo')
y = TokenSet.from_string('f*o')
z = x.intersect(y)

assert {'foo'} == set(z.to_list())

def test_intersect_contained_wildcard_no_intersection(self):
def test_contained_wildcard_no_intersection(self):
x = TokenSet.from_string('foo')
y = TokenSet.from_string('b*r')
z = x.intersect(y)

assert len(z.to_list()) == 0

def test_intersect_wildcard_zero_or_more_characters(self):
def test_wildcard_zero_or_more_characters(self):
x = TokenSet.from_string('foo')
y = TokenSet.from_string('foo*')
z = x.intersect(y)

assert {'foo'} == set(z.to_list())

def test_intersect_with_fuzzy_string_subtitution(self):
def test_with_fuzzy_string_subtitution(self):
x1 = TokenSet.from_string('bar')
x2 = TokenSet.from_string('cur')
x3 = TokenSet.from_string('cat')
Expand All @@ -175,7 +175,7 @@ def test_intersect_with_fuzzy_string_subtitution(self):
assert x4.intersect(y).to_list() == ['car']
assert x5.intersect(y).to_list() == []

def test_intersect_with_fuzzy_string_deletion(self):
def test_with_fuzzy_string_deletion(self):
x1 = TokenSet.from_string('ar')
x2 = TokenSet.from_string('br')
x3 = TokenSet.from_string('ba')
Expand All @@ -189,7 +189,7 @@ def test_intersect_with_fuzzy_string_deletion(self):
assert x4.intersect(y).to_list() == ['bar']
assert x5.intersect(y).to_list() == []

def test_intersect_with_fuzzy_string_insertion(self):
def test_with_fuzzy_string_insertion(self):
x1 = TokenSet.from_string('bbar')
x2 = TokenSet.from_string('baar')
x3 = TokenSet.from_string('barr')
Expand All @@ -205,7 +205,7 @@ def test_intersect_with_fuzzy_string_insertion(self):
assert x5.intersect(y).to_list() == ['ba']
assert x6.intersect(y).to_list() == []

def test_intersect_with_fuzzy_string_transpose(self):
def test_with_fuzzy_string_transpose(self):
x1 = TokenSet.from_string('abr')
x2 = TokenSet.from_string('bra')
x3 = TokenSet.from_string('foo')
Expand All @@ -214,3 +214,46 @@ def test_intersect_with_fuzzy_string_transpose(self):
assert x1.intersect(y).to_list() == ['abr']
assert x2.intersect(y).to_list() == ['bra']
assert x3.intersect(y).to_list() == []

def test_leading_wildcard_backtracking_intersection(self):
    """A leading wildcard pattern must still match when the literal
    suffix occurs several times as a near-miss in the token."""
    token = TokenSet.from_string('aaacbab')
    pattern = TokenSet.from_string('*ab')

    assert token.intersect(pattern).to_list() == ['aaacbab']

def test_leading_wildcard_backtracking_no_intersection(self):
    """A leading wildcard pattern whose literal suffix never occurs
    yields an empty intersection."""
    token = TokenSet.from_string('aaacbab')
    pattern = TokenSet.from_string('*abc')

    assert token.intersect(pattern).to_list() == []

def test_contained_wildcard_backtracking_intersection(self):
    """A wildcard in the middle of the pattern matches across a
    repeated prefix ('ab' appears twice before the final 'c')."""
    token = TokenSet.from_string('ababc')
    pattern = TokenSet.from_string('a*bc')

    assert token.intersect(pattern).to_list() == ['ababc']

def test_contained_wildcard_backtracking_no_intersection(self):
    """A contained-wildcard pattern with a non-matching tail produces
    no results even though the prefix matches."""
    token = TokenSet.from_string('ababc')
    pattern = TokenSet.from_string('a*ac')

    assert token.intersect(pattern).to_list() == []

@pytest.mark.timeout(2)
def test_catastrophic_backtracking_with_leading_characters(self):
    """Regression test: intersecting a long run of identical characters
    with a leading-wildcard pattern must finish quickly (the timeout
    fails the test if catastrophic backtracking reappears)."""
    token = TokenSet.from_string('f' * 100)
    pattern = TokenSet.from_string('*f')

    assert len(token.intersect(pattern).to_list()) == 1

def test_leading_and_trailing_wildcard_backtracking_intersection(self):
    """A pattern with both a leading and a trailing wildcard ('*ab*')
    matches a token containing the literal core.

    Renamed: the original name had a typo ('atrailing') and collided
    with the following test method, so pytest silently skipped this one
    (the later definition shadowed it in the class body).
    """
    x = TokenSet.from_string('acbaabab')
    y = TokenSet.from_string('*ab*')

    assert x.intersect(y).to_list() == ['acbaabab']

def test_multiple_contained_wildcards_backtracking_intersection(self):
    """A pattern with several contained wildcards ('a*ba*b') matches a
    token requiring the wildcards to absorb intermediate characters.

    Renamed: the original name duplicated the previous test method,
    shadowing it so only one of the two ever ran, and misdescribed the
    pattern (these are contained wildcards, not leading/trailing ones).
    """
    x = TokenSet.from_string('acbaabab')
    y = TokenSet.from_string('a*ba*b')

    assert x.intersect(y).to_list() == ['acbaabab']

0 comments on commit 03c3280

Please sign in to comment.