In [1]:
import sys
sys.path.append("..")

from substring_tagger import SubstringTagger

### I. Test matching without separators

In [2]:
# 'firs' and 'irst' are both contained in 'first', so three rules overlap on
# the first word of the text. The expected output lists only the longest one
# there, plus the single occurrence of 'last'.
text = 'first second last'
rules = {'first', 'firs', 'irst', 'last'}
tagger = SubstringTagger(rules)

expected_output = [
    dict(start=0, end=5, text='first'),
    dict(start=13, end=17, text='last'),
]
assert tagger(text) == expected_output, "Maximal matches must be returned"

### II. Test the effect of separators

In [3]:
# The text holds six occurrences of 'match'. The expected output keeps only
# the three that have '|' or a text boundary on both sides; the occurrences
# that touch a space on either side are left out.
separators = '|'
text = 'match|match| match| match| match |match'
rules = {'match'}
tagger = SubstringTagger(rules, separators=separators)

expected_output = [
    {'start': start, 'end': end, 'text': 'match'}
    for start, end in [(0, 5), (6, 11), (34, 39)]
]
assert tagger(text) == expected_output, "Separators are not correctly handled"

In [4]:
# The separator string contains a space, a comma and a colon. Every occurrence
# of 'match' in the text is bordered on both sides by one of those characters
# or by a text boundary, so all four occurrences are expected.
separators = ' , :'
text = 'match match, :match, match'
rules = {'match'}
tagger = SubstringTagger(rules, separators=separators)

expected_output = [
    {'start': start, 'end': end, 'text': 'match'}
    for start, end in [(0, 5), (6, 11), (14, 19), (21, 26)]
]
assert tagger(text) == expected_output, "Multiple separators do not work"

### III. Test annotations

In [5]:
# Rules are given as a mapping from pattern to a dict of annotations. Each
# expected match carries the key/value pairs of its rule next to the span
# fields; the empty dict for 'last' contributes no extra keys.
text = 'first second last'
rules = {'first': {'a': 1}, 'second': {'b': 2}, 'last': {}}
tagger = SubstringTagger(rules)

expected_outcome = [
    dict(start=0, end=5, text='first', a=1),
    dict(start=6, end=12, text='second', b=2),
    dict(start=13, end=17, text='last'),
]
assert tagger(text) == expected_outcome, "Annotations do not work"