Skip to content

Commit

Permalink
internationalize word boundary checks
Browse files Browse the repository at this point in the history
  • Loading branch information
aseifert committed Mar 19, 2018
1 parent 5591859 commit 9b6b187
Showing 1 changed file with 10 additions and 35 deletions.
45 changes: 10 additions & 35 deletions flashtext/keyword.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import os
import string
import io
import re


class KeywordProcessor(object):
Expand All @@ -9,8 +10,6 @@ class KeywordProcessor(object):
Attributes:
_keyword (str): Used as key to store keywords in trie dictionary.
Defaults to '_keyword_'
non_word_boundaries (set(str)): Characters that will determine if the word is continuing.
Defaults to set([A-Za-z0-9_])
keyword_trie_dict (dict): Trie dict built character by character, that is used for lookup
Defaults to empty dictionary
case_sensitive (boolean): if the search algorithm should be case sensitive or not.
Expand All @@ -35,20 +34,16 @@ class KeywordProcessor(object):
* Idea came from this `Stack Overflow Question <https://stackoverflow.com/questions/44178449/regex-replace-is-taking-time-for-millions-of-documents-how-to-make-it-faster>`_.
"""

# Matches one non-word character. With re.UNICODE, \W is the complement of \w,
# i.e. anything that is not alphanumeric (per the Unicode database) or an
# underscore, so accented and non-Latin letters count as part of a word.
NON_WORD_CHAR_REGEX = re.compile(r'\W', re.UNICODE)


def __init__(self, case_sensitive=False):
"""
Args:
case_sensitive (boolean): Keyword search should be case sensitive set or not.
Defaults to False
"""
self._keyword = '_keyword_'
self._white_space_chars = set(['.', '\t', '\n', '\a', ' ', ','])
try:
# python 2.x
self.non_word_boundaries = set(string.digits + string.letters + '_')
except AttributeError:
# python 3.x
self.non_word_boundaries = set(string.digits + string.ascii_letters + '_')
self.keyword_trie_dict = dict()
self.case_sensitive = case_sensitive
self._terms_in_trie = 0
Expand Down Expand Up @@ -202,26 +197,6 @@ def __iter__(self):
"""
raise NotImplementedError("Please use get_all_keywords() instead")

def set_non_word_boundaries(self, non_word_boundaries):
"""set of characters that will be considered as part of word.
Args:
non_word_boundaries (set(str)):
Set of characters that will be considered as part of word.
"""
self.non_word_boundaries = non_word_boundaries

def add_non_word_boundary(self, character):
"""add a character that will be considered as part of word.
Args:
character (char):
Character that will be considered as part of word.
"""
self.non_word_boundaries.add(character)

def add_keyword(self, keyword, clean_name=None):
"""To add one or more keywords to the dictionary
pass the keyword and the clean name it maps to.
Expand Down Expand Up @@ -482,7 +457,7 @@ def extract_keywords(self, sentence, span_info=False):
while idx < sentence_len:
char = sentence[idx]
# when we reach a character that might denote word end
if char not in self.non_word_boundaries:
if KeywordProcessor.NON_WORD_CHAR_REGEX.match(char):

# if end is present in current_dict
if self._keyword in current_dict or char in current_dict:
Expand All @@ -502,7 +477,7 @@ def extract_keywords(self, sentence, span_info=False):
idy = idx + 1
while idy < sentence_len:
inner_char = sentence[idy]
if inner_char not in self.non_word_boundaries and self._keyword in current_dict_continued:
if KeywordProcessor.NON_WORD_CHAR_REGEX.match(inner_char) and self._keyword in current_dict_continued:
# update longest sequence found
longest_sequence_found = current_dict_continued[self._keyword]
sequence_end_pos = idy
Expand Down Expand Up @@ -540,7 +515,7 @@ def extract_keywords(self, sentence, span_info=False):
idy = idx + 1
while idy < sentence_len:
char = sentence[idy]
if char not in self.non_word_boundaries:
if KeywordProcessor.NON_WORD_CHAR_REGEX.match(char):
break
idy += 1
idx = idy
Expand Down Expand Up @@ -594,7 +569,7 @@ def replace_keywords(self, sentence):
char = sentence[idx]
current_word += orig_sentence[idx]
# when we reach whitespace
if char not in self.non_word_boundaries:
if KeywordProcessor.NON_WORD_CHAR_REGEX.match(char):
current_white_space = char
# if end is present in current_dict
if self._keyword in current_dict or char in current_dict:
Expand All @@ -615,7 +590,7 @@ def replace_keywords(self, sentence):
while idy < sentence_len:
inner_char = sentence[idy]
current_word_continued += orig_sentence[idy]
if inner_char not in self.non_word_boundaries and self._keyword in current_dict_continued:
if KeywordProcessor.NON_WORD_CHAR_REGEX.match(inner_char) and self._keyword in current_dict_continued:
# update longest sequence found
current_white_space = inner_char
longest_sequence_found = current_dict_continued[self._keyword]
Expand Down Expand Up @@ -663,7 +638,7 @@ def replace_keywords(self, sentence):
while idy < sentence_len:
char = sentence[idy]
current_word += orig_sentence[idy]
if char not in self.non_word_boundaries:
if KeywordProcessor.NON_WORD_CHAR_REGEX.match(char):
break
idy += 1
idx = idy
Expand Down

0 comments on commit 9b6b187

Please sign in to comment.