Skip to content

Commit

Permalink
add docs for tokenizers
Browse files Browse the repository at this point in the history
  • Loading branch information
GreatYYX committed Jun 3, 2020
1 parent 9678f1c commit 5aaf524
Show file tree
Hide file tree
Showing 3 changed files with 43 additions and 6 deletions.
7 changes: 7 additions & 0 deletions docs/mod_tokenizer.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
Tokenizer
=========

.. automodule:: rltk.tokenizer
:members:
:special-members:
:exclude-members: __dict__, __weakref__, __init__
2 changes: 1 addition & 1 deletion docs/modules.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,4 @@ API Reference
mod_utils.rst
mod_cli.rst
mod_remote.rst
.. mod_tokenizer.rst
mod_tokenizer.rst
40 changes: 35 additions & 5 deletions rltk/tokenizer/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,26 +4,48 @@


class Tokenizer(ABC):
"""
Abstract tokenizer
"""

@abstractmethod
def tokenize(self, s):
def tokenize(self, s: str) -> List[str]:
"""
Apply tokenizer
Args:
s (str): String to tokenize.
Returns:
List[str]: Tokenized list. It won't do token deduplication.
"""
raise NotImplementedError


class CRFTokenizer(Tokenizer):
"""
CRFTokenizer: this uses old DIG CRFTokenizer
"""

def __init__(self, *args, **kwargs):
def __init__(self, *args, **kwargs) -> None:
self._t = dig_tokenizer.CrfTokenizer(*args, **kwargs)

def tokenize(self, s):
def tokenize(self, s: str) -> List[str]:
return self._t.tokenize(s)


class WordTokenizer(Tokenizer):
def __init__(self, remove_empty: bool = False):
"""
Word Tokenizer: tokenizes a string into words by whitespace
Args:
remove_empty (bool, optional): If set, empty tokens will be removed. Defaults to False.
"""

def __init__(self, remove_empty: bool = False) -> None:
self._remove_empty = remove_empty

def tokenize(self, s):
def tokenize(self, s: str) -> List[str]:
s = s.split(' ')
if self._remove_empty:
return list(filter(lambda x: len(x) != 0, s))
Expand All @@ -32,6 +54,14 @@ def tokenize(self, s):


class NGramTokenizer(Tokenizer):
"""
NGram Tokenizer
Args:
n (int): Size of each n-gram.
place_holder (str, optional): String to fill pad and separator. Defaults to white space (' ').
padded (bool, optional): If set, the head and tail will be filled with pad. Defaults to False.
"""

def __init__(self, n: int, place_holder: str = ' ', padded: bool = False,
base_tokenizer: Tokenizer = None) -> None:
Expand Down

0 comments on commit 5aaf524

Please sign in to comment.