In [1]:
# Ignore warnings
import warnings
warnings.simplefilter('ignore')

In [2]:
# pip install toiro[all_tokenizers]
from toiro import tokenizers
from toiro import datadownloader

In [3]:
# A list of available tokenizers in toiro.
# The displayed dict maps each tokenizer name to an `is_available` flag
# and the `version` string of the installed package.
tokenizers.available_tokenizers()

{'nagisa': {'is_available': True, 'version': '0.2.7'},
 'janome': {'is_available': True, 'version': '0.3.10'},
 'mecab-python3': {'is_available': True, 'version': '0.996'},
 'sudachipy': {'is_available': True, 'version': '0.4.9'},
 'spacy': {'is_available': True, 'version': '2.3.2'},
 'ginza': {'is_available': True, 'version': '2.3.2'},
 'kytea': {'is_available': True, 'version': '0.1.5'},
 'jumanpp': {'is_available': True, 'version': '0.4.1'},
 'sentencepiece': {'is_available': True, 'version': '0.1.91'}}

In [4]:
# toiro includes a sample text file; bind it to `sample_txt` so it can be
# passed to the comparison helpers.
sample_txt = datadownloader.sample_datasets.sample_txt

In [5]:
# Compare the processing speed of the tokenizers on the sample file.
# Progress is printed per tokenizer while it runs; the timings are
# collected into `report`.
report = tokenizers.compare_from_file(sample_txt)

100%|██████████| 439/439 [00:00<00:00, 22748.66it/s]
  0%|          | 0/439 [00:00<?, ?it/s]

[1/9] Tokenizer: mecab-python3
[2/9] Tokenizer: janome


100%|██████████| 439/439 [00:01<00:00, 273.85it/s]
  3%|▎         | 15/439 [00:00<00:03, 124.98it/s]

[3/9] Tokenizer: nagisa


100%|██████████| 439/439 [00:02<00:00, 160.15it/s]
  5%|▌         | 24/439 [00:00<00:01, 238.15it/s]

[4/9] Tokenizer: sudachipy


100%|██████████| 439/439 [00:01<00:00, 311.88it/s]
  5%|▍         | 20/439 [00:00<00:02, 170.63it/s]

[5/9] Tokenizer: spacy


100%|██████████| 439/439 [00:01<00:00, 251.38it/s]
  4%|▍         | 17/439 [00:00<00:02, 167.00it/s]

[6/9] Tokenizer: ginza


100%|██████████| 439/439 [00:01<00:00, 249.01it/s]
100%|██████████| 439/439 [00:00<00:00, 7628.06it/s]
  5%|▌         | 22/439 [00:00<00:01, 218.14it/s]

[7/9] Tokenizer: kytea
[8/9] Tokenizer: jumanpp


100%|██████████| 439/439 [00:00<00:00, 603.80it/s]
100%|██████████| 439/439 [00:00<00:00, 21842.48it/s]

[9/9] Tokenizer: sentencepiece





In [6]:
# The report contains the execution environment, statistics about the data
# (number of sentences, average length), and the elapsed time per tokenizer.
print(report)

{'execution_environment': {'python_version': '3.7.8.final.0 (64 bit)', 'arch': 'X86_64', 'brand_raw': 'Intel(R) Core(TM) i7-7700K CPU @ 4.20GHz', 'count': 8}, 'data': {'number_of_sentences': 439, 'average_length': 81.79043280182232}, 'mecab-python3': {'elapsed_time': 0.019868850708007812}, 'janome': {'elapsed_time': 1.6034212112426758}, 'nagisa': {'elapsed_time': 2.7418212890625}, 'sudachipy': {'elapsed_time': 1.4079265594482422}, 'spacy': {'elapsed_time': 1.7468245029449463}, 'ginza': {'elapsed_time': 1.7634239196777344}, 'kytea': {'elapsed_time': 0.05843210220336914}, 'jumanpp': {'elapsed_time': 0.7275857925415039}, 'sentencepiece': {'elapsed_time': 0.02068305015563965}}


In [7]:
# Compare how each tokenizer segments the same sentence.
# One line is printed per tokenizer, with the words joined by `delimiter`.
text = "都庁所在地は新宿区。"
tokenizers.print_words(text, delimiter="|")

mecab-python3: 都庁|所在地|は|新宿|区|。
       janome: 都庁|所在地|は|新宿|区|。
       nagisa: 都庁|所在|地|は|新宿|区|。
    sudachipy: 都庁|所在地|は|新宿区|。
        spacy: 都庁|所在|地|は|新宿|区|。
        ginza: 都庁|所在地|は|新宿区|。
        kytea: 都庁|所在|地|は|新宿|区|。
      jumanpp: 都庁|所在|地|は|新宿|区|。
sentencepiece: ▁|都|庁|所在地|は|新宿|区|。


In [8]:
# Tokenize the sentence with mecab-python3; the result is a list of word strings.
words = tokenizers.tokenize_mecab(text)
print(words)

['都庁', '所在地', 'は', '新宿', '区', '。']


In [9]:
# Raw analysis output of mecab-python3: one line per token (surface form,
# a tab, then comma-separated features), terminated by an "EOS" line.
tokens = tokenizers.original_mecab(text)
print(tokens)

都庁	名詞,一般,*,*,*,*,都庁,トチョウ,トチョー
所在地	名詞,一般,*,*,*,*,所在地,ショザイチ,ショザイチ
は	助詞,係助詞,*,*,*,*,は,ハ,ワ
新宿	名詞,固有名詞,地域,一般,*,*,新宿,シンジュク,シンジュク
区	名詞,接尾,地域,*,*,*,区,ク,ク
。	記号,句点,*,*,*,*,。,。,。
EOS



In [10]:
# Tokenize the same sentence with sentencepiece; the result is a list of
# subword pieces (the leading '▁' piece is part of the output).
# NOTE: this rebinds `words`, replacing the mecab result.
words = tokenizers.tokenize_sentencepiece(text)
print(words)

['▁', '都', '庁', '所在地', 'は', '新宿', '区', '。']


In [11]:
# Raw sentencepiece output: a pair of parallel lists, the pieces and one
# integer per piece (presumably the vocabulary ids; confirm against toiro docs).
# NOTE: this rebinds `tokens`, replacing the mecab result.
tokens = tokenizers.original_sentencepiece(text)
print(tokens)

[['▁', '都', '庁', '所在地', 'は', '新宿', '区', '。'], [5, 880, 1410, 5812, 6, 4797, 251, 6723]]
