In [1]:
# Ignore warnings
import warnings
warnings.simplefilter('ignore')

In [2]:
from toiro import tokenizers
from toiro import datadownloader

In [3]:
# List the corpora available in toiro
corpora = datadownloader.available_corpus()
print(corpora)

['livedoor_news_corpus', 'yahoo_movie_reviews', 'amazon_reviews']


In [4]:
# List the tokenizers available in the current execution environment.
# Only Janome is available by default; the other tokenizers become
# available once they have been installed (e.g. with pip).
tokenizers.available_tokenizers()

{'nagisa': {'is_available': False, 'version': False},
 'janome': {'is_available': True, 'version': '0.3.10'},
 'mecab-python3': {'is_available': False, 'version': False},
 'sudachipy': {'is_available': False, 'version': False},
 'spacy': {'is_available': False, 'version': False},
 'ginza': {'is_available': False, 'version': False},
 'kytea': {'is_available': False, 'version': False},
 'jumanpp': {'is_available': False, 'version': False},
 'sentencepiece': {'is_available': False, 'version': False}}

In [5]:
# Download the livedoor news corpus and load it as pandas DataFrames.
# The corpus is selected by name rather than by its position in `corpora`,
# so a change in the order returned by available_corpus() cannot silently
# switch this notebook to a different dataset.
livedoor_corpus = "livedoor_news_corpus"
assert livedoor_corpus in corpora, "livedoor_news_corpus is missing from available_corpus()"
datadownloader.download_corpus(corpus=livedoor_corpus)
# load_corpus returns three frames, unpacked here as train / dev / test.
train_df, dev_df, test_df = datadownloader.load_corpus(corpus=livedoor_corpus)
print(train_df.head())

livedoor_news_corpus (ldcc-20140209.tar.gz) exists in /home/taishi-i/toiro_resources .
               0                                           1
0    movie-enter         魅力満載なスマホ向け放送局「NOTTV（ノッティーヴィー）」はじまる！
1   sports-watch                   錦織の敗戦に「立派」「力の差あった」など様々な反応
2  kaden-channel  露出を理解してクリエイティブな写真を撮る！プロのテクに近づく露出を学ぶ【カメラ生活】
3  kaden-channel  【ニュース】「OLYMPUS Viewer 2」のバージョンアッププログラム明日公開
4    movie-enter      【DVDエンター！】十字架、ニンニク…ヴァンパイアに対抗する武器といえば何？


In [6]:
# Compare the processing speed of the available tokenizers.
# Column 1 of train_df holds the text (column 0 holds the category label,
# as the head() output above shows), so column 1 is what gets tokenized.
texts = train_df[1]
# The printed report contains the execution environment, statistics about
# the input texts, and the elapsed time for each tokenizer that was run.
report = tokenizers.compare(texts)
print(report)

  1%|          | 47/5900 [00:00<00:12, 460.37it/s]

[1/1] Tokenizer: janome


100%|██████████| 5900/5900 [00:08<00:00, 677.29it/s]

{'execution_environment': {'python_version': '3.7.8.final.0 (64 bit)', 'arch': 'X86_64', 'brand_raw': 'Intel(R) Core(TM) i7-7700K CPU @ 4.20GHz', 'count': 8}, 'data': {'number_of_sentences': 5900, 'average_length': 37.69593220338983}, 'janome': {'elapsed_time': 8.711658239364624}}





In [7]:
# Compare how the available tokenizers segment the same sentence.
# print_words prints one line per tokenizer, with the segmented words
# joined by the given delimiter.
text = "単語分割の結果を比較することができます"
tokenizers.print_words(text, delimiter="|")

       janome: 単語|分割|の|結果|を|比較|する|こと|が|でき|ます
