In [1]:
from huggingface_konlpy.tokenizers_konlpy import KoNLPyPreTokenizer
from konlpy.tag import Mecab

# Sample Korean sentence; later cells reuse `sent_ko` as the input for every
# tokenizer demonstrated in this notebook.
sent_ko = '신종 코로나바이러스 감염증(코로나19) 사태가 심각합니다'
# Wrap a KoNLPy Mecab tagger in the project's pre-tokenizer. `mecab_pretok` is
# reused further down when the pretok-based BERT WordPiece tokenizer is built.
mecab_pretok = KoNLPyPreTokenizer(Mecab())
# The recorded output shows the sentence split into space-separated pieces.
print(mecab_pretok(sent_ko))



신종 코로나 바이러스 감염증 ( 코로나 19 ) 사태 가 심각 합니다


  return torch._C._cuda_getDeviceCount() > 0


In [2]:
from huggingface_konlpy.tokenizers_konlpy import KoNLPyT5WordPieceTrainer, KoNLPyWordPieceTokenizer
from huggingface_konlpy.transformers_konlpy import  KoNLPyT5Tokenizer


In [5]:
# Train a T5-style WordPiece vocabulary on the COVID news sentence file.
# use_tag=False: presumably the Mecab part-of-speech tags are not appended to
# the tokens (compare the tagged/untagged BERT outputs later in the notebook).
t5_mecab_wordpiece_notag_trainer = KoNLPyT5WordPieceTrainer(
    Mecab(), use_tag=False)
t5_mecab_wordpiece_notag_trainer.train(
    files = ['./data/2020-07-29_covid_news_sents.txt'])

Initialize alphabet 1/1: 100%|██████████| 70964/70964 [00:00<00:00, 100311.70it/s]
Train vocab 1/1: 100%|██████████| 70964/70964 [00:12<00:00, 5460.71it/s]


In [6]:
# Persist the trained vocabulary under ./tokenizers/ using the 't5-no-tag'
# prefix; the recorded output lists the written file as t5-no-tag-vocab.txt.
t5_mecab_wordpiece_notag_trainer.save_model('./tokenizers/', 't5-no-tag')


[/home/jovyan/work/huggingface_konlpy/tokenizers/t5-no-tag-vocab.txt]


In [3]:
# Build the T5 tokenizer from the vocabulary file written by the trainer cell.
# NOTE(review): this cell's execution counter (In [3]) is lower than the
# training/saving cells above (In [5]/In [6]), so it was presumably run against
# a vocab file already on disk -- confirm the notebook survives Restart & Run All.
mecab_t5_notag =  KoNLPyT5Tokenizer(
    konlpy_wordpiece = KoNLPyWordPieceTokenizer(Mecab(), use_tag=False),
    vocab_file = './tokenizers/t5-no-tag-vocab.txt'
)

In [4]:
# Encode the sample sentence; the bare expression lets the notebook display the
# resulting list of vocabulary ids.
mecab_t5_notag.encode(sent_ko)

[1103, 1023, 1098, 1109, 1016, 1063, 1024, 1014, 1219, 1011, 2112, 1668]

In [5]:
# Round-trip check: the ids are copied from the encode output above, and the
# recorded result reproduces the original sentence.
mecab_t5_notag.decode([1103, 1023, 1098, 1109, 1016, 1063, 1024, 1014, 1219, 1011, 2112, 1668])

'신종 코로나바이러스 감염증(코로나19) 사태가 심각합니다'

In [12]:
from huggingface_konlpy.tokenizers_konlpy import KoNLPyBertWordPieceTrainer, KoNLPyWordPieceTokenizer
from huggingface_konlpy.transformers_konlpy import  KoNLPyBertTokenizer


# Train a BERT-style WordPiece vocabulary with use_tag=True on the same COVID
# news sentence file, then save it with the 'usetag' prefix. The recorded
# output lists the written file as BertStyleMecab/usetag-vocab.txt.
mecab_wordpiece_usetag_trainer = KoNLPyBertWordPieceTrainer(
    Mecab(), use_tag=True)
mecab_wordpiece_usetag_trainer.train(
    files = ['./data/2020-07-29_covid_news_sents.txt'])
mecab_wordpiece_usetag_trainer.save_model('./tokenizers/BertStyleMecab/', 'usetag')



Initialize alphabet 1/1: 100%|██████████| 70964/70964 [00:00<00:00, 100909.85it/s]
Train vocab 1/1: 100%|██████████| 70964/70964 [00:13<00:00, 5222.35it/s]


[/home/jovyan/work/huggingface_konlpy/tokenizers/BertStyleMecab/usetag-vocab.txt]


In [13]:
# Load the tagged vocabulary into a BERT tokenizer and tokenize the sample
# sentence. In the recorded output most tokens carry a '/TAG' suffix
# (e.g. '신종/NNG') and '##' marks continuation pieces.
mecab_bert_usetag =  KoNLPyBertTokenizer(
    konlpy_wordpiece = KoNLPyWordPieceTokenizer(Mecab(), use_tag=True),
    vocab_file = './tokenizers/BertStyleMecab/usetag-vocab.txt'
)
print(mecab_bert_usetag.tokenize(sent_ko))

['신종/NNG', '코로나/NNG', '##바이러스/NNG', '감염증/NNG', '##(/SSO', '##코로나/NNP', '##19/SN', '##)/SSC', '사태/NNG', '##가/JKS', '심각/XR', '합', '니', '다']


In [14]:
# Same BERT-style pipeline as the tagged variant, but with use_tag=False:
# train on the COVID news sentences and save with the 'notag' prefix.
mecab_wordpiece_notag_trainer = KoNLPyBertWordPieceTrainer(
    Mecab(), use_tag=False)
mecab_wordpiece_notag_trainer.train(
    files = ['./data/2020-07-29_covid_news_sents.txt'])
mecab_wordpiece_notag_trainer.save_model('./tokenizers/BertStyleMecab/', 'notag')

# Load the vocabulary that was just written and tokenize the sample sentence;
# the recorded output shows plain tokens without any '/TAG' suffix.
mecab_bert_notag = KoNLPyBertTokenizer(
    konlpy_wordpiece = KoNLPyWordPieceTokenizer(Mecab(), use_tag=False),
    vocab_file = './tokenizers/BertStyleMecab/notag-vocab.txt'
)
print(mecab_bert_notag.tokenize(sent_ko))

Initialize alphabet 1/1: 100%|██████████| 70964/70964 [00:00<00:00, 99327.07it/s]
Train vocab 1/1: 100%|██████████| 70964/70964 [00:13<00:00, 5291.62it/s]


[/home/jovyan/work/huggingface_konlpy/tokenizers/BertStyleMecab/notag-vocab.txt]
['신종', '코로나', '##바이러스', '감염증', '##(', '##코로나', '##19', '##)', '사태', '##가', '심각', '##합니다']


In [16]:
# Encode with the untagged BERT tokenizer. The recorded ids match the T5
# tokenizer's output with an extra leading 2 and trailing 3 -- presumably the
# [CLS]/[SEP] special-token ids; verify against the vocab file.
mecab_bert_notag.encode(sent_ko)

[2, 1103, 1023, 1098, 1109, 1016, 1063, 1024, 1014, 1219, 1011, 2112, 1668, 3]

In [None]:
# NOTE(review): unexecuted scratch cell (In [None]). No visible cell defines a
# variable named `mecab`, so running this would presumably raise NameError --
# complete or remove it before sharing the notebook.
mecab

In [7]:
import abc

class Vocabulary(metaclass=abc.ABCMeta):
    """Abstract base class for all vocabularies.

    Subclasses must implement ``encode``, ``decode``, ``encode_tf`` and
    ``decode_tf``. The class uses ``abc.ABCMeta`` as its metaclass so that the
    ``@abc.abstractmethod`` markers are actually enforced: instantiating
    ``Vocabulary`` itself, or a subclass that leaves one of those methods
    unimplemented, raises ``TypeError`` at construction time.

    Args:
        extra_ids: value stored and exposed through the ``extra_ids``
            property. Defaults to 0.
    """

    def __init__(self, extra_ids=0):
        self._extra_ids = extra_ids

    @property
    def vocab_size(self):
        """Size of the vocabulary; subclasses are expected to override this.

        Raises:
            NotImplementedError: always, in this base implementation.
        """
        raise NotImplementedError

    @abc.abstractmethod
    def encode(self, s):
        """Convert ``s`` into its id representation (implemented by subclasses)."""
        raise NotImplementedError

    @abc.abstractmethod
    def decode(self, ids):
        """Convert ``ids`` back into text (implemented by subclasses)."""
        raise NotImplementedError

    @abc.abstractmethod
    def encode_tf(self, s):
        """TensorFlow counterpart of ``encode`` (implemented by subclasses)."""
        raise NotImplementedError

    @abc.abstractmethod
    def decode_tf(self, ids):
        """TensorFlow counterpart of ``decode`` (implemented by subclasses)."""
        raise NotImplementedError

    @property
    def extra_ids(self):
        """The ``extra_ids`` value passed to the constructor."""
        return self._extra_ids

In [35]:
import tensorflow as tf

In [48]:
class KorVocabulary(Vocabulary):
    """Vocabulary backed by a Mecab + WordPiece ``KoNLPyT5Tokenizer``.

    The tokenizer is built lazily from ``vocab_file`` on first access of the
    ``tokenizer`` property and cached on the instance afterwards.

    Args:
        vocab_file: path to the vocabulary file handed to ``KoNLPyT5Tokenizer``.
        extra_ids: optional value forwarded to ``Vocabulary.__init__``. When
            ``None`` (the default) nothing is forwarded, so the base class
            default applies.
    """

    def __init__(self, vocab_file, extra_ids=None):
        self._vocab_file = vocab_file
        self._tokenizer = None

        # Only forward extra_ids when the caller supplied one, so the base
        # class keeps control of its own default.
        kwargs = {"extra_ids": extra_ids} if extra_ids is not None else {}
        super().__init__(**kwargs)

    @property
    def tokenizer(self):
        """The cached ``KoNLPyT5Tokenizer``, created on first access."""
        # Compare against None explicitly: a truthiness test would rebuild the
        # tokenizer on every access if the object ever evaluated as falsy
        # (e.g. through a __len__ that returns 0).
        if self._tokenizer is None:
            self._tokenizer = KoNLPyT5Tokenizer(
                konlpy_wordpiece=KoNLPyWordPieceTokenizer(Mecab(), use_tag=False),
                vocab_file=self._vocab_file,
            )
        return self._tokenizer

    @property
    def vocab_size(self):
        """The ``vocab_size`` reported by the underlying tokenizer."""
        return self.tokenizer.vocab_size

    def encode(self, s):
        """Return the result of the underlying tokenizer's ``encode(s)``."""
        return self.tokenizer.encode(s)

    def decode(self, ids):
        """Return the result of the underlying tokenizer's ``decode(ids)``."""
        return self.tokenizer.decode(ids)

    def encode_tf(self, s):
        """Encode ``s`` with ``encode`` and convert the ids to an int32 tensor."""
        ids = self.encode(s)
        return tf.convert_to_tensor(ids, dtype=tf.int32)

    def decode_tf(self, ids):
        """Run ``decode`` through ``tf.py_function`` with ``Tout=tf.string``."""
        return tf.py_function(func=self.decode, inp=[ids], Tout=tf.string)


In [49]:
# Instantiate the wrapper against the T5 no-tag vocabulary. Construction only
# stores the path; the tokenizer is built on first use via the lazy property.
test = KorVocabulary(vocab_file='./tokenizers/t5-no-tag-vocab.txt')

In [50]:
# Smoke test: the recorded ids match those produced by `mecab_t5_notag.encode`.
test.encode(sent_ko)

[1103, 1023, 1098, 1109, 1016, 1063, 1024, 1014, 1219, 1011, 2112, 1668]

In [51]:
# Smoke test: decoding the ids from the previous cell returns the original sentence.
test.decode([1103, 1023, 1098, 1109, 1016, 1063, 1024, 1014, 1219, 1011, 2112, 1668])

'신종 코로나바이러스 감염증(코로나19) 사태가 심각합니다'

In [52]:
# Vocabulary size as reported by the wrapped tokenizer (30000 in the recorded run).
test.vocab_size

30000

In [53]:
# Same ids as `encode`, returned as an int32 tf.Tensor (shape (12,) in the recorded run).
test.encode_tf(sent_ko)

<tf.Tensor: shape=(12,), dtype=int32, numpy=
array([1103, 1023, 1098, 1109, 1016, 1063, 1024, 1014, 1219, 1011, 2112,
       1668], dtype=int32)>

In [57]:
# decode_tf returns a tf.string tensor; .numpy() yields bytes, which are then
# decoded as UTF-8 to recover the Python string shown in the output.
test.decode_tf([1103, 1023, 1098, 1109, 1016, 1063, 1024, 1014, 1219, 1011, 2112, 1668]).numpy().decode('utf-8')

'신종 코로나바이러스 감염증(코로나19) 사태가 심각합니다'

In [2]:
from huggingface_konlpy.tokenizers_konlpy import KoNLPyPretokBertWordPieceTokenizer
from huggingface_konlpy.transformers_konlpy import KoNLPyPretokBertTokenizer

In [3]:
# Build a BERT WordPiece tokenizer on top of the Mecab pre-tokenizer created in
# the first cell (`mecab_pretok` must already exist in the kernel).
mecab_bertwordpiece_tokenizer = KoNLPyPretokBertWordPieceTokenizer(
    konlpy_pretok = mecab_pretok)

# Train on the COVID news sentences, requesting vocab_size = 3000.
mecab_bertwordpiece_tokenizer.train(
    files = ['./data/2020-07-29_covid_news_sents.txt'],
    vocab_size = 3000)
# Save with the 'covid' name; the recorded output lists covid-vocab.txt.
mecab_bertwordpiece_tokenizer.save_model(
    directory='./tokenizers/MecabBertWordPieceTokenizer/',
    name='covid')

['./tokenizers/MecabBertWordPieceTokenizer/covid-vocab.txt']

In [4]:
from huggingface_konlpy import compose
from huggingface_konlpy.transformers_konlpy import KoNLPyPretokBertTokenizer

# Load the vocabulary saved by the previous cell into the transformers-style
# tokenizer, reusing the same Mecab pre-tokenizer.
mecab_pretok_berttokenizer = KoNLPyPretokBertTokenizer(
    konlpy_pretok = mecab_pretok,
    vocab_file = './tokenizers/MecabBertWordPieceTokenizer/covid-vocab.txt')

# Encode the sample sentence, map each id back to its token string through the
# tokenizer's `ids_to_tokens` lookup, and print the tokens space-joined.
# The recorded output includes [CLS] and [SEP] around the sentence tokens.
# NOTE(review): `compose` is a project helper not shown here -- confirm what it
# does to the token list before relying on this output format.
indices = mecab_pretok_berttokenizer.encode(sent_ko)
tokens = [mecab_pretok_berttokenizer.ids_to_tokens[ids] for ids in indices]
print(' '.join(compose(tokens)))

[CLS] 신종 코로나 바이러스 감염증 ( 코로나 19 ) 사태 가 심 ##각 합 ##니다 [SEP]
