In [1]:
## importing the Tokenizer class, the four models (BPE, Unigram, WordLevel, WordPiece) and their trainers
from tokenizers import Tokenizer
from tokenizers.models import BPE, Unigram, WordLevel, WordPiece
from tokenizers.trainers import BpeTrainer, WordLevelTrainer, \
                                WordPieceTrainer, UnigramTrainer

## a pretokenizer to segment the text into words
from tokenizers.pre_tokenizers import Whitespace

Algorithm codes accepted by the `alg` argument of the functions below:

`'WLV'` - Word Level algorithm

`'WPC'` - WordPiece algorithm

`'BPE'` - Byte Pair Encoding

`'UNI'` - Unigram

In [2]:
# Token substituted for words that are not in the vocabulary.
unk_token = "<UNK>"
# Special tokens handed to every trainer; the unknown token comes first.
spl_tokens = [unk_token, "<SEP>", "<MASK>", "<CLS>"]

def prepare_tokenizer_trainer(alg,vocab_size=5000):
    """
    Build an untrained tokenizer and its matching trainer for one algorithm.

    Parameters
    ----------
    alg : str
        Algorithm code: 'BPE' (Byte Pair Encoding), 'UNI' (Unigram),
        'WPC' (WordPiece) or 'WLV' (Word Level). Any other value still
        falls back to Word Level, but a warning is emitted so that a
        mistyped code does not go unnoticed.
    vocab_size : int, default 5000
        Passed to the trainer as its ``vocab_size``.

    Returns
    -------
    tuple
        ``(tokenizer, trainer)``. The tokenizer uses the module-level
        ``unk_token`` and a ``Whitespace`` pre-tokenizer; the trainer is
        given the module-level ``spl_tokens`` as special tokens.
    """
    import warnings  # local import keeps this cell self-contained

    if alg == 'BPE':
        tokenizer = Tokenizer(BPE(unk_token=unk_token))
        trainer = BpeTrainer(special_tokens=spl_tokens, vocab_size=vocab_size)
    elif alg == 'UNI':
        # For Unigram the unknown token is supplied to the trainer,
        # not to the model constructor.
        tokenizer = Tokenizer(Unigram())
        trainer = UnigramTrainer(unk_token=unk_token, special_tokens=spl_tokens,
                                 vocab_size=vocab_size)
    elif alg == 'WPC':
        tokenizer = Tokenizer(WordPiece(unk_token=unk_token))
        trainer = WordPieceTrainer(special_tokens=spl_tokens, vocab_size=vocab_size)
    else:
        if alg != 'WLV':
            # Keep the historical fallback, but make it visible.
            warnings.warn(
                "Unknown tokenizer algorithm {!r}; falling back to Word Level "
                "('WLV'). Expected one of 'BPE', 'UNI', 'WPC', 'WLV'.".format(alg),
                stacklevel=2,
            )
        tokenizer = Tokenizer(WordLevel(unk_token=unk_token))
        trainer = WordLevelTrainer(special_tokens=spl_tokens, vocab_size=vocab_size)

    # Split the raw text into words before the model sees it.
    tokenizer.pre_tokenizer = Whitespace()
    return tokenizer, trainer


def train_tokenizer(files, save_path, vocab_size, alg='WLV'):
    """
    Train a tokenizer on text files, save it, and return the reloaded copy.

    Parameters
    ----------
    files : list of str
        Paths of the training text files, passed to ``tokenizer.train``.
    save_path : str
        File the trained tokenizer is written to. Missing parent
        directories are created before training starts.
    vocab_size : int
        Passed to the trainer as its ``vocab_size``.
    alg : str, default 'WLV'
        Algorithm code forwarded to ``prepare_tokenizer_trainer``.

    Returns
    -------
    Tokenizer
        The tokenizer loaded back from ``save_path``.
    """
    import os  # local import keeps this cell self-contained

    # Make sure the output folder exists *before* the (potentially long)
    # training run, so the result is not lost to a missing directory.
    out_dir = os.path.dirname(save_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    tokenizer, trainer = prepare_tokenizer_trainer(alg, vocab_size)
    tokenizer.train(files, trainer)  # training the tokenizer
    tokenizer.save(save_path)
    # Return the version read back from disk, i.e. exactly what was persisted.
    return Tokenizer.from_file(save_path)

In [3]:
# One entry per tokenizer to train from scratch:
# (path the tokenizer is saved to, algorithm code, vocabulary size)
tokenizers = [
    (
        "./trained_models/from_scratch/90_percent_eng/bpe_scratch_5K_90_percent_eng",
        "BPE",
        5000,
    ),
    (
        "./trained_models/from_scratch/90_percent_eng/unigram_scratch_5K_90_percent_eng",
        "UNI",
        5000,
    ),
    (
        "./trained_models/from_scratch/90_percent_eng/wordpiece_scratch_5K_90_percent_eng",
        "WPC",
        5000,
    ),
]

# Training corpus file(s) shared by every run.
files = ["./language_mixed/samanantar_eng_90_percent.txt"]

for save_path, algo, vocab_size in tokenizers:
    # Print the destination first so each run's output can be told apart.
    print(save_path)
    trained_tokenizer = train_tokenizer(files, save_path, vocab_size, algo)
    # TODO: the resulting vocabulary size is not verified against vocab_size yet.

./trained_models/from_scratch/90_percent_eng/bpe_scratch_5K_90_percent_eng



./trained_models/from_scratch/90_percent_eng/unigram_scratch_5K_90_percent_eng


./trained_models/from_scratch/90_percent_eng/wordpiece_scratch_5K_90_percent_eng



