In [8]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.17.1-py3-none-any.whl (536 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m536.7/536.7 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dill, multiprocess, datasets
Successfully installed datasets-2.17.1 dill-0.3.8 multiprocess-0.70.16


In [30]:
from datasets import load_dataset

# Pull the Mongolian book corpus from the Hugging Face Hub (all available splits).
dataset = load_dataset(path="ugshanyu/Book-Mongol")

Downloading data:   0%|          | 0.00/252M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/63.0M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

In [31]:
# Show the DatasetDict summary: available splits, their columns and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['title', 'content'],
        num_rows: 72439
    })
    validation: Dataset({
        features: ['title', 'content'],
        num_rows: 18110
    })
})

In [32]:
# Dump the `content` field of every validation example into a plain-text
# training corpus, one example per write followed by a newline.
# The file is opened in "w" mode (not "a") so that re-running this cell
# rewrites the corpus instead of appending a duplicate copy of the data,
# which would silently skew the tokenizer's training statistics.
with open("toy.txt", "w", encoding="utf-8") as f:
    for item in dataset["validation"]:
        f.write(item["content"] + "\n")

In [33]:
# Train a SentencePiece BPE model on the dumped corpus.
# The settings here are (best effort) those used for training Llama 2.
import os

# Imported here because this cell is the first one to use `spm`; without it
# the cell only works when a later cell has already been executed.
import sentencepiece as spm

options = dict(
  # input spec
  input="toy.txt",
  input_format="text",
  # output spec
  # NOTE: later cells load 'tok400.model', so this prefix must stay in sync
  # with them (the prefix says 400 although vocab_size below is 4000).
  model_prefix="tok400", # output filename prefix
  # algorithm spec
  # BPE alg
  model_type="bpe",
  vocab_size=4000,
  # normalization
  normalization_rule_name="identity", # turn off normalization
  remove_extra_whitespaces=False,
  input_sentence_size=2000000000, # max number of training sentences
  max_sentence_length=8384, # max number of bytes per sentence
  seed_sentencepiece_size=1000000,
  shuffle_input_sentence=True,
  # rare word treatment
  character_coverage=0.99995,
  byte_fallback=True, # emit <0xNN> byte pieces for uncovered characters
  # merge rules
  split_digits=True,
  split_by_unicode_script=True,
  split_by_whitespace=True,
  split_by_number=True,
  max_sentencepiece_length=64,
  add_dummy_prefix=True,
  allow_whitespace_only_pieces=True,
  # special tokens
  unk_id=0, # the UNK token MUST exist
  bos_id=1, # the others are optional, set to -1 to turn off
  eos_id=2,
  pad_id=-1,
  # systems
  num_threads=os.cpu_count(), # use ~all system resources
)

# Writes tok400.model / tok400.vocab next to the notebook.
spm.SentencePieceTrainer.train(**options)


In [28]:
# Load the freshly trained model and pair every piece with its integer id.
sp = spm.SentencePieceProcessor()
sp.load('tok400.model')

vocab = []
for idx in range(sp.get_piece_size()):
    vocab.append([sp.id_to_piece(idx), idx])

vocab

[['<unk>', 0],
 ['<s>', 1],
 ['</s>', 2],
 ['<0x00>', 3],
 ['<0x01>', 4],
 ['<0x02>', 5],
 ['<0x03>', 6],
 ['<0x04>', 7],
 ['<0x05>', 8],
 ['<0x06>', 9],
 ['<0x07>', 10],
 ['<0x08>', 11],
 ['<0x09>', 12],
 ['<0x0A>', 13],
 ['<0x0B>', 14],
 ['<0x0C>', 15],
 ['<0x0D>', 16],
 ['<0x0E>', 17],
 ['<0x0F>', 18],
 ['<0x10>', 19],
 ['<0x11>', 20],
 ['<0x12>', 21],
 ['<0x13>', 22],
 ['<0x14>', 23],
 ['<0x15>', 24],
 ['<0x16>', 25],
 ['<0x17>', 26],
 ['<0x18>', 27],
 ['<0x19>', 28],
 ['<0x1A>', 29],
 ['<0x1B>', 30],
 ['<0x1C>', 31],
 ['<0x1D>', 32],
 ['<0x1E>', 33],
 ['<0x1F>', 34],
 ['<0x20>', 35],
 ['<0x21>', 36],
 ['<0x22>', 37],
 ['<0x23>', 38],
 ['<0x24>', 39],
 ['<0x25>', 40],
 ['<0x26>', 41],
 ['<0x27>', 42],
 ['<0x28>', 43],
 ['<0x29>', 44],
 ['<0x2A>', 45],
 ['<0x2B>', 46],
 ['<0x2C>', 47],
 ['<0x2D>', 48],
 ['<0x2E>', 49],
 ['<0x2F>', 50],
 ['<0x30>', 51],
 ['<0x31>', 52],
 ['<0x32>', 53],
 ['<0x33>', 54],
 ['<0x34>', 55],
 ['<0x35>', 56],
 ['<0x36>', 57],
 ['<0x37>', 58],
 ['<0x38>', 5

In [35]:
# Sanity check: encode a short Mongolian phrase, then map the ids back to pieces.
ids = sp.encode("сайн байна уу?")
pieces = [sp.id_to_piece(token_id) for token_id in ids]
print(ids)
print(pieces)

[538, 1918, 917, 428, 66]
['▁сай', 'н', '▁байна', '▁уу', '<0x3F>']


In [36]:
# Second sanity check on a longer phrase: ids first, then the matching pieces.
ids = sp.encode("амьдрал үргэлж сайхнаараа")
pieces = [sp.id_to_piece(token_id) for token_id in ids]
print(ids)
print(pieces)

[984, 637, 1921, 455, 689, 1008, 260]
['▁амьдрал', '▁ү', 'р', 'гэ', 'лж', '▁сайхнаар', 'аа']


In [38]:
import os
# os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"]="python"
from transformers import LlamaTokenizer
from sentencepiece import sentencepiece_model_pb2 as sp_pb2_model
import sentencepiece as spm
import argparse

# Hugging Face id of the base tokenizer that will be extended.
llama_tokenizer_dir = "mistralai/Mistral-7B-Instruct-v0.1"

# --- Load both tokenizers -------------------------------------------------
llama_tokenizer = LlamaTokenizer.from_pretrained(llama_tokenizer_dir)
mongolian_sp_model = spm.SentencePieceProcessor()
mongolian_sp_model.Load('tok400.model')

# Re-parse each model as a protobuf so the piece lists can be edited directly.
llama_spm = sp_pb2_model.ModelProto()
llama_spm.ParseFromString(llama_tokenizer.sp_model.serialized_model_proto())
mongolian_spm = sp_pb2_model.ModelProto()
mongolian_spm.ParseFromString(mongolian_sp_model.serialized_model_proto())

# --- Report vocabulary sizes and special tokens ----------------------------
print(len(llama_tokenizer), len(mongolian_sp_model))
print(llama_tokenizer.all_special_tokens)
print(llama_tokenizer.all_special_ids)
print(llama_tokenizer.special_tokens_map)

# --- Append Mongolian pieces that the base vocabulary lacks ----------------
llama_spm_tokens_set = {existing.piece for existing in llama_spm.pieces}
print(len(llama_spm_tokens_set))
print(f"Before:{len(llama_spm_tokens_set)}")
for candidate in mongolian_spm.pieces:
    if candidate.piece in llama_spm_tokens_set:
        # Already known to the base tokenizer; nothing to add.
        continue
    new_p = sp_pb2_model.ModelProto().SentencePiece()
    new_p.piece = candidate.piece
    new_p.score = 0
    llama_spm.pieces.append(new_p)
print(f"New model pieces: {len(llama_spm.pieces)}")

# --- Persist the merged model (raw SentencePiece + Hugging Face format) ----
output_sp_dir = 'merged_tokenizer_sp'
output_hf_dir = 'merged_tokenizer_hf'  # the path to save mongolian-LLaMA tokenizer
os.makedirs(output_sp_dir, exist_ok=True)
merged_model_path = output_sp_dir + '/mongolian_llama.model'
with open(merged_model_path, 'wb') as model_file:
    model_file.write(llama_spm.SerializeToString())

# Wrap the merged SentencePiece file in a LlamaTokenizer and save it.
tokenizer = LlamaTokenizer(vocab_file=merged_model_path)
tokenizer.save_pretrained(output_hf_dir)
print(f"mongolian-LLaMA tokenizer has been saved to {output_hf_dir}")

tokenizer_config.json:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

32000 4000
['<s>', '</s>', '<unk>']
[1, 2, 0]
{'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}
32000
Before:32000
New model pieces: 35255
mongolian-LLaMA tokenizer has been saved to merged_tokenizer_hf
['<s>', '</s>', '<unk>']
[1, 2, 0]
{'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}
Test text:
 白日依山尽，黄河入海流。欲穷千里目，更上一层楼。
The primary use of LLaMA is research on large language models, including
Tokenized by LLaMA tokenizer:['▁', '白', '日', '依', '山', '尽', '，', '黄', '河', '入', '海', '流', '。', '<0xE6>', '<0xAC>', '<0xB2>', '<0xE7>', '<0xA9>', '<0xB7>', '千', '里', '目', '，', '更', '上', '一', '层', '楼', '。', '<0x0A>', 'The', '▁primary', '▁use', '▁of', '▁L', 'La', 'MA', '▁is', '▁research', '▁on', '▁large', '▁language', '▁models', ',', '▁including']
Tokenized by mongolian-LLaMA tokenizer:['▁', '白', '日', '依', '山', '尽', '，', '黄', '河', '入', '海', '流', '。', '<0xE6>', '<0xAC>', '<0xB2>', '<0xE7>', '<0xA9>', '<0xB7>', '千', '里', '目', '，', '更', '上', '一', '层', '楼', '。', '<0x0A>', 'T

In [45]:
# Compare the original tokenizer with the merged one on mixed English/Mongolian text.
llama_tokenizer = LlamaTokenizer.from_pretrained(llama_tokenizer_dir)
mongolian_llama_tokenizer = LlamaTokenizer.from_pretrained(output_hf_dir)

text = '''This is test we are Doing Is there any difference? Энэ бол туршилт ялгаатай байж чадаж байна уу?'''
print(text)

# Tokenize with each tokenizer before printing so the two results sit side by side.
old_tokens = llama_tokenizer.tokenize(text)
new_tokens = mongolian_llama_tokenizer.tokenize(text)
print(f"old:{old_tokens}")
print(f"new:{new_tokens}")

This is test we are Doing Is there any difference? Энэ бол туршилт ялгаатай байж чадаж байна уу?
old:['▁This', '▁is', '▁test', '▁we', '▁are', '▁Do', 'ing', '▁Is', '▁there', '▁any', '▁difference', '?', '▁Э', 'н', 'э', '▁бо', 'л', '▁тур', 'ши', 'л', 'т', '▁я', 'л', 'га', 'а', 'та', 'й', '▁ба', 'й', 'ж', '▁ча', 'да', 'ж', '▁ба', 'й', 'на', '▁у', 'у', '?']
new:['▁This', '▁is', '▁test', '▁we', '▁are', '▁Do', 'ing', '▁Is', '▁there', '▁any', '▁difference', '?', '▁Энэ', '▁бол', '▁тур', 'шил', 'т', '▁ялг', 'аатай', '▁байж', '▁чад', 'аж', '▁байна', '▁уу', '?']
