## Load the Truku and Traditional Chinese parallel dataset

In [None]:
import pandas as pd
def get_data(file):
    """Load one spreadsheet of the parallel corpus as a two-column frame.

    Reads ``./dataset/<file>.xlsx``, strips the literal ``_x000D_`` marker
    from every cell, renames the source headers "華語" / "太魯閣族語" to
    ``chinese`` / ``truku`` and returns only those two columns.
    """
    column_names = {"華語": "chinese", "太魯閣族語": "truku"}
    sheet = pd.read_excel('./dataset/' + file + '.xlsx')
    cleaned = sheet.replace(to_replace=r'_x000D_', value='', regex=True)
    renamed = cleaned.rename(columns=column_names)
    return renamed[['chinese', 'truku']]
# Read the four spreadsheets (df4 is the Bible text), stack them into a single
# frame and drop rows that are exact duplicates.
df1, df2, df3, df4 = (get_data(name) for name in ('df1', 'df2', 'df3', 'df4'))
df = pd.concat([df1, df2, df3, df4]).drop_duplicates()

## Checking how well the data fits the NLLB tokenizer

### How many unknown tokens does the tokenizer produce for Truku and for Traditional Chinese?
#### If they occur too often, we need to fix it somehow

In [None]:
# Load the pretrained NLLB tokenizer (distilled 600M checkpoint); the cells
# below use it to measure how well it covers the Truku and Chinese text.
from transformers import NllbTokenizer 
tokenizer = NllbTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")


In [31]:
from tqdm.auto import tqdm

# Truku side: tokenize every sentence and keep each piece that the tokenizer
# maps to its <unk> id. Every such piece is stored as a list of its characters.
unk = [
    list(piece)
    for sentence in tqdm(df['truku'])
    for piece in tokenizer.tokenize(str(sentence))
    if tokenizer.convert_tokens_to_ids(piece) == tokenizer.unk_token_id
]

  0%|          | 0/84818 [00:00<?, ?it/s]

In [32]:
from collections import Counter

# Tally the individual characters of all unknown Truku pieces. The reported
# figure is the number of *distinct* characters seen in those pieces.
unk_tr_tok = Counter()
for piece_chars in unk:
    unk_tr_tok.update(piece_chars)
print('The number of unknown tokens in truku dataset is ', len(unk_tr_tok))

The number of unknown tokens in truku dataset is  58


In [33]:
# Chinese side: same scan as for Truku. Keep every piece the tokenizer maps to
# its <unk> id, stored as a list of its characters.
unk_zh = [
    list(piece)
    for sentence in tqdm(df['chinese'])
    for piece in tokenizer.tokenize(str(sentence))
    if tokenizer.convert_tokens_to_ids(piece) == tokenizer.unk_token_id
]

  0%|          | 0/84818 [00:00<?, ?it/s]

In [34]:
from collections import Counter

# Tally the individual characters of all unknown Chinese pieces. The reported
# figure is the number of *distinct* characters seen in those pieces.
unk_zh_tok = Counter()
for piece_chars in unk_zh:
    unk_zh_tok.update(piece_chars)
print('The number of unknown tokens in chinese dataset is ', len(unk_zh_tok))

The number of unknown tokens in chinese dataset is  1488


### We can also evaluate how many tokens per word we get on average
#### However, this measure only applies to languages written with space-separated alphabetic words, i.e. Truku

In [None]:
import re
def word_tokenize(text):
    """Split a text into words, numbers, and punctuation marks.

    Intended for languages where words are separated by spaces. Each run of
    word characters (``\\w+``) becomes one item and every other
    non-whitespace character becomes an item of its own; whitespace is
    dropped.

    Args:
        text: the string to split.

    Returns:
        list[str]: the items in their original order (empty for empty input).
    """
    # Raw string: '\w' and '\s' are not valid escapes in a normal string
    # literal and trigger an invalid-escape warning on recent Python versions.
    return re.findall(r'(\w+|[^\w\s])', text)

# Fixed-seed sample of 10,000 sentence pairs.
smpl = df.sample(10000, random_state=1)

# Only the Truku side is measured here; the equivalent Chinese columns
# (chinese_toks / chinese_words) are left out of this comparison.
smpl['truku_toks'] = smpl['truku'].apply(tokenizer.tokenize)
smpl['truku_words'] = smpl['truku'].apply(word_tokenize)

# Replace every token/word list by its length, then summarise both columns.
lengths = smpl[['truku_toks', 'truku_words']].applymap(len)
stats = lengths.describe()

# Average number of tokenizer pieces per whitespace-delimited word.
print(stats['truku_toks']['mean'] / stats['truku_words']['mean'])  # 1.707
print(stats)
# Good news: for Truku, as a new language, the NLLB tokenizer produces on average 1.7 tokens per word


## Extend the Traditional Chinese vocabulary of the NLLB tokenizer
### Needed because the Chinese data produces so many unknown tokens

In [115]:
# Besides our own parallel data, pull one shard of a large Traditional Chinese
# Common Crawl corpus from the Hugging Face Hub and drop its metadata columns.
from datasets import load_dataset

zh_tra = load_dataset(
    "jed351/Traditional-Chinese-Common-Crawl-Filtered",
    data_files="C4_Traditional_Chinese-00004-of-00008.jsonl",
    split="train",
).remove_columns(["url", "timestamp", "content_language", "content_type"])

  table = cls._concat_blocks(blocks, axis=0)


### Code for the preprocessing stage

In [117]:
# Character class stripped by remove_emoji(), compiled once at definition time
# instead of on every call. It is the exact union of the ranges that were
# previously listed one by one (several of them overlapped or were repeated).
_EMOJI_PATTERN = re.compile(
    "["
    "\u200d"                 # zero width joiner (glues emoji sequences together)
    "\u231a"                 # watch
    "\u23cf"                 # eject symbol
    "\u23e9"                 # fast-forward symbol
    "\u2500-\u2bef"          # box drawing through misc. symbols and arrows (incl. dingbats)
    "\u3030"                 # wavy dash
    "\ufe0f"                 # variation selector-16 (emoji presentation)
    "\U00010000-\U0010ffff"  # every character outside the Basic Multilingual Plane
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(string):
    """Return ``string`` with emoji and related pictographic symbols removed.

    Every maximal run of characters matching the class above is deleted.

    Caveat: the class covers *all* code points from U+10000 upwards, so besides
    emoji it also removes any other supplementary-plane character (for example
    rare CJK ideographs encoded from U+20000 onwards).

    Args:
        string: text to clean.

    Returns:
        str: the text without the matched characters.
    """
    return _EMOJI_PATTERN.sub(r'', string)

In [118]:
import re
import sys
import unicodedata
from sacremoses import MosesPunctNormalizer
from cleantext import clean as Clean

# Moses punctuation normalizer whose (pattern, replacement) rules are
# pre-compiled once so that they are not re-compiled on each use.
mpn = MosesPunctNormalizer(lang="en")
compiled_rules = []
for pattern, replacement in mpn.substitutions:
    compiled_rules.append((re.compile(pattern), replacement))
mpn.substitutions = compiled_rules

def get_non_printing_char_replacer(replace_by: str = " "):
    """Build a function that replaces non-printing characters in a string.

    The translation table is computed once, here, by scanning every code point
    up to ``sys.maxunicode``; the returned closure only applies it.

    Args:
        replace_by: replacement text for each non-printing character.

    Returns:
        A callable ``line -> str`` that translates ``line`` with the table.
    """
    # Unicode "Other" general categories, i.e. what perl matches with \p{C}.
    # see https://www.unicode.org/reports/tr44/#General_Category_Values
    other_categories = {"C", "Cc", "Cf", "Cs", "Co", "Cn"}

    non_printable_map = {}
    for code_point in range(sys.maxunicode + 1):
        if unicodedata.category(chr(code_point)) in other_categories:
            non_printable_map[code_point] = replace_by

    def replace_non_printing_char(line) -> str:
        return line.translate(non_printable_map)

    return replace_non_printing_char

# Build the replacer once: every non-printing character becomes a space.
replace_nonprint = get_non_printing_char_replacer(" ")


def preproc(text):
    """Normalise one text before character counting.

    Steps, in order: replace non-printing characters with spaces, apply
    Unicode NFKC normalisation (e.g. 𝓕𝔯𝔞𝔫𝔠𝔢𝔰𝔠𝔞 becomes Francesca), then
    strip emoji. Moses punctuation normalisation (``mpn``) is currently
    disabled and not applied.
    """
    without_nonprint = replace_nonprint(text)
    folded = unicodedata.normalize("NFKC", without_nonprint)
    return remove_emoji(folded)

### Combine the Traditional Chinese texts from all datasets, then preprocess them


In [123]:
# Corpus sizes in characters: the downloaded crawl shard vs. our own Chinese column.
crawl_char_count = sum(map(len, zh_tra['text']))
own_char_count = sum(len(str(t)) for t in df['chinese'].dropna())
print(crawl_char_count)  # 312908326
print(own_char_count)  # 1964285

312908326
1964285
6261647


In [None]:
from tqdm.auto import tqdm, trange
from collections import Counter

# All Traditional Chinese texts: the crawl shard plus our own (non-null) column.
# list(...) keeps the concatenation working whether the dataset column comes
# back as a plain list or as a lazy column object.
zh_all_texts = list(zh_tra['text']) + df.chinese.dropna().tolist()

# Normalise text by text. Previously the loop ran over str(zh_all_texts), i.e.
# over the characters of the list's repr, so preproc() was called once per
# character and the quotes, brackets, commas and backslash escapes of the repr
# were counted as if they were corpus characters. str() is applied per item
# instead, so a non-string cell from the spreadsheet cannot break preproc().
zh_all_text_normalized = [preproc(str(t)) for t in tqdm(zh_all_texts)]

# Character frequencies over the normalised corpus.
zh_chars_cnt = Counter(c for t in zh_all_text_normalized for c in t)

# Keep every non-space character that appears at least 3 times, most frequent first.
zh_required_chars = ''.join(
    k for k, v in zh_chars_cnt.most_common()
    if v >= 3 and k != ' '
)

### Save the Traditional Chinese text corpus

In [17]:
# Write the corpus as plain text, one entry per line. The previous version
# wrote str(zh_all_texts), i.e. the Python repr of the whole list on a single
# line (brackets, quotes and escape sequences included). The encoding is
# explicit so Chinese text is written as UTF-8 regardless of the platform
# default, and the context manager closes the file even if a write fails.
with open('zht_all_texts.txt', 'w', encoding='utf-8') as file:
    for text in zh_all_texts:
        file.write(f'{text}\n')


### Expanding the Traditional Chinese vocabulary of the NLLB tokenizer

In [None]:
import sentencepiece as spm

all_texts_file = 'zht_all_texts.txt'
SPM_PREFIX = 'spm_zh_tr_20k'

# Dump the raw corpus, one text per line, as the trainer's input file.
# The encoding is set explicitly: SentencePiece reads its input as UTF-8,
# whereas open() would otherwise fall back to the platform default encoding.
with open(all_texts_file, 'w', encoding='utf-8') as f:
    for text in zh_all_texts:
        print(text, file=f)

# Train a 20K-piece SentencePiece model on the Traditional Chinese corpus.
spm.SentencePieceTrainer.train(
    input=all_texts_file,
    model_prefix=SPM_PREFIX,  # output files are named <SPM_PREFIX>.model / .vocab
    vocab_size=20*(1000),  # 20K
    character_coverage = 1,
    num_threads=16,
    train_extremely_large_corpus=False,
    add_dummy_prefix=False,
    max_sentencepiece_length=128,
    max_sentence_length=4192*4,
    pad_id=0,
    eos_id=1,
    unk_id=2,
    bos_id=-1,
    # characters (seen at least 3 times in the normalised corpus) that must be in the vocabulary
    required_chars=zh_required_chars,
)


In [40]:
import sentencepiece as spm
from sentencepiece import sentencepiece_model_pb2 as sp_pb2_model
# At this step, the code may throw an error about protobuf. Do as it tells.
from transformers import NllbTokenizer
# Prefix of the SentencePiece model trained above; the file read below is '<SPM_PREFIX>.model'.
SPM_PREFIX = 'spm_zh_tr_20k'
# Load the pretrained NLLB tokenizer and the newly trained Traditional Chinese
# SentencePiece model so that their vocabularies can be merged in the next cells.
tokenizer = NllbTokenizer.from_pretrained('facebook/nllb-200-distilled-600M')
sp_trained = spm.SentencePieceProcessor(model_file=f'{SPM_PREFIX}.model')

In [41]:
# Narrative note written as a bare string literal; the cell echoes it as its output.
'''
Extracting the sentencepiece model from the standard NLLB tokenizer and enriching it from all tokens from new traditional Chinese
tokenizer that has been missing from the NLLB tokenizer 
'''
'\nExtracting the sentencepiece model from the standard NLLB tokenizer and enriching it from all tokens from new traditional Chinese\ntokenizer that has been missing from the NLLB tokenizer \n'

In [42]:
import sentencepiece as spm
# Two empty SentencePiece model protos: one for the newly trained Traditional
# Chinese model and one for the model inside the NLLB tokenizer.
chinese_spm = sp_pb2_model.ModelProto()
nllb_spm = sp_pb2_model.ModelProto()

# Fill each proto from the serialized form of the corresponding model. The last
# call's return value is what the cell displays.
chinese_spm.ParseFromString(sp_trained.serialized_model_proto())
nllb_spm.ParseFromString(tokenizer.sp_model.serialized_model_proto())

4852054

### Build a set of all pieces in the NLLB tokenizer vocabulary

In [44]:
# Every piece string already present in the NLLB SentencePiece model.
nllb_tokens_set = set(entry.piece for entry in nllb_spm.pieces)
# Score of the last NLLB piece; used as an offset so that newly added pieces
# get lower scores. NOTE(review): assumes the last piece carries the lowest score.
last_nllb_piece = nllb_spm.pieces[-1]
prev_min_score = last_nllb_piece.score

### Add the missing Traditional Chinese tokens to the NLLB SentencePiece model

In [45]:
# Append every piece of the Chinese model that NLLB does not have yet.
total_p = 0
for p in chinese_spm.pieces:
    piece = p.piece
    if piece in nllb_tokens_set:
        # already known to the NLLB model, nothing to add
        continue
    # New pieces get their own score shifted by the NLLB offset, i.e. a lower
    # score (priority) than the existing pieces.
    new_p = sp_pb2_model.ModelProto.SentencePiece(
        piece=piece,
        score=p.score + prev_min_score,
    )
    nllb_spm.pieces.append(new_p)
    total_p += 1

In [46]:
# Serialize the merged SentencePiece model and write it to the working directory.
NEW_SPM_NAME = 'spm_nllb_ch_tr_270k.model'
serialized_model = nllb_spm.SerializeToString()
with open(NEW_SPM_NAME, mode='wb') as model_file:
    model_file.write(serialized_model)

## Update the neural network weights: add embeddings for the freshly added tokens to the NLLB model

In [47]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
# Checkpoint that both tokenizers (and later the model) are loaded from.
model_name = 'facebook/nllb-200-distilled-600M'

# loading the tokenizers
# tokenizer_old: the unchanged pretrained tokenizer, kept for looking up original ids.
# tokenizer: same configuration, but backed by the merged SentencePiece file saved above.
tokenizer_old = NllbTokenizer.from_pretrained(model_name)
tokenizer = NllbTokenizer.from_pretrained(model_name, vocab_file=NEW_SPM_NAME)
print(len(tokenizer_old), len(tokenizer)) # 256204, 270333

256204 270333


In [48]:
# Report the vocabulary size before and after the extension.
for label, tok in (
    ('The number of tokens in the default NLLB tokenizer', tokenizer_old),
    ('The number of tokens in the new NLLB tokenizer', tokenizer),
):
    print(label, len(tok))

The number of tokens in the default NLLB tokenizer 256204
The number of tokens in the new NLLB tokenizer 270333


In [49]:
# Tokens present in the extended tokenizer but not in the original one (14128 new tokens).
old_vocab = set(tokenizer_old.get_vocab())
new_vocab = set(tokenizer.get_vocab())
added_vocab = new_vocab - old_vocab
print('The number of added tokens', len(added_vocab))  # 14128

The number of added tokens 14128


### Add truku as a new language tag

In [51]:
def fix_tokenizer(tokenizer, new_lang='tru_Latn'):
    """Add a new language token to the tokenizer vocabulary, in place.

    This should be done each time after the tokenizer is initialised.

    The function edits the tokenizer's language-code and fairseq id tables
    directly and returns nothing.
    NOTE(review): it relies on tokenizer attributes (``lang_code_to_id``,
    ``fairseq_tokens_to_ids``, ``_additional_special_tokens``, ...) that are
    presumably specific to certain ``transformers`` versions; confirm against
    the installed version.

    Args:
        tokenizer: the NLLB tokenizer to modify.
        new_lang: language code to register (default ``'tru_Latn'``).
    """
    # Vocabulary size, not counting new_lang if an earlier call already registered it as an added token.
    old_len = len(tokenizer) - int(new_lang in tokenizer.added_tokens_encoder)
    # Give the new code id old_len-1 and record the reverse mapping.
    # NOTE(review): presumably this is the slot held by "<mask>", which is moved just below; confirm.
    tokenizer.lang_code_to_id[new_lang] = old_len-1
    tokenizer.id_to_lang_code[old_len-1] = new_lang
    # always move "mask" to the last position
    tokenizer.fairseq_tokens_to_ids["<mask>"] = len(tokenizer.sp_model) + len(tokenizer.lang_code_to_id) + tokenizer.fairseq_offset

    # Re-sync the fairseq token->id table with the language codes, then rebuild its inverse.
    tokenizer.fairseq_tokens_to_ids.update(tokenizer.lang_code_to_id)
    tokenizer.fairseq_ids_to_tokens = {v: k for k, v in tokenizer.fairseq_tokens_to_ids.items()}
    # Register the code as an additional special token (only once).
    if new_lang not in tokenizer._additional_special_tokens:
        tokenizer._additional_special_tokens.append(new_lang)
    # clear the added token encoder; otherwise a new token may end up there by mistake
    tokenizer.added_tokens_encoder = {}
    tokenizer.added_tokens_decoder = {}


In [52]:
# Register the default new language code ('tru_Latn') on the extended tokenizer, in place.
fix_tokenizer(tokenizer)

### Loading and resizing the NLLB model (from pytorch_model.bin)

In [53]:
# Load the pretrained NLLB seq2seq model; its embedding matrix still has the original vocabulary size.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [54]:
## Before extending the tokenizer
# Print the shape of the shared embedding matrix, then leave the tensor as the
# cell's last expression so that the notebook displays its values.
print(model.model.shared.weight.data.shape)
model.model.shared.weight.data

torch.Size([256206, 1024])


tensor([[-0.0321,  0.0348,  0.0181,  ...,  0.0312, -0.0099, -0.0133],
        [-0.0039,  0.0104, -0.0156,  ...,  0.0290, -0.0138, -0.0134],
        [-0.0245, -0.0283, -0.0295,  ...,  0.9712, -0.0255, -0.0273],
        ...,
        [-0.0123, -0.0031, -0.0089,  ...,  0.0645, -0.0182, -0.0740],
        [ 0.0085, -0.0088, -0.0091,  ...,  0.0571, -0.0035, -0.1298],
        [-0.0076, -0.0107, -0.0051,  ...,  1.0264, -0.0338, -0.1175]])

#### Resize the model's embedding matrix to match the length of the extended tokenizer

In [55]:
# Resize the embedding matrix to one row per token of the extended tokenizer.
model.resize_token_embeddings(len(tokenizer))
##The embedding for the new token is by default initialized randomly

You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be 270334. This might induce some performance reduction as *Tensor Cores* will not be available. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc


Embedding(270334, 1024)

In [56]:
## After extending the tokenizer
# Same inspection as before the resize: print the new shape, then display the
# tensor as the cell's output.
print(model.model.shared.weight.data.shape)
model.model.shared.weight.data

torch.Size([270334, 1024])


tensor([[-0.0321,  0.0348,  0.0181,  ...,  0.0312, -0.0099, -0.0133],
        [-0.0039,  0.0104, -0.0156,  ...,  0.0290, -0.0138, -0.0134],
        [-0.0245, -0.0283, -0.0295,  ...,  0.9712, -0.0255, -0.0273],
        ...,
        [ 0.0162,  0.0015,  0.0258,  ...,  0.0093, -0.0138,  0.0087],
        [-0.0051, -0.0136,  0.0166,  ..., -0.0254,  0.0095, -0.0316],
        [ 0.0427, -0.0335, -0.0056,  ..., -0.0394, -0.0203, -0.0277]])

### Copy the embeddings of the language tags (and `<mask>`) from their positions in the default NLLB tokenizer

In [57]:
# Special tokens to carry over: every language code of the original tokenizer, plus '<mask>'.
moved_tokens = [*tokenizer_old.lang_code_to_id, '<mask>']

In [58]:
# Copy the embedding rows of the moved tokens from their ids in the original
# tokenizer to their ids in the extended tokenizer. The right-hand side is
# gathered into a fresh tensor before the rows are written.
embedding_matrix = model.model.shared.weight.data
old_positions = tokenizer_old.convert_tokens_to_ids(moved_tokens)
new_positions = tokenizer.convert_tokens_to_ids(moved_tokens)
embedding_matrix[new_positions] = embedding_matrix[old_positions]

In [59]:
# Initialise the Truku language-tag embedding from Tagalog, since both languages
# belong to the same family (Austronesian).
# The NLLB code for Tagalog is 'tgl_Latn'. The previous 'tag_Latn' is not an NLLB
# language code, so the lookup resolved to <unk> and Truku was silently
# initialised from the <unk> embedding instead.
tagalog_id = tokenizer_old.convert_tokens_to_ids('tgl_Latn')
# Guard against the same silent fallback if the source code is ever changed again.
assert tagalog_id != tokenizer_old.unk_token_id, "source language code not found in the original tokenizer"
model.model.shared.weight.data[tokenizer.convert_tokens_to_ids('tru_Latn')] = model.model.shared.weight.data[tagalog_id]

### Re-initialize the embeddings of the newly added vocabulary

In [60]:
from tqdm.auto import tqdm, trange
# View on the shared embedding matrix; rows are overwritten in place below.
embeddings = model.model.shared.weight.data
# Ids of the pair ['▁', '<unk>']: a new token that the old tokenizer encodes to
# exactly this pair is left untouched.
space_unk_ids = tokenizer.convert_tokens_to_ids(['▁', '<unk>'])

for t in tqdm(added_vocab):
    new_id = tokenizer.convert_tokens_to_ids(t)
    # How the ORIGINAL tokenizer would have split this new token.
    tt = tokenizer_old(t, add_special_tokens=False).input_ids
    if len(tt) == 0:
        # The old tokenizer produced nothing: fall back to the <unk> embedding.
        print(f'empty token "{t}"/{tokenizer.convert_tokens_to_ids(t)}')
        tt = [tokenizer_old.unk_token_id]
        embeddings[new_id] = embeddings[tt]
    elif (len(tt) > 1) and (tt != space_unk_ids):
        # Several old pieces: start from the mean of their embeddings.
        embeddings[new_id] = embeddings[tt].mean(0)
    # Any other case (a single old id, or exactly ['▁', '<unk>']) keeps the row
    # that is already in place.

  0%|          | 0/14128 [00:00<?, ?it/s]

empty token "‍"/258688


### Save the model

In [63]:
# Save the resized model and the extended tokenizer into the same directory.
# The tokenizer call comes last, so the cell displays the paths it returns.
MODEL_SAVE_PATH = './nllb_zh_tr_new'
model.save_pretrained(MODEL_SAVE_PATH)
tokenizer.save_pretrained(MODEL_SAVE_PATH)

('./nllb_zh_tr_new/tokenizer_config.json',
 './nllb_zh_tr_new/special_tokens_map.json',
 './nllb_zh_tr_new/sentencepiece.bpe.model',
 './nllb_zh_tr_new/added_tokens.json')