In [1]:
import transformers

In [2]:
from datasets import load_dataset

In [3]:
from transformers import DataCollatorForWholeWordMask, AutoTokenizer, AutoModelForMaskedLM, Trainer, TrainingArguments, pipeline, BertTokenizerFast, BertTokenizer

In [4]:
from tokenizers import BertWordPieceTokenizer

In [5]:
import requests

In [6]:
import os
import shutil

In [7]:
# MODEL_NAME = "DeepPavlov/rubert-base-cased" # –∏–ª–∏ "DeepPavlov/bert-base-bg-cs-pl-ru-cased"
MODEL_NAME = "DeepPavlov/bert-base-bg-cs-pl-ru-cased"

In [8]:
OUTPUT_DIR = "old_russian_slavic_bert_finetune"

In [9]:
TRAIN_FILE = "final_dataset_ready.txt"

In [10]:
VOCAB_DIR = "./custom_vocab_dir"

In [11]:
ORIGINAL_FILE = "vocab_extended.txt"

In [12]:
VOCAB_SIZE_LIMIT = 5000

In [13]:
vocab_url = "https://huggingface.co/DeepPavlov/bert-base-bg-cs-pl-ru-cased/resolve/main/vocab.txt"
try:
    r = requests.get(vocab_url)
    original_vocab = r.text.splitlines()
    print(f" Downloader.{len(original_vocab)} tokens in the dataset")
except Exception as e:
    raise ValueError(f"Error: {e}")

 Downloader.119547 tokens in the dataset


In [14]:
with open(TRAIN_FILE, 'r', encoding='utf-8') as f:
    text_content = f.read()

In [15]:
existing_vocab_set = set(original_vocab)
tokens_to_append = []

In [16]:
unique_chars = set(text_content)
chars_found = 0

In [17]:
for c in unique_chars:
    if c.strip(): # –ü—Ä–æ–ø—É—Å–∫–∞–µ–º –ø—Ä–æ–±–µ–ª—ã
        # 1. –î–æ–±–∞–≤–ª—è–µ–º —Å–∞–º—É –±—É–∫–≤—É (–¥–ª—è –Ω–∞—á–∞–ª–∞ —Å–ª–æ–≤–∞)
        if c not in existing_vocab_set and c not in tokens_to_append:
            tokens_to_append.append(c)
            chars_found += 1

        # 2. –í–ê–ñ–ù–û: –î–æ–±–∞–≤–ª—è–µ–º –≤–∞—Ä–∏–∞–Ω—Ç —Å ## (–¥–ª—è —Å–µ—Ä–µ–¥–∏–Ω—ã —Å–ª–æ–≤–∞)
        sub_c = "##" + c
        if sub_c not in existing_vocab_set and sub_c not in tokens_to_append:
            tokens_to_append.append(sub_c)

In [18]:
print(f"Added letters/symbols: {chars_found} (Examples: {tokens_to_append})")

Added letters/symbols: 129 (Examples: ['—•', '##—•', 'Ãà', '##Ãà', 'Ãë', '##Ãë', '“á', '##“á', '\uf47e', '##\uf47e', '‚Åû', '##‚Åû', '\uf13f', '##\uf13f', 'ÃÜ', '##ÃÜ', '·≤Ç', '##·≤Ç', 'Õó', '##Õó', 'Õ®', '##Õ®', '‚±ñ', '##‚±ñ', '—¶', '##—¶', 'Ãæ', '##Ãæ', '‚∞ô', '##‚∞ô', '‚Äú', '##‚Äú', '—†', '##—†', '‚∞π', '##‚∞π', '»•', '##»•', 'Íôá', '##Íôá', 'ÍôÉ', '##ÍôÉ', '\uf480', '##\uf480', '—∑', '##—∑', 'Õû', '##Õû', '—´', '##—´', '”±', '##”±', '—≤', '##—≤', '‚∏Ø', '##‚∏Ø', '\uf2b4', '##\uf2b4', '”è', '##”è', '“ç', '##“ç', 'ÍôØ', '##ÍôØ', '\uf2b5', '##\uf2b5', '\uf086', '##\uf086', '‚Äù', '##‚Äù', '\ufeff', '##\ufeff', '—ß', '##—ß', '—±', '##—±', '\uf2db', '##\uf2db', '\uf067', '##\uf067', 'Íôó', '##Íôó', '“Å', '##“Å', '·¥§', '##·¥§', '‚üß', '##‚üß', '\uf265', '##\uf265', 'ÕÜ', '##ÕÜ', '“Ö', '##“Ö', '¬¥', '##¬¥', 'Íô©', '##Íô©', 'Ãã', '##Ãã', '‚ãÆ', '##‚ãÆ', '‚Ä¶', '##‚Ä¶', 'Õõ', '##Õõ', '“Ü', '##“Ü', '¬¶', '##¬¶', 'Õ†', '##Õ†', '—æ', '##—æ', '—Ω', '##—Ω', '“å', '##“å', '\uf467', '##\uf467',

## Adding vocabulary / words

In [19]:
from collections import Counter

In [20]:
words = text_content.split()
word_counts = Counter(words)

In [21]:
forbidden = set(".,;!?:()[]\"'¬´¬ª-\n\r\t")
words_found = 0

In [22]:
for w, c in word_counts.most_common(VOCAB_SIZE_LIMIT + 2000): # –ë–µ—Ä–µ–º —Å –∑–∞–ø–∞—Å–æ–º
    # –£—Å–ª–æ–≤–∏—è:
    # 1. –°–ª–æ–≤–∞ –Ω–µ—Ç –≤ —Å–ª–æ–≤–∞—Ä–µ
    # 2. –î–ª–∏–Ω–∞ > 1 (–±—É–∫–≤—ã —É–∂–µ –¥–æ–±–∞–≤–∏–ª–∏)
    # 3. –ù–µ—Ç –∑–∞–ø—Ä–µ—â–µ–Ω–Ω—ã—Ö –∑–Ω–∞–∫–æ–≤ –≤–Ω—É—Ç—Ä–∏
    if w not in existing_vocab_set and len(w) > 1:
        if not any(bad in w for bad in forbidden):
            tokens_to_append.append(w)
            words_found += 1
            if words_found >= VOCAB_SIZE_LIMIT:
                break

In [23]:
print(f"Adding words: {words_found}")
print(f"All tokens: {len(tokens_to_append)}")

Adding words: 5000
All tokens: 5258


In [24]:
print(f"Creating a file {ORIGINAL_FILE}...")
full_vocab = original_vocab + tokens_to_append

with open(ORIGINAL_FILE, "w", encoding="utf-8") as f:
    for token in full_vocab:
        f.write(token + "\n")

Creating a file vocab_extended.txt...


In [25]:
os.makedirs(VOCAB_DIR, exist_ok=True)
dest_file = os.path.join(VOCAB_DIR, "vocab.txt")
shutil.copy(ORIGINAL_FILE, dest_file)

'./custom_vocab_dir/vocab.txt'

In [26]:
tokenizer = BertTokenizer.from_pretrained(
      VOCAB_DIR,
      do_lower_case=False,
      unk_token="[UNK]"
  )

In [27]:
len(tokenizer)

124805

In [28]:
print("\n>>> üß™ –¢–ï–°–¢ –¢–û–ö–ï–ù–ò–ó–ê–¶–ò–ò:")
test_phrase = "–ø–æ–∫–ª–æ–Ω–æ —ø –æ–Ω—≥–∏–º–∞ –∫–æ –æ—Å–ø–æ–¥–∏–Ω—É"
tokens = tokenizer.tokenize(test_phrase)
print(f"   –§—Ä–∞–∑–∞: '{test_phrase}'")
print(f"   –¢–æ–∫–µ–Ω—ã: {tokens}")


>>> üß™ –¢–ï–°–¢ –¢–û–ö–ï–ù–ò–ó–ê–¶–ò–ò:
   –§—Ä–∞–∑–∞: '–ø–æ–∫–ª–æ–Ω–æ —ø –æ–Ω—≥–∏–º–∞ –∫–æ –æ—Å–ø–æ–¥–∏–Ω—É'
   –¢–æ–∫–µ–Ω—ã: ['–ø–æ–∫–ª–æ–Ω–æ', '—ø', '–æ–Ω', '##—≥', '##–∏–º', '##–∞', '–∫–æ', '–æ—Å–ø–æ', '##–¥–∏–Ω', '##—É']


In [29]:
save_path = "./old_rus_tokenizer"
tokenizer.save_pretrained(save_path)

('./old_rus_tokenizer/tokenizer_config.json',
 './old_rus_tokenizer/tokenizer.json')

## Loading everything

In [30]:
tokenizer = AutoTokenizer.from_pretrained("./old_rus_tokenizer", use_fast=True)

In [31]:
model = AutoModelForMaskedLM.from_pretrained("DeepPavlov/bert-base-bg-cs-pl-ru-cased")



Loading weights:   0%|          | 0/204 [00:00<?, ?it/s]

BertForMaskedLM LOAD REPORT from: DeepPavlov/bert-base-bg-cs-pl-ru-cased
Key                          | Status     |  | 
-----------------------------+------------+--+-
cls.seq_relationship.weight  | UNEXPECTED |  | 
cls.seq_relationship.bias    | UNEXPECTED |  | 
bert.pooler.dense.bias       | UNEXPECTED |  | 
bert.embeddings.position_ids | UNEXPECTED |  | 
bert.pooler.dense.weight     | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


In [32]:
model.resize_token_embeddings(len(tokenizer))

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Embedding(124805, 768, padding_idx=0)

In [33]:
dataset = load_dataset("text", data_files={"train": TRAIN_FILE})

In [34]:
def tokenize_function(examples):
    # –¢–æ–∫–µ–Ω–∏–∑–∏—Ä—É–µ–º
    result = tokenizer(
        examples["text"],
        truncation=True,
        max_length=512,
        return_special_tokens_mask=True
    )

    # –ì–õ–ê–í–ù–ê–Ø –ú–ê–ì–ò–Ø: –î–æ–±–∞–≤–ª—è–µ–º word_ids –¥–ª—è –∫–∞–∂–¥–æ–≥–æ –ø—Ä–∏–º–µ—Ä–∞
    # –≠—Ç–æ –ø–æ–∑–≤–æ–ª—è–µ—Ç –∫–æ–ª–ª–∞—Ç–æ—Ä—É –ø–æ–Ω—è—Ç—å, –∫–∞–∫–∏–µ —Ç–æ–∫–µ–Ω—ã –ø—Ä–∏–Ω–∞–¥–ª–µ–∂–∞—Ç –æ–¥–Ω–æ–º—É —Å–ª–æ–≤—É
    all_word_ids = []
    for i in range(len(examples["text"])):
        word_ids = result.word_ids(batch_index=i)
        # –ó–∞–º–µ–Ω—è–µ–º None (—Å–ø–µ—Ü—Ç–æ–∫–µ–Ω—ã) –Ω–∞ -1 –¥–ª—è —Å–æ–≤–º–µ—Å—Ç–∏–º–æ—Å—Ç–∏
        processed_word_ids = [w if w is not None else -1 for w in word_ids]
        all_word_ids.append(processed_word_ids)

    result["word_ids"] = all_word_ids
    return result

In [35]:
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    num_proc=4,        # –ò—Å–ø–æ–ª—å–∑—É–µ–º 4 —è–¥—Ä–∞ –ø—Ä–æ—Ü–µ—Å—Å–æ—Ä–∞ –¥–ª—è —É—Å–∫–æ—Ä–µ–Ω–∏—è
    remove_columns=["text"] # –£–¥–∞–ª—è–µ–º –∏—Å—Ö–æ–¥–Ω—ã–π —Ç–µ–∫—Å—Ç, –æ—Å—Ç–∞–≤–ª—è–µ–º —Ç–æ–ª—å–∫–æ —Ü–∏—Ñ—Ä—ã
)

Map (num_proc=4):   0%|          | 0/71255 [00:00<?, ? examples/s]

In [36]:
block_size = 512
def group_texts(examples):
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    return result

In [37]:
lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    num_proc=4,
)

Map (num_proc=4):   0%|          | 0/71255 [00:00<?, ? examples/s]

In [38]:
print(f"Total blocks for training: {len(lm_datasets['train'])}")

Total blocks for training: 3961


In [39]:
data_collator = DataCollatorForWholeWordMask(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15
)



In [40]:
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=15,             # –°—Ç–∞–≤–∏–º 15 —ç–ø–æ—Ö –¥–ª—è –Ω–∞–¥–µ–∂–Ω–æ—Å—Ç–∏
    per_device_train_batch_size=8,   # 8 –¥–ª—è T4 GPU (–µ—Å–ª–∏ –≤—ã–ª–µ—Ç–∞–µ—Ç OOM, —Å—Ç–∞–≤—å 4)
    gradient_accumulation_steps=2,   # –ù–∞–∫–∞–ø–ª–∏–≤–∞–µ–º –≥—Ä–∞–¥–∏–µ–Ω—Ç (—ç—Ñ—Ñ–µ–∫—Ç–∏–≤–Ω—ã–π –±–∞—Ç—á = 16)
    learning_rate=5e-5,
    weight_decay=0.01,
    save_steps=500,
    save_total_limit=2,
    fp16=True,                       # –í–∫–ª—é—á–∞–µ–º Mixed Precision (—É—Å–∫–æ—Ä–µ–Ω–∏–µ)
    logging_steps=50,
    report_to="none",
    remove_unused_columns=False
)

In [41]:
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=lm_datasets["train"],
    processing_class =tokenizer # –ü–µ—Ä–µ–¥–∞–µ–º —Ç–æ–∫–µ–Ω–∏–∑–∞—Ç–æ—Ä, —á—Ç–æ–±—ã –æ–Ω —Å–æ—Ö—Ä–∞–Ω–∏–ª—Å—è –≤ —Ñ–∏–Ω–∞–ª–µ
)

In [42]:
trainer.train()

IndexError: too many indices for array: array is 0-dimensional, but 3 were indexed

In [None]:
trainer.save_model(OUTPUT_DIR)

In [None]:
print("\n>>> –¢–ï–°–¢–ò–†–£–ï–ú –ú–û–î–ï–õ–¨:")
fill_mask = pipeline("fill-mask", model=OUTPUT_DIR, tokenizer=OUTPUT_DIR, device=0)

test_phrase = "–ø–æ–∫–ª–æ–Ω–æ —ø [MASK] –∫–æ –æ—Å–ø–æ–¥–∏–Ω—É"
print(f"–ó–∞–ø—Ä–æ—Å: {test_phrase}")
res = fill_mask(test_phrase)
for r in res:
    print(f"üîπ {r['token_str']} ({r['score']:.1%})")