In [1]:
!pip uninstall -y transformers
!pip install transformers==4.44.2 datasets==2.19.0

Found existing installation: transformers 4.44.2
Uninstalling transformers-4.44.2:
  Successfully uninstalled transformers-4.44.2
Collecting transformers==4.44.2
  Using cached transformers-4.44.2-py3-none-any.whl.metadata (43 kB)
Using cached transformers-4.44.2-py3-none-any.whl (9.5 MB)
Installing collected packages: transformers
Successfully installed transformers-4.44.2


In [2]:
import transformers

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

In [3]:
from datasets import load_dataset

In [4]:
from transformers import DataCollatorForWholeWordMask, AutoTokenizer, AutoModelForMaskedLM, Trainer, TrainingArguments, pipeline, BertTokenizerFast, BertTokenizer

In [5]:
from tokenizers import BertWordPieceTokenizer

In [6]:
import requests

In [7]:
import os
import shutil

In [8]:
# MODEL_NAME = "DeepPavlov/rubert-base-cased" # –∏–ª–∏ "DeepPavlov/bert-base-bg-cs-pl-ru-cased"
MODEL_NAME = "DeepPavlov/bert-base-bg-cs-pl-ru-cased"

In [9]:
OUTPUT_DIR = "old_russian_slavic_bert_finetune"

In [10]:
TRAIN_FILE = "final_dataset_ready.txt"

In [11]:
VOCAB_DIR = "./custom_vocab_dir"

In [12]:
ORIGINAL_FILE = "vocab_extended.txt"

In [13]:
VOCAB_SIZE_LIMIT = 5000

In [14]:
vocab_url = "https://huggingface.co/DeepPavlov/bert-base-bg-cs-pl-ru-cased/resolve/main/vocab.txt"
try:
    r = requests.get(vocab_url)
    original_vocab = r.text.splitlines()
    print(f" Downloader.{len(original_vocab)} tokens in the dataset")
except Exception as e:
    raise ValueError(f"Error: {e}")

 Downloader.119547 tokens in the dataset


In [15]:
with open(TRAIN_FILE, 'r', encoding='utf-8') as f:
    text_content = f.read()

In [16]:
existing_vocab_set = set(original_vocab)
tokens_to_append = []

In [17]:
unique_chars = set(text_content)
chars_found = 0

In [18]:
for c in unique_chars:
    if c.strip(): # –ü—Ä–æ–ø—É—Å–∫–∞–µ–º –ø—Ä–æ–±–µ–ª—ã
        # 1. –î–æ–±–∞–≤–ª—è–µ–º —Å–∞–º—É –±—É–∫–≤—É (–¥–ª—è –Ω–∞—á–∞–ª–∞ —Å–ª–æ–≤–∞)
        if c not in existing_vocab_set and c not in tokens_to_append:
            tokens_to_append.append(c)
            chars_found += 1

        # 2. –í–ê–ñ–ù–û: –î–æ–±–∞–≤–ª—è–µ–º –≤–∞—Ä–∏–∞–Ω—Ç —Å ## (–¥–ª—è —Å–µ—Ä–µ–¥–∏–Ω—ã —Å–ª–æ–≤–∞)
        sub_c = "##" + c
        if sub_c not in existing_vocab_set and sub_c not in tokens_to_append:
            tokens_to_append.append(sub_c)

In [19]:
print(f"Added letters/symbols: {chars_found} (Examples: {tokens_to_append})")

Added letters/symbols: 129 (Examples: ['”±', '##”±', 'ÍôÉ', '##ÍôÉ', '\uf4a5', '##\uf4a5', '‚Äê', '##‚Äê', '\uf4a4', '##\uf4a4', '“Ü', '##“Ü', '“Ö', '##“Ö', 'ÕÜ', '##ÕÜ', 'Ãã', '##Ãã', '¬¶', '##¬¶', 'ÍôØ', '##ÍôØ', '—Ω', '##—Ω', '\uf467', '##\uf467', 'Ãæ', '##Ãæ', '\uf074', '##\uf074', '—º', '##—º', '`', '##`', 'Õò', '##Õò', '—°', '##—°', 'Íô≥', '##Íô≥', '»£', '##»£', '\uf245', '##\uf245', '‚∑©', '##‚∑©', '\uf13f', '##\uf13f', '\uf2d1', '##\uf2d1', '“Ç', '##“Ç', '‚ãÆ', '##‚ãÆ', '—ï', '##—ï', '—≤', '##—≤', 'ÍôÅ', '##ÍôÅ', '\uf1c0', '##\uf1c0', 'Íôã', '##Íôã', '—µ', '##—µ', 'Õû', '##Õû', '”è', '##”è', '‚±ï', '##‚±ï', '—©', '##—©', 'Ãï', '##Ãï', '—∫', '##—∫', '‚àô', '##‚àô', '‚∞π', '##‚∞π', '‚±î', '##‚±î', 'Ãí', '##Ãí', '‚Äú', '##‚Äú', 'Õó', '##Õó', '—§', '##—§', '—Ø', '##—Ø', 'Ãè', '##Ãè', '”ë', '##”ë', '—ß', '##—ß', '‚∑Æ', '##‚∑Æ', '·≤Ç', '##·≤Ç', 'Ãà', '##Ãà', '—ø', '##—ø', '‚Åò', '##‚Åò', '–á', '##–á', '\uf130', '##\uf130', '“É', '##“É', '\uf2b4', '##\uf2b4', 'Ãì', '##Ãì', '\uf47e', '

## Adding vocabulary / words

In [20]:
from collections import Counter

In [21]:
words = text_content.split()
word_counts = Counter(words)

In [22]:
forbidden = set(".,;!?:()[]\"'¬´¬ª-\n\r\t")
words_found = 0

In [23]:
for w, c in word_counts.most_common(VOCAB_SIZE_LIMIT + 2000): # –ë–µ—Ä–µ–º —Å –∑–∞–ø–∞—Å–æ–º
    # –£—Å–ª–æ–≤–∏—è:
    # 1. –°–ª–æ–≤–∞ –Ω–µ—Ç –≤ —Å–ª–æ–≤–∞—Ä–µ
    # 2. –î–ª–∏–Ω–∞ > 1 (–±—É–∫–≤—ã —É–∂–µ –¥–æ–±–∞–≤–∏–ª–∏)
    # 3. –ù–µ—Ç –∑–∞–ø—Ä–µ—â–µ–Ω–Ω—ã—Ö –∑–Ω–∞–∫–æ–≤ –≤–Ω—É—Ç—Ä–∏
    if w not in existing_vocab_set and len(w) > 1:
        if not any(bad in w for bad in forbidden):
            tokens_to_append.append(w)
            words_found += 1
            if words_found >= VOCAB_SIZE_LIMIT:
                break

In [24]:
print(f"Adding words: {words_found}")
print(f"All tokens: {len(tokens_to_append)}")

Adding words: 5000
All tokens: 5258


In [25]:
print(f"Creating a file {ORIGINAL_FILE}...")
full_vocab = original_vocab + tokens_to_append

with open(ORIGINAL_FILE, "w", encoding="utf-8") as f:
    for token in full_vocab:
        f.write(token + "\n")

Creating a file vocab_extended.txt...


In [26]:
os.makedirs(VOCAB_DIR, exist_ok=True)
dest_file = os.path.join(VOCAB_DIR, "vocab.txt")
shutil.copy(ORIGINAL_FILE, dest_file)

'./custom_vocab_dir/vocab.txt'

In [27]:
tokenizer = BertTokenizer.from_pretrained(
      VOCAB_DIR,
      do_lower_case=False,
      unk_token="[UNK]"
  )



In [28]:
len(tokenizer)

124805

In [29]:
print("\n>>> üß™ –¢–ï–°–¢ –¢–û–ö–ï–ù–ò–ó–ê–¶–ò–ò:")
test_phrase = "–ø–æ–∫–ª–æ–Ω–æ —ø –æ–Ω—≥–∏–º–∞ –∫–æ –æ—Å–ø–æ–¥–∏–Ω—É"
tokens = tokenizer.tokenize(test_phrase)
print(f"   –§—Ä–∞–∑–∞: '{test_phrase}'")
print(f"   –¢–æ–∫–µ–Ω—ã: {tokens}")


>>> üß™ –¢–ï–°–¢ –¢–û–ö–ï–ù–ò–ó–ê–¶–ò–ò:
   –§—Ä–∞–∑–∞: '–ø–æ–∫–ª–æ–Ω–æ —ø –æ–Ω—≥–∏–º–∞ –∫–æ –æ—Å–ø–æ–¥–∏–Ω—É'
   –¢–æ–∫–µ–Ω—ã: ['–ø–æ–∫–ª–æ–Ω–æ', '—ø', '–æ–Ω', '##—≥', '##–∏–º', '##–∞', '–∫–æ', '–æ—Å–ø–æ', '##–¥–∏–Ω', '##—É']


In [30]:
save_path = "./old_rus_tokenizer"
tokenizer.save_pretrained(save_path)

('./old_rus_tokenizer/tokenizer_config.json',
 './old_rus_tokenizer/special_tokens_map.json',
 './old_rus_tokenizer/vocab.txt',
 './old_rus_tokenizer/added_tokens.json')

## Loading everything

In [31]:
tokenizer = AutoTokenizer.from_pretrained("./old_rus_tokenizer", use_fast=True)

In [32]:
model = AutoModelForMaskedLM.from_pretrained("DeepPavlov/bert-base-bg-cs-pl-ru-cased")

In [33]:
model.resize_token_embeddings(len(tokenizer))

Embedding(124805, 768, padding_idx=0)

In [34]:
dataset = load_dataset("text", data_files={"train": TRAIN_FILE})

Generating train split: 0 examples [00:00, ? examples/s]

In [35]:
def tokenize_function(examples):
    # –¢–æ–∫–µ–Ω–∏–∑–∏—Ä—É–µ–º
    result = tokenizer(
        examples["text"],
        truncation=True,
        max_length=512,
        return_special_tokens_mask=True
    )

    # –ì–õ–ê–í–ù–ê–Ø –ú–ê–ì–ò–Ø: –î–æ–±–∞–≤–ª—è–µ–º word_ids –¥–ª—è –∫–∞–∂–¥–æ–≥–æ –ø—Ä–∏–º–µ—Ä–∞
    # –≠—Ç–æ –ø–æ–∑–≤–æ–ª—è–µ—Ç –∫–æ–ª–ª–∞—Ç–æ—Ä—É –ø–æ–Ω—è—Ç—å, –∫–∞–∫–∏–µ —Ç–æ–∫–µ–Ω—ã –ø—Ä–∏–Ω–∞–¥–ª–µ–∂–∞—Ç –æ–¥–Ω–æ–º—É —Å–ª–æ–≤—É
    all_word_ids = []
    for i in range(len(examples["text"])):
        word_ids = result.word_ids(batch_index=i)
        # –ó–∞–º–µ–Ω—è–µ–º None (—Å–ø–µ—Ü—Ç–æ–∫–µ–Ω—ã) –Ω–∞ -1 –¥–ª—è —Å–æ–≤–º–µ—Å—Ç–∏–º–æ—Å—Ç–∏
        processed_word_ids = [w if w is not None else -100 for w in word_ids]
        all_word_ids.append(processed_word_ids)

    result["word_ids"] = all_word_ids
    return result

In [36]:
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    num_proc=4,        # –ò—Å–ø–æ–ª—å–∑—É–µ–º 4 —è–¥—Ä–∞ –ø—Ä–æ—Ü–µ—Å—Å–æ—Ä–∞ –¥–ª—è —É—Å–∫–æ—Ä–µ–Ω–∏—è
    remove_columns=["text"] # –£–¥–∞–ª—è–µ–º –∏—Å—Ö–æ–¥–Ω—ã–π —Ç–µ–∫—Å—Ç, –æ—Å—Ç–∞–≤–ª—è–µ–º —Ç–æ–ª—å–∫–æ —Ü–∏—Ñ—Ä—ã
)

Map (num_proc=4):   0%|          | 0/71255 [00:00<?, ? examples/s]

In [37]:
block_size = 512
def group_texts(examples):
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    return result

In [38]:
lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    num_proc=4,
)

Map (num_proc=4):   0%|          | 0/71255 [00:00<?, ? examples/s]

In [39]:
print(f"Total blocks for training: {len(lm_datasets['train'])}")

Total blocks for training: 3961


In [40]:
print(f"   üßê –ü—Ä–æ–≤–µ—Ä–∫–∞ –∫–æ–ª–æ–Ω–æ–∫: {lm_datasets['train'].column_names}")
if "word_ids" not in lm_datasets['train'].column_names:
    raise ValueError("‚ùå –û—à–∏–±–∫–∞! –ö–æ–ª–æ–Ω–∫–∞ word_ids –∏—Å—á–µ–∑–ª–∞ –ø—Ä–∏ –ø—Ä–æ—Ü–µ—Å—Å–∏–Ω–≥–µ.")

   üßê –ü—Ä–æ–≤–µ—Ä–∫–∞ –∫–æ–ª–æ–Ω–æ–∫: ['input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask', 'word_ids']


In [41]:
data_collator = DataCollatorForWholeWordMask(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15
)

In [45]:
training_args = TrainingArguments(
    save_safetensors=False,
    remove_unused_columns=False,
    output_dir=OUTPUT_DIR,
    num_train_epochs=15,             # –°—Ç–∞–≤–∏–º 15 —ç–ø–æ—Ö –¥–ª—è –Ω–∞–¥–µ–∂–Ω–æ—Å—Ç–∏
    per_device_train_batch_size=8,   # 8 –¥–ª—è T4 GPU (–µ—Å–ª–∏ –≤—ã–ª–µ—Ç–∞–µ—Ç OOM, —Å—Ç–∞–≤—å 4)
    gradient_accumulation_steps=2,   # –ù–∞–∫–∞–ø–ª–∏–≤–∞–µ–º –≥—Ä–∞–¥–∏–µ–Ω—Ç (—ç—Ñ—Ñ–µ–∫—Ç–∏–≤–Ω—ã–π –±–∞—Ç—á = 16)
    learning_rate=5e-5,
    weight_decay=0.01,
    save_steps=500,
    save_total_limit=2,
    fp16=True,                       # –í–∫–ª—é—á–∞–µ–º Mixed Precision (—É—Å–∫–æ—Ä–µ–Ω–∏–µ)
    logging_steps=50,
    report_to="none")

In [46]:
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=lm_datasets["train"],
    tokenizer =tokenizer # –ü–µ—Ä–µ–¥–∞–µ–º —Ç–æ–∫–µ–Ω–∏–∑–∞—Ç–æ—Ä, —á—Ç–æ–±—ã –æ–Ω —Å–æ—Ö—Ä–∞–Ω–∏–ª—Å—è –≤ —Ñ–∏–Ω–∞–ª–µ
)

In [47]:
trainer.train()

Step,Training Loss
50,4.4533
100,4.3126
150,4.2781
200,4.1735
250,4.0692
300,3.9985
350,3.9846


KeyboardInterrupt: 

In [None]:
trainer.save_model(OUTPUT_DIR)

In [None]:
print("\n>>> –¢–ï–°–¢–ò–†–£–ï–ú –ú–û–î–ï–õ–¨:")
fill_mask = pipeline("fill-mask", model=OUTPUT_DIR, tokenizer=OUTPUT_DIR, device=0)

test_phrase = "–ø–æ–∫–ª–æ–Ω–æ —ø [MASK] –∫–æ –æ—Å–ø–æ–¥–∏–Ω—É"
print(f"–ó–∞–ø—Ä–æ—Å: {test_phrase}")
res = fill_mask(test_phrase)
for r in res:
    print(f"üîπ {r['token_str']} ({r['score']:.1%})")