In [None]:
!pip install transformers



In [None]:
import transformers

In [None]:
# Show the installed transformers version.
print(transformers.__version__)

5.0.0


In [None]:
from datasets import load_dataset

In [None]:
from transformers import DataCollatorForWholeWordMask, AutoTokenizer, AutoModelForMaskedLM, Trainer, TrainingArguments, BertTokenizer, BertTokenizerFast

In [None]:
from tokenizers import BertWordPieceTokenizer

In [None]:
import os
import shutil

In [None]:
# Hugging Face Hub id of the base checkpoint; the commented-out line is an alternative.
# MODEL_NAME = "DeepPavlov/rubert-base-cased" # or "DeepPavlov/bert-base-bg-cs-pl-ru-cased"
MODEL_NAME = "DeepPavlov/bert-base-bg-cs-pl-ru-cased"

In [None]:
# Directory passed to TrainingArguments, save_model and the final fill-mask pipeline.
OUTPUT_DIR = "old_russian_slavic_bert_finetune"

In [None]:
# Plain-text corpus: read for vocabulary mining, for tokenizer training and as the training dataset.
TRAIN_FILE = "final_dataset_ready.txt"

In [None]:
# Cap on how many newly trained tokens are kept (applied as a slice on safe_tokens).
VOCAB_EXPANSION_LIMIT = 4000

In [None]:
import requests

In [None]:
# Build the vocab URL from MODEL_NAME so that switching the base checkpoint
# cannot leave the vocabulary pointing at a different model.
vocab_url = f"https://huggingface.co/{MODEL_NAME}/resolve/main/vocab.txt"

In [None]:
# Download the base model's vocabulary (one token per line).
# The timeout keeps the cell from hanging, and raise_for_status() stops an HTTP
# error page from being silently parsed as if it were the vocabulary.
r = requests.get(vocab_url, timeout=60)
r.raise_for_status()
# Decode the bytes explicitly as UTF-8 instead of relying on r.text, whose
# codec is guessed from the response headers.
original_vocab = r.content.decode("utf-8").splitlines()
print(f"   Оригинальный размер: {len(original_vocab)} токенов")

   Оригинальный размер: 119547 токенов


In [None]:
# Read the whole training corpus into memory as one UTF-8 string.
with open(TRAIN_FILE, mode="r", encoding="utf-8") as corpus_file:
    text_content = corpus_file.read()

In [None]:
# Set copy of the base vocabulary for fast membership tests.
existing_vocab_set = set(original_vocab)
# Tokens that get written after the original vocabulary in the extended vocab file.
tokens_to_append = []

In [None]:
# Individual characters that disqualify a candidate word (punctuation, quotes, hyphen, line breaks, tab).
forbidden = set(".,;!?:()[]\"'«»-\n\r\t")

In [None]:
# Every distinct character that occurs in the corpus.
unique_chars = set(text_content)

In [None]:
# Starts empty; a later cell reports its length as the number of new symbols to add.
missing_chars = []


In [None]:
# Non-whitespace corpus characters that the base vocabulary lacks.
# They are stored in `missing_chars` (which later cells pass to add_tokens) as
# well as queued for the extended vocab file. Sorting makes the order
# reproducible across runs, since set iteration order is not.
unique_chars = set(text_content)
missing_chars = sorted(
    c for c in unique_chars if c not in existing_vocab_set and c.strip()
)
# Skip anything already queued so that re-running the cell adds no duplicates.
already_queued = set(tokens_to_append)
tokens_to_append.extend(c for c in missing_chars if c not in already_queued)

In [None]:
from collections import Counter

In [None]:
# Whitespace-separated tokens of the corpus and how often each one occurs.
words = text_content.split()
word_counts = Counter(words)

In [None]:
# Queue the frequent multi-character words that the vocabulary lacks,
# skipping any word that contains a forbidden character (punctuation etc.).
tokens_to_append.extend(
    word
    for word, _count in word_counts.most_common(5000)
    if len(word) > 1
    and word not in existing_vocab_set
    and forbidden.isdisjoint(word)
)

In [None]:
# Report how many tokens were collected for appending.
print(f"   Найдено новых токенов: {len(tokens_to_append)}")

   Найдено новых токенов: 3677


In [None]:
# File the extended vocabulary (original + appended tokens) is written to.
NEW_VOCAB_FILE = "vocab_extended.txt"

In [None]:
# Extended vocabulary: the original tokens first (so their line positions are
# unchanged), followed by the additions; one token per line.
full_vocab = original_vocab + tokens_to_append

with open(NEW_VOCAB_FILE, "w", encoding="utf-8") as vocab_out:
    vocab_out.writelines(entry + "\n" for entry in full_vocab)

print("✅ Новый словарь готов на диске.")

✅ Новый словарь готов на диске.


In [None]:
# Build a cased (do_lower_case=False) BERT tokenizer from the extended vocab file,
# spelling out the special tokens explicitly.
# NOTE(review): the saved output of the next cell reports a tokenizer size of 5,
# which suggests the vocab file is not being picked up here — verify before relying on `tokenizer`.
tokenizer = BertTokenizerFast(
    vocab_file=NEW_VOCAB_FILE,
    do_lower_case=False,
    unk_token="[UNK]",
    sep_token="[SEP]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    mask_token="[MASK]"
)

In [None]:
# Sanity check: number of entries in the tokenizer.
print(f"   Размер токенизатора: {len(tokenizer)}")

   Размер токенизатора: 5


In [None]:
# Mount Google Drive at /content/drive (google.colab is only importable inside Colab).
from google.colab import drive
drive.mount('/content/drive')

## Adding symbols / letters

In [None]:
# Preview how many characters are about to be added, and the first ten of them.
print(f"Addding {len(missing_chars)} new symbols (letters): {missing_chars[:10]}...")

Addding 129 new symbols (letters): ['⟦', '҃', '\uf1c0', 'ѱ', 'Ѽ', 'ꙇ', 'Ѡ', '⁘', '꙽', '”']...


In [None]:
# Load the base checkpoint's own tokenizer before extending it: no visible cell
# defines `py_tokenizer`, so without this line the notebook fails on a fresh kernel.
py_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# add_tokens returns how many entries were actually new to the tokenizer.
py_tokenizer.add_tokens(missing_chars)

129

## Adding vocabulary / words

In [None]:
# Scratch WordPiece tokenizer with text cleaning, Chinese-character handling and lowercasing all switched off.
temp_tokenizer = BertWordPieceTokenizer(clean_text=False, handle_chinese_chars=False, lowercase=False)

In [None]:
# Train the scratch tokenizer on the corpus; its vocabulary is only used as a source of candidate tokens.
temp_tokenizer.train(
    files=[TRAIN_FILE],
    vocab_size=15000,
    min_frequency=5,
    limit_alphabet=1000,
    wordpieces_prefix="##",
    special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
)

In [None]:
# Order the candidates by their id in the trained vocabulary so that the later
# truncation to VOCAB_EXPANSION_LIMIT is deterministic and keeps the lowest-id
# entries, instead of depending on whatever key order get_vocab() returns.
trained_vocab = temp_tokenizer.get_vocab()
candidates = sorted(trained_vocab, key=trained_vocab.get)
safe_tokens = []
rejected_count = 0

In [None]:
# Number of candidate tokens produced by training.
print(len(candidates))

15000


In [None]:
# NOTE(review): no visible cell reads this module-level counter.
i = 0

In [None]:
from tqdm import tqdm # Pretty progress bar

In [None]:
# Current tokenizer vocabulary as a set, for constant-time membership checks in the filter loop.
existing_vocab = set(py_tokenizer.get_vocab().keys())

In [None]:
for token in tqdm(candidates):
    # Skip tokens the tokenizer already knows (set lookup) and single-character entries.
    if token in existing_vocab or len(token) < 2:
        continue

    # Drop the "##" continuation prefix before looking at how the text pre-tokenizes.
    text_to_check = token[2:] if token[:2] == "##" else token

    # Ask the backend pre-tokenizer how many pieces the text falls into.
    pieces = py_tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text_to_check)

    # More than one piece means the token would be split before it could ever match.
    if len(pieces) > 1:
        rejected_count += 1
    else:
        safe_tokens.append(token)

100%|██████████| 15000/15000 [00:00<00:00, 87426.50it/s]


In [None]:
# Keep at most VOCAB_EXPANSION_LIMIT of the accepted tokens (the first ones in list order).
safe_tokens = safe_tokens[:VOCAB_EXPANSION_LIMIT]

In [None]:
# Summary of the filter: tokens kept (after the cap above) versus candidates rejected.
print(f"   ✅ Одобрено: {len(safe_tokens)} слов.")
print(f"   ❌ Отброшено (Bad Split): {rejected_count} слов.")

   ✅ Одобрено: 4000 слов.
   ❌ Отброшено (Bad Split): 0 слов.


In [None]:
# Register the accepted tokens with the tokenizer; the displayed value is what add_tokens returns.
py_tokenizer.add_tokens(safe_tokens)

4000

In [None]:
# Persist the extended tokenizer to disk.
save_path = "./old_rus_tokenizer"
py_tokenizer.save_pretrained(save_path)

('./old_rus_tokenizer/tokenizer_config.json',
 './old_rus_tokenizer/tokenizer.json')

## Loading everything

In [None]:
# Sample phrases for spot checks.
# NOTE(review): no visible cell consumes test_phrases.
print("\n>>> ТЕСТ НА ОШИБКИ:")
test_phrases = [
    "свѣтъ ѿ свѣта бг҃а истинна",
    "поклоно ѿ онѳима ко осподину",
    "начѧтъ ѧсти мѧсо ѫгнѧ"
]


>>> ТЕСТ НА ОШИБКИ:


In [None]:
# Load the masked-LM checkpoint named by MODEL_NAME, rather than repeating the
# id as a literal, so that the model and the configuration constant cannot drift apart.
model = AutoModelForMaskedLM.from_pretrained(MODEL_NAME)

pytorch_model.bin:   0%|          | 0.00/714M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/204 [00:00<?, ?it/s]

BertForMaskedLM LOAD REPORT from: DeepPavlov/bert-base-bg-cs-pl-ru-cased
Key                          | Status     |  | 
-----------------------------+------------+--+-
cls.seq_relationship.bias    | UNEXPECTED |  | 
bert.embeddings.position_ids | UNEXPECTED |  | 
bert.pooler.dense.bias       | UNEXPECTED |  | 
bert.pooler.dense.weight     | UNEXPECTED |  | 
cls.seq_relationship.weight  | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


In [None]:
# Resize the model's token embeddings to the extended tokenizer's length.
model.resize_token_embeddings(len(py_tokenizer))

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Embedding(123676, 768, padding_idx=0)

In [None]:
# Load the corpus file with the "text" builder as a single "train" split.
dataset = load_dataset("text", data_files={"train": TRAIN_FILE})

In [None]:
def tokenize_function(examples):
    """Tokenize a batch of raw text rows for masked-LM training.

    Uses ``py_tokenizer`` — the tokenizer that received the added tokens and
    whose length the model's embeddings were resized to — so the produced ids
    line up with the embedding matrix.

    Args:
        examples: batch mapping with a ``"text"`` column of strings.

    Returns:
        The tokenizer's batch encoding, including the special-tokens mask.
    """
    return py_tokenizer(examples["text"], return_special_tokens_mask=True)

In [None]:
# Tokenize the whole dataset in batches.
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    num_proc=4,        # use 4 CPU cores to speed things up
    remove_columns=["text"] # drop the raw text, keep only the numeric fields
)

Map (num_proc=4):   0%|          | 0/71255 [00:00<?, ? examples/s]

In [None]:
from itertools import chain

block_size = 512


def group_texts(examples):
    """Concatenate a batch of tokenized rows and re-cut it into fixed-size blocks.

    Args:
        examples: mapping from column name to a list of per-row lists.

    Returns:
        A dict with the same keys whose values are lists of chunks of
        ``block_size`` items. When the batch holds at least ``block_size``
        items, the trailing remainder is dropped; a shorter batch comes back
        as a single, shorter chunk.
    """
    # chain.from_iterable flattens in linear time; sum(list_of_lists, [])
    # re-copies the accumulated list on every addition (quadratic).
    concatenated_examples = {
        k: list(chain.from_iterable(examples[k])) for k in examples.keys()
    }
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    if total_length >= block_size:
        # Round down to a whole number of blocks.
        total_length = (total_length // block_size) * block_size
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    return result

In [None]:
# Regroup the tokenized rows into fixed-size blocks, batch by batch, with num_proc=4.
# NOTE(review): the saved output of this cell ends in a TimeoutError raised from the worker pool — confirm the map completes.
lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    num_proc=4,
)

Map (num_proc=4):   0%|          | 0/71255 [00:00<?, ? examples/s]

Process ForkPoolWorker-9:
Process ForkPoolWorker-6:
Traceback (most recent call last):
Process ForkPoolWorker-7:
  File "/usr/local/lib/python3.12/dist-packages/multiprocess/process.py", line 314, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/multiprocess/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/local/lib/python3.12/dist-packages/multiprocess/pool.py", line 125, in worker
    result = (True, func(*args, **kwds))
                    ^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/multiprocess/process.py", line 314, in _bootstrap
    self.run()
  File "/usr/local/lib/python3.12/dist-packages/datasets/utils/py_utils.py", line 586, in _write_generator_to_queue
    for i, result in enumerate(func(**kwargs)):
                     ^^^^^^^^^^^^^^^^^^^^^^^^^
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/multiprocess/process.py

TimeoutError: 

In [None]:
# Number of rows in the grouped training split.
print(f"Total blocks for training: {len(lm_datasets['train'])}")

In [None]:
# Whole-word masking collator with a 15% masking probability.
# It has to use py_tokenizer (the extended tokenizer the model's embeddings were
# resized for); `tokenizer` reported a size of only 5 in the saved output.
data_collator = DataCollatorForWholeWordMask(
    tokenizer=py_tokenizer,
    mlm=True,
    mlm_probability=0.15
)

In [None]:
# Training configuration for the fine-tuning run; checkpoints go to OUTPUT_DIR.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    overwrite_output_dir=True,
    num_train_epochs=15,             # 15 epochs, to be on the safe side
    per_device_train_batch_size=8,   # 8 for a T4 GPU (drop to 4 if it runs out of memory)
    gradient_accumulation_steps=2,   # accumulate gradients (effective batch = 16)
    learning_rate=5e-5,
    weight_decay=0.01,
    save_steps=500,
    save_total_limit=2,
    fp16=True,                       # enable mixed precision (speed-up)
    logging_steps=50,
    report_to="none"
)

In [None]:
# Trainer over the resized model and the grouped training blocks.
# py_tokenizer is the tokenizer whose length sized the model's embeddings, so it
# is the one handed over here. It is passed as `processing_class`, the keyword
# that replaced the deprecated `tokenizer` argument.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=lm_datasets["train"],
    processing_class=py_tokenizer,  # handed over so it is saved along with the model
)

In [None]:
# Run the fine-tuning loop.
trainer.train()

In [None]:
# Save the final model to OUTPUT_DIR.
trainer.save_model(OUTPUT_DIR)

In [None]:
print("\n>>> ТЕСТИРУЕМ МОДЕЛЬ:")
# `pipeline` is never imported by name in this notebook, so reach it through the
# already-imported `transformers` module instead of hitting a NameError.
fill_mask = transformers.pipeline("fill-mask", model=OUTPUT_DIR, tokenizer=OUTPUT_DIR, device=0)

test_phrase = "поклоно ѿ [MASK] ко осподину"
print(f"Запрос: {test_phrase}")
res = fill_mask(test_phrase)
# A dedicated loop name avoids clobbering `r`, which holds the HTTP response from the vocab download.
for prediction in res:
    print(f"🔹 {prediction['token_str']} ({prediction['score']:.1%})")