In [1]:
from transformers import LlamaConfig
from huggingface_hub import notebook_login
from transformers import LlamaForCausalLM
from transformers import PreTrainedTokenizerFast
from datasets import load_from_disk, load_dataset
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
import wandb
import numpy as np
import torch
import evaluate

2025-03-20 21:02:12.718090: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1742493732.736054 1547316 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1742493732.744712 1547316 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-20 21:02:12.768133: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Release PyTorch's cached allocator blocks on every visible GPU.
# `torch.cuda.device` switches the current device only for the duration of the
# `with` body and restores the previous one afterwards, so this cell no longer
# leaves the last GPU selected as the process-wide current device.
for device in range(torch.cuda.device_count()):
    with torch.cuda.device(device):
        torch.cuda.empty_cache()

In [3]:
# Authenticate with the Hugging Face Hub through the interactive notebook widget.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
# Log in to Weights & Biases; the call's return value is displayed as the cell output.
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mvladimirshilonosov2[0m ([33mvladimirshilonosov2-itmo[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [13]:
# Load the "train" split of the danasone/wikipedia_ru dataset.
dataset = load_dataset("danasone/wikipedia_ru", split="train")


def get_training_corpus():
    """Yield a sparse sample of the corpus as lists of article texts.

    Reads the module-level ``dataset``. Every 100,000 rows it emits one chunk
    holding the "text" values of the next 1,000 rows (fewer at the tail), so
    only about 1% of the rows are ever yielded.
    """
    chunk_rows = 1000
    stride = chunk_rows * 100
    total_rows = len(dataset)

    start = 0
    while start < total_rows:
        chunk = dataset[start : start + chunk_rows]
        yield chunk["text"]
        start += stride

Resolving data files:   0%|          | 0/21 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/21 [00:00<?, ?it/s]

In [4]:
# Run the external tokenizer-training script in a subshell; its saved output
# shows the word-tokenization, pair-counting and merge-computation stages.
# NOTE(review): presumably this is what writes custom_ru_tokenizer.json, which
# the tokenizer cell loads - confirm against the script.
!python train_tokenizer.py

Resolving data files: 100%|█████████████████████| 21/21 [00:00<00:00, 48.87it/s]
Loading dataset shards: 100%|█████████████████| 21/21 [00:00<00:00, 1533.19it/s]
[2K[00:00:12] Tokenize words                 ██████████████████ 8175842  /  8175842
[2K[00:00:31] Count pairs                    ██████████████████ 8175842  /  8175842
[2K[00:01:14] Compute merges                 ██████████████████ 29944    /    29944


In [14]:
# Wrap the serialized tokenizer stored in custom_ru_tokenizer.json.
tokenizer = PreTrainedTokenizerFast(tokenizer_file='custom_ru_tokenizer.json')

# Sequence-boundary, unknown, padding and mask tokens, plus three extra
# markers (<|user|>, <|bot|>, <|end|>) registered as additional special tokens.
special_tokens = dict(
    bos_token="<|bos|>",
    eos_token="<|eos|>",
    unk_token="<|unk|>",
    pad_token="<|pad|>",
    mask_token="<|mask|>",
    additional_special_tokens=["<|user|>", "<|bot|>", "<|end|>"],
)

# Kept as the cell's final expression so the call's return value is displayed.
tokenizer.add_special_tokens(special_tokens)

0

In [15]:
# Chunk length (in tokens) used as `max_length` by `tokenize_small_parts`.
SMALL_PART_SIZE = 512
# Model context window: used as `max_position_embeddings` in the Llama config
# and as the basis of the length cut-off in `tokenize`.
CONTEXT_SIZE = 4096

In [16]:
# Architecture of the model trained in this notebook: 16 layers, hidden size
# 896, grouped-query attention (16 query heads sharing 8 key/value heads) and
# input/output embeddings tied together.
#
# The BOS/EOS ids are now taken from the tokenizer, exactly like the PAD id.
# Left unset, LlamaConfig falls back to its own default ids, which need not
# match this custom tokenizer's <|bos|>/<|eos|> and would make generation
# start and stop on the wrong tokens.
#
# NOTE(review): vocab_size is hard-coded; it must be at least len(tokenizer).
# NOTE(review): initializer_range=1.5e-4 is far below the library default -
# confirm it is intentional.
custom_config = LlamaConfig(
    vocab_size=32000,
    hidden_size=896,
    intermediate_size=3584,
    num_hidden_layers=16,
    num_attention_heads=16,
    num_key_value_heads=8,
    max_position_embeddings=CONTEXT_SIZE,
    rope_theta=10000.0,
    attention_bias=False,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
    tie_word_embeddings=True,
    initializer_range=1.5e-4
)

In [17]:
# Build the causal-LM directly from the config (no pretrained checkpoint is loaded).
model = LlamaForCausalLM(custom_config)

In [18]:
# Report the model's parameter count with thousands separators.
print(f"Параметров модели: {model.num_parameters():,}")

Параметров модели: 221,377,408


In [19]:
def tokenize_small_parts(element):
    """Split a batch of texts into fixed-size token chunks.

    Each text in ``element["text"]`` is tokenized with truncation at
    ``SMALL_PART_SIZE`` tokens, and the overflow is returned as further
    chunks. Only chunks that are exactly ``SMALL_PART_SIZE`` tokens long are
    kept, so the shorter tail of every text is dropped.

    The length check previously compared against ``context_length``, a name
    that is not defined anywhere in the notebook, so the function raised
    NameError on first use.

    Args:
        element: Batch mapping with a "text" entry holding a list of strings.

    Returns:
        ``{"input_ids": [...]}`` with one token-id list per retained chunk.
    """
    outputs = tokenizer(
        element["text"],
        truncation=True,
        max_length=SMALL_PART_SIZE,
        return_overflowing_tokens=True,
        return_length=True,
    )
    input_batch = [
        input_ids
        for length, input_ids in zip(outputs["length"], outputs["input_ids"])
        if length == SMALL_PART_SIZE
    ]
    return {"input_ids": input_batch}

def tokenize(element):
    """Tokenize whole documents and drop the ones that are too long.

    Every text in ``element["text"]`` is tokenized without truncation. A
    document is kept only if its token count is at most
    ``CONTEXT_SIZE - 256 - 2``; kept documents are wrapped in the tokenizer's
    BOS and EOS ids.

    The tokenizer is now asked for plain Python lists. The earlier
    ``return_tensors='np'`` built NumPy arrays from ragged, unpadded
    sequences only to convert each one straight back with ``.tolist()``.

    Args:
        element: Batch mapping with a "text" entry holding a list of strings.

    Returns:
        ``{"input_ids": [...]}`` with one ``[bos, *ids, eos]`` list per
        retained document.
    """
    # Head-room kept free below CONTEXT_SIZE; the collator in this notebook
    # applies the same 4096 - 256 cap. NOTE(review): purpose of the 256
    # reserved positions is not stated in the notebook - confirm.
    reserved_tokens = 256
    boundary_tokens = 2  # the BOS and EOS ids added below
    max_length = CONTEXT_SIZE - reserved_tokens - boundary_tokens

    outputs = tokenizer(
        element["text"],
        truncation=False,
        return_length=True,
    )

    bos_id = tokenizer.bos_token_id
    eos_id = tokenizer.eos_token_id
    out_batch = [
        [bos_id, *input_ids, eos_id]
        for length, input_ids in zip(outputs["length"], outputs["input_ids"])
        if length <= max_length
    ]

    return {"input_ids": out_batch}

In [9]:
# Apply `tokenize` to the corpus in batched mode across 16 worker processes,
# removing the original columns so only the returned "input_ids" remain.
# NOTE(review): this needs `dataset` from the loading cell; the saved output
# is a NameError because the cell was run in a kernel where it was not defined.
tokenized_dataset = dataset.map(
    tokenize, 
    batched=True,
    remove_columns=dataset.column_names,
    num_proc=16
)
tokenized_dataset

NameError: name 'dataset' is not defined

In [11]:
# Shuffle with a fixed seed and hold out 5% of the examples as the "test" split.
# NOTE(review): the name is rebound from the flat dataset to the split result,
# so this cell is meant to run exactly once after the map cell.
tokenized_dataset = tokenized_dataset.train_test_split(
    test_size=0.05,
    shuffle=True,
    seed=42
)
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 1791988
    })
    test: Dataset({
        features: ['input_ids'],
        num_rows: 94316
    })
})

In [12]:
# Persist both splits to the local "tokenized_dataset" directory.
tokenized_dataset.save_to_disk("tokenized_dataset")

Saving the dataset (0/8 shards):   0%|          | 0/1791988 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/94316 [00:00<?, ? examples/s]

In [20]:
# Reload the splits saved above, so later cells can run without re-tokenizing.
tokenized_dataset = load_from_disk('tokenized_dataset')

In [21]:
class DiffSizeDataCollator(DataCollatorForLanguageModeling):
    """Collator that pads each batch only as far as its own longest example.

    The batch length is the longest example rounded up to a multiple of
    ``pad_to_multiple_of`` and capped at ``max_length``. Labels are a copy of
    ``input_ids`` with padding positions set to -100.

    Previously the computed target length was handed to ``tokenizer.pad`` as
    ``pad_to_multiple_of``. That only gave the intended width while the
    longest example fitted under the cap; for a longer example the batch
    would have been padded up to the next multiple of the cap. The target is
    now passed as an exact ``max_length``, and examples longer than the cap
    are clipped so the cap actually holds.

    Args:
        tokenizer: Tokenizer providing ``pad`` and ``pad_token_id``.
        padding_side: Side on which padding is added ('left' by default).
        pad_to_multiple_of: Round the batch length up to a multiple of this
            value; ``None`` disables rounding.
        max_length: Upper bound on the batch length; ``None`` disables the
            cap. Defaults to the notebook's previous hard-coded 4096 - 256.
        **kwargs: Forwarded to ``DataCollatorForLanguageModeling``.
    """

    def __init__(self, tokenizer, padding_side='left', pad_to_multiple_of=8,
                 max_length=4096 - 256, **kwargs):
        super().__init__(tokenizer, pad_to_multiple_of=pad_to_multiple_of, **kwargs)
        self.padding_side = padding_side
        self.pad_to_multiple_of = pad_to_multiple_of
        self.max_length = max_length

    def _clip(self, feature):
        """Return ``feature`` with its token fields cut to ``self.max_length``."""
        if self.max_length is None or len(feature['input_ids']) <= self.max_length:
            return feature
        clipped = dict(feature)
        for key in ('input_ids', 'attention_mask'):
            if key in clipped:
                clipped[key] = clipped[key][:self.max_length]
        return clipped

    def __call__(self, features):
        """Pad ``features`` into one batch of tensors and attach ``labels``."""
        features = [self._clip(feature) for feature in features]
        longest = max(len(feature['input_ids']) for feature in features)

        if self.pad_to_multiple_of is not None:
            multiple = self.pad_to_multiple_of
            target_length = ((longest + multiple - 1) // multiple) * multiple
        else:
            target_length = longest

        # After clipping, `longest` never exceeds the cap, so the capped
        # target is still long enough to hold every example.
        if self.max_length is not None:
            target_length = min(self.max_length, target_length)

        batch = self.tokenizer.pad(
            features,
            padding='max_length',
            max_length=target_length,
            return_tensors='pt',
            padding_side=self.padding_side
        )

        # -100 marks positions the loss should skip.
        labels = batch['input_ids'].clone()
        labels[labels == self.tokenizer.pad_token_id] = -100
        batch['labels'] = labels

        return batch

In [22]:
# Collator for causal language modelling (mlm=False turns masked-LM mode off).
data_collator = DiffSizeDataCollator(tokenizer, mlm=False)

In [23]:
# Smoke-test the collator on the first eight training examples and report the
# shape of every tensor in the resulting batch.
first_examples = [tokenized_dataset['train'][idx] for idx in range(8)]
out = data_collator(first_examples)
for key, tensor in out.items():
    print(f"{key} shape: {tensor.shape}")

You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


input_ids shape: torch.Size([8, 3024])
attention_mask shape: torch.Size([8, 3024])
labels shape: torch.Size([8, 3024])


In [24]:
GPU_COUNT = 3
BATCH_PER_GPU = 1
# Target number of sequences consumed per optimizer update (effective batch).
STEPS_TO_UPDATE = 512
EVALS_PER_EPOCH = 25

# Micro-batches to accumulate so that
#   GPU_COUNT * BATCH_PER_GPU * GRAD_ACCUM_STEPS ~= STEPS_TO_UPDATE.
# The earlier expression divided by GPU_COUNT twice (512 / 3 / 3 -> 56), which
# gave an effective batch of 168 instead of ~512 and made the eval/save
# cadence below fire about three times more often per epoch than intended.
GRAD_ACCUM_STEPS = max(1, STEPS_TO_UPDATE // (GPU_COUNT * BATCH_PER_GPU))

# Optimizer steps between evaluations/checkpoints: one epoch is roughly
# len(train) / STEPS_TO_UPDATE updates, split into EVALS_PER_EPOCH intervals.
# Clamped to at least 1 so a small dataset cannot produce a step count of 0.
EVAL_EVERY_STEPS = max(
    1, len(tokenized_dataset['train']) // STEPS_TO_UPDATE // EVALS_PER_EPOCH
)

args = TrainingArguments(
    output_dir="Llama-ru-220M",
    hub_model_id="NLPVladimir/Llama-ru-220M",
    per_device_train_batch_size=BATCH_PER_GPU,
    per_device_eval_batch_size=BATCH_PER_GPU,
    eval_strategy="steps",
    eval_steps=EVAL_EVERY_STEPS,
    logging_steps=1,
    gradient_accumulation_steps=GRAD_ACCUM_STEPS,
    num_train_epochs=1,
    weight_decay=0.001,
    warmup_steps=100,
    lr_scheduler_type="constant_with_warmup",
    learning_rate=1e-3,
    save_steps=EVAL_EVERY_STEPS,
    fp16=True,
    push_to_hub=True,
    run_name='Llama-ru-220M_pretraining',
    report_to="wandb",
    optim="sgd"
)

# Wire the model, tokenizer, collator and both dataset splits into a Trainer;
# the "test" split serves as the evaluation set during training.
trainer = Trainer(
    model=model,
    processing_class=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


[2025-03-20 21:04:25,562] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/home/vshilonosov/miniconda3/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/home/vshilonosov/miniconda3/compiler_compat/ld: cannot find -lcufile: No such file or directory
collect2: error: ld returned 1 exit status


In [25]:
# Start training with the arguments configured above.
# NOTE(review): the saved output of this cell ends in a CUDA out-of-memory
# error, so the recorded run did not complete.
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mvladimirshilonosov2[0m ([33mvladimirshilonosov2-itmo[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.




Step,Training Loss,Validation Loss


OutOfMemoryError: CUDA out of memory. Tried to allocate 1.31 GiB. GPU 0 has a total capacity of 10.90 GiB of which 1.23 GiB is free. Including non-PyTorch memory, this process has 9.63 GiB memory in use. Of the allocated memory 7.91 GiB is allocated by PyTorch, and 1.49 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# Upload the trainer's model to the Hub repository set via `hub_model_id`.
trainer.push_to_hub()

In [43]:
# Ad-hoc probe: token count of the training example at index 95 * 3 + 8 (= 293).
len(tokenized_dataset["train"][95 * 3 + 8]['input_ids'])

459