In [1]:
from transformers import LlamaConfig
from huggingface_hub import notebook_login
from transformers import LlamaForCausalLM
from transformers import PreTrainedTokenizerFast
from datasets import load_from_disk, load_dataset
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
import wandb
import torch
import evaluate

2025-03-15 20:56:14.587132: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1742061374.611255 3355456 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1742061374.616575 3355456 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-15 20:56:14.633978: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
for device in range(torch.cuda.device_count()):
    torch.cuda.set_device(device)
    torch.cuda.empty_cache()

In [3]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [2]:
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mvladimirshilonosov2[0m ([33mvladimirshilonosov2-itmo[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [3]:
dataset = load_dataset("danasone/wikipedia_ru", split="train")


def get_training_corpus():
    for i in range(0, len(dataset), 1000 * 100):
        yield dataset[i : i + 1000]["text"]

Resolving data files:   0%|          | 0/21 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/21 [00:00<?, ?it/s]

In [4]:
!python train_tokenizer.py

Resolving data files: 100%|█████████████████████| 21/21 [00:04<00:00,  5.25it/s]
Loading dataset shards: 100%|█████████████████| 21/21 [00:00<00:00, 1482.19it/s]
[2K[00:00:35] Tokenize words                 ██████████████████ 21027394 / 21027394
[2K[00:01:58] Count pairs                    ██████████████████ 21027394 / 21027394
[2K[00:03:22] Compute merges                 ██████████████████ 13414    /    13414


In [4]:
tokenizer = PreTrainedTokenizerFast(tokenizer_file='custom_ru_tokenizer.json', eos_token="<|endoftext|>")

In [5]:
dataset_sample = dataset.shuffle(seed=42).select(range(len(dataset) // 2))

In [6]:
encoding = tokenizer.tokenize("Проведём тестовую токенизацию текста!")
print(encoding)

['Про', 'вед', 'ём', 'те', 'сто', 'вую', 'то', 'ке', 'ни', 'за', 'цию', 'текста', '!']


In [7]:
custom_config = LlamaConfig(
    vocab_size=32000,
    hidden_size=896,
    intermediate_size=3584,
    num_hidden_layers=16,
    num_attention_heads=16,
    num_key_value_heads=8,
    max_position_embeddings=2048,
    rope_theta=10000.0,
    attention_bias=False,
    pad_token_id=None
)

In [8]:
model = LlamaForCausalLM(custom_config)

In [9]:
print(f"Параметров модели: {model.num_parameters():,}")

Параметров модели: 250,049,408


In [10]:
context_length = 512

outputs = tokenizer(
    dataset_sample[:2]["text"],
    truncation=True,
    max_length=context_length,
    return_overflowing_tokens=True,
    return_length=True,
)

print(f"Input IDs length: {len(outputs['input_ids'])}")
print(f"Input chunk lengths: {(outputs['length'])}")
print(f"Chunk mapping: {outputs['overflow_to_sample_mapping']}")

Input IDs length: 2
Input chunk lengths: [425, 155]
Chunk mapping: [0, 1]


In [11]:
def tokenize(element):
    outputs = tokenizer(
        element["text"],
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=True,
        return_length=True,
    )
    input_batch = []
    for length, input_ids in zip(outputs["length"], outputs["input_ids"]):
        if length == context_length:
            input_batch.append(input_ids)
    return {"input_ids": input_batch}

In [12]:
tokenized_dataset = dataset_sample.map(
    tokenize, batched=True, remove_columns=dataset_sample.column_names,
    num_proc=16
)
tokenized_dataset

Map (num_proc=16):   0%|          | 0/962693 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids'],
    num_rows: 924930
})

In [13]:
tokenized_dataset = tokenized_dataset.train_test_split(
    test_size=0.05,
    shuffle=True,
    seed=42
)
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 878683
    })
    test: Dataset({
        features: ['input_ids'],
        num_rows: 46247
    })
})

In [14]:
tokenized_dataset.save_to_disk("tokenized_dataset")

Saving the dataset (0/4 shards):   0%|          | 0/878683 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/46247 [00:00<?, ? examples/s]

In [15]:
tokenized_dataset = load_from_disk('tokenized_dataset')

In [16]:
tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [17]:
out = data_collator([tokenized_dataset['train'][i] for i in range(5)])
for key in out:
    print(f"{key} shape: {out[key].shape}")

input_ids shape: torch.Size([5, 512])
attention_mask shape: torch.Size([5, 512])
labels shape: torch.Size([5, 512])


In [18]:
args = TrainingArguments(
    output_dir="Llama-ru-250M",
    hub_model_id="NLPVladimir/Llama-ru-250M",
    per_device_train_batch_size=6,
    per_device_eval_batch_size=6,
    eval_strategy="steps",
    eval_steps=200,
    logging_steps=20,
    gradient_accumulation_steps=12,
    num_train_epochs=1,
    weight_decay=0.1,
    warmup_steps=500,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    save_steps=200,
    fp16=True,
    push_to_hub=True,
    run_name='Llama-ru-250M_pretraining',
    report_to="wandb",
    optim="sgd"
)

trainer = Trainer(
    model=model,
    processing_class=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


[2025-03-15 20:59:44,241] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/home/vshilonosov/miniconda3/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/home/vshilonosov/miniconda3/compiler_compat/ld: cannot find -lcufile: No such file or directory
collect2: error: ld returned 1 exit status


In [None]:
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mvladimirshilonosov2[0m ([33mvladimirshilonosov2-itmo[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.




Step,Training Loss,Validation Loss
200,126.6161,10.551748




In [None]:
trainer.push_to_hub()