In [2]:
from transformers import LlamaConfig
from huggingface_hub import notebook_login
from transformers import LlamaForCausalLM
from transformers import PreTrainedTokenizerFast
from datasets import load_from_disk, load_dataset
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
import wandb
import numpy as np
import torch
import evaluate

2025-03-16 18:17:15.739614: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1742138235.764911 4001760 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1742138235.773214 4001760 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-16 18:17:15.796210: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Make each visible CUDA device current in turn and release PyTorch's cached,
# currently unused GPU memory on it. After the loop the last device index
# remains the current device.
for device in range(torch.cuda.device_count()):
    torch.cuda.set_device(device)
    torch.cuda.empty_cache()

In [3]:
# Interactive Hugging Face Hub login (renders a widget in the notebook).
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
# Authenticate with Weights & Biases; training metrics are reported there.
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mvladimirshilonosov2[0m ([33mvladimirshilonosov2-itmo[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [4]:
# Load the "train" split of the `danasone/wikipedia_ru` dataset from the Hub.
dataset = load_dataset("danasone/wikipedia_ru", split="train")


def get_training_corpus():
    """Yield batches of raw texts sampled from the global ``dataset``.

    Starting at row 0, a slice of up to 1000 consecutive rows is taken, then
    the cursor jumps ahead by 100 000 rows, so only a fraction of the corpus
    is visited. Each yielded item is the ``"text"`` column of one slice
    (a list of strings); the last slice may be shorter than 1000 rows.
    """
    batch_rows = 1000
    stride = batch_rows * 100
    total_rows = len(dataset)
    start = 0
    while start < total_rows:
        yield dataset[start : start + batch_rows]["text"]
        start += stride

Resolving data files:   0%|          | 0/21 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/21 [00:00<?, ?it/s]

In [23]:
# Run the tokenizer-training script in a subprocess.
# NOTE(review): presumably this writes custom_ru_tokenizer.json, which a later
# cell loads — confirm against train_tokenizer.py.
!python train_tokenizer.py

Resolving data files: 100%|█████████████████████| 21/21 [00:00<00:00, 52.00it/s]
Loading dataset shards: 100%|█████████████████| 21/21 [00:00<00:00, 1274.11it/s]
[2K[00:12:22] Pre-processing sequences       ██████████░░░░░░░░ 1158950  /  1925386^C
[2K[00:12:22] Pre-processing sequences       ██████████████████ 1925386  /  1925386

In [5]:
# Wrap the trained tokenizer file and declare its special tokens.
tokenizer = PreTrainedTokenizerFast(
    tokenizer_file='custom_ru_tokenizer.json',
    eos_token="<|endoftext|>",
    pad_token="<|padding|>",
    bos_token="<|begin|>"
)
# Rebind the pad token to the EOS token; from here on padding uses EOS.
# NOTE(review): this supersedes the "<|padding|>" pad_token passed to the
# constructor above — confirm which of the two is intended.
tokenizer.pad_token = tokenizer.eos_token

In [6]:
# Smoke test: split a short Russian sentence into sub-word tokens and show them.
encoding = tokenizer.tokenize("Проведём тестовую токенизацию текста!")
print(encoding)

['Про', 'вед', 'ём', 'те', 'сто', 'вую', 'то', 'ке', 'ни', 'за', 'цию', 'текста', '!']


In [7]:
# Sequence-length constants, in tokens.
# SMALL_PART_SIZE: max_length passed to the tokenizer in tokenize_small_parts.
SMALL_PART_SIZE = 512
# CONTEXT_SIZE: model context window (max_position_embeddings); the padding /
# no-padding tokenize helpers keep sequences of up to CONTEXT_SIZE - 256 tokens.
CONTEXT_SIZE = 4096

In [8]:
# Architecture of the LLaMA-style decoder that is trained from scratch.
custom_config = LlamaConfig(
    vocab_size=32000,
    hidden_size=896,
    intermediate_size=3584,
    num_hidden_layers=16,
    num_attention_heads=16,
    num_key_value_heads=8,  # 16 query heads share 8 key/value heads
    max_position_embeddings=CONTEXT_SIZE,
    rope_theta=10000.0,
    attention_bias=False,
    # Take every special-token id from the custom tokenizer. Without the
    # explicit bos/eos arguments LlamaConfig falls back to its built-in
    # defaults (bos_token_id=1, eos_token_id=2), which need not correspond to
    # "<|begin|>" / "<|endoftext|>" in this vocabulary and would make
    # generation stop on (or start with) the wrong token.
    pad_token_id=tokenizer.pad_token_id,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    tie_word_embeddings=True,
    initializer_range=1.5e-4
)

In [None]:
# Build the model from the config alone (no checkpoint is loaded here).
model = LlamaForCausalLM(custom_config)

In [None]:
# Report the parameter count with thousands separators
# (the printed label is Russian for "Model parameters").
print(f"Параметров модели: {model.num_parameters():,}")

In [10]:
def tokenize_small_parts(element):
    """Tokenize a batch of texts into chunks of exactly SMALL_PART_SIZE tokens.

    The tokenizer is asked to truncate every text to ``SMALL_PART_SIZE`` tokens
    and to also return the overflowing pieces, so one text can produce several
    sequences. Only sequences whose reported length equals ``SMALL_PART_SIZE``
    are kept; shorter ones (e.g. the tail of a text) are discarded.

    Args:
        element: Batch mapping with a ``"text"`` entry holding the raw texts.

    Returns:
        dict: ``{"input_ids": [...]}`` with one token-id sequence per kept chunk.
    """
    outputs = tokenizer(
        element["text"],
        truncation=True,
        max_length=SMALL_PART_SIZE,
        return_overflowing_tokens=True,
        return_length=True,
    )
    # Compare against SMALL_PART_SIZE, the same limit that was passed as
    # max_length. The previous code compared against `context_length`, a name
    # that is never defined in this notebook, and raised NameError.
    input_batch = [
        input_ids
        for length, input_ids in zip(outputs["length"], outputs["input_ids"])
        if length == SMALL_PART_SIZE
    ]
    return {"input_ids": input_batch}

def tokenize_padding(element):
    """Tokenize a batch of texts, left-padding each one to a fixed length.

    The target length is ``CONTEXT_SIZE - 256``. Truncation is disabled, so a
    text with more tokens than the target keeps its full length; such rows are
    filtered out here because only sequences whose reported length equals the
    target are returned.

    Args:
        element: Batch mapping with a ``"text"`` entry holding the raw texts.

    Returns:
        dict: ``{"input_ids": [...]}`` where every entry is a plain Python list
        of ``CONTEXT_SIZE - 256`` token ids. No other tokenizer output
        (e.g. an attention mask) is passed on.
    """
    target_len = CONTEXT_SIZE - 256

    encoded = tokenizer(
        element["text"],
        truncation=False,
        max_length=target_len,
        return_length=True,
        padding="max_length",
        padding_side='left',
        return_tensors='np'
    )

    # Keep only the rows that ended up exactly at the target length and turn
    # each numpy row into a list.
    kept_rows = [
        row.tolist()
        for n_tokens, row in zip(encoded["length"], encoded["input_ids"])
        if n_tokens == target_len
    ]
    return {"input_ids": kept_rows}

def tokenize_no_padding(element):
    """Tokenize a batch of texts without padding or truncation.

    Sequences are returned at their natural length; any text whose reported
    token count exceeds ``CONTEXT_SIZE - 256`` is dropped.

    Args:
        element: Batch mapping with a ``"text"`` entry holding the raw texts.

    Returns:
        dict: ``{"input_ids": [...]}`` with one variable-length Python list of
        token ids per kept text.
    """
    length_limit = CONTEXT_SIZE - 256

    encoded = tokenizer(
        element["text"],
        truncation=False,
        return_length=True,
        return_tensors='np'
    )

    lengths = encoded["length"]
    sequences = encoded["input_ids"]
    # Filter by length and convert each numpy row into a list.
    return {
        "input_ids": [
            seq.tolist()
            for n_tokens, seq in zip(lengths, sequences)
            if n_tokens <= length_limit
        ]
    }

In [11]:
# Temporarily shrink CONTEXT_SIZE so that CONTEXT_SIZE - 256 == 16 and the
# tokenize helpers can be smoke-tested on one short sentence.
CONTEXT_SIZE = 256 + 16

In [12]:
# Smoke test: the short sentence should come back left-padded to 16 ids.
print(tokenize_padding({'text': ['Проведём тестовую токенизацию текста!']}))

{'input_ids': [[0, 0, 0, 19088, 19335, 19218, 18605, 18651, 20307, 18598, 18720, 18588, 18619, 19262, 27949, 2]]}


In [13]:
# Smoke test: the same sentence without padding keeps its natural length.
print(tokenize_no_padding({'text': ['Проведём тестовую токенизацию текста!']}))

{'input_ids': [[19088, 19335, 19218, 18605, 18651, 20307, 18598, 18720, 18588, 18619, 19262, 27949, 2]]}


In [14]:
# Restore the real context size before tokenizing the full dataset.
CONTEXT_SIZE = 4096

In [15]:
# Tokenize the whole corpus with the fixed-length, left-padded helper using
# 16 worker processes. All original columns are removed, so only the
# "input_ids" column returned by the helper remains.
tokenized_dataset = dataset.map(
    tokenize_padding, 
    batched=True,
    remove_columns=dataset.column_names,
    num_proc=16
)
tokenized_dataset

Map (num_proc=16):   0%|          | 0/1925386 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids'],
    num_rows: 1881282
})

In [16]:
# Shuffle and hold out 5% of the rows as a test split (fixed seed).
# NOTE(review): the name is rebound from the Dataset to the split result, so
# re-running this cell alone operates on the already-split object — confirm
# that is acceptable or re-run the map cell first.
tokenized_dataset = tokenized_dataset.train_test_split(
    test_size=0.05,
    shuffle=True,
    seed=42
)
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 1787217
    })
    test: Dataset({
        features: ['input_ids'],
        num_rows: 94065
    })
})

In [17]:
# Tokenize the whole corpus with the variable-length (unpadded) helper using
# 16 worker processes. All original columns are removed, so only the
# "input_ids" column returned by the helper remains.
tokenized_dataset_no_pad = dataset.map(
    tokenize_no_padding, 
    batched=True,
    remove_columns=dataset.column_names,
    num_proc=16
)
tokenized_dataset_no_pad

Map (num_proc=16):   0%|          | 0/1925386 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids'],
    num_rows: 1881282
})

In [18]:
# Shuffle and hold out 5% of the unpadded rows as a test split, with the same
# test_size and seed as the padded dataset's split.
# NOTE(review): the name is rebound from the Dataset to the split result, so
# re-running this cell alone operates on the already-split object — confirm.
tokenized_dataset_no_pad = tokenized_dataset_no_pad.train_test_split(
    test_size=0.05,
    shuffle=True,
    seed=42
)
tokenized_dataset_no_pad

DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 1787217
    })
    test: Dataset({
        features: ['input_ids'],
        num_rows: 94065
    })
})

In [19]:
# Persist the padded train/test splits so tokenization need not be repeated.
tokenized_dataset.save_to_disk("tokenized_dataset")

Saving the dataset (0/55 shards):   0%|          | 0/1787217 [00:00<?, ? examples/s]

Saving the dataset (0/3 shards):   0%|          | 0/94065 [00:00<?, ? examples/s]

In [20]:
# Persist the unpadded train/test splits so tokenization need not be repeated.
tokenized_dataset_no_pad.save_to_disk("tokenized_dataset_no_pad")

Saving the dataset (0/9 shards):   0%|          | 0/1787217 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/94065 [00:00<?, ? examples/s]

In [18]:
# Reload the padded splits from the directory written by save_to_disk.
tokenized_dataset = load_from_disk('tokenized_dataset')

Loading dataset from disk:   0%|          | 0/55 [00:00<?, ?it/s]

In [19]:
# Reload the unpadded splits from the directory written by save_to_disk.
tokenized_dataset_no_pad = load_from_disk('tokenized_dataset_no_pad')

In [20]:
# Batch collator for language modelling; mlm=False selects causal
# (next-token) modelling rather than masked-token modelling.
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [21]:
# Sanity check: collate the first five padded training rows and print the
# shape of every tensor in the resulting batch.
out = data_collator([tokenized_dataset['train'][i] for i in range(5)])
for key in out:
    print(f"{key} shape: {out[key].shape}")

input_ids shape: torch.Size([5, 3840])
attention_mask shape: torch.Size([5, 3840])
labels shape: torch.Size([5, 3840])


In [22]:
# Inspect the attention mask the collator produced for the sample batch.
out['attention_mask']

tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]])

In [23]:
# Training configuration for the from-scratch pre-training run.
# One optimizer step accumulates per_device_train_batch_size (1) x
# gradient_accumulation_steps (170) sequences per device.
# NOTE(review): evaluation runs every 100 optimizer steps over the whole test
# split at batch size 1 — confirm that this evaluation cost is intended.
args = TrainingArguments(
    output_dir="Llama-ru-220M",
    hub_model_id="NLPVladimir/Llama-ru-220M",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    eval_strategy="steps",
    eval_steps=100,
    logging_steps=1,
    gradient_accumulation_steps=170,
    # Recompute activations during the backward pass instead of keeping them
    # in memory. The recorded run of this notebook stopped with a CUDA
    # out-of-memory error on a ~11 GiB GPU even at batch size 1; this trades
    # extra compute for a much smaller activation footprint.
    gradient_checkpointing=True,
    num_train_epochs=1,
    weight_decay=0.001,
    warmup_steps=100,
    lr_scheduler_type="constant_with_warmup",
    learning_rate=1e-3,
    save_steps=100,
    fp16=True,
    push_to_hub=True,
    run_name='Llama-ru-220M_pretraining',
    report_to="wandb",
    optim="sgd"
)

# Train and evaluate on the unpadded splits; the collator builds each batch
# (and its labels) at run time.
trainer = Trainer(
    model=model,
    processing_class=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset_no_pad["train"],
    eval_dataset=tokenized_dataset_no_pad["test"],
)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


[2025-03-16 18:15:41,962] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/home/vshilonosov/miniconda3/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/home/vshilonosov/miniconda3/compiler_compat/ld: cannot find -lcufile: No such file or directory
collect2: error: ld returned 1 exit status


In [24]:
# Start the training loop.
# NOTE: the output recorded for this cell is a CUDA OutOfMemoryError on a
# ~11 GiB GPU, i.e. the saved run did not complete.
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mvladimirshilonosov2[0m ([33mvladimirshilonosov2-itmo[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.




OutOfMemoryError: CUDA out of memory. Tried to allocate 1.17 GiB. GPU 0 has a total capacity of 10.90 GiB of which 863.75 MiB is free. Including non-PyTorch memory, this process has 10.02 GiB memory in use. Of the allocated memory 7.83 GiB is allocated by PyTorch, and 1.96 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# Upload the trainer's model to the Hugging Face Hub repository configured
# through hub_model_id in TrainingArguments.
trainer.push_to_hub()