In [None]:
!pip install -r requirements.txt
!pip install numpy==1.26.4 scikit-learn==1.3.2 --force-reinstall --no-cache-dir


In [None]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import LoraConfig, get_peft_model
from transformers import TrainingArguments, DataCollatorForLanguageModeling
from datasets import Dataset
from transformers import Trainer
from torch import autocast
import json
import re
from datasets import Dataset
from sklearn.model_selection import train_test_split
from huggingface_hub import login
import random
from transformers import DataCollatorWithPadding
from torch.nn.utils.rnn import pad_sequence

In [None]:
model_id = "google/gemma-3-4b-pt"
login(token="hf_...")

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token

# Load the model with bfloat16
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto"
)


lora_config = LoraConfig(
    r=32,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# Wrap the model in LoRA
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

In [4]:

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"CUDA version: {torch.version.cuda}")
print(f"GPU: {torch.cuda.get_device_name(0)}")
print(f"Total memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")

PyTorch version: 2.6.0+cu124
CUDA available: True
CUDA version: 12.4
GPU: NVIDIA A100-SXM4-80GB MIG 3g.40gb
Total memory: 39.25 GB


In [None]:
print('Started data processing')

from torch.nn import CrossEntropyLoss

class MyTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        input_ids = inputs["input_ids"].to(model.device)
        attention_mask = inputs.get("attention_mask", None)
        labels = inputs["labels"].to(model.device)

        # 1) Forward только за логитами
        with torch.autocast("cuda", dtype=torch.bfloat16):
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, return_dict=True)
            logits_bf16 = outputs.logits  # bf16

        # 2) К float32
        logits = logits_bf16.float()
        
        # 3) Shift для стандартного CLM
        shift_logits = logits[..., :-1, :].contiguous()
        shift_labels = labels[..., 1:].contiguous()

        # 4) Вычисляем loss на float32
        loss_fct = CrossEntropyLoss(ignore_index=-100)
        loss = loss_fct(
            shift_logits.view(-1, shift_logits.size(-1)),
            shift_labels.view(-1)
        )

        if return_outputs:
            return loss, outputs
        return loss


def apply_ragged_cut(text, tokenizer, min_len=128, max_len=512):
    # Токенизация с возвратом тензоров
    tokens = tokenizer(
        text,
        return_tensors="pt",
        truncation=False,
        add_special_tokens=True
    )
    
    input_ids = tokens["input_ids"][0]
    
    # Фильтрация слишком коротких текстов
    if len(input_ids) < min_len:
        return None
    
    # Обрезка до максимальной длины
    if len(input_ids) > max_len:
        input_ids = input_ids[:max_len]
        attention_mask = tokens["attention_mask"][0][:max_len]
    else:
        attention_mask = tokens["attention_mask"][0]
    
    return {
        "input_ids": input_ids.tolist(),
        "labels": input_ids.tolist(),  # Для LM labels = input_ids
        "attention_mask": attention_mask.tolist()
    }


all_samples = []

with open("books.jsonl", "r", encoding="utf-8") as f:
    cnt = 0
    for line in f:
        if (cnt + 1) % 5000 == 0:
            print("Step: ", cnt + 1)
        if cnt == 10000:
            break
        data = json.loads(line)
        raw_text = data["text"]
        sample = apply_ragged_cut(raw_text, tokenizer)

        if sample:  # если не None
            all_samples.append(sample)
        cnt += 1
        

train_data, val_data = train_test_split(all_samples, test_size=0.005, random_state=42)
dataset = Dataset.from_list(train_data)
eval_dataset = Dataset.from_list(val_data)
print('Finished')
print(len(all_samples))
print(len(dataset))
print("The number of steps per epoch: ", len(dataset)//(2))

Started data processing
Step:  5000
Step:  10000
Finished
10000
9950
The number of steps per epoch:  4975


In [None]:
print('Started training')

class SafeDataCollator(DataCollatorForLanguageModeling):
    def torch_call(self, examples):
        try:
            # Очистка кэша CUDA перед обработкой батча
            torch.cuda.empty_cache()
            
            batch = super().torch_call(examples)
            
            # Проверка на NaN/Inf
            for k, v in batch.items():
                if torch.is_tensor(v):
                    if torch.isnan(v).any() or torch.isinf(v).any():
                        raise ValueError(f"Found NaN/Inf in {k}")
            
            return batch
        except RuntimeError as e:
            print(f"Error during batching: {str(e)}")
            torch.cuda.empty_cache()
            raise



training_args = TrainingArguments(
    output_dir="./persistent_volume/lora_gemma",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=2,
    max_steps=15000, # increase for real training
    learning_rate=2e-4,
    num_train_epochs=3,
    logging_steps=250,
    save_strategy="steps",                   # # save by steps, not by epoches
    save_steps=250,                          # increase in real training
    report_to="none",
    logging_dir="./persistent_volume/logs",
    eval_strategy="steps",
    eval_steps=250,
)

data_collator = SafeDataCollator(
    tokenizer=tokenizer,
    mlm=False,
    pad_to_multiple_of=8
)

trainer = MyTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator
)

# test check
sample = dataset[0]

input_ids = torch.tensor(sample["input_ids"]).unsqueeze(0).to(model.device)
labels = torch.tensor(sample["labels"]).unsqueeze(0).to(model.device)

with torch.autocast("cuda", dtype=torch.bfloat16):
    outputs = model(input_ids=input_ids, labels=labels)
    loss = outputs.loss
    print("Sample loss:", outputs.loss.item())
    print("Any logits NaN?", torch.isnan(outputs.logits).any().item())
    logits = outputs.logits

print("Logits dtype:", logits.dtype)
print("Logits min:", logits.min().item())
print("Logits max:", logits.max().item())

print("Labels min:", labels.min().item())
print("Labels max:", labels.max().item())

vocab_size = tokenizer.vocab_size
print("Tokenizer vocab size:", vocab_size)
print("Any label >= vocab_size:", (labels >= vocab_size).any().item())

torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

trainer.train()

model.save_pretrained("./persistent_volume/lora_adapter")
tokenizer.save_pretrained("./persistent_volume/lora_adapter")

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Started training
Sample loss: 2.1739327907562256
Any logits NaN? False
Logits dtype: torch.float32
Logits min: -21.375
Logits max: 25.125
Labels min: 2
Labels max: 236881
Tokenizer vocab size: 262144
Any label >= vocab_size: False


Step,Training Loss,Validation Loss
250,4.8969,2.269037
500,4.5604,2.138432
750,4.3777,2.032814
1000,4.154,1.842658
1250,3.6227,1.653644
1500,3.4693,1.562531
1750,3.1525,1.397665
2000,2.8716,1.275708
2250,2.644,1.193375
2500,2.5075,1.104011
