In [1]:
!pip install -r requirements.txt
!pip install numpy==1.26.4 scikit-learn==1.3.2 --force-reinstall --no-cache-dir


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 MB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m47.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m62.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m39.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import json
import math
import os
import re

# Set before importing torch (same ordering as before) so the allocator
# setting is already in the environment when torch initialises CUDA.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

import torch
from torch import autocast
from datasets import Dataset
from huggingface_hub import login
from peft import LoraConfig, get_peft_model
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import TrainingArguments, DataCollatorForLanguageModeling
from transformers import Trainer

In [None]:
model_id = "google/gemma-3-4b-pt"

# Authenticate with the Hugging Face Hub. The access token is read from the
# HF_TOKEN environment variable instead of being hardcoded in the notebook
# (notebooks get shared and committed, outputs included). When the variable is
# unset, login() receives None and falls back to its interactive prompt.
login(token=os.environ.get("HF_TOKEN"))

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Only fall back to EOS as the padding token when the tokenizer ships without
# one. DataCollatorForLanguageModeling replaces every pad-token id in the
# labels with -100, so aliasing pad to EOS would also hide real EOS tokens
# from the loss.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load the model with bfloat16
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto"
)

# LoRA adapters on the attention and MLP projection layers.
lora_config = LoraConfig(
    r=32,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# Wrap the model in LoRA
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

In [4]:
# Progress marker: the rest of this cell defines the trainer/metric helpers
# and builds the train/validation datasets.
print('Started data processing')

class MyTrainer(Trainer):
    """Trainer whose loss is computed from a forward pass under bfloat16 autocast."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Run one forward pass and return the model's loss.

        Args:
            model: Model to call; it receives ``input_ids``, ``attention_mask``
                and ``labels`` and must expose ``.loss`` on its output.
            inputs: Batch mapping. ``"input_ids"`` and ``"labels"`` are
                required; ``"attention_mask"`` is forwarded when present.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                just the loss.
            **kwargs: Extra keyword arguments from the caller; accepted and
                ignored.

        Returns:
            The loss, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        input_ids = inputs["input_ids"].to(model.device)
        labels = inputs["labels"].to(model.device)

        # Forward the attention mask supplied with the batch so the model can
        # ignore masked (e.g. padded) positions; previously it was dropped.
        attention_mask = inputs.get("attention_mask")
        if attention_mask is not None:
            attention_mask = attention_mask.to(model.device)

        # Autocast with bfloat16
        with torch.autocast("cuda", dtype=torch.bfloat16):
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels,
            )
            loss = outputs.loss

        return (loss, outputs) if return_outputs else loss

def compute_metrics(eval_preds):
    """
    Compute next-token cross-entropy loss and perplexity from model logits.

    Args:
        eval_preds: A ``(logits, labels)`` pair. ``logits`` carries the
            vocabulary on its last axis and the sequence on the axis before
            it; ``labels`` carries token ids with the sequence on its last
            axis. Label positions equal to ``-100`` do not contribute to the
            loss (the ``CrossEntropyLoss`` default ``ignore_index``).

    Returns:
        dict with ``"eval_loss"`` (mean cross-entropy as a float) and
        ``"perplexity"`` (``exp(loss)``, or ``inf`` when the loss is not
        below 20).
    """
    logits, labels = eval_preds

    # Shift so that the logits at position t are scored against the token at
    # position t + 1 (standard causal LM setup).
    # Cast to float32 / int64: the loss is computed on CPU here and expects
    # floating-point logits with integer class targets.
    shift_logits = torch.as_tensor(logits)[..., :-1, :].float()
    shift_labels = torch.as_tensor(labels)[..., 1:].long()

    # Flatten to (positions, vocab) and (positions,)
    shift_logits = shift_logits.reshape(-1, shift_logits.size(-1))
    shift_labels = shift_labels.reshape(-1)

    # Loss function
    loss_fct = torch.nn.CrossEntropyLoss()
    loss_value = loss_fct(shift_logits, shift_labels).item()

    # Perplexity; capped to inf for large losses so exp() cannot overflow
    perplexity = math.exp(loss_value) if loss_value < 20 else float("inf")

    return {
        "eval_loss": loss_value,
        "perplexity": perplexity
    }


def split_sentences(text):
    """Split ``text`` into sentences.

    A boundary is any run of whitespace that directly follows ``.``, ``!`` or
    ``?``; the punctuation stays attached to the preceding sentence and the
    whitespace run is dropped.

    Args:
        text: Input string.

    Returns:
        List of sentence strings (a single-element list when no boundary is
        found).
    """
    boundary = re.compile(r'(?<=[.!?])\s+')
    return boundary.split(text)


def clean_text(text, top_cut=0.1, bottom_cut=0.1):
    """Trim the head and tail of ``text`` and re-join its sentences.

    Args:
        text: Input string.
        top_cut: Fraction of the characters dropped from the start.
        bottom_cut: Fraction of the characters dropped from the end.

    Returns:
        The retained middle slice, split with ``split_sentences`` and joined
        back with single spaces.
    """
    n_chars = len(text)
    first = int(n_chars * top_cut)
    last = int(n_chars * (1 - bottom_cut))
    return " ".join(split_sentences(text[first:last]))


all_samples = []

# Sliding-window parameters, in tokens: the window length and the distance
# between consecutive window starts. stride < max_length makes neighbouring
# windows overlap by max_length - stride tokens.
max_length = 256
stride = 128

with open("books.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        data = json.loads(line)
        raw_text = data["text"]
        cleaned = clean_text(raw_text)

        # Tokenize the whole document in one go; it is windowed manually
        # below, so truncation stays off.
        tokens = tokenizer(
            cleaned,
            return_tensors="pt",
            return_attention_mask=True,
            truncation=False
        )
        # Plain Python lists: each sample then owns its data instead of
        # holding a view into the full-document tensor.
        input_ids = tokens["input_ids"][0].tolist()
        attention_mask = tokens["attention_mask"][0].tolist()

        # Splitting into pieces.
        # The `+ 1` keeps the last full window: without it a document of
        # exactly max_length tokens produced no sample, and the final window
        # was dropped whenever (len - max_length) was a multiple of stride.
        # Documents shorter than max_length still produce no samples.
        for i in range(0, len(input_ids) - max_length + 1, stride):
            chunk = input_ids[i : i + max_length]
            attn_chunk = attention_mask[i : i + max_length]
            all_samples.append({
                "input_ids": chunk,
                "labels": list(chunk),
                "attention_mask": attn_chunk
            })


# NOTE(review): the split is over individual windows, and neighbouring windows
# of the same document overlap, so overlapping text can land on both sides of
# the train/validation split. Split by document first if the validation
# numbers need to be leak-free.
train_data, val_data = train_test_split(all_samples, test_size=0.05, random_state=42)
dataset = Dataset.from_list(train_data)
eval_dataset = Dataset.from_list(val_data)
print('Finished')


In [None]:
print('Started training')

training_args = TrainingArguments(
    output_dir="./lora_gemma",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    # Smoke-test budget; increase for real training. A positive max_steps
    # takes precedence over num_train_epochs below.
    max_steps=50,
    learning_rate=2e-4,
    num_train_epochs=3,
    # Gradient clipping is applied by the Trainer during training; stated
    # explicitly here so the intent is visible.
    max_grad_norm=1.0,
    logging_steps=10,
    save_strategy="steps",  # save by steps, not by epochs
    save_steps=10,  # increase in real training
    report_to="none",
    logging_dir="./logs",
    eval_strategy="steps",
    eval_steps=10,
)
# NOTE(review): compute_metrics receives the logits of the entire evaluation
# set at once; with a large vocabulary this can take a lot of memory — confirm
# it fits, or reduce the evaluation set.

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = MyTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Sanity check: one forward pass on a single training sample before training.
sample = dataset[0]

input_ids = torch.tensor(sample["input_ids"]).unsqueeze(0).to(model.device)
labels = torch.tensor(sample["labels"]).unsqueeze(0).to(model.device)

# no_grad: this pass is diagnostic only, so no autograd graph is kept around.
with torch.no_grad(), torch.autocast("cuda", dtype=torch.bfloat16):
    outputs = model(input_ids=input_ids, labels=labels)
    loss = outputs.loss
    print("Sample loss:", outputs.loss.item())
    print("Any logits NaN?", torch.isnan(outputs.logits).any().item())
    logits = outputs.logits

print("Logits dtype:", logits.dtype)
print("Logits min:", logits.min().item())
print("Logits max:", logits.max().item())

print("Labels min:", labels.min().item())
print("Labels max:", labels.max().item())

# Out-of-range label ids would break the loss computation, so check up front.
vocab_size = tokenizer.vocab_size
print("Tokenizer vocab size:", vocab_size)
print("Any label >= vocab_size:", (labels >= vocab_size).any().item())

# Gradient clipping is configured through max_grad_norm above; a one-off
# clip_grad_norm_ call at this point would find no gradients to clip.
trainer.train()

# Persist the LoRA adapter weights together with the tokenizer files.
model.save_pretrained("./lora_adapter")
tokenizer.save_pretrained("./lora_adapter")