In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
import copy
from transformers import Qwen2Tokenizer, AutoModelForCausalLM
from tqdm import tqdm
import math

In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

dtype = torch.float16

print(dtype)

cuda
torch.float16


In [3]:
model_name = "Qwen/Qwen3-0.6B"
# model_name = "Qwen/Qwen3-4B-Instruct-2507" у кого карточки есть покруче!

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    dtype=torch.float32
).to(device)

tokenizer = Qwen2Tokenizer.from_pretrained(model_name)

for param in model.parameters():
    param.requires_grad = False

model.enable_input_require_grads()
model.gradient_checkpointing_enable()
model.config.use_cache = False

Access to the secret `HF_TOKEN` has not been granted on this notebook.
You will not be requested again.
Please restart the session if you want to be prompted again.


In [4]:
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token

In [5]:
class LoRALinear(nn.Module):
    def __init__(self, original_linear: nn.Linear, rank: int, alpha: float):
        super().__init__()

        self.register_buffer('weight', original_linear.weight.data)
        if original_linear.bias is not None:
            self.register_buffer('bias', original_linear.bias.data)
        else:
            self.bias = None

        feature_in = original_linear.in_features
        feature_out = original_linear.out_features

        self.scaling = alpha / rank

        factory_kwargs = {'device': original_linear.weight.device, 'dtype': original_linear.weight.dtype}

        self.lora_A = nn.Parameter(torch.empty((rank, feature_in), **factory_kwargs))
        self.lora_B = nn.Parameter(torch.zeros((feature_out, rank), **factory_kwargs))

        self.lora_A.requires_grad = True
        self.lora_B.requires_grad = True

        self.reset_parameters()

    def reset_parameters(self):
        nn.init.kaiming_uniform_(self.lora_A, a=math.sqrt(5))
        nn.init.zeros_(self.lora_B)

    def forward(self, x):
        original_out = F.linear(x, self.weight, self.bias)

        lora_out = (x @ self.lora_A.T @ self.lora_B.T) * self.scaling

        return original_out + lora_out

In [6]:
def inject_lora(model, rank=32, alpha=64, target_modules=["q_proj", "v_proj", "k_proj", "o_proj"]):
    """
    Доступные: "q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj".
    """
    for name, module in model.named_children():
        if isinstance(module, nn.Linear):
            if any(target in name for target in target_modules):
                lora_layer = LoRALinear(module, rank, alpha)
                setattr(model, name, lora_layer)
        else:
            inject_lora(module, rank, alpha, target_modules)

In [7]:
inject_lora(model)

In [8]:
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
all_params = sum(p.numel() for p in model.parameters())
print(f"\nОбучаемых параметров: {trainable_params:,} ({trainable_params/all_params:.2%})")


Обучаемых параметров: 9,175,040 (2.14%)


In [9]:
class GSM8kDataset(Dataset):
    def __init__(self, data, tokenizer, max_len=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        question = item['question']
        answer = item['answer']

        messages = [
            {"role": "user", "content": question},
            {"role": "assistant", "content": answer}
        ]

        input_ids = self.tokenizer.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=False,
            truncation=True,
            max_length=self.max_len
        )

        user_messages = [{"role": "user", "content": question}]
        prompt_ids = self.tokenizer.apply_chat_template(
            user_messages,
            tokenize=True,
            add_generation_prompt=True
        )

        prompt_len = len(prompt_ids)

        labels = list(input_ids)

        mask_len = min(prompt_len, len(labels))

        for i in range(mask_len):
            labels[i] = -100

        return {
            "input_ids": input_ids,
            "attention_mask": [1] * len(input_ids),
            "labels": labels
        }

In [10]:
def collate_fn(batch):
    max_len = max(len(x["input_ids"]) for x in batch)

    input_ids_list, attention_mask_list, labels_list = [], [], []

    for x in batch:
        inp = x["input_ids"]
        mask = x["attention_mask"]
        lab = x["labels"]

        pad_len = max_len - len(inp)

        padded_inp = [tokenizer.pad_token_id] * pad_len + inp
        padded_mask = [0] * pad_len + mask
        padded_lab = [-100] * pad_len + lab

        input_ids_list.append(torch.tensor(padded_inp))
        attention_mask_list.append(torch.tensor(padded_mask))
        labels_list.append(torch.tensor(padded_lab))

    return {
        "input_ids": torch.stack(input_ids_list),
        "attention_mask": torch.stack(attention_mask_list),
        "labels": torch.stack(labels_list)
    }

In [11]:
dataset = load_dataset("openai/gsm8k", "main", split="train")
train_ds = GSM8kDataset(dataset, tokenizer, 1024)
train_loader = DataLoader(train_ds, batch_size=4, shuffle=True, collate_fn=collate_fn)

In [12]:
optimizer = torch.optim.AdamW(
    filter(lambda p: p.requires_grad, model.parameters()),
    lr=1e-4,
    fused=torch.cuda.is_available()
)
scaler = torch.amp.GradScaler('cuda', enabled=(dtype == torch.float16))

In [13]:
model.train()
step = 0
total_loss = 0
accumulation_steps = 16
progress_bar = tqdm(range(len(train_loader)))

optimizer.zero_grad()

for epoch in range(1):
    for batch in train_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        with torch.amp.autocast('cuda', dtype=dtype):
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            loss = outputs.loss / accumulation_steps

        scaler.scale(loss).backward()

        total_loss += loss.item()

        if (step + 1) % accumulation_steps == 0:
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(
                filter(lambda p: p.requires_grad, model.parameters()),
                max_norm=1.0
            )

            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()

            progress_bar.set_description(f"Epoch {epoch} | Loss: {total_loss:.4f}")
            total_loss = 0

        step += 1
        progress_bar.update(1)

if step % accumulation_steps != 0:
    scaler.unscale_(optimizer)
    torch.nn.utils.clip_grad_norm_(
        filter(lambda p: p.requires_grad, model.parameters()),
        max_norm=1.0
    )
    scaler.step(optimizer)
    scaler.update()
    optimizer.zero_grad()

Epoch 0 | Loss: 0.4580:  50%|█████     | 1869/3738 [16:44<14:21,  2.17it/s]

In [16]:
import os

lora_state_dict = {k: v.cpu() for k, v in model.state_dict().items() if "lora_" in k}
output_path = "custom_lora.pt"
torch.save(lora_state_dict, output_path)

print(f"Готово! Адаптер сохранен в '{output_path}'.")
print(f"Размер файла: {os.path.getsize(output_path) / 1024 / 1024:.2f} MB")

Готово! Адаптер сохранен в 'custom_lora.pt'.
Размер файла: 35.07 MB


Inference(можно перезапустить ноутбук и начать отсюда, сохранив адаптер)




In [19]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoModelForCausalLM, Qwen2Tokenizer, GenerationConfig
import math

#КОНФИГУРАЦИЯ (должна совпадать с обучением!)
MODEL_NAME = "Qwen/Qwen3-0.6B"
LORA_PATH = "/content/custom_lora.pt"
RANK = 32
ALPHA = 64
TARGET_MODULES = ["q_proj", "k_proj", "v_proj", "o_proj"]

device = "cuda" if torch.cuda.is_available() else "cpu"


In [20]:
class LoRALinear(nn.Module):
    def __init__(self, original_linear: nn.Linear, rank: int, alpha: float):
        super().__init__()

        self.register_buffer('weight', original_linear.weight.data)
        if original_linear.bias is not None:
            self.register_buffer('bias', original_linear.bias.data)
        else:
            self.bias = None

        feature_in = original_linear.in_features
        feature_out = original_linear.out_features

        self.scaling = alpha / rank

        factory_kwargs = {'device': original_linear.weight.device, 'dtype': original_linear.weight.dtype}

        self.lora_A = nn.Parameter(torch.empty((rank, feature_in), **factory_kwargs))
        self.lora_B = nn.Parameter(torch.zeros((feature_out, rank), **factory_kwargs))

        self.lora_A.requires_grad = True
        self.lora_B.requires_grad = True

        self.reset_parameters()

    def reset_parameters(self):
        nn.init.kaiming_uniform_(self.lora_A, a=math.sqrt(5))
        nn.init.zeros_(self.lora_B)

    def forward(self, x):
        original_out = F.linear(x, self.weight, self.bias)

        lora_out = (x @ self.lora_A.T @ self.lora_B.T) * self.scaling

        return original_out + lora_out

In [21]:
def inject_lora(model, rank=32, alpha=64, target_modules=["q_proj", "v_proj", "k_proj", "o_proj"]):
    """
    Доступные: "q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj".
    """
    for name, module in model.named_children():
        if isinstance(module, nn.Linear):
            if any(target in name for target in target_modules):
                lora_layer = LoRALinear(module, rank, alpha)
                setattr(model, name, lora_layer)
        else:
            inject_lora(module, rank, alpha, target_modules)

In [22]:
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
    dtype=torch.float16
).to(device)
model.config.use_cache = True

tokenizer = Qwen2Tokenizer.from_pretrained(MODEL_NAME)
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token

In [23]:
inject_lora(model, RANK, ALPHA, TARGET_MODULES)

In [24]:
state_dict = torch.load(LORA_PATH, map_location=device)
missing, unexpected = model.load_state_dict(state_dict, strict=False)

In [25]:
if any("lora_" in k for k in missing):
    print("Какие-то веса LoRA не найдены! Проверь названия слоев.")
else:
    print("Успех! Адаптеры загружены.")

model.eval()

Успех! Адаптеры загружены.


Qwen3ForCausalLM(
  (model): Qwen3Model(
    (embed_tokens): Embedding(151936, 1024)
    (layers): ModuleList(
      (0-27): 28 x Qwen3DecoderLayer(
        (self_attn): Qwen3Attention(
          (q_proj): LoRALinear()
          (k_proj): LoRALinear()
          (v_proj): LoRALinear()
          (o_proj): LoRALinear()
          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
        )
        (mlp): Qwen3MLP(
          (gate_proj): Linear(in_features=1024, out_features=3072, bias=False)
          (up_proj): Linear(in_features=1024, out_features=3072, bias=False)
          (down_proj): Linear(in_features=3072, out_features=1024, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): Qwen3RMSNorm((1024,), eps=1e-06)
        (post_attention_layernorm): Qwen3RMSNorm((1024,), eps=1e-06)
      )
    )
    (norm): Qwen3RMSNorm((1024,), eps=1e-06)
    (rotary_emb): Qwen3RotaryEmbedding()
  )
  (lm_head): Linear(in_featu

In [26]:
question = "Natalia has 5 apples. She buys 3 packs of 4 apples. How many apples?"

messages = [{"role": "user", "content": question}]
input_ids = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to(device)

with torch.no_grad():
    outputs = model.generate(
        input_ids,
        max_new_tokens=200,
        do_sample=True,
        temperature=0.7,
        pad_token_id=tokenizer.eos_token_id
    )

response = tokenizer.decode(outputs[0][len(input_ids[0]):], skip_special_tokens=True)
print(f"\nВопрос: {question}")
print(f"Ответ: {response}")


Вопрос: Natalia has 5 apples. She buys 3 packs of 4 apples. How many apples?
Ответ: <think>

</think>

She buys 3*4=<<3*4=12>>12 apples.
So Natalia has 5+12=<<5+12=17>>17 apples.
#### 17
