# Для начала обучения должны быть установлены зависимости из файла requirements.txt
### Импорты

In [10]:
import os
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig,
    set_seed,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# Hugging Face checkpoint that is loaded as the base model for fine-tuning.
base_model_name = "Qwen/Qwen3-4B-Instruct-2507"

# When True the base model is loaded 4-bit quantized (QLoRA).
use_qlora = True
# bf16 is used only on a CUDA device whose major compute capability is >= 8.
bf16 = torch.cuda.is_available() and torch.cuda.get_device_capability(0)[0] >= 8
# fp16 is the fallback for CUDA devices without bf16. It must stay off when
# there is no CUDA device at all, otherwise both flags would describe a
# half-precision setup that the machine cannot run.
fp16 = torch.cuda.is_available() and not bf16

# Default directory for training artifacts (created below).
output_dir = "outputs/mistake_tagger_lora"
# Token budget per example; longer sequences are truncated during tokenization.
max_seq_len = 2048
# Effective batch size is per_device_batch_size * grad_accum.
per_device_batch_size = 1
grad_accum = 16
num_epochs = 10
learning_rate = 3e-5
warmup_ratio = 0.05
# Intervals below are measured in optimizer steps.
logging_steps = 1
save_steps = 75
eval_steps = 15

os.makedirs(output_dir, exist_ok=True)

Using GPU: True bf16: True use_qlora: True


### Подгружаем данные для обучения

In [None]:
# Build the training table by stacking the private test split on top of the
# original train split and renumbering the rows from zero.
df_old_train = pd.read_csv('train.csv')
df = pd.concat(
    [pd.read_csv('private_test.csv'), df_old_train],
    axis=0,
    ignore_index=True,
)

In [16]:
# Renumber the rows 0..n-1 and discard the old index. When this cell runs
# right after the concatenation above, the index is already reset there.
df = df.reset_index(drop=True)

### Расставляем теги

In [18]:
def make_razmetka(x, sol):
    """Wrap the annotated mistake spans of a solution in ``<mistake>`` tags.

    Args:
        x: The spans to tag. Either a string holding a Python literal such as
            ``"[[0, 3], [7, 9]]"`` or an already-parsed sequence of spans.
            Each span is read as ``span[0]`` (start offset) and ``span[1]``
            (end offset, exclusive) into ``sol``. Spans are consumed in the
            order given, so they are expected to be sorted and
            non-overlapping.
        sol: The solution text the offsets refer to.

    Returns:
        ``sol`` with ``<mistake>`` / ``</mistake>`` inserted around every
        span; text outside the spans is copied unchanged.

    Raises:
        ValueError, SyntaxError: If ``x`` is a string that is not a plain
            Python literal.
    """
    # Local import so the function does not rely on a file-level import.
    import ast

    # literal_eval only accepts literals, so a crafted CSV cell cannot run
    # arbitrary code the way ``eval`` would.
    spans = ast.literal_eval(x) if isinstance(x, str) else x

    pieces = []
    prev = 0
    for item in spans:
        start, end = item[0], item[1]
        pieces.append(sol[prev:start])
        pieces.append('<mistake>')
        pieces.append(sol[start:end])
        pieces.append('</mistake>')
        prev = end
    # Whatever follows the last span is copied verbatim.
    pieces.append(sol[prev:])
    return ''.join(pieces)

In [20]:
# Training target: the solution text with every annotated span from the
# 'answer' column wrapped in <mistake> tags.
df['tagged_solution'] = [
    make_razmetka(answer, solution)
    for answer, solution in zip(df['answer'], df['solution'])
]

In [24]:
# Instruction header placed at the top of every prompt. Every rule, including
# the last one, is terminated by a newline.
SYSTEM_INSTRUCTIONS = "\n".join([
    "You tag mistakes in student math solutions.",
    "- Output must be EXACTLY the student's solution text, with <mistake>...</mistake> tags around mistakes.",
    "- Do NOT add or remove any other text, lines, or spaces.",
    "- Do NOT add commentary or explanations.",
    "",
])


def format_prompt(task: str, solution: str) -> str:
    """Assemble the text prompt for one (problem, student solution) pair.

    The prompt consists of four sections separated by blank lines: the
    instruction header, the problem statement, the untagged student solution
    and a trailing ``Tagged solution:`` cue after which the tagged text is
    expected.

    Args:
        task: The problem statement.
        solution: The student's solution without tags.

    Returns:
        The complete prompt string, ending with ``"Tagged solution:\\n"``.
    """
    sections = (
        SYSTEM_INSTRUCTIONS,
        f"Problem:\n{task}",
        f"Student solution:\n{solution}",
        "Tagged solution:\n",
    )
    return "\n\n".join(sections)

def get_lora_target_modules(model):
    """Return the sorted names of the projection layers LoRA should wrap.

    The candidates are the attention projections (``q_proj``, ``k_proj``,
    ``v_proj``, ``o_proj``) and the MLP projections (``gate_proj``,
    ``up_proj``, ``down_proj``). When ``model`` exposes ``named_modules()``,
    only candidates that actually occur as the last component of a module
    name are returned. If the model cannot be inspected, or none of the
    candidates is found, the full candidate list is returned unchanged.

    Args:
        model: The model to inspect; anything without a callable
            ``named_modules`` attribute (including ``None``) is accepted.

    Returns:
        A sorted list of module-name suffixes.
    """
    candidates = {"q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"}

    named_modules = getattr(model, "named_modules", None)
    if callable(named_modules):
        # Module names are dotted paths; only the last component is compared.
        leaf_names = {name.rsplit(".", 1)[-1] for name, _ in named_modules()}
        present = candidates & leaf_names
        if present:
            return sorted(present)

    # Nothing to inspect or nothing matched: keep the full default list.
    return sorted(candidates)

class DataCollatorPadToMax:
    """Collate tokenized examples into equally long ``torch.long`` tensors.

    Every example is right-padded to the length of the longest example in the
    batch, rounded up to the next multiple of ``pad_to_multiple_of`` unless
    that is ``None``. Padding uses the tokenizer's ``pad_token_id`` for
    ``input_ids``, ``0`` for ``attention_mask`` and ``-100`` for ``labels``.
    """

    def __init__(self, tokenizer, pad_to_multiple_of=8):
        # Only ``tokenizer.pad_token_id`` is read from the tokenizer.
        self.tokenizer = tokenizer
        self.pad_to_multiple_of = pad_to_multiple_of

    def __call__(self, features):
        """Pad and stack ``features``.

        Args:
            features: A sequence of dicts with list-valued ``"input_ids"``
                and ``"labels"`` entries.

        Returns:
            A dict with ``"input_ids"``, ``"attention_mask"`` and
            ``"labels"`` tensors.
        """
        target_len = max(len(example["input_ids"]) for example in features)
        multiple = self.pad_to_multiple_of
        if multiple is not None:
            overshoot = target_len % multiple
            if overshoot != 0:
                # Round up to the next multiple.
                target_len += multiple - overshoot

        pad_token = self.tokenizer.pad_token_id
        columns = {"input_ids": [], "attention_mask": [], "labels": []}

        for example in features:
            tokens = example["input_ids"]
            targets = example["labels"]
            mask = [1] * len(tokens)

            missing = target_len - len(tokens)
            if missing > 0:
                tokens = tokens + [pad_token] * missing
                mask = mask + [0] * missing
                targets = targets + [-100] * missing

            columns["input_ids"].append(tokens)
            columns["attention_mask"].append(mask)
            columns["labels"].append(targets)

        return {
            key: torch.tensor(rows, dtype=torch.long)
            for key, rows in columns.items()
        }

def make_tokenize_fn(tokenizer, max_seq_len):
    """Create the batched tokenization callback for the training datasets.

    Args:
        tokenizer: Object providing ``encode(text, add_special_tokens=...)``
            and ``eos_token_id``.
        max_seq_len: Maximum number of tokens kept per example.

    Returns:
        A function that maps a batch with ``"task"``, ``"solution"`` and
        ``"tagged_solution"`` columns to ``"input_ids"`` and ``"labels"``
        columns.
    """
    def tokenize_batch(batch):
        """Tokenize one batch; prompt tokens are masked out of the labels."""
        eos_id = tokenizer.eos_token_id
        all_input_ids = []
        all_labels = []

        rows = zip(batch["task"], batch["solution"], batch["tagged_solution"])
        for task, solution, tagged in rows:
            prompt_ids = tokenizer.encode(
                format_prompt(task, solution), add_special_tokens=False
            )
            answer_ids = tokenizer.encode(tagged, add_special_tokens=False)
            if eos_id is not None:
                # Terminate the target so the end of the answer is learned.
                answer_ids = answer_ids + [eos_id]

            # Labels of -100 exclude the prompt positions from the loss.
            sequence = prompt_ids + answer_ids
            targets = [-100] * len(prompt_ids) + answer_ids

            # Over-long examples are cut from the right, which can remove the
            # end of the target (including the EOS token).
            if len(sequence) > max_seq_len:
                sequence, targets = sequence[:max_seq_len], targets[:max_seq_len]

            all_input_ids.append(sequence)
            all_labels.append(targets)

        return {"input_ids": all_input_ids, "labels": all_labels}

    return tokenize_batch

### Обучение 1 фолда

In [25]:
def train_solo_model(train_df,val_df,output_dir):
    """Fine-tune a LoRA adapter on one train/validation split and save it.

    The base model, precision flags and all hyperparameters are read from the
    module-level configuration (``base_model_name``, ``use_qlora``, ``bf16``,
    ``fp16``, ``max_seq_len``, ``per_device_batch_size``, ``grad_accum``,
    ``learning_rate``, ``num_epochs``, ``warmup_ratio``, ``logging_steps``,
    ``save_steps``, ``eval_steps``).

    Args:
        train_df: DataFrame with the ``task``, ``solution`` and
            ``tagged_solution`` columns used for training.
        val_df: DataFrame with the same columns, evaluated every
            ``eval_steps`` steps.
        output_dir: Directory that receives the checkpoints, the final LoRA
            adapter and the tokenizer.

    Returns:
        None. The results are the files written to ``output_dir``.
    """
    # Local import so the function does not rely on a file-level import.
    import gc

    set_seed(56)

    # Data
    raw_ds = DatasetDict({
        "train": Dataset.from_pandas(train_df, preserve_index=False),
        "val": Dataset.from_pandas(val_df, preserve_index=False),
    })

    # Tokenizer: the collator needs a pad token, so fall back to EOS.
    tokenizer = AutoTokenizer.from_pretrained(base_model_name, use_fast=True)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Base model: 4-bit NF4 quantization under QLoRA, half precision otherwise.
    half_dtype = torch.bfloat16 if bf16 else torch.float16
    quant_cfg = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=half_dtype,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
    ) if use_qlora else None

    torch_dtype = None if use_qlora else half_dtype

    model = AutoModelForCausalLM.from_pretrained(
        base_model_name,
        device_map="auto",
        torch_dtype=torch_dtype,
        quantization_config=quant_cfg,
    )

    if use_qlora:
        model = prepare_model_for_kbit_training(model)

    # LoRA adapter
    target_modules = get_lora_target_modules(model)
    lora_cfg = LoraConfig(
        r=16,
        lora_alpha=32,
        target_modules=target_modules,
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
    )

    model = get_peft_model(model, lora_cfg)
    model.print_trainable_parameters()

    # The KV cache is switched off for training (gradient checkpointing is on).
    model.config.use_cache = False

    tokenize_fn = make_tokenize_fn(tokenizer, max_seq_len)

    tokenized_ds = raw_ds.map(
        tokenize_fn,
        batched=True,
        remove_columns=raw_ds["train"].column_names,
        desc="Tokenizing dataset",
    )

    # Training parameters
    data_collator = DataCollatorPadToMax(tokenizer)

    optim_name = "paged_adamw_32bit" if use_qlora else "adamw_torch"

    train_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=per_device_batch_size,
        per_device_eval_batch_size=per_device_batch_size,
        gradient_accumulation_steps=grad_accum,
        learning_rate=learning_rate,
        num_train_epochs=num_epochs,
        warmup_ratio=warmup_ratio,
        logging_steps=logging_steps,
        save_steps=save_steps,
        eval_strategy="steps",
        eval_steps=eval_steps,
        save_total_limit=2,
        # Follow the module-level precision flags instead of forcing bf16, so
        # the trainer agrees with the quantization compute dtype chosen above
        # and still runs on GPUs without bf16 support.
        bf16=bf16,
        fp16=fp16 and torch.cuda.is_available(),
        report_to="none",
        gradient_checkpointing=True,
        optim=optim_name,
        lr_scheduler_type="cosine",
    )

    # Trainer
    trainer = Trainer(
        model=model,
        args=train_args,
        train_dataset=tokenized_ds["train"],
        eval_dataset=tokenized_ds["val"],
        data_collator=data_collator,
    )

    # Train, then persist the adapter weights together with the tokenizer.
    trainer.train()
    trainer.model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    print(f"Saved LoRA adapter + tokenizer to {output_dir}")

    # Free GPU memory before the next fold. The trainer also references the
    # model and holds the optimizer state, so it has to go before the CUDA
    # cache is emptied.
    model.cpu()
    del model
    del trainer
    gc.collect()
    torch.cuda.empty_cache()

### Запускаем обучение

In [28]:
from sklearn.model_selection import GroupKFold

# Keep only the columns the training code reads, drop rows with a missing
# value in any of them and renumber the remaining rows.
df = (
    df.loc[:, ["task", "solution", "tagged_solution"]]
    .dropna()
    .reset_index(drop=True)
)

# Seeded, shuffled 5-fold splitter that keeps each group within a single fold.
kf = GroupKFold(n_splits=5, shuffle=True, random_state=56)

In [None]:
from tqdm.auto import tqdm

test_results_on_folds = []
# Train one adapter per fold. Rows are grouped by their 'task' value, so all
# solutions of the same task land on the same side of each split.
for fold, (train_idxs, val_idxs) in enumerate(kf.split(df, groups=df['task'])):
    fold_name = f'outputs/qwen-sft-fold-{fold}'
    fold_train = df.iloc[train_idxs]
    fold_val = df.iloc[val_idxs]
    tok = train_solo_model(fold_train, fold_val, fold_name)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

trainable params: 33,030,144 || all params: 4,055,498,240 || trainable%: 0.8145


Tokenizing dataset:   0%|          | 0/336 [00:00<?, ? examples/s]

Tokenizing dataset:   0%|          | 0/74 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss
15,0.0828,0.080013
30,0.0534,0.053313
45,0.0414,0.047168
60,0.0437,0.04369
75,0.0362,0.041395
90,0.0331,0.040188
105,0.0381,0.039124
120,0.0382,0.038552
135,0.0289,0.038247
150,0.029,0.038158


Saved LoRA adapter + tokenizer to outputs/qwen-sft-fold-0-1-1


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

trainable params: 33,030,144 || all params: 4,055,498,240 || trainable%: 0.8145


Tokenizing dataset:   0%|          | 0/325 [00:00<?, ? examples/s]

Tokenizing dataset:   0%|          | 0/85 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss
15,0.091,0.077753
30,0.0478,0.050302
45,0.0433,0.044533
60,0.0418,0.041723
75,0.035,0.039696
90,0.0381,0.038754
105,0.0446,0.037528
