In [1]:
!pip -q install -U "transformers>=4.51.0" "datasets>=2.18.0" "accelerate>=0.33.0" "peft>=0.12.0" "bitsandbytes>=0.43.0" "safetensors>=0.4.3" "tqdm>=4.66.0"

import os, re, random, time
import torch
from tqdm.auto import tqdm
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments, Trainer, DataCollatorForLanguageModeling, set_seed
from transformers.trainer_callback import TrainerCallback
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, PeftModel

# Process-wide switches set before any tokenizer/trainer object is created
# (presumably to avoid tokenizers fork warnings and to keep W&B logging off).
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["WANDB_DISABLED"] = "true"

# Seed both the transformers helper and Python's `random`, which is used
# later for shuffling the training texts.
seed = 42
set_seed(seed)
random.seed(seed)

# Stop immediately when CUDA is unavailable; the assertion message (Russian)
# tells the user to enable a GPU runtime.
print("cuda:", torch.cuda.is_available())
assert torch.cuda.is_available(), "GPU не включен: Runtime -> Change runtime type -> GPU"
print("gpu:", torch.cuda.get_device_name(0))
!nvidia-smi

# Permit TF32 in CUDA matmul and cuDNN kernels.
# NOTE(review): TF32 is an Ampere-and-newer feature, so these flags likely
# have no effect on the Tesla T4 shown in the recorded output — confirm.
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

# Location of the prefix list; the notebook aborts if the file is missing
# (the assertion message is Russian for "no such file").
prefix_path = "/content/prefixes.txt"
assert os.path.exists(prefix_path), f"Нет файла {prefix_path}"

def parse_prefixes(path):
    """Read the prefix file and return ``(index, prefix)`` pairs sorted by index.

    Every non-blank line is expected to look like ``<number> <prefix text>``.
    A line that does not start with a number followed by whitespace is kept
    whole and receives the number of entries collected so far as its index.

    Args:
        path: Path to a UTF-8 text file. A leading byte-order mark is
            tolerated and discarded.

    Returns:
        A list of ``(int, str)`` tuples ordered by index.
    """
    numbered_line = re.compile(r"^\s*(\d+)\s+(.*)\s*$")
    items = []
    # "utf-8-sig" strips a BOM if present. With plain "utf-8" the BOM stays
    # glued to the first line, the index regex fails on it, and the first
    # prefix ends up with a wrong index and a polluted text.
    with open(path, "r", encoding="utf-8-sig") as f:
        for line in f:
            s = line.strip()
            if not s:
                continue
            m = numbered_line.match(s)
            if m:
                items.append((int(m.group(1)), m.group(2).strip()))
            else:
                # No explicit index: fall back to the running position.
                items.append((len(items), s))
    items.sort(key=lambda x: x[0])
    return items

prefixes = parse_prefixes(prefix_path)

def build_prefix_regex(prefix):
    """Compile a tolerant, case-insensitive matcher for a joke opening.

    The prefix is trimmed, long dashes are folded to ``-`` and the text is
    regex-escaped. The escaped text is then loosened so that:

    * every space matches any run of whitespace,
    * ``ё``/``Ё`` also match ``е``/``Е``,
    * a hyphen matches a hyphen, en dash or em dash.

    The pattern is anchored at the start (leading whitespace allowed) and the
    prefix must be followed by a separator (``,``, ``:`` or a dash), by
    whitespace, or by the end of the text.

    Args:
        prefix: Raw prefix text.

    Returns:
        A compiled ``re.Pattern`` meant to be used with ``.match``.
    """
    normalized = prefix.strip()
    for long_dash in ("—", "–"):
        normalized = normalized.replace(long_dash, "-")

    body = re.escape(normalized)
    # Applied in this exact order on the already-escaped text.
    loosenings = (
        (r"\ ", r"\s+"),
        ("ё", "[её]"),
        ("Ё", "[ЕЁ]"),
        (r"\-", r"[-—–]"),
    )
    for needle, replacement in loosenings:
        body = body.replace(needle, replacement)

    boundary = r"(?:(?:\s*[,:\-—–]\s*)|\s+|$)"
    return re.compile(r"^\s*" + body + boundary, re.IGNORECASE)

patterns = {idx: build_prefix_regex(pref) for idx, pref in prefixes}

def cleanup_one_line(s):
    """Flatten text to a single line and trim separator characters.

    All whitespace runs (including newlines and tabs) become one space, then
    leading/trailing spaces, commas, semicolons, colons and dashes are removed.

    Args:
        s: Arbitrary text.

    Returns:
        The normalised single-line string (possibly empty).
    """
    # split()/join collapses every whitespace run and trims both ends at once.
    single_spaced = " ".join(s.split())
    # Trimming cannot create new whitespace runs, so no second pass is needed.
    return single_spaced.strip(" ,;:-—–")

def safe_continuation(full, prefix):
    """Return the part of ``full`` that follows ``prefix``, cleaned to one line.

    The prefix is located case-insensitively. If it opens the text, everything
    after it is returned; otherwise everything after its first occurrence is
    returned. When the prefix is not found at all, the whole text is used.

    Args:
        full: Generated text, normally starting with the prefix.
        prefix: The prompt the text was generated from.

    Returns:
        The continuation passed through ``cleanup_one_line``.
    """
    full_s = full.strip()
    pref_s = prefix.strip()
    # Search the ORIGINAL string: offsets computed on a lower-cased copy can
    # be wrong because lower() may change the length of some characters.
    found = re.search(re.escape(pref_s), full_s, re.IGNORECASE)
    cont = full_s[found.end():] if found else full_s
    return cleanup_one_line(cont)

def score_candidate(s):
    """Heuristically score a continuation; higher is better.

    Scoring rules:

    * empty text gets ``-1e9``;
    * text shorter than 25 characters gets ``-200 + len(s)``;
    * text longer than 280 characters gets ``-50`` minus the excess length;
    * otherwise the score rewards the share of distinct words
      (case-insensitive) and sentence-final punctuation, and penalises
      immediately repeated word triples and distance from 140 characters.

    Args:
        s: Candidate continuation text.

    Returns:
        A numeric score (int or float).
    """
    if not s:
        return -1e9
    length = len(s)
    if length < 25:
        return -200 + length
    if length > 280:
        return -50 - (length - 280)
    words = re.findall(r"[A-Za-zА-Яа-яЁё0-9]+", s)
    uniq = len(set(w.lower() for w in words)) / max(1, len(words))
    rep_pen = 0
    if len(words) >= 10:
        # The last valid start is len(words) - 6, hence the exclusive bound
        # len(words) - 5; a smaller bound misses a repetition at the very end.
        for i in range(len(words) - 5):
            if words[i:i+3] == words[i+3:i+6]:
                rep_pen += 1
    end_bonus = 10 if re.search(r"[.!?…]$", s) else 0
    return 60 * uniq + end_bonus - 25 * rep_pen - 0.03 * abs(length - 140)

# Collection limits for the streaming scan below.
per_prefix_real_limit = 60   # max dataset texts kept per prefix
general_limit = 6000         # max non-matching ("general") texts kept
max_scan = 90000             # stop after this many streamed examples

# Per-prefix match counters; `active` holds the prefix indices that are still
# below their limit and therefore still worth testing.
counts = {idx: 0 for idx, _ in prefixes}
active = set(counts.keys())
real_texts = []      # texts that start with one of the prefixes
general_texts = []   # other texts with a high `total_mark`

print("Stage: streaming/filtering dataset")
ds_stream = load_dataset("igorktech/anekdots", split="train", streaming=True)

pbar = tqdm(total=max_scan, desc="scan", unit="ex")
scanned = 0
for ex in ds_stream:
    scanned += 1
    pbar.update(1)
    # NOTE(review): this check runs before the example is examined, so the
    # max_scan-th example is counted but never filtered or collected.
    if scanned >= max_scan:
        break

    # Basic sanity filters: text must be a string of 60..900 characters
    # without NUL characters.
    t = ex.get("text", None)
    if not isinstance(t, str):
        continue
    t = t.strip()
    if len(t) < 60 or len(t) > 900:
        continue
    if "\u0000" in t:
        continue
    # Low-rated examples are dropped, but only when the mark is an int;
    # a missing or non-int mark passes this filter.
    mark = ex.get("total_mark", None)
    if isinstance(mark, int) and mark < 3:
        continue

    # Try the still-active prefixes; a text is credited to the first pattern
    # that matches (iteration order of the set copy) and stored once.
    matched = False
    if active:
        for idx in list(active):
            if counts[idx] >= per_prefix_real_limit:
                active.discard(idx)
                continue
            if patterns[idx].match(t):
                real_texts.append(t)
                counts[idx] += 1
                matched = True
                if counts[idx] >= per_prefix_real_limit:
                    active.discard(idx)
                break

    # Non-matching texts go to the general pool only with an int mark >= 10.
    if (not matched) and (len(general_texts) < general_limit):
        if isinstance(mark, int) and mark >= 10:
            general_texts.append(t)

    if scanned % 2000 == 0:
        pbar.set_postfix({"real": len(real_texts), "gen": len(general_texts), "active": len(active)})

    # Early exit once the general pool is full and every prefix hit its limit.
    if (len(general_texts) >= general_limit) and (not active):
        break

pbar.close()
# NOTE(review): the recorded run ended with 'real': 70 and 'active': 76, i.e.
# no prefix reached its limit and the scan ran to max_scan.
print("Stage done:", {"scanned": scanned, "real": len(real_texts), "gen": len(general_texts), "active": len(active)})

# Training corpus = prefix-matched texts + general texts, in random order.
train_texts = real_texts + general_texts
random.shuffle(train_texts)

# Directory that receives the trained LoRA adapter (and the tokenizer files).
adapter_dir = "/content/qwen3_0p6b_ru_jokes_lora"
os.makedirs(adapter_dir, exist_ok=True)

base_model_id = "Qwen/Qwen3-0.6B-Base"
print("Model:", base_model_id)

# Reuse EOS as the padding token when the tokenizer defines none.
tokenizer = AutoTokenizer.from_pretrained(base_model_id, use_fast=True, trust_remote_code=True)
if tokenizer.pad_token_id is None:
    tokenizer.pad_token = tokenizer.eos_token

# Pick bf16 when torch reports support, otherwise fp16. The same flag later
# drives the Trainer's bf16/fp16 switches.
# NOTE(review): the recorded run shows bf16=True on a Tesla T4; confirm that
# is_bf16_supported() is not reporting emulated support on that GPU.
use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
compute_dtype = torch.bfloat16 if use_bf16 else torch.float16

# 4-bit NF4 quantisation with double quantisation; matmuls run in compute_dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=compute_dtype
)

def load_base():
    """Load a new copy of the base causal LM with the 4-bit config.

    Reads the module-level ``base_model_id``, ``bnb_config`` and
    ``compute_dtype`` at call time. Every call loads another model instance.

    NOTE(review): the recorded run warns that ``torch_dtype`` is deprecated in
    favour of ``dtype``; it is kept because the pinned minimum transformers
    version may predate the new keyword — confirm before switching.

    Returns:
        The model returned by ``AutoModelForCausalLM.from_pretrained``.
    """
    load_options = {
        "quantization_config": bnb_config,
        "torch_dtype": compute_dtype,
        "device_map": "auto",
        "trust_remote_code": True,
    }
    return AutoModelForCausalLM.from_pretrained(base_model_id, **load_options)

def generate_candidates(m, tok, prefix, n, max_new_tokens=90, temperature=1.0, top_p=0.9, rep_pen=1.12):
    """Sample ``n`` texts from model ``m`` for one prompt.

    The prompt is tokenised once and repeated ``n`` times along the batch
    axis, so all samples are drawn in a single ``generate`` call. Each full
    returned sequence is decoded (presumably prompt included — callers strip
    the prefix themselves).

    Args:
        m: Model exposing ``.device`` and ``.generate``.
        tok: Tokenizer used for encoding and decoding.
        prefix: Prompt text.
        n: Number of samples to draw.
        max_new_tokens: Upper bound on generated tokens per sample.
        temperature: Sampling temperature.
        top_p: Nucleus sampling threshold.
        rep_pen: Value passed as ``repetition_penalty``.

    Returns:
        A list of decoded strings, one per returned sequence.
    """
    encoded = tok(prefix, return_tensors="pt")
    prompt_ids = encoded["input_ids"].to(m.device).repeat(n, 1)
    prompt_mask = encoded["attention_mask"].to(m.device).repeat(n, 1)

    sampling = {
        "do_sample": True,
        "temperature": temperature,
        "top_p": top_p,
        "repetition_penalty": rep_pen,
        "max_new_tokens": max_new_tokens,
        # EOS doubles as padding for sequences that finish early.
        "pad_token_id": tok.eos_token_id,
        "eos_token_id": tok.eos_token_id,
    }
    with torch.no_grad():
        sequences = m.generate(input_ids=prompt_ids, attention_mask=prompt_mask, **sampling)

    decoded = []
    for row in range(sequences.size(0)):
        decoded.append(tok.decode(sequences[row], skip_special_tokens=True))
    return decoded

# Training is skipped entirely when a saved adapter is already present.
adapter_exists = os.path.exists(os.path.join(adapter_dir, "adapter_config.json"))

if not adapter_exists:
    # --- Augmentation: let the untouched base model continue every prefix
    # once and add sufficiently long results to the training corpus.
    model_for_aug = load_base()
    model_for_aug.eval()
    aug_per_prefix = 1
    aug_texts = []
    for _, pref in tqdm(prefixes, desc="augment", unit="pref"):
        fulls = generate_candidates(model_for_aug, tokenizer, pref, n=aug_per_prefix, max_new_tokens=80, temperature=1.02, top_p=0.9, rep_pen=1.10)
        for ft in fulls:
            ft = cleanup_one_line(ft)
            # Keep only texts with at least 20 characters beyond the prefix.
            if len(ft) >= len(pref) + 20:
                aug_texts.append(ft)
    train_texts = train_texts + aug_texts
    random.shuffle(train_texts)

    # Release the augmentation copy before loading the training copy, so two
    # models are not held at once. The name is rebound to None rather than
    # deleted so it stays defined.
    import gc
    model_for_aug = None
    gc.collect()
    torch.cuda.empty_cache()

    # --- Training model: fresh quantised base prepared for k-bit training,
    # with the generation cache switched off in its config.
    model = load_base()
    model = prepare_model_for_kbit_training(model)
    model.config.use_cache = False

    # LoRA adapters on the attention and MLP projection layers.
    lora_cfg = LoraConfig(
        r=16,
        lora_alpha=32,
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"]
    )
    model = get_peft_model(model, lora_cfg)

    train_ds = Dataset.from_dict({"text": train_texts})

    # Texts are truncated to max_len tokens; padding is left to the collator.
    max_len = 192
    def tok_fn(batch):
        return tokenizer(batch["text"], truncation=True, max_length=max_len, padding=False)

    tok_ds = train_ds.map(tok_fn, batched=True, remove_columns=["text"], desc="tokenize")
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    class TrainTimeCallback(TrainerCallback):
        """Print one-line progress reports with elapsed time and an ETA.

        The clock starts in ``on_train_begin``; every log event prints step,
        epoch, loss and learning rate when available, plus elapsed minutes and
        a linear estimate of the remaining minutes.
        """

        def __init__(self):
            # Wall-clock start of training; None until training begins.
            self.t0 = None

        def on_train_begin(self, args, state, control, **kwargs):
            """Start the clock and print the main training settings."""
            self.t0 = time.time()
            print(f"train_begin: max_steps={state.max_steps} epochs={args.num_train_epochs} bs={args.per_device_train_batch_size} ga={args.gradient_accumulation_steps} bf16={args.bf16} fp16={args.fp16}")

        def on_log(self, args, state, control, logs=None, **kwargs):
            """Print a progress line for one logging event."""
            if logs is None:
                return

            now = time.time()
            started = self.t0 or now  # falls back to "now" if the clock never started
            elapsed = now - started
            step = state.global_step
            max_steps = state.max_steps or 0

            ep = state.epoch
            pieces = [f"step={step}/{max_steps} epoch={ep:.3f}" if ep is not None else f"step={step}/{max_steps}"]

            loss = logs.get("loss", None)
            if loss is not None:
                pieces.append(f" loss={loss:.4f}")

            lr = logs.get("learning_rate", None)
            if lr is not None:
                pieces.append(f" lr={lr:.2e}")

            pieces.append(f" elapsed={elapsed/60:.1f}m")

            # ETA assumes the remaining steps take as long as the average so far.
            if max_steps and step:
                eta = elapsed * (max_steps - step) / max(1, step)
                pieces.append(f" eta={eta/60:.1f}m")

            print("".join(pieces))

        def on_train_end(self, args, state, control, **kwargs):
            """Print the total step count and training time."""
            if self.t0 is not None:
                elapsed = time.time() - self.t0
                print(f"train_end: steps={state.global_step} time={elapsed/60:.1f}m")

    # One epoch with an effective batch of 4 x 4 sequences per device, cosine
    # schedule with a short warm-up, and no intermediate checkpoints.
    args = TrainingArguments(
        output_dir=adapter_dir,
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,
        num_train_epochs=1,
        learning_rate=2e-4,
        warmup_ratio=0.03,
        lr_scheduler_type="cosine",
        logging_strategy="steps",
        logging_steps=10,
        save_strategy="no",
        # Exactly one of bf16/fp16 is enabled, following use_bf16.
        bf16=use_bf16,
        fp16=not use_bf16,
        optim="paged_adamw_8bit",
        report_to="none",
        dataloader_num_workers=2,
        dataloader_pin_memory=True,
        group_by_length=True,
        disable_tqdm=False
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=tok_ds,
        data_collator=data_collator,
        callbacks=[TrainTimeCallback()]
    )

    print("Stage: training")
    trainer.train()
    # Persist the adapter and tokenizer; the presence of adapter_config.json in
    # adapter_dir is what makes later runs skip training.
    model.save_pretrained(adapter_dir)
    tokenizer.save_pretrained(adapter_dir)

# Inference model: a fresh quantised base with the saved adapter attached.
# This runs whether or not training happened in this session.
# NOTE(review): after a training run the previous `model` is still referenced
# while load_base() executes, so another model copy is resident until the
# name is rebound on the next line — confirm memory headroom.
base_model = load_base()
model = PeftModel.from_pretrained(base_model, adapter_dir)
model.eval()

def generate_batch(prefix, n, max_new_tokens=110, temperature=1.03, top_p=0.9, rep_pen=1.12):
    """Sample ``n`` texts for ``prefix`` from the fine-tuned model.

    Thin wrapper around ``generate_candidates`` bound to the module-level
    ``model`` and ``tokenizer`` (looked up at call time), so both generation
    paths share one implementation of the sampling call.

    Args:
        prefix: Prompt text.
        n: Number of samples to draw.
        max_new_tokens: Upper bound on generated tokens per sample.
        temperature: Sampling temperature.
        top_p: Nucleus sampling threshold.
        rep_pen: Value passed as ``repetition_penalty``.

    Returns:
        A list of decoded strings, one per returned sequence.
    """
    return generate_candidates(
        model,
        tokenizer,
        prefix,
        n,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        top_p=top_p,
        rep_pen=rep_pen,
    )

# Output file and selection sizes: sample candidates_per_prefix texts for each
# prefix and keep the best num_lines_per_prefix of them.
submission_path = "/content/submission.txt"
num_lines_per_prefix = 3
candidates_per_prefix = 12

print("Stage: generating submission")
with open(submission_path, "w", encoding="utf-8") as f:
    for idx, pref in tqdm(prefixes, desc="prefixes", unit="pref"):
        fulls = generate_batch(pref, candidates_per_prefix)
        # Reduce each sample to its continuation; drop empty ones.
        cands = []
        for ft in fulls:
            cont = safe_continuation(ft, pref)
            if cont:
                cands.append(cont)
        # De-duplicate while keeping first-seen order, then rank best-first.
        cands = list(dict.fromkeys(cands))
        cands.sort(key=score_candidate, reverse=True)
        # NOTE(review): fewer than num_lines_per_prefix lines are written when
        # not enough distinct candidates survive; "..." is used only when
        # none do — confirm the expected submission format.
        picked = cands[:num_lines_per_prefix] if cands else ["..."]
        for cont in picked:
            f.write(f"{idx} {cont}\n")

print("saved:", submission_path)
# Preview: echo up to the first 15 lines of the written file.
with open(submission_path, "r", encoding="utf-8") as f:
    for _ in range(15):
        line = f.readline()
        if not line:
            break
        print(line.rstrip("\n"))

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m512.3/512.3 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[?25hcuda: True
gpu: Tesla T4
Thu Dec 25 20:07:46 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4            

  self.setter(val)
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

scan:   0%|          | 0/90000 [00:00<?, ?ex/s]

Stage done: {'scanned': 90000, 'real': 70, 'gen': 6000, 'active': 76}
Model: Qwen/Qwen3-0.6B-Base


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/1.19G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/138 [00:00<?, ?B/s]

augment:   0%|          | 0/76 [00:00<?, ?pref/s]

tokenize:   0%|          | 0/6146 [00:00<?, ? examples/s]

Stage: training
train_begin: max_steps=385 epochs=1 bs=4 ga=4 bf16=True fp16=False


  return fn(*args, **kwargs)


Step,Training Loss
10,3.038
20,2.969
30,2.957
40,3.0257
50,3.02
60,2.7827
70,2.8105
80,2.8873
90,2.853
100,3.0251


step=10/385 epoch=0.026 loss=3.0380 lr=1.50e-04 elapsed=1.1m eta=40.7m
step=20/385 epoch=0.052 loss=2.9690 lr=2.00e-04 elapsed=2.0m eta=36.2m
step=30/385 epoch=0.078 loss=2.9570 lr=1.99e-04 elapsed=2.6m eta=30.8m
step=40/385 epoch=0.104 loss=3.0257 lr=1.97e-04 elapsed=3.1m eta=26.4m
step=50/385 epoch=0.130 loss=3.0200 lr=1.95e-04 elapsed=3.5m eta=23.2m
step=60/385 epoch=0.156 loss=2.7827 lr=1.92e-04 elapsed=4.5m eta=24.5m
step=70/385 epoch=0.182 loss=2.8105 lr=1.89e-04 elapsed=5.4m eta=24.3m
step=80/385 epoch=0.208 loss=2.8873 lr=1.84e-04 elapsed=6.0m eta=22.9m
step=90/385 epoch=0.234 loss=2.8530 lr=1.80e-04 elapsed=6.5m eta=21.2m
step=100/385 epoch=0.260 loss=3.0251 lr=1.74e-04 elapsed=6.9m eta=19.6m
step=110/385 epoch=0.286 loss=2.6838 lr=1.68e-04 elapsed=8.0m eta=19.9m
step=120/385 epoch=0.312 loss=2.7512 lr=1.62e-04 elapsed=8.8m eta=19.5m
step=130/385 epoch=0.338 loss=2.8290 lr=1.55e-04 elapsed=9.4m eta=18.4m
step=140/385 epoch=0.364 loss=2.8334 lr=1.48e-04 elapsed=9.8m eta=17.2m
s

prefixes:   0%|          | 0/76 [00:00<?, ?pref/s]

saved: /content/submission.txt
0 . По мгновению ворвался инспектор спереди: - Товарищ главнокомандующий! Я привел вам на задание отдельные личности, которые упали бы позади туда, где был данный случай.
0 . - Вовочка пьяный!? - Пять гвоздей, - ответил Вовочка. И крикнул: А я за руль!!!!!!
0 . А она вот так отпускает все: - Ну ты как, дорогой?! Эхну.. Ой! Понимаешь? Головно ложь - тут мозги заливаются воды... Приходит подушка. Я честно спрашиваю на приезде "доктор" поселить мне домой, котом почти ничего дела не делает? Молчит, вынимаете ее, начинает сп
1 . Папа к нему зовёт и спрашивает: - Какой дороги? Муж: ``Вижу... вот как я еду. Гость познает и счастливо вымицело вздробнееется.
1 . Пройти надо - у него накинуто резьбовка на пальцах. - Твоего? - Не, не кого! Дык грузовой стоит! Желает спать в белье!!! Хватит...!!! Hачем с тобой ворончик отпирать? Клопит-кизетный. Один из них хреновый!!!.. А я тебе такую резьбу дотол
1 он видит - у пенька садятся собаки. Видно так вроде бы их жена не к

вырезал с префиксом 0