Imports all libraries used across the notebook for both stages:
- Preprocess and analyze the Russian literature corpus (dedup, filtering, normalization)
- Train a small BPE tokenizer (~3k vocab) with BOS/EOS
- Build token-packed datasets for pretrain (context 512)
- Define and train a ~150M decoder-only model with Trainer and epoch-end generations
- Load Alpaca RU, format dialog data, and run SFT with Qwen2.5-0.5B


In [None]:
from __future__ import annotations

import os
# Disable hf-xet early (before any hub/transformers imports)
os.environ["HF_HUB_DISABLE_XET"] = "1"
# Ensure hf_transfer is not forced unless explicitly desired
os.environ.pop("HF_HUB_ENABLE_HF_TRANSFER", None)

import csv
import json
import random
import re
from math import exp
from pathlib import Path

from datasets import Dataset, DatasetDict, load_dataset
import numpy as np
import torch
from tokenizers import Tokenizer
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.processors import TemplateProcessing
from tokenizers.trainers import BpeTrainer
from tqdm import tqdm
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    LlamaConfig,
    LlamaForCausalLM,
    PreTrainedTokenizerFast,
    Trainer,
    TrainerCallback,
    TrainingArguments,
    set_seed,
)
from trl.trainer.sft_trainer import SFTTrainer

Seeding and device/dtype selection makes training deterministic and aligned with available hardware. Supports stable Trainer and SFT runs and reproducible metrics.

In [2]:
SEED = int(os.environ.get("SEED", "42"))
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
set_seed(SEED)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device_name = (
    torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU"
)
dtype = torch.float16 if torch.cuda.is_available() else torch.float32

print(
    json.dumps(
        {
            "seed": SEED,
            "device": str(device),
            "device_name": device_name,
            "cuda": torch.cuda.is_available(),
            "dtype": str(dtype),
        },
        ensure_ascii=False,
        indent=2,
    )
)

{
  "seed": 42,
  "device": "cuda",
  "device_name": "NVIDIA A100-PCIE-40GB",
  "cuda": true,
  "dtype": "torch.float16"
}


Verifies presence of corpus text files and Alpaca RU parquet shards before preprocessing and SFT. Ensures the full dataset scope is available and prevents wasted work later. Enumerates files for traceability and quick inspection.

In [3]:
ROOT = Path.cwd()
if not (ROOT / "data").is_dir():
    ROOT = ROOT.parent

CORPUS_DIR = ROOT / "data" / "corpus"
ALPACA_DIR = ROOT / "data" / "alpaca-cleaned-ru"

assert (
    CORPUS_DIR.exists() and CORPUS_DIR.is_dir()
), f"Missing directory: {CORPUS_DIR}"
assert (
    ALPACA_DIR.exists() and ALPACA_DIR.is_dir()
), f"Missing directory: {ALPACA_DIR}"

corpus_txt_files = sorted(CORPUS_DIR.rglob("*.txt"))
parquet_files = sorted(ALPACA_DIR.glob("*.parquet"))

stats = {
    "corpus_num_files": len(corpus_txt_files),
    "corpus_sample": [p.as_posix() for p in corpus_txt_files[:3]],
    "alpaca_parquet_files": [p.as_posix() for p in parquet_files],
}

print(json.dumps(stats, ensure_ascii=False, indent=2))

{
  "corpus_num_files": 108,
  "corpus_sample": [
    "/home/user/code/yandex_dl_course_6_private/data/corpus/Bulgakov_BelayaGvardiya.txt",
    "/home/user/code/yandex_dl_course_6_private/data/corpus/Bulgakov_Diavoliada.txt",
    "/home/user/code/yandex_dl_course_6_private/data/corpus/Bulgakov_Master.txt"
  ],
  "alpaca_parquet_files": [
    "/home/user/code/yandex_dl_course_6_private/data/alpaca-cleaned-ru/train-00000-of-00001-c503683bee003a5c.parquet"
  ]
}


Computes basic statistics (file/line/char counts) on the raw corpus to inform preprocessing and act as a baseline. It does not transform data but supports verification that later deduplication and filtering have expected magnitude. Helps detect anomalies before tokenizer training.

In [4]:
files = sorted(CORPUS_DIR.rglob("*.txt"))

num_files = len(files)
num_lines = 0
num_chars = 0

for p in files:
    with p.open("r", encoding="utf-8", errors="ignore") as f:
        for line in f:
            num_lines += 1
            num_chars += len(line)

avg_chars_per_line = (num_chars / num_lines) if num_lines else 0.0
avg_lines_per_file = (num_lines / num_files) if num_files else 0.0

print(
    json.dumps(
        {
            "num_files": num_files,
            "num_lines": num_lines,
            "num_chars": num_chars,
            "avg_chars_per_line": round(avg_chars_per_line, 2),
            "avg_lines_per_file": round(avg_lines_per_file, 2),
        },
        ensure_ascii=False,
        indent=2,
    )
)

{
  "num_files": 108,
  "num_lines": 329299,
  "num_chars": 45348575,
  "avg_chars_per_line": 137.71,
  "avg_lines_per_file": 3049.06
}


Groups lines into paragraphs and removes duplicate lines and paragraphs. This step reduces redundancy and organizes the text into coherent units before tokenization-aware packing. It specifically handles the deduplication stage of preprocessing and logs before-and-after counts for verification.

In [5]:
# Read all lines, strip trailing whitespace; group into paragraphs by blank lines
all_lines: list[str] = []
paragraphs: list[str] = []
current_paragraph: list[str] = []

for p in files:
    with p.open("r", encoding="utf-8", errors="ignore") as f:
        for raw in f:
            line = raw.rstrip("\n\r")
            all_lines.append(line)
            if line.strip():
                current_paragraph.append(line)
            else:
                if current_paragraph:
                    paragraphs.append("\n".join(current_paragraph))
                    current_paragraph = []
        if current_paragraph:
            paragraphs.append("\n".join(current_paragraph))
            current_paragraph = []

num_lines_before = len(all_lines)
num_paragraphs_before = len(paragraphs)

# Deduplicate exact lines and exact paragraphs
unique_lines = list(dict.fromkeys(all_lines))
unique_paragraphs = list(dict.fromkeys(paragraphs))

print(
    json.dumps(
        {
            "lines_before": num_lines_before,
            "lines_after_unique": len(unique_lines),
            "paragraphs_before": num_paragraphs_before,
            "paragraphs_after_unique": len(unique_paragraphs),
        },
        ensure_ascii=False,
        indent=2,
    )
)

{
  "lines_before": 329299,
  "lines_after_unique": 305870,
  "paragraphs_before": 9518,
  "paragraphs_after_unique": 7383
}


Enforces a Cyrillic-only constraint by rejecting lines containing non-Cyrillic letters (keeping punctuation/digits). This keeps the domain consistent with Russian literature and removes alphabetic noise. Improves the quality of the corpus used for tokenizer training and LM.

In [6]:
# Allow punctuation, digits, whitespace, and Cyrillic letters (incl. Ё/ё). Filter out lines containing non-Cyrillic letters.
ALLOWED_CHARS = set(
    " \t\n\r0123456789!?,.:;-—()[]{}'\"“…»«/\\|@#$%^&*+=~`<>·•–\u00a0\u2000\u2001\u2002\u2003\u2009\u2026"
)


def is_cyrillic_line(line: str) -> bool:
    for ch in line:
        if ch.isalpha():
            if not ("А" <= ch <= "Я" or "а" <= ch <= "я" or ch in ("Ё", "ё")):
                return False
        elif not (ch in ALLOWED_CHARS or ch.isspace()):
            return False
    return True


filtered_lines: list[str] = [ln for ln in unique_lines if is_cyrillic_line(ln)]
print(
    json.dumps(
        {
            "lines_before": len(unique_lines),
            "lines_after_cyrillic_filter": len(filtered_lines),
        },
        ensure_ascii=False,
        indent=2,
    )
)

{
  "lines_before": 305870,
  "lines_after_cyrillic_filter": 288461
}


Normalizes repeated punctuation and collapses excessive whitespace, including ellipsis variants. This reduces superficial variance and improves tokenizer training quality on domain text while keeping meaning intact. Standardization here supports more stable tokenization statistics used for BPE learning.

In [7]:
# Start from filtered_lines; normalize repeated punctuation and whitespace.
# Rules:
# - Replace sequences of ! or ? with a single char.
# - Normalize ellipsis variations to a single … (or three dots).
# - Collapse multiple spaces/tabs to a single space (preserve newlines).
# - Trim trailing spaces.

norm_lines: list[str] = []
for ln in filtered_lines:
    s = ln
    s = re.sub(r"[!]{2,}", "!", s)
    s = re.sub(r"[?]{2,}", "?", s)
    s = re.sub(r"(\.{3,}|…{2,})", "…", s)
    s = re.sub(r"[ \t]{2,}", " ", s)
    s = s.rstrip()
    norm_lines.append(s)

print(
    json.dumps(
        {
            "lines_before": len(filtered_lines),
            "lines_after_norm": len(norm_lines),
        },
        ensure_ascii=False,
        indent=2,
    )
)

{
  "lines_before": 288461,
  "lines_after_norm": 288461
}


Produces cleaned paragraph strings and defers chunking to token level, which will occur after tokenizer training. This aligns with preparing for context-length boundaries with BOS/EOS later. It removes stray carriage returns and trims text, ensuring packable units and a deterministic pipeline.

In [8]:
# Char-based chunking removed; token-level packing will follow after tokenizer training.
joined_text = []
for para in unique_paragraphs:
    s = para.replace("\r", "").strip()
    if s:
        joined_text.append(s)

print(
    json.dumps(
        {
            "paragraphs_joined": len(joined_text),
        },
        ensure_ascii=False,
        indent=2,
    )
)

{
  "paragraphs_joined": 7383
}


Prints a pre/post summary of key counts (unique lines/paragraphs, filtered and normalized lines, joined paragraphs). This validates that deduplication and filtering had the intended effect and supports traceability before tokenizer training. It is a lightweight audit to catch anomalies early.

In [9]:
pre_stats = {
    "initial_files": len(corpus_txt_files),
    "initial_lines": num_lines_before,
}

post_stats = {
    "unique_lines": len(unique_lines),
    "unique_paragraphs": len(unique_paragraphs),
    "cyrillic_lines": len(filtered_lines),
    "normalized_lines": len(norm_lines),
    "paragraphs_joined": len(joined_text),
}

print(
    "Preprocessing summary:\n"
    + json.dumps(
        {"pre": pre_stats, "post": post_stats}, ensure_ascii=False, indent=2
    )
)

Preprocessing summary:
{
  "pre": {
    "initial_files": 108,
    "initial_lines": 329299
  },
  "post": {
    "unique_lines": 305870,
    "unique_paragraphs": 7383,
    "cyrillic_lines": 288461,
    "normalized_lines": 288461,
    "paragraphs_joined": 7383
  }
}


Trains a BPE tokenizer on normalized lines with a small vocabulary (~3k) and defines special tokens including BOS/EOS. This matches the requirement to build a domain-appropriate tokenizer. The post-processor ensures BOS/EOS are applied consistently for single or paired sequences. The trained tokenizer is a prerequisite for packing and model configuration.

In [10]:
SPECIAL_TOKENS = ["<unk>", "<pad>", "<bos>", "<eos>"]
VOCAB_SIZE = 3000

# Train on normalized lines as corpus
trainer = BpeTrainer(vocab_size=VOCAB_SIZE, special_tokens=SPECIAL_TOKENS)

model = BPE(unk_token="<unk>")
tokenizer = Tokenizer(model)
setattr(tokenizer, "pre_tokenizer", ByteLevel(add_prefix_space=True))
setattr(tokenizer, "decoder", ByteLevelDecoder())

tokenizer.train_from_iterator(norm_lines, trainer=trainer)

# Add BOS/EOS processing
setattr(
    tokenizer,
    "post_processor",
    TemplateProcessing(
        single="<bos> $A <eos>",
        pair="<bos> $A <eos> <bos> $B <eos>",
        special_tokens=[
            ("<bos>", tokenizer.token_to_id("<bos>")),
            ("<eos>", tokenizer.token_to_id("<eos>")),
        ],
    ),
)

print(
    json.dumps(
        {
            "vocab_size": tokenizer.get_vocab_size(),
            "bos_id": tokenizer.token_to_id("<bos>"),
            "eos_id": tokenizer.token_to_id("<eos>"),
            "pad_id": tokenizer.token_to_id("<pad>"),
            "unk_id": tokenizer.token_to_id("<unk>"),
        },
        ensure_ascii=False,
        indent=2,
    )
)




{
  "vocab_size": 3000,
  "bos_id": 2,
  "eos_id": 3,
  "pad_id": 1,
  "unk_id": 0
}


Saves the tokenizer to disk and reloads it to verify vocabulary integrity and special token IDs. This ensures artifacts are consistent and ready for reuse across steps. Verifying IDs prevents silent mismatches in subsequent packing and model configuration. Persistence is necessary for reproducible training runs.

In [11]:
TOKENIZER_PATH = Path("tokenizer.json")

tokenizer.save(str(TOKENIZER_PATH))
print({"saved": TOKENIZER_PATH.exists(), "path": str(TOKENIZER_PATH)})

reloaded = Tokenizer.from_file(str(TOKENIZER_PATH))
print(
    json.dumps(
        {
            "vocab_size": reloaded.get_vocab_size(),
            "bos_id": reloaded.token_to_id("<bos>"),
            "eos_id": reloaded.token_to_id("<eos>"),
            "pad_id": reloaded.token_to_id("<pad>"),
            "unk_id": reloaded.token_to_id("<unk>"),
        },
        ensure_ascii=False,
        indent=2,
    )
)

{'saved': True, 'path': 'tokenizer.json'}
{
  "vocab_size": 3000,
  "bos_id": 2,
  "eos_id": 3,
  "pad_id": 1,
  "unk_id": 0
}


Validates the trained tokenizer by round‑trip encode→decode on a sample line to ensure the vocabulary and BOS/EOS post‑processing behave correctly before packing data.


Extracts special token IDs (PAD/BOS/EOS/UNK) from the persisted tokenizer for consistent configuration of the model, collator, and generation routines.


In [12]:
# Round-trip encode/decode verification using the reloaded tokenizer
# Bypass BOS/EOS in this check and compare after collapsing whitespace (incl. NBSP)
sample = next((s for s in norm_lines if s.strip()), "Sample text.")
enc = reloaded.encode(sample, add_special_tokens=False)
dec = reloaded.decode(enc.ids, skip_special_tokens=True)

sample_cmp = re.sub(r"\s+", " ", sample.replace("\u00a0", " ")).strip()
dec_cmp = re.sub(r"\s+", " ", dec.replace("\u00a0", " ")).strip()

print(
    json.dumps(
        {
            "sample_preview": sample_cmp[:80],
            "num_ids": len(enc.ids),
            "decoded_equals": dec_cmp == sample_cmp,
        },
        ensure_ascii=False,
        indent=2,
    )
)

assert dec_cmp == sample_cmp, "Round-trip encode/decode mismatch"

{
  "sample_preview": "Посвящается Любови Евгеньевне Белозерской",
  "num_ids": 17,
  "decoded_equals": true
}


In [13]:
PAD_ID = reloaded.token_to_id("<pad>")
BOS_ID = reloaded.token_to_id("<bos>")
EOS_ID = reloaded.token_to_id("<eos>")
UNK_ID = reloaded.token_to_id("<unk>")

print(
    json.dumps(
        {
            "PAD_ID": PAD_ID,
            "BOS_ID": BOS_ID,
            "EOS_ID": EOS_ID,
            "UNK_ID": UNK_ID,
        },
        ensure_ascii=False,
        indent=2,
    )
)

{
  "PAD_ID": 1,
  "BOS_ID": 2,
  "EOS_ID": 3,
  "UNK_ID": 0
}


Packs token IDs to 512 tokens and builds a Dataset with input_ids and attention_mask. This operationalizes the assignment’s requirement to prepare tokenized data for pretraining; BOS/EOS are applied by the post-processor.

In [14]:
# Pack to fixed 512 with BOS/EOS per chunk (no cross-chunk slicing)
MAX_LEN = 512
buf = []
input_ids = []
attention_mask = []

for t in tqdm(joined_text, desc="encode & buffer"):
    e = reloaded.encode(t, add_special_tokens=False).ids
    buf.extend(e)
    while len(buf) >= (MAX_LEN - 2):
        chunk = [BOS_ID] + buf[: MAX_LEN - 2] + [EOS_ID]
        input_ids.append(chunk)
        attention_mask.append([1] * len(chunk))
        buf = buf[MAX_LEN - 2 :]

if buf:
    chunk = [BOS_ID] + buf + [EOS_ID]
    input_ids.append(chunk)
    attention_mask.append([1] * len(chunk))

train_ds = Dataset.from_dict(
    {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
    }
)

print(
    json.dumps(
        {
            "num_sequences": len(train_ds),
            "avg_len": round(
                sum(len(x) for x in input_ids) / max(1, len(input_ids)),
                1,
            ),
            "max_len": max((len(x) for x in input_ids), default=0),
        },
        ensure_ascii=False,
        indent=2,
    )
)

encode & buffer: 100%|██████████| 7383/7383 [00:37<00:00, 194.91it/s] 


{
  "num_sequences": 31736,
  "avg_len": 512.0,
  "max_len": 512
}


Performs a deterministic 95/5 train/validation split to enable evaluation during training. Supports Trainer’s evaluation strategy by providing a held-out split and helps monitor generalization.

In [15]:
# Deterministic 95/5 split by index
split_idx = int(0.95 * len(train_ds))
train_split = train_ds.select(range(0, split_idx))
val_split = train_ds.select(range(split_idx, len(train_ds)))

datasets_dict = DatasetDict(
    {
        "train": train_split,
        "validation": val_split,
    }
)

print(
    json.dumps(
        {k: len(v) for k, v in datasets_dict.items()},
        ensure_ascii=False,
        indent=2,
    )
)

{
  "train": 30149,
  "validation": 1587
}


Wraps the saved tokenizer as a fast tokenizer compatible with Trainer and configures a causal LM data collator. Enables efficient batching and correct label shifting for next-token prediction; sanity-checks a tiny batch to catch shape issues early.

In [16]:
# Build a HF fast tokenizer from tokenizer.json for Trainer compatibility
tok_fast = PreTrainedTokenizerFast(tokenizer_file=str(TOKENIZER_PATH))
tok_fast.pad_token = "<pad>"
tok_fast.unk_token = "<unk>"
tok_fast.bos_token = "<bos>"
tok_fast.eos_token = "<eos>"

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tok_fast,
    mlm=False,
)

# Quick sanity: collate a tiny batch
batch = data_collator([train_ds[0], train_ds[min(1, len(train_ds) - 1)]])
print(
    json.dumps(
        {
            k: (getattr(v, "shape", None) or str(type(v)))
            for k, v in batch.items()
        },
        ensure_ascii=False,
        indent=2,
    )
)

{
  "input_ids": [
    2,
    512
  ],
  "attention_mask": [
    2,
    512
  ],
  "labels": [
    2,
    512
  ]
}


Defines a ~150M parameter Llama configuration with the assignment’s suggested sizes and sets special token IDs. Instantiates a decoder-only model aligned with the specified architecture; ties embeddings and checks for correct head/padding configuration.

In [17]:
VOCAB_SIZE = reloaded.get_vocab_size()

config = LlamaConfig(
    vocab_size=VOCAB_SIZE,
    hidden_size=1024,
    intermediate_size=1536,
    num_hidden_layers=16,
    num_attention_heads=16,
    num_key_value_heads=8,
    max_position_embeddings=512,
    pad_token_id=PAD_ID,
    bos_token_id=BOS_ID,
    eos_token_id=EOS_ID,
    tie_word_embeddings=True,
)

model = LlamaForCausalLM(config)
# Ensure embeddings/head are tied and padding idx set
model.resize_token_embeddings(VOCAB_SIZE)
model.get_input_embeddings().padding_idx = PAD_ID
model.tie_weights()

# Quick checks
num_params = sum(p.numel() for p in model.parameters())
has_output_head = model.get_output_embeddings() is not None
print(
    json.dumps(
        {
            "num_params": num_params,
            "has_output_head": has_output_head,
            "pad_token_id": model.config.pad_token_id,
        },
        ensure_ascii=False,
        indent=2,
    )
)

{
  "num_params": 128934912,
  "has_output_head": true,
  "pad_token_id": 1
}


Configures TrainingArguments and Trainer (effective batch size via accumulation, weight decay, warmup, fp16). Establishes evaluation/save strategies by epoch and creates a reproducible minimal training loop.

In [18]:
# Effective batch size target 64–128 via gradient accumulation
per_device_train_batch_size = 2
gradient_accumulation_steps = (
    32  # adjust based on VRAM to hit effective batch size
)
warmup_ratio = 0.03

args = TrainingArguments(
    output_dir="outputs/pretrain",
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    learning_rate=3e-4,
    weight_decay=0.01,
    warmup_ratio=warmup_ratio,
    num_train_epochs=1,
    fp16=torch.cuda.is_available(),
    logging_steps=50,
    save_total_limit=2,
    max_grad_norm=1.0,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=datasets_dict["train"],
    eval_dataset=datasets_dict["validation"],
    data_collator=data_collator,
)

print("trainer ready")

trainer ready


Declares the 10 required evaluation prompts and registers an epoch-end callback to capture deterministic generations. Provides qualitative checks each epoch to complement validation loss.

In [19]:
# 10 fixed pretrain prompts from assignment
test_prompts = [
    "Все мысли, которые имеют огромные последствия",
    "Сила войска зависит от его духа",
    "Мысль о том, что он принес страдания",
    "Человек сознает себя свободным",
    "Что бы ни случилось, я всегда буду",
    "Любовь мешает смерти",
    "Нет, жизнь не кончена",
    "Всякая мысль, даже самая простая",
    "Война не любезность, а самое гадкое дело",
    "Чтобы жить честно",
]

# Minimal epoch-end generation callback


class EpochGenCB(TrainerCallback):
    def on_epoch_end(
        self,
        args,
        state,
        control,
        model=None,
        tokenizer=None,
        **kwargs,
    ):
        gens = []
        gen_kwargs = dict(
            max_new_tokens=64,
            do_sample=False,
            pad_token_id=PAD_ID,
            eos_token_id=EOS_ID,
        )
        for p in test_prompts:
            x = tok_fast(
                [p],
                return_tensors="pt",
                return_token_type_ids=False,
            ).to(model.device)
            with torch.inference_mode():
                y = model.generate(**x, **gen_kwargs)
            input_len = int(x["input_ids"].shape[1])
            cont_ids = y[0, input_len:]
            cont_text = tok_fast.decode(
                cont_ids,
                skip_special_tokens=True,
            )
            gens.append({"prompt": p, "text": cont_text})
        epoch_generations.append(
            {
                "epoch": int(state.epoch),
                "generations": gens,
            }
        )
        print(f"Stored generations for epoch {int(state.epoch)}")


epoch_generations: list[dict] = []
trainer.add_callback(EpochGenCB())
print("epoch-end generation callback registered")

epoch-end generation callback registered


Runs training then evaluates to obtain validation loss and perplexity. This is the quantitative checkpoint for the language modeling objective and confirms end-to-end wiring of the pipeline.

In [20]:
train_result = trainer.train()
metrics_train = train_result.metrics

# Evaluate to get validation loss
metrics_eval = trainer.evaluate()
val_loss = metrics_eval.get("eval_loss", None)
val_ppl = exp(val_loss) if val_loss is not None else None

print(
    json.dumps(
        {
            "train_metrics": {
                k: float(v)
                for k, v in metrics_train.items()
                if isinstance(v, (int, float))
            },
            "eval_metrics": {
                k: float(v)
                for k, v in metrics_eval.items()
                if isinstance(v, (int, float))
            },
            "val_ppl": (float(val_ppl) if val_ppl is not None else None),
        },
        ensure_ascii=False,
        indent=2,
    )
)

Step,Training Loss
50,6.5222
100,5.5367
150,5.0145
200,4.5552
250,4.184
300,3.9573
350,3.8001
400,3.6974
450,3.6203


Stored generations for epoch 1


{
  "train_metrics": {
    "train_runtime": 1354.0618,
    "train_samples_per_second": 22.266,
    "train_steps_per_second": 0.349,
    "total_flos": 1.1657136948903936e+16,
    "train_loss": 4.500001115314031,
    "epoch": 1.0
  },
  "eval_metrics": {
    "eval_loss": 3.8177576065063477,
    "eval_runtime": 18.1985,
    "eval_samples_per_second": 87.205,
    "eval_steps_per_second": 43.63,
    "epoch": 1.0
  },
  "val_ppl": 45.50206031164532
}


Saves the trained model and trainer state to disk for reproducibility and inspection.

In [37]:
SAVE_DIR = Path("outputs/pretrain/final")
SAVE_DIR.mkdir(parents=True, exist_ok=True)

trainer.save_model(str(SAVE_DIR))  # saves model + tokenizer (tok_fast)
trainer.state.save_to_json(str(SAVE_DIR / "trainer_state.json"))

print(
    json.dumps(
        {
            "saved_dir": str(SAVE_DIR),
            "exists": SAVE_DIR.exists(),
        },
        ensure_ascii=False,
        indent=2,
    )
)

{
  "saved_dir": "outputs/pretrain/final",
  "exists": true
}


Loads the Alpaca RU parquet shard into a Dataset and prints split/row counts and columns to verify availability and schema prior to SFT formatting.

In [38]:
# Load all local parquet shards
parquet_paths = sorted(ALPACA_DIR.glob("*.parquet"))
alpaca = load_dataset("parquet", data_files=[str(p) for p in parquet_paths])
print(alpaca)
train_split = alpaca["train"]
num_rows = int(getattr(train_split, "num_rows", 0))
columns = list(getattr(train_split, "column_names", [])) or list(
    getattr(getattr(train_split, "features", {}), "keys", lambda: [])()
)
print(
    json.dumps(
        {
            "num_rows": num_rows,
            "columns": columns,
        },
        ensure_ascii=False,
        indent=2,
    )
)

DatasetDict({
    train: Dataset({
        features: ['input', 'instruction', 'output'],
        num_rows: 51760
    })
})
{
  "num_rows": 51760,
  "columns": [
    "input",
    "instruction",
    "output"
  ]
}


Maps Alpaca columns to dialog roles: `input→system`, `instruction→user`, `output→assistant`. Validates required columns and returns a clean dialog Dataset without pandas.

In [39]:
# Map fields strictly: input→system, instruction→user, output→assistant (no pandas)
sft_train = alpaca["train"]
feat = getattr(sft_train, "features", None)
columns = (
    list(feat.keys())
    if feat is not None
    else list(getattr(sft_train, "column_names", []))
)
required = {"input", "instruction", "output"}
assert required.issubset(
    set(columns)
), f"Missing required columns: {required - set(columns)}"


def _map(row):
    return {
        "system": (row.get("input") or ""),
        "user": (row.get("instruction") or ""),
        "assistant": (row.get("output") or ""),
    }


sft_mapped = sft_train.map(_map, remove_columns=sft_train.column_names)

print(
    json.dumps(
        {
            "num_rows": int(getattr(sft_mapped, "num_rows", 0)),
            "columns": list(getattr(sft_mapped, "features", {}).keys()),
        },
        ensure_ascii=False,
        indent=2,
    )
)

{
  "num_rows": 51760,
  "columns": [
    "system",
    "user",
    "assistant"
  ]
}


Initializes the Qwen2.5‑0.5B base model and tokenizer for SFT. Chooses float16 on CUDA, ensures `pad_token`/`eos_token` are set for batching and deterministic generation.

In [44]:


QWEN_ID = "Qwen/Qwen2.5-0.5B"
qwen_tok = AutoTokenizer.from_pretrained(
    QWEN_ID,
    local_files_only=False,
)

qwen_model = AutoModelForCausalLM.from_pretrained(
    QWEN_ID,
    dtype=(torch.bfloat16 if torch.cuda.is_available() else torch.float32),
    device_map="auto",
    trust_remote_code=False,
)

# Ensure pad and eos are set
if qwen_tok.pad_token is None:
    qwen_tok.pad_token = qwen_tok.eos_token or "<|endoftext|>"

print(
    json.dumps(
        {
            "pad_token": qwen_tok.pad_token,
            "pad_token_id": qwen_tok.pad_token_id,
            "eos_token": qwen_tok.eos_token,
            "eos_token_id": qwen_tok.eos_token_id,
        },
        ensure_ascii=False,
        indent=2,
    )
)

{
  "pad_token": "<|endoftext|>",
  "pad_token_id": 151643,
  "eos_token": "<|endoftext|>",
  "eos_token_id": 151643
}


Builds dialog texts via the tokenizer’s chat template, creates a small validation split, defines SFT `TrainingArguments`, and instantiates `SFTTrainer` on the Alpaca RU dialog Dataset.

In [None]:
# Build conversational texts via tokenizer chat template (no pandas)
messages_texts = []
for r in tqdm(sft_mapped, desc="build sft texts"):
    messages = []
    sys = (r.get("system") or "").strip()
    if sys:
        messages.append({"role": "system", "content": sys})
    messages.append({"role": "user", "content": r.get("user") or ""})
    messages.append({"role": "assistant", "content": r.get("assistant") or ""})
    text = qwen_tok.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=False,
    )
    messages_texts.append(text)

sft_text_ds = Dataset.from_dict({"text": messages_texts})
# Small validation split
sft_splits = sft_text_ds.train_test_split(test_size=0.05, seed=SEED)

sft_args = TrainingArguments(
    output_dir="outputs/sft",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=32,
    learning_rate=2e-5,
    weight_decay=0.0,
    warmup_ratio=0.03,
    num_train_epochs=1,
    logging_steps=50,
    bf16=torch.cuda.is_available(),
    save_total_limit=2,
)

sft_trainer = SFTTrainer(
    model=qwen_model,
    processing_class=qwen_tok,
    args=sft_args,
    train_dataset=sft_splits["train"],
    eval_dataset=sft_splits["test"],
)

print("sft trainer ready")

build sft texts:   0%|          | 0/51760 [00:00<?, ?it/s]

build sft texts: 100%|██████████| 51760/51760 [00:05<00:00, 9544.99it/s] 


Adding EOS to train dataset:   0%|          | 0/49172 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/49172 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/49172 [00:00<?, ? examples/s]

Adding EOS to eval dataset:   0%|          | 0/2588 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/2588 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/2588 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.


sft trainer ready (completion-only)


Runs the SFT training loop and prints core metrics. Confirms that data formatting, optimization, and device settings are correctly integrated; produces an instruction‑tuned model.

In [46]:
sft_train_result = sft_trainer.train()
print(
    json.dumps(
        {
            k: float(v)
            for k, v in sft_train_result.metrics.items()
            if isinstance(v, (int, float))
        },
        ensure_ascii=False,
        indent=2,
    )
)

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.


Step,Training Loss
50,0.0


KeyboardInterrupt: 

Generates answers for four evaluation prompts to qualitatively assess instruction‑following. Uses deterministic generation for comparability and formats prompts via the same chat template used in training.

In [None]:
# Generate answers for the 4 evaluation questions
questions_rus = [
    "сколько планет в нашей солнечной системе?",
    "расскажи стих",
    "когда собирать крыжовник?",
    "Как быстро выучить новый язык?",
]

responses = []
for q in questions_rus:
    msgs = []
    system_text = ""
    if system_text:
        msgs.append({"role": "system", "content": system_text})
    msgs.append({"role": "user", "content": q})
    prompt = qwen_tok.apply_chat_template(
        msgs,
        tokenize=False,
        add_generation_prompt=True,
    )
    inputs = qwen_tok(
        [prompt],
        return_tensors="pt",
        return_token_type_ids=False,
    ).to(qwen_model.device)
    with torch.inference_mode():
        out = qwen_model.generate(
            **inputs,
            max_new_tokens=128,
            do_sample=False,
            pad_token_id=qwen_tok.pad_token_id,
            eos_token_id=qwen_tok.eos_token_id,
        )
    # Slice out only the generated continuation after the prompt
    input_len = inputs["input_ids"].shape[1]
    gen_ids = out[0, input_len:]
    text = qwen_tok.decode(gen_ids, skip_special_tokens=True)
    responses.append({"question": q, "answer": text})

for r in responses:
    print("Q:", r["question"])
    print("A:", r["answer"])
    print("---")

Saves the SFT‑tuned model and tokenizer to disk for reproducibility and reuse. Enables others to validate and deploy the fine‑tuned model.

In [None]:
# Save SFT model and tokenizer

SFT_DIR = Path("outputs/sft/final")
SFT_DIR.mkdir(parents=True, exist_ok=True)

sft_trainer.save_model(str(SFT_DIR))
qwen_tok.save_pretrained(str(SFT_DIR))

print(
    json.dumps(
        {"sft_saved_dir": str(SFT_DIR), "exists": SFT_DIR.exists()},
        ensure_ascii=False,
        indent=2,
    )
)

Reports environment and library versions, model parameter counts, and dataset sizes for reproducibility/debugging. Captures runtime context to prevent silent drift across environments.

In [None]:
summary = {
    "seed": SEED,
    "device": str(device),
    "device_name": device_name,
    "dtype": str(dtype),
    "python": sys.version.split()[0],
    "torch": torch.__version__,
    "transformers": __import__("transformers").__version__,
    "datasets": __import__("datasets").__version__,
    "trl": __import__("trl").__version__,
    "pretrain": {
        "vocab_size": int(VOCAB_SIZE),
        "num_params": int(sum(p.numel() for p in model.parameters())),
        "context_len": int(model.config.max_position_embeddings),
    },
    "sft": {
        "base_model": QWEN_ID,
        "train_rows": int(getattr(sft_splits["train"], "num_rows", 0)),
        "eval_rows": int(getattr(sft_splits["test"], "num_rows", 0)),
    },
}
print(json.dumps(summary, ensure_ascii=False, indent=2))

Consolidates the last epoch’s pretrain generations and the SFT evaluation answers. Satisfies the requirement to show generation outputs in the notebook for review.

In [None]:
# Consolidated displays: 10 pretrain generations (last epoch) and 4 SFT answers

if epoch_generations:
    last = epoch_generations[-1]
    print(f"Pretrain epoch {last['epoch']} generations:")
    for i, g in enumerate(last["generations"], 1):
        print(f"[{i:02d}] prompt: {g['prompt']}")
        print(f"     text: {g['text']}")
        print("---")
else:
    print("No pretrain generations recorded.")

print("SFT evaluation answers:")
for r in responses:
    print("Q:", r["question"])
    print("A:", r["answer"])
    print("---")

Persists generations and evaluation outputs to JSON/CSV for external inspection and comparison. These machine‑readable artifacts mirror on‑screen results for analysis and archiving.

In [None]:
# Persist outputs to JSON/CSV

OUT_DIR = Path("outputs/reports")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Pretrain generations per epoch to JSON
with (OUT_DIR / "pretrain_epoch_generations.json").open(
    "w", encoding="utf-8"
) as f:
    json.dump(epoch_generations, f, ensure_ascii=False, indent=2)

# Last epoch generations to CSV
with (OUT_DIR / "pretrain_last_epoch_generations.csv").open(
    "w", encoding="utf-8", newline=""
) as f:
    w = csv.writer(f)
    w.writerow(["epoch", "idx", "prompt", "text"])
    if epoch_generations:
        last = epoch_generations[-1]
        for i, g in enumerate(last["generations"], 1):
            w.writerow([last["epoch"], i, g["prompt"], g["text"]])

# SFT responses to CSV
with (OUT_DIR / "sft_eval_responses.csv").open(
    "w", encoding="utf-8", newline=""
) as f:
    w = csv.writer(f)
    w.writerow(["question", "answer"])
    for r in responses:
        w.writerow([r["question"], r["answer"]])

print(
    json.dumps(
        {"written": True, "dir": str(OUT_DIR)}, ensure_ascii=False, indent=2
    )
)