In [None]:
from __future__ import annotations

import json
import os
import random
import re
from pathlib import Path

from datasets import Dataset, DatasetDict, load_dataset
import numpy as np
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing
from tokenizers.trainers import BpeTrainer
import torch
from tqdm import tqdm
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    PreTrainedTokenizerFast,
    TrainerCallback,
    set_seed,
)
from trl.trainer.sft_trainer import SFTTrainer

In [None]:
# Reproducibility: one seed (overridable via the SEED environment variable)
# is pushed into every random number generator this notebook touches.
SEED = int(os.environ.get("SEED", "42"))
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    _seed_fn(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
set_seed(SEED)

# Select the device once; the reported dtype is float16 only when CUDA is present.
if torch.cuda.is_available():
    device = torch.device("cuda")
    device_name = torch.cuda.get_device_name(0)
    dtype = torch.float16
else:
    device = torch.device("cpu")
    device_name = "CPU"
    dtype = torch.float32

print(dict(
    seed=SEED,
    device=str(device),
    device_name=device_name,
    cuda=torch.cuda.is_available(),
    dtype=str(dtype),
))

In [None]:
# Input locations: raw text corpus for pretraining and the Alpaca parquet dump for SFT.
CORPUS_DIR = Path("data/corpus")
ALPACA_DIR = Path("data/alpaca-cleaned-ru")

# Fail fast when either input directory is absent.
for _required_dir in (CORPUS_DIR, ALPACA_DIR):
    assert _required_dir.exists() and _required_dir.is_dir(), f"Missing directory: {_required_dir}"

corpus_txt_files = sorted(CORPUS_DIR.glob("**/*.txt"))
parquet_files = sorted(ALPACA_DIR.glob("*.parquet"))

# Small inventory of what was found, printed as JSON.
stats = dict(
    corpus_num_files=len(corpus_txt_files),
    corpus_sample=list(map(str, corpus_txt_files[:3])),
    alpaca_parquet_files=list(map(str, parquet_files)),
)

print(json.dumps(stats, ensure_ascii=False, indent=2))

In [None]:
# Size survey of the corpus: files, lines and characters (line endings included).
files = sorted(CORPUS_DIR.glob("**/*.txt"))

num_files = len(files)
num_lines = 0
num_chars = 0

for p in tqdm(files, desc="scan files"):
    with p.open("r", encoding="utf-8", errors="ignore") as f:
        line_lengths = [len(line) for line in f]
    num_lines += len(line_lengths)
    num_chars += sum(line_lengths)

# Guard the averages against an empty corpus.
avg_chars_per_line = 0.0 if not num_lines else num_chars / num_lines
avg_lines_per_file = 0.0 if not num_files else num_lines / num_files

print(dict(
    num_files=num_files,
    num_lines=num_lines,
    num_chars=num_chars,
    avg_chars_per_line=round(avg_chars_per_line, 2),
    avg_lines_per_file=round(avg_lines_per_file, 2),
))

In [None]:
# Load every corpus line with its line terminator removed, and group
# consecutive non-blank lines into paragraphs. Paragraphs never span files.
all_lines: list[str] = []
paragraphs: list[str] = []
current_paragraph: list[str] = []

for p in tqdm(files, desc="read paragraphs"):
    with p.open("r", encoding="utf-8", errors="ignore") as f:
        file_lines = [raw.rstrip("\n\r") for raw in f]
    all_lines.extend(file_lines)
    # A sentinel blank line after the file closes any paragraph still open at EOF.
    for line in file_lines + [""]:
        if line.strip():
            current_paragraph.append(line)
        elif current_paragraph:
            paragraphs.append("\n".join(current_paragraph))
            current_paragraph = []

num_lines_before = len(all_lines)
num_paragraphs_before = len(paragraphs)

# Exact-match deduplication; dict keys keep the first occurrence in original order.
unique_lines = list(dict.fromkeys(all_lines))
unique_paragraphs = list(dict.fromkeys(paragraphs))

print(dict(
    lines_before=num_lines_before,
    lines_after_unique=len(unique_lines),
    paragraphs_before=num_paragraphs_before,
    paragraphs_after_unique=len(unique_paragraphs),
))

In [None]:
# Non-letter characters accepted in a corpus line: digits, punctuation and
# assorted space characters. Letters are validated separately and must be Cyrillic.
ALLOWED_CHARS = set(" \t\n\r0123456789!?,.:;-—()[]{}'\"“…»«/\\|@#$%^&*+=~`<>·•–\u00A0\u2000\u2001\u2002\u2003\u2009\u2026")

def is_cyrillic_line(line: str) -> bool:
    """Return True when ``line`` holds only Cyrillic letters and allowed symbols.

    Alphabetic characters must fall in А-Я, а-я, or be Ё/ё. Every other
    character must be in ``ALLOWED_CHARS`` or be whitespace. An empty line
    is accepted.
    """
    def _char_ok(ch: str) -> bool:
        if ch.isalpha():
            return "А" <= ch <= "Я" or "а" <= ch <= "я" or ch in ("Ё", "ё")
        return ch in ALLOWED_CHARS or ch.isspace()

    return all(_char_ok(ch) for ch in line)

# Keep only the deduplicated lines that pass the Cyrillic check.
filtered_lines: list[str] = list(filter(is_cyrillic_line, unique_lines))
print(dict(
    lines_before=len(unique_lines),
    lines_after_cyrillic_filter=len(filtered_lines),
))

In [None]:
# Punctuation and whitespace normalisation of the Cyrillic-only lines.
# The substitutions run in this order on every line:
#   1. runs of "!" become a single "!"
#   2. runs of "?" become a single "?"
#   3. three or more dots, or two or more "…", become one "…"
#   4. runs of spaces/tabs become a single space
# and finally trailing whitespace is removed.
_NORMALIZATION_RULES = (
    (re.compile(r"[!]{2,}"), "!"),
    (re.compile(r"[?]{2,}"), "?"),
    (re.compile(r"(\.{3,}|…{2,})"), "…"),
    (re.compile(r"[ \t]{2,}"), " "),
)

norm_lines: list[str] = []
for ln in tqdm(filtered_lines, desc="normalize"):
    normalized = ln
    for _pattern, _replacement in _NORMALIZATION_RULES:
        normalized = _pattern.sub(_replacement, normalized)
    norm_lines.append(normalized.rstrip())

print(dict(
    lines_before=len(filtered_lines),
    lines_after_norm=len(norm_lines),
))

In [None]:
# Paragraph texts that will later be tokenized and packed at token level
# (no character-based chunking here). Carriage returns are removed, the
# paragraph is trimmed, and paragraphs that end up empty are dropped.
# NOTE(review): the source is unique_paragraphs, which has not gone through
# the Cyrillic filter or the punctuation normalisation applied to the line
# corpus — confirm that this is intended.
joined_text = [
    cleaned
    for cleaned in (
        para.replace("\r", "").strip()
        for para in tqdm(unique_paragraphs, desc="join paragraphs")
    )
    if cleaned
]

print(dict(paragraphs_joined=len(joined_text)))

In [None]:
# Counts before and after each preprocessing stage, reported as one JSON document.
pre_stats = dict(
    initial_files=len(corpus_txt_files),
    initial_lines=num_lines_before,
)

post_stats = dict(
    unique_lines=len(unique_lines),
    unique_paragraphs=len(unique_paragraphs),
    cyrillic_lines=len(filtered_lines),
    normalized_lines=len(norm_lines),
    paragraphs_joined=len(joined_text),
)

_preprocessing_report = json.dumps({"pre": pre_stats, "post": post_stats}, ensure_ascii=False, indent=2)
print("Preprocessing summary:\n" + _preprocessing_report)

In [None]:
# Train a BPE tokenizer on the normalized line corpus.
SPECIAL_TOKENS = ["<unk>", "<pad>", "<bos>", "<eos>"]
VOCAB_SIZE = 3000

# Distinct names for the tokenizers-library objects: `trainer` and `model`
# are bound later in this notebook to the HF Trainer and the language model,
# so reusing them here would leave the wrong object behind a familiar name
# whenever cells are run out of order.
bpe_trainer = BpeTrainer(vocab_size=VOCAB_SIZE, special_tokens=SPECIAL_TOKENS)
bpe_model = BPE(unk_token="<unk>")

tokenizer = Tokenizer(bpe_model)
tokenizer.pre_tokenizer = Whitespace()

tokenizer.train_from_iterator(norm_lines, trainer=bpe_trainer)

# Wrap every encoded text in <bos> ... <eos>. The ids are looked up after
# training because they only exist once the vocabulary has been built.
bos_id = tokenizer.token_to_id("<bos>")
eos_id = tokenizer.token_to_id("<eos>")
tokenizer.post_processor = TemplateProcessing(
    single="<bos> $A <eos>",
    pair="<bos> $A <eos> <bos> $B <eos>",
    special_tokens=[
        ("<bos>", bos_id),
        ("<eos>", eos_id),
    ],
)

print({
    "vocab_size": tokenizer.get_vocab_size(),
    "bos_id": bos_id,
    "eos_id": eos_id,
    "pad_id": tokenizer.token_to_id("<pad>"),
    "unk_id": tokenizer.token_to_id("<unk>"),
})

In [None]:
# Persist the trained tokenizer and load it back from disk, so the rest of
# the notebook works with exactly what was serialized.
TOKENIZER_PATH = Path("tokenizer.json")
_tokenizer_file = str(TOKENIZER_PATH)

tokenizer.save(_tokenizer_file)
print(dict(saved=TOKENIZER_PATH.exists(), path=_tokenizer_file))

reloaded = Tokenizer.from_file(_tokenizer_file)

# Report the reloaded vocabulary size and special-token ids.
_reloaded_report = {"vocab_size": reloaded.get_vocab_size()}
for _key, _token in (("bos_id", "<bos>"), ("eos_id", "<eos>"), ("pad_id", "<pad>"), ("unk_id", "<unk>")):
    _reloaded_report[_key] = reloaded.token_to_id(_token)
print(_reloaded_report)

In [None]:
# Round-trip encode/decode verification using the reloaded tokenizer.
# The sample is the first non-empty normalized corpus line (a fixed literal
# is used only when the corpus has none).
sample = next((s for s in norm_lines if s.strip()), "Sample text.")
enc = reloaded.encode(sample)
dec = reloaded.decode(enc.ids)

# The tokenizer was built with a Whitespace pre-tokenizer and no decoder, so
# the original spacing is not recoverable from the token ids and an exact
# string comparison fails on ordinary text. The check therefore compares the
# texts with all whitespace removed, which still catches lost or altered
# characters.
def _without_whitespace(text: str) -> str:
    """Return ``text`` with every whitespace character removed."""
    return re.sub(r"\s+", "", text)

roundtrip_ok = _without_whitespace(dec) == _without_whitespace(sample)

print({
    "sample_preview": sample[:80],
    "num_ids": len(enc.ids),
    "decoded_equals": dec == sample,
    "decoded_equals_ignoring_whitespace": roundtrip_ok,
})

assert roundtrip_ok, "Round-trip encode/decode mismatch"

In [None]:
# Special-token ids as stored in the reloaded tokenizer; later cells pass
# them to the model config and to generation.
PAD_ID, BOS_ID, EOS_ID, UNK_ID = (
    reloaded.token_to_id(_token) for _token in ("<pad>", "<bos>", "<eos>", "<unk>")
)

print(dict(PAD_ID=PAD_ID, BOS_ID=BOS_ID, EOS_ID=EOS_ID, UNK_ID=UNK_ID))

In [None]:
# Token-level packing: every paragraph is encoded (the tokenizer's
# post-processor adds <bos>/<eos>), all ids are concatenated into one stream,
# and the stream is cut into consecutive windows of MAX_LEN tokens. Only the
# final window may be shorter.
MAX_LEN = 512

# Built-in generics are used for the annotations, as elsewhere in this
# notebook; the name `List` is never imported.
ids_stream: list[int] = []
for t in tqdm(joined_text, desc="encode paragraphs"):
    ids_stream.extend(reloaded.encode(t).ids)

# Slicing yields the same windows as appending token by token, without
# repeatedly copying the remainder of the buffer.
input_ids: list[list[int]] = [
    ids_stream[start:start + MAX_LEN] for start in range(0, len(ids_stream), MAX_LEN)
]
# No padding is introduced by packing, so every position is attended to.
attention_mask: list[list[int]] = [[1] * len(chunk) for chunk in input_ids]

train_ds = Dataset.from_dict({
    "input_ids": input_ids,
    "attention_mask": attention_mask,
})

print({
    "num_sequences": len(train_ds),
    "avg_len": round(sum(len(x) for x in input_ids) / max(1, len(input_ids)), 1),
    "max_len": max((len(x) for x in input_ids), default=0),
})

In [None]:
# Positional split with no shuffling: the first 95% of the packed sequences
# are used for training, the remainder for validation.
_total_sequences = len(train_ds)
split_idx = int(0.95 * _total_sequences)

train_split = train_ds.select(range(split_idx))
val_split = train_ds.select(range(split_idx, _total_sequences))

datasets_dict = DatasetDict({"train": train_split, "validation": val_split})

_split_sizes = {}
for _split_name, _split in datasets_dict.items():
    _split_sizes[_split_name] = len(_split)
print(_split_sizes)

In [None]:
# Wrap tokenizer.json in a transformers fast tokenizer and register the four
# special tokens on it, so the collator and the Trainer can use it.
tok_fast = PreTrainedTokenizerFast(tokenizer_file=str(TOKENIZER_PATH))
for _attr, _token in (
    ("pad_token", "<pad>"),
    ("unk_token", "<unk>"),
    ("bos_token", "<bos>"),
    ("eos_token", "<eos>"),
):
    setattr(tok_fast, _attr, _token)

# Causal-LM collation (mlm disabled).
data_collator = DataCollatorForLanguageModeling(tokenizer=tok_fast, mlm=False)

# Smoke test: collate the first example together with the second one
# (or the first again when the dataset has a single row).
_second_idx = min(1, len(train_ds)-1)
batch = data_collator([train_ds[0], train_ds[_second_idx]])
_batch_report = {}
for k, v in batch.items():
    _batch_report[k] = v.shape if hasattr(v, 'shape') else type(v)
print(_batch_report)

In [None]:
from transformers import LlamaConfig, LlamaForCausalLM

# From here on VOCAB_SIZE is the size actually realised by the trained tokenizer.
VOCAB_SIZE = reloaded.get_vocab_size()

# Architecture of the from-scratch model; special-token ids come from the tokenizer.
_llama_settings = dict(
    vocab_size=VOCAB_SIZE,
    hidden_size=1024,
    intermediate_size=1536,
    num_hidden_layers=16,
    num_attention_heads=16,
    num_key_value_heads=8,
    max_position_embeddings=512,
    pad_token_id=PAD_ID,
    bos_token_id=BOS_ID,
    eos_token_id=EOS_ID,
    tie_word_embeddings=True,
)
config = LlamaConfig(**_llama_settings)

model = LlamaForCausalLM(config)
# Explicitly size the embeddings to the vocabulary, mark the padding row and tie the weights.
model.resize_token_embeddings(VOCAB_SIZE)
_input_embeddings = model.get_input_embeddings()
_input_embeddings.padding_idx = PAD_ID
model.tie_weights()

# Quick report: parameter count, presence of an output head, configured pad id.
num_params = sum(param.numel() for param in model.parameters())
has_output_head = model.get_output_embeddings() is not None
print(dict(
    num_params=num_params,
    has_output_head=has_output_head,
    pad_token_id=model.config.pad_token_id,
))

In [None]:
from transformers import TrainingArguments, Trainer

# 2 sequences per device x 32 accumulation steps = 64 sequences per optimizer
# step; raise or lower the accumulation to fit the available VRAM.
per_device_train_batch_size = 2
gradient_accumulation_steps = 32
warmup_ratio = 0.03

# Pretraining hyper-parameters. Evaluation and checkpointing both happen once
# per epoch; fp16 mixed precision is switched on only when CUDA is available.
_pretrain_arg_values = dict(
    output_dir="outputs/pretrain",
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-4,
    weight_decay=0.01,
    warmup_ratio=warmup_ratio,
    num_train_epochs=1,
    fp16=torch.cuda.is_available(),
    bf16=False,
    logging_steps=50,
    report_to=[],
)
args = TrainingArguments(**_pretrain_arg_values)

trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=datasets_dict["train"],
    eval_dataset=datasets_dict["validation"],
)

print("trainer ready")

In [None]:
# Ten fixed pretraining prompts taken from the assignment. The epoch-end
# generation callback (EpochGenCB) completes each of them after every epoch.
test_prompts: list[str] = [
    "Все мысли, которые имеют огромные последствия",
    "Сила войска зависит от его духа",
    "Мысль о том, что он принес страдания",
    "Человек сознает себя свободным",
    "Что бы ни случилось, я всегда буду",
    "Любовь мешает смерти",
    "Нет, жизнь не кончена",
    "Всякая мысль, даже самая простая",
    "Война не любезность, а самое гадкое дело",
    "Чтобы жить честно",
]

# Minimal epoch-end generation callback
class EpochGenCB(TrainerCallback):
    """Trainer callback that greedily completes ``test_prompts`` after each epoch.

    Every call appends one record to the module-level ``epoch_generations``
    list, shaped as
    ``{"epoch": int, "generations": [{"prompt": str, "text": str}, ...]}``.
    """

    def on_epoch_end(self, args, state, control, model=None, tokenizer=None, **kwargs):  # type: ignore[override]
        """Generate up to 64 new tokens per prompt and store the decoded texts.

        Args:
            args, state, control: objects passed in by the Trainer; only
                ``state.epoch`` is read here.
            model: the model being trained; used for generation.
            tokenizer: accepted for signature compatibility and not used;
                encoding and decoding go through the notebook-level ``tok_fast``.
        """
        # Generate in eval mode and restore the previous mode afterwards so
        # training continues unchanged.
        was_training = model.training
        model.eval()
        gens = []
        gen_kwargs = dict(max_new_tokens=64, do_sample=False, pad_token_id=PAD_ID, eos_token_id=EOS_ID)
        try:
            for p in test_prompts:
                prompt_ids = tok_fast([p], return_tensors="pt")["input_ids"]
                # The tokenizer template wraps text as <bos> ... <eos>. A
                # trailing <eos> would ask the model to continue after the end
                # of the sequence rather than after the prompt, so drop it.
                if prompt_ids.shape[1] > 1 and int(prompt_ids[0, -1]) == EOS_ID:
                    prompt_ids = prompt_ids[:, :-1]
                prompt_ids = prompt_ids.to(model.device)
                with torch.inference_mode():
                    # Only input_ids and an all-ones mask (one unpadded
                    # sequence) are passed, so no other tokenizer output is
                    # forwarded to generate().
                    y = model.generate(
                        input_ids=prompt_ids,
                        attention_mask=torch.ones_like(prompt_ids),
                        **gen_kwargs,
                    )
                gens.append({"prompt": p, "text": tok_fast.decode(y[0], skip_special_tokens=True)})
        finally:
            if was_training:
                model.train()
        # state.epoch is a float; round instead of truncating so a value just
        # below a whole number is not reported as the previous epoch.
        epoch = int(round(state.epoch)) if state.epoch is not None else 0
        epoch_generations.append({"epoch": epoch, "generations": gens})
        print(f"Stored generations for epoch {epoch}")

# Records produced by the callback, one entry per finished epoch.
epoch_generations: list[dict] = []

# Attach a callback instance to the pretraining trainer.
_epoch_gen_callback = EpochGenCB()
trainer.add_callback(_epoch_gen_callback)
print("epoch-end generation callback registered")

In [None]:
from math import exp

def _numeric_only(metrics):
    """Return the int/float entries of ``metrics`` converted to float."""
    return {k: float(v) for k, v in metrics.items() if isinstance(v, (int, float))}

# Run pretraining, then a final evaluation pass to read the validation loss.
train_result = trainer.train()
metrics_train = train_result.metrics

metrics_eval = trainer.evaluate()
val_loss = metrics_eval.get("eval_loss", None)
# Perplexity is exp(loss); it stays None when no loss was reported.
val_ppl = None if val_loss is None else exp(val_loss)

print(dict(
    train_metrics=_numeric_only(metrics_train),
    eval_metrics=_numeric_only(metrics_eval),
    val_ppl=None if val_ppl is None else float(val_ppl),
))

In [None]:
from pathlib import Path

# Final pretraining artefacts: model weights, tokenizer and trainer state.
SAVE_DIR = Path("outputs/pretrain/final")
SAVE_DIR.mkdir(parents=True, exist_ok=True)

trainer.save_model(str(SAVE_DIR))
# The pretraining Trainer in this notebook is constructed without a tokenizer
# argument, so the tokenizer is written explicitly (the SFT save cell does the
# same for its tokenizer). This makes the directory loadable on its own.
tok_fast.save_pretrained(str(SAVE_DIR))
trainer.state.save_to_json(str(SAVE_DIR / "trainer_state.json"))

print({
    "saved_dir": str(SAVE_DIR),
    "exists": SAVE_DIR.exists(),
})

In [None]:
# Load the Alpaca parquet shard(s) matching the expected file-name pattern.
alpaca = load_dataset("parquet", data_files=str(ALPACA_DIR / "train-00000-of-00001-*.parquet"))
print(alpaca)

# A dedicated name is used for the SFT source split: `train_split` already
# denotes the pretraining split in an earlier cell, and rebinding it here
# would silently change what that earlier cell's dictionary is rebuilt from
# if it were re-run.
alpaca_train = alpaca["train"]
num_rows = int(alpaca_train.num_rows)
columns = list(alpaca_train.features.keys())
print({
    "num_rows": num_rows,
    "columns": columns,
})

In [None]:
# Column mapping for SFT (no pandas): input -> system, instruction -> user,
# output -> assistant. First make sure the three source columns exist.
sft_train = alpaca["train"]
feat = getattr(sft_train, "features", None)
if feat is None:
    columns = list(getattr(sft_train, "column_names", []))
else:
    columns = list(feat.keys())
required = {"input", "instruction", "output"}
_missing_columns = required - set(columns)
assert not _missing_columns, f"Missing required columns: {_missing_columns}"

def _map(row):
    return {
        "system": (row.get("input") or ""),
        "user": (row.get("instruction") or ""),
        "assistant": (row.get("output") or ""),
    }

# Apply the role mapping and drop every original column, leaving only
# system / user / assistant.
_original_columns = sft_train.column_names
sft_mapped = sft_train.map(_map, remove_columns=_original_columns)

print(dict(
    num_rows=int(getattr(sft_mapped, "num_rows", 0)),
    columns=list(getattr(sft_mapped, "features", {}).keys()),
))

In [None]:
# Base model and tokenizer for supervised fine-tuning.
QWEN_ID = "Qwen/Qwen2.5-0.5B"
qwen_tok = AutoTokenizer.from_pretrained(QWEN_ID)
# The weights are loaded in float32 on purpose. The SFT TrainingArguments in
# this notebook enable fp16 mixed precision on CUDA, and mixed-precision
# training expects float32 master weights: with parameters that are already
# float16 the gradient scaler cannot unscale their gradients and training
# stops with an error.
qwen_model = AutoModelForCausalLM.from_pretrained(QWEN_ID, torch_dtype=torch.float32)

# Batching needs a pad token; fall back to EOS (or the literal end-of-text
# token) when the tokenizer does not define one.
if qwen_tok.pad_token is None:
    qwen_tok.pad_token = qwen_tok.eos_token or "<|endoftext|>"

print({
    "pad_token": qwen_tok.pad_token,
    "pad_token_id": qwen_tok.pad_token_id,
    "eos_token": qwen_tok.eos_token,
    "eos_token_id": qwen_tok.eos_token_id,
})

In [None]:
# Render every mapped row to a single training string with the tokenizer's
# chat template (no pandas). The system turn is included only when it is
# non-empty after trimming.
messages_texts = []
for r in tqdm(sft_mapped, desc="build sft texts"):
    # Named `system_text` rather than `sys`, so the standard-library module
    # name used by a later cell is not shadowed by a string.
    system_text = (r.get("system") or "").strip()
    messages = [{"role": "system", "content": system_text}] if system_text else []
    messages.append({"role": "user", "content": r.get("user") or ""})
    messages.append({"role": "assistant", "content": r.get("assistant") or ""})
    messages_texts.append(
        qwen_tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
    )

sft_text_ds = Dataset.from_dict({"text": messages_texts})
# Small validation split (5%), seeded for reproducibility.
sft_splits = sft_text_ds.train_test_split(test_size=0.05, seed=SEED)

# SFT hyper-parameters. No checkpoints are written during training.
sft_args = TrainingArguments(
    output_dir="outputs/sft",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=32,
    learning_rate=2e-5,
    weight_decay=0.0,
    warmup_ratio=0.03,
    num_train_epochs=1,
    logging_steps=50,
    # eval_steps only takes effect together with a step-based evaluation
    # strategy; without it the eval_dataset passed below is never used. The
    # keyword matches the one used for the pretraining arguments in this notebook.
    evaluation_strategy="steps",
    eval_steps=200,
    save_strategy="no",
    fp16=torch.cuda.is_available(),
    report_to=[],
)

# The trainer reads the rendered conversations from the "text" column.
sft_trainer = SFTTrainer(
    model=qwen_model,
    tokenizer=qwen_tok,
    args=sft_args,
    train_dataset=sft_splits["train"],
    eval_dataset=sft_splits["test"],
    dataset_text_field="text",
)

print("sft trainer ready")

In [None]:
# Run supervised fine-tuning and print its numeric metrics as floats.
sft_train_result = sft_trainer.train()
_sft_numeric_metrics = {}
for k, v in sft_train_result.metrics.items():
    if isinstance(v, (int, float)):
        _sft_numeric_metrics[k] = float(v)
print(_sft_numeric_metrics)

In [None]:
# Generate greedy answers for the 4 evaluation questions with the fine-tuned model.
questions_rus = [
    "сколько планет в нашей солнечной системе?",
    "расскажи стих",
    "когда собирать крыжовник?",
    "Как быстро выучить новый язык?",
]

# Generation should not run in training mode.
qwen_model.eval()

responses = []
for q in questions_rus:
    # Only a user turn is sent; no system message is supplied by this cell.
    msgs = [{"role": "user", "content": q}]
    prompt = qwen_tok.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)
    inputs = qwen_tok([prompt], return_tensors="pt").to(qwen_model.device)
    with torch.inference_mode():
        out = qwen_model.generate(
            **inputs,
            max_new_tokens=128,
            do_sample=False,
            pad_token_id=qwen_tok.pad_token_id,
            eos_token_id=qwen_tok.eos_token_id,
        )
    # generate() returns the prompt followed by the continuation; decode only
    # the continuation so the stored answer does not repeat the question and
    # the chat scaffolding.
    prompt_len = inputs["input_ids"].shape[1]
    text = qwen_tok.decode(out[0][prompt_len:], skip_special_tokens=True).strip()
    responses.append({"question": q, "answer": text})

for r in responses:
    print("Q:", r["question"])
    print("A:", r["answer"])
    print("---")

In [None]:
# Write the fine-tuned model and its tokenizer into one directory.
from pathlib import Path
SFT_DIR = Path("outputs/sft/final")
SFT_DIR.mkdir(parents=True, exist_ok=True)

_sft_output_path = str(SFT_DIR)
sft_trainer.save_model(_sft_output_path)
qwen_tok.save_pretrained(_sft_output_path)

print(dict(sft_saved_dir=_sft_output_path, exists=SFT_DIR.exists()))

In [None]:
import json
import sys

# Versions of the libraries that drive training, looked up by module name.
_library_versions = {}
for _module_name in ("transformers", "datasets", "trl"):
    _library_versions[_module_name] = __import__(_module_name).__version__

# Facts about the pretraining model and the SFT data split.
_pretrain_summary = dict(
    vocab_size=int(VOCAB_SIZE),
    num_params=int(sum(p.numel() for p in model.parameters())),
    context_len=int(model.config.max_position_embeddings),
)
_sft_summary = dict(
    base_model=QWEN_ID,
    train_rows=int(getattr(sft_splits["train"], "num_rows", 0)),
    eval_rows=int(getattr(sft_splits["test"], "num_rows", 0)),
)

# Run summary: environment, library versions, and both training stages.
summary = dict(
    seed=SEED,
    device=str(device),
    device_name=device_name,
    dtype=str(dtype),
    python=sys.version.split()[0],
    torch=torch.__version__,
    **_library_versions,
    pretrain=_pretrain_summary,
    sft=_sft_summary,
)
print(json.dumps(summary, ensure_ascii=False, indent=2))

In [None]:
# Final read-out: the pretraining generations of the last recorded epoch,
# followed by the SFT answers to the evaluation questions.
import json

if not epoch_generations:
    print("No pretrain generations recorded.")
else:
    last = epoch_generations[-1]
    print(f"Pretrain epoch {last['epoch']} generations:")
    for i, g in enumerate(last["generations"], 1):
        print(f"[{i:02d}] prompt: {g['prompt']}", f"     text: {g['text']}", "---", sep="\n")

print("SFT evaluation answers:")
for r in responses:
    for _label, _field in (("Q:", "question"), ("A:", "answer")):
        print(_label, r[_field])
    print("---")

In [None]:
# Write the collected results under outputs/reports as JSON and CSV files.
import json
import csv
from pathlib import Path

OUT_DIR = Path("outputs/reports")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# 1) All per-epoch pretraining generations, as JSON.
with (OUT_DIR / "pretrain_epoch_generations.json").open("w", encoding="utf-8") as f:
    json.dump(epoch_generations, f, ensure_ascii=False, indent=2)

# 2) Generations of the last recorded epoch, as CSV. The header row is
#    written even when nothing was recorded.
_last_epoch_rows = []
if epoch_generations:
    last = epoch_generations[-1]
    _last_epoch_rows = [
        [last["epoch"], i, g["prompt"], g["text"]]
        for i, g in enumerate(last["generations"], 1)
    ]
with (OUT_DIR / "pretrain_last_epoch_generations.csv").open("w", encoding="utf-8", newline="") as f:
    w = csv.writer(f)
    w.writerow(["epoch", "idx", "prompt", "text"])
    w.writerows(_last_epoch_rows)

# 3) SFT question/answer pairs, as CSV.
with (OUT_DIR / "sft_eval_responses.csv").open("w", encoding="utf-8", newline="") as f:
    w = csv.writer(f)
    w.writerow(["question", "answer"])
    w.writerows([r["question"], r["answer"]] for r in responses)

print(dict(written=True, dir=str(OUT_DIR)))