# –í—ã–≤–æ–¥ –≤ –∫–æ–Ω—Ü–µ

In [1]:
%%writefile /content/text_autocomplete/src/data_utils.py
import os
import re
import pandas as pd
from typing import Tuple

BASE_DIR = "/content/text_autocomplete"
RAW_PATH_DEFAULT = os.path.join(BASE_DIR, "data", "tweets.txt")
DATA_DIR_DEFAULT = os.path.join(BASE_DIR, "data")

def clean_text(text: str) -> str:
    text = str(text).lower()
    text = re.sub(r"http\S+|www\S+", " ", text)
    text = re.sub(r"@\w+", " ", text)
    text = re.sub(r"[^\w\s.,!?]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

def prepare_from_txt(raw_path: str = RAW_PATH_DEFAULT,
                     data_dir: str = DATA_DIR_DEFAULT,
                     train_frac: float = 0.8,
                     val_frac: float = 0.1) -> Tuple[str, str, str]:
    if not os.path.exists(raw_path):
        raise FileNotFoundError(f"–ù–µ –Ω–∞–π–¥–µ–Ω —Ñ–∞–π–ª: {raw_path}")

    os.makedirs(data_dir, exist_ok=True)

    # 1) —á–∏—Ç–∞–µ–º –∏—Å—Ö–æ–¥–Ω—ã–π txt
    with open(raw_path, "r", encoding="utf-8", errors="ignore") as f:
        lines = [ln.strip() for ln in f if ln.strip()]

    # 2) raw ‚Üí CSV
    raw_csv_path = os.path.join(data_dir, "raw_dataset.csv")
    pd.DataFrame({"text": lines}).to_csv(raw_csv_path, index=False)

    # 3) —á–∏—Å—Ç–∫–∞
    cleaned = [clean_text(t) for t in lines if len(t) > 1]
    df = pd.DataFrame({"text": cleaned})
    proc_path = os.path.join(data_dir, "dataset_processed.csv")
    df.to_csv(proc_path, index=False)

    # 4) —Å–ø–ª–∏—Ç—ã
    n = len(df)
    n_train = int(n * train_frac)
    n_val = int(n * val_frac)
    train = df.iloc[:n_train]
    val   = df.iloc[n_train:n_train + n_val]
    test  = df.iloc[n_train + n_val:]

    train_path = os.path.join(data_dir, "train.csv")
    val_path   = os.path.join(data_dir, "val.csv")
    test_path  = os.path.join(data_dir, "test.csv")

    train.to_csv(train_path, index=False)
    val.to_csv(val_path, index=False)
    test.to_csv(test_path, index=False)

    print(f"‚úî raw ‚Üí {raw_csv_path}")
    print(f"‚úî processed ‚Üí {proc_path}")
    print(f"‚úî splits ‚Üí {train_path}, {val_path}, {test_path}")
    print(f"–†–∞–∑–º–µ—Ä—ã: train={len(train)}, val={len(val)}, test={len(test)}")
    return train_path, val_path, test_path

if __name__ == "__main__":
    prepare_from_txt(RAW_PATH_DEFAULT, DATA_DIR_DEFAULT)


Writing /content/text_autocomplete/src/data_utils.py


In [2]:
# –ø—Ä–æ–≤–µ—Ä–∫–∞ –∏—Å—Ö–æ–¥–Ω–∏–∫–∞
import os
print("tweets.txt exists:", os.path.exists("/content/text_autocomplete/data/tweets.txt"))

# –∑–∞–ø—É—Å–∫ —Å–∫—Ä–∏–ø—Ç–∞
!python -u /content/text_autocomplete/src/data_utils.py


tweets.txt exists: True
‚úî raw ‚Üí /content/text_autocomplete/data/raw_dataset.csv
‚úî processed ‚Üí /content/text_autocomplete/data/dataset_processed.csv
‚úî splits ‚Üí /content/text_autocomplete/data/train.csv, /content/text_autocomplete/data/val.csv, /content/text_autocomplete/data/test.csv
–†–∞–∑–º–µ—Ä—ã: train=1014836, val=126854, test=126855


In [3]:
# src/next_token_dataset.py
%%writefile /content/text_autocomplete/src/next_token_dataset.py
import re
import json
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from collections import Counter
from typing import List, Tuple

# —Ç–æ–∫–µ–Ω–∏–∑–∞—Ü–∏—è
TOKEN_RE = re.compile(r"\w+|[^\w\s]", re.UNICODE)

def tokenize(s: str) -> List[str]:
    return TOKEN_RE.findall(str(s).lower())

# –∑–∞—Ä–µ–∑–µ—Ä–≤–∏—Ä–æ–≤–∞–Ω–Ω—ã–µ —Ç–æ–∫–µ–Ω—ã
PAD, UNK, BOS, EOS = "<pad>", "<unk>", "<bos>", "<eos>"

# —Å–ª–æ–≤–∞—Ä—å
def build_vocab(train_csv: str, min_freq: int = 2, out_dir: str = "artifacts"
               ) -> Tuple[dict, dict, int, int, int, int]:
    """
    –°–æ–∑–¥–∞—ë—Ç —Å–ª–æ–≤–∞—Ä—å –ø–æ train.csv (–∫–æ–ª–æ–Ω–∫–∞ 'text').
    –í–æ–∑–≤—Ä–∞—â–∞–µ—Ç: stoi, itos, pad_id, unk_id, bos_id, eos_id
    """
    import os
    os.makedirs(out_dir, exist_ok=True)

    df = pd.read_csv(train_csv)
    texts = df["text"].astype(str).tolist()

    counter = Counter()
    for s in texts:
        counter.update(tokenize(s))

    vocab = [PAD, UNK, BOS, EOS] + [t for t, c in counter.items() if c >= min_freq]
    stoi = {t: i for i, t in enumerate(vocab)}
    itos = {i: t for t, i in stoi.items()}

    with open(f"{out_dir}/vocab.json", "w", encoding="utf-8") as f:
        json.dump(
            {"stoi": stoi, "itos": {str(k): v for k, v in itos.items()}},
            f, ensure_ascii=False, indent=2
        )

    return stoi, itos, stoi[PAD], stoi[UNK], stoi[BOS], stoi[EOS]

# –∫–æ–¥–∏—Ä–æ–≤–∞–Ω–∏–µ
def encode(tokens: List[str], stoi: dict, unk_id: int) -> List[int]:
    return [stoi.get(t, unk_id) for t in tokens]

# —Ä–∞–∑–±–∏–µ–Ω–∏–µ –Ω–∞ –ø–∞—Ä—ã
def make_pairs_from_stream(
    text_list: List[str],
    stoi: dict,
    bos_id: int,
    eos_id: int,
    unk_id: int,
    max_len: int = 32
) -> List[Tuple[List[int], List[int]]]:
    """
    –±–ª–æ–∫ –¥–∞—ë—Ç x, y —Å–æ —Å–¥–≤–∏–≥–æ–º –Ω–∞ 1
    """
    ids: List[int] = []
    for s in text_list:
        toks = tokenize(s)
        seq  = [bos_id] + encode(toks, stoi, unk_id) + [eos_id]
        ids.extend(seq)

    pairs: List[Tuple[List[int], List[int]]] = []
    for i in range(0, len(ids) - 1, max_len):
        x = ids[i : i + max_len]
        y = ids[i + 1 : i + 1 + max_len]
        if len(x) == len(y):
            pairs.append((x, y))
    return pairs

# Dataset DataLoader
class BlockDataset(Dataset):
    def __init__(self, pairs: List[Tuple[List[int], List[int]]]):
        self.pairs = pairs

    def __len__(self) -> int:
        return len(self.pairs)

    def __getitem__(self, i: int):
        x, y = self.pairs[i]
        return torch.tensor(x), torch.tensor(y)

def collate_pad(batch, pad_id: int):
    xs, ys = list(zip(*batch))
    T = max(x.size(0) for x in xs)
    xpad = torch.full((len(xs), T), pad_id)
    ypad = torch.full((len(xs), T), pad_id)
    for i, (x, y) in enumerate(zip(xs, ys)):
        xpad[i, : x.size(0)] = x
        ypad[i, : y.size(0)] = y
    return xpad.long(), ypad.long()

def make_loader(
    pairs: List[Tuple[List[int], List[int]]],
    batch_size: int,
    pad_id: int,
    shuffle: bool,
    pin_memory: bool,
    num_workers: int
) -> DataLoader:
    return DataLoader(
        BlockDataset(pairs),
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
        pin_memory=pin_memory,
        collate_fn=lambda b: collate_pad(b, pad_id),
    )

# –∑–∞–≥—Ä—É–∑–∫–∞ —Ç–µ–∫—Å—Ç–æ–≤ –∏–∑ csv
def load_texts(csv_path: str) -> List[str]:
    return pd.read_csv(csv_path)["text"].astype(str).tolist()

# —Ç–µ—Å—Ç
if __name__ == "__main__":
    # –ø—Ä–∏–º–µ—Ä —Å–∞–º–æ–ø—Ä–æ–≤–µ—Ä–∫–∏ ‚Äî –ø–æ–º–µ–Ω—è–π –ø—É—Ç–∏ –ø–æ–¥ —Å–µ–±—è –ø—Ä–∏ –∂–µ–ª–∞–Ω–∏–∏
    DATA_DIR = "/content/text_autocomplete/data"
    train_csv = f"{DATA_DIR}/train.csv"

    stoi, itos, pad_id, unk_id, bos_id, eos_id = build_vocab(train_csv, min_freq=2, out_dir="artifacts")
    texts = load_texts(train_csv)
    pairs = make_pairs_from_stream(texts, stoi, bos_id, eos_id, unk_id, max_len=32)

    PIN = torch.cuda.is_available()
    loader = make_loader(pairs, batch_size=128, pad_id=pad_id, shuffle=True,
                         pin_memory=PIN, num_workers=0)

    xb, yb = next(iter(loader))
    print("shapes:", xb.shape, yb.shape, "| steps/epoch:", len(loader))


Writing /content/text_autocomplete/src/next_token_dataset.py


In [4]:
!python -u /content/text_autocomplete/src/next_token_dataset.py


shapes: torch.Size([128, 32]) torch.Size([128, 32]) | steps/epoch: 4398


In [5]:
# LSTM LM
%%writefile /content/text_autocomplete/src/lstm_model.py
import torch, torch.nn as nn

class LSTMLM(nn.Module):
    def __init__(self, vocab_size, emb=256, hidden=512, num_layers=2, drop=0.1, pad_id=0):
        super().__init__()
        self.emb  = nn.Embedding(vocab_size, emb, padding_idx=pad_id)
        self.lstm = nn.LSTM(emb, hidden, num_layers=num_layers, batch_first=True, dropout=drop)
        self.proj = nn.Linear(hidden, vocab_size)

    def forward(self, x):
        e = self.emb(x)
        h, _ = self.lstm(e)
        logits = self.proj(h)
        return logits, None

    @torch.no_grad()
    def generate(self, prefix_ids, max_new=20, eos=None, device="cpu"):
        self.eval()
        x = torch.tensor(prefix_ids, dtype=torch.long, device=device).unsqueeze(0)
        for _ in range(max_new):
            logits, _ = self.forward(x)
            next_id = logits[:, -1].argmax(-1)
            x = torch.cat([x, next_id.unsqueeze(0)], dim=1)
            if eos is not None and int(next_id.item()) == eos:
                break
        return x.squeeze(0).tolist()


Writing /content/text_autocomplete/src/lstm_model.py


In [6]:
!python -m py_compile /content/text_autocomplete/src/lstm_model.py


In [13]:
%%writefile /content/text_autocomplete/src/lstm_train.py
# lstm_train.py
import os, sys, json, math
import torch, torch.nn as nn
from tqdm.auto import tqdm
import argparse
from collections import Counter
import matplotlib.pyplot as plt

# –ì–∏–ø–µ—Ä–ø–∞—Ä–∞–º–µ—Ç—Ä—ã
MAX_LEN = 32
BATCH_SIZE = 128
PIN = torch.cuda.is_available()
NUM_WORKERS = 0

# –Ω–∞—Å—Ç—Ä–æ–π–∫–∞ –ø—É—Ç–µ–π
try:
    HERE = os.path.dirname(os.path.abspath(__file__))   # .../text_autocomplete/src
    BASE = os.path.abspath(os.path.join(HERE, ".."))    # .../text_autocomplete
except NameError:
    BASE = "/content/text_autocomplete"
    HERE = os.path.join(BASE, "src")
SRC  = os.path.join(BASE, "src")
if SRC not in sys.path:
    sys.path.insert(0, SRC)

DATA_DIR    = os.path.join(BASE, "data")
ART_DIR     = os.path.join(BASE, "artifacts")
MODEL_DIR   = os.path.join(BASE, "models")
RESULTS_DIR = os.path.join(BASE, "results")

from data_utils import prepare_from_txt
from next_token_dataset import build_vocab, load_texts, make_pairs_from_stream, make_loader
from lstm_model import LSTMLM

UNK = "<unk>"

# ---------------- helpers ----------------
def ids_to_text(ids, itos: dict, pad_id: int):
    return " ".join(itos.get(i, UNK) for i in ids if i != pad_id)

def _ngrams(seq, n):
    return [" ".join(seq[i:i+n]) for i in range(len(seq)-n+1)] if len(seq) >= n else []

def rouge_f1(pred_tokens, ref_tokens, n):
    p_ngr, r_ngr = Counter(_ngrams(pred_tokens, n)), Counter(_ngrams(ref_tokens, n))
    overlap = sum((p_ngr & r_ngr).values())
    pred_cnt, ref_cnt = max(1, sum(p_ngr.values())), max(1, sum(r_ngr.values()))
    prec = overlap / pred_cnt
    rec  = overlap / ref_cnt
    return 0.0 if (prec + rec) == 0 else 2 * prec * rec / (prec + rec)

@torch.no_grad()
def eval_rouge_on_loader(model, loader, itos, pad_id, eos_id, device, take_ratio=0.75, max_batches=None):
    model.eval()
    r1s, r2s, seen = [], [], 0
    for xb, _ in loader:
        if max_batches is not None and seen >= max_batches:
            break
        seen += 1
        xb = xb.to(device)
        seq = xb[0].tolist()
        L = len([t for t in seq if t != pad_id])
        k = max(1, int(L * take_ratio))
        prefix, ref = seq[:k], seq[k:L]
        gen = model.generate(prefix, max_new=len(ref), eos=eos_id, device=device)
        pred = gen[k:L]
        r1s.append(rouge_f1([itos.get(i, UNK) for i in pred], [itos.get(i, UNK) for i in ref], 1))
        r2s.append(rouge_f1([itos.get(i, UNK) for i in pred], [itos.get(i, UNK) for i in ref], 2))
    n = max(1, len(r1s))
    return float(sum(r1s)/n), float(sum(r2s)/n)

def run_epoch_bar(model, loader, criterion, optimizer, scaler, device, pad_id, train=True, desc=""):
    model.train(train)
    total_loss, total_tok = 0.0, 0
    pbar = tqdm(loader, desc=desc)
    for xb, yb in pbar:
        xb, yb = xb.to(device), yb.to(device)
        with torch.autocast("cuda", enabled=torch.cuda.is_available()):
            logits, _ = model(xb)
            loss = criterion(logits.reshape(-1, logits.size(-1)), yb.reshape(-1))
        if train:
            optimizer.zero_grad(set_to_none=True)
            scaler.scale(loss).backward()
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            scaler.step(optimizer); scaler.update()
        with torch.no_grad():
            tokens = int((yb != pad_id).sum().item())
            total_loss += loss.item() * tokens; total_tok += tokens
            pbar.set_postfix(loss=f"{loss.item():.4f}")
    return total_loss / max(1, total_tok)

# –æ—Å–Ω–æ–≤–∞
def main():
    ap = argparse.ArgumentParser()
    ap.add_argument("--epochs", type=int, default=2)         # 1‚Äì3 –æ–∫
    ap.add_argument("--min_freq", type=int, default=2)
    ap.add_argument("--raw_txt", default=os.path.join(DATA_DIR, "tweets.txt"))
    args = ap.parse_args([]) if "ipykernel" in sys.modules else ap.parse_args()

    os.makedirs(MODEL_DIR, exist_ok=True)
    os.makedirs(RESULTS_DIR, exist_ok=True)

    # –¥–∞–Ω–Ω—ã–µ
    train_csv = os.path.join(DATA_DIR, "train.csv")
    val_csv   = os.path.join(DATA_DIR, "val.csv")
    test_csv  = os.path.join(DATA_DIR, "test.csv")
    if not os.path.exists(train_csv):
        if not os.path.exists(args.raw_txt):
            raise FileNotFoundError(f"–ù–µ—Ç –∏—Å—Ö–æ–¥–Ω–∏–∫–∞: {args.raw_txt}")
        prepare_from_txt(args.raw_txt, DATA_DIR)

    # —Å–ª–æ–≤–∞—Ä—å
    stoi, itos, pad_id, unk_id, bos_id, eos_id = build_vocab(train_csv, min_freq=args.min_freq, out_dir=ART_DIR)
    vocab_size = len(stoi)

    # –ª–æ–∞–¥–µ—Ä—ã
    train_texts = load_texts(train_csv)
    val_texts   = load_texts(val_csv)
    test_texts  = load_texts(test_csv)

    train_pairs = make_pairs_from_stream(train_texts, stoi, bos_id, eos_id, unk_id, max_len=MAX_LEN)
    val_pairs   = make_pairs_from_stream(val_texts,   stoi, bos_id, eos_id, unk_id, max_len=MAX_LEN)
    test_pairs  = make_pairs_from_stream(test_texts,  stoi, bos_id, eos_id, unk_id, max_len=MAX_LEN)

    train_loader = make_loader(train_pairs, BATCH_SIZE, pad_id, True,  PIN, NUM_WORKERS)
    val_loader   = make_loader(val_pairs,   BATCH_SIZE, pad_id, False, PIN, NUM_WORKERS)
    test_loader  = make_loader(test_pairs,  BATCH_SIZE, pad_id, False, PIN, NUM_WORKERS)

    # –º–æ–¥–µ–ª—å
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = LSTMLM(vocab_size, emb=256, hidden=512, num_layers=2, drop=0.1, pad_id=pad_id).to(device)
    print(f"{sum(p.numel() for p in model.parameters()):,} parameters")

    criterion = nn.CrossEntropyLoss(ignore_index=pad_id)
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
    scaler = torch.amp.GradScaler('cuda', enabled=torch.cuda.is_available())

    # –ª–æ–≥ –¥–ª—è –≥—Ä–∞—Ñ–∏–∫–æ–≤
    train_losses, val_losses, ppls = [], [], []
    best_val = float("inf")
    ckpt = os.path.join(MODEL_DIR, "lstm.pt")

    for ep in range(1, args.epochs + 1):
        tr = run_epoch_bar(model, train_loader, criterion, optimizer, scaler, device, pad_id, True,  f"Epoch {ep}/{args.epochs} [Train]")
        va = run_epoch_bar(model, val_loader,   criterion, optimizer, scaler, device, pad_id, False, f"Epoch {ep}/{args.epochs} [Val]  ")
        r1, r2 = eval_rouge_on_loader(model, val_loader, itos, pad_id, eos_id, device, take_ratio=0.75, max_batches=200)
        ppl = math.exp(va) if va < 20 else float("inf")

        train_losses.append(tr); val_losses.append(va); ppls.append(ppl)

        xb, _ = next(iter(val_loader))
        seq = xb[0].tolist(); L = len([t for t in seq if t != pad_id]); k = max(1, int(L*0.75))
        prefix, ref = seq[:k], seq[k:L]
        pred = model.generate(prefix, max_new=len(ref), eos=eos_id, device=device)

        print(f"\nEpoch {ep}: TrainLoss={tr:.4f} | ValLoss={va:.4f} | ValPPL={ppl:.2f} | ROUGE-1={r1:.4f} | ROUGE-2={r2:.4f}")
        print("  –í—Ö–æ–¥ (3/4):  ", ids_to_text(prefix, itos, pad_id))
        print("  –¢–∞—Ä–≥–µ—Ç (1/4):", ids_to_text(ref,    itos, pad_id))
        print("  –ú–æ–¥–µ–ª—å (1/4):", ids_to_text(pred[k:L], itos, pad_id))

        if va < best_val:
            best_val = va
            torch.save(model.state_dict(), ckpt)
            print(f"Saved best to {ckpt}")

    # —Ñ–∏–Ω–∞–ª—å–Ω—ã–µ –º–µ—Ç—Ä–∏–∫–∏ + —Å–æ—Ö—Ä–∞–Ω–µ–Ω–∏–µ
    r1_val, r2_val = eval_rouge_on_loader(model, val_loader,  itos, pad_id, eos_id, device, take_ratio=0.75, max_batches=None)
    r1_test, r2_test = eval_rouge_on_loader(model, test_loader, itos, pad_id, eos_id, device, take_ratio=0.75, max_batches=None)

    os.makedirs(RESULTS_DIR, exist_ok=True)
    with open(os.path.join(RESULTS_DIR, "lstm_metrics.json"), "w", encoding="utf-8") as f:
        json.dump({"val":{"rouge1_f1":r1_val,"rouge2_f1":r2_val},
                   "test":{"rouge1_f1":r1_test,"rouge2_f1":r2_test}}, f, ensure_ascii=False, indent=2)
    print(f"[VAL]  ROUGE-1={r1_val:.4f} | ROUGE-2={r2_val:.4f}")
    print(f"[TEST] ROUGE-1={r1_test:.4f} | ROUGE-2={r2_test:.4f}")
    print(f"–ú–µ—Ç—Ä–∏–∫–∏ —Å–æ—Ö—Ä–∞–Ω–µ–Ω—ã –≤ {os.path.join(RESULTS_DIR, 'lstm_metrics.json')}")

    # –≥—Ä–∞—Ñ–∏–∫–∏ (loss –∏ perplexity)
    # loss per epoch
    plt.figure(figsize=(8,4))
    plt.plot(train_losses, '-o', label='train')
    plt.plot(val_losses,   '-o', label='val')
    plt.title('Loss per epoch'); plt.xlabel('epoch'); plt.ylabel('loss')
    plt.grid(True); plt.legend()
    loss_png = os.path.join(RESULTS_DIR, "loss.png")
    plt.savefig(loss_png, bbox_inches='tight'); plt.close()

    # Val Perplexity
    plt.figure(figsize=(6,4))
    plt.plot(ppls, '-o', label='val PPL')
    plt.title('Validation Perplexity'); plt.xlabel('epoch'); plt.ylabel('PPL')
    plt.grid(True); plt.legend()
    ppl_png = os.path.join(RESULTS_DIR, "ppl.png")
    plt.savefig(ppl_png, bbox_inches='tight'); plt.close()

    print(f"–ì—Ä–∞—Ñ–∏–∫–∏ —Å–æ—Ö—Ä–∞–Ω–µ–Ω—ã: {loss_png} –∏ {ppl_png}")


    if "ipykernel" in sys.modules:
        from IPython.display import Image, display
        display(Image(filename=loss_png))
        display(Image(filename=ppl_png))

if __name__ == "__main__":
    main()



Overwriting /content/text_autocomplete/src/lstm_train.py


In [14]:
!python -u /content/text_autocomplete/src/lstm_train.py --epochs 2


68,411,090 parameters
Epoch 1/2 [Train]: 100% 4398/4398 [11:02<00:00,  6.64it/s, loss=4.8027]
Epoch 1/2 [Val]  : 100% 521/521 [00:30<00:00, 17.26it/s, loss=4.9511]

Epoch 1: TrainLoss=5.1088 | ValLoss=4.9185 | ValPPL=136.80 | ROUGE-1=0.1310 | ROUGE-2=0.0315
  –í—Ö–æ–¥ (3/4):   <bos> just a <unk> full of sugar helps the meds goo down in the most delightful wayyy 39 days ! lt 3 <eos> <bos>
  –¢–∞—Ä–≥–µ—Ç (1/4): well my first wedding anniversary on sunday which
  –ú–æ–¥–µ–ª—å (1/4): i m so sorry to hear that .
‚úÖ Saved best to /content/text_autocomplete/models/lstm.pt
Epoch 2/2 [Train]: 100% 4398/4398 [11:04<00:00,  6.62it/s, loss=4.9791]
Epoch 2/2 [Val]  : 100% 521/521 [00:30<00:00, 17.28it/s, loss=4.8964]

Epoch 2: TrainLoss=4.7897 | ValLoss=4.8577 | ValPPL=128.73 | ROUGE-1=0.1378 | ROUGE-2=0.0266
  –í—Ö–æ–¥ (3/4):   <bos> just a <unk> full of sugar helps the meds goo down in the most delightful wayyy 39 days ! lt 3 <eos> <bos>
  –¢–∞—Ä–≥–µ—Ç (1/4): well my first wedding anniversary on

In [16]:
# src/eval_lstm.py
%%writefile /content/text_autocomplete/src/eval_lstm.py
import os, sys, json, torch
from collections import Counter
import matplotlib.pyplot as plt

# –Ω–∞—Å—Ç—Ä–æ–π–∫–∞ –ø—É—Ç–µ–π
try:
    HERE = os.path.dirname(os.path.abspath(__file__))   # .../text_autocomplete/src
    BASE = os.path.abspath(os.path.join(HERE, ".."))    # .../text_autocomplete
except NameError:
    BASE = "/content/text_autocomplete"
SRC  = os.path.join(BASE, "src")
if SRC not in sys.path:
    sys.path.insert(0, SRC)

DATA_DIR    = os.path.join(BASE, "data")
ART_DIR     = os.path.join(BASE, "artifacts")
MODEL_DIR   = os.path.join(BASE, "models")
RESULTS_DIR = os.path.join(BASE, "results")

from next_token_dataset import build_vocab, load_texts, make_pairs_from_stream, make_loader
from lstm_model import LSTMLM

UNK = "<unk>"

def ids_to_text(ids, itos: dict, pad_id: int):
    return " ".join(itos.get(i, UNK) for i in ids if i != pad_id)

def _ngrams(seq, n):
    return [" ".join(seq[i:i+n]) for i in range(len(seq)-n+1)] if len(seq) >= n else []

def rouge_f1(pred_tokens, ref_tokens, n):
    p_ngr, r_ngr = Counter(_ngrams(pred_tokens, n)), Counter(_ngrams(ref_tokens, n))
    overlap = sum((p_ngr & r_ngr).values())
    pred_cnt, ref_cnt = max(1, sum(p_ngr.values())), max(1, sum(r_ngr.values()))
    prec = overlap / pred_cnt; rec = overlap / ref_cnt
    return 0.0 if (prec + rec) == 0 else 2 * prec * rec / (prec + rec)

@torch.no_grad()
def eval_rouge_on_loader(model, loader, itos, pad_id, eos_id, device, take_ratio=0.75, max_batches=None):
    model.eval()
    r1s, r2s, seen = [], [], 0
    for xb, _ in loader:
        if max_batches is not None and seen >= max_batches:
            break
        seen += 1
        xb = xb.to(device)
        seq = xb[0].tolist()
        L = len([t for t in seq if t != pad_id])
        k = max(1, int(L * take_ratio))
        prefix, ref = seq[:k], seq[k:L]
        gen = model.generate(prefix, max_new=len(ref), eos=eos_id, device=device)
        pred = gen[k:L]
        r1s.append(rouge_f1([itos.get(i, UNK) for i in pred], [itos.get(i, UNK) for i in ref], 1))
        r2s.append(rouge_f1([itos.get(i, UNK) for i in pred], [itos.get(i, UNK) for i in ref], 2))
    n = max(1, len(r1s))
    return float(sum(r1s)/n), float(sum(r2s)/n)

def main():
    # —Ñ–∞–π–ª—ã
    train_csv = os.path.join(DATA_DIR, "train.csv")
    val_csv   = os.path.join(DATA_DIR, "val.csv")
    test_csv  = os.path.join(DATA_DIR, "test.csv")
    ckpt = os.path.join(MODEL_DIR, "lstm.pt")
    os.makedirs(RESULTS_DIR, exist_ok=True)

    if not os.path.exists(ckpt):
        raise FileNotFoundError(f"–ù–µ—Ç —á–µ–∫–ø–æ–∏–Ω—Ç–∞: {ckpt}. –°–Ω–∞—á–∞–ª–∞ –æ–±—É—á–∏ –º–æ–¥–µ–ª—å (lstm_train.py).")

    # —Å–ª–æ–≤–∞—Ä—å –∏ data loaders
    stoi, itos, pad_id, unk_id, bos_id, eos_id = build_vocab(train_csv, min_freq=2, out_dir=ART_DIR)
    MAX_LEN = 32; BS = 128; PIN = torch.cuda.is_available()

    val_pairs   = make_pairs_from_stream(load_texts(val_csv),   stoi, bos_id, eos_id, unk_id, max_len=MAX_LEN)
    test_pairs  = make_pairs_from_stream(load_texts(test_csv),  stoi, bos_id, eos_id, unk_id, max_len=MAX_LEN)
    val_loader  = make_loader(val_pairs,  BS, pad_id, False, PIN, 0)
    test_loader = make_loader(test_pairs, BS, pad_id, False, PIN, 0)

    # –º–æ–¥–µ–ª—å  –≤–µ—Å–∞
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = LSTMLM(vocab_size=len(stoi), emb=256, hidden=512, num_layers=2, drop=0.1, pad_id=pad_id).to(device)
    model.load_state_dict(torch.load(ckpt, map_location=device))

    # –º–µ—Ç—Ä–∏–∫–∏
    r1_val, r2_val   = eval_rouge_on_loader(model, val_loader,  itos, pad_id, eos_id, device, take_ratio=0.75, max_batches=None)
    r1_test, r2_test = eval_rouge_on_loader(model, test_loader, itos, pad_id, eos_id, device, take_ratio=0.75, max_batches=None)

    print(f"[VAL]  ROUGE-1={r1_val:.4f} | ROUGE-2={r2_val:.4f}")
    print(f"[TEST] ROUGE-1={r1_test:.4f} | ROUGE-2={r2_test:.4f}")

    # –ø—Ä–∏–º–µ—Ä –≥–µ–Ω–µ—Ä–∞—Ü–∏–∏
    xb, _ = next(iter(test_loader))
    seq = xb[0].tolist()
    L = len([t for t in seq if t != pad_id]); k = max(1, int(L*0.75))
    prefix, ref = seq[:k], seq[k:L]
    pred = model.generate(prefix, max_new=len(ref), eos=eos_id, device=device)
    print("\n–ü–†–ò–ú–ï–† –ì–ï–ù–ï–†–ê–¶–ò–ò (test):")
    print("  –í—Ö–æ–¥ (3/4):  ", ids_to_text(prefix, itos, pad_id))
    print("  –¢–∞—Ä–≥–µ—Ç (1/4):", ids_to_text(ref,    itos, pad_id))
    print("  –ú–æ–¥–µ–ª—å (1/4):", ids_to_text(pred[k:L], itos, pad_id))

    # —Å–æ—Ö—Ä–∞–Ω–∏—Ç—å –º–µ—Ç—Ä–∏–∫–∏ –≤ JSON
    out_json = os.path.join(RESULTS_DIR, "lstm_metrics_eval.json")
    with open(out_json, "w", encoding="utf-8") as f:
        json.dump({"val":{"rouge1_f1":r1_val,"rouge2_f1":r2_val},
                   "test":{"rouge1_f1":r1_test,"rouge2_f1":r2_test}}, f, ensure_ascii=False, indent=2)
    print(f"\n–ú–µ—Ç—Ä–∏–∫–∏ —Å–æ—Ö—Ä–∞–Ω–µ–Ω—ã –≤ {out_json}")

    # –≥—Ä–∞—Ñ–∏–∫ ROUGE
    plt.figure(figsize=(5,4))
    x = ["Val R1","Val R2","Test R1","Test R2"]
    y = [r1_val, r2_val, r1_test, r2_test]
    plt.bar(x, y)
    plt.ylim(0, 1)
    plt.title("ROUGE (F1)")
    plt.grid(axis="y", alpha=0.3)
    rouge_png = os.path.join(RESULTS_DIR, "rouge_eval.png")
    plt.savefig(rouge_png, bbox_inches="tight"); plt.close()
    print(f"–ì—Ä–∞—Ñ–∏–∫ —Å–æ—Ö—Ä–∞–Ω—ë–Ω: {rouge_png}")


    if "ipykernel" in sys.modules:
        from IPython.display import Image, display
        display(Image(filename=rouge_png))

if __name__ == "__main__":
    main()

Overwriting /content/text_autocomplete/src/eval_lstm.py


In [17]:
!python -u /content/text_autocomplete/src/eval_lstm.py


[VAL]  ROUGE-1=0.1428 | ROUGE-2=0.0293
[TEST] ROUGE-1=0.1457 | ROUGE-2=0.0279

–ü–†–ò–ú–ï–† –ì–ï–ù–ï–†–ê–¶–ò–ò (test):
  –í—Ö–æ–¥ (3/4):   <bos> point mallard tomorrow morning at 12 waterslides , lifeguards , and sun <unk> . tehe ! <eos> <bos> extremely happy right now !
  –¢–∞—Ä–≥–µ—Ç (1/4): you don t even know les <eos> <bos>
  –ú–æ–¥–µ–ª—å (1/4): <eos>

‚úÖ –ú–µ—Ç—Ä–∏–∫–∏ —Å–æ—Ö—Ä–∞–Ω–µ–Ω—ã –≤ /content/text_autocomplete/results/lstm_metrics_eval.json
üìä –ì—Ä–∞—Ñ–∏–∫ —Å–æ—Ö—Ä–∞–Ω—ë–Ω: /content/text_autocomplete/results/rouge_eval.png


In [18]:
# src/eval_transformer_pipeline.py
%%writefile /content/text_autocomplete/src/eval_transformer_pipeline.py
import os, sys, json, random
import torch
import pandas as pd
import matplotlib.pyplot as plt

# –ø—É—Ç–∏
try:
    HERE = os.path.dirname(os.path.abspath(__file__))   # .../text_autocomplete/src
    BASE = os.path.abspath(os.path.join(HERE, ".."))    # .../text_autocomplete
except NameError:
    BASE = "/content/text_autocomplete"
SRC  = os.path.join(BASE, "src")
if SRC not in sys.path:
    sys.path.insert(0, SRC)

DATA_DIR    = os.path.join(BASE, "data")
RESULTS_DIR = os.path.join(BASE, "results")
os.makedirs(RESULTS_DIR, exist_ok=True)

# –∑–∞–≤–∏—Å–∏–º–æ—Å—Ç–∏
from transformers import pipeline, set_seed
from rouge_score import rouge_scorer

def load_val_texts(val_csv_path: str, sample_size: int = 100, seed: int = 42):
    df = pd.read_csv(val_csv_path)
    if "text" not in df.columns:
        df = df.rename(columns={df.columns[0]: "text"})
    df = df.dropna(subset=["text"])
    df["text"] = df["text"].astype(str)
    if len(df) > sample_size:
        df = df.sample(sample_size, random_state=seed)
    return df["text"].tolist()

def build_generator(model_name="distilgpt2", seed=42):
    device = 0 if torch.cuda.is_available() else -1
    set_seed(seed)
    gen = pipeline(
        task="text-generation",
        model=model_name,
        device=device
    )
    return gen

def complete_text(gen, prompt, max_new_tokens=30):
    out = gen(
        prompt,
        max_new_tokens=max_new_tokens,
        num_return_sequences=1,
        do_sample=True,
        top_p=0.95,
        top_k=50,
        pad_token_id=gen.model.config.eos_token_id,
    )[0]["generated_text"]
    if out.startswith(prompt):
        cont = out[len(prompt):]
    else:
        cont = out
    return cont

def main():
    # –∫–æ–Ω—Ñ–∏–≥
    VAL_CSV = os.path.join(DATA_DIR, "val.csv")
    MODEL_NAME = "distilgpt2"
    SAMPLE_SIZE = 100
    SEED = 42
    CUTOFF_RATIO = 0.75
    MAX_NEW_TOKENS = 30

    # –¥–∞–Ω–Ω—ã–µ
    if not os.path.exists(VAL_CSV):
        raise FileNotFoundError(f"–ù–µ –Ω–∞–π–¥–µ–Ω {VAL_CSV}. –°–Ω–∞—á–∞–ª–∞ –ø–æ–¥–≥–æ—Ç–æ–≤—å —Å–ø–ª–∏—Ç—ã (train/val/test.csv).")
    texts = load_val_texts(VAL_CSV, sample_size=SAMPLE_SIZE, seed=SEED)

    # –≥–µ–Ω–µ—Ä–∞—Ç–æ—Ä
    generator = build_generator(MODEL_NAME, seed=SEED)

    # –ø–æ–¥–≥–æ—Ç–æ–≤–∫–∞ ROUGE
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2'], use_stemmer=True)
    r1s, r2s = [], []
    samples = []

    # —Ü–∏–∫–ª –ø–æ –ø—Ä–∏–º–µ—Ä–∞–º
    random.seed(SEED)
    for text in texts:
        text = text.strip()
        if not text:
            continue
        cutoff = max(1, int(len(text) * CUTOFF_RATIO))
        prefix = text[:cutoff]
        target = text[cutoff:]
        pred_cont = complete_text(generator, prefix, max_new_tokens=MAX_NEW_TOKENS)

        # ROUGE –ø–æ —Ç–æ–∫–µ–Ω–∞–º —Å—Ç—Ä–æ–∫–∏
        scores = scorer.score(target, pred_cont)
        r1s.append(scores["rouge1"].fmeasure)
        r2s.append(scores["rouge2"].fmeasure)

        # –ø—Ä–∏–º–µ—Ä—ã –¥–ª—è –ø–µ—á–∞—Ç–∏
        if len(samples) < 3:
            samples.append({"prefix": prefix, "target": target, "pred": pred_cont})

    # —Å—Ä–µ–¥–Ω–∏–µ –º–µ—Ç—Ä–∏–∫–∏
    r1_mean = float(sum(r1s) / max(1, len(r1s)))
    r2_mean = float(sum(r2s) / max(1, len(r2s)))

    print(f"distilgpt2 on val ({len(r1s)} samples)")
    print(f"ROUGE-1 F1 = {r1_mean:.4f} | ROUGE-2 F1 = {r2_mean:.4f}")

    # –ø—Ä–∏–º–µ—Ä—ã
    for i, s in enumerate(samples, 1):
        print(f"\n–ü—Ä–∏–º–µ—Ä {i}:")
        print("  –í—Ö–æ–¥ (3/4): ", s["prefix"])
        print("  –¢–∞—Ä–≥–µ—Ç (1/4):", s["target"])
        print("  –ú–æ–¥–µ–ª—å (1/4):", s["pred"])

    # —Å–æ—Ö—Ä–∞–Ω–∏—Ç—å –º–µ—Ç—Ä–∏–∫–∏ –≤ JSON
    out_json = os.path.join(RESULTS_DIR, "transformer_metrics.json")
    with open(out_json, "w", encoding="utf-8") as f:
        json.dump({"rouge1_f1": r1_mean, "rouge2_f1": r2_mean}, f, ensure_ascii=False, indent=2)
    print(f"\n–ú–µ—Ç—Ä–∏–∫–∏ —Å–æ—Ö—Ä–∞–Ω–µ–Ω—ã –≤ {out_json}")

    # –≥—Ä–∞—Ñ–∏–∫ ROUGE
    plt.figure(figsize=(5,4))
    x = ["ROUGE-1", "ROUGE-2"]
    y = [r1_mean, r2_mean]
    plt.bar(x, y)
    plt.ylim(0, 1)
    plt.title("distilgpt2 on val")
    plt.grid(axis="y", alpha=0.3)
    out_png = os.path.join(RESULTS_DIR, "transformer_rouge.png")
    plt.savefig(out_png, bbox_inches="tight"); plt.close()
    print(f"–ì—Ä–∞—Ñ–∏–∫ —Å–æ—Ö—Ä–∞–Ω—ë–Ω: {out_png}")

    if "ipykernel" in sys.modules:
        from IPython.display import Image, display
        display(Image(filename=out_png))

if __name__ == "__main__":
    main()


Writing /content/text_autocomplete/src/eval_transformer_pipeline.py


In [20]:
!pip install -q transformers rouge_score

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone


In [21]:
!python -u /content/text_autocomplete/src/eval_transformer_pipeline.py


2025-11-09 11:37:53.573345: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1762688273.636193   27329 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1762688273.682446   27329 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1762688273.753553   27329 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1762688273.753601   27329 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1762688273.753609   27329 computation_placer.cc:177] computation placer alr

In [22]:
!cd /content && zip -r text_autocomplete.zip text_autocomplete

  adding: text_autocomplete/ (stored 0%)
  adding: text_autocomplete/src/ (stored 0%)
  adding: text_autocomplete/src/lstm_model.py (deflated 53%)
  adding: text_autocomplete/src/__pycache__/ (stored 0%)
  adding: text_autocomplete/src/__pycache__/lstm_model.cpython-312.pyc (deflated 41%)
  adding: text_autocomplete/src/__pycache__/next_token_dataset.cpython-312.pyc (deflated 42%)
  adding: text_autocomplete/src/__pycache__/data_utils.cpython-312.pyc (deflated 45%)
  adding: text_autocomplete/src/eval_transformer_pipeline.py (deflated 54%)
  adding: text_autocomplete/src/data_utils.py (deflated 58%)
  adding: text_autocomplete/src/eval_lstm.py (deflated 58%)
  adding: text_autocomplete/src/next_token_dataset.py (deflated 58%)
  adding: text_autocomplete/src/lstm_train.py (deflated 63%)
  adding: text_autocomplete/results/ (stored 0%)
  adding: text_autocomplete/results/ppl.png (deflated 10%)
  adding: text_autocomplete/results/lstm_metrics_eval.json (deflated 41%)
  adding: text_autoco

–í—ã–≤–æ–¥—ã

- –í —Ö–æ–¥–µ –ø—Ä–æ–µ–∫—Ç–∞ —Ä–µ–∞–ª–∏–∑–æ–≤–∞–Ω—ã –∏ –æ—Ü–µ–Ω–µ–Ω—ã –¥–≤–µ –º–æ–¥–µ–ª–∏:
  1. **LSTM-LM**, –æ–±—É—á–µ–Ω–Ω–∞—è –Ω–∞ –ø–æ–¥–≥–æ—Ç–æ–≤–ª–µ–Ω–Ω–æ–º –∫–æ—Ä–ø—É—Å–µ —Ç–≤–∏—Ç–æ–≤
  2. –ü—Ä–µ–¥–æ–±—É—á–µ–Ω–Ω–∞—è **distilgpt2**, –∏—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–Ω–∞—è —á–µ—Ä–µ–∑ `transformers.pipeline` –±–µ–∑ –¥–æ–ø–æ–ª–Ω–∏—Ç–µ–ª—å–Ω–æ–≥–æ –æ–±—É—á–µ–Ω–∏—è.  

- LSTM –ø–æ–∫–∞–∑–∞–ª–∞ ROUGE-1 ‚âà 0.14 –∏ ROUGE-2 ‚âà 0.03 ‚Äî —ç—Ç–æ –≥–æ–≤–æ—Ä–∏—Ç –æ —Ç–æ–º, —á—Ç–æ –º–æ–¥–µ–ª—å —É–ª–æ–≤–∏–ª–∞ –∑–∞–∫–æ–Ω–æ–º–µ—Ä–Ω–æ—Å—Ç–∏ –∫–æ—Ä–ø—É—Å–∞.  
- distilgpt2 –ø–æ–∫–∞–∑–∞–ª–∞ ROUGE-1 ‚âà 0.03 –∏ ROUGE-2 ‚âà 0.004, –Ω–æ –≥–µ–Ω–µ—Ä–∏—Ä—É–µ—Ç –≥—Ä–∞–º–º–∞—Ç–∏—á–µ—Å–∫–∏ –±–æ–ª–µ–µ —Å–≤—è–∑–Ω—ã–µ —Ñ—Ä–∞–∑—ã.  
- –í —Ä–∞–º–∫–∞—Ö –∑–∞–¥–∞–Ω–∏—è –¥–æ–æ–±—É—á–µ–Ω–∏–µ —Ç—Ä–∞–Ω—Å—Ñ–æ—Ä–º–µ—Ä–∞ **–Ω–µ —Ç—Ä–µ–±–æ–≤–∞–ª–æ—Å—å**; –æ–Ω –∏—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω ¬´–∏–∑ –∫–æ—Ä–æ–±–∫–∏¬ª –¥–ª—è —Å—Ä–∞–≤–Ω–µ–Ω–∏—è –ø–æ–¥—Ö–æ–¥–æ–≤.  
**–í—ã–≤–æ–¥:** –æ–±—É—á–µ–Ω–Ω–∞—è LSTM –ª—É—á—à–µ —Å–æ–≤–ø–∞–¥–∞–µ—Ç —Å –¥–∞–Ω–Ω—ã–º–∏ –∫–æ—Ä–ø—É—Å–∞, –∞ –ø—Ä–µ–¥–æ–±—É—á–µ–Ω–Ω—ã–π —Ç—Ä–∞–Ω—Å—Ñ–æ—Ä–º–µ—Ä —Å–æ–∑–¥–∞—ë—Ç –±–æ–ª–µ–µ –µ—Å—Ç–µ—Å—Ç–≤–µ–Ω–Ω—ã–π —Ç–µ–∫—Å—Ç.  
–ü–æ –º–æ–µ–º—É –º–Ω–µ–Ω–∏—é, –¥–ª—è –ø—Ä–∞–∫—Ç–∏—á–µ—Å–∫–æ–≥–æ –ø—Ä–∏–º–µ–Ω–µ–Ω–∏—è –æ–ø—Ç–∏–º–∞–ª—å–Ω—ã–º –±—É–¥–µ—Ç –¥–æ–æ–±—É—á–µ–Ω–∏–µ distilgpt2 –Ω–∞ —Å–≤–æ—ë–º –¥–∞—Ç–∞—Å–µ—Ç–µ.
