In [1]:
# Install dependencies
# !pip install sentencepiece pandas
# !pip install tokenizers  transformers datasets

import sentencepiece as spm
import pandas as pd
from google.colab import drive
import shutil
import os
import unicodedata

input_file = "raw.txt"
output_file = "shakespeare_clean.txt"

# Load the whole raw corpus into memory as one string.
with open(input_file, mode='r', encoding='utf-8') as f:
    text = f.read()

# NFKC normalization folds Unicode compatibility forms into their canonical equivalents.
text = unicodedata.normalize('NFKC', text)

# Trim trailing whitespace from every line and drop lines that end up empty
# (blank / whitespace-only). Line breaks and punctuation are otherwise kept,
# since they carry the poetic structure.
lines = [kept for kept in map(str.rstrip, text.splitlines()) if kept]

# Write the cleaned lines back, newline-separated (no trailing newline).
with open(output_file, mode='w', encoding='utf-8') as f:
    f.write("\n".join(lines))

# -------------------------------
# PARAMETERS YOU CAN TWEAK
# -------------------------------
# Tokenizer-training configuration. Every value below is passed straight to
# spm.SentencePieceTrainer.train(...) except N_EVAL_LINES, which only limits
# the evaluation pass.
# NOTE: VOCAB_SIZE is rebound to 25000 + 3 by the model-training cell further
# down; re-running this cell afterwards resets it to the tokenizer's size.
INPUT_FILE = "shakespeare_clean.txt"  # path to the cleaned corpus written by the step above
MODEL_PREFIX = "sp_unigram_25k"       # output prefix: <prefix>.model / <prefix>.vocab
VOCAB_SIZE = 25000                     # tokenizer vocabulary size; try 32000 or 50000 for a larger vocab
MODEL_TYPE = "unigram"                # "unigram" or "bpe"
CHARACTER_COVERAGE = 0.9995            # fraction of characters the vocab must cover; SentencePiece docs suggest 1.0 for small-alphabet languages such as English. NOTE(review): coverage < 1.0 presumably maps rare characters to <unk> and may lower the reconstruction rate -- confirm
INPUT_SENTENCE_SIZE = 2000000          # maximum number of sentences (lines) sampled for training
SHUFFLE = True                         # shuffle sentences before sampling/training
N_EVAL_LINES = 5000                     # number of file lines read by the quick evaluation below

# -------------------------------
# STEP 1: Train Tokenizer
# -------------------------------
# Train a SentencePiece model on INPUT_FILE using the parameters defined above.
# The status message says "Unigram" regardless of MODEL_TYPE; update it if you
# switch MODEL_TYPE to "bpe".
print("Training Unigram tokenizer...")
spm.SentencePieceTrainer.train(
    input=INPUT_FILE,
    model_prefix=MODEL_PREFIX,
    vocab_size=VOCAB_SIZE,
    model_type=MODEL_TYPE,
    character_coverage=CHARACTER_COVERAGE,
    input_sentence_size=INPUT_SENTENCE_SIZE,
    shuffle_input_sentence=SHUFFLE
)
print(f"Tokenizer training complete! Files: {MODEL_PREFIX}.model / {MODEL_PREFIX}.vocab")

# -------------------------------
# STEP 2: Load Tokenizer
# -------------------------------
# Load the model file that the training call above was asked to produce.
sp_tokenizer = spm.SentencePieceProcessor(model_file=f"{MODEL_PREFIX}.model")

# -------------------------------
# STEP 3: Evaluation Function
# -------------------------------
def evaluate_tokenizer(sp_model, text_path, n_lines=None):
    """Measure compression and round-trip fidelity of a tokenizer on a text file.

    Args:
        sp_model: Object exposing ``encode(line, out_type=int)`` and
            ``decode(ids)`` (e.g. a ``SentencePieceProcessor``).
        text_path: Path to a UTF-8 text file with one sample per line.
        n_lines: If truthy, stop after reading this many lines from the file.
            Blank lines count toward this limit but are not evaluated.
            ``None`` (or 0) evaluates the whole file.

    Returns:
        dict with:
            ``avg_tokens_per_word``: total tokens / total whitespace-separated words.
            ``max_tokens_in_line``: largest token count seen on a single line.
            ``reconstruction_rate``: fraction of *evaluated* lines whose
                decode(encode(line)) equals the line (both stripped);
                1.0 when no line was evaluated.
    """
    total_tokens, total_words, recon_errors, max_len = 0, 0, 0, 0
    # Count the lines that were actually scored. The enumerate index must not be
    # used as the denominator: it is off by one when the file ends naturally,
    # it includes skipped blank lines, and it is undefined for an empty file.
    n_evaluated = 0
    with open(text_path, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            if n_lines and i >= n_lines:
                break
            line = line.rstrip("\n")
            if not line:
                continue
            enc = sp_model.encode(line, out_type=int)
            decoded = sp_model.decode(enc).strip()
            if decoded != line.strip():
                recon_errors += 1
            n_evaluated += 1
            total_tokens += len(enc)
            total_words += len(line.split())
            max_len = max(max_len, len(enc))

    if n_evaluated:
        reconstruction_rate = 1 - recon_errors / n_evaluated
    else:
        # Nothing was evaluated, so nothing failed to round-trip.
        reconstruction_rate = 1.0

    return {
        # Epsilon guards against division by zero when no words were seen.
        "avg_tokens_per_word": total_tokens / (total_words + 1e-9),
        "max_tokens_in_line": max_len,
        "reconstruction_rate": reconstruction_rate
    }

# -------------------------------
# STEP 4: Run Evaluation
# -------------------------------
# Score the freshly trained tokenizer on the first N_EVAL_LINES lines of the
# same file it was trained on (so this is an in-sample measurement).
results = evaluate_tokenizer(sp_tokenizer, INPUT_FILE, n_lines=N_EVAL_LINES)
print("\nEvaluation Results:")
print(f"Average tokens per word: {results['avg_tokens_per_word']:.4f}")
print(f"Max tokens in a line: {results['max_tokens_in_line']}")
print(f"Reconstruction rate: {results['reconstruction_rate']:.4f}")

# -------------------------------
# STEP 5: Save Tokenizer Locally in Colab
# -------------------------------
# Nothing is copied or downloaded here: the .model/.vocab files were already
# written by the training call. The lines below only print download hints.
# NOTE(review): the hinted `files.download(...)` presumably needs
# `from google.colab import files`, which this notebook does not import
# (only `drive` is imported) -- confirm before relying on it.
print(f"\n✅ Tokenizer saved locally in Colab as: {MODEL_PREFIX}.model and {MODEL_PREFIX}.vocab")
print("You can download them using:")
print(f"files.download('{MODEL_PREFIX}.model')")
print(f"files.download('{MODEL_PREFIX}.vocab')")
# 1. Average Tokens per Word

# What it tells you: How efficiently the tokenizer compresses text into tokens.

# Lower is better → fewer tokens per word means more text fits into the same context window (sequence length).

# Compare: If your new tokenizer has lower or similar avg tokens/word than 1.3085, it’s more efficient or equally efficient.

# 2. Max Tokens per Line

# What it tells you: The longest tokenized sequence in a single line.

# Lower is generally better → prevents very long sequences that could slow training or require truncation.

# Compare: If your new tokenizer reduces this from 24, it’s better in sequence efficiency.

# 3. Reconstruction Rate

# What it tells you: How accurately the tokenizer can reproduce the original text after encoding → decoding.

# Higher is better → close to 1 (or 100%) means no loss of information.

# Compare: Your current tokenizer has 0.9606 → new tokenizer should ideally be ≥ 0.9606.

# Summary Table for Evaluation
# Metric	Better Condition
# Avg Tokens per Word	Lower than 1.3085
# Max Tokens per Line	Lower than 24
# Reconstruction Rate	Equal to or higher than 0.9606

# ✅ Rule of Thumb:

# If average tokens per word decreases without dropping reconstruction rate, your tokenizer is better.

# Slight increase in max tokens per line is okay if reconstruction improves or avg tokens per word decreases.

# If reconstruction rate drops significantly, the tokenizer is losing fidelity, even if token counts improve.


Training Unigram tokenizer...
Tokenizer training complete! Files: sp_unigram_25k.model / sp_unigram_25k.vocab

Evaluation Results:
Average tokens per word: 1.2885
Max tokens in a line: 23
Reconstruction rate: 0.9606

✅ Tokenizer saved locally in Colab as: sp_unigram_25k.model and sp_unigram_25k.vocab
You can download them using:
files.download('sp_unigram_25k.model')
files.download('sp_unigram_25k.vocab')


In [2]:
# !pip install --upgrade transformers --quiet


In [None]:
# ===============================
# INSTALL DEPENDENCIES
# ===============================
# !pip install torch tqdm sentencepiece

# ===============================
# IMPORTS
# ===============================
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import sentencepiece as spm
import math
from tqdm import tqdm

# ===============================
# PARAMETERS (Tweak for experiments)
# ===============================
# Input artifacts produced by the tokenizer cell above.
TEXT_FILE = "shakespeare_clean.txt"
TOKENIZER_MODEL = "sp_unigram_25k.model"

# Model vocabulary = tokenizer vocabulary (25000, must match the size the
# tokenizer was trained with) plus three reserved ids. encode_line_safe shifts
# every tokenizer id up by 3 so ids 0-2 are free for the specials below.
VOCAB_SIZE = 25000 + 3       # +3 for PAD=0, BOS=1, EOS=2
PAD_ID = 0  # padding id; ignored by the loss and used as the embedding padding_idx
BOS_ID = 1  # prepended to every encoded line
EOS_ID = 2  # appended to every encoded line

# Model shape.
BLOCK_SIZE = 128  # max tokens kept per line (incl. BOS/EOS); also the size of the positional-embedding table
EMBED_DIM = 512   # token/position embedding width (d_model)
NUM_HEADS = 8     # attention heads per layer
NUM_LAYERS = 6    # number of stacked transformer layers
FF_DIM = 2048     # feed-forward hidden width inside each layer
DROPOUT = 0.1

# Optimization.
BATCH_SIZE = 8
NUM_EPOCHS = 1
LEARNING_RATE = 5e-4
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"  # use the GPU when one is available

# ===============================
# STEP 1: LOAD TOKENIZER
# ===============================
# Rebinds sp_tokenizer to the model stored in TOKENIZER_MODEL.
sp_tokenizer = spm.SentencePieceProcessor(model_file=TOKENIZER_MODEL)

# Safe encoding to offset IDs for special tokens
def encode_line_safe(line):
    """Encode one text line into model ids framed by BOS and EOS.

    Tokenizer ids are shifted up by 3 so they never collide with the reserved
    PAD (0), BOS (1) and EOS (2) ids.

    Returns:
        list[int]: ``[BOS_ID, <shifted token ids...>, EOS_ID]``.
    """
    shifted = [piece_id + 3 for piece_id in sp_tokenizer.encode(line, out_type=int)]
    return [BOS_ID, *shifted, EOS_ID]

# ===============================
# STEP 2: CUSTOM DATASET
# ===============================
class TextDataset(Dataset):
    """Line-level next-token-prediction dataset.

    Each non-blank line of the file is encoded once, up front, with
    ``encode_line_safe`` and kept in memory. Indexing returns an
    ``(input, target)`` pair in which the target is the input shifted left by
    one token.
    """

    def __init__(self, file_path, block_size):
        """Read ``file_path`` (UTF-8) and pre-tokenize every non-blank line.

        Args:
            file_path: Text file with one sample per line.
            block_size: Maximum number of token ids kept per sample.
        """
        self.block_size = block_size
        self.lines = []
        with open(file_path, "r", encoding="utf-8") as f:
            for raw in f:
                stripped = raw.strip()
                if not stripped:
                    continue  # blank line
                token_ids = encode_line_safe(stripped)
                if len(token_ids) <= 1:
                    continue  # too short to form an (input, target) pair
                self.lines.append(token_ids)

    def __len__(self):
        """Number of stored samples."""
        return len(self.lines)

    def __getitem__(self, idx):
        """Return ``(x, y)`` long tensors: the first ``block_size`` ids split into inputs and shifted targets."""
        # Slicing is a no-op for samples already within block_size.
        window = self.lines[idx][:self.block_size]
        inputs = torch.tensor(window[:-1], dtype=torch.long)
        targets = torch.tensor(window[1:], dtype=torch.long)
        return inputs, targets

# Tokenize the whole corpus once and hold out 5% of the lines for validation.
dataset = TextDataset(TEXT_FILE, BLOCK_SIZE)
train_size = int(0.95 * len(dataset))
val_size = len(dataset) - train_size
# Seed the split so the same lines land in train/val on every run; without a
# fixed generator the validation set changes each time the cell is executed.
SPLIT_SEED = 42
train_dataset, val_dataset = torch.utils.data.random_split(
    dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

def collate_fn(batch):
    """Right-pad a list of ``(x, y)`` pairs into two equally sized long tensors.

    Both outputs have shape ``(len(batch), longest x in the batch)`` and are
    filled with ``PAD_ID`` wherever a sequence is shorter than that width.
    """
    inputs, targets = zip(*batch)
    width = max(len(seq) for seq in inputs)

    # Start from all-PAD canvases, then copy each sequence into its row.
    x_padded = torch.full((len(batch), width), PAD_ID, dtype=torch.long)
    y_padded = torch.full_like(x_padded, PAD_ID)
    for row, (src, tgt) in enumerate(zip(inputs, targets)):
        x_padded[row, :len(src)] = src
        y_padded[row, :len(tgt)] = tgt
    return x_padded, y_padded

# Samples have different lengths, so collate_fn is passed to pad every batch to
# a common width. Only the training loader shuffles.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, collate_fn=collate_fn)

# ===============================
# STEP 3: TRANSFORMER MODEL
# ===============================
class TransformerLM(nn.Module):
    """Causal (left-to-right) Transformer language model.

    Token and learned absolute position embeddings are summed, passed through a
    stack of ``nn.TransformerEncoderLayer`` blocks under a causal attention
    mask, layer-normalized and projected to vocabulary logits.

    Args:
        vocab_size: Number of distinct token ids (size of the output logits).
        embed_dim: Embedding / model width.
        num_heads: Attention heads per layer.
        num_layers: Number of stacked layers.
        ff_dim: Hidden width of each layer's feed-forward block.
        block_size: Maximum sequence length (size of the position table).
        dropout: Dropout probability inside the layers.
        pad_id: Token id treated as padding by the embedding.
    """

    def __init__(self, vocab_size, embed_dim, num_heads, num_layers, ff_dim, block_size, dropout=0.1, pad_id=0):
        super().__init__()
        self.token_emb = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_id)
        self.pos_emb = nn.Embedding(block_size, embed_dim)
        encoder_layer = nn.TransformerEncoderLayer(d_model=embed_dim, nhead=num_heads,
                                                   dim_feedforward=ff_dim, dropout=dropout, batch_first=True)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.ln = nn.LayerNorm(embed_dim)
        self.head = nn.Linear(embed_dim, vocab_size)

    def forward(self, x):
        """Return next-token logits of shape ``(batch, time, vocab_size)``.

        Args:
            x: Long tensor of token ids with shape ``(batch, time)``.

        Raises:
            ValueError: If ``time`` exceeds the position-embedding table size.
        """
        b, t = x.size()
        max_positions = self.pos_emb.num_embeddings
        if t > max_positions:
            raise ValueError(
                f"Sequence length {t} exceeds the model's block size {max_positions}"
            )
        positions = torch.arange(t, device=x.device).unsqueeze(0).expand(b, t)
        h = self.token_emb(x) + self.pos_emb(positions)
        # Causal mask: True marks (query, key) pairs that must NOT attend, i.e.
        # every key strictly to the right of the query. Without it each position
        # sees the very token it is trained to predict, so the training loss is
        # not a real next-token loss and generation (which has no future
        # context) does not match training. With right-padded batches this mask
        # also keeps real tokens from attending to the trailing PAD positions.
        causal_mask = torch.triu(
            torch.ones(t, t, dtype=torch.bool, device=x.device), diagonal=1
        )
        h = self.transformer(h, mask=causal_mask)
        h = self.ln(h)
        logits = self.head(h)
        return logits

# Build the model on DEVICE, an AdamW optimizer over all its parameters, and a
# cross-entropy loss that skips target positions equal to PAD_ID (the padding
# added by collate_fn).
model = TransformerLM(VOCAB_SIZE, EMBED_DIM, NUM_HEADS, NUM_LAYERS, FF_DIM, BLOCK_SIZE, DROPOUT, PAD_ID).to(DEVICE)
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
criterion = nn.CrossEntropyLoss(ignore_index=PAD_ID)

# ===============================
# STEP 4: TRAINING LOOP
# ===============================
# Mixed precision is only used on CUDA: a gradient scaler is created there,
# while on CPU `scaler` is None and the plain backward/step path is taken.
# NOTE: val_loader is never consumed in this loop; no validation loss is computed.
scaler = torch.amp.GradScaler() if DEVICE=="cuda" else None

for epoch in range(NUM_EPOCHS):
    model.train()
    loop = tqdm(train_loader, desc=f"Epoch {epoch+1}")
    for xb, yb in loop:
        xb, yb = xb.to(DEVICE), yb.to(DEVICE)
        optimizer.zero_grad()
        # autocast is a no-op unless running on CUDA (enabled=False otherwise).
        with torch.amp.autocast(device_type='cuda', enabled=(DEVICE=="cuda")):
            logits = model(xb)
            # Flatten (batch, time, vocab) -> (batch*time, vocab) to match the flattened targets.
            loss = criterion(logits.view(-1, VOCAB_SIZE), yb.view(-1))
        if scaler:
            # AMP path: scale the loss before backward, then step and update through the scaler.
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
        else:
            loss.backward()
            optimizer.step()
        # Show the current batch's loss on the progress bar.
        loop.set_postfix(loss=loss.item())

# ===============================
# STEP 5: EVALUATION & SAMPLE GENERATION
# ===============================
def generate_text(model, tokenizer, prompt, max_len=BLOCK_SIZE, temperature=1.0, top_k=50):
    """Sample a continuation of ``prompt`` with temperature and top-k sampling.

    Args:
        model: Language model mapping ``(1, T)`` ids to ``(1, T, vocab)`` logits.
        tokenizer: Object whose ``decode(list[int])`` turns tokenizer ids into text.
        prompt: Text to continue.
        max_len: Maximum total number of ids (prompt + generated).
        temperature: Logits are divided by this before sampling.
        top_k: Sample only among the ``top_k`` most likely tokens
            (clamped to the vocabulary size).

    Returns:
        str: Decoded text of the prompt plus the generated tokens. Generation
        stops early when the model emits EOS.
    """
    model.eval()
    ids = encode_line_safe(prompt)
    # encode_line_safe frames the line as BOS ... EOS. Drop the trailing EOS so
    # the model continues the prompt instead of being asked what follows
    # end-of-sequence, which it never saw during training.
    if ids and ids[-1] == EOS_ID:
        ids = ids[:-1]
    ids = ids[:BLOCK_SIZE]
    x = torch.tensor(ids, dtype=torch.long).unsqueeze(0).to(DEVICE)
    with torch.no_grad():
        for _ in range(max_len - len(ids)):
            # Feed at most BLOCK_SIZE most-recent ids so max_len > BLOCK_SIZE
            # cannot index past the model's positional-embedding table.
            context = x[:, -BLOCK_SIZE:]
            logits = model(context)[:, -1, :] / temperature
            # torch.topk raises if k exceeds the last dimension.
            k = min(top_k, logits.size(-1))
            top_logits, top_idx = torch.topk(logits, k)
            probs = F.softmax(top_logits, dim=-1)
            choice = torch.multinomial(probs, 1)   # (1, 1) index into the top-k list
            next_id = top_idx.gather(-1, choice)   # (1, 1) vocabulary id
            if next_id.item() == EOS_ID:
                break  # the model signalled end of line
            x = torch.cat([x, next_id], dim=1)
    # Drop special ids (0-2) and undo the +3 shift; .tolist() yields plain
    # Python ints for the tokenizer rather than numpy scalars.
    return tokenizer.decode([i - 3 for i in x[0].tolist() if i > 2])

# Smoke-test generation with the freshly trained model. Output is sampled
# (torch.multinomial inside generate_text) and no seed is set, so the text
# differs from run to run.
prompt = "To be, or not to be"
print("\nGenerated text:\n", generate_text(model, sp_tokenizer, prompt))


Epoch 1:   1%|          | 148/18279 [00:06<10:31, 28.70it/s, loss=6.17]