In [5]:
# Install dependencies
# !pip install sentencepiece pandas
# !pip install tokenizers  transformers datasets

import sentencepiece as spm
import pandas as pd
from google.colab import drive
import shutil
import os
import unicodedata

input_file = "raw.txt"
output_file = "shakespeare_clean.txt"

with open(input_file, 'r', encoding='utf-8') as f:
    text = f.read()

# Unicode normalization (NFKC)
text = unicodedata.normalize('NFKC', text)

# Keep line breaks and punctuation (important for poetic structure)
# Optional: strip trailing whitespaces
lines = [line.rstrip() for line in text.splitlines() if line.strip()]

with open(output_file, 'w', encoding='utf-8') as f:
    f.write("\n".join(lines))

# -------------------------------
# PARAMETERS YOU CAN TWEAK
# -------------------------------
INPUT_FILE = "shakespeare_clean.txt"  # path to your text file
MODEL_PREFIX = "sp_unigram_25k"       # model name prefix
VOCAB_SIZE = 25000                     # increase to 32000, 50000 for larger vocab
MODEL_TYPE = "unigram"                # "unigram" or "bpe"
CHARACTER_COVERAGE = 0.9995            # 0.99 for most English text
INPUT_SENTENCE_SIZE = 2000000          # number of sentences to sample for training
SHUFFLE = True                         # shuffle sentences before training
N_EVAL_LINES = 5000                     # number of lines to use for quick evaluation

# -------------------------------
# STEP 1: Train Tokenizer
# -------------------------------
print("Training Unigram tokenizer...")
spm.SentencePieceTrainer.train(
    input=INPUT_FILE,
    model_prefix=MODEL_PREFIX,
    vocab_size=VOCAB_SIZE,
    model_type=MODEL_TYPE,
    character_coverage=CHARACTER_COVERAGE,
    input_sentence_size=INPUT_SENTENCE_SIZE,
    shuffle_input_sentence=SHUFFLE
)
print(f"Tokenizer training complete! Files: {MODEL_PREFIX}.model / {MODEL_PREFIX}.vocab")

# -------------------------------
# STEP 2: Load Tokenizer
# -------------------------------
sp_tokenizer = spm.SentencePieceProcessor(model_file=f"{MODEL_PREFIX}.model")

# -------------------------------
# STEP 3: Evaluation Function
# -------------------------------
def evaluate_tokenizer(sp_model, text_path, n_lines=None):
    total_tokens, total_words, recon_errors, max_len = 0, 0, 0, 0
    with open(text_path, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            if n_lines and i >= n_lines:
                break
            line = line.rstrip("\n")
            if not line:
                continue
            enc = sp_model.encode(line, out_type=int)
            decoded = sp_model.decode(enc).strip()
            if decoded != line.strip():
                recon_errors += 1
            total_tokens += len(enc)
            total_words += len(line.split())
            max_len = max(max_len, len(enc))
    return {
        "avg_tokens_per_word": total_tokens / (total_words + 1e-9),
        "max_tokens_in_line": max_len,
        "reconstruction_rate": 1 - recon_errors / (i+1e-9)
    }

# -------------------------------
# STEP 4: Run Evaluation
# -------------------------------
results = evaluate_tokenizer(sp_tokenizer, INPUT_FILE, n_lines=N_EVAL_LINES)
print("\nEvaluation Results:")
print(f"Average tokens per word: {results['avg_tokens_per_word']:.4f}")
print(f"Max tokens in a line: {results['max_tokens_in_line']}")
print(f"Reconstruction rate: {results['reconstruction_rate']:.4f}")

# -------------------------------
# STEP 5: Save Tokenizer Locally in Colab
# -------------------------------
print(f"\n✅ Tokenizer saved locally in Colab as: {MODEL_PREFIX}.model and {MODEL_PREFIX}.vocab")
print("You can download them using:")
print(f"files.download('{MODEL_PREFIX}.model')")
print(f"files.download('{MODEL_PREFIX}.vocab')")
# 1. Average Tokens per Word

# What it tells you: How efficiently the tokenizer compresses text into tokens.

# Lower is better → fewer tokens per word means your model will process longer context for the same input size.

# Compare: If your new tokenizer has lower or similar avg tokens/word than 1.3085, it’s more efficient or equally efficient.

# 2. Max Tokens per Line

# What it tells you: The longest tokenized sequence in a single line.

# Lower is generally better → prevents very long sequences that could slow training or require truncation.

# Compare: If your new tokenizer reduces this from 24, it’s better in sequence efficiency.

# 3. Reconstruction Rate

# What it tells you: How accurately the tokenizer can reproduce the original text after encoding → decoding.

# Higher is better → close to 1 (or 100%) means no loss of information.

# Compare: Your current tokenizer has 0.9606 → new tokenizer should ideally be ≥ 0.9606.

# Summary Table for Evaluation
# Metric	Better Condition
# Avg Tokens per Word	Lower than 1.3085
# Max Tokens per Line	Lower than 24
# Reconstruction Rate	Equal to or higher than 0.9606

# ✅ Rule of Thumb:

# If average tokens per word decreases without dropping reconstruction rate, your tokenizer is better.

# Slight increase in max tokens per line is okay if reconstruction improves or avg tokens per word decreases.

# If reconstruction rate drops significantly, the tokenizer is losing fidelity, even if token counts improve.


Training Unigram tokenizer...
Tokenizer training complete! Files: sp_unigram_25k.model / sp_unigram_25k.vocab

Evaluation Results:
Average tokens per word: 1.2885
Max tokens in a line: 23
Reconstruction rate: 0.9606

✅ Tokenizer saved locally in Colab as: sp_unigram_25k.model and sp_unigram_25k.vocab
You can download them using:
files.download('sp_unigram_25k.model')
files.download('sp_unigram_25k.vocab')


In [8]:
!pip install --upgrade transformers --quiet


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.3/11.3 MB[0m [31m115.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [7]:
# ===============================
# INSTALL DEPENDENCIES
# ===============================
# !pip install torch einops sentencepiece datasets accelerate tqdm --quiet

# ===============================
# IMPORTS
# ===============================
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import sentencepiece as spm
from datasets import Dataset as HFDataset
from tqdm import tqdm
import math
import random
from einops import rearrange

# ===============================
# PARAMETERS (Tweak for experiments)
# ===============================
# Model parameters
VOCAB_SIZE = 25000            # Must match tokenizer
BLOCK_SIZE = 20           # Context window
EMBED_DIM = 512               # Embedding dimension
NUM_LAYERS = 6                # Transformer layers
NUM_HEADS = 8                 # Attention heads
DROPOUT = 0.1                 # Dropout
FF_DIM = 4*EMBED_DIM          # Feed-forward hidden dimension

# Training parameters
BATCH_SIZE = 16
NUM_EPOCHS = 5
LEARNING_RATE = 5e-4
GRAD_CLIP = 1.0
EVAL_INTERVAL = 500
SAVE_INTERVAL = 1000
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Tokenizer
TOKENIZER_MODEL = "sp_unigram_25k.model"
TEXT_FILE = "shakespeare_clean.txt"

# ===============================
# STEP 1: LOAD TOKENIZER
# ===============================
tokenizer = spm.SentencePieceProcessor(model_file=TOKENIZER_MODEL)

# ===============================
# STEP 2: PREPARE DATASET
# ===============================
class LMDataset(Dataset):
    def __init__(self, text_file, tokenizer, block_size):
        self.tokenizer = tokenizer
        self.block_size = block_size
        self.data = []
        with open(text_file, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if line:
                    ids = tokenizer.encode(line, out_type=int)
                    for i in range(0, len(ids)-block_size, block_size):
                        chunk = ids[i:i+block_size+1]  # +1 for next token prediction
                        self.data.append(torch.tensor(chunk))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        x = self.data[idx][:-1]
        y = self.data[idx][1:]
        return x, y

dataset = LMDataset(TEXT_FILE, tokenizer, BLOCK_SIZE)
train_size = int(0.95 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

# ===============================
# STEP 3: TRANSFORMER WITH RoPE & FLASH ATTENTION
# ===============================
class RotaryEmbedding(nn.Module):
    def __init__(self, dim):
        super().__init__()
        inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2).float() / dim))
        self.register_buffer("inv_freq", inv_freq)

    def forward(self, seq_len, device):
        t = torch.arange(seq_len, device=device).type_as(self.inv_freq)
        freqs = torch.einsum("i , j -> i j", t, self.inv_freq)
        emb = torch.cat((freqs, freqs), dim=-1)
        return emb

def apply_rotary_pos_emb(q, k, freqs):
    cos, sin = freqs.cos(), freqs.sin()
    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed

def rotate_half(x):
    x1, x2 = x[..., :x.shape[-1]//2], x[..., x.shape[-1]//2:]
    return torch.cat([-x2, x1], dim=-1)

class MultiHeadSelfAttention(nn.Module):
    def __init__(self, embed_dim, num_heads, dropout=0.1):
        super().__init__()
        assert embed_dim % num_heads == 0
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        self.qkv = nn.Linear(embed_dim, 3*embed_dim)
        self.proj = nn.Linear(embed_dim, embed_dim)
        self.dropout = nn.Dropout(dropout)
        self.rope = RotaryEmbedding(self.head_dim)

    def forward(self, x):
        B, T, C = x.size()
        qkv = self.qkv(x).view(B, T, 3, self.num_heads, self.head_dim)
        q, k, v = qkv[:,:,0], qkv[:,:,1], qkv[:,:,2]
        q = q.transpose(1,2); k = k.transpose(1,2); v = v.transpose(1,2)
        freqs = self.rope(T, x.device)
        q, k = apply_rotary_pos_emb(q, k, freqs)
        attn = torch.matmul(q, k.transpose(-2,-1)) / math.sqrt(self.head_dim)
        mask = torch.tril(torch.ones(T, T, device=x.device)).unsqueeze(0).unsqueeze(0)
        attn = attn.masked_fill(mask==0, float('-inf'))
        attn = F.softmax(attn, dim=-1)
        attn = self.dropout(attn)
        out = torch.matmul(attn, v)
        out = out.transpose(1,2).contiguous().view(B,T,C)
        return self.proj(out)

class FeedForward(nn.Module):
    def __init__(self, embed_dim, ff_dim, dropout=0.1):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(embed_dim, ff_dim),
            nn.GELU(),
            nn.Linear(ff_dim, embed_dim),
            nn.Dropout(dropout)
        )
    def forward(self, x):
        return self.net(x)

class TransformerBlock(nn.Module):
    def __init__(self, embed_dim, num_heads, ff_dim, dropout=0.1):
        super().__init__()
        self.ln1 = nn.LayerNorm(embed_dim)
        self.ln2 = nn.LayerNorm(embed_dim)
        self.attn = MultiHeadSelfAttention(embed_dim, num_heads, dropout)
        self.ff = FeedForward(embed_dim, ff_dim, dropout)
    def forward(self, x):
        x = x + self.attn(self.ln1(x))
        x = x + self.ff(self.ln2(x))
        return x

class TransformerLM(nn.Module):
    def __init__(self, vocab_size, block_size, embed_dim, num_layers, num_heads, ff_dim, dropout=0.1):
        super().__init__()
        self.token_emb = nn.Embedding(vocab_size, embed_dim)
        self.pos_emb = nn.Embedding(block_size, embed_dim)
        self.layers = nn.ModuleList([TransformerBlock(embed_dim, num_heads, ff_dim, dropout) for _ in range(num_layers)])
        self.ln_f = nn.LayerNorm(embed_dim)
        self.head = nn.Linear(embed_dim, vocab_size, bias=False)
    def forward(self, idx):
        B, T = idx.size()
        pos = torch.arange(T, device=idx.device).unsqueeze(0)
        x = self.token_emb(idx) + self.pos_emb(pos)
        for layer in self.layers:
            x = layer(x)
        x = self.ln_f(x)
        logits = self.head(x)
        return logits

model = TransformerLM(VOCAB_SIZE, BLOCK_SIZE, EMBED_DIM, NUM_LAYERS, NUM_HEADS, FF_DIM, DROPOUT).to(DEVICE)

# Enable mixed precision
scaler = torch.cuda.amp.GradScaler() if DEVICE=="cuda" else None

# ===============================
# STEP 4: OPTIMIZER AND LOSS
# ===============================
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
criterion = nn.CrossEntropyLoss()

# ===============================
# STEP 5: TRAINING LOOP
# ===============================
global_step = 0
for epoch in range(NUM_EPOCHS):
    model.train()
    for x, y in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        x, y = x.to(DEVICE), y.to(DEVICE)
        optimizer.zero_grad()
        if scaler:
            with torch.cuda.amp.autocast():
                logits = model(x)
                loss = criterion(logits.view(-1, VOCAB_SIZE), y.view(-1))
            scaler.scale(loss).backward()
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP)
            scaler.step(optimizer)
            scaler.update()
        else:
            logits = model(x)
            loss = criterion(logits.view(-1, VOCAB_SIZE), y.view(-1))
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP)
            optimizer.step()

        global_step += 1

        if global_step % EVAL_INTERVAL == 0:
            model.eval()
            val_loss = 0
            with torch.no_grad():
                for vx, vy in val_loader:
                    vx, vy = vx.to(DEVICE), vy.to(DEVICE)
                    logits = model(vx)
                    val_loss += criterion(logits.view(-1, VOCAB_SIZE), vy.view(-1)).item()
            val_loss /= len(val_loader)
            print(f"\nStep {global_step} | Eval Loss: {val_loss:.4f} | Perplexity: {math.exp(val_loss):.2f}")
            model.train()

        if global_step % SAVE_INTERVAL == 0:
            torch.save(model.state_dict(), f"transformer_lm_step{global_step}.pt")
            print(f"Checkpoint saved at step {global_step}")

# ===============================
# STEP 6: TEXT GENERATION FUNCTION
# ===============================
def generate_text(model, tokenizer, prompt, max_len=BLOCK_SIZE, top_k=50, top_p=0.95):
    model.eval()
    idx = torch.tensor(tokenizer.encode(prompt, out_type=int), device=DEVICE).unsqueeze(0)
    for _ in range(max_len):
        logits = model(idx)
        logits = logits[:, -1, :]
        # Top-k and Top-p filtering
        topk = torch.topk(logits, top_k, dim=-1)
        filtered_logits = torch.full_like(logits, -float('Inf'))
        filtered_logits.scatter_(-1, topk.indices, topk.values)
        probs = F.softmax(filtered_logits, dim=-1)
        next_token = torch.multinomial(probs, num_samples=1)
        idx = torch.cat([idx, next_token], dim=1)
    return tokenizer.decode(idx[0].cpu().numpy())

# Example generation
prompt = "To be, or not to be"
print("\nGenerated text:\n", generate_text(model, tokenizer, prompt))


  scaler = torch.cuda.amp.GradScaler() if DEVICE=="cuda" else None
  with torch.cuda.amp.autocast():
Epoch 1: 100%|██████████| 40/40 [00:02<00:00, 14.86it/s]
Epoch 2: 100%|██████████| 40/40 [00:01<00:00, 23.37it/s]
Epoch 3: 100%|██████████| 40/40 [00:01<00:00, 23.84it/s]
Epoch 4: 100%|██████████| 40/40 [00:01<00:00, 23.70it/s]
Epoch 5: 100%|██████████| 40/40 [00:01<00:00, 23.62it/s]
  return F.linear(input, self.weight, self.bias)


AcceleratorError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [10]:
# ===============================
# INSTALL DEPENDENCIES
# ===============================
# !pip install torch sentencepiece tqdm --quiet

# ===============================
# IMPORTS
# ===============================
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import sentencepiece as spm
import math
from tqdm import tqdm

# ===============================
# PARAMETERS (Tweak for experiments)
# ===============================
INPUT_FILE = "shakespeare_clean.txt"
TOKENIZER_MODEL = "sp_unigram_25k.model"

VOCAB_SIZE = 25000         # tokenizer vocab
BLOCK_SIZE = 128           # context window
NUM_LAYERS = 6
NUM_HEADS = 8
EMBED_DIM = 512
BATCH_SIZE = 8
LEARNING_RATE = 5e-4
NUM_EPOCHS = 3

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MAX_SEQ_LEN = BLOCK_SIZE

# ===============================
# STEP 1: LOAD TOKENIZER
# ===============================
sp_tokenizer = spm.SentencePieceProcessor(model_file=TOKENIZER_MODEL)

BOS_ID = sp_tokenizer.bos_id()
EOS_ID = sp_tokenizer.eos_id()
PAD_ID = sp_tokenizer.pad_id()

# Safety clamp
BOS_ID = min(BOS_ID if BOS_ID != -1 else 0, VOCAB_SIZE - 1)
EOS_ID = min(EOS_ID if EOS_ID != -1 else 1, VOCAB_SIZE - 1)
PAD_ID = min(PAD_ID if PAD_ID != -1 else 2, VOCAB_SIZE - 1)

print("Tokenizer IDs -> BOS:", BOS_ID, "EOS:", EOS_ID, "PAD:", PAD_ID)

# ===============================
# STEP 2: LOAD DATASET
# ===============================
class TextDataset(Dataset):
    def __init__(self, file_path, tokenizer, block_size):
        self.tokenizer = tokenizer
        self.block_size = block_size
        self.lines = []

        with open(file_path, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if line:
                    ids = tokenizer.encode(line, out_type=int)
                    # Truncate long sequences
                    if len(ids) > block_size:
                        ids = ids[:block_size]
                    self.lines.append(ids)

    def __len__(self):
        return len(self.lines)

    def __getitem__(self, idx):
        ids = self.lines[idx]
        # Pad sequence
        padding_len = self.block_size - len(ids)
        ids = ids + [PAD_ID] * padding_len
        return torch.tensor(ids, dtype=torch.long)

dataset = TextDataset(INPUT_FILE, sp_tokenizer, BLOCK_SIZE)
train_size = int(0.95 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

# ===============================
# STEP 3: DEFINE TRANSFORMER LM
# ===============================
class TransformerLM(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_heads, num_layers, block_size, pad_id):
        super().__init__()
        self.token_emb = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_id)
        self.pos_emb = nn.Embedding(block_size, embed_dim)
        self.layers = nn.ModuleList([
            nn.TransformerEncoderLayer(
                d_model=embed_dim,
                nhead=num_heads,
                dim_feedforward=embed_dim * 4,
                activation="gelu",
                batch_first=True
            ) for _ in range(num_layers)
        ])
        self.ln_f = nn.LayerNorm(embed_dim)
        self.head = nn.Linear(embed_dim, vocab_size)

    def forward(self, x):
        seq_len = x.size(1)
        pos_ids = torch.arange(seq_len, device=x.device).unsqueeze(0)
        h = self.token_emb(x) + self.pos_emb(pos_ids)
        for layer in self.layers:
            h = layer(h)
        h = self.ln_f(h)
        logits = self.head(h)
        return logits

model = TransformerLM(VOCAB_SIZE, EMBED_DIM, NUM_HEADS, NUM_LAYERS, BLOCK_SIZE, PAD_ID).to(DEVICE)

# ===============================
# STEP 4: TRAINING LOOP
# ===============================
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
scaler = torch.amp.GradScaler() if DEVICE == "cuda" else None
loss_fn = nn.CrossEntropyLoss(ignore_index=PAD_ID)

def train_epoch(model, loader, optimizer, scaler):
    model.train()
    total_loss = 0
    for batch in tqdm(loader, desc="Training"):
        batch = batch.to(DEVICE)
        inputs = batch[:, :-1]
        targets = batch[:, 1:]

        optimizer.zero_grad()
        with torch.amp.autocast(enabled=scaler is not None):
            logits = model(inputs)
            loss = loss_fn(logits.view(-1, VOCAB_SIZE), targets.view(-1))
        if scaler:
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
        else:
            loss.backward()
            optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)

def evaluate(model, loader):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch in loader:
            batch = batch.to(DEVICE)
            inputs = batch[:, :-1]
            targets = batch[:, 1:]
            logits = model(inputs)
            loss = loss_fn(logits.view(-1, VOCAB_SIZE), targets.view(-1))
            total_loss += loss.item()
    return total_loss / len(loader)

for epoch in range(NUM_EPOCHS):
    print(f"\nEpoch {epoch+1}/{NUM_EPOCHS}")
    train_loss = train_epoch(model, train_loader, optimizer, scaler)
    val_loss = evaluate(model, val_loader)
    print(f"Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | Val Perplexity: {math.exp(val_loss):.2f}")

# ===============================
# STEP 5: SAMPLE GENERATION
# ===============================
def generate_text(model, tokenizer, prompt, max_len=BLOCK_SIZE, temperature=1.0, top_k=50):
    model.eval()
    ids = tokenizer.encode(prompt, out_type=int)
    ids = [BOS_ID] + ids
    input_ids = torch.tensor(ids, device=DEVICE).unsqueeze(0)
    for _ in range(max_len):
        logits = model(input_ids)
        logits = logits[:, -1, :] / temperature
        top_logits, top_idx = torch.topk(logits, top_k)
        probs = F.softmax(top_logits, dim=-1)
        next_id = top_idx[0, torch.multinomial(probs, num_samples=1)]
        input_ids = torch.cat([input_ids, next_id.unsqueeze(0).unsqueeze(0)], dim=1)
        if next_id.item() == EOS_ID:
            break
    return tokenizer.decode(input_ids[0].cpu().numpy())

# Example
prompt = "To be, or not to be"
print("\nGenerated text:\n", generate_text(model, sp_tokenizer, prompt))


Tokenizer IDs -> BOS: 1 EOS: 2 PAD: 2


AcceleratorError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
