In [2]:
from datasets import load_dataset
import torch
from tokenizers import Tokenizer

num_titles = 10000
val_frac = 0.1
seed = 1337
ds = load_dataset("julien040/hacker-news-posts", split="train", cache_dir="./data").shuffle(seed=seed)
titles = [row["title"].strip() for row in ds.take(num_titles)]
n = int(num_titles * (1 - val_frac))
train_titles, val_titles= titles[:n], titles[n:]

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
print('Train TITLES')
print(train_titles[0])
print('VAL TITles')
print(val_titles[0])

Train TITLES
Doom on Ubuntu Phone
VAL TITles
New Twists in the Road to Quantum Supremacy


In [4]:
from tokenizers import Tokenizer, decoders, models, pre_tokenizers, trainers    


def train_tokenizer(titles: list[str], vocab_size: int, unk_token: str = "<unk>", pad_token: str = "<pad>", eos_token: str = "<eos>") -> Tokenizer:
    tokenizer = Tokenizer(models.BPE(unk_token=unk_token))
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
    tokenizer.decoder = decoders.ByteLevel()
    trainer = trainers.BpeTrainer(
        vocab_size=vocab_size,
        special_tokens=[pad_token, eos_token, unk_token]
    )
    tokenizer.train_from_iterator(titles, trainer)
    return tokenizer




def get_batch(split_ids: torch.Tensor, ptr: int, block_size: int, batch_size: int, device: torch.device):
    span = block_size * batch_size + 1
    if ptr + span >= len(split_ids):
        ptr = 0
    batch = split_ids[ptr: ptr + span]
    x = batch[:-1].view(batch_size, block_size).to(device)
    y = batch[1:].view(batch_size, block_size).to(device)
    return x, y, ptr + block_size * batch_size


In [5]:
vocab_size = 16000

In [6]:
train_tokenizer(train_titles+val_titles, vocab_size, eos_token="<eos>")

Tokenizer(version="1.0", truncation=None, padding=None, added_tokens=[{"id":0, "content":"<pad>", "single_word":False, "lstrip":False, "rstrip":False, "normalized":False, "special":True}, {"id":1, "content":"<eos>", "single_word":False, "lstrip":False, "rstrip":False, "normalized":False, "special":True}, {"id":2, "content":"<unk>", "single_word":False, "lstrip":False, "rstrip":False, "normalized":False, "special":True}], normalizer=None, pre_tokenizer=ByteLevel(add_prefix_space=True, trim_offsets=True, use_regex=True), post_processor=None, decoder=ByteLevel(add_prefix_space=True, trim_offsets=True, use_regex=True), model=BPE(dropout=None, unk_token="<unk>", continuing_subword_prefix=None, end_of_word_suffix=None, fuse_unk=False, byte_fallback=False, ignore_merges=False, vocab={"<pad>":0, "<eos>":1, "<unk>":2, "!":3, """:4, "#":5, "$":6, "%":7, "&":8, "'":9, "(":10, ")":11, "*":12, "+":13, ",":14, "-":15, ".":16, "/":17, "0":18, "1":19, "2":20, "3":21, "4":22, "5":23, "6":24, "7":25, "8

In [7]:
from model.tokenizer.BPETokenizer import BPETokenizer
tok = BPETokenizer(train_tokenizer(train_titles+val_titles, vocab_size, eos_token="<eos>"))

In [8]:
eos_token="<eos>"
train_text = eos_token.join(train_titles) + eos_token
val_text = eos_token.join(val_titles) + eos_token
train_ids = torch.tensor(tok.encode(train_text), dtype=torch.long)
val_ids = torch.tensor(tok.encode(val_text), dtype=torch.long)

In [9]:
len(train_titles) , len(val_titles)

(9000, 1000)

In [11]:
train_ids.shape, val_ids.shape

(torch.Size([107337]), torch.Size([11963]))

In [12]:
train_ids[:10]

tensor([ 6130,   274,  1866,  1723,     1,  5115,  5482,   246,  1228, 10936])

In [13]:
# For train titles
max_train_idx = max(range(len(train_titles)), key=lambda i: len(train_titles[i]))
max_len_train = len(train_titles[max_train_idx])
max_train_title = train_titles[max_train_idx]

# For validation/test titles
max_val_idx = max(range(len(val_titles)), key=lambda i: len(val_titles[i]))
max_len_val = len(val_titles[max_val_idx])
max_val_title = val_titles[max_val_idx]

print(f"Train -> Max length: {max_len_train}, Index: {max_train_idx}, Title: {max_train_title}")
print(f"Validation -> Max length: {max_len_val}, Index: {max_val_idx}, Title: {max_val_title}")


Train -> Max length: 98, Index: 6099, Title: Rough silicon nanowires potentially allow much more efficient waste-heat to electricity conversion
Validation -> Max length: 81, Index: 229, Title: Official Google Blog: "This site may harm your computer" on every search result??


In [14]:
import torch
from model.tokenizer.BPETokenizer import BPETokenizer
from train import train_tokenizer


eos_token = "<eos>"
tok = BPETokenizer(train_tokenizer(train_titles+val_titles, 16000, eos_token=eos_token))
train_text = eos_token.join(train_titles) + eos_token
val_text = eos_token.join(val_titles) + eos_token
train_ids = torch.tensor(tok.encode(train_text), dtype=torch.long)
val_ids = torch.tensor(tok.encode(val_text), dtype=torch.long)

In [15]:
# Tokenize each title individually
train_token_lists = [tok.encode(title) for title in train_titles]
val_token_lists = [tok.encode(title) for title in val_titles]

# Find maximum token length and the corresponding title
max_len_train = max(len(t) for t in train_token_lists)
max_len_val = max(len(t) for t in val_token_lists)

# Optional: get the index or title itself
max_train_idx = max(range(len(train_token_lists)), key=lambda i: len(train_token_lists[i]))
max_val_idx = max(range(len(val_token_lists)), key=lambda i: len(val_token_lists[i]))

print("Max tokens in train:", max_len_train, "Title:", train_titles[max_train_idx])
print("Max tokens in val:", max_len_val, "Title:", val_titles[max_val_idx])


Max tokens in train: 68 Title: Unearthing Z͌̈́̾a͊̈́l͊̿g̏̉͆o̾̚̚S̝̬ͅc̬r̯̼͇ͅi̼͖̜̭͔p̲̘̘̹͖t̠͖̟̹͓͇ͅ with visual fuzzing
Max tokens in val: 44 Title: Bangladeshi model Farhana Akhtar Nisho (ফারহানা আখতার নিশ্) hot and sexy photo


In [16]:
len(train_ids) , len(val_ids)

(107337, 11963)

In [17]:
# Split text into words (assuming words are separated by spaces)
train_words = train_text.split()
val_words = val_text.split()

# Find unique words using set
unique_train_words = set(train_words)
unique_val_words = set(val_words)

print("Number of unique words in train:", len(unique_train_words))
print("Number of unique words in val:", len(unique_val_words))


Number of unique words in train: 25516
Number of unique words in val: 4177


In [18]:
# Split text into words
train_words = set(train_text.split())
val_words = set(val_text.split())

# Combine both sets to get all unique words
all_unique_words = train_words | val_words  # union of sets

print("Total number of unique words in train + val:", len(all_unique_words))


Total number of unique words in train + val: 27707


In [19]:
25516 + 4177

29693

In [20]:
# Tokenize train and val text individually
train_tokens = tok.encode(train_text)
val_tokens = tok.encode(val_text)

# Convert to sets to get unique tokens
unique_train_tokens = set(train_tokens)
unique_val_tokens = set(val_tokens)

# Combined unique tokens across train and val
combined_unique_tokens = unique_train_tokens | unique_val_tokens

print("Unique tokens in train:", len(unique_train_tokens))
print("Unique tokens in val:", len(unique_val_tokens))
print("Combined unique tokens:", len(combined_unique_tokens))

# Find tokens in val but not in train (new tokens)
tokens_only_in_val = unique_val_tokens - unique_train_tokens
print("Tokens in val not seen in train:", len(tokens_only_in_val))


Unique tokens in train: 14403
Unique tokens in val: 4669
Combined unique tokens: 14751
Tokens in val not seen in train: 348


In [21]:
14751+348

15099