In [1]:
import os
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

In [2]:
from pathlib import Path

input_path = Path("/Users/vlad/googledrive/AI/datasets/murakami/murakami.txt")
if not input_path.exists():
    input_path = Path("/content/drive/MyDrive/AI/datasets/murakami/murakami.txt")
    assert input_path.exists(), input_path
    
with input_path.open(encoding='utf-8') as f:
    input_text = f.read()

## Tokenizers

In [3]:
print(len(input_text))
train_text = input_text[:10_000]
test_text = input_text[:152]

10155065


### Pre-trained

In [45]:
from transformers import AutoTokenizer
bt = AutoTokenizer.from_pretrained("bert-base-cased")
print(bt.backend_tokenizer.normalizer.normalize_str(test_text))

2023-02-22 22:08:14.849157: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Охота на овец Часть первая 25.11.1970 ПИКНИК СРЕДИ НЕДЕЛИ О ее смерти сообщил мне по телефону старый приятель, наткнувшись на случайные строчки в газете


### CharBPETokenizer

In [9]:
from tokenizers.implementations import CharBPETokenizer
char_bpe = CharBPETokenizer()
char_bpe.train_from_iterator(iter([train_text]))
print(char_bpe.encode(test_text).tokens)
print(char_bpe.decode(char_bpe.encode(test_text).ids))




['О', 'хот', 'а</w>', 'на</w>', 'о', 'ве', 'ц</w>', 'Ч', 'а', 'сть</w>', 'пер', 'вая</w>', '25</w>', '.</w>', '1', '1</w>', '.</w>', '19', '70</w>', 'П', 'И', 'К', 'Н', 'И', 'К</w>', 'С', 'Р', 'ЕД', 'И</w>', 'Н', 'ЕД', 'Е', 'Л', 'И</w>', 'О</w>', 'ее</w>', 'смерти</w>', 'со', 'общи', 'л</w>', 'мне</w>', 'по</w>', 'телеф', 'о', 'ну</w>', 'старый</w>', 'приятел', 'ь</w>', ',</w>', 'на', 'т', 'к', 'нувшись</w>', 'на</w>', 'случай', 'ные</w>', 'стро', 'ч', 'ки</w>', 'в</w>', 'газет', 'е</w>']
Охота на овец Часть первая 25 . 11 . 1970 ПИКНИК СРЕДИ НЕДЕЛИ О ее смерти сообщил мне по телефону старый приятель , наткнувшись на случайные строчки в газете


### ByteLevelBPETokenizer

In [10]:
from tokenizers.implementations import ByteLevelBPETokenizer
bpe = ByteLevelBPETokenizer()
bpe.train_from_iterator(iter([train_text]))
print(bpe.encode(test_text).tokens)
print(bpe.decode(bpe.encode(test_text).ids))




['Ðŀ', 'ÑħÐ¾ÑĤ', 'Ð°', 'ĠÐ½Ð°', 'ĠÐ¾', 'Ð²', 'ÐµÑĨ', 'Ċ', 'Ð', '§', 'Ð°ÑģÑĤÑĮ', 'ĠÐ¿ÐµÑĢ', 'Ð²Ð°Ñı', 'Ċ', '25', '.', '1', '1', '.', '19', '70', 'Ċ', 'ÐŁ', 'ÐĺÐļ', 'ÐĿ', 'ÐĺÐļ', 'ĠÐ¡', 'Ðł', 'ÐķÐĶ', 'Ðĺ', 'ĠÐĿ', 'ÐķÐĶ', 'Ðķ', 'Ð', 'Ľ', 'Ðĺ', 'Ċ', 'Ðŀ', 'ĠÐµÐµ', 'ĠÑģÐ¼ÐµÑĢÑĤÐ¸', 'ĠÑģÐ¾Ð¾Ð±Ñī', 'Ð¸Ð»', 'ĠÐ¼Ð½Ðµ', 'ĠÐ¿Ð¾', 'ĠÑĤÐµÐ»ÐµÑĦÐ¾', 'Ð½Ñĥ', 'ĠÑģÑĤÐ°ÑĢÑĭÐ¹', 'ĠÐ¿ÑĢÐ¸Ñı', 'ÑĤ', 'ÐµÐ»ÑĮ', ',', 'ĠÐ½Ð°', 'ÑĤ', 'Ðº', 'Ð½ÑĥÐ²ÑĪÐ¸ÑģÑĮ', 'ĠÐ½Ð°', 'ĠÑģÐ»ÑĥÑĩÐ°Ð¹', 'Ð½ÑĭÐµ', 'ĠÑģÑĤÑĢÐ¾', 'Ñĩ', 'ÐºÐ¸', 'ĠÐ²', 'ĠÐ³Ð°Ð·ÐµÑĤ', 'Ðµ']
Охота на овец
Часть первая
25.11.1970
ПИКНИК СРЕДИ НЕДЕЛИ
О ее смерти сообщил мне по телефону старый приятель, наткнувшись на случайные строчки в газете


### SentencePieceBPETokenizer

In [12]:
from tokenizers.implementations import SentencePieceBPETokenizer
spm_bpe = SentencePieceBPETokenizer()
spm_bpe.train_from_iterator(iter([train_text]))
print(spm_bpe.encode(test_text).tokens)
print(spm_bpe.decode(spm_bpe.encode(test_text).ids))




['▁О', 'хо', 'та', '▁на', '▁о', 'ве', 'ц', '\n', 'Ч', 'а', 'сть', '▁пер', 'вая', '\n', '25', '.1', '1', '.1', '9', '70', '\n', 'П', 'ИК', 'Н', 'ИК', '▁С', 'Р', 'ЕД', 'И', '▁Н', 'ЕД', 'Е', 'Л', 'И', '\n', 'О', '▁ее', '▁смер', 'ти', '▁сооб', 'щ', 'ил', '▁мне', '▁по', '▁телефо', 'ну', '▁старый', '▁прия', 'т', 'ель', ',', '▁на', 'т', 'к', 'нувши', 'сь', '▁на', '▁случай', 'ные', '▁стро', 'ч', 'ки', '▁в', '▁газет', 'е']
Охота на овец
Часть первая
25.11.1970
ПИКНИК СРЕДИ НЕДЕЛИ
О ее смерти сообщил мне по телефону старый приятель, наткнувшись на случайные строчки в газете


### WordPiece

In [14]:
from tokenizers.implementations import BertWordPieceTokenizer
wp = BertWordPieceTokenizer(clean_text=False, lowercase=False)
wp.train_from_iterator(iter([train_text]))
print(wp.encode(test_text).tokens)
print(wp.decode(wp.encode(test_text).ids))




['О', '##хо', '##та', 'на', 'о', '##ве', '##ц', 'Ч', '##ас', '##ть', 'пер', '##вая', '25', '.', '1', '##1', '.', '19', '##7', '##0', 'П', '##ИК', '##Н', '##ИК', 'С', '##Р', '##ЕД', '##И', 'Н', '##ЕД', '##Е', '##Л', '##И', 'О', 'ее', 'смерти', 'сооб', '##щ', '##ил', 'мне', 'по', 'телефо', '##ну', 'старый', 'прия', '##тел', '##ь', ',', 'на', '##т', '##к', '##нувшись', 'на', 'случай', '##ные', 'стро', '##ч', '##ки', 'в', 'газет', '##е']
Охота на овец Часть первая 25. 11. 1970 ПИКНИК СРЕДИ НЕДЕЛИ О ее смерти сообщил мне по телефону старый приятель, наткнувшись на случайные строчки в газете


### SentencePieceUnigramTokenizer

In [13]:
from tokenizers.implementations import SentencePieceUnigramTokenizer
smp_ug = SentencePieceUnigramTokenizer()
smp_ug.train_from_iterator(iter([train_text]))
print(smp_ug.encode(input_text[:152]).tokens)
print(smp_ug.decode(smp_ug.encode(input_text[:152]).ids))



['▁О', 'хот', 'а', '▁на', '▁', 'ов', 'ец', '▁Ч', 'асть', '▁пер', 'ва', 'я', '▁25', '.1', '1', '.', '19', '70', '▁П', 'ИК', 'Н', 'ИК', '▁С', 'Р', 'ЕД', 'И', '▁Н', 'ЕД', 'Е', 'Л', 'И', '▁О', '▁ее', '▁смерти', '▁сообщ', 'ил', '▁мне', '▁по', '▁телефон', 'у', '▁стары', 'й', '▁приятел', 'ь', ',', '▁на', 'тк', 'нувшись', '▁на', '▁случайн', 'ые', '▁стро', 'ч', 'ки', '▁в', '▁газет', 'е']
Охота на овец Часть первая 25.11.1970 ПИКНИК СРЕДИ НЕДЕЛИ О ее смерти сообщил мне по телефону старый приятель, наткнувшись на случайные строчки в газете


## Murakami tokenizer

In [1]:
from pathlib import Path
input_path = Path("/Users/vlad/MyDrive/AI/datasets/murakami/murakami.txt")
assert input_path.exists(), input_path

input_path = input_path.with_name("murakami-1000lines.txt")


class Config:
    sample_only: str = True
    seed = 0

    # Dataset
    input_path = input_path
    vocab_size: int = 30_000
    context_len: int = 32

    # Network
    emb_dim: int = 32
    n_blocks: int = 2
    n_heads: int = 4

    # Optimization
    batch_size: int = 32
    learning_rate: float = 5e-4
    weight_decay: float = 0.01
    num_workers: int = 1
    max_steps: int = 100_000

In [None]:
from tokenizers.implementations import ByteLevelBPETokenizer
bpe = ByteLevelBPETokenizer()
bpe.train(files=str(Config.input_path))
bpe.save_model(str(Config.input_path.parent), "bpe")

In [113]:
import torch
torch.manual_seed(Config.seed)
torch.cuda.manual_seed_all(Config.seed)
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [116]:
from typing import Tuple
import torch
from torch.utils.data import random_split, TensorDataset


class TransformerDataset:
    def __init__(self, text: str, context_len: int):
        super().__init__()
        self.context_len = context_len
        self.data = torch.tensor(bpe.encode(text).ids, dtype=torch.long)
        
    def __getitem__(self, index) -> Tuple[torch.Tensor, torch.Tensor]:
        x = self.data[index : index + self.context_len]
        y = self.data[index + 1 : index + self.context_len + 1]
        return x, y

    def __len__(self):
        return len(self.data) - self.context_len - 1


with Config.input_path.open(encoding='utf-8') as f:
    murakami = TransformerDataset(f.read(), Config.context_len)


test_n = min(1000, int(len(murakami) * 0.1))
train, test = random_split(murakami, [len(murakami) - test_n, test_n])

from torch.utils.data import DataLoader
from tokenizers.decoders import ByteLevel

dataloader = DataLoader(
    train, 
    batch_size=Config.batch_size,
    sampler=torch.utils.data.RandomSampler(
        train, replacement=True, num_samples=int(1e10)
    ),
    num_workers=Config.num_workers,
)

for (x, y) in dataloader:
    x = x.to(device)
    y = y.to(device)
    print(ByteLevel().decode(x))
    break


Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/usr/local/Cellar/python@3.10/3.10.9/Frameworks/Python.framework/Versions/3.10/lib/python3.10/multiprocessing/spawn.py", line 116, in spawn_main
    exitcode = _main(fd, parent_sentinel)
  File "/usr/local/Cellar/python@3.10/3.10.9/Frameworks/Python.framework/Versions/3.10/lib/python3.10/multiprocessing/spawn.py", line 126, in _main
    self = reduction.pickle.load(from_parent)
AttributeError: Can't get attribute 'TransformerDataset' on <module '__main__' (built-in)>


RuntimeError: DataLoader worker (pid(s) 31927) exited unexpectedly

In [112]:

from torch.utils.data import DataLoader
from tokenizers.decoders import ByteLevel

dataloader = DataLoader(
    train, 
    batch_size=Config.batch_size,
    sampler=torch.utils.data.RandomSampler(
        train, replacement=True, num_samples=int(1e10)
    ),
    num_workers=Config.num_workers,
)

for (x, y) in dataloader:
    x = x.to(device)
    y = y.to(device)
    print(ByteLevel().decode(x))
    break


Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/usr/local/Cellar/python@3.10/3.10.9/Frameworks/Python.framework/Versions/3.10/lib/python3.10/multiprocessing/spawn.py", line 116, in spawn_main
    exitcode = _main(fd, parent_sentinel)
  File "/usr/local/Cellar/python@3.10/3.10.9/Frameworks/Python.framework/Versions/3.10/lib/python3.10/multiprocessing/spawn.py", line 126, in _main
    self = reduction.pickle.load(from_parent)
AttributeError: Can't get attribute 'TransformerDataset' on <module '__main__' (built-in)>


RuntimeError: DataLoader worker (pid(s) 30827) exited unexpectedly

## Makemore tokenizer


In [73]:
from typing import List, Optional
from pathlib import Path

path = Path("/Users/vlad/MyDrive/AI/datasets/names/names.txt")
with path.open() as f:
    text = f.read()

class CharTokenizer:
    def __init__(self, text: str):
        chars = sorted(set("".join(text.split())))
        self.vocab_size = len(chars) + 1
        self.itos = {i + 1: c for i, c in enumerate(chars)}
        self.itos[0] = '.'
        self.itos[-1] = '_'
        self.stoi = {c: i for i, c in self.itos.items()}

    def get_vocab(self):
        return self.itos
    
    def token_to_id(self, token: str) -> int:
        return self.stoi[token]
    
    def get_vocab_size(self):
        return self.vocab_size
    
    def encode(self, text: str) -> List[int]:
        return [self.stoi[c] for c in text]

    def decode(self, ids: List[int], skip_special_tokens: bool = True) -> str:
        if skip_special_tokens:
            ids = [i for i in ids if i != 0]
        return "".join(self.itos[i] for i in ids)
    
tokenizer = CharTokenizer(text)
dict(sorted(tokenizer.get_vocab().items(), key=lambda kv: kv[1]))

{0: '.',
 -1: '_',
 1: 'a',
 2: 'b',
 3: 'c',
 4: 'd',
 5: 'e',
 6: 'f',
 7: 'g',
 8: 'h',
 9: 'i',
 10: 'j',
 11: 'k',
 12: 'l',
 13: 'm',
 14: 'n',
 15: 'o',
 16: 'p',
 17: 'q',
 18: 'r',
 19: 's',
 20: 't',
 21: 'u',
 22: 'v',
 23: 'w',
 24: 'x',
 25: 'y',
 26: 'z'}

In [9]:
from tokenizers.implementations import CharBPETokenizer
t = CharBPETokenizer()
t.train_from_iterator([text], vocab_size=0, suffix="")
print(t.get_vocab_size())
print(dict(sorted(t.get_vocab().items(), key=lambda kv: kv[1])))




27
{'<unk>': 0, 'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25, 'z': 26}


In [57]:
names = text.split()
max_name_len = max(len(n) for n in names)
t.enable_padding("left", pad_id=0, pad_token=".", length=max_name_len)
t.encode("abcd").tokens

['.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', 'a', 'b', 'c', 'd']

In [74]:
import torch

names = names[:2]
max_name_len = max(len(n) for n in names)
xs = []
ys = []
for name in names:
    """
    ....... -> _____.e
    ......e -> ____.em
    .....em -> ___.emm
    ....emm -> __.emma
    ...emma -> _.emma.
    ....... -> _____.o
    ......o -> ____.ol
    .....ol -> ___.oli
    ....oli -> __.oliv
    ...oliv -> _.olivi
    ..olivi -> .olivia
    .olivia -> olivia.
    """
    
    # name = "." + name + "."
    # for i in range(1, len(name)):
    #     x = torch.zeros(max_name_len + 1, dtype=torch.long)
    #     y = torch.zeros(max_name_len + 1, dtype=torch.long)
    #     y[:] = -1
    #     subname_x, subname_y = name[:i], name[:i + 1]
    #     if len(subname_y) > len(y):
    #         subname_y = subname_y[1:]  # trim leading dot for the longest word
    #     x[-len(subname_x):] = torch.tensor(tokenizer.encode(subname_x))
    #     y[-len(subname_y):] = torch.tensor(tokenizer.encode(subname_y))
    #     xs.append(x)
    #     ys.append(y)
    #     print(
    #         tokenizer.decode(x.tolist(), skip_special_tokens=False), '->',
    #         tokenizer.decode(y.tolist(), skip_special_tokens=False)
    #     )

    x = torch.zeros(max_name_len + 1, dtype=torch.long)
    y = torch.zeros(max_name_len + 1, dtype=torch.long)

    ids = [tokenizer.token_to_id(ch) for ch in name]

    x[0] = tokenizer.token_to_id(".")
    x[1: 1 + len(name)] = torch.tensor(ids)

    y[0: len(name)] = torch.tensor(tokenizer.encode(name))
    y[len(name)] = tokenizer.token_to_id(".")
    y[len(name) + 1 :] = tokenizer.token_to_id("_")

    xs.append(x)
    ys.append(y)
    print(
        tokenizer.decode(x.tolist(), skip_special_tokens=False), '->', 
        tokenizer.decode(y.tolist(), skip_special_tokens=False)
    )

xs = torch.stack(xs)
ys = torch.stack(ys)

.emma.. -> emma.__
.olivia -> olivia.


In [19]:
import torch
with open(path) as f:
    names = f.read().split()
max_name_len = max(len(n) for n in names)
names = names[:5]
xs, ys = [], []
for name in names:
    x = torch.zeros(max_name_len + 1, dtype=torch.long)
    y = torch.zeros(max_name_len + 1, dtype=torch.long)
    x[0] = tokenizer.token_to_id(".")
    print(name, tokenizer.encode(name).ids, tokenizer.encode(name).tokens)
    x[1: 1 + len(name)] = torch.tensor(tokenizer.encode(name).ids)
    y[0: len(name)] = torch.tensor(tokenizer.encode(name).ids)
    y[len(name)] = tokenizer.token_to_id(".")
    xs.append(x)
    ys.append(y)
    print(tokenizer.decode(x.tolist(), skip_special_tokens=False), '->', tokenizer.decode(y.tolist(), skip_special_tokens=False))

xs = torch.stack(xs)
ys = torch.stack(ys)

emma [27, 5, 13, 13, 1] ['▁', 'e', 'm', 'm', 'a']


RuntimeError: The expanded size of the tensor (4) must match the existing size (5) at non-singleton dimension 0.  Target sizes: [4].  Tensor sizes: [5]

In [70]:
line = 'abc'
torch.tensor(
    tokenizer.encode('abc').ids, dtype=torch.long
).unsqueeze(0)


tensor([[2, 3, 4]])

In [56]:
from tokenizers.implementations import ByteLevelBPETokenizer
save_path = input_path.with_name(input_path.stem + '-tokenizer.json')
tokenizer = ByteLevelBPETokenizer()
tokenizer.train(files=str(input_path))
tokenizer.save(str(save_path))






In [57]:
from tokenizers import Tokenizer
print(save_path)
loaded = Tokenizer.from_file(str(save_path))
loaded.encode('ОХОТА').tokens

/Users/vlad/MyDrive/AI/datasets/murakami/murakami-1000lines-tokenizer.json


['ÐŀÐ¥', 'ÐŀÐ¢ÐĲ']

In [58]:
from tokenizers.implementations import BaseTokenizer
loaded2 = BaseTokenizer(loaded)
print(loaded2.encode('ОХОТА\nНА ОВЕЦ').tokens)
print(loaded2.decode(loaded2.encode('ОХОТА\nНА ОВЕЦ').ids))

['ÐŀÐ¥', 'ÐŀÐ¢ÐĲ', 'Ċ', 'ÐĿÐĲ', 'ĠÐŀÐĴÐķÐ¦']
ОХОТА
НА ОВЕЦ


In [24]:
SentencePieceBPETokenizer(
    vocab=str(input_path.parent / "murakami-1000lines-smp-bpe-vocab.json"),
    merges=str(input_path.parent / "murakami-1000lines-smp-bpe-merges.txt"),
)

Exception: Error while initializing BPE: Merges text file invalid at line 29

In [19]:
loaded.save(str(save_path) + '2')

In [None]:
# load tokenizer from file
ByteLevelBPETokenizer
tokenizer = SentencePieceBPETokenizer(
    "/Users/vlad/git/vladsaveliev/deeplearning/hipogpt/tmp/bpe-vocab.json",
    "/Users/vlad/git/vladsaveliev/deeplearning/hipogpt/tmp/bpe-merges.txt",
)
