In [82]:
import os
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

In [68]:
from pathlib import Path

input_path = Path("/Users/vlad/googledrive/AI/datasets/murakami/murakami.txt")
if not input_path.exists():
    input_path = Path("/content/drive/MyDrive/AI/datasets/murakami/murakami.txt")
    assert input_path.exists(), input_path
    
with input_path.open(encoding='utf-8') as f:
    input_text = f.read()

## Tokenizers

In [99]:
from transformers import AutoTokenizer
bt = AutoTokenizer.from_pretrained("bert-base-cased")
print(bt.backend_tokenizer.normalizer.normalize_str(input_text[:100]))

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Охота на овец Часть первая 25.11.1970 ПИКНИК СРЕДИ НЕДЕЛИ О ее смерти сообщил мне по телефону старый


### BPE

In [None]:
from tokenizers.implementations import ByteLevelBPETokenizer
bpe = ByteLevelBPETokenizer()
bpe.train(files=str(input_path))
bpe.save_model(str(input_path.parent), "huggingface-bpe")

In [65]:
from tokenizers.decoders import ByteLevel
print(''.join(ByteLevel().decode([t]) for t in bpe.encode(input_text[:152]).tokens))

Охота на овец
Часть первая
25.11.1970
ПИКНИК СРЕДИ НЕДЕЛИ
О ее смерти сообщил мне по телефону старый приятель, наткнувшись на случайные строчки в газете


### SentencePiece BPE

In [60]:
from tokenizers.implementations import SentencePieceBPETokenizer
spm_bpe = SentencePieceBPETokenizer()
spm_bpe.train(files=str(input_path))
spm_bpe.save_model(str(input_path.parent), "sentencepiece-bpe")






['/Users/vlad/googledrive/AI/datasets/murakami/sentencepiece-bpe-vocab.json',
 '/Users/vlad/googledrive/AI/datasets/murakami/sentencepiece-bpe-merges.txt']

In [61]:
from tokenizers.decoders import ByteLevel
', '.join(ByteLevel().decode([t]) for t in spm_bpe.encode(input_text[:152]).tokens)

'▁О, хо, та, ▁на, ▁овец, \n, Ч, а, сть, ▁первая, \n, 2, 5., 1, 1., 19, 7, 0\n, П, И, К, НИ, К, ▁С, РЕ, Д, И, ▁НЕ, Д, ЕЛ, И, \n, О, ▁ее, ▁смерти, ▁сообщил, ▁мне, ▁по, ▁телефону, ▁старый, ▁прия, тель,, ▁нат, кнувшись, ▁на, ▁случай, ные, ▁строчки, ▁в, ▁газете'

### WordPiece

In [None]:
from tokenizers.implementations import BertWordPieceTokenizer
wp = BertWordPieceTokenizer()
wp.train(files=str(input_path))
wp.save_model(str(input_path.parent), "huggingface-bertwordpiece")

In [53]:
', '.join(wp.encode(input_text[:152]).tokens)

'охота, на, овец, часть, первая, 25, ., 11, ., 1970, пикник, среди, недели, о, ее, смерти, сообщил, мне, по, телефону, старыи, приятель, ,, натк, ##нувшись, на, случаиные, строчки, в, газете'

### SentencePiece Unigram

In [None]:
from tokenizers.implementations import SentencePieceUnigramTokenizer
smp_ug = SentencePieceUnigramTokenizer()
smp_ug.train(files=str(input_path))
smp_ug.save_model(str(input_path.parent), "sentencepiece-unigram")

In [59]:
', '.join(smp_ug.encode(input_text[:152]).tokens)

'▁О, х, от, а, ▁на, ▁овец, ▁Час, ть, ▁первая, ▁2, 5, ., 1, 1, ., 1970, ▁П, И, К, Н, И, К, ▁С, Р, Е, Д, И, ▁, Н, Е, Д, Е, Л, И, ▁О, ▁ее, ▁смерти, ▁сообщил, ▁мне, ▁по, ▁телефону, ▁старый, ▁приятель, ,, ▁на, т, к, нувшись, ▁на, ▁случай, ные, ▁строч, ки, ▁в, ▁газет, е'

In [66]:
from tokenizers.implementations import ByteLevelBPETokenizer
bpe = ByteLevelBPETokenizer()
bpe.train(files=str(input_path), vocab_size=30_000)
bpe.save_model(input_path.parent, "bpe-tokenizer")

Tokenizer(vocabulary_size=30000, model=ByteLevelBPE, add_prefix_space=False, lowercase=False, dropout=None, unicode_normalizer=None, continuing_subword_prefix=None, end_of_word_suffix=None, trim_offsets=False)

In [109]:
from pathlib import Path
input_path = Path("/Users/vlad/googledrive/AI/datasets/murakami/murakami.txt")
if not input_path.exists():
    input_path = Path("/content/drive/MyDrive/AI/datasets/murakami/murakami.txt")
    assert input_path.exists(), input_path
    
input_path = input_path.with_name("murakami-1000lines.txt")


class Config:
    sample_only: str = True
    seed = 0

    # Dataset
    input_path = input_path
    vocab_size: int = 30_000
    context_len: int = 32

    # Network
    emb_dim: int = 32
    n_blocks: int = 2
    n_heads: int = 4

    # Optimization
    batch_size: int = 32
    learning_rate: float = 5e-4
    weight_decay: float = 0.01
    num_workers: int = 1
    max_steps: int = 100_000

In [None]:
from tokenizers.implementations import ByteLevelBPETokenizer
bpe = ByteLevelBPETokenizer()
bpe.train(files=str(Config.input_path))
bpe.save_model(str(Config.input_path.parent), "bpe")

In [113]:
import torch
torch.manual_seed(Config.seed)
torch.cuda.manual_seed_all(Config.seed)
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [116]:
from typing import Tuple
import torch
from torch.utils.data import random_split, TensorDataset


class TransformerDataset:
    def __init__(self, text: str, context_len: int):
        super().__init__()
        self.context_len = context_len
        self.data = torch.tensor(bpe.encode(text).ids, dtype=torch.long)
        
    def __getitem__(self, index) -> Tuple[torch.Tensor, torch.Tensor]:
        x = self.data[index : index + self.context_len]
        y = self.data[index + 1 : index + self.context_len + 1]
        return x, y

    def __len__(self):
        return len(self.data) - self.context_len - 1


with Config.input_path.open(encoding='utf-8') as f:
    murakami = TransformerDataset(f.read(), Config.context_len)


test_n = min(1000, int(len(murakami) * 0.1))
train, test = random_split(murakami, [len(murakami) - test_n, test_n])

from torch.utils.data import DataLoader
from tokenizers.decoders import ByteLevel

dataloader = DataLoader(
    train, 
    batch_size=Config.batch_size,
    sampler=torch.utils.data.RandomSampler(
        train, replacement=True, num_samples=int(1e10)
    ),
    num_workers=Config.num_workers,
)

for (x, y) in dataloader:
    x = x.to(device)
    y = y.to(device)
    print(ByteLevel().decode(x))
    break


Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/usr/local/Cellar/python@3.10/3.10.9/Frameworks/Python.framework/Versions/3.10/lib/python3.10/multiprocessing/spawn.py", line 116, in spawn_main
    exitcode = _main(fd, parent_sentinel)
  File "/usr/local/Cellar/python@3.10/3.10.9/Frameworks/Python.framework/Versions/3.10/lib/python3.10/multiprocessing/spawn.py", line 126, in _main
    self = reduction.pickle.load(from_parent)
AttributeError: Can't get attribute 'TransformerDataset' on <module '__main__' (built-in)>


RuntimeError: DataLoader worker (pid(s) 31927) exited unexpectedly

In [112]:

from torch.utils.data import DataLoader
from tokenizers.decoders import ByteLevel

dataloader = DataLoader(
    train, 
    batch_size=Config.batch_size,
    sampler=torch.utils.data.RandomSampler(
        train, replacement=True, num_samples=int(1e10)
    ),
    num_workers=Config.num_workers,
)

for (x, y) in dataloader:
    x = x.to(device)
    y = y.to(device)
    print(ByteLevel().decode(x))
    break


Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/usr/local/Cellar/python@3.10/3.10.9/Frameworks/Python.framework/Versions/3.10/lib/python3.10/multiprocessing/spawn.py", line 116, in spawn_main
    exitcode = _main(fd, parent_sentinel)
  File "/usr/local/Cellar/python@3.10/3.10.9/Frameworks/Python.framework/Versions/3.10/lib/python3.10/multiprocessing/spawn.py", line 126, in _main
    self = reduction.pickle.load(from_parent)
AttributeError: Can't get attribute 'TransformerDataset' on <module '__main__' (built-in)>


RuntimeError: DataLoader worker (pid(s) 30827) exited unexpectedly