In [8]:
import os
from typing import BinaryIO

In [13]:
# Binary-mode handle kept open on purpose: the following cells call f.tell() and f.read(10) on it.
# NOTE(review): it is never explicitly closed in the visible cells.
f = open('./tokenizer/bpe_example.txt', 'rb')

In [16]:
f.tell()

10

In [15]:
b = f.read(10)
b

b'low low lo'

In [None]:
b

In [None]:
def train_bpe(
        input_path: str | os.PathLike,
        vocab_size: int,
        special_tokens: list[str]
) -> tuple[dict[int, bytes], list[tuple[bytes, bytes]]]:
    path: Path = Path(input_path)
    context: str
    with path.open('+r'):
        
    return {}, []

In [17]:
p = './tokenizer/bpe_example.txt'

In [22]:
import os
from pathlib import Path

# Load the sample corpus as text.
# Plain 'r' is enough: '+r' is equivalent to 'r+' and would require write
# permission on a file that is only read. The encoding is pinned so the result
# does not depend on the platform's default locale.
path: Path = Path(p)
with path.open('r', encoding='utf-8') as f:
    content: str = f.read()
content

'low low low low low\nlower lower widest widest widest\nnewest newest newest newest newest newest'

In [23]:
content.encode()

b'low low low low low\nlower lower widest widest widest\nnewest newest newest newest newest newest'

In [32]:
import regex as re

PAT = r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""

def pretokenizer(content: str, special_tokens: list[str]) -> dict[str, int]:
    """Count how often each pre-token occurs in ``content``.

    The text is scanned with the module-level ``PAT`` pattern and every match
    is tallied under its matched string.

    Args:
        content: text to pre-tokenize.
        special_tokens: accepted for signature compatibility; this version
            does not use it.

    Returns:
        Mapping from pre-token string to its number of occurrences, in order
        of first appearance.
    """
    counter: dict[str, int] = {}
    for match in re.finditer(PAT, content):
        text = match.group()
        counter[text] = counter.get(text, 0) + 1
    return counter

In [33]:
pretokenizer(content, [])

{'low': 1,
 ' low': 4,
 '\n': 2,
 'lower': 1,
 ' lower': 1,
 ' widest': 3,
 'newest': 1,
 ' newest': 5}

In [41]:
def remove_special_tokens(content: str, special_tokens: list[str]) -> list[str]:
    """Split ``content`` at every special token and drop the tokens themselves.

    Args:
        content: text that may contain special tokens.
        special_tokens: literal token strings to split on (regex-escaped here).

    Returns:
        The pieces of ``content`` between special tokens. If there is nothing
        to split on, the result is ``[content]``.
    """
    # Ignore empty strings: they would contribute a zero-width alternative.
    tokens = [token for token in special_tokens if token]
    if not tokens:
        # An empty alternation matches the empty string at every position, so
        # splitting on it would not leave the text intact.
        return [content]
    # Longest first, so a token that is a prefix of another one cannot shadow it.
    tokens.sort(key=len, reverse=True)
    pattern = '|'.join(re.escape(token) for token in tokens)
    return re.split(pattern, content)


PAT = r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""


def pretokenizer(content: str, special_tokens: list[str]) -> dict[tuple[int, ...], int]:
    """Count pre-tokens in ``content``, keyed by their UTF-8 byte values.

    The text is first cut at the special tokens so that no pre-token spans
    one, then each piece is scanned with the module-level ``PAT`` pattern.

    Args:
        content: text to pre-tokenize.
        special_tokens: literal strings that delimit documents; they are
            removed and never counted.

    Returns:
        Mapping from a pre-token, as a tuple of ints (one per UTF-8 byte),
        to its number of occurrences.
    """
    if special_tokens:
        docs = remove_special_tokens(content, special_tokens)
    else:
        # No delimiters: joining an empty list would give an empty split
        # pattern, so treat the whole text as a single document instead.
        docs = [content]

    counter: dict[tuple[int, ...], int] = {}
    for doc in docs:
        for match in re.finditer(PAT, doc):
            # Iterating a bytes object yields ints, hence tuple[int, ...] keys.
            key = tuple(match.group().encode('utf8'))
            counter[key] = counter.get(key, 0) + 1
    return counter

In [None]:
# Pre-tokenize the TinyStories validation split and print the raw counts.
# The file is opened read-only ('+r' means 'r+', which needs write permission)
# and closed before the potentially slow pre-tokenization starts.
path = Path('data/TinyStoriesV2-GPT4-valid.txt')
with path.open('r', encoding='utf-8') as f:
    content = f.read()
print(pretokenizer(content, ['<|endoftext|>']))

In [43]:
a = 1

In [45]:
# bytes(n) with an int n builds n zero bytes, so with a == 1 this gives b'\x00',
# not the byte whose value is 1 -- that would be bytes([a]).
bytes(a)

b'\x00'

In [None]:
for i in range(256):
    print(bytes([i]))

In [48]:
def init_vocab(special_tokens: list[str]) -> dict[int, bytes]:
    """Build the starting vocabulary for byte-level BPE.

    Ids 0..255 map to the corresponding single byte; each special token then
    takes the next free id, in the order given, stored as its UTF-8 bytes.
    """
    table = {byte_value: bytes([byte_value]) for byte_value in range(256)}
    for token in special_tokens:
        # The next free id is always the current size of the table.
        table[len(table)] = token.encode('utf8')
    return table

In [49]:
init_vocab(['<|endoftext|>'])

{0: b'\x00',
 1: b'\x01',
 2: b'\x02',
 3: b'\x03',
 4: b'\x04',
 5: b'\x05',
 6: b'\x06',
 7: b'\x07',
 8: b'\x08',
 9: b'\t',
 10: b'\n',
 11: b'\x0b',
 12: b'\x0c',
 13: b'\r',
 14: b'\x0e',
 15: b'\x0f',
 16: b'\x10',
 17: b'\x11',
 18: b'\x12',
 19: b'\x13',
 20: b'\x14',
 21: b'\x15',
 22: b'\x16',
 23: b'\x17',
 24: b'\x18',
 25: b'\x19',
 26: b'\x1a',
 27: b'\x1b',
 28: b'\x1c',
 29: b'\x1d',
 30: b'\x1e',
 31: b'\x1f',
 32: b' ',
 33: b'!',
 34: b'"',
 35: b'#',
 36: b'$',
 37: b'%',
 38: b'&',
 39: b"'",
 40: b'(',
 41: b')',
 42: b'*',
 43: b'+',
 44: b',',
 45: b'-',
 46: b'.',
 47: b'/',
 48: b'0',
 49: b'1',
 50: b'2',
 51: b'3',
 52: b'4',
 53: b'5',
 54: b'6',
 55: b'7',
 56: b'8',
 57: b'9',
 58: b':',
 59: b';',
 60: b'<',
 61: b'=',
 62: b'>',
 63: b'?',
 64: b'@',
 65: b'A',
 66: b'B',
 67: b'C',
 68: b'D',
 69: b'E',
 70: b'F',
 71: b'G',
 72: b'H',
 73: b'I',
 74: b'J',
 75: b'K',
 76: b'L',
 77: b'M',
 78: b'N',
 79: b'O',
 80: b'P',
 81: b'Q',
 82: b'R',
 83: b'

In [62]:
def find_max_bp(pretokenized_count: dict[tuple[int, ...], int]) -> tuple[tuple, int]:
    """Return the most frequent adjacent pair over all pre-tokens, with its count.

    Args:
        pretokenized_count: maps each pre-token (a sequence of byte values) to
            how often it occurs. Counts are assumed to be non-negative.

    Returns:
        ``(pair, count)``. When several pairs share the highest count, the
        lexicographically greatest pair wins. If no adjacent pair exists at
        all (empty input, or only single-element pre-tokens), the placeholder
        pair of two single-NUL ``bytes`` objects is returned with count 0.
    """
    # Accumulate, for every adjacent pair, the frequency of each pre-token it occurs in.
    pair_counts: dict[tuple, int] = {}
    for symbols, freq in pretokenized_count.items():
        for pair in zip(symbols, symbols[1:]):
            pair_counts[pair] = pair_counts.get(pair, 0) + freq

    if not pair_counts:
        # Same placeholder as before; it is never compared against real
        # (int, int) pairs, which would raise a TypeError.
        return (bytes([0]), bytes([0])), 0

    # Order by count first, then by the pair itself to break ties.
    return max(pair_counts.items(), key=lambda item: (item[1], item[0]))

In [63]:
def get_pretokenized_count(
        path: str | os.PathLike = 'data/TinyStoriesV2-GPT4-valid.txt',
        special_tokens: list[str] | None = None,
):
    """Read a corpus file and return its pre-token counts.

    Args:
        path: corpus location; defaults to the TinyStories validation split.
        special_tokens: document delimiters passed on to ``pretokenizer``;
            defaults to ``['<|endoftext|>']``.

    Returns:
        Whatever ``pretokenizer`` returns for the file's text.

    Raises:
        OSError: if the file cannot be opened.
    """
    if special_tokens is None:
        special_tokens = ['<|endoftext|>']
    # Read-only is sufficient; '+r' (i.e. 'r+') would require write permission.
    with Path(path).open('r', encoding='utf-8') as corpus:
        content = corpus.read()
    # The file is already closed here, before the slow counting step runs.
    return pretokenizer(content, special_tokens)

In [64]:
pretokenized_count = get_pretokenized_count()
max_bp, max_count = find_max_bp(pretokenized_count)

In [66]:
max_bp

(32, 116)

In [69]:
b1 = b'\x01\x02'

In [70]:
len(b1)

2

In [72]:
bytes(max_bp)

b' t'

In [1]:
import pickle

In [4]:
# Load the pickled vocabulary from ./output and report how many entries it has.
# NOTE(review): pickle.load can execute arbitrary code while unpickling --
# only load files produced by this project, never untrusted ones.
with open('./output/vocab.pkl', 'rb') as f:
    vocab = pickle.load(f)

print(len(vocab))

10000


In [None]:
# Load the pickled merges from ./output and print the whole object.
# NOTE(review): pickle.load can execute arbitrary code while unpickling --
# only load files produced by this project, never untrusted ones.
with open('./output/merges.pkl', 'rb') as f:
    merges = pickle.load(f)

print(merges)