In [1]:
import pandas as pd
from collections import defaultdict

## 2.4 BPE Tokenizer Training


In [53]:
# Pre-tokenization pattern. Alternatives, tried left to right at each position:
#   1. an apostrophe contraction suffix: 's 'd 'm 't 'll 've 're
#   2. a run of letters, optionally preceded by one space
#   3. a run of numeric characters, optionally preceded by one space
#   4. a run of anything that is not whitespace, a letter or a number,
#      optionally preceded by one space
#   5. whitespace that is not followed by a non-whitespace character
#   6. any remaining whitespace
# The \p{L} / \p{N} Unicode property classes are why this notebook uses the
# third-party `regex` module (imported as `re`) rather than the stdlib `re`.
PAT = r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""

In [54]:
import regex as re

In [55]:
# Sanity check: findall returns every pre-token of the sample as a plain string.
re.findall(PAT, "some text that i'll pre-tokenize")

['some', ' text', ' that', ' i', "'ll", ' pre', '-', 'tokenize']

In [56]:
# finditer yields match objects lazily instead of building a list, so this
# cell only displays the iterator object itself; loop over it and call
# .group(0) on each match to get the pre-token text.
re.finditer(PAT, "some text that i'll pre-tokenize")

<_regex.Scanner at 0x138774ac0>

## 2.5 Experimenting with BPE Tokenizer Training


In [2]:
def train_bpe(
    input_path: str,
    vocab_size: int,
    special_tokens: list[str],
) -> tuple[dict[int, bytes], list[tuple[bytes, bytes]]]:
    """Placeholder for the BPE training entry point (not implemented yet).

    The annotations sketch the intended contract: a path, a vocabulary size
    and a list of special-token strings go in; a pair made of an
    ``int -> bytes`` mapping and a list of ``(bytes, bytes)`` tuples comes out
    (presumably the vocabulary and the merges -- TODO confirm once written).

    The body is currently empty, so calling the function returns ``None``
    regardless of the arguments.
    """
    return None

In [None]:
def load_txt_as_str(input_path: str) -> str:
    """Read the whole file at ``input_path`` and return it as a single string.

    The file is opened in text mode and decoded as UTF-8; it is closed again
    before the function returns.
    """
    with open(input_path, "r", encoding="utf-8") as handle:
        return handle.read()

def split_string(string: str, special_tokens: list[str]) -> list[str]:
    """Split ``string`` into the chunks that lie between special tokens.

    Args:
        string: Text to split.
        special_tokens: Literal delimiter strings. They are regex-escaped, so
            characters such as ``|`` are matched verbatim.

    Returns:
        The pieces of ``string`` found between occurrences of any special
        token, with the tokens themselves removed. Adjacent, leading or
        trailing tokens produce empty-string pieces. When there are no usable
        special tokens the result is ``[string]``.
    """
    # Ignore empty tokens, and try longer tokens first: regex alternation
    # takes the first alternative that matches, so a token that is a prefix
    # of another one would otherwise pre-empt the longer match.
    delimiters = sorted((tok for tok in special_tokens if tok), key=len, reverse=True)
    if not delimiters:
        # An empty pattern would match between every character and shred the
        # text instead of leaving it untouched.
        return [string]
    pattern = "|".join(re.escape(tok) for tok in delimiters)
    return re.split(pattern, string)

def get_tok_counts(string_list: list[str]) -> dict[str, int]:
    """Count how often each pre-token occurs across ``string_list``.

    Every chunk is scanned with the module-level ``PAT`` regex; the full text
    of each match is the pre-token that gets counted.

    Returns:
        A ``defaultdict(int)`` mapping pre-token text -> number of occurrences
        summed over all chunks.
    """
    tally = defaultdict(int)
    for chunk in string_list:
        for match in re.finditer(PAT, chunk):
            tally[match.group(0)] += 1
    return tally

def get_element_counts(counts: dict[str, int]) -> dict[tuple[str, ...], int]:
    """Re-key pre-token counts by each token's sequence of characters.

    Args:
        counts: Mapping of pre-token string -> occurrence count.

    Returns:
        A ``defaultdict(int)`` whose keys are tuples holding one
        single-character string per character of the token
        (``"hi"`` -> ``("h", "i")``) and whose values are the matching counts.

    NOTE(review): elements are Unicode characters, not bytes, whereas the
    ``train_bpe`` stub is annotated with ``bytes`` -- presumably an encoding
    step is still to come; confirm.
    """
    element_counts = defaultdict(int)
    for token, count in counts.items():
        # Iterating a str yields its characters, so tuple() is all we need.
        element_counts[tuple(token)] += count
    return element_counts

def get_pair_counts(element_counts: dict[tuple[str, ...], int]) -> dict[str, int]:
    """Count adjacent element pairs, weighted by how often each token occurs.

    Args:
        element_counts: Mapping of element tuple (one entry per token) ->
            occurrence count of that token.

    Returns:
        A ``defaultdict(int)`` mapping the concatenation of two neighbouring
        elements -> total count over all tokens. Tokens with fewer than two
        elements contribute nothing.

    NOTE: a pair is keyed by the *joined* string. With single-character
    elements that is unambiguous, but once elements are longer than one
    character, distinct pairs such as ``("a", "bc")`` and ``("ab", "c")``
    would share the key ``"abc"``.
    """
    pair_counts = defaultdict(int)
    for elements, count in element_counts.items():
        # zip against the one-shifted tuple walks every neighbouring pair.
        for left, right in zip(elements, elements[1:]):
            pair_counts[left + right] += count
    return pair_counts

In [165]:
# Delimiters at which the text is cut before pre-tokenization.
special_tokens = ['<|endoftext|>']
# Plain literal: the path has no placeholders or backslashes, so neither an
# f-string nor a raw-string prefix is needed.
input_path = './data/test.txt'
# Re-declared so this cell is self-contained; keep it identical to the
# definition of PAT earlier in the notebook.
PAT = r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""

In [None]:
# To run on the file at `input_path` instead of the toy sentence, use:
# string = load_txt_as_str(input_path)
string = 'hi. i\'m yifan li. nice to meet you. this the what when here where'
# Pipeline: cut at special tokens -> count pre-tokens -> explode each
# pre-token into characters -> count adjacent character pairs.
string_list = split_string(string, special_tokens)
counts = get_tok_counts(string_list)
element_counts = get_element_counts(counts)
pair_counts = get_pair_counts(element_counts)

In [168]:
# Character-tuple form of every pre-token together with its frequency.
element_counts

defaultdict(int,
            {('h', 'i'): 1,
             ('.',): 3,
             (' ', 'i'): 1,
             ("'", 'm'): 1,
             (' ', 'y', 'i', 'f', 'a', 'n'): 1,
             (' ', 'l', 'i'): 1,
             (' ', 'n', 'i', 'c', 'e'): 1,
             (' ', 't', 'o'): 1,
             (' ', 'm', 'e', 'e', 't'): 1,
             (' ', 'y', 'o', 'u'): 1,
             (' ', 't', 'h', 'i', 's'): 1,
             (' ', 't', 'h', 'e'): 1,
             (' ', 'w', 'h', 'a', 't'): 1,
             (' ', 'w', 'h', 'e', 'n'): 1,
             (' ', 'h', 'e', 'r', 'e'): 1,
             (' ', 'w', 'h', 'e', 'r', 'e'): 1})

In [150]:
# Most frequent adjacent pair. On ties max() keeps the first maximal key it
# encounters, i.e. the one inserted into pair_counts earliest.
max(pair_counts, key=pair_counts.get)

'he'

In [160]:
# Full pair-frequency table for inspection.
pair_counts

defaultdict(int,
            {'hi': 2,
             ' i': 1,
             "'m": 1,
             ' y': 2,
             'yi': 1,
             'if': 1,
             'fa': 1,
             'an': 1,
             ' l': 1,
             'li': 1,
             ' n': 1,
             'ni': 1,
             'ic': 1,
             'ce': 1,
             ' t': 3,
             'to': 1,
             ' m': 1,
             'me': 1,
             'ee': 1,
             'et': 1,
             'yo': 1,
             'ou': 1,
             'th': 2,
             'is': 1,
             'he': 4,
             ' w': 3,
             'wh': 3,
             'ha': 1,
             'at': 1,
             'en': 1,
             ' h': 1,
             'er': 2,
             're': 2})

In [148]:
# Pair-frequency table displayed once more (same content as the display cell above).
pair_counts

defaultdict(int,
            {'hi': 2,
             ' i': 1,
             "'m": 1,
             ' y': 2,
             'yi': 1,
             'if': 1,
             'fa': 1,
             'an': 1,
             ' l': 1,
             'li': 1,
             ' n': 1,
             'ni': 1,
             'ic': 1,
             'ce': 1,
             ' t': 3,
             'to': 1,
             ' m': 1,
             'me': 1,
             'ee': 1,
             'et': 1,
             'yo': 1,
             'ou': 1,
             'th': 2,
             'is': 1,
             'he': 4,
             ' w': 3,
             'wh': 3,
             'ha': 1,
             'at': 1,
             'en': 1,
             ' h': 1,
             'er': 2,
             're': 2})