In [99]:
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

In [100]:
tokenizer.get_vocab()

{'!': 0,
 '"': 1,
 '#': 2,
 '$': 3,
 '%': 4,
 '&': 5,
 "'": 6,
 '(': 7,
 ')': 8,
 '*': 9,
 '+': 10,
 ',': 11,
 '-': 12,
 '.': 13,
 '/': 14,
 '0': 15,
 '1': 16,
 '2': 17,
 '3': 18,
 '4': 19,
 '5': 20,
 '6': 21,
 '7': 22,
 '8': 23,
 '9': 24,
 ':': 25,
 ';': 26,
 '<': 27,
 '=': 28,
 '>': 29,
 '?': 30,
 '@': 31,
 'A': 32,
 'B': 33,
 'C': 34,
 'D': 35,
 'E': 36,
 'F': 37,
 'G': 38,
 'H': 39,
 'I': 40,
 'J': 41,
 'K': 42,
 'L': 43,
 'M': 44,
 'N': 45,
 'O': 46,
 'P': 47,
 'Q': 48,
 'R': 49,
 'S': 50,
 'T': 51,
 'U': 52,
 'V': 53,
 'W': 54,
 'X': 55,
 'Y': 56,
 'Z': 57,
 '[': 58,
 '\\': 59,
 ']': 60,
 '^': 61,
 '_': 62,
 '`': 63,
 'a': 64,
 'b': 65,
 'c': 66,
 'd': 67,
 'e': 68,
 'f': 69,
 'g': 70,
 'h': 71,
 'i': 72,
 'j': 73,
 'k': 74,
 'l': 75,
 'm': 76,
 'n': 77,
 'o': 78,
 'p': 79,
 'q': 80,
 'r': 81,
 's': 82,
 't': 83,
 'u': 84,
 'v': 85,
 'w': 86,
 'x': 87,
 'y': 88,
 'z': 89,
 '{': 90,
 '|': 91,
 '}': 92,
 '~': 93,
 '¬°': 94,
 '¬¢': 95,
 '¬£': 96,
 '¬§': 97,
 '¬•': 98,
 '¬¶': 99,
 '¬

In [44]:
string = "Hello, üåç! ‰Ω†Â•Ω!"
print(tokenizer.encode(string))
tokens = tokenizer.encode(string)

[15496, 11, 12520, 234, 235, 0, 220, 19526, 254, 25001, 121, 0]


In [45]:
# decode single value
for token in tokens:
    print(f"{token} -> ", tokenizer.decode(token))



15496 ->  Hello
11 ->  ,
12520 ->   ÔøΩ
234 ->  ÔøΩ
235 ->  ÔøΩ
0 ->  !
220 ->   
19526 ->  ÔøΩ
254 ->  ÔøΩ
25001 ->  ÔøΩ
121 ->  ÔøΩ
0 ->  !


In [46]:
def compression_ratio(string:str, tokens:list):
    original_size = len(bytes(string, encoding="utf-8"))
    num_tokens = len(tokens)
    return original_size/num_tokens

In [47]:
# compression ratio
original_size = len(bytes(string, encoding="utf-8"))
num_tokens = len(tokens)


print("original_size -> ", original_size)
print("num_tokens -> ", num_tokens)
print("compression -> ", original_size/num_tokens)
print("compression_formula -> ", compression_ratio(string=string, tokens=tokens))

original_size ->  20
num_tokens ->  12
compression ->  1.6666666666666667
compression_formula ->  1.6666666666666667


In [48]:
# Options for tokenizes
# Option 1: Each character a token
# Lets then map each character to a number
# do I need catalog? 

vocab_dict = {}
index_cnt = 0
char_tokens = []
for char in string: 
    print(char)
    if char not in vocab_dict:
        vocab_dict[char] = index_cnt
        char_tokens.append(index_cnt)
        index_cnt += 1

    else:
        char_tokens.append(vocab_dict[char])

#tokens
vocab_dict, char_tokens

H
e
l
l
o
,
 
üåç
!
 
‰Ω†
Â•Ω
!


({'H': 0,
  'e': 1,
  'l': 2,
  'o': 3,
  ',': 4,
  ' ': 5,
  'üåç': 6,
  '!': 7,
  '‰Ω†': 8,
  'Â•Ω': 9},
 [0, 1, 2, 2, 3, 4, 5, 6, 7, 5, 8, 9, 7])

In [49]:
print("char compression -> , ", compression_ratio(string=string, tokens=char_tokens))

char compression -> ,  1.5384615384615385


In [50]:
ord("a")

97

In [51]:
chr(97)

'a'

In [52]:
# Byte-based tokenization
# unicode strings can be represented as integers between 0 and 255
# utf-8 representation
print(bytes("a", encoding="utf-8"))
print(bytes('üåé', encoding="utf-8"))

b'a'
b'\xf0\x9f\x8c\x8e'


In [53]:
int.from_bytes(b'\xf0')

240

In [54]:
len(bytes('üåé', encoding="utf-8"))

4

In [55]:
def byte_char_encoding(string:str):
    """ Splits string in separate characters
     Split in single bytes, case a character is represented by more than 1 byte. 
       """
    vocab_dict = {}

    char_tokens = []
    for char in string: 
        #print(bytes(char, encoding="utf-8"))
        char_bytes = bytes(char, encoding="utf-8")
        if len(char_bytes) == 0:
            byte = char_bytes[0]
            vocab_dict[char] = char_bytes
            char_tokens.append(byte)

        else: 
            vocab_dict[char] = char_bytes

            for b in char_bytes:
                char_tokens.append(b)

    return char_tokens


byte_tokens = byte_char_encoding(string=string)
print("byte tokens: ", byte_tokens)


byte tokens:  [72, 101, 108, 108, 111, 44, 32, 240, 159, 140, 141, 33, 32, 228, 189, 160, 229, 165, 189, 33]


In [56]:
print("byter compression -> , ", compression_ratio(string=string, 
                                                   tokens=byte_tokens))

# is 1, which is the worst possible compression

byter compression -> ,  1.0


In [57]:
chr(72)

'H'

In [58]:
# Word based tokenization
# 1st define regez to split the sentence
# Then split
WORD_REGEX = r'([, !?])'  # Lets use just space for now

import re
sentence_word = re.split(pattern=WORD_REGEX, string=string)



In [59]:
print(string,"\n", sentence_word) 

Hello, üåç! ‰Ω†Â•Ω! 
 ['Hello', ',', '', ' ', 'üåç', '!', '', ' ', '‰Ω†Â•Ω', '!', '']


In [60]:
clean_sentence = [word for word in sentence_word if word != ""]

In [61]:
clean_sentence

['Hello', ',', ' ', 'üåç', '!', ' ', '‰Ω†Â•Ω', '!']

In [62]:
vocab = {}
index = 0
word_tokens = []
for word in clean_sentence:
    if word not in vocab: 
        vocab[word] = index
        word_tokens.append(index)
        index += 1
    else:
        word_tokens.append(vocab[word])

In [63]:
clean_sentence, string

(['Hello', ',', ' ', 'üåç', '!', ' ', '‰Ω†Â•Ω', '!'], 'Hello, üåç! ‰Ω†Â•Ω!')

In [64]:
word_tokens

[0, 1, 2, 3, 4, 2, 5, 4]

In [65]:
compression_ratio(string=string, tokens=word_tokens)

2.5

# Byte Pair Encoding (BPE)


In [66]:
#### Step 1: Convert each char into a single byte
#### Step 2: Count each adjacent pair
#### Step 3: Merge the top1 most frequent
#### Step 4: Repeat with a fix limit of merges ??? 


In [67]:
sentence = "the cat is in the hat"

In [68]:
bytes('üåé', encoding="utf-8")
ord('üåé')

127758

In [69]:
char_bytes = [ord(char) for char in sentence]

counts_dict = {}
for i in range(len(char_bytes)-1):
    pair =  (char_bytes[i],char_bytes[i+1])
    if pair not in counts_dict:
        counts_dict[pair] = 0
    else: 
        counts_dict[pair] += 1


In [70]:
counts_dict = {k: v for k, v in sorted(counts_dict.items(), key=lambda item: item[1], reverse=True)}
print(counts_dict)


{(116, 104): 1, (104, 101): 1, (101, 32): 1, (97, 116): 1, (32, 105): 1, (32, 99): 0, (99, 97): 0, (116, 32): 0, (105, 115): 0, (115, 32): 0, (105, 110): 0, (110, 32): 0, (32, 116): 0, (32, 104): 0, (104, 97): 0}


In [71]:
counts_dict

{(116, 104): 1,
 (104, 101): 1,
 (101, 32): 1,
 (97, 116): 1,
 (32, 105): 1,
 (32, 99): 0,
 (99, 97): 0,
 (116, 32): 0,
 (105, 115): 0,
 (115, 32): 0,
 (105, 110): 0,
 (110, 32): 0,
 (32, 116): 0,
 (32, 104): 0,
 (104, 97): 0}

In [72]:
to_merge = next(iter(counts_dict))

In [73]:
to_merge

(116, 104)

In [74]:
# merge
next_token = 257
merge_position = char_bytes.index(to_merge[0])

char_bytes.pop(merge_position)
# repeat since the list will move left 
char_bytes.pop(merge_position)

104

In [75]:
char_bytes 

[101,
 32,
 99,
 97,
 116,
 32,
 105,
 115,
 32,
 105,
 110,
 32,
 116,
 104,
 101,
 32,
 104,
 97,
 116]

In [76]:
# insert new token on the removed indexes
char_bytes.insert(merge_position, next_token)
next_token += 1

In [77]:
char_bytes

[257,
 101,
 32,
 99,
 97,
 116,
 32,
 105,
 115,
 32,
 105,
 110,
 32,
 116,
 104,
 101,
 32,
 104,
 97,
 116]

In [None]:
#### Step 1: Convert each char into a single byte

class BytePairEncoder:
    
    def __init__(self, num_merges:int=10):
        self.num_merges = num_merges
        self.vocab = {}
        self.token_dict = {}

    def train(self):
        pass

    
    def encode(self, sentence:str):
        NEXT_TOKEN = 256
        char_bytes = [ord(char) for char in sentence]
        for _ in range(self.num_merges):
            new_chars = []

            # pairs counter
            counts_dict = {}
            for i in range(len(char_bytes)-1):
                pair =  (char_bytes[i],char_bytes[i+1])
                if pair not in counts_dict:
                    counts_dict[pair] = 1
                else: 
                    counts_dict[pair] += 1
            

            # order
            counts_dict = {k: v for k, v in sorted(counts_dict.items(), key=lambda item: item[1], reverse=True)}

            # find pair to merge
            to_merge_key = next(iter(counts_dict))
            print("to merge:  ", to_merge_key)

            
            # remove merged
            pointer = 0
            while pointer < len(char_bytes)-1: 
                if char_bytes[pointer] ==  to_merge_key[0] and char_bytes[pointer + 1] == to_merge_key[1]:
                    new_chars.append(NEXT_TOKEN)
                    pointer += 2

                else: 
                    new_chars.append(char_bytes[pointer])
                    pointer += 1
            NEXT_TOKEN += 1
            if pointer != len(char_bytes):
                new_chars.append(char_bytes[pointer])
            
            char_bytes = new_chars
            
        return char_bytes


    def decode(self):
        pass
    



In [79]:
len(char_bytes)

20

In [80]:
sentence

'the cat is in the hat'

In [81]:
from bpetokenizer import BPETokenizer
tokenizer = BPETokenizer()
tokenizer.train(sentence, vocab_size=257)
print(tokenizer.encode(sentence))

[256, 101, 32, 99, 97, 116, 32, 105, 115, 32, 105, 110, 32, 256, 101, 32, 104, 97, 116]


In [93]:
bpe = BytePairEncoder(num_merges=1)
result = bpe.encode(sentence=sentence)
print(result, len(result))
bpe = BytePairEncoder(num_merges=2)
result = bpe.encode(sentence=sentence)
print(result, len(result))

bpe = BytePairEncoder(num_merges=3)
result = bpe.encode(sentence=sentence)
print(result, len(result))


to merge:   (116, 104)
[256, 101, 32, 99, 97, 116, 32, 105, 115, 32, 105, 110, 32, 256, 101, 32, 104, 97, 116] 19
to merge:   (116, 104)
to merge:   (256, 101)
[257, 32, 99, 97, 116, 32, 105, 115, 32, 105, 110, 32, 257, 32, 104, 97, 116] 17
to merge:   (116, 104)
to merge:   (256, 101)
to merge:   (257, 32)
[258, 99, 97, 116, 32, 105, 115, 32, 105, 110, 32, 258, 104, 97, 116] 15


In [83]:
print(bpe.encode(sentence=sentence))
print(tokenizer.encode(sentence))

to merge:   (116, 104)
[256, 101, 32, 99, 97, 116, 32, 105, 115, 32, 105, 110, 32, 256, 101, 32, 104, 97, 116]
[256, 101, 32, 99, 97, 116, 32, 105, 115, 32, 105, 110, 32, 256, 101, 32, 104, 97, 116]


In [None]:
import time
num_merges = 5
for i in range(1,num_merges+1):
    vocab = 256 + i
    print("vocab is ", vocab)
    tokenizer = BPETokenizer()
    tic = time.time()
    tokenizer.train(sentence, vocab_size=vocab)
    toc = time.time()
    print("my bpe takes: ", toc-tic)
    bpe = BytePairEncoder(num_merges=i)
    print("my bpe ", bpe.encode(sentence=sentence))
    print("   lib:", tokenizer.encode(sentence))



vocab is  257
to merge:   (116, 104)
my bpe  [256, 101, 32, 99, 97, 116, 32, 105, 115, 32, 105, 110, 32, 256, 101, 32, 104, 97, 116]
   lib: [256, 101, 32, 99, 97, 116, 32, 105, 115, 32, 105, 110, 32, 256, 101, 32, 104, 97, 116]
vocab is  258
to merge:   (116, 104)
to merge:   (256, 101)
my bpe  [257, 32, 99, 97, 116, 32, 105, 115, 32, 105, 110, 32, 257, 32, 104, 97, 116]
   lib: [257, 32, 99, 97, 116, 32, 105, 115, 32, 105, 110, 32, 257, 32, 104, 97, 116]
vocab is  259
to merge:   (116, 104)
to merge:   (256, 101)
to merge:   (257, 32)
my bpe  [258, 99, 97, 116, 32, 105, 115, 32, 105, 110, 32, 258, 104, 97, 116]
   lib: [257, 32, 99, 258, 32, 105, 115, 32, 105, 110, 32, 257, 32, 104, 258]
vocab is  260
to merge:   (116, 104)
to merge:   (256, 101)
to merge:   (257, 32)
to merge:   (97, 116)
my bpe  [258, 99, 259, 32, 105, 115, 32, 105, 110, 32, 258, 104, 259]
   lib: [257, 32, 99, 258, 259, 115, 259, 110, 32, 257, 32, 104, 258]
vocab is  261
to merge:   (116, 104)
to merge:   (256, 10