In [56]:
from transformers import GPT2Tokenizer

# Load the pretrained GPT-2 tokenizer identified by the "gpt2" checkpoint name.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

In [57]:
# Sample text mixing ASCII, an emoji and CJK characters, so that multi-byte
# UTF-8 characters are exercised by every tokenizer in this notebook.
string = "Hello, 🌍! 你好!"

# Encode once and reuse the ids; later cells read `tokens`.
tokens = tokenizer.encode(string)
print(tokens)

[15496, 11, 12520, 234, 235, 0, 220, 19526, 254, 25001, 121, 0]


In [58]:
# Decode each token id on its own to see which piece of text it stands for.
# NOTE(review): ids that cover only part of a multi-byte UTF-8 character presumably
# cannot be decoded alone, which would explain the replacement characters printed.
for token in tokens:
    print(f"{token} -> ", tokenizer.decode(token))



15496 ->  Hello
11 ->  ,
12520 ->   ÔøΩ
234 ->  ÔøΩ
235 ->  ÔøΩ
0 ->  !
220 ->   
19526 ->  ÔøΩ
254 ->  ÔøΩ
25001 ->  ÔøΩ
121 ->  ÔøΩ
0 ->  !


In [59]:
def compression_ratio(string: str, tokens: list) -> float:
    """Return how many UTF-8 bytes of text each token covers on average.

    Args:
        string: The text that was tokenized.
        tokens: The tokens produced for ``string``; only its length is used.

    Returns:
        The UTF-8 byte length of ``string`` divided by ``len(tokens)``.

    Raises:
        ZeroDivisionError: If ``tokens`` is empty.
    """
    utf8_byte_count = len(bytes(string, "utf-8"))
    return utf8_byte_count / len(tokens)

In [60]:
# Compression ratio = UTF-8 byte length of the text divided by the token count.
# Computed by hand first, then through compression_ratio() as a cross-check.
original_size = len(string.encode("utf-8"))
num_tokens = len(tokens)

print("original_size -> ", original_size)
print("num_tokens -> ", num_tokens)
print("compression -> ", original_size / num_tokens)
print("compression_formula -> ", compression_ratio(string, tokens))

original_size ->  20
num_tokens ->  12
compression ->  1.6666666666666667
compression_formula ->  1.6666666666666667


In [75]:
# Tokenizer options
# Option 1: character-level tokenization, where every character is one token.
# Each distinct character is mapped to an integer id, in order of first appearance.
# TODO: decide whether a separate vocabulary catalog is needed.

vocab_dict = {}
char_tokens = []
for char in string:
    print(char)
    # setdefault stores the next free id (the current vocabulary size) the first
    # time a character is seen and returns the already-assigned id otherwise.
    char_tokens.append(vocab_dict.setdefault(char, len(vocab_dict)))

# Next unused id, equal to the number of distinct characters seen.
index_cnt = len(vocab_dict)

# Show the vocabulary and the token ids.
vocab_dict, char_tokens

H
e
l
l
o
,
 
🌍
!
 
你
好
!


({'H': 0,
  'e': 1,
  'l': 2,
  'o': 3,
  ',': 4,
  ' ': 5,
  '🌍': 6,
  '!': 7,
  '你': 8,
  '好': 9},
 [0, 1, 2, 2, 3, 4, 5, 6, 7, 5, 8, 9, 7])

In [62]:
# Bytes-per-token ratio for the character-level tokens (char_tokens).
print("char compression -> , ", compression_ratio(string=string, tokens=char_tokens))

char compression -> ,  1.5384615384615385


In [65]:
# ord() returns the Unicode code point of a single character.
ord("a")

97

In [69]:
# chr() is the inverse of ord(): it returns the character for a code point.
chr(97)

'a'

In [72]:
# Byte-based tokenization
# Any unicode string can be encoded as a sequence of bytes, and each byte is an
# integer between 0 and 255. UTF-8 is used here: ASCII characters take one
# byte, all other characters take several.
print(bytes("a", encoding="utf-8"))
print(bytes('🌎', encoding="utf-8"))

b'a'
b'\xf0\x9f\x8c\x8e'


In [91]:
# Read a single byte as an unsigned integer (0xF0 == 240).
# byteorder is passed explicitly because it only has a default from Python 3.11 on.
int.from_bytes(b'\xf0', byteorder="big")

240

In [101]:
# The globe emoji is a single character but needs four bytes in UTF-8.
len(bytes('🌎', encoding="utf-8"))

4

In [106]:
def byte_char_encoding(string: str) -> list:
    """Tokenize a string into its UTF-8 byte values.

    The string is walked character by character. Each character is encoded
    to UTF-8 and contributes one token per byte, so a character that needs
    more than one byte yields several consecutive tokens.

    Args:
        string: Text to tokenize.

    Returns:
        A list of integers in the range 0-255, one per UTF-8 byte, in the
        order the bytes appear in the encoded text. Empty for an empty string.
    """
    byte_tokens = []
    for char in string:
        # Iterating a bytes object yields ints, so single-byte and multi-byte
        # characters are handled by the same code path.
        byte_tokens.extend(bytes(char, encoding="utf-8"))
    return byte_tokens


# Tokenize the sample string byte by byte and show the resulting ids.
byte_tokens = byte_char_encoding(string=string)
print("byte tokens: ", byte_tokens)


byte tokens:  [72, 101, 108, 108, 111, 44, 32, 240, 159, 140, 141, 33, 32, 228, 189, 160, 229, 165, 189, 33]


In [108]:
# Bytes-per-token ratio of the byte-level tokenization.
print("byter compression -> , ", compression_ratio(string=string, 
                                                   tokens=byte_tokens))

# The ratio is 1.0 because every UTF-8 byte becomes its own token, so the
# token sequence is no shorter than the raw bytes (no compression at all).

byter compression -> ,  1.0


In [116]:
# chr() turns the value 72 (the first entry of byte_tokens) back into its character.
chr(72)

'H'

In [138]:
# Word-based tokenization
# 1. Define a regex that describes the delimiters.
# 2. Split the sentence on it.
import re

# The delimiters are comma, space, "!" and "?". Because the character class is
# wrapped in a capturing group, re.split keeps every delimiter as its own item.
WORD_REGEX = r'([, !?])'

sentence_word = re.split(WORD_REGEX, string)



In [139]:
# Show the original string followed by its split form on the next line.
print(string,"\n", sentence_word) 

Hello, üåç! ‰Ω†Â•Ω! 
 ['Hello', ',', '', ' ', 'üåç', '!', '', ' ', '‰Ω†Â•Ω', '!', '']


In [140]:
# Drop the empty strings that the split produced; keep everything else in order.
clean_sentence = list(filter(lambda piece: piece != "", sentence_word))

In [141]:
# Display the split sentence with the empty strings removed.
clean_sentence

['Hello', ',', ' ', '🌍', '!', ' ', '你好', '!']

In [None]:
vocab = {}
