<a href="https://colab.research.google.com/github/x-kshadab/diffusion01/blob/main/txs/01_tx.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import urllib.request

In [2]:
url="https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt"

try:
  with urllib.request.urlopen(url) as response:
    text_content= response.read().decode('utf-8')
except urllib.error.URLError as e:
  print(f"Error in accessing URL : {e}")
except Exception as e:
  print(f"An unknown exception occured: {e}")

In [3]:
print(len(text_content))

20479


In [4]:
import re

In [16]:
rule=r'([,.:;?_!"()\']|--|\s)'

In [17]:
test_text = "Hello, it's me Sharon!, I am waiting for you --. or am I ?"
test_tokens= re.split(rule, test_text)
print(test_tokens)

## removing white space
test_tokens = [token for token in test_tokens if token.strip()]
print(test_tokens)


['Hello', ',', '', ' ', 'it', "'", 's', ' ', 'me', ' ', 'Sharon', '!', '', ',', '', ' ', 'I', ' ', 'am', ' ', 'waiting', ' ', 'for', ' ', 'you', ' ', '', '--', '', '.', '', ' ', 'or', ' ', 'am', ' ', 'I', ' ', '', '?', '']
['Hello', ',', 'it', "'", 's', 'me', 'Sharon', '!', ',', 'I', 'am', 'waiting', 'for', 'you', '--', '.', 'or', 'am', 'I', '?']


In [9]:
preprocessed_tokens= re.split(r'([,.:;"()_!?\']|--|\s)', text_content)
preprocessed_tokens = [token for token in preprocessed_tokens if token.strip()]

In [10]:
len(preprocessed_tokens)

4690

In [13]:
## set to remove duplicates
preprocessed_tokens = sorted(set(preprocessed_tokens))

vocab= {token: index for index, token in enumerate(preprocessed_tokens)}

for i, item in enumerate(vocab.items()):
  print(item)
  if i >10:
    break

('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)
('.', 7)
(':', 8)
(';', 9)
('?', 10)
('A', 11)


In [25]:
class SimpleTokenizerV1:
  def __init__(self, vocab):
    self.str_to_int= vocab
    self.int_to_str= {id:token for token, id in vocab.items()}

  def encode(self, text):
    tokens= re.split(rule, text)
    tokens= [token.strip() for token in tokens if token.strip()]
    token_ids= [self.str_to_int[token] for token in tokens]
    return token_ids

  def decode(self, ids):
    tokens_initial= [self.int_to_str[id] for id in ids]
    tokens_string= " ".join(tokens_initial)
    ## adding spaces
    text=re.sub(r'\s+([,.?!"()\'])', r'\1', tokens_string)
    return text


In [26]:
tokenizer= SimpleTokenizerV1(vocab)
test_text = """"It's the last he painted, you know," Mrs. Gisburn said with pardonable pride."""
test_token_ids= tokenizer.encode(test_text)
print(test_token_ids)

[1, 56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]


In [27]:
print(tokenizer.decode(test_token_ids))

" It' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.


In [28]:
## Adding special tokens
print(len(preprocessed_tokens))
preprocessed_tokens.extend(["<|endoftext|>","<|unk|>"])
print(len(preprocessed_tokens))

1132
1134


In [29]:
vocab= {token: index for index, token in enumerate(preprocessed_tokens)}
for i, item in enumerate(list(vocab.items())[-5:]):
  print(item)

('younger', 1127)
('your', 1128)
('yourself', 1129)
('<|endoftext|>', 1132)
('<|unk|>', 1133)


In [50]:
class SimpleTokenizerV2:
  def __init__(self, vocab):
    self.str_to_int= vocab
    self.int_to_str= {index:token for token, index in vocab.items()}

  def encode(self, text):
    tokens = re.split(rule, text)
    print(tokens)
    tokens= [token.strip() for token in tokens if token.strip()]
    tokens = [token if token in self.str_to_int else "<|unk|>" for token in tokens]
    ids= [self.str_to_int[token] for token in tokens]
    return ids

  def decode(self, ids):
    tokens_initial = [self.int_to_str[id] for id in ids]
    tokens_string = " ".join(tokens_initial)
    ## adding spaces
    text=re.sub(r'\s+([,.?!"()\'])', r'\1', tokens_string)
    return text

In [51]:
test_text_1="Hello, do you like tea?"
test_text_2 = "In the sunlit terraces of the palace."
test_text= "  <|endoftext|> ".join((test_text_1, test_text_2))
print(test_text)

Hello, do you like tea?  <|endoftext|> In the sunlit terraces of the palace.


In [52]:
tokenizerV2= SimpleTokenizerV2(vocab)
text_to_ids= tokenizerV2.encode(test_text)
print(text_to_ids)
print(tokenizerV2.decode(text_to_ids))

['Hello', ',', '', ' ', 'do', ' ', 'you', ' ', 'like', ' ', 'tea', '?', '', ' ', '', ' ', '<|endoftext|>', ' ', 'In', ' ', 'the', ' ', 'sunlit', ' ', 'terraces', ' ', 'of', ' ', 'the', ' ', 'palace', '.', '']
[1133, 5, 355, 1126, 628, 975, 10, 1132, 55, 988, 956, 984, 722, 988, 1133, 7]
<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.


In [54]:
import tiktoken
tiktoken.__version__

'0.11.0'

In [56]:
tokenizer = tiktoken.get_encoding('gpt2')

In [65]:
text = "Hello, do you like tea? <|endoftext|> In the sunlit terraces of someunknownPlace."
ids=tokenizer.encode(text, allowed_special={'<|endoftext|>'})
print(ids)


[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 286, 617, 34680, 27271, 13]


In [66]:
print(tokenizer.decode(ids))

Hello, do you like tea? <|endoftext|> In the sunlit terraces of someunknownPlace.
