<a href="https://colab.research.google.com/github/vardhanreddy2003/GPT-2Training/blob/main/Tokenizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the entire text file into one string; later cells tokenize `raw_text`.
with open("verdict.txt", encoding="utf-8") as story_file:
  raw_text = story_file.read()

print("total number of characters:", len(raw_text))

total number of characters: 20479


In [None]:
import re

In [None]:
# Sample sentence used to try out the splitting rule.
text = "Hello, world. This, is a test."

# The capture group makes the split keep the delimiters (comma, period,
# whitespace) in the output instead of discarding them. Adjacent delimiters
# produce empty strings, which are filtered out in a later step.
result = re.compile(r'([,.]|\s)').split(text)
print(result)

['Hello', ',', '', ' ', 'world', '.', '', ' ', 'This', ',', '', ' ', 'is', ' ', 'a', ' ', 'test', '.', '']


In [None]:
# Keep only the pieces that still contain something after stripping, i.e.
# drop the empty strings and the whitespace-only entries left by the split.
result = list(filter(str.strip, result))
print(result)

['Hello', ',', 'world', '.', 'This', ',', 'is', 'a', 'test', '.']


In [None]:
# Tokenize the whole text: split on punctuation, on "--" and on whitespace
# (delimiters are kept by the capture group), then strip every piece and
# discard the ones that end up empty.
preprocessed = [
    piece.strip()
    for piece in re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
    if piece.strip()
]
print(preprocessed[:30])

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']


In [None]:
# Number of tokens in the tokenized text (a list, so repeated tokens count).
print(len(preprocessed))

4690


In [None]:
# The vocabulary is the set of distinct tokens, put into sorted order.
vocab = list(set(preprocessed))
vocab.sort()

vocab_size = len(vocab)
print("vocab size:", vocab_size)

vocab size: 1130


In [None]:
# Peek at the first 30 entries of the sorted vocabulary.
print(vocab[:30])

['!', '"', "'", '(', ')', ',', '--', '.', ':', ';', '?', 'A', 'Ah', 'Among', 'And', 'Are', 'Arrt', 'As', 'At', 'Be', 'Begin', 'Burlington', 'But', 'By', 'Carlo', 'Chicago', 'Claude', 'Come', 'Croft', 'Destroyed']


In [None]:
# Show the position and token for the first 52 vocabulary entries (0..51).
for integer, token in zip(range(52), vocab):
  print(integer, token)

0 !
1 "
2 '
3 (
4 )
5 ,
6 --
7 .
8 :
9 ;
10 ?
11 A
12 Ah
13 Among
14 And
15 Are
16 Arrt
17 As
18 At
19 Be
20 Begin
21 Burlington
22 But
23 By
24 Carlo
25 Chicago
26 Claude
27 Come
28 Croft
29 Destroyed
30 Devonshire
31 Don
32 Dubarry
33 Emperors
34 Florence
35 For
36 Gallery
37 Gideon
38 Gisburn
39 Gisburns
40 Grafton
41 Greek
42 Grindle
43 Grindles
44 HAD
45 Had
46 Hang
47 Has
48 He
49 Her
50 Hermia
51 His


In [None]:
# Replace the sorted token list with a token -> integer id lookup table,
# where each token's id is its position in the sorted order.
vocab = dict(zip(vocab, range(len(vocab))))

In [None]:
# Walk the mapping in insertion order and stop right after printing the
# first entry whose id is 51 or higher.
for token, integer in vocab.items():
  print(token, integer)
  if integer >= 51:
    break

! 0
" 1
' 2
( 3
) 4
, 5
-- 6
. 7
: 8
; 9
? 10
A 11
Ah 12
Among 13
And 14
Are 15
Arrt 16
As 17
At 18
Be 19
Begin 20
Burlington 21
But 22
By 23
Carlo 24
Chicago 25
Claude 26
Come 27
Croft 28
Destroyed 29
Devonshire 30
Don 31
Dubarry 32
Emperors 33
Florence 34
For 35
Gallery 36
Gideon 37
Gisburn 38
Gisburns 39
Grafton 40
Greek 41
Grindle 42
Grindles 43
HAD 44
Had 45
Hang 46
Has 47
He 48
Her 49
Hermia 50
His 51


In [None]:
class SimpleTokenizerV1:
  """Word-level tokenizer backed by a fixed vocabulary.

  Text is split on whitespace, on ``--`` and on the punctuation characters
  ``, . : ; ? _ ! " ( ) '``; every resulting token is looked up in the
  vocabulary. There is no fallback for tokens missing from the vocabulary.
  """

  # Splits text into tokens; the capture group keeps the delimiters.
  _SPLIT_PATTERN = r'([,.:;?_!"()\']|--|\s)'
  # Whitespace that " ".join() places in front of a punctuation token.
  # Includes ':' and ';' because encode() splits them out as tokens too.
  _SPACE_BEFORE_PUNCT = r'\s+([,.:;?!"()\'])'

  def __init__(self,vocab):
    """Store the vocabulary and build the reverse lookup.

    Args:
      vocab: dict mapping token string -> integer id.
    """
    self.str_to_int=vocab
    self.int_to_str={i:s for s,i in vocab.items()}

  def encode(self,text):
    """Tokenize ``text`` and return the list of token ids.

    Raises:
      KeyError: if a token is not present in the vocabulary.
    """
    pieces = re.split(self._SPLIT_PATTERN, text)
    # Strip each piece and drop the ones that are empty or whitespace-only.
    tokens = [piece.strip() for piece in pieces if piece.strip()]
    return [self.str_to_int[token] for token in tokens]

  def decode(self,ids):
    """Turn a sequence of token ids back into a string.

    Tokens are joined with single spaces, then the whitespace in front of
    ``, . : ; ? ! " ( ) '`` is removed. This is not an exact inverse of
    ``encode``: original whitespace is not restored and a space remains
    after quotes and apostrophes.

    Raises:
      KeyError: if an id is not present in the vocabulary.
    """
    text = " ".join(self.int_to_str[i] for i in ids)
    return re.sub(self._SPACE_BEFORE_PUNCT, r'\1', text)



In [None]:
# Build the tokenizer from the token -> id vocabulary created above.
tokenizer = SimpleTokenizerV1(vocab)

# Sample passage to encode. The literal starts with a double-quote character
# right after the opening triple quotes, and the line break plus indentation
# inside it are part of the string (encode splits on whitespace).
text = """"It's the last he painted, you know,"
           Mrs. Gisburn said with pardonable pride."""
ids = tokenizer.encode(text)
print(ids)

[1, 56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]


In [None]:
# Map the ids back to text. The result differs from the input: tokens are
# re-joined with single spaces and only the space in front of punctuation
# is removed.
tokenizer.decode(ids)

'" It\' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.'

In [None]:
class SimpleTokenizerV2:
  """Word-level tokenizer that maps unknown tokens to ``<|unk|>``.

  Text is split on whitespace, on ``--`` and on the punctuation characters
  ``, . : ; ? _ ! " ( ) '``. Tokens that are not in the vocabulary are
  replaced by the ``<|unk|>`` token before the id lookup.
  """

  # Placeholder used for any token missing from the vocabulary.
  _UNK_TOKEN = "<|unk|>"
  # Splits text into tokens; the capture group keeps the delimiters.
  _SPLIT_PATTERN = r'([,.:;?_!"()\']|--|\s)'
  # Whitespace that " ".join() places in front of a punctuation token.
  # Includes ':' and ';' because encode() splits them out as tokens too.
  _SPACE_BEFORE_PUNCT = r'\s+([,.:;?!"()\'])'

  def __init__(self,vocab):
    """Store the vocabulary and build the reverse lookup.

    Args:
      vocab: dict mapping token string -> integer id. It needs an entry
        for ``<|unk|>`` for unknown tokens to be encodable.
    """
    self.str_to_int=vocab
    self.int_to_str={i:s for s,i in vocab.items()}

  def encode(self,text):
    """Tokenize ``text`` and return the list of token ids.

    Raises:
      KeyError: if an unknown token is met and the vocabulary has no
        ``<|unk|>`` entry.
    """
    pieces = re.split(self._SPLIT_PATTERN, text)
    # Strip each piece and drop the ones that are empty or whitespace-only.
    tokens = [piece.strip() for piece in pieces if piece.strip()]
    return [
        self.str_to_int[token if token in self.str_to_int else self._UNK_TOKEN]
        for token in tokens
    ]

  def decode(self,ids):
    """Turn a sequence of token ids back into a string.

    Tokens are joined with single spaces, then the whitespace in front of
    ``, . : ; ? ! " ( ) '`` is removed. This is not an exact inverse of
    ``encode``: original whitespace is not restored and unknown words come
    back as ``<|unk|>``.

    Raises:
      KeyError: if an id is not present in the vocabulary.
    """
    text = " ".join(self.int_to_str[i] for i in ids)
    return re.sub(self._SPACE_BEFORE_PUNCT, r'\1', text)



In [None]:
# Sorted distinct tokens, with the two special tokens appended at the end so
# they receive the two highest ids.
all_tokens = sorted(set(preprocessed)) + ["<|endoftext|>", "<|unk|>"]

In [None]:
# Rebuild the token -> id table from the extended token list; each token's id
# is its position in `all_tokens`.
vocab = dict(zip(all_tokens, range(len(all_tokens))))

In [None]:
# Tokenizer built on the vocabulary that includes the special tokens.
tokenizer = SimpleTokenizerV2(vocab)

# Two independent text snippets ...
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."

# ... glued together with the end-of-text marker between them.
text = text1 + " <|endoftext|> " + text2

print(text)

Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.


In [None]:
# Encode the combined text; words that are not in the vocabulary are mapped
# to the id of "<|unk|>".
tokenizer.encode(text)


[1131, 5, 355, 1126, 628, 975, 10, 1130, 55, 988, 956, 984, 722, 988, 1131, 7]

In [None]:
# Round trip: unknown words come back as "<|unk|>" rather than the original
# text, since only their placeholder id was stored.
tokenizer.decode(tokenizer.encode(text))

'<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.'

In [None]:
!pip3 install tiktoken





In [None]:
import tiktoken
# Load the "gpt2" encoding from tiktoken. This rebinds `tokenizer`, which
# until now referred to the SimpleTokenizerV2 instance.
tokenizer=tiktoken.get_encoding("gpt2")

In [None]:
# NOTE(review): the two adjacent string literals are concatenated with no
# separator, so the text contains "terracesof" (visible in the decoded
# output). If that is unintended, start the second literal with a space and
# re-run the cells below so the recorded ids and text are refreshed.
text = (
    "Hello, do you like tea? <|endoftext|> In the sunlit terraces"
     "of someunknownPlace."
)

# allowed_special names the special tokens that may appear in the input;
# here "<|endoftext|>" is permitted and shows up as a single id in the output.
integers=tokenizer.encode(text,allowed_special={"<|endoftext|>"})
print(integers)

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 1659, 617, 34680, 27271, 13]


In [None]:
# Convert the token ids back into text; the printed result matches the
# encoded input, including the made-up word "someunknownPlace".
strings = tokenizer.decode(integers)

print(strings)

Hello, do you like tea? <|endoftext|> In the sunlit terracesof someunknownPlace.


In [None]:
# Encode a made-up string: it is broken into several ids instead of being
# replaced by an unknown-token placeholder.
integers=tokenizer.encode("Akwirw ier")
print(integers)

# Decoding those ids reproduces the original string exactly.
# Note: this rebinds `text`, overwriting the sample text from the earlier cell.
text=tokenizer.decode(integers)
print(text)

[33901, 86, 343, 86, 220, 959]
Akwirw ier
