# Reading GOT as text sample into python

# Step 1: Creating Tokens 

In [1]:
with open("gameofthrones.txt","r") as file:
    text=file.read()

In [2]:
print(f"Total number of characters: {len(text)}")
print(text[:99])

Total number of characters: 5662324


“We should start back,” Gared urged as the woods began to grow dark around them. “The wildlings a


#### As we are just learning the things, we'll use only fisrt 20,000 words of this text here.

In [3]:
text = text[:20000]
print(f"Total number of characters: {len(text)}")

Total number of characters: 20000


## use regular expression library to split the text.

In [4]:
import re

sentence = "Hello, world. This, is a test."
result = re.split(r"(\s)", sentence) # spliting on the basis of space( ).

print(result)

['Hello,', ' ', 'world.', ' ', 'This,', ' ', 'is', ' ', 'a', ' ', 'test.']


In [5]:
result = re.split(r"([,.]|\s)", sentence) # spliting on the basis of commas(,) and fullstops(.).

print(result)

['Hello', ',', '', ' ', 'world', '.', '', ' ', 'This', ',', '', ' ', 'is', ' ', 'a', ' ', 'test', '.', '']


#### now here is the problem that white spaces( ) are being counted as one word or token, to solve this we'll use strip function.

In [6]:
result = [item for item in result if item.strip()]
print(result)

['Hello', ',', 'world', '.', 'This', ',', 'is', 'a', 'test', '.']


### Removing white space can change the meaning of sentences

In [7]:
preprocessed = re.split(r"([,.():;?_!\"']|--|\s)", text)
preprocessed = [item for item in preprocessed if item.strip()]
# print(preprocessed)
print(f"Length of tokens: {len(preprocessed)}")

Length of tokens: 4372


In [8]:
print(preprocessed)



# Step 2: creating Token IDs

In [9]:
all_words = sorted(set(preprocessed))
vocab_size = len(all_words)

print(vocab_size)

1179


In [10]:
vocabulary = {token:integer for integer, token in enumerate(all_words)}

In [11]:
vocabulary

{'!': 0,
 ',': 1,
 '.': 2,
 ':': 3,
 ';': 4,
 '?': 5,
 'A': 6,
 'Aemon': 7,
 'Again': 8,
 'All': 9,
 'And': 10,
 'Another': 11,
 'Are': 12,
 'At': 13,
 'Be': 14,
 'Behind': 15,
 'Better': 16,
 'Black': 17,
 'Blood': 18,
 'Branches': 19,
 'Brother': 20,
 'But': 21,
 'Castle': 22,
 'Despite': 23,
 'Down': 24,
 'Each': 25,
 'Especially': 26,
 'Ever': 27,
 'Everyone': 28,
 'Everything': 29,
 'Fallen': 30,
 'Far': 31,
 'Fear': 32,
 'Finally': 33,
 'First': 34,
 'For': 35,
 'Four': 36,
 'Frostfallen': 37,
 'Gared': 38,
 'Gared’s': 39,
 'Guard': 40,
 'He': 41,
 'Heavy-looking': 42,
 'His': 43,
 'I': 44,
 'If': 45,
 'In': 46,
 'It': 47,
 'Its': 48,
 'It’s': 49,
 'I’ll': 50,
 'Jewels': 51,
 'Leave': 52,
 'Look': 53,
 'Lying': 54,
 'Maester': 55,
 'Mallister': 56,
 'Mallisters’': 57,
 'Men': 58,
 'Moonlight': 59,
 'Most': 60,
 'Mounted': 61,
 'Night’s': 62,
 'Nine': 63,
 'No': 64,
 'Not': 65,
 'Nothing': 66,
 'One': 67,
 'Other': 68,
 'Others': 69,
 'Other’s': 70,
 'Pale': 71,
 'Peaceful': 72,
 

In [12]:
vocabulary["A"]

6

In [13]:
class SimpleTokenizerV1:
    def __init__(self, vocabulary):
        self.str_to_int = vocabulary
        self.int_to_str = {i:s for s,i in vocabulary.items()}

    def encode(self, text):
        preprocessed = re.split(r"([,.():;?_!\"']|--|\s)", text)
        preprocessed = [item.strip() for item in preprocessed if item.strip()]  
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids

    def decode(self, ids):
        text=" ".join([self.int_to_str[i] for i in ids])
        text = re.sub(r"\s+([,.():;?_!\"'])",r"\1", text) # replacing spacing before punctuations.
        return text

In [14]:
tokenizer = SimpleTokenizerV1(vocabulary)

sent = "“Fallen,” Will insisted. “There’s one woman up an ironwood, half-hid in the branches. A far-eyes.” He smiled thinly. “I took care she never saw me. When I got closer, I saw that she wasn’t moving neither.” Despite himself, he shivered."

ids = tokenizer.encode(sent)
print(ids)

[1132, 1, 1178, 108, 546, 2, 1160, 698, 1092, 1028, 134, 555, 1, 490, 544, 956, 211, 2, 6, 384, 2, 1178, 41, 879, 971, 2, 1139, 992, 236, 833, 670, 800, 630, 2, 105, 44, 468, 263, 1, 44, 800, 954, 833, 1054, 653, 668, 2, 1178, 23, 515, 1, 499, 840, 2]


In [15]:
sent = tokenizer.decode(ids)
print(sent)

“Fallen, ” Will insisted. “There’s one woman up an ironwood, half-hid in the branches. A far-eyes. ” He smiled thinly. “I took care she never saw me. When I got closer, I saw that she wasn’t moving neither. ” Despite himself, he shivered.


### Here we can see that we can get token ids with the help of encode function and gat our original text back with the help of decode function.

In [16]:
sent = "Hello, how are you?"
tokenizer.encode(sent)

KeyError: 'Hello'

# SPECIAL CONTEXT TOKENS
#### If there is a word in our sentence which is not in the vocabulary, how can we deal with it?
#### There is way to deal with this kind of the problem and that is adding "SPECIAL CONTEXT TOKENS".
#### Here we will add two more tokens in vobaulary: 1. "<|endoftext|>" and 2. "<|unk|>"

In [17]:
all_tokens = sorted(list(set(preprocessed)))
all_tokens.extend(["<|endoftext|>", "<|unk|>"])

vocabulary = {token:integer for integer, token in enumerate(all_tokens)}

In [18]:
print(len(vocabulary))

1181


In [19]:
# last five words in vocabulary
list(vocabulary)[-5:]

['“Your', '“twisted', '”', '<|endoftext|>', '<|unk|>']

In [20]:
class SimpleTokenizerV2:
    def __init__(self, vocabulary):
        self.str_to_int = vocabulary
        self.int_to_str = {i:s for s,i in vocabulary.items()}

    def encode(self, text):
        preprocessed = re.split(r"([,.():;?_!\"']|--|\s)", text)
        preprocessed = [item.strip() for item in preprocessed if item.strip()] 
        preprocessed = [item if item in self.str_to_int else "<|unk|>" for item in preprocessed]
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids

    def decode(self, ids):
        text=" ".join([self.int_to_str[i] for i in ids])
        text = re.sub(r"\s+([,.():;?_!\"'])",r"\1", text) # replacing spacing before punctuations.
        return text

In [21]:
text1 = "Hello, how are you?"
text2 = "“Fallen,” Will insisted. “There’s one woman up an ironwood, half-hid in the branches."

final_text = " <|endoftext|> ".join((text1,text2))

In [22]:
print(final_text)

Hello, how are you? <|endoftext|> “Fallen,” Will insisted. “There’s one woman up an ironwood, half-hid in the branches.


In [23]:
tokenizer = SimpleTokenizerV2(vocabulary)

tokenizer.encode(final_text)

[1180,
 1,
 529,
 145,
 1111,
 5,
 1179,
 1132,
 1,
 1178,
 108,
 546,
 2,
 1160,
 698,
 1092,
 1028,
 134,
 555,
 1,
 490,
 544,
 956,
 211,
 2]

In [24]:
tokenizer.decode(tokenizer.encode(final_text))

'<|unk|>, how are you? <|endoftext|> “Fallen, ” Will insisted. “There’s one woman up an ironwood, half-hid in the branches.'

#### "Hello" is not present here so "<|unk|>" word taking the place of it. 

BOS(begining of sequence), EOS(end of sequence) AND PAD(padding) also being used here.

ChatGpt does not requie any of the token written above, it just uses "<|endoftext|>" token. It does not use "<|unk|>" token also, instead it uses a "BYTE PAIR ENCODING TOKENIZER" which breakdown words into subwords.

# BYTE PAIR ENCODING TOKENIZER

#### Implementing BPE tokenizer from scratch can be very very tough, So we will use an existing Python open source library called "TIKTOKEN" to do so.

In [25]:
!pip install tiktoken

Collecting tiktoken
  Using cached tiktoken-0.9.0-cp310-cp310-macosx_11_0_arm64.whl.metadata (6.7 kB)
Collecting regex>=2022.1.18 (from tiktoken)
  Using cached regex-2024.11.6-cp310-cp310-macosx_11_0_arm64.whl.metadata (40 kB)
Collecting requests>=2.26.0 (from tiktoken)
  Using cached requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting charset-normalizer<4,>=2 (from requests>=2.26.0->tiktoken)
  Using cached charset_normalizer-3.4.2-cp310-cp310-macosx_10_9_universal2.whl.metadata (35 kB)
Collecting idna<4,>=2.5 (from requests>=2.26.0->tiktoken)
  Using cached idna-3.10-py3-none-any.whl.metadata (10 kB)
Collecting urllib3<3,>=1.21.1 (from requests>=2.26.0->tiktoken)
  Using cached urllib3-2.4.0-py3-none-any.whl.metadata (6.5 kB)
Collecting certifi>=2017.4.17 (from requests>=2.26.0->tiktoken)
  Using cached certifi-2025.4.26-py3-none-any.whl.metadata (2.5 kB)
Using cached tiktoken-0.9.0-cp310-cp310-macosx_11_0_arm64.whl (1.0 MB)
Using cached regex-2024.11.6-cp310-cp310-macosx_

In [26]:
import importlib
import tiktoken

In [27]:
tokenizer = tiktoken.get_encoding("gpt2")

In [36]:
text1 = "Hello, how are you?"
text2 = "“Fallen,” Will insisted. “There’s one woman up an ironwood, half-hid in the branches."
text3 = "go SomeWhereElse."

final_text = " <|endoftext|> ".join((text1,text2, text3))
print(final_text)

Hello, how are you? <|endoftext|> “Fallen,” Will insisted. “There’s one woman up an ironwood, half-hid in the branches. <|endoftext|> go SomeWhereElse.


In [33]:
integers = tokenizer.encode(final_text, allowed_special = {"<|endoftext|>"})
print(integers)

[15496, 11, 703, 389, 345, 30, 220, 50256, 564, 250, 24750, 268, 11, 447, 251, 2561, 11189, 13, 564, 250, 1858, 447, 247, 82, 530, 2415, 510, 281, 6953, 3822, 11, 2063, 12, 49675, 287, 262, 13737, 13, 220, 50256, 467, 2773, 8496, 40674, 13]


In [34]:
strings = tokenizer.decode(integers)

In [35]:
print(strings)

Hello, how are you? <|endoftext|> “Fallen,” Will insisted. “There’s one woman up an ironwood, half-hid in the branches. <|endoftext|> go SomeWhereElse.


### Another exapmle

In [38]:
integers = tokenizer.encode("Akwirw ier")
print(integers)

strings = tokenizer.decode(integers)
print(strings)

[33901, 86, 343, 86, 220, 959]
Akwirw ier
