# Working with text data

In [1]:
## Tokenizing text

import os 
import urllib.request 

file_path = "data.txt"



In [2]:
with open(file_path,"r",encoding="utf-8") as f :
    raw_text = f.read()
raw_text

'The Spiritual Phenomenon of Auras: A Comprehensive Analysis of Energy Fields in Human Consciousness\nThe concept of auras represents one of humanity\'s most enduring spiritual beliefs, encompassing the idea that all living beings emanate subtle energy fields that can be perceived, interpreted, and influenced through various practices. This luminous radiation, described across cultures as a colored emanation surrounding the human body, serves as a bridge between the physical and metaphysical realms, offering insights into an individual\'s emotional, mental, and spiritual states. The study of auras reveals a rich tapestry of cross-cultural spiritual wisdom, from ancient Hindu scriptures to modern New Age practices, demonstrating humanity\'s persistent fascination with the invisible forces that shape our existence and consciousness.\n\nHistorical and Cultural Foundations\nAncient Origins and Cross-Cultural Manifestations\nThe belief in auras extends far back into human history, with docu

In [3]:
len(raw_text)

19616

In [4]:
import re 

text  =  "Hello , world . This is a test for tokenization using regular expresison"
result = re.split(r'(\s)',text)
print(result)

['Hello', ' ', ',', ' ', 'world', ' ', '.', ' ', 'This', ' ', 'is', ' ', 'a', ' ', 'test', ' ', 'for', ' ', 'tokenization', ' ', 'using', ' ', 'regular', ' ', 'expresison']


In [5]:
result = re.split(r'([,.]|\s)',text)
print(result)

['Hello', ' ', '', ',', '', ' ', 'world', ' ', '', '.', '', ' ', 'This', ' ', 'is', ' ', 'a', ' ', 'test', ' ', 'for', ' ', 'tokenization', ' ', 'using', ' ', 'regular', ' ', 'expresison']


In [6]:
result = [item  for item in result if item.strip()]
print(result)

['Hello', ',', 'world', '.', 'This', 'is', 'a', 'test', 'for', 'tokenization', 'using', 'regular', 'expresison']


In [7]:
result = re.split(r'([,.:;?_!"()\']|--|)\s',raw_text)
result = [item.strip() for item in result if item.strip()]
print(result)

['The', 'Spiritual', 'Phenomenon', 'of', 'Auras', ':', 'A', 'Comprehensive', 'Analysis', 'of', 'Energy', 'Fields', 'in', 'Human', 'Consciousness', 'The', 'concept', 'of', 'auras', 'represents', 'one', 'of', "humanity's", 'most', 'enduring', 'spiritual', 'beliefs', ',', 'encompassing', 'the', 'idea', 'that', 'all', 'living', 'beings', 'emanate', 'subtle', 'energy', 'fields', 'that', 'can', 'be', 'perceived', ',', 'interpreted', ',', 'and', 'influenced', 'through', 'various', 'practices', '.', 'This', 'luminous', 'radiation', ',', 'described', 'across', 'cultures', 'as', 'a', 'colored', 'emanation', 'surrounding', 'the', 'human', 'body', ',', 'serves', 'as', 'a', 'bridge', 'between', 'the', 'physical', 'and', 'metaphysical', 'realms', ',', 'offering', 'insights', 'into', 'an', "individual's", 'emotional', ',', 'mental', ',', 'and', 'spiritual', 'states', '.', 'The', 'study', 'of', 'auras', 'reveals', 'a', 'rich', 'tapestry', 'of', 'cross-cultural', 'spiritual', 'wisdom', ',', 'from', 'an

In [8]:
len(result)

2921

In [9]:
preprocessed_text = result

## 2.3 Converting tokens into token ids

In [10]:
preprocessed_text[:10]

['The',
 'Spiritual',
 'Phenomenon',
 'of',
 'Auras',
 ':',
 'A',
 'Comprehensive',
 'Analysis',
 'of']

In [11]:
all_words = sorted(set(preprocessed_text))
all_words

['"',
 '"Hands',
 '"Man',
 '"Radiant',
 '"The',
 '"aureole,',
 '"ka,',
 '"phantom',
 "'",
 '(chakras',
 '(energy',
 '(lohitaka)',
 '(mañjeṭṭha)',
 '(nīla)',
 '(odāta)',
 '(or',
 '(pīta)',
 ')',
 ',',
 '.',
 '1903',
 '1910',
 '1980s',
 '20th',
 '400',
 '800',
 ':',
 'A',
 'Additionally',
 'Advanced',
 'African',
 'Age',
 'American',
 'Analysis',
 'Ancient',
 'Applications',
 'Astral',
 'Aura',
 'Auras',
 'BCE',
 'Barbara',
 'Black',
 'Blue',
 'Body',
 'Brennan',
 "Brennan's",
 "Buddha's",
 'Buddhism',
 'Buddhist',
 'Causal',
 'Cayce',
 'Celestial',
 'Chakra',
 'Charles',
 'Chinese',
 'Christina',
 'Church',
 'Color',
 'Comprehensive',
 'Conclusion',
 'Connections',
 'Consciousness',
 'Contemporary',
 'Conversely',
 'Critical',
 'Cross-Cultural',
 'Cultural',
 'Development',
 'Each',
 'Eastern',
 'Edgar',
 'Egyptian',
 'Electromagnetic',
 'Emotional',
 'Energy',
 'England',
 'Etheric',
 'Explanations',
 'Field',
 'Fields',
 'Flow',
 'Foundations',
 'Future',
 'Gautama',
 'Greeks',
 'Gree

In [12]:
vocab_size = len(all_words)
print(vocab_size)

1007


In [13]:
vocab = {token:integer for integer,token in enumerate(all_words)}

vocab


{'"': 0,
 '"Hands': 1,
 '"Man': 2,
 '"Radiant': 3,
 '"The': 4,
 '"aureole,': 5,
 '"ka,': 6,
 '"phantom': 7,
 "'": 8,
 '(chakras': 9,
 '(energy': 10,
 '(lohitaka)': 11,
 '(mañjeṭṭha)': 12,
 '(nīla)': 13,
 '(odāta)': 14,
 '(or': 15,
 '(pīta)': 16,
 ')': 17,
 ',': 18,
 '.': 19,
 '1903': 20,
 '1910': 21,
 '1980s': 22,
 '20th': 23,
 '400': 24,
 '800': 25,
 ':': 26,
 'A': 27,
 'Additionally': 28,
 'Advanced': 29,
 'African': 30,
 'Age': 31,
 'American': 32,
 'Analysis': 33,
 'Ancient': 34,
 'Applications': 35,
 'Astral': 36,
 'Aura': 37,
 'Auras': 38,
 'BCE': 39,
 'Barbara': 40,
 'Black': 41,
 'Blue': 42,
 'Body': 43,
 'Brennan': 44,
 "Brennan's": 45,
 "Buddha's": 46,
 'Buddhism': 47,
 'Buddhist': 48,
 'Causal': 49,
 'Cayce': 50,
 'Celestial': 51,
 'Chakra': 52,
 'Charles': 53,
 'Chinese': 54,
 'Christina': 55,
 'Church': 56,
 'Color': 57,
 'Comprehensive': 58,
 'Conclusion': 59,
 'Connections': 60,
 'Consciousness': 61,
 'Contemporary': 62,
 'Conversely': 63,
 'Critical': 64,
 'Cross-Cultur

In [14]:
class SimpleTokenizerV1:
    def __init__(self,vocab):
        self.str_to_int = vocab 
        self.int_to_str = {i:s for s,i in vocab.items()}
        
    def encode(self,text):
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)',text)
        
        preprocessed = [
            item.strip() for item in preprocessed if item.strip()
        ] 
        
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids
    
    
    def decode(self, ids) :
        text = " ".join([self.int_to_str[i]for i in ids])
        # Replace spaces before the specified punctuations 
        text = re.sub(r'\s+([,.?!"()\'])',r'\1',text) 
        return text 
    

In [15]:
tokenizer = SimpleTokenizerV1(vocab)

text = " Chakra Connections and Energy Flow"

ids = tokenizer.encode(text)
ids

[52, 60, 201, 74, 80]

In [16]:
print(tokenizer.decode(ids))

Chakra Connections and Energy Flow


## 2.4 Adding Special context tokens 

In [17]:
## lets modify  the vocabulary to include these two special tokens , <unk> and <|endoftext|> , by adding them to our list of all unique words 

all_tokens = sorted(list(set(preprocessed_text)))
all_tokens.extend(["<|endoftext|>" , "<|unk|>"])

vocab = {token :integer for integer,token in enumerate(all_tokens)}

print(len(vocab.items()))

1009


In [18]:
for i,item in enumerate(list(vocab.items())[-5:]):
    print(item)

('yellow', 1004)
('yoga', 1005)
('–', 1006)
('<|endoftext|>', 1007)
('<|unk|>', 1008)


In [19]:
class SimpleTokenizerV2:
    def __init__(self,vocab):
        self.str_to_int = vocab 
        self.int_to_str = {i:s for s,i in vocab.items()}
        
    def encode(self,text):
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)',text)
        
        preprocessed = [
            item.strip() for item in preprocessed if item.strip()
        ] 
        preprocessed = [item if item in self.str_to_int else "<|unk|>" for item in preprocessed]
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids
    
    
    def decode(self, ids) :
        text = " ".join([self.int_to_str[i]for i in ids])
        # Replace spaces before the specified punctuations 
        text = re.sub(r'\s+([,.?!"()\'])',r'\1',text) 
        return text 
    

In [20]:
text1 = "Helllo do you love games ?"
text2 = "in the spiritual essence chakra wins."
text  = "<|endoftext|>".join((text1,text2))
print(text)

Helllo do you love games ?<|endoftext|>in the spiritual essence chakra wins.


In [21]:
tokenizer_v2 = SimpleTokenizerV2(vocab)
res = tokenizer_v2.encode(text)

In [22]:
tokenizer_v2.decode(res)

'<|unk|> <|unk|> <|unk|> love <|unk|> <|unk|> <|unk|> the spiritual essence chakra <|unk|>.'

## additional special tokens : [BOS] , [EOS] , [PAD]

# Byte Pair Encoding 

In [23]:
from importlib.metadata import version 
import tiktoken 
print("tiktoken version: ",version("tiktoken"))

tiktoken version:  0.8.0


In [24]:
tik_tokenizer = tiktoken.get_encoding("gpt2")

text =  ("Hello , do you love the reading books  <|endoftext|> In the sunluight terraces of someunknown Place. ")

integers = tik_tokenizer.encode( text,allowed_special={"<|endoftext|>"})
print(integers)

[15496, 837, 466, 345, 1842, 262, 3555, 3835, 220, 220, 50256, 554, 262, 4252, 2290, 432, 8812, 2114, 286, 617, 34680, 8474, 13, 220]


## 2.6 Data sampling with a Sliding window

In [25]:
senten = tik_tokenizer.decode(integers)
print(senten)

Hello , do you love the reading books  <|endoftext|> In the sunluight terraces of someunknown Place. 


In [26]:
# llms work by predicting the next token one at a time 

with open(file_path,"r",encoding="utf-8") as f :
    raw_text = f.read()
raw_text



'The Spiritual Phenomenon of Auras: A Comprehensive Analysis of Energy Fields in Human Consciousness\nThe concept of auras represents one of humanity\'s most enduring spiritual beliefs, encompassing the idea that all living beings emanate subtle energy fields that can be perceived, interpreted, and influenced through various practices. This luminous radiation, described across cultures as a colored emanation surrounding the human body, serves as a bridge between the physical and metaphysical realms, offering insights into an individual\'s emotional, mental, and spiritual states. The study of auras reveals a rich tapestry of cross-cultural spiritual wisdom, from ancient Hindu scriptures to modern New Age practices, demonstrating humanity\'s persistent fascination with the invisible forces that shape our existence and consciousness.\n\nHistorical and Cultural Foundations\nAncient Origins and Cross-Cultural Manifestations\nThe belief in auras extends far back into human history, with docu

In [27]:
enc_text = tik_tokenizer.encode(raw_text)
print(len(enc_text))

3332


In [28]:
enc_text

[464,
 33944,
 34828,
 3674,
 261,
 286,
 15412,
 292,
 25,
 317,
 40917,
 14691,
 286,
 6682,
 23948,
 287,
 5524,
 45595,
 1108,
 198,
 464,
 3721,
 286,
 257,
 17786,
 6870,
 530,
 286,
 9265,
 338,
 749,
 24056,
 8557,
 9056,
 11,
 20504,
 19696,
 262,
 2126,
 326,
 477,
 2877,
 9791,
 31184,
 378,
 11800,
 2568,
 7032,
 326,
 460,
 307,
 11067,
 11,
 16173,
 11,
 290,
 12824,
 832,
 2972,
 6593,
 13,
 770,
 29763,
 516,
 11881,
 11,
 3417,
 1973,
 13817,
 355,
 257,
 16396,
 31184,
 341,
 7346,
 262,
 1692,
 1767,
 11,
 9179,
 355,
 257,
 7696,
 1022,
 262,
 3518,
 290,
 42031,
 35423,
 11,
 6011,
 17218,
 656,
 281,
 1981,
 338,
 7016,
 11,
 5110,
 11,
 290,
 8557,
 2585,
 13,
 383,
 2050,
 286,
 257,
 17786,
 10069,
 257,
 5527,
 9814,
 395,
 563,
 286,
 3272,
 12,
 30844,
 8557,
 11501,
 11,
 422,
 6156,
 16397,
 39782,
 284,
 3660,
 968,
 7129,
 6593,
 11,
 21135,
 9265,
 338,
 16218,
 35556,
 351,
 262,
 14836,
 3386,
 326,
 5485,
 674,
 6224,
 290,
 10510,
 13,
 198,
 198,
 

In [29]:
enc_sample = enc_text[50:]

In [30]:
context_size = 4
x = enc_sample[:context_size]
y = enc_sample[1:context_size+1]
print(f"x:{x}")
print(f"y :   {y}")

x:[307, 11067, 11, 16173]
y :   [11067, 11, 16173, 11]


In [31]:
 for i in range(1,context_size+1):
     context = enc_sample[:i]
     desired = enc_sample[i]
     print(context,"---->",desired)

[307] ----> 11067
[307, 11067] ----> 11
[307, 11067, 11] ----> 16173
[307, 11067, 11, 16173] ----> 11


In [34]:

for i in range(1,context_size+1):
     context = enc_sample[:i]
     desired = enc_sample[i]
     print(tik_tokenizer.decode(context),"---->",tik_tokenizer.decode([desired]))

 be ---->  perceived
 be perceived ----> ,
 be perceived, ---->  interpreted
 be perceived, interpreted ----> ,


In [46]:
import torch 
from torch.utils.data import Dataset,DataLoader 

class GPTDatasetV1(Dataset):
    def __init__(self,txt,tokenizer,max_length,stride):
        self.input_ids = []
        self.target_ids = []
        
        token_ids = tokenizer.encode(txt)
        
        for i in range(0,len(token_ids) - max_length,stride):
            input_chunk = token_ids[i:i+max_length]
            target_chunk = token_ids[i+1:i+max_length + 1]
            self.input_ids.append([torch.tensor(input_chunk)])
            self.target_ids.append(torch.tensor(target_chunk))
            
    def __len__(self):
            return len(self.input_ids)
        
    def __getitem__(self,idx):
            return self.input_ids[idx],self.target_ids[idx]
        
def create_dataloader_v1(txt,batch_size = 4, max_length =256,stride=128,shuffle=True,drop_last=True,num_workers=0):
    tokenizer = tiktoken.get_encoding("gpt2")
    dataset = GPTDatasetV1(txt,tokenizer,max_length,stride)
    dataloader = DataLoader(dataset,batch_size=batch_size,shuffle=shuffle,drop_last=drop_last,num_workers=num_workers)
    
    return dataloader


dataloader = create_dataloader_v1(
    raw_text,batch_size=1,max_length=4,stride=1,shuffle=False
)
data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

[[tensor([[  464, 33944, 34828,  3674]])], tensor([[33944, 34828,  3674,   261]])]


In [47]:
second_batch = next(data_iter)
print(second_batch)

[[tensor([[33944, 34828,  3674,   261]])], tensor([[34828,  3674,   261,   286]])]


## Creating token embeddings 

In [48]:
input_ids = torch.tensor([2,3,5,1])

In [49]:
vocab_size = 6
output_dim = 3 


torch.manual_seed(123)
embedding_layer = torch.nn.Embedding(vocab_size,output_dim)
print(embedding_layer.weight)

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)


In [51]:
print(embedding_layer(torch.tensor([3])))

tensor([[-0.4015,  0.9666, -1.1481]], grad_fn=<EmbeddingBackward0>)


In [54]:
print(embedding_layer(torch.tensor(input_ids)))

tensor([[ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-2.8400, -0.7849, -1.4096],
        [ 0.9178,  1.5810,  1.3010]], grad_fn=<EmbeddingBackward0>)


  print(embedding_layer(torch.tensor(input_ids)))


In [55]:
 #Encoding Word positions 
 #An embedding vector is a dense numerical representation of a categorical input (like a word or token), where similar tokens get similar vectors.
 

## Encoding Word Positions 

In [57]:
vocab_size = 50257
output_dim = 256 

token_embedding_layer = torch.nn.Embedding(vocab_size,output_dim)

print(token_embedding_layer.weight)

Parameter containing:
tensor([[ 0.7504,  0.7353,  1.7375,  ..., -0.2488, -0.7666, -0.2277],
        [ 0.5131, -0.2599,  1.6423,  ..., -0.0586,  1.3007,  0.4118],
        [ 0.1582, -1.1788, -0.4651,  ...,  0.8060, -0.0383,  1.4335],
        ...,
        [-0.2420,  0.8730, -0.1769,  ...,  1.1450, -0.8330, -0.6994],
        [ 0.1242,  0.0251, -0.0437,  ..., -0.6893,  0.0598,  0.0348],
        [ 1.0846, -0.1639, -0.1025,  ..., -1.0124, -1.5893,  0.7799]],
       requires_grad=True)
