In [1]:
import tiktoken
import torch
from torch.utils.data import Dataset , DataLoader

<div class="alert alert-block alert-info">
<b>Embedding:</b> 
<p>    
We are working on creating the POSITIONAL embedding. It is required to give the token
embedding additional information about the position of each token in the text, so that the
model can distinguish repeated tokens or phrases that appear at different positions.
</p>
</div>

<div class="alert alert-block alert-info">
<b>Token Encoding:</b> 
<p>    
Below we:

    1. Use BPE (Byte Pair Encoding) for tokenization of the text
    2. Then create input-target pairs using a PyTorch Dataset
    
</p>
</div>

In [57]:
class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-id chunks.

    The text is tokenized once with the supplied tokenizer. A window of
    ``max_length`` token ids is then moved over the ids in steps of
    ``stride``. Each window is stored as an input tensor, and the same
    window shifted one position to the right is stored as its target tensor.

    Args:
        text: Raw text to tokenize.
        tokenizer: Object exposing ``encode(text, allowed_special=...)``
            that returns a sliceable sequence of integer token ids.
        max_length: Number of token ids in every input/target chunk.
        stride: Step between the start positions of consecutive windows.
    """

    def __init__(self, text , tokenizer , max_length , stride):
        # Encode the full text in one pass; "<|endoftext|>" is allowed as a special token.
        ids = tokenizer.encode(text , allowed_special={"<|endoftext|>"})
        print("Length of Token Ids",len(ids))

        # Start positions stop early enough that the shifted target window
        # (which reaches one id further) still fits inside ``ids``.
        starts = range(0, len(ids) - max_length, stride)

        self.input_tensor = [
            torch.tensor(ids[s:s + max_length]) for s in starts
        ]
        self.output_tensor = [
            torch.tensor(ids[s + 1:s + max_length + 1]) for s in starts
        ]

    def __len__(self):
        """Return the number of (input, target) pairs."""
        return len(self.input_tensor)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` tensor pair stored at ``idx``."""
        inputs = self.input_tensor[idx]
        targets = self.output_tensor[idx]
        return inputs , targets

In [58]:
def create_dataloader_v1(text, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0):
    """Build a DataLoader of sliding-window (input, target) batches from text.

    Args:
        text: Raw text to tokenize and window.
        batch_size: Number of (input, target) pairs per batch.
        max_length: Number of token ids in each input/target chunk.
        stride: Step between the starts of consecutive chunks.
        shuffle: Passed through to ``DataLoader``.
        drop_last: Passed through to ``DataLoader``.
        num_workers: Passed through to ``DataLoader``.

    Returns:
        A ``DataLoader`` wrapping a ``GPTDatasetV1`` built from ``text``
        with the "gpt2" tiktoken encoding.
    """
    gpt2_tokenizer = tiktoken.get_encoding("gpt2")
    windowed_dataset = GPTDatasetV1(text, gpt2_tokenizer, max_length, stride)

    loader_options = {
        "batch_size": batch_size,
        "shuffle": shuffle,
        "drop_last": drop_last,
        "num_workers": num_workers,
    }
    return DataLoader(windowed_dataset, **loader_options)

In [59]:
# Read the whole text file into memory as one string.
with open("data/the-verdict.txt", mode="r", encoding="utf-8") as text_file:
    raw_text = text_file.read()

In [60]:
# Context window of 4 token ids; stride == max_length gives non-overlapping
# input windows, and shuffle=False keeps batches in text order.
max_length = 4
dataloader = create_dataloader_v1(
    raw_text,
    batch_size=8,
    max_length=max_length,
    stride=max_length,
    shuffle=False,
)


Length of Token Ids 5145


<div class="alert alert-block alert-info">
<b>End of creating the datasets</b> 
<p>    
This is the end of the data-preparation step. We load the text and create the dataset as:

    1. Input-target pairs
    2. Data in batches
    3. Data with a defined context window, which in this case equals max_length
</p>
</div>

---

In [51]:
# The GPT-2 BPE tokenizer has a fixed vocabulary of 50257 entries.
vocab_size = 50257
# Each token id is expanded into a vector with 256 dimensions.
output_dim = 256

# Lookup table with one row of size output_dim per vocabulary entry.
token_embedding_layer = torch.nn.Embedding(
    num_embeddings=vocab_size,
    embedding_dim=output_dim,
)

In [52]:
# Take only the first batch from the dataloader and embed its token ids.
# With a context window of 4 (max_length) and batch_size=8, the inputs have
# shape (8, 4); the embedding layer appends a dimension of size output_dim.
data_iter = iter(dataloader)
first_batch = next(data_iter)
inputs, output = first_batch

token_embeddings = token_embedding_layer(inputs)

print("Shape of the inputs:", inputs.shape)
print("Shape of the token embedded:", token_embeddings.shape)

Shape of the inputs: torch.Size([8, 4])
Shape of the token embedded: torch.Size([8, 4, 256])


<div class="alert alert-block alert-info">
<b>Token Embedding for all of the tokens and batches</b> 
<p>    
The previous cell embedded only one batch of tokens.
The example below embeds all of the tokens from all batches.
</p>
</div>

In [66]:
# Embed every batch produced by the dataloader, collect the per-batch results
# in a list, and concatenate them into a single tensor at the end.
embedding_batches = []
tot = 0       # running count of input sequences (rows); each row holds max_length token ids
batches = 0   # number of batches consumed

for batch in dataloader:
    token_ids1 = batch[0]                       # inputs, shape (batch_size, max_length)
    tot += len(token_ids1)                      # len() of a 2-D tensor is its first dimension
    emb = token_embedding_layer(token_ids1)     # shape (batch_size, max_length, output_dim)
    embedding_batches.append(emb)
    batches += 1

# Concatenate all batches back together along the batch dimension.
all_embeddings = torch.cat(embedding_batches, dim=0)

print("All embeddings shape:", all_embeddings.shape)
print("Number of token ID'd", tot)
print("Number of Batches created", batches)

All embeddings shape: torch.Size([1280, 4, 256])
Number of token ID'd 1280
Number of Batches created 160


---

<div class="alert alert-block alert-info">
<b>Creating Positional embedding</b> 
<p>    
From here on, we will work on creating the positional embedding.

The dimension of this positional embedding has to be (context_length × output_dim),
which is equal to (4 × 256). Compare with the final token embedding, which is (1280 × 4 × 256).
</p>
</div>

In [81]:
# One learnable vector of size output_dim for each position in the context window.
context_length = max_length

pos_embedding_layer = torch.nn.Embedding(
    num_embeddings=context_length,
    embedding_dim=output_dim,
)

---

<div class="alert alert-block alert-info">

<p>    
Let's look at some torch and embedding basics.
</p>
</div>

In [82]:
# Show the class of the positional embedding layer object.
type(pos_embedding_layer)

torch.nn.modules.sparse.Embedding

In [83]:
# Let's look at some basics.
# torch.arange(n) creates a 1-D tensor holding 0, 1, ..., n-1.
position_ids = torch.arange(context_length)
print("1-D matrix :-", position_ids)

# The embedding layer is a lookup matrix: each value in the tensor passed in
# is used as a row index, and the matching rows are returned.
out = pos_embedding_layer(position_ids)
print("Size of the embedding now:-",out.shape)

1-D matrix :- tensor([0, 1, 2, 3])
Size of the embedding now:- torch.Size([4, 256])


In [85]:
# Display the positional-embedding matrix computed in the previous cell.
# Its shape is (context_length, output_dim), the same as the token embedding
# of a single input sequence; a matrix like this is added to every sequence
# of the token embeddings.
# NOTE(review): profile="full" is a global print setting that turns off
# truncation and stays in effect for later cells — consider showing a slice instead.
torch.set_printoptions(profile="full")
out

tensor([[-2.9574e-01,  2.0297e+00, -1.9559e-01,  9.8399e-01,  1.0991e+00,
         -6.4257e-02,  5.6272e-01,  4.7487e-01, -7.2253e-01, -2.3067e-01,
         -1.3186e-01,  9.8797e-01, -2.0638e+00, -7.9027e-01, -2.0503e-01,
         -3.2141e-01,  1.1196e+00, -2.9789e-01,  8.7125e-02, -4.4571e-01,
          9.9954e-01, -8.3010e-02, -4.3924e-01, -3.5169e-01,  1.3903e-01,
          1.3182e-01,  6.8044e-01,  4.1842e-01,  8.4340e-02,  7.1125e-01,
         -1.0704e+00,  7.7222e-02,  1.5551e-01, -8.2345e-01,  1.8396e+00,
          4.0773e-01,  1.1029e+00,  1.2878e+00,  8.9099e-01, -1.8520e+00,
         -3.3168e-01,  7.5198e-01, -2.7836e+00, -7.5571e-01, -1.5996e+00,
          1.4584e+00,  5.9436e-01, -1.1413e+00,  6.0807e-01, -2.9501e-01,
         -3.9207e-01, -5.7350e-01,  2.0634e+00, -2.6630e-01,  1.7259e-01,
          5.3171e-01, -6.5363e-01, -2.2981e-01, -7.0085e-01,  8.8571e-02,
         -1.5989e+00, -1.4588e+00,  1.4049e+00,  4.7040e-02,  6.9556e-01,
         -8.5004e-01,  6.4517e-02,  1.

---

In [86]:
# Look up one embedding vector for each position 0 .. context_length-1.
position_ids = torch.arange(context_length)
pos_embedding = pos_embedding_layer(position_ids)

In [87]:
# Add positional information to the token embeddings. pos_embedding has shape
# (context_length, output_dim) and is broadcast over the first dimension of
# all_embeddings, so every sequence receives the same position vectors.
input_embeddings = all_embeddings + pos_embedding

In [88]:
# Sanity check: adding the positional embedding should leave the shape unchanged.
print(input_embeddings.shape)

torch.Size([1280, 4, 256])
