<a href="https://colab.research.google.com/github/teelch0/Data-Mining/blob/main/data_attention.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
#Step 2

import torch
import tiktoken

In [3]:
#Step 3

tokenizer = tiktoken.get_encoding("gpt2")

In [4]:
#Step 4

import requests
url= "https://raw.githubusercontent.com/teelch0/Data-Mining/refs/heads/main/ShortStory.txt"
response= requests.get(url)
raw_text= response.text
raw_text[:25]


"IF you please, ma'am, din"

In [5]:
#Step 5
enc_text= tokenizer.encode(raw_text)
print(enc_text)

[5064, 345, 3387, 11, 17266, 6, 321, 11, 8073, 318, 4983, 13, 447, 251, 198, 198, 11006, 27583, 11, 508, 550, 587, 16143, 656, 262, 2046, 11, 2067, 290, 3114, 736, 625, 465, 8163, 656, 262, 2119, 2157, 683, 13, 3244, 339, 13541, 13, 383, 8212, 373, 15403, 11, 257, 491, 8316, 36979, 11, 290, 257, 491, 8316, 26246, 278, 26, 475, 340, 750, 407, 1037, 284, 787, 683, 804, 7099, 393, 1342, 12922, 13, 198, 198, 447, 250, 23792, 616, 1573, 11, 314, 550, 11564, 345, 547, 612, 11, 10897, 11, 447, 251, 339, 531, 11, 355, 996, 2859, 3500, 465, 3356, 286, 5975, 13, 564, 250, 2061, 389, 345, 1804, 30, 314, 36059, 326, 6919, 546, 8073, 373, 407, 9469, 284, 502, 30, 447, 251, 198, 198, 447, 250, 10248, 1108, 11, 645, 0, 632, 338, 616, 636, 960, 10508, 257, 3076, 6058, 11, 8531, 1310, 636, 0, 49848, 502, 355, 257, 9048, 18807, 287, 1468, 11, 13400, 8242, 0, 843, 314, 10783, 284, 307, 2712, 3756, 10846, 0, 440, 8951, 77, 470, 314, 11, 3271, 30, 447, 251, 198, 198, 447, 250, 2949, 4719, 11, 447, 251, 339

In [6]:
#Step 6

from torch.utils.data import Dataset, DataLoader


class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []

        # Tokenize the entire text
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})
        assert len(token_ids) > max_length, "Number of tokenized inputs must at least be equal to max_length+1"

        # Use a sliding window to chunk the book into overlapping sequences of max_length
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i + 1: i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]

In [7]:
def create_dataloader_v1(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0):

    # Initialize the tokenizer
    tokenizer = tiktoken.get_encoding("gpt2")

    # Create dataset
    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)

    # Create dataloader
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers
    )

    return dataloader

In [13]:
#Experimenting with different batch sizes, stride lengths, and sequence lengths
dataloader = create_dataloader_v1(
    raw_text, batch_size=1, max_length=4, stride=1, shuffle=False)

data_iter = iter(dataloader)
first_batch = next(data_iter)
second_batch = next(data_iter)
third_batch = next(data_iter)

print(first_batch)
print(second_batch)
print(third_batch)

[tensor([[5064,  345, 3387,   11]]), tensor([[  345,  3387,    11, 17266]])]
[tensor([[  345,  3387,    11, 17266]]), tensor([[ 3387,    11, 17266,     6]])]
[tensor([[ 3387,    11, 17266,     6]]), tensor([[   11, 17266,     6,   321]])]


In [14]:
#Step 7
dataloader = create_dataloader_v1(
    raw_text, batch_size=8, max_length=4, stride=4, shuffle=False)

data_iter = iter(dataloader)
inputs, targets = next(data_iter)

print("Inputs:\n", inputs)
print("Targets:\n", targets)



Inputs:
 tensor([[ 5064,   345,  3387,    11],
        [17266,     6,   321,    11],
        [ 8073,   318,  4983,    13],
        [  447,   251,   198,   198],
        [11006, 27583,    11,   508],
        [  550,   587, 16143,   656],
        [  262,  2046,    11,  2067],
        [  290,  3114,   736,   625]])
Targets:
 tensor([[  345,  3387,    11, 17266],
        [    6,   321,    11,  8073],
        [  318,  4983,    13,   447],
        [  251,   198,   198, 11006],
        [27583,    11,   508,   550],
        [  587, 16143,   656,   262],
        [ 2046,    11,  2067,   290],
        [ 3114,   736,   625,   465]])


In [15]:
#Step 8
for row in inputs:
  print(tokenizer.decode(row.tolist()))

IF you please,
 ma'am,
 dinner is served.
”


David Hardy, who
 had been staring into
 the fire, started
 and looked back over


In [25]:
#Step 9
#It is necessary to use embedding vectors in place of the IDs because the model
#has no way of understanding the similarities between words without the presence
#of these numbers. Once the IDs have been turned into vectors, they gain the
#ability to be processed in a continuous space with multiple dimensions.
#This allows for the model to learn similarities and patterns.

vocab_size = 8
output_dim = 4
embedding = torch.nn.Embedding( vocab_size, output_dim )
inputs = embedding.weight.detach()
print(inputs)

tensor([[-0.5377, -1.1525,  0.2904,  0.8930],
        [-0.3896,  1.7262,  0.0467, -0.6160],
        [ 0.0990, -1.7559, -0.8583,  1.6134],
        [ 1.1136, -0.3108, -1.4688,  1.2079],
        [ 1.3054,  1.2728,  0.9720,  0.5098],
        [-0.8079, -0.4803,  0.8715, -0.9987],
        [ 2.0033,  1.7351, -0.1549, -1.0294],
        [-2.1820, -1.2458,  0.5098, -0.8216]])


In [None]:
#Question: Are the embedding vectors above related to the text?

In [30]:
W_q = torch.nn.Parameter( torch.rand( 4, 4 ), requires_grad=False )
W_k = torch.nn.Parameter( torch.rand( 4, 4 ), requires_grad=False )
W_v = torch.nn.Parameter( torch.rand( 4, 4 ), requires_grad=False )

In [31]:
query= inputs[1] @ W_q
print(query)

tensor([0.5137, 0.4158, 0.2399, 0.0828])


In [32]:
keys= inputs @ W_k
values= inputs @ W_v
print("Keys:", keys)
print("Values:", values)

Keys: tensor([[-0.3791,  0.3120, -0.5163, -0.8183],
        [ 0.8752, -0.5921,  0.3309,  0.6624],
        [-0.6822,  0.6624, -0.8082, -0.8523],
        [ 0.2707,  0.8311, -0.0638,  0.3357],
        [ 2.2087,  1.6267,  1.5007,  1.3280],
        [-0.9313, -0.7760, -0.2519, -0.6042],
        [ 1.2874,  0.6374,  1.5435,  1.8384],
        [-2.0292, -1.7427, -1.3763, -1.6803]])
Values: tensor([[-1.1524,  0.7272, -0.4760,  0.6615],
        [ 1.1247, -0.6926,  0.7471, -0.4306],
        [-1.8647,  1.4323, -1.3511,  0.6160],
        [-0.1263,  1.3037, -0.5715,  0.4292],
        [ 3.3558,  1.1782,  2.5734,  2.0573],
        [-0.7507, -1.1238, -0.2565, -0.7063],
        [ 3.1935, -0.3145,  1.7156,  0.0549],
        [-3.0478, -1.5120, -1.7599, -1.5552]])


In [33]:
#Attention scores are calculated by multiplying Query and transverse of Keys
attention_scores= query @ keys.T
print(attention_scores)

tensor([-0.2567,  0.3376, -0.3395,  0.4971,  2.2810, -0.9115,  1.4489, -2.2363])


In [35]:
attention_weights= torch.softmax(attention_scores, dim= -1 )
print(attention_weights)

tensor([0.0405, 0.0734, 0.0373, 0.0861, 0.5128, 0.0211, 0.2231, 0.0056])


In [37]:
context_vector= attention_weights @ values
print(context_vector)

tensor([2.3559, 0.6462, 1.6230, 1.0987])
