In [1]:
import torch

In [None]:
with open("/Users/tusharsingharoy/Downloads/the-verdict.txt","r", encoding="utf-8") as f:
    raw_text=f.read()
print(len(raw_text))

20479


## Tokenized the Text

In [3]:
import re
token=re.split(r'([,.:;?"\s)]|--|\s)',raw_text)
print(token[:20])   ## Print frist 20 token 
print(f"Text Size :{len(raw_text)}")
print(f"Total number of token :{len(token)}")
token_e_strip = [item.strip() for item in token if item.strip()]   ## item.strip() removes whitespace characters 
print(f"Total Number of teken without whitespace {len(token_e_strip)}")
uni_token=list(set(token_e_strip))    ## Store only the unique text
print(f"Total Number of unique token {len(uni_token)}")
uni_token=sorted(uni_token)

['I', ' ', 'HAD', ' ', 'always', ' ', 'thought', ' ', 'Jack', ' ', 'Gisburn', ' ', 'rather', ' ', 'a', ' ', 'cheap', ' ', 'genius', '--']
Text Size :20479
Total number of token :8921
Total Number of teken without whitespace 4456
Total Number of unique token 1181


In [4]:
vocab={token:integer for integer,token in enumerate(uni_token)}

In [5]:
vocab.items()

dict_items([('"', 0), ("'", 1), ("'Are", 2), ("'It's", 3), ("'coming'", 4), ("'done'", 5), ("'subject", 6), ("'technique'", 7), ("'way", 8), ('(I', 9), ('(Though', 10), (')', 11), (',', 12), ('--', 13), ('.', 14), (':', 15), (';', 16), ('?', 17), ('A', 18), ('Ah', 19), ('Among', 20), ('And', 21), ('Arrt', 22), ('As', 23), ('At', 24), ('Be', 25), ('Begin', 26), ('Burlington', 27), ('But', 28), ('By', 29), ('Carlo', 30), ('Chicago', 31), ('Claude', 32), ('Come', 33), ('Croft', 34), ('Destroyed', 35), ('Devonshire', 36), ("Don't", 37), ('Dubarry_', 38), ('Emperors', 39), ('Florence', 40), ('For', 41), ('Gallery', 42), ('Gideon', 43), ('Gisburn', 44), ('Gisburn!', 45), ("Gisburn's", 46), ('Gisburns', 47), ('Grafton', 48), ('Greek', 49), ('Grindle', 50), ("Grindle's", 51), ('Grindles', 52), ('HAD', 53), ('Had', 54), ('Hang', 55), ('Has', 56), ('He', 57), ('Her', 58), ('Hermia', 59), ("Hermia's", 60), ('His', 61), ('How', 62), ('I', 63), ("I'd", 64), ("I'll", 65), ("I've", 66), ('If', 67), (

## Printing frist 20 token with token ID'S

In [6]:
for i,item in enumerate(vocab.items()):
    print(item)
    if(i==20):
        break

('"', 0)
("'", 1)
("'Are", 2)
("'It's", 3)
("'coming'", 4)
("'done'", 5)
("'subject", 6)
("'technique'", 7)
("'way", 8)
('(I', 9)
('(Though', 10)
(')', 11)
(',', 12)
('--', 13)
('.', 14)
(':', 15)
(';', 16)
('?', 17)
('A', 18)
('Ah', 19)
('Among', 20)


## SimpleTokenizer

In [7]:
class SimpleTokenizer:
    def __init__(self,vocab):
        self.str_to_int=vocab
        self.int_to_str = {i:s for s,i in vocab.items()}
    
    def encoder(self,text):
        token=re.split(r'([,;:".}{]?|--|\s)',text)
        token=[item.strip() for item in token if item.strip()]
        token_ids=[self.str_to_int[s] for s in token]
        return token_ids
    
    def decoder(self,ids):
        text = " ".join([self.int_to_str[i] for i in ids])
         # Replace spaces before the specified punctuations
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
        return text

In [8]:
# verdict_tokenizer = SimpleTokenizer(vocab)
# ids=verdict_tokenizer.encoder(raw_text)

## Add <|endoftext|> 
- Help to join to sentence 

In [9]:
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."
text=" <|endoftext|> ".join((text1,text2))
print(text)

Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.


### BYTE PAIR ENCODING (BPE)   [BY using tiktoken]

In [10]:
# pip install tiktoken
import tiktoken as tik
model=["gpt2","gpt3","gpt4"]
gpt_vocab=[tik.get_encoding("gpt2"),tik.get_encoding("p50k_base"),tik.get_encoding("cl100k_base")]

In [11]:
for i, (model_name, vocab) in enumerate(zip(model, gpt_vocab)):
    print(f"The vocabulary size for {model_name} is: {vocab.n_vocab}")
    

The vocabulary size for gpt2 is: 50257
The vocabulary size for gpt3 is: 50281
The vocabulary size for gpt4 is: 100277


## DataLoader

In [12]:
from torch.utils.data import Dataset, DataLoader


In [13]:
class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []

        # Tokenize the entire text
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # Use a sliding window to chunk the book into overlapping sequences of max_length
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i + 1: i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]
        

In [14]:
def create_dataloader_v1(txt, batch_size=4, max_length=256, 
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0):
    
    tokenizer=tik.get_encoding("gpt2")
    
    dataset=GPTDatasetV1(txt, tokenizer, max_length, stride)
    
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers
    )

    return dataloader

In [15]:
dataloader = create_dataloader_v1(
    raw_text, batch_size=3, max_length=4, stride=1, shuffle=False
)


In [16]:

for input_ids, target_ids in dataloader:
    print("Input IDs:", input_ids)
    print("Target IDs:", target_ids)
    break

Input IDs: tensor([[  40,  367, 2885, 1464],
        [ 367, 2885, 1464, 1807],
        [2885, 1464, 1807, 3619]])
Target IDs: tensor([[ 367, 2885, 1464, 1807],
        [2885, 1464, 1807, 3619],
        [1464, 1807, 3619,  402]])


### CREATING TOKEN EMBEDDINGS

In [17]:
vocab_size=6
output_dim=3

torch.manual_seed(123)
#####====>  IMP: embedding_layer  is not a list of tensor 
embedding_layer = torch.nn.Embedding(vocab_size, output_dim)  ## Create some random weights 
print(embedding_layer.weight)    ## PrInt that random weights
a=embedding_layer.weight       ## It is a list of tensor


Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)


In [18]:
print(embedding_layer(torch.tensor([3])))   ## print the 4th row of the embedding weight

tensor([[-0.4015,  0.9666, -1.1481]], grad_fn=<EmbeddingBackward0>)


### POSITIONAL EMBEDDINGS (ENCODING WORD POSITIONS)

In [19]:
## Let craete the embedding as like GPT 2
vocab_size=50257
output_dim=256

token_embedding_layer=torch.nn.Embedding(vocab_size,output_dim)

In [20]:
print(token_embedding_layer.weight)
print(token_embedding_layer.weight.shape)

Parameter containing:
tensor([[-2.1338,  1.0524, -0.3885,  ...,  0.2461,  1.2119,  0.3171],
        [ 1.2277, -0.4297, -2.2121,  ..., -0.1640, -0.3348, -0.0221],
        [ 1.3382,  0.2706,  0.5071,  ...,  0.0175, -2.1517,  0.3924],
        ...,
        [-1.4889, -1.2456,  1.8034,  ..., -0.6392, -1.4939,  0.3614],
        [-1.0703,  0.2795, -0.2637,  ..., -0.2810, -1.4755, -0.1183],
        [-0.0071,  0.4982, -0.3319,  ...,  0.4970,  0.9365, -0.2091]],
       requires_grad=True)
torch.Size([50257, 256])


In [21]:
dataloader=dataloader = create_dataloader_v1(
    raw_text, batch_size=8, max_length=4,
    stride=4, shuffle=False)
count=0
for input_id,target_id in dataloader:
    # print(f"Input ID : {input_id}")
    # print(f"Input ID : {target_id}")
    count=count+1
    

print(count)  ## len(token_ids) = num_samples × stride + max_length = 1280 × 4 + 4 = 5124

160


In [22]:
for input_id,target_id in dataloader:
    print(f"Input ID : {input_id}")
    print(f"Input ID : {target_id}")
    break

Input ID : tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])
Input ID : tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])


## Positional Embedding

In [23]:
context_length = 4
output_dim=256
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)

In [24]:
print(pos_embedding_layer.weight)
pos_embeddings = pos_embedding_layer(torch.arange(4))
print(pos_embeddings.shape)

Parameter containing:
tensor([[-0.6303, -0.4848, -0.1366,  ...,  1.0345, -0.5012,  1.1045],
        [ 0.2062,  0.6078,  0.7187,  ..., -0.4628, -0.2319,  1.1980],
        [ 0.5806, -1.3846,  0.3266,  ...,  0.8579,  0.5059,  1.0243],
        [ 1.4323,  0.2217,  0.8599,  ...,  0.4827,  0.8459,  1.3038]],
       requires_grad=True)
torch.Size([4, 256])


In [25]:
data_iter = iter(dataloader)
inputs, targets = next(data_iter)
token_embeddings = token_embedding_layer(inputs)
print(token_embeddings.shape)

torch.Size([8, 4, 256])


In [26]:
input_embeddings = token_embeddings + pos_embeddings
print(f"Shape of token Embedding  {token_embeddings.shape}")
print(f"Shape of pos Embedding  {pos_embeddings.shape}")   ## Braodcast the row
print(f"Shape of input Embedding  {input_embeddings.shape}")


Shape of token Embedding  torch.Size([8, 4, 256])
Shape of pos Embedding  torch.Size([4, 256])
Shape of input Embedding  torch.Size([8, 4, 256])


##  ATTENTION MECHANISM

In [27]:
import torch

input = torch.tensor(
  [[0.43, 0.15, 0.89], # Your     (x^1)
   [0.55, 0.87, 0.66], # journey  (x^2)
   [0.57, 0.85, 0.64], # starts   (x^3)
   [0.22, 0.58, 0.33], # with     (x^4)
   [0.77, 0.25, 0.10], # one      (x^5)
   [0.05, 0.80, 0.55]] # step     (x^6)
)
# print(input[2])

In [28]:
## Let print attention score
query =input[2] # start

def atten_score(input,query):
    att_score=torch.empty(input.shape[0])
    for i,x_i in enumerate(input):
        att_score[i] =torch.dot(x_i,query)   ## Simple attention only by dot product 
    return att_score
att_score=atten_score(input,query)
print(att_score)
    

tensor([0.9422, 1.4754, 1.4570, 0.8296, 0.7154, 1.0605])


<strong>The main goal behind the normalization</strong> is to obtain attention weights
  that sum up to 1.

  This normalization is a convention that is useful for interpretation and for
  maintaining training stability in an LLM

  Here's a straightforward method for achieving this
  normalization step:


In [29]:
attn_weights_2_tmp = att_score / att_score.sum()

print("Attention weights:", attn_weights_2_tmp)
print("Sum:", attn_weights_2_tmp.sum())

Attention weights: tensor([0.1454, 0.2277, 0.2248, 0.1280, 0.1104, 0.1637])
Sum: tensor(1.)


In [30]:
## change into softmax attention
def softmax_att(x):
    return torch.exp(x)/torch.exp(x).sum(dim=0)
atten_soft_score=softmax_att(att_score)
print(f"Attention weight after softmax = {atten_soft_score} ")

Attention weight after softmax = tensor([0.1390, 0.2369, 0.2326, 0.1242, 0.1108, 0.1565]) 


## Attention Matrix

In [31]:
import time
start = time.time()  ## Starting time
att_table=torch.empty(6,6) ## Create an empty matrix
for i,x_i in enumerate(input):
    query=input[i]
    att_table[i]=atten_score(input,query)
time.sleep(1)  ## Increase 1 Sec
end = time.time()  ## end time
print(att_table)
print(f"Time taken to run the loop {end-start} Sec")

tensor([[0.9995, 0.9544, 0.9422, 0.4753, 0.4576, 0.6310],
        [0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865],
        [0.9422, 1.4754, 1.4570, 0.8296, 0.7154, 1.0605],
        [0.4753, 0.8434, 0.8296, 0.4937, 0.3474, 0.6565],
        [0.4576, 0.7070, 0.7154, 0.3474, 0.6654, 0.2935],
        [0.6310, 1.0865, 1.0605, 0.6565, 0.2935, 0.9450]])
Time taken to run the loop 1.0073270797729492 Sec


In [32]:
## Attention Score by using Matrix Multiplication
start=time.time()
attn_table=input @ input.T
time.sleep(1)
end=time.time()
print(att_table)
print(f"Time taken to run the loop {end-start} Sec")


tensor([[0.9995, 0.9544, 0.9422, 0.4753, 0.4576, 0.6310],
        [0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865],
        [0.9422, 1.4754, 1.4570, 0.8296, 0.7154, 1.0605],
        [0.4753, 0.8434, 0.8296, 0.4937, 0.3474, 0.6565],
        [0.4576, 0.7070, 0.7154, 0.3474, 0.6654, 0.2935],
        [0.6310, 1.0865, 1.0605, 0.6565, 0.2935, 0.9450]])
Time taken to run the loop 1.0011286735534668 Sec


## Applying direct softmax

In [33]:
## Attention weight matrix
attn_weights = torch.softmax(attn_table, dim=-1)
print(attn_weights)

tensor([[0.2098, 0.2006, 0.1981, 0.1242, 0.1220, 0.1452],
        [0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581],
        [0.1390, 0.2369, 0.2326, 0.1242, 0.1108, 0.1565],
        [0.1435, 0.2074, 0.2046, 0.1462, 0.1263, 0.1720],
        [0.1526, 0.1958, 0.1975, 0.1367, 0.1879, 0.1295],
        [0.1385, 0.2184, 0.2128, 0.1420, 0.0988, 0.1896]])


In [34]:
## Now find all the context vector
all_context_vec=attn_weights @ input
print(all_context_vec)

tensor([[0.4421, 0.5931, 0.5790],
        [0.4419, 0.6515, 0.5683],
        [0.4431, 0.6496, 0.5671],
        [0.4304, 0.6298, 0.5510],
        [0.4671, 0.5910, 0.5266],
        [0.4177, 0.6503, 0.5645]])


## IMPLEMENTING SELF ATTENTION WITH TRAINABLE WEIGHTS(Key ,Query,Value)

In [35]:
print(input.shape)
d_in=input.shape[1]
d_out=2

torch.Size([6, 3])


In [36]:
# torch.manual_seed(123)
w_query=torch.nn.Parameter(torch.rand(d_in,d_out),requires_grad=False)
w_key=torch.nn.Parameter(torch.rand(d_in,d_out),requires_grad=False)
w_value=torch.nn.Parameter(torch.rand(d_in,d_out),requires_grad=False)

In [37]:
print(f"w_query Matrix {w_query} ")
print(f"w_key Matrix {w_key} ")
print(f"w_value Matrix {w_value} ")

w_query Matrix Parameter containing:
tensor([[0.4156, 0.0014],
        [0.8157, 0.1645],
        [0.4039, 0.8877]]) 
w_key Matrix Parameter containing:
tensor([[0.3615, 0.7642],
        [0.2305, 0.5352],
        [0.1136, 0.5045]]) 
w_value Matrix Parameter containing:
tensor([[0.2898, 0.9282],
        [0.8003, 0.9874],
        [0.7082, 0.1279]]) 


In [38]:
## Let's take the x2 from the and find the corresponding Key ,query and value 
x_2=input[1]   ## this is for single value x2
q_2=x_2 @ w_query
v_2=x_2 @ w_key
print(q_2)

tensor([1.2049, 0.7297])


In [39]:
## Now find key , query and value for the all inputs
keys=input @ w_key
values=input @ w_value
queries=input @ w_query

print("keys Matrix:", keys)

print("values Matrix:", values)

print("queries Matrix:", queries)

keys Matrix: tensor([[0.2911, 0.8579],
        [0.4743, 1.2189],
        [0.4747, 1.2134],
        [0.2507, 0.6451],
        [0.3473, 0.7727],
        [0.2650, 0.7439]])
values Matrix: tensor([[0.8750, 0.6611],
        [1.3231, 1.4539],
        [1.2987, 1.4502],
        [0.7616, 0.8191],
        [0.4941, 0.9743],
        [1.0442, 0.9067]])
queries Matrix: tensor([[0.6606, 0.8153],
        [1.2049, 0.7297],
        [1.1888, 0.7087],
        [0.6978, 0.3887],
        [0.5644, 0.1310],
        [0.8955, 0.6199]])


In [40]:
## Let find the attention score for the x2 dot(q_2,k_2)
key_2=keys[1]
att_s2=q_2.dot(key_2)
print(att_s2)

tensor(1.4610)


In [41]:
## finding the attention score for the all the iputs
attn_scores = queries @ keys.T # omega
print(attn_scores)

tensor([[0.8917, 1.3071, 1.3029, 0.6915, 0.8594, 0.7815],
        [0.9768, 1.4610, 1.4574, 0.7728, 0.9824, 0.8621],
        [0.9541, 1.4278, 1.4243, 0.7552, 0.9605, 0.8422],
        [0.5366, 0.8048, 0.8029, 0.4257, 0.5427, 0.4740],
        [0.2766, 0.4273, 0.4268, 0.2260, 0.2972, 0.2470],
        [0.7925, 1.1804, 1.1773, 0.6244, 0.7900, 0.6984]])


In [42]:
d_k = keys.shape[-1]
attn_weights = torch.softmax(attn_scores / d_k**0.5,dim=-1)   ## divided by sqrt of key dimension
print(attn_weights)
print(d_k)

tensor([[0.1551, 0.2080, 0.2074, 0.1346, 0.1516, 0.1434],
        [0.1514, 0.2132, 0.2127, 0.1311, 0.1520, 0.1396],
        [0.1518, 0.2121, 0.2116, 0.1318, 0.1524, 0.1402],
        [0.1587, 0.1918, 0.1916, 0.1467, 0.1594, 0.1518],
        [0.1617, 0.1799, 0.1798, 0.1560, 0.1641, 0.1584],
        [0.1550, 0.2040, 0.2035, 0.1377, 0.1548, 0.1451]])
2


## IMPLEMENTING A COMPACT SELF ATTENTION PYTHON CLASS

In [43]:
import torch.nn as nn

class SelfAttention(nn.Module):
    def __init__(self,d_in,d_out):
        super().__init__()
        self.w_query=nn.Parameter(torch.rand(d_in,d_out))
        self.w_key=nn.Parameter(torch.rand(d_in,d_out))
        self.w_value=nn.Parameter(torch.rand(d_in,d_out))
    
    def forward(self,x):
        query =x @ self.w_query
        key=x @ self.w_key
        value=x @ self.w_value
        
        att_score=query @ keys.T 
        att_weight=torch.softmax(att_score/key.shape[-1]**0.5,dim=-1)
        
        context_vec=att_weight @ value
        
        return context_vec

# print(input.shape)

In [44]:
Self_att_cls = SelfAttention(d_in, d_out)
print(Self_att_cls(input))

tensor([[0.8720, 0.8265],
        [0.8921, 0.8483],
        [0.8917, 0.8479],
        [0.8695, 0.8238],
        [0.8707, 0.8250],
        [0.8750, 0.8297]], grad_fn=<MmBackward0>)


 ## CAUSAL ATTENTION

In [45]:
attn_weights

tensor([[0.1551, 0.2080, 0.2074, 0.1346, 0.1516, 0.1434],
        [0.1514, 0.2132, 0.2127, 0.1311, 0.1520, 0.1396],
        [0.1518, 0.2121, 0.2116, 0.1318, 0.1524, 0.1402],
        [0.1587, 0.1918, 0.1916, 0.1467, 0.1594, 0.1518],
        [0.1617, 0.1799, 0.1798, 0.1560, 0.1641, 0.1584],
        [0.1550, 0.2040, 0.2035, 0.1377, 0.1548, 0.1451]])

In [46]:
## applying causal attention
mask=torch.tril(torch.ones(attn_weights.shape))
print(f"MASK :{mask}")
causal_weight=mask*attn_weights
print(f"CAUSAL WEIGHT :{causal_weight}")
causal_norm=causal_weight/causal_weight.sum(dim=-1,keepdim=True)
print(causal_norm) ## ReNormailazation

MASK :tensor([[1., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1.]])
CAUSAL WEIGHT :tensor([[0.1551, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1514, 0.2132, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1518, 0.2121, 0.2116, 0.0000, 0.0000, 0.0000],
        [0.1587, 0.1918, 0.1916, 0.1467, 0.0000, 0.0000],
        [0.1617, 0.1799, 0.1798, 0.1560, 0.1641, 0.0000],
        [0.1550, 0.2040, 0.2035, 0.1377, 0.1548, 0.1451]])
tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.4152, 0.5848, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2637, 0.3686, 0.3677, 0.0000, 0.0000, 0.0000],
        [0.2304, 0.2785, 0.2781, 0.2130, 0.0000, 0.0000],
        [0.1922, 0.2138, 0.2137, 0.1854, 0.1950, 0.0000],
        [0.1550, 0.2040, 0.2035, 0.1377, 0.1548, 0.1451]])


## NOTE:
- But after re Norm still there are some effect of previous value so , change the 0 element into -inf before applying softmax

In [47]:
mask=torch.triu(torch.ones(attn_weights.shape),diagonal=1)
causal_weight=causal_weight.masked_fill(mask.bool(),-torch.inf)

In [48]:
causal_weight

tensor([[0.1551,   -inf,   -inf,   -inf,   -inf,   -inf],
        [0.1514, 0.2132,   -inf,   -inf,   -inf,   -inf],
        [0.1518, 0.2121, 0.2116,   -inf,   -inf,   -inf],
        [0.1587, 0.1918, 0.1916, 0.1467,   -inf,   -inf],
        [0.1617, 0.1799, 0.1798, 0.1560, 0.1641,   -inf],
        [0.1550, 0.2040, 0.2035, 0.1377, 0.1548, 0.1451]])

In [49]:
## Now apply softmax
C_attn_wgt=torch.softmax(causal_weight/keys.shape[-1]**0.5 ,dim=-1)

In [50]:
C_attn_wgt

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.4891, 0.5109, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3240, 0.3381, 0.3380, 0.0000, 0.0000, 0.0000],
        [0.2476, 0.2535, 0.2534, 0.2455, 0.0000, 0.0000],
        [0.1991, 0.2016, 0.2016, 0.1983, 0.1994, 0.0000],
        [0.1653, 0.1711, 0.1710, 0.1633, 0.1652, 0.1641]])

## DROPOUT

In [51]:
import torch
one_tensor = torch.ones(6, 6)
print(one_tensor)


tensor([[1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1.]])


When applying dropout to an attention weight matrix with a rate of 50%, half of the
elements in the matrix are randomly set to zero. 

To compensate for the reduction in active
elements, the values of the remaining elements in the matrix are scaled up by a factor of
1/0.5 =2. 

This scaling is crucial to maintain the overall balance of the attention weights,
ensuring that the average influence of the attention mechanism remains consistent during
both the training and inference phases.


In [52]:
dropout=torch.nn.Dropout(0.5)
print(dropout(one_tensor))

tensor([[0., 2., 0., 0., 0., 0.],
        [2., 0., 0., 2., 0., 0.],
        [0., 2., 2., 0., 0., 0.],
        [2., 0., 0., 0., 0., 2.],
        [0., 0., 0., 2., 2., 0.],
        [2., 0., 2., 2., 0., 0.]])


In [53]:
import torch.nn as nn

class CausalAttention(nn.Module):
    def __init__(self, d_in, d_out, context_length, dropout, qkv_bias=False):
        super().__init__()
        self.d_out = d_out
        self.w_query = nn.Linear(d_in, d_out, bias=qkv_bias) ## output = x @ W_query.T + b_query
        self.w_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.w_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.mask = torch.triu(torch.ones(context_length, context_length), diagonal=1)
        self.dropout = nn.Dropout(dropout) if isinstance(dropout, float) else dropout  # expects a nn.Dropout instance or float

    def forward(self, x):
        b, num_token, d_in = x.shape   ## B is the number of batches 
        query = self.w_query(x)
        key = self.w_key(x)
        value = self.w_value(x)

        att_score = query @ key.transpose(1, 2)  #keys.shape = (batch_size, seq_len, d_out) ==>keys.transpose(1, 2).shape = (batch_size, d_out, seq_len)
        att_score.masked_fill
        att_score = att_score.masked_fill(
            self.mask[:num_token, :num_token].bool(), float('-inf')
        ) 
        attn_weights = torch.softmax(
            att_score / key.shape[-1]**0.5, dim=-1
        )
        attn_weights = self.dropout(attn_weights)

        context_vec = attn_weights @ value
        return context_vec

Causal_attn = CausalAttention(3, 2, 6, 0.0)   ## Context_length = Number of token
inputs = torch.tensor(
  [[0.43, 0.15, 0.89], # Your     (x^1)
   [0.55, 0.87, 0.66], # journey  (x^2)
   [0.57, 0.85, 0.64], # starts   (x^3)
   [0.22, 0.58, 0.33], # with     (x^4)
   [0.77, 0.25, 0.10], # one      (x^5)
   [0.05, 0.80, 0.55]] # step     (x^6)
)
batch = torch.stack((inputs, inputs), dim=0)  ## 2 batch
Causal_attn(batch)

tensor([[[ 0.3762, -0.3049],
         [ 0.4803, -0.4585],
         [ 0.5127, -0.5059],
         [ 0.4626, -0.4655],
         [ 0.4414, -0.4385],
         [ 0.4323, -0.4399]],

        [[ 0.3762, -0.3049],
         [ 0.4803, -0.4585],
         [ 0.5127, -0.5059],
         [ 0.4626, -0.4655],
         [ 0.4414, -0.4385],
         [ 0.4323, -0.4399]]], grad_fn=<UnsafeViewBackward0>)

## MultiHead Attention

In [54]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
        super().__init__()
        self.d_out=d_out
        self.num_heads=num_heads
        self.head_dim=d_out//num_heads
        
        self.w_query=nn.Linear(d_in,d_out,bias=qkv_bias)
        self.w_key=nn.Linear(d_in,d_out,bias=qkv_bias)
        self.w_value=nn.Linear(d_in,d_out,bias=qkv_bias)
        self.out_proj = nn.Linear(d_out, d_out)  # Linear layer to combine head outputs
        self.dropout = nn.Dropout(dropout)
        self.register_buffer(
            "mask",
            torch.triu(torch.ones(context_length, context_length),
                       diagonal=1)
        )
        
    def forward(self, x):
        b, num_token, d_in = x.shape
        key=self.w_key(x)
        query=self.w_query(x)
        value=self.w_value(x)
        
            # Unroll last dim: (b, num_tokens, d_out) -> (b, num_tokens, num_heads, head_dim)
        keys=key.view(b,num_token,self.num_heads,self.head_dim)
        queries=query.view(b,num_token,self.num_heads,self.head_dim)
        values=value.view(b,num_token,self.num_heads,self.head_dim)
        
            # Transpose: (b, num_tokens, num_heads, head_dim) -> (b, num_heads, num_tokens, head_dim)
            
        # keys=keys.veiw(b,self.num_heads,num_token,self.head_dim)
        # values=values.veiw(b,self.num_heads,num_token,self.head_dim)
        # queries=queries.veiw(b,self.num_heads,num_token,self.head_dim)
            
        keys = keys.transpose(1, 2)
        queries = queries.transpose(1, 2)
        values = values.transpose(1, 2)    ## same as above
        
        att_score=queries @ keys.transpose(2,3)  # Dot product for each head
        mask_bool = self.mask.bool()[:num_token, :num_token]
        att_score.masked_fill_(mask_bool,-torch.inf)           
        att_weight=torch.softmax(att_score/key.shape[-1]**0.5 ,dim=-1)
        
        att_weight=self.dropout(att_weight)
        
        context_vec=att_weight @ values 
        
        ## Back to original shape  : (b, num_tokens, num_heads, head_dim)
        
        context_vec = context_vec.view(b, num_token, self.d_out)
        
        context_vec = self.out_proj(context_vec) 
        return context_vec 
        

In [55]:
## Apply the above class 
inputs = torch.tensor(
    [[0.67, 0.78, 0.79, 0.70, 0.14, 0.66],  # Row 1
     [0.87, 0.34, 0.74, 0.12, 0.54, 0.35],  # Row 2
     [0.77, 0.25, 0.10, 0.05, 0.80, 0.55]]  # Row 3
)

input=torch.stack((inputs,inputs),dim=0)
d_out=6
batch,context_len,d_in=input.shape
Muti_head_att=MultiHeadAttention(d_in, d_out, context_length, 0.0, num_heads=2)
cont_vec = Muti_head_att(input)
print(cont_vec)

tensor([[[-0.0866, -0.0584,  0.7947,  0.4610,  0.0820, -0.1711],
         [-0.1763, -0.1528,  0.6682,  0.1842,  0.2418,  0.0166],
         [ 0.0903, -0.2794,  0.2088,  0.0811,  0.3074,  0.2633]],

        [[-0.0866, -0.0584,  0.7947,  0.4610,  0.0820, -0.1711],
         [-0.1763, -0.1528,  0.6682,  0.1842,  0.2418,  0.0166],
         [ 0.0903, -0.2794,  0.2088,  0.0811,  0.3074,  0.2633]]],
       grad_fn=<ViewBackward0>)


## IMPLEMENTING A GPT MODEL FROM SCRATCH TO GENERATE TEXT

In [56]:
GPT_CONFIG_124M = {
    "vocab_size": 50257,    # Vocabulary size
    "context_length": 1024, # Context length
    "emb_dim": 768,         # Embedding dimension
    "n_heads": 12,          # Number of attention heads
    "n_layers": 12,         # Number of layers
    "drop_rate": 0.1,       # Dropout rate
    "qkv_bias": False       # Query-Key-Value bias
}

In [57]:
## Tokenization
import tiktoken
tokenizer=tiktoken.get_encoding("gpt2")
# text="How are you"
# print(tokenizer.encode(text))

## Layer Normalization

In [58]:
batch=torch.randn(2,5)
layer=nn.Sequential(nn.Linear(5,6),nn.ReLU())
output=layer(batch)
print(batch.shape)

torch.Size([2, 5])


Let us apply layer normalization to the layer outputs we obtained earlier. The
operation consists of subtracting the mean and dividing by the square root of the variance
(also known as standard deviation):


In [59]:
mean=output.mean(dim=-1,keepdim=True)
var=output.var(dim=-1,keepdim=True)
Norm_out=(output-mean)/torch.sqrt(var)

In [60]:
print(f"Mean after Norm :{Norm_out.mean(dim=-1)}")   ## Mean are very close to zero
print(f"Varinace after Norm :{Norm_out.var(dim=-1)}")  ## Variance =1

Mean after Norm :tensor([ 1.9868e-08, -1.4901e-08], grad_fn=<MeanBackward1>)
Varinace after Norm :tensor([1.0000, 1.0000], grad_fn=<VarBackward0>)



Note that the value 2.9802e-08 in the output tensor is the scientific notation for 2.9802 ×
10-8, which is 0.0000000298 in decimal form. This value is very close to 0, but it is not
exactly 0 due to small numerical errors that can accumulate because of the finite precision
with which computers represent numbers.

In [61]:
## Let's Define the LayerNorm Class
class LayerNorm(nn.Module):
    def __init__(self,embd_dim):
        super().__init__()
        self.eps=1e-7
        self.scale = nn.Parameter(torch.ones(embd_dim))  ## It is trainable parameter
        self.shift = nn.Parameter(torch.zeros(embd_dim)) ## It is also trainable parameter
    
    def forward(self,x):
        mean=x.mean(dim=-1,keepdim=True)
        var=x.var(dim=-1,keepdim=True)
        norm_x= (x-mean)/torch.sqrt(var+self.eps)
        norm_x=norm_x * self.scale +self.shift
        return norm_x
    

## FEEDFORWARD NEURAL NETWORK WITH GELU ACTIVATION

In [62]:
## Lets Define 
class GELU(nn.Module):
    def __init__(self):
        super().__init__()
    
    def forward(self,x):
        result=0.5 * x * (1 + torch.tanh(
            torch.sqrt(torch.tensor(2.0 / torch.pi)) * 
            (x + 0.044715 * torch.pow(x, 3))))
        return result
    
    

Next, let's use the GELU function to implement the small neural network module,
FeedForward, that we will be using in the LLM's transformer block later:

In [63]:
class FeedForward(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(cfg["emb_dim"], 4 * cfg["emb_dim"]),
            GELU(),
            nn.Linear(4 * cfg["emb_dim"], cfg["emb_dim"]),
        )

    def forward(self, x):
        return self.layers(x)

## SHORTCUT CONNECTIONS

In [64]:
import torch.nn as nn
class DeepNeuralNetwork(nn.Module):
    def __init__(self,layer_sizes,use_shortcut):
        super().__init__()
        self.use_shortcut = use_shortcut
        self.layers = nn.ModuleList([
            nn.Sequential(nn.Linear(layer_sizes[0], layer_sizes[1]), GELU()),
            nn.Sequential(nn.Linear(layer_sizes[1], layer_sizes[2]), GELU()),
            nn.Sequential(nn.Linear(layer_sizes[2], layer_sizes[3]), GELU()),
            nn.Sequential(nn.Linear(layer_sizes[3], layer_sizes[4]), GELU()),
            nn.Sequential(nn.Linear(layer_sizes[4], layer_sizes[5]), GELU())
        ])
    def forward(self,x):
        for layer in self.layers:
            layer_out=layer(x)
            ## check is the sahpe are same or not to apply the shortcut
            if self.use_shortcut and x.shape== layer_out.shape:
                x=x+layer_out
            else:
                x=layer_out
            return x

Let's use this code to first initialize a neural network without shortcut connections. Here,
each layer will be initialized such that it accepts an example with 3 input values and returns
3 output values. The last layer returns a single output value:

In [65]:
layer_size=[3, 3, 3, 3, 2, 1]
sample_input = torch.tensor([[1., 0., -1.]])
model_without_shortcut = DeepNeuralNetwork(
layer_size, use_shortcut=False
)
model_with_shortcut=DeepNeuralNetwork(layer_size,use_shortcut=True)

In [66]:
for name, para in model_with_shortcut.named_parameters():
	print(name, para.shape)

layers.0.0.weight torch.Size([3, 3])
layers.0.0.bias torch.Size([3])
layers.1.0.weight torch.Size([3, 3])
layers.1.0.bias torch.Size([3])
layers.2.0.weight torch.Size([3, 3])
layers.2.0.bias torch.Size([3])
layers.3.0.weight torch.Size([2, 3])
layers.3.0.bias torch.Size([2])
layers.4.0.weight torch.Size([1, 2])
layers.4.0.bias torch.Size([1])


In [67]:
print(f"Output Without ShortCut {model_without_shortcut(sample_input)} ")
print(f"Output With Shortcut {model_with_shortcut(sample_input)}")

Output Without ShortCut tensor([[ 0.2821, -0.1635,  0.2824]], grad_fn=<MulBackward0>) 
Output With Shortcut tensor([[1.2008, 0.2174, 0.3048]], grad_fn=<AddBackward0>)


In [100]:
def check_grad(model, x):
    # Forward pass
    output = model(x)
    target = torch.tensor([[0.]])

    # Calculate loss based on how close the target
    # and output are
    loss = nn.MSELoss()
    loss = loss(output, target)
    
    # Backward pass to calculate the gradients
    loss.backward()

    for name, param in model.named_parameters():
        if 'weight' in name:
            if param.grad is not None:
                # Print the mean absolute gradient of the weights
                print(f"{name} has gradient mean of {param.grad.abs().mean().item()}")
            else:
                print(f"{name} has no gradient (param.grad is None)")

In [101]:
check_grad(model_without_shortcut, sample_input)

layers.0.0.weight has gradient mean of 0.2820483148097992
layers.1.0.weight has no gradient (param.grad is None)
layers.2.0.weight has no gradient (param.grad is None)
layers.3.0.weight has no gradient (param.grad is None)
layers.4.0.weight has no gradient (param.grad is None)


In [102]:
print(f"Output Without ShortCut Gradient :{check_grad(model_without_shortcut, sample_input)} ")
print(f"Output With Shortcut Gradient:  {check_grad(model_with_shortcut, sample_input)}")

layers.0.0.weight has gradient mean of 0.3525603711605072
layers.1.0.weight has no gradient (param.grad is None)
layers.2.0.weight has no gradient (param.grad is None)
layers.3.0.weight has no gradient (param.grad is None)
layers.4.0.weight has no gradient (param.grad is None)
Output Without ShortCut Gradient :None 
layers.0.0.weight has gradient mean of 0.20850218832492828
layers.1.0.weight has no gradient (param.grad is None)
layers.2.0.weight has no gradient (param.grad is None)
layers.3.0.weight has no gradient (param.grad is None)
layers.4.0.weight has no gradient (param.grad is None)
Output With Shortcut Gradient:  None


##  ATTENTION AND LINEAR LAYERS

In [70]:
class Transformer(nn.Module):
    def __init__(self,cfg):
        super().__init__()
   
        self.att = MultiHeadAttention(
            d_in=cfg["emb_dim"],
            d_out=cfg["emb_dim"],
            context_length=cfg["context_length"],
            num_heads=cfg["n_heads"], 
            dropout=cfg["drop_rate"],
            qkv_bias=cfg["qkv_bias"])
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(cfg["emb_dim"])
        self.norm2 = LayerNorm(cfg["emb_dim"])
        self.drop_shortcut = nn.Dropout(cfg["drop_rate"])
    def forward(self, x):
        # Shortcut connection for attention block
        shortcut = x
        x = self.norm1(x)
        x = self.att(x)
        x = self.drop_shortcut(x)
        x = x + shortcut

        shortcut = x
        x = self.norm2(x)
        x = self.ff(x)
        x = self.drop_shortcut(x)
        x = x + shortcut

        return x

  

In [71]:
torch.manual_seed(123)
x = torch.rand(2, 4, 768) #A
block = Transformer(GPT_CONFIG_124M)
output = block(x)
print("Input shape:", x.shape)
print("Output shape:", output.shape)

Input shape: torch.Size([2, 4, 768])
Output shape: torch.Size([2, 4, 768])


## GPT ARCHITECTURE

In [72]:
GPT_CONFIG_124M = {
    "vocab_size": 50257,    # Vocabulary size
    "context_length": 1024, # Context length
    "emb_dim": 768,         # Embedding dimension
    "n_heads": 12,          # Number of attention heads
    "n_layers": 12,         # Number of layers
    "drop_rate": 0.1,       # Dropout rate
    "qkv_bias": False       # Query-Key-Value bias
}

class GPTModel(nn.Module):
    def __init__(self,cfg):
        super().__init__()
        self.tok_emb=nn.Embedding(cfg["vocab_size"],cfg["emb_dim"])
        self.pos_emb=nn.Embedding(cfg["context_length"],cfg["emb_dim"])
        self.drop_emb=nn.Dropout(cfg["drop_rate"] )
        
        self.trf_block=nn.Sequential(
            *[Transformer(cfg) for _ in range(cfg["n_layers"])])
        self.final_norm=LayerNorm(cfg["emb_dim"])
        self.out_head=nn.Linear(cfg["emb_dim"],cfg["vocab_size"], bias=False)
        
    def forward(self,in_idx):
        batch_size,seq_len=in_idx.shape  ## INput index
        tok_emb = self.tok_emb(in_idx)   ## Convert the ids into embedding spaces
        # pos_emb= self.pos_emb(in_idx)
        device = in_idx.device  # to keep it on the same device (CPU or GPU)
        positions = torch.arange(seq_len, device=device).unsqueeze(0)  # shape: (1, seq_len)
        pos_emb = self.pos_emb(positions)  # shape: (1, seq_len, emb_dim)

        x=tok_emb + pos_emb
        x = self.trf_block(x)
        x = self.final_norm(x)
        logits = self.out_head(x)
        return logits      
            
        

In [76]:
batch=torch. tensor([[610, 3621,60,  3451],
        [610, 111, 622,  257]])

model=GPTModel(GPT_CONFIG_124M)
out=model(batch)


In [77]:
out

tensor([[[-0.2927,  0.4366,  0.5312,  ..., -0.5746,  0.6668,  0.1098],
         [ 1.5808,  1.3435,  0.5079,  ..., -0.2106,  0.0256, -0.0559],
         [-0.0593,  0.1581, -0.7858,  ...,  0.1746,  0.1069,  0.0903],
         [ 0.4270,  1.4716,  0.1388,  ..., -0.1685, -0.3926,  0.2356]],

        [[-0.0949,  0.4965,  0.7375,  ..., -0.7842,  0.6145,  0.4029],
         [ 0.0151,  1.0135, -0.5840,  ..., -0.4165, -0.4926, -0.3334],
         [ 0.9529,  0.0164, -0.4429,  ...,  0.0536,  0.6197,  0.5303],
         [ 1.3596,  0.8030,  0.0059,  ...,  0.1113, -0.1000, -0.0966]]],
       grad_fn=<UnsafeViewBackward0>)

In [78]:
print(torch.max(batch), GPT_CONFIG_124M["vocab_size"])


tensor(3621) 50257


## GENERATING TEXT FROM OUTPUT TOKENS
Step 1: idx is a (batch, n_tokens) array of indices in the current context

Step 2: Crop current context if it exceeds the supported context size E.g., if LLM supports only 5 tokens, and the
context size is 10 then only the last 5 tokens are used as context

Step 3: Focus only on the last time step, so that (batch, n_token, vocab_size) becomes (batch, vocab_size)

Step 4: probas has shape (batch, vocab_size)

Step 5: idx_next has shape (batch, 1)

Step 6: Append sampled index to the running sequence, where idx has shape (batch, n_tokens+1)

In [82]:
def generate_text_simple(model, idx, max_new_tokens, context_size):
    # idx is (batch, n_tokens) array of indices in the current context
    for _ in range(max_new_tokens):
        
        # Crop current context if it exceeds the supported context size
        # E.g., if LLM supports only 5 tokens, and the context size is 10
        # then only the last 5 tokens are used as context
        idx_cond = idx[:, -context_size:]
        
        # Get the predictions
        with torch.no_grad():
            logits = model(idx_cond)
        
        # Focus only on the last time step
        # (batch, n_tokens, vocab_size) becomes (batch, vocab_size)
        logits = logits[:, -1, :]  

        # Apply softmax to get probabilities
        probas = torch.softmax(logits, dim=-1)  # (batch, vocab_size)

        # Get the idx of the vocab entry with the highest probability value
        idx_next = torch.argmax(probas, dim=-1, keepdim=True)  # (batch, 1)

        # Append sampled index to the running sequence
        idx = torch.cat((idx, idx_next), dim=1)  # (batch, n_tokens+1)

    return idx

## Give a Input text 

In [88]:
start_context = "Hello, I am"
encoded = tokenizer.encode(start_context)
print("encoded:", encoded)
encoded_tensor = torch.tensor(encoded).unsqueeze(0) #A
print("encoded_tensor.shape:", encoded_tensor.shape)

encoded: [15496, 11, 314, 716]
encoded_tensor.shape: torch.Size([1, 4])


## Produce output ID'S

In [89]:
model.eval() #A
out = generate_text_simple(
model=model,
idx=encoded_tensor,
max_new_tokens=6,
context_size=GPT_CONFIG_124M
["context_length"]
)
print("Output:", out)
print("Output length:", len(out[0]))

Output: tensor([[15496,    11,   314,   716, 43400, 34877,  4139, 19528, 12083, 26108]])
Output length: 10


## Decode ID to text

In [90]:
decoded_text = tokenizer.decode(out.squeeze(0).tolist())
print(decoded_text)

Hello, I ambleretsy Minister backwardding Regulations
