#Imports:

In [None]:
import torch
from torch import nn

#Dataset:

In [None]:
# Mount Google Drive at /content/drive (needs the Colab runtime).
# NOTE(review): the next cell opens "tiny_shakespeare.txt" by relative path, not
# from /content/drive - confirm whether this mount is actually required.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Read the whole corpus into memory as one string (`text`) and preview its first 500 characters.
with open("tiny_shakespeare.txt",encoding='utf-8') as f:
  text=f.read()
print(text[:500])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor


In [None]:
# Observation about the text: total number of characters in the corpus.
# NOTE: `text_lenght` is a misspelling of "length"; the name is left unchanged here.
text_lenght=len(text)
print(f"Length of text file: {text_lenght}")

Length of text file: 1115394


In [None]:
# Character-level vocabulary: every distinct character of the corpus, in sorted order.
# `sorted` accepts any iterable, so the set can be passed to it directly.
characters=sorted(set(text))
vocab_size=len(characters)
print(f"Vocab Size: {vocab_size}")
print(f"Characters: {''.join(characters)}" )

Vocab Size: 65
Characters: 
 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz


In [None]:
# Tokenization strategy: character level, implemented with two lookup tables.
# Sub-word tokenizers (e.g. Google's sentencepiece, or tiktoken as used by GPT) are
# alternatives; here every single character is its own token.
# Note: this step is only indexing (character <-> integer id). It is not an embedding yet.
Chrac_2_idx={ch:i for i,ch in enumerate(characters)}  # character -> integer id
idx_2_Chrac={i:ch for i,ch in enumerate(characters)}  # integer id -> character


def encode(s):
  """Return the list of integer ids for the characters of string `s`.

  Raises KeyError if `s` contains a character that is not in `Chrac_2_idx`.
  """
  return [Chrac_2_idx[ch] for ch in s]


def decode(l):
  """Return the string spelled by the iterable of integer ids `l`.

  Raises KeyError if an id is not in `idx_2_Chrac`.
  """
  return ''.join(idx_2_Chrac[i] for i in l)


# Round-trip check: decode(encode(s)) should print s unchanged.
print(encode("Yashraj Sharma"))
print(decode(encode("Yashraj Sharma")))

[37, 39, 57, 46, 56, 39, 48, 1, 31, 46, 39, 56, 51, 39]
Yashraj Sharma


In [None]:
# Encode the entire corpus and store it as one 1-D tensor of integer token ids (dtype torch.long).
text_dataset=torch.tensor(encode(text),dtype=torch.long)
print(f"Dataset dtype: {text_dataset.dtype}")
print(f"Dataset shape: {text_dataset.shape}")
# Preview: the ids of the first 1000 characters.
print(text_dataset[:1000])

Dataset dtype: torch.int64
Dataset shape: torch.Size([1115394])
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 

##Dataset split for training and validation:

In [None]:
# Sequential hold-out split: the first 90% of the token stream is the training set,
# the remaining 10% is held out. The held-out part is named `test_data`, while the
# section heading calls it the validation set.
data_size=int(0.9*len(text_dataset))  # index where the training part ends
train_data=text_dataset[:data_size]
test_data=text_dataset[data_size:]
print(f"Lenght of train dataset: {len(train_data)}")
print(f"Lenght of test dataset: {len(test_data)}")

Lenght of train dataset: 1003854
Lenght of test dataset: 111540


In [None]:
# Training does not consume the whole corpus at once; it uses short chunks of it.
# Inside one chunk, the tokens up to position t form the context and the token
# that follows them is the prediction target.
block_size=8 # the model never looks at more than 8 past tokens
x=train_data[:block_size] # x[:t+1] is the context at step t
y=train_data[1:block_size+1] # x shifted by one position: y[t] is the correct next token
for t in range(block_size):
  context,target=x[:t+1],y[t]
  print(f"Context characters: {context}, Target Character: {target}")
# One chunk therefore provides block_size examples, with context lengths 1..block_size.

Context characters: tensor([18]), Target Character: 47
Context characters: tensor([18, 47]), Target Character: 56
Context characters: tensor([18, 47, 56]), Target Character: 57
Context characters: tensor([18, 47, 56, 57]), Target Character: 58
Context characters: tensor([18, 47, 56, 57, 58]), Target Character: 1
Context characters: tensor([18, 47, 56, 57, 58,  1]), Target Character: 15
Context characters: tensor([18, 47, 56, 57, 58,  1, 15]), Target Character: 47
Context characters: tensor([18, 47, 56, 57, 58,  1, 15, 47]), Target Character: 58


In [None]:
#Generalizing for parallel batch processing:
torch.manual_seed(1337)  # fixed seed so the sampled batch below is reproducible
batch_size=4     # number of independent sequences processed in parallel in one batch
block_size=8     # number of tokens in each sequence (the context length)

def batch_creation(split):
  """Sample one random mini-batch of (input, target) sequences.

  `split` selects the source: 'train' reads `train_data`, any other value reads
  `test_data`. The globals `batch_size` and `block_size` set the batch shape.

  Returns a pair (x, y) of (batch_size, block_size) tensors, where y is x
  shifted one position to the right in the source stream.
  """
  source=train_data if split=='train' else test_data
  # One random start offset per sequence; the upper bound leaves room for the shifted target.
  starts=torch.randint(len(source)-block_size,(batch_size,))
  contexts,targets=[],[]
  for start in starts:
    contexts.append(source[start:start+block_size])
    targets.append(source[start+1:start+block_size+1])
  return torch.stack(contexts),torch.stack(targets)


# Draw one training batch and show it.
# NOTE: the name `input` shadows Python's builtin input(); it is left unchanged
# because a later cell passes `input` and `tar` to the model.
input,tar=batch_creation('train')
print("inputs:")
print(input.shape)
print(input)
print("target:")
print(tar.shape)
print(tar)


print("----------------------------------------------------")


# Spell out every (context, target) training example contained in the batch:
# one per sequence b and per position t.
for b in range(batch_size):
  for t in range(block_size):
    context=input[b,:t+1]
    target=tar[b,t] # the example at position t uses tokens 0..t as context; the target is the token that follows them
    print(f"Inputs: {context.tolist()}, Target: {target}")

# What makes GPT autoregressive?
# Causal self-attention ensures that each token only sees itself and earlier tokens.

inputs:
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
target:
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])
----------------------------------------------------
Inputs: [24], Target: 43
Inputs: [24, 43], Target: 58
Inputs: [24, 43, 58], Target: 5
Inputs: [24, 43, 58, 5], Target: 57
Inputs: [24, 43, 58, 5, 57], Target: 1
Inputs: [24, 43, 58, 5, 57, 1], Target: 46
Inputs: [24, 43, 58, 5, 57, 1, 46], Target: 43
Inputs: [24, 43, 58, 5, 57, 1, 46, 43], Target: 39
Inputs: [44], Target: 53
Inputs: [44, 53], Target: 56
Inputs: [44, 53, 56], Target: 1
Inputs: [44, 53, 56, 1], Target: 58
Inputs: [44, 53, 56, 1, 58], Target: 46
Inputs: [44, 53, 56, 1, 58, 46], Target: 39
Inputs: [44, 53, 56, 1, 58, 46, 39], Target: 58

#Bi-gram Language Model:

In [None]:
torch.manual_seed(1337)
#Bigram: the prediction for the next token depends only on the current token.
#  current token -> next-token distribution
#The later goal is different: predict the next token using everything generated so far.
#That is the entire point of attention, which this bigram model does not have yet.
class BigramModel(nn.Module):
  """Bigram language model: the next-token logits depend only on the current token.

  The model is a single (vocab_size, vocab_size) lookup table. Row i holds the
  logits over the next token given that the current token is i.

  Args:
    vocab_size: number of distinct token ids.
  """

  def __init__(self,vocab_size):
    super().__init__()
    # Each token directly reads off the logits for the next token from this table.
    self.Token_embedding_table=nn.Embedding(vocab_size,vocab_size)

  def forward(self,idx,target=None):
    """Compute next-token logits and, when targets are given, the loss.

    Args:
      idx: (B,T) integer tensor of token ids.
      target: optional (B,T) integer tensor of next-token ids.

    Returns:
      (logits, loss). logits has shape (B,T,C) with C = vocab_size. loss is the
      mean cross-entropy as a scalar tensor, or the integer 0 when `target` is
      None (kept for backward compatibility with existing callers).
    """
    logits=self.Token_embedding_table(idx) # one table row per token id -> (B,T,C)
    # Identity check: `== None` on a tensor goes through tensor comparison rules.
    if target is None:
      return logits,0
    # cross_entropy expects (N,C) logits and (N,) targets, so fold batch and time
    # together. It applies log-softmax itself, so raw logits are passed in.
    # reshape (unlike view) also accepts a non-contiguous target.
    B,T,C=logits.shape
    loss=nn.functional.cross_entropy(logits.reshape(B*T,C),target.reshape(B*T))
    return logits,loss

  @torch.no_grad()  # sampling never needs gradients; avoids recording autograd history per step
  def generate(self,idx,max_token):
    """Extend each sequence in `idx` by `max_token` sampled tokens.

    Args:
      idx: (B,T) integer tensor holding the current context.
      max_token: number of new tokens to append.

    Returns:
      (B,T+max_token) integer tensor: the context followed by the samples.
    """
    for _ in range(max_token):
      logits,_loss=self(idx)
      # Only the last time step is needed to predict the next token: (B,C).
      last_logits=logits[:,-1,:]
      prob=torch.softmax(last_logits,dim=-1)
      # Sample one token id per sequence from the predicted distribution: (B,1).
      idx_next=torch.multinomial(prob,num_samples=1)
      idx=torch.cat((idx,idx_next),dim=1) # (B,T+1)
    return idx


# Build the untrained model and run it once on the sample batch.
bigram_model=BigramModel(vocab_size)
output_ar,loss=bigram_model(input,tar)
print(output_ar.shape)
print(loss)
# Sample 100 tokens from the untrained model, starting from a single token with id 0.
start_context=torch.zeros((1,1),dtype=torch.long)
generated=bigram_model.generate(start_context,max_token=100)
print(decode(generated[0].tolist()))

torch.Size([4, 8, 65])
tensor(4.8786, grad_fn=<NllLossBackward0>)

SKIcLT;AcELMoTbvZv C?nq-QE33:CJqkOKH-q;:la!oiywkHjgChzbQ?u!3bLIgwevmyFJGUGp
wnYWmnxKWWev-tDqXErVKLgJ


## Model Training:

In [None]:
# Optimizer: AdamW over all parameters of the bigram model, learning rate 1e-3.
# The loss is not defined here; it is computed inside BigramModel.forward.
optimizer=torch.optim.AdamW(bigram_model.parameters(),lr=1e-3)


In [None]:
batch_size=32 # batch_creation reads this global, so training uses batches of 32 sequences

for step in range(100_000):
  # Draw a fresh random mini-batch from the training split.
  xb,yb=batch_creation('train')

  # One optimisation step: clear old gradients, forward pass, backward pass, update.
  optimizer.zero_grad(set_to_none=True)
  logits,loss=bigram_model(xb,yb)
  loss.backward()
  optimizer.step()

# Loss of the final mini-batch only.
print(loss.item())

2.5319576263427734


In [None]:
# Sample from the trained model, starting from a single token with id 0.
# max_token is the number of new tokens the model generates.
start_context=torch.zeros((1,1),dtype=torch.long)
generated=bigram_model.generate(start_context,max_token=500)
print(decode(generated[0].tolist()))


KI, ghyone'lisurerat ms The Bulin the theid IAm VOMal fis,
OMarethay:

MESheelore f bu hasen, t 'Wharong t bu s yove tend n, I:
LO:
cthy cotscu
ICHEdee r chidots
Whe methare f mave.
TEYo.
Fo ou, t h Exf it se ventifitou, osingave brd Gilinousthathele w ak ad g torulor wrail Je t ts osh se t ay howoutreputeeed siveica he qureades't insecoof wheagheateril th anonswa tt ald d n sm; she CEr you s n ENES: s y wird
Whit; alothers ws t y cthn;
Whasery se h ofe fise pest, oun Mou thar tlul be fearsthee 


### So far, the model predicts the next token from the current token alone; the tokens do not communicate with each other. Next, we want each token to use the pattern and meaning of the whole preceding sequence.

# The Mathematical Trick in Self-Attention:

In [None]:
# Toy input for the averaging experiments below.
torch.manual_seed(1337)
B,T,C=4,8,2
x=torch.randn(B,T,C) # (batch, time/token position, channels)
x.shape
# The simplest way to let tokens communicate: for each position, average the
# channels of the current token and all earlier tokens.

torch.Size([4, 8, 2])

In [None]:
# Explicit-loop reference: xbow[b,t] is the mean of x[b,0..t].
xbow=torch.zeros((B,T,C))
for b in range(B): # average the current and all past tokens at every step
  for t in range(T):
    xprev=x[b,:t+1] # (t+1,C): tokens 0..t of sequence b
    xbow[b,t]=torch.mean(xprev,0) # mean over the time dimension -> (C,)
# This double Python loop is very inefficient; the following cells replace it with a matrix multiplication.

In [None]:
# version 2: the same running average as one matrix multiplication.
wei=torch.tril(torch.ones(T,T)) # lower-triangular ones: row t selects tokens 0..t
wei=wei/wei.sum(1,keepdim=True) # normalise each row to sum to 1, i.e. uniform averaging weights
xbow2=wei@x # (T,T) @ (B,T,C): the weight matrix is broadcast over the batch -> (B,T,C)
print(x[0])
print(xbow2[0])

tensor([[ 0.1808, -0.0700],
        [-0.3596, -0.9152],
        [ 0.6258,  0.0255],
        [ 0.9545,  0.0643],
        [ 0.3612,  1.1679],
        [-1.3499, -0.5102],
        [ 0.2360, -0.2398],
        [-0.9211,  1.5433]])
tensor([[ 0.1808, -0.0700],
        [-0.0894, -0.4926],
        [ 0.1490, -0.3199],
        [ 0.3504, -0.2238],
        [ 0.3525,  0.0545],
        [ 0.0688, -0.0396],
        [ 0.0927, -0.0682],
        [-0.0341,  0.1332]])


In [None]:
T = 8  # version 3: build the same averaging weights with a mask followed by softmax
tril = torch.tril(torch.ones(T, T))

# Start from all-zero scores, set every position above the diagonal to -inf, then
# apply softmax per row: masked entries become 0 and the rest share the weight equally.
wei = torch.zeros((T, T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = torch.softmax(wei, dim=-1)

wei


tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])

In [None]:
# Small 3x3 illustration of the matrix-multiplication trick.
# (This cell was labelled "version 1" although it appears after versions 2 and 3.)
torch.manual_seed(42)
a=torch.tril(torch.ones(3,3))
#tensor([[1., 0., 0.],
        #[1., 1., 0.],
        #[1., 1., 1.]])
# Interpretation:
# Row 0 -> can see token 0
# Row 1 -> can see tokens 0,1
# Row 2 -> can see tokens 0,1,2
# This is a causal mask.

# output = (row-normalised mask) x values
# This is self-attention without learning: the weights are fixed, not computed from the data.
# In the first row only token 0 is visible, so after normalisation it receives the full weight of 1.0.

# Row-wise normalisation makes all visible tokens contribute equally.

print(a)
print("--")
a=a/torch.sum(a,1,keepdim=True) # normalise each row so that it sums to 1
# normalised weights x value vectors (matrix multiplication)
b=torch.randint(0,10,(3,2)).float()
c=a@b # row t of c is the average of rows 0..t of b
print(a)
print('--')
print(b)
print('--')
print(c)
# Self-attention decides how much each visible past token contributes to the context used for the next-token prediction.

tensor([[1., 0., 0.],
        [1., 1., 0.],
        [1., 1., 1.]])
--
tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
--
tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
--
tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]])


past tokens
   ↓
embeddings
   ↓
attention (context distribution over past tokens)
   ↓
context vector
   ↓
linear head
   ↓
vocabulary distribution (next-token probabilities)


Attention decides which past tokens matter

Those tokens shape the context vector

The context vector determines which vocabulary tokens get high logits

# Here is the precise, correct flow of a GPT-style language model:

Input tokens (all past tokens, including the current one)
Token embedding
encodes what the token is
Positional embedding
encodes where the token is

Add token + position embeddings

gives an ordered sequence of vectors

Self-attention mechanism

computes how much each past token contributes

produces context vectors

Linear (LM) head

maps context vectors to vocabulary logits

Softmax

converts logits into a probability distribution

Next-token prediction

That is exactly the full pipeline.

# Each token at every position emits two vectors: a query ("What am I looking for?") and a key ("What do I contain?"). The affinity between two tokens is the dot product of one token's query with the other token's key.
So my query is dotted with the keys of all the tokens. The dot products become scores, which are then normalized via softmax to become attention weights.

In [51]:
# Implementing the attention mechanism on random data.
torch.manual_seed(1337)
B,T,C=4,8,32
x=torch.randn(B,T,C)

# Single head of self-attention:
head_size=16
key=nn.Linear(C,head_size,bias=False)
query=nn.Linear(C,head_size,bias=False)
value = nn.Linear(C, head_size, bias=False)
k=key(x) # (B,T,16)
q=query(x) # (B,T,16)
v=value(x) # (B,T,16): x stays private to the token; v is the information that is aggregated and flows forward
wei=q@k.transpose(-2,-1)*(head_size**-0.5) #(B,T,16) @ (B,16,T) --> (B,T,T)
# The 1/sqrt(head_size) scaling controls the variance of the dot-product scores at initialization.
tril=torch.tril(torch.ones(T,T))
wei=wei.masked_fill(tril==0,float('-inf')) # causal mask: positions above the diagonal (future tokens) get -inf
wei=torch.softmax(wei,dim=-1) # each row becomes a distribution over the visible tokens
out=wei@v # (B,T,T) @ (B,T,16) --> (B,T,16)

In [52]:
# Attention weights of the first sequence in the batch: (T,T), zero above the diagonal, each row sums to 1.
wei[0]

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3966, 0.6034, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3069, 0.2892, 0.4039, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3233, 0.2175, 0.2443, 0.2149, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1479, 0.2034, 0.1663, 0.1455, 0.3369, 0.0000, 0.0000, 0.0000],
        [0.1259, 0.2490, 0.1324, 0.1062, 0.3141, 0.0724, 0.0000, 0.0000],
        [0.1598, 0.1990, 0.1140, 0.1125, 0.1418, 0.1669, 0.1061, 0.0000],
        [0.0845, 0.1197, 0.1078, 0.1537, 0.1086, 0.1146, 0.1558, 0.1553]],
       grad_fn=<SelectBackward0>)

### The batch dimension is purely for parallel computation; each batch element is an independent sequence and never exchanges information with other batch elements.
# Each token can interact only with other tokens within the same batch element (i.e., within the same sequence).

### Encoder self-attention can see all tokens (past + future), while decoder self-attention can see only past and current tokens.

Conceptually, MultiHead Attention with n_embd=32:

Head 0: operates on 8 channels
Head 1: operates on 8 channels
Head 2: operates on 8 channels
Head 3: operates on 8 channels