<a href="https://colab.research.google.com/github/var-567/Ant.ai/blob/master/Ant_ai.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#**Data Preparation**
##*Scrapping data and preprocessing*

In [1]:
import requests
from bs4 import BeautifulSoup
import re

In [2]:
url = "https://gutenberg.net.au/ebooks05/0500071h.html#H5"
response = requests.get(url)

In [3]:
soup = BeautifulSoup(response.content, 'html.parser')

In [4]:
paragraphs = soup.find_all('p')
# paragraph variable contains the *List* of all the <p> tag contents.
# Check if there are at least 4 <p> tags
if len(paragraphs) >= 4:
    #scrap all the data after the fourth para to eleminate the links and store only the text.
    text_content = ''
    for paragraph in paragraphs[16:]:
      for i in paragraph.get_text():
        if ord(i) < 128 :  #remove all the non standard English characters
          text_content += i
        else:
          text_content += ''+ '\n\n'
      text_content += '\n\n'
    text_content = re.sub(r'^\d+\.', '',text_content, flags=re.MULTILINE)
    # Write the text content to a file
    with open('Cook_Book.txt', 'w', encoding='utf-8') as file:
        file.write(text_content)
else:
    print("There are fewer than 4 <p> tags on the page.")




#**Let's build a GPT model**
GPT-generative pretrained transformer.
```

It is an **Autoregressive** ,*Unidirectional ,Transformer Encoder* architecture.

##**Aim**
To build a model that generate recipes that mimic the style and flavors typical of Indian cuisine
basically to predict the next text in the sequence by context provided by the preceding words.

In [5]:
#find the number of unique characters in the data.
chars=sorted(list(set(text_content)))
vocab_size=len(chars)
print(''.join(chars))
print(vocab_size)
##now 72 is our vocabulary size.the possible character the model can generate.


 "&'(),-.0123456789:;ABCDEFGHIJKLMNOPQRSTUVWYabcdefghijklmnopqrstuvwxyz
72


Tokeizing - converting text to numericals.since machines uderstand only that.

In [6]:
#strategy to tokenize to input text,since we are building a character level language model

toint={ch:i for i,ch in enumerate(chars)}#assigns a number to every letter in the character array.
tochar={i:ch for i,ch in enumerate(chars)}
encode=lambda s:[toint[c] for c in s]
decode=lambda l:''.join([tochar[i] for i in l])
# print(encode("hello"))
# print(decode(encode("hello")))
#there are various ways to do this like with tiktoken or sentence peice etc.

###Encoding the data.since a character token level model is build this a character level ecoding is done.

In [7]:
import torch
data=torch.tensor(encode(text_content),dtype=torch.long)
print(data.shape,data.dtype)
print(data[:1000])

torch.Size([259911]) torch.int64
tensor([ 0,  0, 23, 60, 54, 57, 50, 49,  1, 39, 54, 48, 50,  0,  0,  0, 44, 46,
        64, 53,  1, 53, 46, 57, 51,  1, 46,  1, 61, 60, 66, 59, 49,  1, 60, 63,
         1, 46,  1, 48, 60, 60, 59, 56, 50, 50, 51, 66, 57,  1, 60, 51,  1, 63,
        54, 48, 50,  7,  1, 46, 59, 49,  1, 61, 66, 65,  1, 54, 65,  1, 65, 60,
         1, 47, 60, 54, 57,  1, 54, 59,  0, 46,  1, 57, 46, 63, 52, 50,  1, 62,
        66, 46, 59, 65, 54, 65, 70,  1, 60, 51,  1, 68, 46, 65, 50, 63,  7,  1,
        60, 67, 50, 63,  1, 46,  1, 47, 63, 54, 64, 56,  1, 51, 54, 63, 50,  9,
         1, 30, 58, 58, 50, 49, 54, 46, 65, 50, 57, 70,  1, 65, 53, 50,  1, 63,
        54, 48, 50,  0, 47, 50, 52, 54, 59, 64,  1, 65, 60,  1, 47, 60, 54, 57,
         7,  1, 65, 53, 50,  1, 68, 46, 65, 50, 63,  1, 68, 54, 57, 57,  1, 47,
        66, 47, 47, 57, 50,  1, 66, 61,  1, 65, 60,  1, 65, 53, 50,  1, 64, 66,
        63, 51, 46, 48, 50,  1, 60, 51,  1, 65, 53, 50,  1, 61, 60, 65,  0, 46,
       

##Spliting training and testing data.

In [8]:
n=int(0.9*len(text_content))
train_data=data[:n]
val_data=data[n:]
train_data[:1000]

tensor([ 0,  0, 23, 60, 54, 57, 50, 49,  1, 39, 54, 48, 50,  0,  0,  0, 44, 46,
        64, 53,  1, 53, 46, 57, 51,  1, 46,  1, 61, 60, 66, 59, 49,  1, 60, 63,
         1, 46,  1, 48, 60, 60, 59, 56, 50, 50, 51, 66, 57,  1, 60, 51,  1, 63,
        54, 48, 50,  7,  1, 46, 59, 49,  1, 61, 66, 65,  1, 54, 65,  1, 65, 60,
         1, 47, 60, 54, 57,  1, 54, 59,  0, 46,  1, 57, 46, 63, 52, 50,  1, 62,
        66, 46, 59, 65, 54, 65, 70,  1, 60, 51,  1, 68, 46, 65, 50, 63,  7,  1,
        60, 67, 50, 63,  1, 46,  1, 47, 63, 54, 64, 56,  1, 51, 54, 63, 50,  9,
         1, 30, 58, 58, 50, 49, 54, 46, 65, 50, 57, 70,  1, 65, 53, 50,  1, 63,
        54, 48, 50,  0, 47, 50, 52, 54, 59, 64,  1, 65, 60,  1, 47, 60, 54, 57,
         7,  1, 65, 53, 50,  1, 68, 46, 65, 50, 63,  1, 68, 54, 57, 57,  1, 47,
        66, 47, 47, 57, 50,  1, 66, 61,  1, 65, 60,  1, 65, 53, 50,  1, 64, 66,
        63, 51, 46, 48, 50,  1, 60, 51,  1, 65, 53, 50,  1, 61, 60, 65,  0, 46,
        59, 49,  1, 60, 67, 50, 63, 51, 

In [9]:
torch.manual_seed(1337)
batch_size=16
block_size=64

#--block size , batch size are Hyperparameter .
#batch size is the number of batches that runs in parallel.
#block size is the number of character that the character that we are going to predict gets context from.


In [10]:
# a utility function to get the sample batch to process.each batch of data is trained to
# data loading
def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

In [11]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

##A Single head of attention machanism.
The main part where the code every character gets context from the previous words.

###Hyperparameters

In [12]:
n_embd = 64
n_head = 4
n_layer = 4
dropout = 0.0
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [13]:
import torch.nn as nn
from torch.nn import functional as F

class Head(nn.Module):
    """ one head of self-attention """

    def __init__(self, head_size):
        super().__init__()
        #self is the attribute of every instance of the class.
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        #drop out layer is an regularization layer,which sets a fraction of input unit to 0 to prevent overfitting of module.
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B,T,C = x.shape
        k = self.key(x)   # (B,T,C)
        q = self.query(x) # (B,T,C)
        # compute attention scores ("affinities")
        wei = q @ k.transpose(-2,-1) * C**-0.5 # (B, T, C) @ (B, C, T) -> (B, T, T)
        #implemets casual masking which ensures that each position can only attend to its previous position but not the furthur ones.
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x) # (B,T,C)
        out = wei @ v # (B, T, T) @ (B, T, C) -> (B, T, C)
        return out

##Multiple heads of self-attention in parallel

In [14]:
class MultiHeadAttention(nn.Module):
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
      #concatination of each output from the multihead attention.
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

##A simple linear layer followed by a non-linearity (relu) and dropout()

In [15]:
class FeedFoward(nn.Module):
    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

##A Combined transformer bloack implemeting all the class defined and enables communication between them.

In [16]:
class Block(nn.Module):
    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

##A Complete Model that implements multi-head self attention,add and norm,a feed forward neural network,resuidal connections.

In [17]:
# super simple bigram model
class BigramLanguageModel(nn.Module):

    def __init__(self):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.blocks(x) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

In [18]:
max_iters = 5000
eval_interval = 100
learning_rate = 1e-3
eval_iters = 200

In [19]:
#create an instance of bigramLanguage Model implemented.

model = BigramLanguageModel()
m = model.to(device)
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

# create a PyTorch optimizer
#an  optimization algorithm used to update the weights of a neural network during training.
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=2000)[0].tolist()))

0.21268 M parameters
step 0: train loss 4.4597, val loss 4.4671
step 100: train loss 2.5274, val loss 2.5824
step 200: train loss 2.4211, val loss 2.4881
step 300: train loss 2.3739, val loss 2.4347
step 400: train loss 2.3242, val loss 2.4082
step 500: train loss 2.2605, val loss 2.3470
step 600: train loss 2.1743, val loss 2.2724
step 700: train loss 2.0824, val loss 2.2061
step 800: train loss 1.9990, val loss 2.1503
step 900: train loss 1.9180, val loss 2.1000
step 1000: train loss 1.8544, val loss 2.0571
step 1100: train loss 1.8114, val loss 2.0257
step 1200: train loss 1.7492, val loss 1.9966
step 1300: train loss 1.6964, val loss 1.9537
step 1400: train loss 1.6556, val loss 1.9371
step 1500: train loss 1.6119, val loss 1.9097
step 1600: train loss 1.5788, val loss 1.9049
step 1700: train loss 1.5456, val loss 1.8827
step 1800: train loss 1.5103, val loss 1.8722
step 1900: train loss 1.4852, val loss 1.8575
step 2000: train loss 1.4653, val loss 1.8609
step 2100: train loss 1.4