### Dummy GPT Model 
A GPT architecture contains token embedding + positional encoding --> normalization --> some other operations in between --> transformer blocks (masked multi-head attention built on scaled dot-product attention).

In [3]:
import torch 
import torch.nn as nn 

### Backbone Terms: 
- vocab_size: total number of distinct tokens in the vocabulary (depends on the tokenizer; we may add other special `<...>` tokens as well).
- context_length: maximum number of input tokens the model takes in at one time.
- embed_dim: the dimension of each token embedding vector (a larger dimension lets the model learn richer token representations).
- n_layers: number of transformer blocks (layers) stacked in the model.
- n_heads: number of attention heads.

In [4]:
# Hyperparameters read by the model definition in the cells below.
config = dict(
    vocab_size=50257,      # number of rows in the token embedding table / size of the output layer
    context_length=1024,   # number of rows in the positional embedding table (max tokens per input)
    embed_dim=768,         # width of every token and positional embedding vector
    n_heads=12,            # number of attention heads
    n_layers=12,           # number of transformer layers
    drop_rate=0.1,         # dropout probability: each activation is zeroed with 10% chance during training
)

In [None]:
class GPTModel(nn.Module):
    """Skeleton GPT model: embeddings -> dropout -> transformer stack -> norm -> output head.

    The transformer stack is still a placeholder (an empty ``nn.Sequential`` that
    returns its input unchanged), so the model runs end to end and produces
    logits of the right shape before the real blocks are written.

    Args:
        cfg: mapping with the keys ``"vocab_size"``, ``"context_length"``,
            ``"embed_dim"`` and ``"drop_rate"``.
    """

    def __init__(self, cfg):
        super().__init__()  # init torch.nn.Module class
        # Token-id -> vector lookup table.
        self.embed = nn.Embedding(cfg["vocab_size"], cfg["embed_dim"])
        # Position-index -> vector lookup table; one row per position up to context_length.
        self.pos_embed = nn.Embedding(cfg["context_length"], cfg["embed_dim"])

        self.drop_embed = nn.Dropout(cfg["drop_rate"])  # dropout applied to the summed embeddings

        # These two attributes were previously only commented-out TODOs, which made
        # forward() fail with AttributeError. They are now registered as working
        # placeholders.
        # TODO: fill this container with the real transformer blocks. An empty
        # nn.Sequential simply passes its input through.
        self.transformer = nn.Sequential()
        # TODO: swap in the custom LayerNorm once it is implemented; the built-in
        # nn.LayerNorm normalizes over the last (embed_dim) dimension.
        self.final_norm = nn.LayerNorm(cfg["embed_dim"])

        # Final linear projection from embedding space back to vocabulary size (no bias).
        self.out_head = nn.Linear(cfg["embed_dim"], cfg["vocab_size"], bias=False)

    def forward(self, in_idx):
        """Return the raw (pre-softmax) logits for a batch of token ids.

        Args:
            in_idx: 2-D integer tensor of token ids, shape (batch_size, seq_len).

        Returns:
            Tensor of logits with shape (batch_size, seq_len, vocab_size).
        """
        batch_size, seq_len = in_idx.shape  # input is (batch_size, seq_len)
        text_embed = self.embed(in_idx)  # (batch_size, seq_len, embed_dim)
        # Look up one embedding per position 0..seq_len-1, on the same device as the input.
        pos_embed = self.pos_embed(torch.arange(seq_len, device=in_idx.device))  # (seq_len, embed_dim)

        # step 1: add token embedding and positional embedding (pos_embed broadcasts over the batch)
        x = pos_embed + text_embed
        # step 2: apply dropout
        x = self.drop_embed(x)
        # step 3: go through the transformer stack
        x = self.transformer(x)
        # step 4: final layer normalization, then the linear head to vocabulary logits
        x = self.final_norm(x)
        logits = self.out_head(x)
        return logits
        
        



#### Tokenizing the input as an example
- Tokenize the input with a pre-built tokenizer
    - the tokenizer breaks the input down into tokens and maps each token to its integer token ID
- Stack the encoded texts into a batch to feed into the GPT model (the batch will be our in_idx)

In [None]:
import tiktoken

# Load the pre-built "gpt2" encoding from tiktoken.
tokenizer = tiktoken.get_encoding("gpt2")

txt1 = "Every effort moves you"
txt2 = "Every day holds a"

# Encode each text into a 1-D tensor of token ids and stack them along a new
# batch dimension. torch.stack requires every encoded text to have the same length.
batch = torch.stack(
    [torch.tensor(tokenizer.encode(txt)) for txt in (txt1, txt2)],
    dim=0,
)

# Show the batch shape first, then the token ids, matching the recorded cell output.
print(batch.shape)
print(batch)

torch.Size([2, 4])
tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])
