### Dummy GPT Model 
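A GPT architecture consists of token embedding + positional encoding --> dropout --> a stack of transformer blocks (masked multi-head attention built on scaled dot-product attention, plus some other operations in between such as normalization and a feed-forward network) --> final layer normalization --> a linear output layer over the vocabulary 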

In [1]:
import torch 
import torch.nn as nn 


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/opt/anaconda3/envs/newEnv/lib/python3.12/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/opt/anaconda3/envs/newEnv/lib/python3.12/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/opt/anaconda3/envs/newEnv/lib/python3.12/site-packages/ipykernel/kernelapp.py", line 739, in start
    self.io_loop.start()
  File 

### Backbone Terms: 
- vocabulary size (`vocab_size`): total number of distinct tokens in the vocabulary (depends on the tokenization scheme; we may add other special <> tokens as well). 
- context_length: maximum number of input tokens the model takes in at one time 
- embed_dim: the dimension of each token's embedding vector (more dimensions give the model more capacity to represent each token)
- n_layers: number of transformer blocks stacked in the model 
- n_heads: number of attention heads in each multi-head attention layer 

In [4]:
# Hyperparameters shared by the model components in this notebook.
config = {
    # Number of rows in the token embedding table / size of the output layer.
    "vocab_size": 50257,
    # Number of rows in the positional embedding table (longest input handled).
    "context_length": 1024,
    # Width of every token and position embedding vector.
    "embed_dim": 768,
    # Attention heads per multi-head attention layer.
    "n_heads": 12,
    # Number of stacked transformer blocks.
    "n_layers": 12,
    # Dropout probability: each activation is zeroed with 10% chance during
    # training, which helps against overfitting.
    "drop_rate": 0.1,
}

In [3]:
class GPTModel(nn.Module):
    """Skeleton GPT model: embeddings -> dropout -> transformer -> norm -> logits.

    The transformer stack and the final layer normalization are not
    implemented yet. Identity placeholders stand in for them so the forward
    pass runs end to end and returns logits of the correct shape.

    Args:
        cfg: Mapping providing "vocab_size", "context_length", "embed_dim"
            and "drop_rate".
    """

    def __init__(self, cfg):
        super().__init__()  # initialise torch.nn.Module bookkeeping
        # Lookup table mapping each token ID to an embed_dim vector.
        self.embed = nn.Embedding(cfg["vocab_size"], cfg["embed_dim"])
        # Learned positional table: one embed_dim vector per position,
        # for positions 0 .. context_length - 1.
        self.pos_embed = nn.Embedding(cfg["context_length"], cfg["embed_dim"])

        self.drop_embed = nn.Dropout(cfg["drop_rate"])
        # TODO: replace with nn.Sequential(...) of transformer blocks.
        # Placeholder passes its input through unchanged so forward() works.
        self.transformer = nn.Identity()
        # TODO: replace with LayerNorm(cfg["embed_dim"]).
        self.final_norm = nn.Identity()
        # Final projection from the hidden dimension back to vocabulary logits.
        self.out_head = nn.Linear(cfg["embed_dim"], cfg["vocab_size"], bias=False)

    def forward(self, in_idx):
        """Return unnormalised (pre-softmax) logits for a batch of token IDs.

        Args:
            in_idx: Integer tensor of token IDs with shape
                (batch_size, seq_len).

        Returns:
            Tensor of shape (batch_size, seq_len, vocab_size).
        """
        _, seq_len = in_idx.shape
        text_embed = self.embed(in_idx)
        # Look up the vectors for positions 0 .. seq_len - 1 on the same
        # device as the input; the result broadcasts over the batch dimension.
        positions = torch.arange(seq_len, device=in_idx.device)
        pos_embed = self.pos_embed(positions)

        # step 1: add token embedding and positional encoding
        x = pos_embed + text_embed
        # step 2: apply dropout
        x = self.drop_embed(x)
        # step 3: go through the transformer (identity until implemented)
        x = self.transformer(x)
        # step 4: final normalization (identity until implemented), then the
        # linear layer that maps to vocabulary logits
        x = self.final_norm(x)
        logits = self.out_head(x)
        return logits
        
        



#### tokenizing the input as example 
- Tokenize the input with a pre-built tokenizer 
    - the tokenizer breaks the input into tokens and maps each token to its integer ID in the vocabulary (the embedding layer later turns these IDs into vectors) 
- Stack the encoded texts into a batch to feed into the GPT model (the batch will be our in_idx)

In [4]:
import tiktoken 
# Encode two equal-length example sentences with the GPT-2 tokenizer and
# stack their token-ID tensors into one (2, num_tokens) batch.
tokenizer = tiktoken.get_encoding("gpt2")
txt1 = "Every effort moves you"
txt2 = "Every day holds a"
batch = torch.stack(
    [torch.tensor(tokenizer.encode(text)) for text in (txt1, txt2)],
    dim=0,
)
print(batch)

tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])


### Layer Normalization 
Layer normalization rescales activations to mean 0 and variance 1, which keeps the training process stable and helps the model converge faster and more reliably.   
Normalized Vector $$\frac{x-\mu}{\sigma}$$  
- The normalization involves calculating the variance, and the formula of the variance is $\frac{1}{N}\sum_{i}{(x_i-\mu)^2}$. Because we divide by $N$, this doesn't use **Bessel's correction**, which would construct an unbiased variance estimate by dividing by $N-1$ instead of $N$. 

In [5]:
# Push a small random batch through Linear + ReLU and inspect the per-row
# mean and variance of the activations.
torch.manual_seed(123)  # fixed seed so the numbers are reproducible
a = torch.randn(2, 5)
layer = nn.Sequential(nn.Linear(5, 2), nn.ReLU())
out = layer(a)
print(out)

# Statistics over the last dimension; keepdim=True keeps a trailing axis of
# size 1 so they broadcast against `out` later.
mean = out.mean(dim=-1, keepdim=True)
variance = out.var(dim=-1, keepdim=True)
print(mean, variance, sep="\n")

tensor([[0.4490, 0.4633],
        [0.4363, 0.3558]], grad_fn=<ReluBackward0>)
tensor([[0.4561],
        [0.3960]], grad_fn=<MeanBackward1>)
tensor([[0.0001],
        [0.0032]], grad_fn=<VarBackward0>)


In [6]:
# Standardise each row: subtract its mean and divide by its standard
# deviation, then recompute the statistics to confirm mean ~ 0, variance ~ 1.
# Tensor.var defaults to the N-1 (Bessel-corrected) estimator, which is why
# each two-element row comes out as +/-0.7071 rather than +/-1.
out = (out - mean) / variance.sqrt()
print(out)
mean = out.mean(dim=-1, keepdim=True)
variance = out.var(dim=-1, keepdim=True)
print("Normalized Mean:\n", mean)
print("Normalized Variance:\n", variance)

tensor([[-0.7071,  0.7071],
        [ 0.7071, -0.7071]], grad_fn=<DivBackward0>)
Normalized Mean:
 tensor([[-1.4603e-06],
        [-2.3842e-07]], grad_fn=<MeanBackward1>)
Normalized Variance:
 tensor([[1.0000],
        [1.0000]], grad_fn=<VarBackward0>)


#### GELU Activation Function  
  - Non-traditional activation functions such as GELU and SwiGLU are used in transformers and LLMs because they are smooth and tend to give better performance under gradient descent than ReLU   
    
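  - GELU is based on the Gaussian distribution. It's computed as $x \cdot \Phi(x)$ where $\Phi(x)$ is the cumulative distribution function of the standard Gaussian distribution. In implementation it's approximated as $0.5\,x\,\left(1+\tanh\left(\sqrt{2/\pi}\,(x+0.044715\,x^3)\right)\right)$. 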

  - With ReLU, negative x has a gradient of exactly 0, which causes problems for gradient descent (units can stop learning). GELU is smooth, and it still has a non-zero gradient for negative x. 

  - GELU also gives a small non-zero output for negative inputs (ReLU outputs exactly 0), so negative inputs can still contribute to the function  

In [2]:
class GELU(nn.Module):
    """GELU activation computed with the tanh approximation.

    Applies 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
    element-wise and returns a tensor with the same shape as the input.
    """

    def forward(self, x):
        # sqrt(2/pi), built as a 0-dim tensor on every call.
        scale = torch.sqrt(torch.tensor(2 / torch.pi))
        # Cubic correction term inside the tanh.
        inner = x + 0.044715 * torch.pow(x, 3)
        return 0.5 * x * (1 + torch.tanh(scale * inner))

In [6]:
class Feedforward(nn.Module):
    """Feed-forward sub-network: expand to 4x width, apply GELU, project back.

    Args:
        cfg: Mapping providing "embed_dim", the input and output width.
    """

    def __init__(self, cfg):
        super().__init__()
        dim = cfg["embed_dim"]
        hidden = 4 * dim
        # Built in the same order as they are applied: expand, activate, project.
        expand = nn.Linear(dim, hidden)
        activation = GELU()
        project = nn.Linear(hidden, dim)
        self.layers = nn.Sequential(expand, activation, project)

    def forward(self, x):
        """Apply the three layers in sequence; the last dimension stays embed_dim."""
        return self.layers(x)

In [None]:
# Smoke test: a (2, 3, 768) input goes through Linear -> GELU -> Linear and
# the printed output shape matches the input shape.
ffn = Feedforward(config)
x = torch.randn(2, 3, 768)
out = ffn(x)
print(out.shape)

torch.Size([2, 3, 768])


#### Skip Connections 
Skip connections (popularized by ResNet) add a layer's input to its output, so earlier layers' outputs are passed directly to later layers. (Sort of like how our visual cortex passes information beyond a few layers). This gives gradients a shorter path back to the early layers, which allows the model to mitigate the vanishing gradient problem 

In [22]:
class ExampleDNN(nn.Module):
    """Stack of Linear + GELU layers with optional skip connections.

    Args:
        layer_sizes: Sequence of layer widths. Each consecutive pair
            (layer_sizes[i], layer_sizes[i + 1]) defines one Linear layer,
            so a sequence of length k + 1 builds k layers.
        use_shortcut: If true, a layer's input is added to its output
            whenever the two have the same shape.
    """

    def __init__(self, layer_sizes, use_shortcut):
        super().__init__()
        self.use_shortcut = use_shortcut
        # One Linear + GELU pair per consecutive pair of widths, instead of a
        # hard-coded five layers; a ModuleList keeps each layer individually
        # addressable (layers.0, layers.1, ...).
        self.layers = nn.ModuleList([
            nn.Sequential(nn.Linear(n_in, n_out), GELU())
            for n_in, n_out in zip(layer_sizes[:-1], layer_sizes[1:])
        ])

    def forward(self, x):
        """Run x through every layer, adding skip connections where enabled."""
        for layer in self.layers:
            layer_output = layer(x)
            # A shortcut is only possible when input and output shapes agree
            # (e.g. not for a final layer that changes the width).
            if self.use_shortcut and x.shape == layer_output.shape:
                x = x + layer_output
            else:
                x = layer_output
        return x
def print_gradients(model, x):
    """Run one forward/backward pass and print each weight's mean |gradient|.

    The loss is the mean squared error between the model output and an
    all-zero target of the same shape.

    Args:
        model: Module to inspect. Its parameter gradients are cleared and
            then overwritten by this call.
        x: Input tensor passed to ``model``.
    """
    # Clear stale gradients so repeated calls report this pass only rather
    # than a sum accumulated over earlier backward passes.
    model.zero_grad()
    output = model(x)
    # Give the target the output's exact shape; a differently shaped target
    # makes MSELoss broadcast and emit a size-mismatch warning.
    target = torch.zeros_like(output)
    loss_fn = nn.MSELoss()
    loss = loss_fn(output, target)
    loss.backward()
    for name, param in model.named_parameters():
        # Parameters that took no part in the backward pass have no gradient.
        if 'weight' in name and param.grad is not None:
            print(f"{name} has gradient :{param.grad.abs().mean().item()}")

In [23]:
# Baseline: the same network without skip connections.
layer_sizes = [3] * 5 + [1]  # widths 3 -> 3 -> 3 -> 3 -> 3 -> 1
sample_input = torch.tensor([[1., 0., -1.]])

torch.manual_seed(123)  # same seed as the shortcut run, so initial weights match
model_without_shortcut = ExampleDNN(layer_sizes, use_shortcut=False)
print_gradients(model_without_shortcut, sample_input)

layers.0.0.weight has gradient :0.00020173587836325169
layers.1.0.weight has gradient :0.0001201116101583466
layers.2.0.weight has gradient :0.0007152041653171182
layers.3.0.weight has gradient :0.001398873864673078
layers.4.0.weight has gradient :0.005049646366387606


In [24]:
# Same architecture and seed, this time with skip connections enabled.
torch.manual_seed(123)
model_with_shortcut = ExampleDNN(layer_sizes, use_shortcut=True)
print_gradients(model_with_shortcut, sample_input)

layers.0.0.weight has gradient :0.22169792652130127
layers.1.0.weight has gradient :0.20694105327129364
layers.2.0.weight has gradient :0.32896995544433594
layers.3.0.weight has gradient :0.2665732502937317
layers.4.0.weight has gradient :1.3258541822433472
