<a href="https://colab.research.google.com/github/viti990/my_own_llama/blob/main/llamav27b.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Aviator LLaMA

This notebook reconstructs the LLaMA 2 7B architecture and loads the model weights trained by Meta, in order to learn how it is done!

A further aim is to adapt the model with LoRA and QLoRA for parameter-efficient fine-tuning (PEFT)!

The fine tuning will be done with aviation regulations from 14 CFR...


## 1.0 Downloading the weights

Downloading the weights from Meta for 7B-chat (I want the model to be able to respond to questions about the requirements).
To do this, one must go to the Meta website and request access, then go to the GitHub repository and use the download.sh script as shown below.


In [3]:
# Clone Meta's llama repository only to obtain its download.sh script.
!git clone https://github.com/meta-llama/llama/
# Keep the script in the working directory ...
!mv ./llama/download.sh ./
# ... and discard the rest of the clone.
!rm -rf llama
# Run the download script.
!bash download.sh
# Remove the extra files that are not needed afterwards.
!rm -rf LICENSE USE_POLICY.md tokenizer_checklist.chk

Cloning into 'llama'...
remote: Enumerating objects: 464, done.[K
remote: Counting objects:   2% (1/47)[Kremote: Counting objects:   4% (2/47)[Kremote: Counting objects:   6% (3/47)[Kremote: Counting objects:   8% (4/47)[Kremote: Counting objects:  10% (5/47)[Kremote: Counting objects:  12% (6/47)[Kremote: Counting objects:  14% (7/47)[Kremote: Counting objects:  17% (8/47)[Kremote: Counting objects:  19% (9/47)[Kremote: Counting objects:  21% (10/47)[Kremote: Counting objects:  23% (11/47)[Kremote: Counting objects:  25% (12/47)[Kremote: Counting objects:  27% (13/47)[Kremote: Counting objects:  29% (14/47)[Kremote: Counting objects:  31% (15/47)[Kremote: Counting objects:  34% (16/47)[Kremote: Counting objects:  36% (17/47)[Kremote: Counting objects:  38% (18/47)[Kremote: Counting objects:  40% (19/47)[Kremote: Counting objects:  42% (20/47)[Kremote: Counting objects:  44% (21/47)[Kremote: Counting objects:  46% (22/47)[Kremote: Counting o

## 2. Creating the architecture

### 2.1 Importing everything...

In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
from dataclasses import dataclass
from typing import Optional

In [None]:
@dataclass
class ModelArgs:
  """Hyper-parameters used to build the transformer.

  The defaults are the values this notebook uses for the 7B model.
  `vocab_size` defaults to the placeholder -1 and must be overwritten
  before a model is built (it is set when the tokenizer is loaded).
  """
  dim: int = 4096  # width of the token embeddings / hidden states
  n_layers: int = 32  # number of stacked encoder blocks
  n_heads: int = 32  # number of heads for the queries
  n_kv_heads: Optional[int] = None  # number of heads for the keys and values
  vocab_size: int = -1  # This will be set when we load the tokenizer
  # presumably consumed by the feed-forward block -- TODO confirm
  multiple_of: int = 256
  ffn_dim_multiplier: Optional[float] = None
  norm_eps: float = 1e-5  # epsilon handed to every RMSNorm layer

  # Needed for KV cache
  max_batch_size: int = 32
  max_seq_len: int = 2048

  # The default is None, so the annotation has to be Optional[str].
  device: Optional[str] = None

def precompute_theta_pos_frequencies(head_dim: int, seq_len: int, device: str, theta: float = 10000.0):
  """Precompute the rotary-embedding rotations for every position.

  For each position m in [0, seq_len) and each frequency
  theta_i = theta ** (-2i / head_dim), i in [0, head_dim / 2), the result
  holds the unit-magnitude complex number exp(1j * m * theta_i).

  Args:
    head_dim: size of one attention head; must be even.
    seq_len: number of positions to precompute.
    device: device on which the result is placed.
    theta: base of the geometric frequency progression.

  Returns:
    Complex tensor of shape (seq_len, head_dim / 2).

  Raises:
    AssertionError: if head_dim is odd.
  """
  # Rotary embeddings rotate pairs of dimensions, hence the even requirement.
  assert head_dim % 2 == 0, "Dimension must be divisible by 2"

  # Exponents 0, 2, 4, ... scaled by head_dim. Shape: (head_dim / 2)
  even_indices = torch.arange(0, head_dim, 2).float()
  powers = (theta ** (even_indices / head_dim)).to(device)
  inv_freqs = 1.0 / powers

  # The "m" parameter: one entry per position. Shape: (seq_len)
  positions = torch.arange(seq_len, device=device)

  # Outer product pairs every position with every frequency.
  # (seq_len) x (head_dim / 2) -> (seq_len, head_dim / 2)
  angles = torch.outer(positions, inv_freqs).float()

  # Polar form with magnitude 1: cos(angle) + 1j * sin(angle).
  unit_magnitude = torch.ones_like(angles)
  return torch.polar(unit_magnitude, angles)

def apply_rotary_embedding(x: torch.Tensor, freqs_complex: torch.Tensor, device: str):
  """Rotate consecutive pairs of features of `x` by precomputed angles.

  Each pair (x[..., 2i], x[..., 2i + 1]) is treated as one complex number
  and multiplied by the matching entry of `freqs_complex`.

  Args:
    x: tensor of shape (B, Seq_len, H, Head_dim), with Head_dim even.
    freqs_complex: complex tensor of shape (Seq_len, Head_dim / 2).
    device: device the result is moved to.

  Returns:
    Tensor with the same shape and dtype as `x`.
  """
  # Split only the LAST axis into (Head_dim / 2, 2) pairs; every leading axis
  # must be kept so the result can broadcast against freqs_complex.
  # (B, Seq_len, H, Head_dim) -> (B, Seq_len, H, Head_dim / 2) complex
  x_complex = torch.view_as_complex(x.float().reshape(*x.shape[:-1], -1, 2))
  # Add batch and head axes for broadcasting.
  # (Seq_len, Head_dim / 2) -> (1, Seq_len, 1, Head_dim / 2)
  freqs_complex = freqs_complex.unsqueeze(0).unsqueeze(2)
  # (B, Seq_len, H, Head_dim / 2) * (1, Seq_len, 1, Head_dim / 2)
  #   -> (B, Seq_len, H, Head_dim / 2)
  x_rotated = x_complex * freqs_complex
  # (B, Seq_len, H, Head_dim / 2) -> (B, Seq_len, H, Head_dim / 2, 2)
  x_out = torch.view_as_real(x_rotated)
  # (B, Seq_len, H, Head_dim / 2, 2) -> (B, Seq_len, H, Head_dim)
  x_out = x_out.reshape(*x.shape)

  return x_out.type_as(x).to(device)

class RMSNorm(nn.Module):
  """Root-mean-square layer normalisation with a learnable gain.

  Computes weight * x / sqrt(mean(x ** 2, dim=-1) + eps).
  """

  def __init__(self, dim: int, eps: float=1e-5) -> None:
    """
    Args:
      dim: size of the last (feature) axis of the inputs.
      eps: small constant added to the mean square for numerical stability.
    """
    super().__init__()
    self.eps = eps
    # the gamma parameter
    self.weight = nn.Parameter(torch.ones(dim))

  def _norm(self, x: torch.Tensor) -> torch.Tensor:
    """Scale `x` by the reciprocal RMS of its last axis."""
    # (B, seq_len, Dim)
    # rsqrt: 1/sqrt(x)
    # eps belongs INSIDE the square root: it guards against division by zero
    # for all-zero rows and must not be added to the normalised output.
    return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)

  def forward(self, x: torch.Tensor):
    """Normalise `x` in float32, cast back to x's type, then apply the gain."""
    # (Dim) * (B, seq_len, Dim) -> (B, seq_len, Dim)
    return self.weight * self._norm(x.float()).type_as(x)

class EncoderBlock(nn.Module):
  """One transformer layer: pre-norm self-attention, then pre-norm feed-forward.

  Both sub-layers are wrapped in a residual connection.
  """

  def __init__(self, args: ModelArgs):
    """Build the attention, feed-forward and normalisation sub-modules from `args`."""
    super().__init__()

    self.n_head = args.n_heads
    self.dim = args.dim
    self.head_dim = args.dim // args.n_heads

    self.attention = SelfAttention(args)
    self.feed_forward = FeedForward(args)

    # Normalization BEFORE the self attention
    self.attention_norm = RMSNorm(args.dim, eps=args.norm_eps)
    # Normalization BEFORE the feed forward block
    self.ffn_norm = RMSNorm(args.dim, eps=args.norm_eps)

  def forward(self, x: torch.Tensor, start_pos: int, freqs_complex: torch.Tensor):
    """Apply the block to `x`.

    Args:
      x: hidden states, (B, seq_len, Dim).
      start_pos: position of the first token of `x` in the sequence;
        forwarded to the attention module.
      freqs_complex: rotary frequencies for the positions being processed;
        forwarded to the attention module.

    Returns:
      Updated hidden states, (B, seq_len, Dim).
    """
    # Call the sub-modules themselves (not .forward) so registered hooks run.
    # (B, seq_len, Dim) + (B, seq_len, Dim) -> (B, seq_len, Dim)
    h = x + self.attention(self.attention_norm(x), start_pos, freqs_complex)
    out = h + self.feed_forward(self.ffn_norm(h))
    # The result must be returned: the caller feeds it into the next layer.
    return out

class Transformer(nn.Module):
  """Decoder-only transformer: embedding, a stack of EncoderBlocks, final norm, output projection.

  As written, `forward` handles one token per call (inference only).
  """

  def __init__(self, args: ModelArgs) -> None:
    """Build all layers and precompute the rotary frequencies.

    Raises:
      AssertionError: if `args.vocab_size` is still the placeholder -1.
    """
    super().__init__()
    assert args.vocab_size != -1, "Vocab size must be set"

    self.args = args
    self.vocab_size = args.vocab_size
    self.n_layers = args.n_layers
    self.tok_embeddings = nn.Embedding(self.vocab_size, args.dim)

    self.layers = nn.ModuleList(EncoderBlock(args) for _ in range(args.n_layers))

    self.norm = RMSNorm(args.dim, eps=args.norm_eps)
    self.output = nn.Linear(args.dim, self.vocab_size, bias=False)

    # ModelArgs names the field `max_seq_len`; rotations are precomputed for
    # twice that many positions.
    self.freqs_complex = precompute_theta_pos_frequencies(
      self.args.dim // self.args.n_heads,
      self.args.max_seq_len * 2,
      device=self.args.device,
    )

  def forward(self, tokens: torch.Tensor, start_pos: int):
    """Compute the logits for a single new token per sequence.

    Args:
      tokens: token ids, (B, 1).
      start_pos: position of that token within its sequence.

    Returns:
      float32 logits, (B, 1, vocab_size).

    Raises:
      AssertionError: if more than one token per sequence is passed.
    """
    # (B, seq_len)
    batch_size, seq_len = tokens.shape
    # This only works for inference; for training you must process more than
    # 1 token at a time and must also remove the KV cache.
    assert seq_len == 1, "only one token at a time can be processed"

    # (B, seq_len) -> (B, seq_len, Dim)
    h = self.tok_embeddings(tokens)

    # Retrieve the pairs (m, theta) for the positions [start_pos, start_pos + seq_len)
    freqs_complex = self.freqs_complex[start_pos:start_pos + seq_len]

    # Consecutively apply all the encoder layers
    for layer in self.layers:
      h = layer(h, start_pos, freqs_complex)
    h = self.norm(h)

    output = self.output(h).float()
    return output



