<a href="https://colab.research.google.com/github/viti990/my_own_llama/blob/main/llamav27b.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Aviator LLaMA

This notebook aims at reconstructing LLAMA 2 7B architecture and use the model weights trained from meta to learn how it is done!

Also the aim is to adapt the model with LoRA and QLoRA for PEFT!

The fine tuning will be done with aviation regulations from 14 CFR...


## 1.0 Downloading the weights

Downloading the weights from meta for 7B-chat (i want the model to be able to respond to questions from the requirements)
for this one must go to the meta website and request access, then go the github and use the download.sh scrip as shown below.


In [1]:
!git clone https://github.com/meta-llama/llama/
!mv ./llama/download.sh ./
!rm -rf llama
!bash download.sh
!rm -rf LICENSE USE_POLICY.md tokenizer_checklist.chk

Cloning into 'llama'...
remote: Enumerating objects: 464, done.[K
remote: Counting objects: 100% (47/47), done.[K
remote: Compressing objects: 100% (34/34), done.[K
remote: Total 464 (delta 17), reused 33 (delta 12), pack-reused 417[K
Receiving objects: 100% (464/464), 1.12 MiB | 13.28 MiB/s, done.
Resolving deltas: 100% (235/235), done.
Enter the URL from email: https://download.llamameta.net/*?Policy=eyJTdGF0ZW1lbnQiOlt7InVuaXF1ZV9oYXNoIjoiZWNmZDIwYm9xaXQwYmlhYWxycno1NjRpIiwiUmVzb3VyY2UiOiJodHRwczpcL1wvZG93bmxvYWQubGxhbWFtZXRhLm5ldFwvKiIsIkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcyMDYzNDk0M319fV19&Signature=hFxxtS1sJMGmH3uW%7Eti%7EM7mUiVOydtiPzTG3vmFowlCvgwOEfPpXmBIvyqeVVJyiGbSEtX0GlFZtg6A8b93HQ%7Ecgqyea8-q2OvEleKUXp0ke3RT4OSe-3qtfySSdQNh1voO2ycLSkyBZRZjpTE32jkXcPtNUoODQf4glV-bPTZGcoMSH3VByvTjKJsCp8gvRm91OHv37DET2aFI4wr6wFJAChpqOrTWiTtQI4VksPp77AOWWg7vDOwj2yCmIg-hc-k%7EEJIFmOMum8-C1DkVDt2D1AQT-1JSMVn7I1EVClDZD2Ax2q3-A176S5JgwLSTpXYwV5hoXlULYyTvK4GrVbw__&Key-Pa

## 2. Creating the architecture

### 2.1 Importing everything...

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
from dataclasses import dataclass
from typing import Optional

In [2]:
@dataclass
class ModelArgs:
  dim: int=4096
  n_layers: int=32
  n_heads: int=32 #number of heads for the queries
  n_kv_heads: Optional[int]=None #number of values for the key and value
  vocab_size: int = -1 # This will be set when we load the tokenizer
  multiple_of: int = 256
  ffn_dim_multiplier: Optional[float]=None
  norm_eps: float = 1e-5

  #Needed for KV cache
  max_batch_size: int=32
  max_seq_len: int=2048

  device: str=None

def precompute_theta_pos_frequencies(head_dim: int, seq_len: int, device: str, theta: float = 10000.0 ):
  # as written in the paper, the dimension of the embedding must be even.
  assert head_dim % 2 ==0, "Dimension must be divisible by 2"
  # Build the theta parameters
  # According to the formula theta_i = 10000^ (-2(i-1)/dim) for i = [1, 2, ... dim / 2]
  # Shape: (head_dim / 2)
  theta_numerator = torch.arange(0, head_dim, 2).float()
  theta = 1.0 / (theta ** (theta_numerator/head_dim)).to(device)
  # Construct the positions (the "m" parameter)
  # shape: (seq_len)
  m = torch.arange(seq_len, device=device)
  # Multiply each theta by each position using the outer product
  # Shape: {Seq_Len} outer_product * (head_dim / 2) -> (seq_len, head_dim / 2)
  freqs = torch.outer(m, theta).float()
  # we can compute complex numbers in the polar form c = R * exp(i * m * theta), where r = 1 as follows:
  freqs_complex = torch.polar(torch.ones_like(freqs), freqs)

  return freqs_complex

def apply_rotary_embedding(x: torch.Tensor, freqs_complex: torch.Tensor, device: str):
  # (B, Seq_len, H, Head_dim) -> (B,Seq_len, H, head_dim/2)
  x_complex = torch.view_as_complex(x.float().reshape(*x.shape[:-1], -1, 2))
  # (Seq_len, head_dim / 2) -> (1, Seq_len, 1, head_dim / 2)
  freqs_complex = freqs_complex.unsqueeze(0).unsqueeze(2)
  # (B, Seq_len, H, head_dim / 2) * (1, Seq_len, 1, head_dim / 2) = (B, seq_len, H, Head_Dim / 2)
  x_rotated = x_complex * freqs_complex
  # (B, seq_len, H, head_dim / 2) -> (B, seq_len, H, head_dim / 2, 2)
  x_out = torch.view_as_real(x_rotated)
  # (B, seq_len, H, head_dim / 2, 2) -> (B, seq_len, H, head_dim)
  x_out = x_out.reshape(*x.shape)

  return x_out.type_as(x).to(device)

def repeat_kv(x: torch.Tensor, n_rep: int) -> torch.Tensor:
  batch_size, seq_len, n_kv_heads, head_dim = x.shape
  if n_rep == 1:
     return x
  else:
    # (B, seq-len, n_kv_heads, 1, head_dim)
    return (
        x[:, :, :, None, :]
        .expand(batch_size,seq_len, n_kv_heads, n_rep, head_dim)
        .reshape(batch_size, seq_len, n_kv_heads * n_rep, head_dim)
    )

class RMSNorm(nn.Module):
  def __init__(self, dim: int, eps: float = 1e-6):
    super().__init__()
    self.eps = eps
    # the gamma parameter
    self.weight = nn.Parameter(torch.ones(dim))

  def _norm(self, x: torch.Tensor):
    # (B, seq_len, dim) * (B, seq_len, 1) = (B, seq_len, dim)
    # rsqrt: 1/sqrt(x)
    return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)

  def forward(self, x: torch.Tensor):
    #(dim) * (B, Seq_len, dim) * (B, seq_len, dim)
    return self.weight * self._norm(x.float()).type_as(x)

class SelfAttention(nn.Module):
  def __init__(self, args: ModelArgs):
    super().__init__()
    # Indicates the number of heads fot the key and values
    self.n_kv_heads = args.n_heads if args.n_kv_heads is None else args.n_kv_heads
    # Indicates the number of heads for the queries
    self.n_heads_q = args.n_heads
    # Indicates how many times the heads of keys and values should be repeated to match the head of the queries
    self.n_rep = self.n_heads_q // self.n_kv_heads
    # Indicates the dimension of each head
    self.head_dim = args.dim // args.n_heads

    self.wq = nn.Linear(args.dim, args.n_heads * self.head_dim, bias = False)
    self.wk = nn.Linear(args.dim, self.n_kv_heads * self.head_dim, bias = False)
    self.wv = nn.Linear(args.dim, self.n_kv_heads * self.head_dim, bias = False)
    self.wo = nn.Linear(args.n_heads * self.head_dim, args.dim,  bias = False)

    self.cache_k = torch.zeros((args.max_batch_size, args.max_seq_len, self.n_kv_heads, self.head_dim))
    self.cache_v = torch.zeros((args.max_batch_size, args.max_seq_len, self.n_kv_heads, self.head_dim))

  def forward(self, x: torch.Tensor, start_pos: int, freqs_complex: torch.Tensor):
    batch_size, seq_len, _ = x.shape #(B, 1, dim)

    # Apply the wq, wk, wv matrices to queries, keys and values
    # (B, 1, dim) -> (B, 1, H_Q * head_dim)
    xq = self.wq(x)

    # (B, 1, dim) -> (B, 1, H_KV * head_dim)
    xk = self.wk(x)
    xv = self.wv(x)

    # (B, 1, H_Q * head_dim) -> (B, 1, H_Q, head_dim)
    xq = xq.view(batch_size, 1, self.n_heads_q, self.head_dim)

    # (B, 1, H_KV * head_dim) -> (B, 1, H_KV, head_dim)
    xk = xk.view(batch_size, 1, self.n_kv_heads, self.head_dim)
    xv = xv.view(batch_size, 1, self.n_kv_heads, self.head_dim)

    # Does not change the shape of the tensors
    xq = apply_rotary_embedding(xq, freqs_complex, device = x.device)
    xk = apply_rotary_embedding(xk, freqs_complex, device = x.device)

    # Replace the entry in the cache fot this token
    self.cache_k[:batch_size, start_pos:start_pos+seq_len] = xk
    self.cache_v[:batch_size, start_pos:start_pos+seq_len] = xv

    # Retrieve all the cached keys and values so far
    # (B, seq_len_KV, H_KV, head_dim)
    keys = self.cache_k[:batch_size, 0:start_pos+seq_len]
    values = self.cache_v[:batch_size, 0:start_pos+seq_len]

    # Repeat the heads of the K and V to reach the number of heads of the queries
    keys = repeat_kv(keys, self.n_rep)
    values = repeat_kv(values, self.n_rep)

    # (B, 1, H_Q, head_dim) -> (B, H_Q, 1, head_dim)
    xq = xq.transpose(1, 2)
    keys = keys.transpose(1, 2)
    values = values.transpose(1, 2)

    # (B, H_Q, 1, seq_len) @ (B, H_Q, seq_len_kv, head_dim) -> (B, H_Q, 1, head_dim)
    scores = torch.matmul(xq, keys.transpose(2, 3)) / math.sqrt(self.head_dim)
    scores = F.softmax(scores, dim=-1).type_as(xq)

    # (B, H_Q, 1, head_dim) -> (B, 1, H_Q, head_dim) -> (B, 1, dim)
    output = torch.matmul(scores, values)
    output = output.transpose(1, 2).contiguous().view(batch_size, seq_len, -1)

    return self.wo(output)

class FeedForward(nn.Module):
  def __init__(self, args: ModelArgs):
    super().__init__()

    hidden_dim = 4 * args.dim
    hidden_dim = int(2 * hidden_dim / 3)
    if args.ffn_dim_multiplier is not None:
      hidden_dim = int(args.ffn_dim_multiplier * hidden_dim)
    # round the hidden_dim to the nearest multiple of the multiple_of parameter
    hidden_dim = args.multiple_of * ((hidden_dim + args.multiple_of - 1) // args.multiple_of)

    self.w1 = nn.Linear(args.dim, hidden_dim, bias=False)
    self.w2 = nn.Linear(hidden_dim, args.dim, bias=False)
    self.w3 = nn.Linear(args.dim, hidden_dim, bias=False)

  def forward(self, x: torch.Tensor):
    swish = F.silu(self.w1(x))
    x_V = self.w3(x)
    x = swish * x_V
    x= self.w2(x)
    return x

class EncoderBlock(nn.Module):
  def __init__(self, args: ModelArgs):
    super().__init__()

    self.n_head=args.n_heads
    self.dim = args.dim
    self.head_dim = args.dim // args.n_heads

    self.attention = SelfAttention(args)
    self.feed_forward = FeedForward(args)

    # Normalization BEFORE the self attention
    self.attention_norm = RMSNorm(args.dim, eps=args.norm_eps)
    # Normalization BEFORE the feed forward block
    self.ffn_norm = RMSNorm(args.dim, eps= args.norm_eps)

  def forward(self, x: torch.Tensor, start_pos: int, freqs_complex: torch.Tensor):
    # (B, seq_len, Dim) + (B, seq_len, Dim) -> (B, seq_len, Dim)
    h = x + self.attention.forward(self.attention_norm(x), start_pos,freqs_complex)
    out = h + self.feed_forward.forward(self.ffn_norm(h))


class Transformer(nn.Module):

  def __init__(self, args: ModelArgs) -> None:
    super().__init__()
    assert args.vocab_size != -1, "Vocab size must be set"

    self.args = args
    self.vocab_size = args.vocab_size
    self.n_layers = args.n_layers
    self.tok_embeddings = nn.Embedding(self.vocab_size, args.dim)

    self.layers = nn.ModuleList()
    for _ in range(args.n_layers):
      self.layers.append(EncoderBlock(args))

    self.norm = RMSNorm(args.dim, eps=args.norm_eps)
    self.output = nn.Linear(args.dim, self.vocab_size, bias=False)

    self.freqs_complex = precompute_theta_pos_frequencies(self.args.dim // self.args.n_heads, self.args.max_seq_len * 2, device=self.args.device)

  def forward(self, tokens: torch.Tensor, start_pos: int):
    # (B, seq_len)
    batch_size, seq_len = tokens.shape
    assert seq_len == 1, "only one token at a time can be processed" #this only works for inference for training you must process more than 1 token at a time and must also remove KV cache

    # (B, Seq_Len) -> (B, seq_len, Dim)
    h = self.tok_embeddings(tokens)

    # Retrieve the pairs (m, theta) corresponding to the positions [start_pos, start_pos + seq_len]
    freqs_complex = self.freqs_complex[start_pos:start_pos+seq_len]

    # Consecutively apply all the encoder layers
    for layer in self.layers:
      h = layer(h, start_pos, freqs_complex)
    h = self.norm(h)

    output = self.output(h).float()
    return output


In [3]:
from typing import Optional
import torch
import time
from pathlib import Path
import json
from sentencepiece import SentencePieceProcessor
from tqdm import tqdm

#from model import ModelArgs, Transformer

class LLaMA:

  def __init__(self, model: Transformer, tokenizer: SentencePieceProcessor, model_args: ModelArgs):
    self.model = model
    self.tokenizer = tokenizer
    self.args = model_args

  @staticmethod
  def build(checkpoints_dir: str, tokenizer_path: str, load_model: bool, max_seq_len: int, max_batch_size: int, device: str):
    prev_time = time.time()
    if load_model:
      checkpoints = sorted(Path(checkpoints_dir).glob('*.pth'))
      assert len(checkpoints) > 0, "No checkpoints files found"
      chk_path = checkpoints[0]
      print(f"Loading checkpoint {chk_path}")
      checkpoint = torch.load(chk_path, map_location="cpu")
      print(f"Loaded checkpoint in {(time.time() - prev_time):.2f}s")
      prev_time = time.time()

    with open (Path(checkpoints_dir) / "params.json", "r") as f:
      params = json.loads(f.read())

    model_args: ModelArgs = ModelArgs(
      max_seq_len=max_seq_len,
      max_batch_size=max_batch_size,
      device=device,
      **params
    )

    tokenizer = SentencePieceProcessor()
    tokenizer.load(tokenizer_path)
    model_args.vocab_size = tokenizer.vocab_size()

    if device == "cuda":
      torch.set_default_tensor_type(torch.cuda.HalfTensor)
    else:
      torch.set_default_tensor_type(torch.BFloat16Tensor)

    model = Transformer(model_args).to(device)
    if load_model:
      del checkpoint["rope.freqs"]
      model.load_state_dict(checkpoint, strict=True)
      print(f"loaded state dict in {(time.time() - prev_time):.2f}s")

    return LLaMA(model, tokenizer, model_args)

  def text_completion(self, prompts: list[str], temperature: float = 0.6, top_p: float = 0.9, max_gen_len: Optional[int] = None):
    if max_gen_len is None:
      max_gen_len = self.args.max_seq_len - 1
    # Convert each prompt into tokens
    prompt_tokens = [self.tokenizer.encode(prompt, out_type=int, add_bos=True, add_eos=False) for prompt in prompts]
    # Make sure the batch size is not too large
    batch_size = len(prompt_tokens)
    assert batch_size <= self.args.max_batch_size
    max_prompt_len = max(len(prompt) for prompt in prompt_tokens)
    # make sure the prompt length is not larger than the maximum seq length
    assert max_prompt_len <= self.args.max_seq_len
    total_len = min(self.args.max_seq_len, max_gen_len + max_prompt_len)

    # Create the list that will contain the generated tokens, along with the initial prompt tokens
    pad_id = self.tokenizer.pad_id()
    tokens = torch.full((batch_size, total_len), pad_id, dtype = torch.long, device=device)
    for k, t in enumerate(prompt_tokens):
      # Populate the initial tokens with the prompt token, False otherwise
      tokens[k, :len(t)] = torch.tensor(t, dtype=torch.long, device=device)

    eos_reached = torch.tensor([False] * batch_size, device=device)
    prompt_tokens_mask = tokens != pad_id # True if the token is a prompt token, false otherwise
    for cur_pos in tqdm(range(1, total_len), desc="Generating..."):
      with torch.inference_mode():
        logits = self.model.forward(tokens[:, cur_pos-1:cur_pos], cur_pos)
      if temperature > 0 :
        # The temperature  is applied BEFORE the softmax
        probs = self.model.forward(logits[:, -1] / temperature, dim = -1)
        next_token = self._sample_top_p(probs, top_p)
      else:
        # Greedly select the token with the maximum probability
        next_token = torch.argmax(logits[:, -1], dim = -1)
      next_token = next_token.reshape(-1)
      # only replace the token if it is a padding token
      next_token = torch.where(prompt_tokens_mask[:, cur_pos], tokens[:, cur_pos], next_token)
      tokens[:, cur_pos] = next_token
      # EOS is reached only if we found an EOS token for a padding position
      eos_reached |= (~promt_tokens_mask[:, cur_pos]) & (next_token == self.tokenizer.eos_id())
      if all(eos_reached):
        break
    out_tokens=[]
    out_text=[]
    for prompt_index, current_prompt_tokens in enumerate(tokens.tolist()):
      # Cut to the EOS token, if present
      if self.tokenizer.eos_id() in current_prompt_tokens:
        eos_idx = current_prompt_tokens.index(self.tokenizer.eos_id())
        current_prompt_tokens = current_prompt_tokens[:eos_idx]
      out_tokens.append(current_prompt_tokens)
      out_text.append(self.tokenizer.decode(current_prompt_tokens))
    return (out_tokens, out_text)

  def _sample_top_p(self, probs, p):
    probs_sort, probs_idx = torch.sort(probs,dim = -1,descending = True)
    probs_sum = torch.cumsum(probs_sort, dim=-1)
    mask = probs_sum - probs_sort > p
    probs_sort[mask] = 0.0
    probs_sort._div(probs_sort.sum(-1, keepdim=True))
    next_token = torch.multinomial(probs_sort, num_samples=1)
    next_token = torch.gather(probs_idx, -1, next_token)

    return next_token


In [4]:
torch.manual_seed(42)

allow_cuda = False
device = "cuda" if torch.cuda.is_available() and allow_cuda else "cpu"

prompts = [
    " Simply put, the theory of relativity states that",
    "If google was an italian company founded in Milan, it would",
    # Few shot prompt
    """Translate English to French:

    sea otter => loutre de mer
    peppermint -> menthe poivrée
    plush giraffe => girafe peluche
    cheese omelette =>""",
    # Zero shot prompt
    """ Tell me if the following person is actually Doraemon disguised as human:
    Name: Umar Jamil
    Decision:"""]

model = LLaMA.build(
    checkpoints_dir="llama-2-7b",
    tokenizer_path="./tokenizer.model",
    load_model=True,
    max_seq_len=1024,
    max_batch_size=len(prompts),
    device=device
)

Loading checkpoint llama-2-7b/consolidated.00.pth
Loaded checkpoint in 30.23s


  _C._set_default_tensor_type(t)


loaded state dict in 79.11s


In [5]:
# Inference the model
out_tokens, out_text = (model.text_completion(prompts, max_gen_len=64))
assert len(out_text) == len(prompts)
for i in range(len(out_text)):
  print(f"{out_text[i]}")
  print("-" * 50)

Generating...:   0%|          | 0/117 [00:00<?, ?it/s]


AttributeError: 'NoneType' object has no attribute 'float'