# LLaMA Loading
- Environment
    - torch
    - fairscale
    - fire
    - sentencepiece==0.1.97
    - hiq-python (if hiq is needed)
- model and ckpt
    - follow the commands below (or download the files directly from the URLs)

## Tokenizer
- wget https://agi.gpt4.org/llama/LLaMA/tokenizer.model
- wget https://agi.gpt4.org/llama/LLaMA/tokenizer_checklist.chk

In [1]:
from sentencepiece import SentencePieceProcessor
from logging import getLogger
from typing import List
import os

class LLaMA_Tokenizer:
    """Thin wrapper around a SentencePiece model.

    After construction the instance exposes the vocabulary size (``n_words``)
    and the BOS / EOS / PAD token ids as reported by the loaded model.
    """

    def __init__(self, model_path: str):
        """Load the SentencePiece model stored at ``model_path``.

        An assertion fails (with the path as its message) when the file
        does not exist.
        """
        assert os.path.isfile(model_path), model_path
        processor = SentencePieceProcessor(model_file=model_path)
        self.sp_model = processor

        # Cache vocabulary size and the special token ids from the model.
        self.n_words: int = processor.vocab_size()
        self.bos_id: int = processor.bos_id()
        self.eos_id: int = processor.eos_id()
        self.pad_id: int = processor.pad_id()
        assert processor.vocab_size() == processor.get_piece_size()

    def encode(self, s: str, bos: bool, eos: bool) -> List[int]:
        """Convert ``s`` to token ids, optionally wrapped with BOS / EOS ids.

        Args:
            s: text to encode; must be exactly of type ``str``.
            bos: prepend ``self.bos_id`` when true.
            eos: append ``self.eos_id`` when true.
        """
        assert type(s) is str
        pieces = self.sp_model.encode(s)
        if bos:
            pieces = [self.bos_id] + pieces
        if eos:
            pieces = pieces + [self.eos_id]
        return pieces

    def decode(self, t: List[int]) -> str:
        """Convert a list of token ids back into text."""
        text = self.sp_model.decode(t)
        return text

## LLaMA Model
- wget https://agi.gpt4.org/llama/LLaMA/7B/consolidated.00.pth
- wget https://agi.gpt4.org/llama/LLaMA/7B/checklist.chk
- wget https://agi.gpt4.org/llama/LLaMA/7B/params.json

### LLaMA Transformer Construction


In [2]:
from typing import Optional, Tuple
from dataclasses import dataclass
import math

import torch
from torch import nn
import torch.nn.functional as F


# The "fairscale" package is a PyTorch extension library that provides various tools 
# and utilities for scalable and efficient training of deep neural networks. 
# It focuses on enabling large-scale model parallelism and mixed-precision training.
import fairscale.nn.model_parallel.initialize as fs_init
from fairscale.nn.model_parallel.layers import (
    ParallelEmbedding,
    RowParallelLinear,
    ColumnParallelLinear,
)

@dataclass
class ModelArgs:
    """Hyperparameters consumed by the model classes defined in this file."""

    # Model width: embedding size and input/output size of every block.
    dim: int = 512
    # Number of TransformerBlock layers stacked in the Transformer.
    n_layers: int = 8
    # Number of attention heads; each head has dim // n_heads features.
    n_heads: int = 8
    vocab_size: int = -1  # defined later by tokenizer
    multiple_of: int = 256  # make SwiGLU hidden layer size multiple of large power of 2
    # Epsilon passed to every RMSNorm layer.
    norm_eps: float = 1e-5

    # Upper bounds used to size the key/value caches in Attention.
    max_batch_size: int = 32
    max_seq_len: int = 2048

class RMSNorm(torch.nn.Module):
    """Root-mean-square normalisation over the last axis with a learnable gain."""

    def __init__(self, dim: int, eps: float = 1e-6):
        """Create the layer.

        Args:
            dim: size of the last axis; the gain vector has this many entries.
            eps: constant added to the mean square before the reciprocal sqrt.
        """
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def _norm(self, x):
        # Scale x by 1 / sqrt(mean(x^2) + eps), computed along the last axis.
        mean_square = torch.mean(x.pow(2), dim=-1, keepdim=True)
        return x * torch.rsqrt(mean_square + self.eps)

    def forward(self, x):
        """Normalise ``x`` in float32, cast back to ``x``'s dtype, apply the gain."""
        normed = self._norm(x.float())
        return normed.type_as(x) * self.weight
    
def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0):
    """Build the table of unit-magnitude complex rotations used for rotary embeddings.

    Args:
        dim: per-head feature size; ``dim // 2`` frequencies are produced.
        end: number of positions (rows) in the table.
        theta: base of the geometric frequency progression.

    Returns:
        A complex tensor of shape ``(end, dim // 2)`` whose entry ``[t, k]``
        is ``exp(i * t * theta ** (-2k / dim))``.
    """
    exponents = torch.arange(0, dim, 2)[: (dim // 2)].float() / dim
    inv_freq = 1.0 / (theta ** exponents)
    positions = torch.arange(end, device=inv_freq.device)  # type: ignore
    angles = torch.outer(positions, inv_freq).float()  # type: ignore
    unit_magnitude = torch.ones_like(angles)
    return torch.polar(unit_magnitude, angles)  # complex64


def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor):
    """View ``freqs_cis`` so it broadcasts against ``x``.

    ``freqs_cis`` must have shape ``(x.shape[1], x.shape[-1])``. The result
    keeps those two sizes on axis 1 and on the last axis and has size 1 on
    every other axis of ``x``.
    """
    rank = x.ndim
    assert 0 <= 1 < rank
    assert freqs_cis.shape == (x.shape[1], x.shape[-1])
    last_axis = rank - 1
    target = [
        size if axis in (1, last_axis) else 1
        for axis, size in enumerate(x.shape)
    ]
    return freqs_cis.view(*target)


def apply_rotary_emb(
    xq: torch.Tensor,
    xk: torch.Tensor,
    freqs_cis: torch.Tensor,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Rotate query and key features by the given complex rotations.

    Adjacent pairs along the last axis of ``xq`` / ``xk`` are treated as the
    real and imaginary parts of complex numbers, multiplied by ``freqs_cis``
    (reshaped to broadcast), and unpacked again. Outputs are cast back to the
    dtypes of the respective inputs.
    """

    def _as_complex(t: torch.Tensor) -> torch.Tensor:
        # (..., d) -> (..., d/2) complex, computed in float32.
        return torch.view_as_complex(t.float().reshape(*t.shape[:-1], -1, 2))

    q_complex = _as_complex(xq)
    k_complex = _as_complex(xk)
    rotation = reshape_for_broadcast(freqs_cis, q_complex)
    q_rotated = torch.view_as_real(q_complex * rotation).flatten(3)
    k_rotated = torch.view_as_real(k_complex * rotation).flatten(3)
    return q_rotated.type_as(xq), k_rotated.type_as(xk)


class Attention(nn.Module):
    """Multi-head self-attention with rotary embeddings and a key/value cache.

    Heads are split across model-parallel ranks: each rank handles
    ``n_heads // model_parallel_world_size`` heads.
    """

    def __init__(self, args: ModelArgs):
        super().__init__()

        self.n_local_heads = args.n_heads // fs_init.get_model_parallel_world_size()
        self.head_dim = args.dim // args.n_heads
        proj_width = args.n_heads * self.head_dim

        def column_projection():
            # Query/key/value projections share one configuration.
            return ColumnParallelLinear(
                args.dim,
                proj_width,
                bias=False,
                gather_output=False,
                init_method=lambda x: x,
            )

        self.wq = column_projection()
        self.wk = column_projection()
        self.wv = column_projection()
        self.wo = RowParallelLinear(
            proj_width,
            args.dim,
            bias=False,
            input_is_parallel=True,
            init_method=lambda x: x,
        )

        # Key/value caches sized for the largest batch and sequence allowed.
        cache_shape = (
            args.max_batch_size,
            args.max_seq_len,
            self.n_local_heads,
            self.head_dim,
        )
        self.cache_k = torch.zeros(cache_shape).cuda()
        self.cache_v = torch.zeros(cache_shape).cuda()

    def forward(self, x: torch.Tensor, start_pos: int, freqs_cis: torch.Tensor, mask: Optional[torch.Tensor]):
        """Attend over the cached prefix plus the new positions in ``x``.

        Args:
            x: input of shape (bsz, seqlen, dim).
            start_pos: index in the cache at which the new keys/values are written.
            freqs_cis: rotary rotations for the new positions.
            mask: optional additive mask applied to the attention scores.
        """
        bsz, seqlen, _ = x.shape
        per_head = (bsz, seqlen, self.n_local_heads, self.head_dim)
        xq = self.wq(x).view(*per_head)
        xk = self.wk(x).view(*per_head)
        xv = self.wv(x).view(*per_head)

        xq, xk = apply_rotary_emb(xq, xk, freqs_cis=freqs_cis)

        # Keep the caches on the same device / dtype as the activations.
        self.cache_k = self.cache_k.to(xq)
        self.cache_v = self.cache_v.to(xq)

        end_pos = start_pos + seqlen
        self.cache_k[:bsz, start_pos:end_pos] = xk
        self.cache_v[:bsz, start_pos:end_pos] = xv

        # Move the head axis in front of the sequence axis for batched matmul.
        queries = xq.transpose(1, 2)
        keys = self.cache_k[:bsz, :end_pos].transpose(1, 2)
        values = self.cache_v[:bsz, :end_pos].transpose(1, 2)

        scores = torch.matmul(queries, keys.transpose(2, 3)) / math.sqrt(self.head_dim)
        if mask is not None:
            scores = scores + mask  # (bs, n_local_heads, slen, cache_len + slen)
        weights = F.softmax(scores.float(), dim=-1).type_as(queries)
        context = torch.matmul(weights, values)  # (bs, n_local_heads, slen, head_dim)
        merged = context.transpose(1, 2).contiguous().view(bsz, seqlen, -1)

        return self.wo(merged)


class FeedForward(nn.Module):
    """Gated feed-forward layer: ``w2(silu(w1(x)) * w3(x))``."""

    def __init__(
        self,
        dim: int,
        hidden_dim: int,
        multiple_of: int,
    ):
        """Create the three projections.

        The requested ``hidden_dim`` is scaled by 2/3 (truncated to int) and
        then rounded up to the next multiple of ``multiple_of``.
        """
        super().__init__()
        two_thirds = int(2 * hidden_dim / 3)
        n_chunks = (two_thirds + multiple_of - 1) // multiple_of
        inner_dim = multiple_of * n_chunks

        def identity_init(x):
            return x

        self.w1 = ColumnParallelLinear(
            dim, inner_dim, bias=False, gather_output=False, init_method=identity_init
        )
        self.w2 = RowParallelLinear(
            inner_dim, dim, bias=False, input_is_parallel=True, init_method=identity_init
        )
        self.w3 = ColumnParallelLinear(
            dim, inner_dim, bias=False, gather_output=False, init_method=identity_init
        )

    def forward(self, x):
        gate = F.silu(self.w1(x))
        value = self.w3(x)
        return self.w2(gate * value)


class TransformerBlock(nn.Module):
    """One pre-norm layer: attention then feed-forward, each with a residual add."""

    def __init__(self, layer_id: int, args: ModelArgs):
        super().__init__()
        self.n_heads = args.n_heads
        self.dim = args.dim
        self.head_dim = args.dim // args.n_heads
        self.attention = Attention(args)
        # The feed-forward layer is asked for 4 * dim hidden units.
        self.feed_forward = FeedForward(
            dim=args.dim,
            hidden_dim=4 * args.dim,
            multiple_of=args.multiple_of,
        )
        self.layer_id = layer_id
        self.attention_norm = RMSNorm(args.dim, eps=args.norm_eps)
        self.ffn_norm = RMSNorm(args.dim, eps=args.norm_eps)

    def forward(self, x: torch.Tensor, start_pos: int, freqs_cis: torch.Tensor, mask: Optional[torch.Tensor]):
        """Apply the attention and feed-forward sub-layers to ``x``."""
        attn_out = self.attention.forward(
            self.attention_norm(x), start_pos, freqs_cis, mask
        )
        h = x + attn_out
        ffn_out = self.feed_forward.forward(self.ffn_norm(h))
        return h + ffn_out


class Transformer(nn.Module):
    """Stack of TransformerBlock layers that returns logits for the last position.

    The model embeds the input tokens, runs them through ``n_layers`` blocks
    (each of which reads and updates its own key/value cache), applies a
    final RMSNorm and projects the last position to vocabulary logits.
    """

    def __init__(self, params: ModelArgs):
        super().__init__()
        self.params = params
        self.vocab_size = params.vocab_size
        self.n_layers = params.n_layers

        self.tok_embeddings = ParallelEmbedding(
            params.vocab_size, params.dim, init_method=lambda x: x
        )

        self.layers = torch.nn.ModuleList()
        for layer_id in range(params.n_layers):
            self.layers.append(TransformerBlock(layer_id, params))

        self.norm = RMSNorm(params.dim, eps=params.norm_eps)
        self.output = ColumnParallelLinear(
            params.dim, params.vocab_size, bias=False, init_method=lambda x: x
        )

        # Rotary table is precomputed for twice max_seq_len positions.
        self.freqs_cis = precompute_freqs_cis(
            self.params.dim // self.params.n_heads, self.params.max_seq_len * 2
        )

    @torch.inference_mode()
    def forward(self, tokens: torch.Tensor, start_pos: int):
        """Run the model on ``tokens`` and return float32 logits for the last position.

        Args:
            tokens: integer tensor of shape (bsz, seqlen) holding the new tokens.
            start_pos: number of positions already stored in the key/value
                caches; the new tokens occupy ``start_pos .. start_pos + seqlen``.

        Returns:
            The output projection applied to the last position only, as float32.
        """
        _bsz, seqlen = tokens.shape
        h = self.tok_embeddings(tokens)
        self.freqs_cis = self.freqs_cis.to(h.device)
        freqs_cis = self.freqs_cis[start_pos : start_pos + seqlen]

        mask = None
        if seqlen > 1:
            # Attention scores span the cached prefix plus the new tokens, so
            # the mask needs start_pos + seqlen columns. Query row i sits at
            # absolute position start_pos + i and may see columns
            # j <= start_pos + i; triu with diagonal=start_pos + 1 keeps -inf
            # exactly where j > start_pos + i. For start_pos == 0 this is the
            # usual (seqlen, seqlen) causal mask.
            mask = torch.full(
                (1, 1, seqlen, start_pos + seqlen),
                float("-inf"),
                device=tokens.device,
            )
            mask = torch.triu(mask, diagonal=start_pos + 1).type_as(h)

        for layer in self.layers:
            h = layer(h, start_pos, freqs_cis, mask)
        h = self.norm(h)
        output = self.output(h[:, -1, :])  # only compute last logits
        return output.float()


### LLaMA Generator Construction


In [4]:

class LLaMA:
    """Text generator that pairs a Transformer with its tokenizer."""

    def __init__(self, model: Transformer, tokenizer: LLaMA_Tokenizer):
        self.model = model
        self.tokenizer = tokenizer

    def generate(
        self,
        prompts: List[str],
        max_gen_len: int,
        temperature: float = 0.8,
        top_p: float = 0.95,
    ) -> List[str]:
        """Continue every prompt and return the decoded texts.

        Args:
            prompts: batch of prompt strings (at most ``max_batch_size``).
            max_gen_len: maximum number of tokens generated per prompt.
            temperature: softmax temperature; values <= 0 select greedy argmax.
            top_p: cumulative-probability threshold used when sampling.

        Returns:
            One decoded string per prompt, truncated at the first EOS token
            if one is present.
        """
        bsz = len(prompts)
        params = self.model.params
        assert bsz <= params.max_batch_size, (bsz, params.max_batch_size)

        prompt_tokens = [self.tokenizer.encode(x, bos=True, eos=False) for x in prompts]
        prompt_lengths = [len(ids) for ids in prompt_tokens]
        min_prompt_size = min(prompt_lengths)
        max_prompt_size = max(prompt_lengths)

        total_len = min(params.max_seq_len, max_gen_len + max_prompt_size)

        # Token buffer pre-filled with the pad id, then overwritten by prompts.
        pad_id = self.tokenizer.pad_id
        tokens = torch.full((bsz, total_len), pad_id).cuda().long()
        for row, ids in enumerate(prompt_tokens):
            tokens[row, : len(ids)] = torch.tensor(ids).long()
        input_text_mask = tokens != pad_id

        # Generation starts where the shortest prompt ends; each step feeds
        # only the tokens added since the previous step.
        prev_pos = 0
        for cur_pos in range(min_prompt_size, total_len):
            logits = self.model.forward(tokens[:, prev_pos:cur_pos], prev_pos)
            if temperature > 0:
                probs = torch.softmax(logits / temperature, dim=-1)
                next_token = sample_top_p(probs, top_p)
            else:
                next_token = torch.argmax(logits, dim=-1)
            next_token = next_token.reshape(-1)
            # only replace token if prompt has already been generated
            tokens[:, cur_pos] = torch.where(
                input_text_mask[:, cur_pos], tokens[:, cur_pos], next_token
            )
            prev_pos = cur_pos

        eos_id = self.tokenizer.eos_id
        decoded = []
        for ids, prompt_ids in zip(tokens.tolist(), prompt_tokens):
            # cut to max gen len
            ids = ids[: len(prompt_ids) + max_gen_len]
            # cut to eos tok if any
            if eos_id in ids:
                ids = ids[: ids.index(eos_id)]
            decoded.append(self.tokenizer.decode(ids))
        return decoded


def sample_top_p(probs, p):
    """Draw one index per row from the top-``p`` portion of ``probs``.

    Probabilities are sorted in descending order along the last axis; an
    entry is dropped when the cumulative mass of the entries before it
    already exceeds ``p``. The survivors are renormalised and sampled with
    ``torch.multinomial``.

    Returns:
        Tensor of sampled indices (into the original last axis) with a
        trailing axis of size 1.
    """
    sorted_probs, sorted_idx = torch.sort(probs, dim=-1, descending=True)
    cumulative = torch.cumsum(sorted_probs, dim=-1)
    # Mass accumulated strictly before each entry.
    outside_nucleus = (cumulative - sorted_probs) > p
    sorted_probs.masked_fill_(outside_nucleus, 0.0)
    sorted_probs.div_(sorted_probs.sum(dim=-1, keepdim=True))
    choice = torch.multinomial(sorted_probs, num_samples=1)
    return torch.gather(sorted_idx, -1, choice)

### Load Function

In [7]:
# Default generation / loading settings for this notebook.
# Each value is a plain scalar: a trailing comma after an annotated
# assignment would turn it into a 1-tuple such as (0.8,).
temperature: float = 0.8  # sampling temperature
top_p: float = 0.95  # top-p sampling threshold
max_seq_len: int = 512
max_batch_size: int = 32
llama_tokenizer_n_word = 32000  # tokenizer vocabulary size

In [8]:
from fairscale.nn.model_parallel.initialize import initialize_model_parallel

def setup_model_parallel() -> Tuple[int, int]:
    """Initialise torch.distributed and fairscale model parallelism.

    Reads ``LOCAL_RANK`` and ``WORLD_SIZE`` from the environment (each
    defaulting to -1 when unset), starts the NCCL process group, sets up
    model parallelism across ``world_size`` ranks, binds this process to the
    CUDA device ``local_rank`` and seeds the torch RNG.

    Returns:
        ``(local_rank, world_size)``.
    """
    local_rank, world_size = (
        int(os.environ.get(key, -1)) for key in ("LOCAL_RANK", "WORLD_SIZE")
    )

    torch.distributed.init_process_group("nccl")
    initialize_model_parallel(world_size)
    torch.cuda.set_device(local_rank)

    # Every process must use the same seed.
    torch.manual_seed(1)
    return local_rank, world_size

In [9]:
import json
import time
from pathlib import Path

def load(
    ckpt_dir: str,
    tokenizer_path: str,
    local_rank: int,
    world_size: int,
    max_seq_len: int,
    max_batch_size: int,
):
    """Build a LLaMA generator from a checkpoint directory.

    Args:
        ckpt_dir: directory holding the ``*.pth`` shards and ``params.json``.
        tokenizer_path: path to the SentencePiece model file.
        local_rank: index of the shard (in sorted order) this process loads.
        world_size: number of model-parallel processes; must equal the
            number of ``*.pth`` files in ``ckpt_dir``.
        max_seq_len: maximum sequence length passed to ``ModelArgs``.
        max_batch_size: maximum batch size passed to ``ModelArgs``.

    Returns:
        A ``LLaMA`` instance wrapping the loaded model and tokenizer.

    Raises:
        AssertionError: if the shard count does not match ``world_size``.
    """
    start_time = time.time()
    checkpoints = sorted(Path(ckpt_dir).glob("*.pth"))
    assert world_size == len(
        checkpoints
    ), f"Loading a checkpoint for MP={len(checkpoints)} but world size is {world_size}"
    ckpt_path = checkpoints[local_rank]
    print("Loading")

    # NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
    checkpoint = torch.load(ckpt_path, map_location="cpu")
    with open(Path(ckpt_dir) / "params.json", "r") as f:
        params = json.load(f)

    model_args: ModelArgs = ModelArgs(
        max_seq_len=max_seq_len, max_batch_size=max_batch_size, **params
    )

    tokenizer = LLaMA_Tokenizer(model_path=tokenizer_path)
    model_args.vocab_size = tokenizer.n_words

    # Build the model with CUDA half tensors as the default type, and always
    # restore the float default afterwards, even if construction fails, so a
    # failed load does not leave the process with a changed global default.
    torch.set_default_tensor_type(torch.cuda.HalfTensor)
    try:
        model = Transformer(model_args)
    finally:
        torch.set_default_tensor_type(torch.FloatTensor)
    model.load_state_dict(checkpoint, strict=False)

    generator = LLaMA(model, tokenizer)
    print(f"Loaded in {time.time() - start_time:.2f} seconds")
    return generator

# Tokenization Test

In [3]:
# Build the tokenizer from the SentencePiece model file under pyllama_data/.
tokenizer_pth = 'pyllama_data/tokenizer.model'
llama_tokenizer = LLaMA_Tokenizer(model_path=tokenizer_pth)

In [4]:
# Encode a short string with a leading BOS id and no EOS id.
llama_tokenizer.encode(s='sss', bos=True, eos=False)

[1, 269, 893]

In [5]:
# Report the vocabulary size and the special token ids of the tokenizer.
for _label, _value in (
    ('the number of ', llama_tokenizer.n_words),
    ('the token_id of begining of the sentence is:', llama_tokenizer.bos_id),
    ('the token_id of end of the sentence is:', llama_tokenizer.eos_id),
    ('the token_id of padding is:', llama_tokenizer.pad_id),
):
    print(_label, _value)

the number of  32000
the token_id of begining of the sentence is: 1
the token_id of end of the sentence is: 2
the token_id of padding is: -1


## Text Test

In [6]:
# Long multi-paragraph sample text used to measure a token count.
# NOTE(review): presumably model output captured from an earlier generation
# run (a later cell labels it "generated text") — confirm.
text = """
to find happiness and be satisfied with what you have.
People have different definitions of happiness. Some people feel that if they could only win the lottery, they would be happy. Some people feel that if they could only get that promotion, they would be happy. Some people feel that if they could only be the top scorer in a game, they would be happy.
If you do not know what happiness is, I suggest you ask a psychologist. A psychologist has studied the subject of happiness and he or she knows what happiness is. A psychologist has a Ph.D. in psychology and is an expert on the subject of happiness. A psychologist knows how to make people happy.
Although you might know what happiness is, you might have forgotten it. If that is the case, I suggest you consult a psychologist. A psychologist can make you happy again. A psychologist can help you discover your happiness and how to be happy.
Happiness is a big word. Happiness is a nice word. Happiness is a beautiful word.
I believe that the meaning of life is to find happiness and be satisfied with what you have.
People have different definitions of happiness. Some people feel
1) there is no absolute time or space and 2) the speed of light in a vacuum is the fastest speed possible. There are two key principles in relativity:
(1) The laws of physics are the same in all inertial reference frames.
(2) The speed of light is constant in all inertial reference frames.
The second of these principles has allowed us to prove the first.
Before Einstein, scientists believed that the speed of light was constant in all frames, but that the speed of light was not constant. This was called the constancy of the speed of light hypothesis. In the late 19th century, scientists such as Michelson and Morley and Lorentz had set up experiments to test this hypothesis.
For example, when Michelson and Morley set up their Michelson-Morley interferometer, they expected that the light would take a different path depending on whether it was moving at the same speed as the Earth or at a different speed. They found that it didn't, but the constancy of the speed of light hypothesis said that it would.
Why didn't the constancy of the speed of light hypothesis work? Because it was wrong
1. Decide what you need
What is it that you need to do? Do you want people to buy a product, do you want to have a blog or do you want to provide a service? These are all different types of websites. The type of website you need will have a large impact on how you design your website.
2. Do you want a responsive website?
Some websites are better suited for a mobile or tablet. If you are planning to have a mobile site, then it is recommended to have a responsive website. This allows your website to be displayed on any device.
3. What is your target audience?
Having a clear idea of who your target audience is can help you design a website that will be targeted to them. Also, you will be able to know how to market your website and get people to your website.
4. What are you going to use?
There are many different web designing platforms. Some are free and others cost money. There are pros and cons to both types of websites.
5. Decide on the look and feel
What is your website going to look like? What colors are you going to use? Is there a certain logo or icon that you
Positive
###
Tweet: "My heart is broken"
Sentiment: Negative
###
Tweet: "I have some great news"
Sentiment: Positive
###
Tweet: "My favorite band just announced a new album"
Sentiment: Positive
###
Tweet: "That food was so good"
Sentiment: Positive
###
Tweet: "My company just moved to a new building"
Sentiment: Positive
###
Tweet: "I just ate the best lunch ever"
Sentiment: Positive
###
Tweet: "We are at 87% to our goal"
Sentiment: Negative
###
Tweet: "I just got an awesome new job"
Sentiment: Positive
###
Tweet: "My favorite sports team just won the championship"
Sentiment: Positive
###
Tweet: "My favorite sports team just lost"
Sentiment: Negative
###
Tweet: "My favorite sports team just won"
Sentiment: Positive
##
fromage

Tell me what you need, and I'll do my best to give you the best answers.

Answer: You will find many translators on the web. For example: http://www.french-translator.com/translator.html

You can also use the Wikipedia as a guide to understand the french meaning.

Comment: Your answer could be improved with additional supporting information. Please [edit] to add further details, such as citations or documentation, so that others can confirm that your answer is correct. You can find more information on how to write good answers [in the help center](/help/how-to-answer).

Answer: There is no automatic tool to translate English to French. There are a couple of tools on the web, but I do not know if you will find anything to help you.

You can find a translation in a dictionary, a bilingual dictionary, a French to English one, if you find a French definition for the term.

Then you can try to match your translation with the French definition.

Answer: You can use Google Translate to translate words from English to French. But it is important to note that
"""

In [7]:
# All example prompts joined into one string, used to measure a token count.
# The "\n" escape inside the literal is an additional newline character.
prompt = """
I believe the meaning of life is
Simply put, the theory of relativity states that 
Building a website can be done in 10 simple steps:\n
Tweet: "I hate it when my phone battery dies."
Sentiment: Negative
###
Tweet: "My day has been 👍"
Sentiment: Positive
###
Tweet: "This is the link to the article"
Sentiment: Neutral
###
Tweet: "This new music video was incredibile"
Sentiment:
Translate English to French:

sea otter => loutre de mer

peppermint => menthe poivrée

plush girafe => girafe peluche

cheese =>
"""

In [8]:
# Token counts (including the BOS id) of the sample text and of the prompts.
_text_ids = llama_tokenizer.encode(s=text, bos=True, eos=False)
print('length of generated text is:', str(len(_text_ids)))
_prompt_ids = llama_tokenizer.encode(s=prompt, bos=True, eos=False)
print('length of prompt is:', str(len(_prompt_ids)))

length of generated text is: 1288
length of prompt is: 182


# Generator Test

In [10]:
import sys

def gen(
    ckpt_dir: str,
    tokenizer_path: str,
    temperature: float = 0.8,
    top_p: float = 0.95,
    max_seq_len: int = 512,
    max_batch_size: int = 32,
):
    """Load the model and print completions for a fixed set of example prompts.

    Sets up model parallelism, loads the checkpoint in ``ckpt_dir`` with the
    tokenizer at ``tokenizer_path``, generates up to 256 new tokens per
    prompt and prints each result followed by a separator, then the elapsed
    generation time in seconds.
    """
    local_rank, world_size = setup_model_parallel()
    if local_rank > 0:
        # Ranks above 0 discard everything they print.
        sys.stdout = open(os.devnull, "w")

    generator = load(
        ckpt_dir, tokenizer_path, local_rank, world_size, max_seq_len, max_batch_size
    )

    # Few shot prompts: https://huggingface.co/blog/few-shot-learning-gpt-neo-and-inference-api
    sentiment_prompt = """Tweet: "I hate it when my phone battery dies."
Sentiment: Negative
###
Tweet: "My day has been 👍"
Sentiment: Positive
###
Tweet: "This is the link to the article"
Sentiment: Neutral
###
Tweet: "This new music video was incredibile"
Sentiment:"""
    translation_prompt = """Translate English to French:

sea otter => loutre de mer

peppermint => menthe poivrée

plush girafe => girafe peluche

cheese =>"""

    prompts = [
        # For these prompts, the expected answer is the natural continuation of the prompt
        "I believe the meaning of life is",
        "Simply put, the theory of relativity states that ",
        "Building a website can be done in 10 simple steps:\n",
        sentiment_prompt,
        translation_prompt,
    ]

    started = time.time()
    results = generator.generate(
        prompts, max_gen_len=256, temperature=temperature, top_p=top_p
    )
    elapsed = time.time() - started

    separator = "\n==================================\n"
    for completion in results:
        print(completion)
        print(separator)
    print(elapsed)

In [None]:
# Run this command on a machine with at least 40 GB of RAM
!torchrun --nproc_per_node 1 example.py --ckpt_dir ../pyllama/pyllama_data/7B --tokenizer_path ../pyllama/pyllama_data/tokenizer.model