# llama v1 model inference

## preparation

Acquire the weight files and save them to /mnt/LLaMA.

    ├── 7B
    │   ├── checklist.chk
    │   ├── consolidated.00.pth
    │   └── params.json
    ├── tokenizer_checklist.chk
    └── tokenizer.model

In [None]:
# Fetch the git submodules this notebook relies on.
# --init registers submodules that were never initialised (a plain
# `git submodule update` skips those), and --recursive also covers any
# nested submodules.
!git submodule update --init --recursive

In [None]:
import sys

# Make the checked-out submodules importable.
# The transformers repository keeps its package under src/ (the conversion
# script used in this notebook lives in transformers/src/transformers/...),
# so the directory that must be on sys.path is ".../transformers/src",
# not the repository root.
for submodule_path in (
    "/mnt/llama-inference/llama",
    "/mnt/llama-inference/transformers/src",
):
    # Re-running the cell must not keep appending duplicate entries.
    if submodule_path not in sys.path:
        sys.path.append(submodule_path)
print(sys.path)

## approach 1. use transformers LlamaForCausalLM

Convert the original weight files to Hugging Face (hf) format.

The output is written to /mnt/LLaMA/:

    ├── hf_7B
    │   ├── config.json
    │   ├── generation_config.json
    │   ├── pytorch_model-00001-of-00002.bin
    │   ├── pytorch_model-00002-of-00002.bin
    │   ├── pytorch_model.bin.index.json
    │   ├── special_tokens_map.json
    │   ├── tokenizer_config.json
    │   ├── tokenizer.json
    │   └── tokenizer.model

In [None]:
# Convert the original 7B checkpoint found under /mnt/LLaMA/ into the
# Hugging Face layout, writing the result to /mnt/LLaMA/hf_7B/.
# NOTE(review): the script path is relative, so this presumably has to run
# from the directory that contains the transformers submodule - confirm.
!python3 transformers/src/transformers/models/llama/convert_llama_weights_to_hf.py  --input_dir /mnt/LLaMA/ --model_size 7B --output_dir /mnt/LLaMA/hf_7B/

In [None]:
from transformers import LlamaForCausalLM, LlamaTokenizer

# The model and the tokenizer are both read from the converted checkpoint
# directory produced by the conversion step.
hf_model_dir = "/mnt/LLaMA/hf_7B"
model = LlamaForCausalLM.from_pretrained(hf_model_dir, device_map="auto")
tokenizer = LlamaTokenizer.from_pretrained(hf_model_dir)

# Report the footprint in GiB (bytes / 1024^3), then where the model lives
# and what its module tree looks like.
footprint_gb = model.get_memory_footprint() / 1024 ** 3
print(f"Mem needed: {footprint_gb:.2f} GB")
print(model.device)
print(model)

In [None]:
# A plain string, matching the prompt used for the raw llama model later in
# this notebook. The previous trailing comma turned it into a 1-tuple, which
# only worked if the tokenizer treated it as a batch of one.
prompt = "I believe the meaning of life in its simplest form is"

# Tokenize to a tensor of ids and move it next to the model.
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)

# Generate up to 64 tokens in total.
# NOTE(review): with do_sample=True and top_p=0 the nucleus presumably
# shrinks to the single most likely token, i.e. effectively greedy decoding,
# mirroring temperature = 1 / top_p = 0 in the manual loop further down -
# confirm.
outputs = model.generate(
    input_ids,
    max_length=64,
    top_p=0,
    temperature=1,
    do_sample=True,
)

# Turn the generated ids back into text and show the first (only) sequence.
decoded = tokenizer.batch_decode(
    outputs,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
)
print(decoded[0])


## approach 2. load weights directly to llama

Refer to [example.py](https://github.com/facebookresearch/llama/blob/llama_v1/example.py).

In [None]:
from pathlib import Path
import torch
import json

from llama import ModelArgs, Tokenizer, Transformer
from fairscale.nn.model_parallel.initialize import initialize_model_parallel

import os

# load checkpoint
# Deserialize onto the CPU: the model itself is created on the GPU below, so
# loading the checkpoint straight to "cuda" would keep two copies of the
# weights in GPU memory at the same time. load_state_dict() copies each
# tensor onto the device of the parameter it fills.
checkpoint = torch.load("/mnt/LLaMA/7B/consolidated.00.pth", map_location="cpu")

# load tokenizer
tokenizer = Tokenizer(model_path="/mnt/LLaMA/tokenizer.model")

# init model args
# params.json supplies the architecture hyper-parameters; the sequence and
# batch limits are chosen here.
with open("/mnt/LLaMA/7B/params.json", "r") as f:
    params = json.load(f)
model_args: ModelArgs = ModelArgs(
    max_seq_len=512, max_batch_size=32, **params
)
# The vocabulary size is taken from the tokenizer rather than params.json.
model_args.vocab_size = tokenizer.n_words
print(model_args)

# load model
# Single-process settings (rank 0 of 1, local rendezvous address) read by
# torch.distributed.init_process_group() below.
os.environ["RANK"] = "0"
os.environ["WORLD_SIZE"] = "1"
os.environ["MASTER_ADDR"] = "localhost"
os.environ["MASTER_PORT"] = "49954"
# Guarded so that re-running the cell does not initialise the process group
# a second time.
if not torch.distributed.is_initialized():
    torch.distributed.init_process_group("nccl")
    world_size = 1 # no. pt files
    initialize_model_parallel(world_size)
    local_rank = 0
    torch.cuda.set_device(local_rank)

# Build the module with CUDA half-precision tensors as the default type, then
# restore the float default for everything created afterwards.
torch.set_default_tensor_type(torch.cuda.HalfTensor)
model = Transformer(model_args)
torch.set_default_tensor_type(torch.FloatTensor)
# strict=False tolerates keys that exist on only one side (checkpoint vs module).
model.load_state_dict(checkpoint, strict=False)
print(model)

In [None]:
prompt = "I believe the meaning of life in its simplest form is"
# Encode with a leading BOS token and no trailing EOS.
prompt_token = tokenizer.encode(prompt, bos=True, eos=False)
print('prompt_token', prompt_token)

max_seq_len = 64

# One row of max_seq_len slots on the GPU, pre-filled with the pad id; the
# prompt ids then overwrite the leading slots.
# NOTE(review): assumes tokenizer.pad_id is a plain int - confirm.
prompt_len = len(prompt_token)
tokens = torch.full(
    (1, max_seq_len), tokenizer.pad_id, dtype=torch.long, device="cuda"
)
tokens[0, :prompt_len] = torch.tensor(prompt_token, dtype=torch.long)
print(tokens)

## method 1
Advance start_pos on every step so that, from the 2nd iteration on, x has shape (1, 1).

In [None]:
# from generation.py
def sample_top_p(probs, top_p):
    """Sample one index per row from the top-p (nucleus) part of `probs`.

    Entries are ranked by descending probability along the last dimension.
    An entry is dropped when the probability mass ranked strictly before it
    already exceeds `top_p`, so the highest-ranked entry always survives.
    The survivors are renormalised and one of them is drawn with
    torch.multinomial.

    Args:
        probs: tensor of probabilities along the last dimension.
        top_p: cumulative-probability threshold.

    Returns:
        Tensor of sampled indices into the last dimension of `probs`, with a
        trailing dimension of size 1. `probs` itself is not modified.
    """
    ranked_probs, ranked_idx = probs.sort(dim=-1, descending=True)
    running_mass = ranked_probs.cumsum(dim=-1)
    # Mass accumulated *before* each entry, compared against the threshold.
    outside_nucleus = (running_mass - ranked_probs) > top_p
    nucleus = ranked_probs.masked_fill(outside_nucleus, 0.0)
    nucleus = nucleus / nucleus.sum(dim=-1, keepdim=True)
    picked_rank = torch.multinomial(nucleus, num_samples=1)
    # Translate the sampled rank back to the original index.
    return ranked_idx.gather(-1, picked_rank)

temperature = 1
top_p = 0
start_pos = 0
# Fill the remaining slots one token at a time. The first pass feeds the
# whole prompt; every later pass feeds only the token written by the previous
# pass, together with its position in the sequence.
# NOTE(review): this presumably relies on the model remembering earlier
# positions between calls - confirm against the llama Transformer code.
for cur_pos in range(len(prompt_token), max_seq_len):
    window = tokens[:, start_pos:cur_pos]
    logits = model.forward(window, start_pos)
    scaled_logits = logits / temperature
    probs = torch.softmax(scaled_logits, dim=-1)
    next_token = sample_top_p(probs, top_p)
    tokens[:, cur_pos] = next_token
    # The next pass starts at the token that was just written.
    start_pos = cur_pos
generated_ids = tokens[0, :].tolist()
print(tokenizer.decode(generated_ids))

## method 2
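Use start_pos=0 on every iteration, feeding the whole sequence generated so far; temperature and top_p are ignored and the argmax token is taken instead.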

In [None]:
# Greedy decoding: every pass feeds the entire sequence built so far,
# starting from position 0, and writes the highest-scoring token into the
# next slot.
for cur_pos in range(len(prompt_token), max_seq_len):
    prefix = tokens[:, :cur_pos]
    logits = model.forward(prefix, start_pos=0)
    next_token = logits.argmax(dim=-1)
    tokens[:, cur_pos] = next_token
generated_ids = tokens[0].tolist()
print(tokenizer.decode(generated_ids))