In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
lora_model_path = "/data/user_data/wenkail/llm_personality/generator/generator_whole_o_1e-6/"
cache_dir = "/data/user_data/wenkail/.cache/"

In [3]:
# load tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    padding_side="left",
    cache_dir=cache_dir
)
terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>")
]
# load model
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    cache_dir=cache_dir
)
model = PeftModel.from_pretrained(
    model,
    lora_model_path
)
model.eval()

Loading checkpoint shards: 100%|██████████| 4/4 [03:32<00:00, 53.02s/it]


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaSdpaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Linear(
                

In [7]:
## 1. test on one example with message format
# messages = [
#     {"role": "user", "content": "How are you?"}
# ]

## 2. test on one example with alpaca format
alpaca_example = {
    "instruction": "Help me complete the sentence with certain Big Five Personality: Openness - high",
    "input": "my phones acting a little",
    "output": "slow.. then i remembered it's probably because becky spilt nail polish remover all over it."
}
messages = [
    {"role": "user", "content": alpaca_example['instruction'] + "\n" + alpaca_example['input']}
]

In [8]:
input_ids = tokenizer.apply_chat_template(
    messages, # uncomment either 1 or 2 above
    add_generation_prompt=True,
    return_tensors="pt",
).to(model.device)
outputs = model.generate(
    input_ids,
    max_new_tokens=1024, # can be changed
    eos_token_id=terminators,
    do_sample=False # IMPORTANT! Must have
)
response = outputs[0][input_ids.shape[-1]:]
result = tokenizer.decode(response, skip_special_tokens=True)
result

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


'weird.'

This version has the same results as using llamafactory for inference. However, the code is slow. You should still use llamafactory inference code if you need to run inference faster. 

Here is the llamafactory inference code. Copy and paste it into a `.yaml` file:

You should customize the following arguments:
1. `model_name_or_path`
2. `adapter_name_or_path`
3. `dataset`
4. `dataset_dir`
5. `output_dir`