In [1]:
%%capture
import os
import sys

import torch
import transformers
from peft import PeftModel
from transformers import GenerationConfig, LlamaForCausalLM, LlamaTokenizer

In [2]:
# The device is pinned to "cuda" rather than probed at runtime: the cells that
# load the base model and attach the adapter only act when device == "cuda".
device = "cuda"

In [3]:
# Forwarded as `load_in_8bit` when the base model is loaded; when False the
# model is additionally cast with .half() after the adapter is attached.
load_8bit = False
base_model = 'decapoda-research/llama-7b-hf'
# LoRA adapter applied on top of `base_model`. Defaults to the project's
# fine-tuned weights; set the ALPACA_LORA_WEIGHTS environment variable to use
# a different adapter (such as 'tloen/alpaca-lora-7b') without editing the
# notebook.
lora_weights = os.environ.get(
    "ALPACA_LORA_WEIGHTS", "/workspace/arin7102_nlp_project/FinetunedWeights"
)
# The prompt template to use, will default to alpaca.
# NOTE(review): no visible cell reads this value -- confirm it is still needed.
prompt_template = ""

In [4]:
# Tokenizer and base weights are both resolved from the `base_model` checkpoint.
tokenizer = LlamaTokenizer.from_pretrained(base_model)

# Only the CUDA path is implemented; for any other `device` value `model`
# is left undefined by this cell.
if device == "cuda":
    model = LlamaForCausalLM.from_pretrained(
        base_model,
        load_in_8bit=load_8bit,
        torch_dtype=torch.float16,
        device_map="auto",
    )

Loading checkpoint shards:   0%|          | 0/33 [00:00<?, ?it/s]

In [5]:
# Attach the LoRA adapter found at `lora_weights` to the base model.
# NOTE: `model` is rebound here, so re-running this cell without re-running
# the base-model load passes the already-wrapped model in again.
if device == "cuda":
    model = PeftModel.from_pretrained(
        model,
        lora_weights,
        torch_dtype=torch.float16,
    )

# unwind broken decapoda-research config
model.config.pad_token_id = 0  # unk
tokenizer.pad_token_id = 0  # keep the tokenizer in step with the model config
model.config.bos_token_id = 1
model.config.eos_token_id = 2

# Cast to half precision unless the 8-bit path was requested.
if not load_8bit:
    model.half()  # seems to fix bugs for some users.

# Switch to inference mode, then compile where the check below allows it.
model.eval()
if torch.__version__ >= "2" and sys.platform != "win32":
    model = torch.compile(model)

In [6]:
def alpaca_inference(input_prompt,
    temperature = 0.2, top_p = 0.75, top_k = 40, num_beams = 1,
    max_new_tokens = 256, **kwargs):
    """Generate a completion for ``input_prompt`` and return the response text.

    Uses the notebook-level ``tokenizer``, ``model`` and ``device`` globals.

    Args:
        input_prompt: Full prompt string. When it contains the
            ``"### Response:"`` marker, the text following the first marker
            is what gets returned.
        temperature, top_p, top_k, num_beams: Forwarded to ``GenerationConfig``.
            NOTE(review): Hugging Face ``generate`` normally honours the
            sampling parameters only when sampling is enabled; pass
            ``do_sample=True`` through ``**kwargs`` if sampling is intended --
            confirm against the installed transformers version.
        max_new_tokens: Forwarded to ``model.generate``.
        **kwargs: Extra keyword arguments forwarded to ``GenerationConfig``.

    Returns:
        The stripped text between the first ``"### Response:"`` marker and the
        next occurrence of the marker (or the end of the decoded text). If the
        decoded text contains no marker, the stripped decoding of only the
        tokens produced after the prompt is returned instead of raising
        ``IndexError``.
    """
    response_marker = "### Response:"

    generation_config = GenerationConfig(temperature=temperature, top_p=top_p,
        top_k=top_k, num_beams=num_beams, **kwargs)

    inputs = tokenizer(input_prompt, return_tensors="pt")
    input_ids = inputs["input_ids"].to(device)
    with torch.no_grad():
        generation_output = model.generate(
            input_ids=input_ids,
            generation_config=generation_config,
            return_dict_in_generate=True,
            output_scores=True,
            max_new_tokens=max_new_tokens,
        )
    sequence = generation_output.sequences[0]
    decoded = tokenizer.decode(sequence)

    _, found, after_marker = decoded.partition(response_marker)
    if not found:
        # The prompt carried no response marker, so there is nothing to split
        # on: drop the prompt tokens and return only the continuation.
        prompt_length = input_ids.shape[1]
        return tokenizer.decode(sequence[prompt_length:]).strip()

    # Keep only the first response section, exactly as split(...)[1] did.
    return after_marker.split(response_marker, 1)[0].strip()

In [8]:
# Instruction preamble placed ahead of the user's query.
# NOTE(review): the text is reproduced character-for-character, including the
# missing space between "question." and "Below" -- the fine-tuned adapter may
# have been trained on this exact wording, so confirm before correcting it.
default_start_prompt = (
    "### Instruction:\nYou are an AI assistant that happy to solve any question."
    "Below is an instruction paired with an input that provides further context. "
    "Write a response that appropriately completes the request."
)

query = "Hi, how are you?"

# Preamble, then the query under "### Input:", then an open "### Response:"
# section for the model to complete.
input_prompt = f"{default_start_prompt}\n\n### Input:\n{query}\n\n### Response:\n"

# Run generation under CUDA autocast.
with torch.autocast("cuda"):
    output = alpaca_inference(input_prompt)

print(output)

I am doing well, thank you for asking. How are you?

### Context:
The user is a customer who is interacting with an AI assistant.
