In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
from peft import PeftModel
from typing import Optional
import time
import os

In [2]:
def generate_prompt(input_text: str, instruction: Optional[str] = None) -> str:
    """Format a question, and optionally an instruction, into the prompt template.

    Args:
        input_text: The question text, placed after the ``### Question:`` header.
        instruction: Optional text placed after an ``### Instruction:`` header
            ahead of the question. ``None`` or an empty string omits that section.

    Returns:
        The assembled prompt, which always ends with the ``### Answer: `` header.
    """
    question_section = f"### Question: {input_text}\n\n### Answer: "
    # Guard clause: with no instruction the question section is the whole prompt.
    if not instruction:
        return question_section
    return f"### Instruction: {instruction}\n\n{question_section}"

In [None]:
# Access token for the Hugging Face Hub, read from the environment.
# The value is None when HUGGINGFACE_TOKEN is not set.
huggingface_token = os.getenv('HUGGINGFACE_TOKEN')

In [3]:
# The tokenizer and the base weights are loaded from the same checkpoint name
# with the same access token; the two loads do not depend on each other.
tokenizer = AutoTokenizer.from_pretrained(
    "google/gemma-2b",
    token=huggingface_token,
)
base_model = AutoModelForCausalLM.from_pretrained(
    "google/gemma-2b",
    token=huggingface_token,
)

# Attach the LoRA adapter published under this repo id to the base model.
lora_model = PeftModel.from_pretrained(
    base_model,
    "vdpappu/lora_psychology-1",
)

`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

  warn("The installed version of bitsandbytes was compiled without GPU support. "


'NoneType' object has no attribute 'cadam32bit_grad_fp32'


In [10]:
# Take the end-of-sequence token from the tokenizer itself so there is a single
# source of truth; previously the id was computed by hand and then ignored in
# favour of tokenizer.eos_token_id.
eos_token = tokenizer.eos_token
eos_token_id = tokenizer.eos_token_id

# Sampling configuration used for every generate() call in this notebook.
generation_config = GenerationConfig(
    eos_token_id=eos_token_id,
    # When no pad id is configured, generate() falls back to the eos id and logs
    # a notice on every call; setting it here keeps that behavior, minus the noise.
    pad_token_id=eos_token_id,
    # min_length counts the prompt tokens as well, so a bound of 5 is already
    # satisfied by the prompt alone. min_new_tokens constrains only the
    # generated continuation, which is what a minimum answer length needs.
    min_new_tokens=5,
    # Upper bound on prompt + generated tokens.
    max_length=200,
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
    top_k=50,
    repetition_penalty=1.5,
    no_repeat_ngram_size=3,
    # early_stopping was dropped: it only affects beam search (num_beams > 1)
    # and has no effect on single-beam sampling.
)



In [11]:
# Merge the LoRA adapter into the underlying model and keep the result for
# inference in the cells below.
merged_model = lora_model.merge_and_unload()

In [12]:
# Sample question; generate_prompt wraps it in the Question/Answer template
# (no instruction section is added here).
question = "I like the toy but my brother is not sharing it with me. What do I do?"
prompt = generate_prompt(question)

In [15]:
# Time the full round trip: tokenization, generation, and decoding.
# perf_counter() is a monotonic high-resolution clock meant for measuring
# intervals; time.time() follows the wall clock and can jump if it is adjusted.
start = time.perf_counter()
with torch.no_grad():
    # Place the input tensors on the model's device so this cell keeps working
    # if the model is moved off the CPU.
    inputs = tokenizer(prompt, return_tensors="pt").to(merged_model.device)
    output = merged_model.generate(**inputs, generation_config=generation_config)
    # output[0] contains the prompt tokens followed by the continuation, so the
    # decoded response starts with the prompt text.
    response = tokenizer.decode(output[0], skip_special_tokens=True)
end = time.perf_counter()

Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.


In [16]:
# Print the elapsed time first, then the decoded model output on the next line.
print(f"Inference time: {end-start:.2f} seconds", response, sep="\n")

Inference time: 27.31 seconds
### Question: I like the toy but my brother is not sharing it with me. What do I do?

### Answer: </h6>
<h6>Don't be a crybaby and just take it from your brother, he probably doesn't want you to play with it anyway! It might even make him feel guilty if they give in easily for one of their own children or friends that don't have anything else yet anyways...</h6>
