In [1]:
import intel_extension_for_pytorch as ipex
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM
from transformers import AutoTokenizer

In [2]:
# Hugging Face Hub identifiers: the base checkpoint and the adapter layered on top of it.
base_checkpoint = "mistralai/Mistral-7B-v0.1"
adapter_repo = "spikecodes/ai-911-operator"

# Load the base causal LM first, then attach the adapter to that same model object.
model = AutoModelForCausalLM.from_pretrained(base_checkpoint)
model.load_adapter(adapter_repo)

# The tokenizer is taken from the base checkpoint, not from the adapter repository.
tokenizer = AutoTokenizer.from_pretrained(base_checkpoint)



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [3]:
# Prompt: the opening of a call transcript, ending where the operator is due to speak next.
prompt = (
    "911 Operator: 9-1-1, what's your emergency?\n"
    "Caller: There's a fire in my kitchen!\n"
    "911 Operator:"
)
inputs = tokenizer(prompt, return_tensors="pt")

# Take the end-of-sequence id from the tokenizer that was actually loaded instead of
# hard-coding 128001: that value is not a valid id for the Mistral-7B-v0.1 vocabulary,
# so generation could never stop on EOS and always ran to max_new_tokens.
stop_token_id = tokenizer.eos_token_id

outputs = model.generate(
    input_ids=inputs["input_ids"],
    # Pass the mask explicitly so generate() does not have to infer it.
    attention_mask=inputs["attention_mask"],
    # The tokenizer's EOS id is reused for padding, as the original call did with one id for both.
    pad_token_id=stop_token_id,
    eos_token_id=stop_token_id,
    max_new_tokens=50,
)

KeyboardInterrupt: 

In [4]:
# Convert the first generated sequence back to text (special tokens dropped) and show it.
first_sequence = outputs[0]
decoded_text = tokenizer.decode(first_sequence, skip_special_tokens=True)
print(decoded_text)

911 Operator: 9-1-1, what's your emergency?
Caller: There's a fire in my kitchen!
911 Operator: Where are you?
Caller: I'm at 1234 Main Street.
911 Operator: What's your name?
Caller: My name is John Smith.
911 Operator:


In [None]:
# Weight-only quantization settings: INT8 weights, no low-precision compute mode.
woq = ipex.quantization
qconfig = woq.get_weight_only_quant_qconfig_mapping(
    weight_dtype=woq.WoqWeightDtype.INT8,  # or INT4/NF4
    lowp_mode=woq.WoqLowpMode.NONE,  # or FP16, BF16, INT8
)

# Switch the model to eval mode, then hand it to the IPEX LLM optimizer with the config above.
eval_model = model.eval()
model_ipex = ipex.llm.optimize(eval_model, quantization_config=qconfig)

In [12]:
# Same transcript prompt as the baseline run, ending where the operator is due to speak next.
prompt = (
    "911 Operator: 9-1-1, what's your emergency?\n"
    "Caller: There's a fire in my kitchen!\n"
    "911 Operator:"
)
inputs = tokenizer(prompt, return_tensors="pt")

# Take the end-of-sequence id from the tokenizer that was actually loaded instead of
# hard-coding 128001: that value is not a valid id for the Mistral-7B-v0.1 vocabulary,
# so generation could never stop on EOS and always ran to max_new_tokens.
stop_token_id = tokenizer.eos_token_id

with torch.inference_mode():
    outputs = model_ipex.generate(
        input_ids=inputs["input_ids"],
        # Pass the mask explicitly so generate() does not have to infer it.
        attention_mask=inputs["attention_mask"],
        # The tokenizer's EOS id is reused for padding, as the original call did with one id for both.
        pad_token_id=stop_token_id,
        eos_token_id=stop_token_id,
        max_new_tokens=50,
    )

# Decode the first generated sequence (special tokens dropped) and print it.
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

911 Operator: 9-1-1, what's your emergency?
Caller: There's a fire in my kitchen!
911 Operator: Where are you?
Caller: I'm at 1234 Main Street.
911 Operator: What's your name?
Caller: My name is John Smith.
911 Operator:
