<a href="https://colab.research.google.com/github/towardsai/ragbook-notebooks/blob/main/notebooks/Chapter%2011%20-%20Benchmark_Inference.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip uninstall -y transformers
!pip install optimum[neural-compressor]===1.22.0
!pip install onnx===1.14.1 evaluate===0.4.0

In [None]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

# Hub repo id of the checkpoint to quantize (an OPT-1.3B SQuAD fine-tune, going by its name).
model_name = "aman-mehra/opt-1.3b-finetune-squad-ep-0.4-lr-2e-05-wd-0.01"

# Tokenizer and question-answering model come from the same repo id and share
# one local cache directory.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    cache_dir="./opt-1.3b",
)
model = AutoModelForQuestionAnswering.from_pretrained(
    model_name,
    cache_dir="./opt-1.3b",
)

In [None]:
import evaluate
from datasets import load_dataset

# Evaluator object for the "question-answering" task.
task_evaluator = evaluate.evaluator("question-answering")

# SQuAD validation split, cached under ./squad-ds; only the first 64 examples
# are kept (a subset of the dataset).
eval_dataset = load_dataset(
    "squad",
    split="validation",
    cache_dir="./squad-ds",
).select(range(64))

In [None]:
from transformers import pipeline

# Question-answering pipeline built from the model and tokenizer loaded earlier;
# `eval_fn` later swaps the model held by this pipeline.
qa_pipeline = pipeline(
    "question-answering",
    tokenizer=tokenizer,
    model=model,
)

def eval_fn(model):
    """Return the "f1" value of the "squad" metric for `model`.

    The notebook-level `qa_pipeline` has its model replaced by `model`
    (the pipeline object is mutated in place), and is then scored by
    `task_evaluator` on `eval_dataset`.
    """
    qa_pipeline.model = model
    squad_scores = task_evaluator.compute(
        model_or_pipeline=qa_pipeline,
        data=eval_dataset,
        metric="squad",
    )
    return squad_scores["f1"]

In [13]:
from neural_compressor.config import AccuracyCriterion, TuningCriterion, PostTrainingQuantConfig

# Tolerate an accuracy loss of at most 1% (tolerable_loss=0.01)
accuracy_criterion = AccuracyCriterion(tolerable_loss=0.01)

# Cap the tuning process at 10 trials
tuning_criterion = TuningCriterion(max_trials=10)

# Post-training quantization using the "dynamic" approach, bounded by the two
# criteria defined just above
quantization_config = PostTrainingQuantConfig(
    approach="dynamic",
    tuning_criterion=tuning_criterion,
    accuracy_criterion=accuracy_criterion,
)

In [None]:
from optimum.intel import INCQuantizer

# Build the quantizer around the loaded model. `eval_fn` is handed over as the
# evaluation callback (presumably used to check accuracy during tuning).
quantizer = INCQuantizer.from_pretrained(model, eval_fn=eval_fn)

# Quantize with the `quantization_config` defined earlier and save the result
# to the "opt1.3b-quantized" directory.
quantizer.quantize(
    save_directory="opt1.3b-quantized",
    quantization_config=quantization_config,
)

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer of the base "facebook/opt-1.3b" checkpoint. This rebinds
# `tokenizer`, which earlier in the notebook came from the fine-tuned SQuAD repo.
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-1.3b")

In [None]:
from optimum.intel import INCModelForCausalLM

# Load the quantized checkpoint from "./opt1.3b-quantized" (the directory the
# quantization cell saves to). This rebinds `model`.
# NOTE(review): the quantizer was given an AutoModelForQuestionAnswering model,
# while the checkpoint is loaded here with a causal-LM class — confirm that the
# saved weights are compatible with this head.
model = INCModelForCausalLM.from_pretrained("./opt1.3b-quantized")

In [None]:
from transformers import set_seed

# Sampling is enabled below (do_sample=True), so seed the random number
# generators first; otherwise the generated text differs on every run.
set_seed(42)

# Tokenize the prompt into PyTorch tensors.
inputs = tokenizer("What does life mean? Describe in great details.", return_tensors="pt")

generation_output = model.generate(
    **inputs,
    # Return an output object rather than a bare tensor; `.sequences` is read
    # from it when the result is decoded.
    return_dict_in_generate=True,
    output_scores=True,
    # min_length and max_length are both 512, which pins the sequence length
    # (presumably so that every run does the same amount of work).
    min_length=512,
    max_length=512,
    # Single beam + sampling, with a penalty on repeated tokens.
    num_beams=1,
    do_sample=True,
    repetition_penalty=1.5,
)

In [None]:
# Decode the first sequence of the generation output back into text and print it.
print(tokenizer.decode(generation_output.sequences[0]))