In [1]:
## ref : https://github.com/philschmid/deep-learning-pytorch-huggingface/blob/main/training/optimize-llama-2-gptq.ipynb

In [6]:
# !sudo pip install -q transformers --upgrade

In [5]:
# !sudo -H pip install auto-gptq --no-cache-dir

In [4]:
# !sudo -H pip install --upgrade optimum

In [1]:
import torch
torch.cuda.empty_cache()

In [2]:
import gc
gc.collect()

0

In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from optimum.gptq import GPTQQuantizer, load_quantized_model
import torch
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"

CUDA extension not installed.
CUDA extension not installed.


In [4]:
model_name = "mistralai/Mistral-7B-v0.1"

In [5]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [6]:
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16,device_map='auto')

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
quantizer = GPTQQuantizer(bits=4, dataset="wikitext2")
quantizer.quant_method = "gptq"

In [None]:
quantized_model = quantizer.quantize_model(model, tokenizer)

In [None]:
quant_path = "/data/quantization-trials/GPTQ-quantized/ravi"

In [8]:
# save the quantize model to disk

quantized_model.save_pretrained(quant_path, safe_serialization=True)

### Inference on quantized model

In [9]:
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig

In [10]:
gptq_config = GPTQConfig(bits=4, use_exllama=True)

model_id = "/data/quantization-trials/GPTQ-quantized"
quant_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="cuda", torch_dtype=torch.float16)

Using `disable_exllama` is deprecated and will be removed in version 4.37. Use `use_exllama` instead and specify the version with `exllama_config`.The value of `use_exllama` will be overwritten by `disable_exllama` passed in `GPTQConfig` or stored in your config file.


In [11]:
def predict_from_quant(user_query):
    _inputs = tokenizer.encode(user_query, return_tensors="pt").to('cuda')
    outputs = quant_model.generate(input_ids=_inputs, max_length= 1000, pad_token_id=tokenizer.eos_token_id)
    output = tokenizer.decode(outputs[0])
    return output

import time

In [12]:
# Using quant model
start = time.time()
output1 = predict_from_quant("what is science")
print("time taken is :", time.time()-start)

time taken is : 133.77150201797485


In [8]:
print(output1)

</s>what is science?

Science is the study of the world around us. It is the study of the physical, chemical, and biological world around us. It is the study of the world around us.

What is the world around us?

The world around us is made up of the physical, chemical, and biological world around us. The world around us is made up of the physical, chemical, and biological world around us.

What is the world around us made of?

The world around us is made up of the physical, chemical, and biological world around us. The world around us is made up of the physical, chemical, and biological world around us.

What is the world around us made of?

The world around us is made up of the physical, chemical, and biological world around us. The world around us is made up of the physical, chemical, and biological world around us.

What is the world around us made of?

The world around us is made up of the physical, chemical, and biological world around us. The world around us is made up of the ph

### Inference on un-quantized model

In [9]:
model_id = "/data/quantization-trials/merged-model"
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="cuda", torch_dtype=torch.float16)

def predict_from_normal(user_query):
    _inputs = tokenizer.encode(user_query, return_tensors="pt").to('cuda')
    outputs = model.generate(input_ids=_inputs, max_length= 1000, pad_token_id=tokenizer.eos_token_id)
    output = tokenizer.decode(outputs[0])
    return output

In [10]:
# Using unquant model
start = time.time()
output2 = predict_from_normal("what is science")
print("time taken is :", time.time()-start)

time taken is : 23.793633222579956


In [11]:
print(output2)

</s>what is science?

The word "science" is used to describe a variety of fields of study, including:

biology

chemistry

biology of the brain

biology of the immune system

biology of the nervous system

biology of the reproductive system

biology of the immune system

biology of the nervous system

biology of the reproductive system

biology of the immune system

biology of the reproductive system

biology of the nervous system

biology of the reproductive system

biology of the immune system

biology of the reproductive system

biology of the reproductive system

biology of the reproductive system

biology of the reproductive system

biology of the reproductive system

biology of the reproductive system

biology of the reproductive system

biology of the reproductive system

biology of the reproductive system

biology of the reproductive system

biology of the reproductive system

biology of the reproductive system

biology of the reproductive system

biology of the reproductive sy