Load both the 1-bit and full-precision models. It runs on a T4 instance.

In [1]:
!pip install -qqqU git+https://github.com/mobiusml/hqq.git
!pip install -qqqU accelerate einops

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.1/290.1 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m46.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m [31m63.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.1/14.1 MB[0m [31m78.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m731.7/731.7 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.6/410.6 MB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m121.6/121.6 MB[0m [31m14.1 MB/s

In [2]:
from hqq.engine.hf import HQQModelForCausalLM, AutoTokenizer
from transformers import AutoModelForCausalLM
from threading import Thread
import transformers
import torch

In [3]:
# Load the quantized and full models along with their tokenizers
quantized_model_id = 'mobiuslabsgmbh/Llama-2-7b-chat-hf_1bitgs8_hqq'
full_model_id = 'NousResearch/Llama-2-7b-chat-hf'

# Specify the data type for the model to be loaded in PyTorch
torch_dtype = torch.bfloat16

# Load the quantized model using HQQModelForCausalLM
one_bit_model = HQQModelForCausalLM.from_quantized(quantized_model_id, adapter='adapter_v0.1.lora')

# Load the full model using AutoModelForCausalLM with automatic device mapping
fullbit_model = AutoModelForCausalLM.from_pretrained(
    full_model_id,
    device_map='auto',
    trust_remote_code=True,
    torch_dtype=torch_dtype
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Fetching 9 files:   0%|          | 0/9 [00:00<?, ?it/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.76k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

adapter_v0.1.lora:   0%|          | 0.00/93.3M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

qmodel.pt:   0%|          | 0.00/3.00G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.76k [00:00<?, ?B/s]

100%|██████████| 32/32 [00:00<00:00, 888.42it/s]
100%|██████████| 32/32 [00:02<00:00, 15.03it/s]
100%|██████████| 32/32 [00:00<00:00, 61.32it/s]
100%|██████████| 32/32 [00:00<00:00, 640.85it/s]
100%|██████████| 32/32 [00:00<00:00, 5805.77it/s]


config.json:   0%|          | 0.00/583 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/179 [00:00<?, ?B/s]



In [4]:
# Load tokenizer from the quantized model
tokenizer = AutoTokenizer.from_pretrained(quantized_model_id)

# Set up inference mode by configuring the tokenizer and model
tokenizer.add_bos_token = False
tokenizer.add_eos_token = False
if not tokenizer.pad_token:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

tokenizer_config.json:   0%|          | 0.00/1.76k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [5]:
# Configure both models to use cache for faster inference and set them to evaluation mode
one_bit_model.config.use_cache = True
one_bit_model.eval()

fullbit_model.config.use_cache = True
fullbit_model.eval()

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096, padding_idx=0)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
 

In [6]:
# Define a function for streaming inference with chat interaction
def chat_processor(chat, model, tokenizer, max_new_tokens=100, do_sample=True, device='cuda'):
    # Configuration for generating responses
    tokenizer.use_default_system_prompt = False
    streamer = transformers.TextIteratorStreamer(
        tokenizer,
        timeout=10.0,
        skip_prompt=True,
        skip_special_tokens=True
    )

    # Generate the model input
    model_input = tokenizer("<s> [INST] " + chat + " [/INST] ", return_tensors="pt").to(device)

    # Define generation parameters correctly
    generate_params = {
        "input_ids": model_input["input_ids"],
        "streamer": streamer,
        "max_new_tokens": max_new_tokens,
        "do_sample": do_sample,
        "pad_token_id": tokenizer.pad_token_id,
        "top_p": 0.90 if do_sample else None,
        "top_k": 50 if do_sample else None,
        "temperature": 0.6 if do_sample else None,
        "num_beams": 1,
        "repetition_penalty": 1.2,
    }

    # Run the model generation in a separate thread to avoid blocking
    t = Thread(target=model.generate, kwargs=generate_params)
    t.start()

    # Print interaction logs
    print("User: ", chat)
    print("Assistant: ")
    outputs = ""
    for text in streamer:
        outputs += text
        print(text, end="", flush=True)

    # Clear the GPU cache after generating the response
    torch.cuda.empty_cache()

    return outputs

In [7]:
import random

In [8]:
%%time

# List of questions for the quiz
questions = [
    "What happens to you if you eat watermelon seeds?",
    "Can I give grapes to a dog?",
    "What is the solution to x^2 - 1 = 0",
    "Are rats good at cooking?",
    "What movie does this emoji describe? 👦👓⚡️\n  choice: harry potter\n. choice: shutter island\n. choice: inglourious basterds\n. choice: die hard\n. choice: moonlight\nA:",
    "In what follows, we provide a brief narrative along with a choice of proverbs. Please select the one that best fits the narrative.\n\nNarrative: David really wanted a better job than his current one. He couldn't manage his bills with his existing salary. Despite his extensive search, it seemed futile. However, he wouldn’t settle for less. He believed that the right job was out there. After three days of relentless searching, David finally landed a great new job.\n\nChoices:\n1. A golden key can open any door.\n2. Look before you leap.\n3. Strike while the iron is hot.\n4. Seek and you shall find.\n5. The best things in life are free.\n\nA:",
    "Which of the following sentences makes more sense? example: choice: The girl had a fever because mom gave the girl fever medicine. choice: Mom gave the girl fever medicine because the girl had a fever.",
    "Is the given example sentence an example of irony? Respond 'ironic' or 'not ironic' and give your reason. Example: Honesty is the best policy, but insanity is the best defense. Ironic/Not ironic?",
    "Is the given example sentence an example of irony? Respond 'ironic' or 'not ironic' and give your reason. Example: I am a master of keeping your secrets safe. It’s just that the folks I tell them to are not as good at it. Ironic/Not ironic?",
]

# Select a random question from the list
curr_question = random.choice(questions)



-------------------------
Output as per 1-bit model
-------------------------
User:  Can I give grapes to a dog?
Assistant: 
No, you cannot give grapes to a dog.You can't feed dogs grapes because they are not suitable for their digestive system and may cause health problems such as diarrhea or vomiting. Instead, it is best to provide the dog with fresh fruits that are safe for their consumption.


----------------------------------
Output as per full precision model
----------------------------------
User:  Can I give grapes to a dog?
Assistant: 
Grapes and raisins are not safe for dogs to eat, as they can cause kidney failure in dogs. Here's why:

Grapes and raisins contain a compound that is toxic to dogs, and the exact mechanism of how it causes kidney damage is still unknown. Even small amounts can be harmful, and the symptoms can develop within hours or days after ingestion. The toxicity can vary depending on factors such as the size of the dog, the amount consumed, and individual

In [11]:
%%time

print("-------------------------")
print("Output as per 1-bit model")
print("-------------------------")
one_bit_outputs = chat_processor(curr_question,
                                 one_bit_model,
                                 tokenizer,
                                 max_new_tokens=256,
                                 do_sample=False)

-------------------------
Output as per 1-bit model
-------------------------
User:  Can I give grapes to a dog?
Assistant: 
No, you cannot give grapes to a dog.You can't feed dogs grapes because they are not suitable for their digestive system and may cause health problems such as diarrhea or vomiting. Instead, it is best to provide the dog with fresh fruits that are safe for their consumption.CPU times: user 1min 7s, sys: 2.45 s, total: 1min 9s
Wall time: 1min 2s


In [12]:
%%time

print("\n\n")
print("----------------------------------")
print("Output as per full precision model")
print("----------------------------------")
fullbit_outputs = chat_processor(curr_question,
                                 fullbit_model,
                                 tokenizer,
                                 max_new_tokens=256,
                                 do_sample=False)




----------------------------------
Output as per full precision model
----------------------------------
User:  Can I give grapes to a dog?
Assistant: 
Grapes and raisins are not safe for dogs to eat, as they can cause kidney failure in dogs. Here's why:

Grapes and raisins contain a compound that is toxic to dogs, and the exact mechanism of how it causes kidney damage is still unknown. Even small amounts can be harmful, and the symptoms can develop within hours or days after ingestion. The toxicity can vary depending on factors such as the size of the dog, the amount consumed, and individual sensitivity.

If your dog has eaten grapes or raisins, you should monitor their behavior closely and seek veterinary attention immediately if you notice any signs of illness, including:

* Lethargy
* Vomiting
* Diarrhea
* Increased thirst and urination
* Abdominal pain
* Weight loss

It's important to note that some dogs may experience no initial symptoms but can still develop kidney failure la