<a href="https://colab.research.google.com/github/vilasha/ollama-sandbox/blob/master/src/model-inner-structure/Models_and_quantization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Quantization of an open-source model and inference with the Hugging Face lower-level API

In the `transformers` API, model classes are the lower-level way of running inference with LLMs (as opposed to pipelines, which are a higher-level abstraction). Model classes wrap the PyTorch code for the transformers themselves.

Here we will download an open-source model to the local disk of the runtime (a GPU runtime is required for this notebook), quantize it from 32 bits down to 4 bits per weight with double quantization enabled (i.e. decrease the precision of the weights), and then run inference with this scaled-down model.

If the Llama models return a 401 Unauthorized or 403 Forbidden response, go to the Hugging Face page of [any model of the Llama-3.2 family](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct) and accept the terms and conditions. Once access is granted, the model will appear on your [Gated Repositories](https://huggingface.co/settings/gated-repos) page with `Request-status=Accepted`.

In [None]:
# some preparation
!pip install -q --upgrade bitsandbytes accelerate

from google.colab import userdata
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig
import torch
import gc

# Authenticate against the Hugging Face Hub using the token read via
# google.colab's userdata (needed for gated repositories such as Llama)
hf_token = userdata.get('HF_TOKEN')
login(token=hf_token, add_to_git_credential=True)

# Hub ids of the models compared in this notebook
LLAMA = "meta-llama/Llama-3.2-1B-Instruct"
PHI = "microsoft/Phi-4-mini-instruct"
GEMMA = "google/gemma-2-2b-it"
QWEN = "Qwen/Qwen3-4B-Instruct-2507"
DEEPSEEK = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"

# Single-turn chat prompt that is sent to every model
messages = [
    {"role": "user", "content": "Tell a joke about AI hype"},
]

# bitsandbytes settings applied when a model is loaded with quantization:
# 4-bit NF4 weights with double quantization, computing in bfloat16, so the
# model takes less memory once loaded
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# define a function that turns the chat messages into token IDs, downloads a
# pretrained model from HuggingFace, optionally scales it down according to
# quant_config above (4 bits per weight), and streams the generated response
def generate(model_name, messages, quant=True, max_new_tokens=80):
  """Load a chat model, report its size and structure, and stream one reply.

  Args:
    model_name: Hugging Face Hub id of the causal LM to download and run.
    messages: Chat messages (a list of ``{"role": ..., "content": ...}``
      dicts) rendered through the tokenizer's chat template.
    quant: When True, load the weights using the module-level
      ``quant_config``; when False, load the model without quantization.
    max_new_tokens: Upper bound on the number of tokens to generate.

  Returns:
    None. The memory footprint, the model structure and the streamed
    response are printed to stdout.
  """
  tokenizer = AutoTokenizer.from_pretrained(model_name)
  tokenizer.pad_token = tokenizer.eos_token

  # Let accelerate place the weights instead of calling .to("cuda") on the
  # loaded model: moving a bitsandbytes-quantized model with .to() is rejected
  # by transformers for 8-bit models and for 4-bit models on older
  # bitsandbytes releases.
  load_kwargs = {"device_map": "auto"}
  if quant:
    load_kwargs["quantization_config"] = quant_config

  model = input_ids = attention_mask = None
  try:
    model = AutoModelForCausalLM.from_pretrained(model_name, **load_kwargs)

    # return_dict=True pins the return type to a BatchEncoding regardless of
    # the library default and gives us the tokenizer's own attention mask.
    encoded = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    )
    # Inputs follow the model's device rather than a hard-coded "cuda".
    input_ids = encoded["input_ids"].to(model.device)
    attention_mask = encoded["attention_mask"].to(model.device)
    del encoded

    memory = model.get_memory_footprint() / 1e6
    print(f"\nMemory footprint: {memory:,.1f} MB\n")

    print("\nInternal representation of the model:\n")
    print(model)

    print("\nResponse including system tokens:\n")
    # The streamer prints the generated text as it is produced, so the
    # returned tensor is deliberately not kept around.
    model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_new_tokens=max_new_tokens,
        streamer=TextStreamer(tokenizer),
    )
  finally:
    # Best-effort release of GPU memory, also when loading or generation
    # fails, so the next model in the notebook has room to load.
    del model, tokenizer, input_ids, attention_mask
    gc.collect()
    torch.cuda.empty_cache()


In [None]:
# Meta Llama 3.2 1B Instruct, run with generate()'s defaults (4-bit quantized)
generate(model_name=LLAMA, messages=messages)

In [None]:
# Microsoft Phi-4 mini instruct, run with generate()'s defaults (4-bit quantized)
generate(model_name=PHI, messages=messages)

In [None]:
# Google Gemma 2 2B it, run with generate()'s defaults (4-bit quantized)
generate(model_name=GEMMA, messages=messages)

In [None]:
# Alibaba Qwen3 4B Instruct, run with generate()'s defaults (4-bit quantized)
generate(model_name=QWEN, messages=messages)

In [None]:
# DeepSeek R1 distilled into Qwen 1.5B, run with generate()'s defaults (4-bit quantized)
generate(model_name=DEEPSEEK, messages=messages)