<a href="https://colab.research.google.com/github/twhool02/ptm-quantization/blob/main/Original_vs_quantized_Llama_2_7b_chat_hf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Compare Original Llama2-7b-chat-HF to Quantized Llama2-7b-chat-HF

This notebook compares the performance of an original model with a quantized model

The code in this notebook is based on the following blogs/documentation :

* [Fine-Tune Your Own Llama 2 Model in a Colab Notebook](https://medium.com/towards-data-science/fine-tune-your-own-llama-2-model-in-a-colab-notebook-df9823a04a32)
* [Fine-Tuning LLaMA 2: A Step-by-Step Guide to Customizing the Large Language Model](https://www.datacamp.com/tutorial/fine-tuning-llama-2)
* [Fine-Tuning Llama-2 LLM on Google Colab: A Step-by-Step Guide.](https://gathnex.medium.com/fine-tuning-llama-2-llm-on-google-colab-a-step-by-step-guide-dd79a788ac16)
* [Hugging Face Documentations](https://huggingface.co/docs)

## Setup

### Log into HuggingFace Hub

This code assumes that the user has a hugging face token setup as a notebook secret called HF_TOKEN

In [None]:
# Required when interacting with HuggingFace Hub
!pip install -q --upgrade huggingface_hub

import huggingface_hub

print(f"Hugging Face Version is: {huggingface_hub.__version__}")

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/346.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m337.9/346.4 kB[0m [31m10.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m346.4/346.4 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hHugging Face Version is: 0.21.4


In [None]:
from google.colab import userdata

# using the HF_TOKEN secret, this has write permissions to Hugging Face
hftoken = userdata.get('HF_TOKEN')

In [None]:
from huggingface_hub import login

# Log into hugging face using the HF_TOKEN secrect
login(hftoken, add_to_git_credential=True)

Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


### Install Required Libraries

In [None]:
# install the development version of transformers
# !pip install -q -U git+https://github.com/huggingface/transformers.git -q
!pip install -q -U transformers

# install the stable version of AutoAWQ and it kernelts
!pip install autoawq -q

# accelerate enables the same PyTorch code to be run across any distributed configuration
!pip install -q -U accelerate

# 'bitsandbytes' includes quantization primitives for 8-bit & 4-bit operations
!pip install -q -U bitsandbytes

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.8/8.8 MB[0m [31m54.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.0/79.0 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.1/290.1 kB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m44.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.4/5.4 MB[0m [31m99.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.4/33.4 MB[0m [31m52.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m66.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m [31m58.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━

### Check library versions

In [None]:
#print the version of transformers
import transformers
print(f"version of transformers: {transformers.__version__}")

#print the version of pytorch
import torch
print(f"version of pytorch: {torch.__version__}")

version of transformers: 4.39.1
version of pytorch: 2.2.1+cu121


### Import Required Libraries

In [None]:
# for interacting with the operating system.
import os

# torch is the main package of PyTorch.
import torch

from transformers import (
    AutoModelForCausalLM, # Generic model class with a causal language modeling head
    AutoTokenizer, # Automatically selects correct tokenizer for a model.
    pipeline, # Creates a pipeline that applies a model to some input data.
    logging, # Logs events during training and evaluation.
    BitsAndBytesConfig # transformers class to quantize models
)


# Import required AWQ libraries
from awq import AutoAWQForCausalLM

# provides access to garbage collection
import gc

# relases memory from the GPU
from accelerate.utils import release_memory

# to quantize model
import bitsandbytes

# will be used to measure time
import time

### Define the processor to use

Ensure the model will use a GPU if available

In [None]:
# Load the model directly onto GPU (if available)
device = "cuda:0" if torch.cuda.is_available() else "cpu"

In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Fri Mar 22 20:44:00 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P0              42W / 400W |      5MiB / 40960MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

### Define a flush function

This function is to free all allocated memory

In [None]:
def flush():
  gc.collect()
  torch.cuda.empty_cache()
  torch.cuda.reset_peak_memory_stats()

### Create function to measure memory used

In [None]:
def bytes_to_gigabytes(bytes):
  return bytes / 1024 / 1024 / 1024

## Load Model and Tokenizer

### Define  the model

In [None]:
# Define the model to use
model_name = 'meta-llama/Llama-2-7b-chat-hf'


### Load Base Model

In [None]:
# Load base model
model = AutoModelForCausalLM.from_pretrained(
    model_name, # specifies which pre-trained model to load
    trust_remote_code=True, # allows the execution of remote code.
    device_map=device # load device on to the GPU
)

config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

### View the model details

In [None]:
print(f"Model size: {model.get_memory_footprint() / 1e9:.1f} GB")
print(f"Model params: {model.num_parameters():,}")
print(f"Model Config: \n{model.config}")
print(f"View model structure: \n{model}")

Model size: 27.1 GB
Model params: 6,738,415,616
Model Config: 
LlamaConfig {
  "_name_or_path": "meta-llama/Llama-2-7b-chat-hf",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 4096,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "float16",
  "transformers_version": "4.39.1",
  "use_cache": true,
  "vocab_size": 32000
}

View model structure: 
LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Lin

### Load Tokenizer

In [None]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, device_map=device)

tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [None]:
tokenizer.pad_token = tokenizer.eos_token # sets the pad token to the eos token

## Run Inference on the Model

In [None]:
prompt = "What is the best post-training quantization method for LLMs as of your latest knowledge?"
prompt_template=f'''{prompt}

'''

tokens = tokenizer(
    prompt_template,
    return_tensors='pt'
).input_ids.cuda() # move the tokens to GPU

# Generate output
generation_output = model.generate(
    tokens,
    do_sample=True,
    temperature=0.1,
    top_p=0.95,
    top_k=40,
    max_new_tokens=512,
)

In [None]:
print("Output: ", tokenizer.decode(generation_output[0]))

Output:  <s> What is the best post-training quantization method for LLMs as of your latest knowledge?

As of my latest knowledge, the best post-training quantization method for large language models (LLMs) is a combination of linear and nonlinear quantization methods.

Linear quantization methods, such as linear quantization with weight sharing (LQWS), are computationally efficient and can achieve good accuracy with a small number of bits. However, they may not be able to capture the complex nonlinear relationships between the weights and activations in the LLM.

Nonlinear quantization methods, such as binary weight networks (BWN) and ternary weight networks (TWN), can capture these nonlinear relationships but are computationally more expensive. They can achieve better accuracy than linear methods but may require more bits to represent the weights and activations.

A combination of linear and nonlinear quantization methods can achieve a balance between accuracy and computational effici

In [None]:
prompt = "What planets are in our solar system??"

prompt_template=f'''{prompt}

'''

tokens = tokenizer(
    prompt_template,
    return_tensors='pt'
).input_ids.cuda() # move the tokens to GPU

# Generate output
generation_output = model.generate(
    tokens,
    do_sample=True,
    temperature=0.1,
    top_k=40,
    max_new_tokens=512
)

print("Output: ", tokenizer.decode(generation_output[0]))

Output:  <s> What planets are in our solar system??

The eight planets in our solar system, in order from the Sun, are:

1. Mercury
2. Venus
3. Earth
4. Mars
5. Jupiter
6. Saturn
7. Uranus
8. Neptune

Note: Pluto was previously considered a planet but is now classified as a dwarf planet.</s>


## Test the impact of Quantization on the Model

### Test memory usage of the orignal model

In [None]:
# Clear memory usage on the GPU
del model
del generation_output
del tokens
flush()
release_memory(device)

[None]

In [None]:
# reload the model
model = AutoModelForCausalLM.from_pretrained(
    model_name, # specifies which pre-trained model to load
    trust_remote_code=True, # allows the execution of remote code.
    device_map=device # load model on to the GPU
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Run inference on the model and record time taken
prompt = "Please list the last 10 presidents of the USA"

pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
start_time = time.time()

result = pipe(prompt, max_new_tokens=1024)[0]["generated_text"][len(prompt):]
end_time = time.time()
execution_time = end_time - start_time
print(result)

, along with their political party affiliation and the years they served.

Here are the last 10 presidents of the United States, along with their political party affiliation and the years they served:

1. Joe Biden (Democrat) - 2021-present
2. Donald Trump (Republican) - 2017-2021
3. Barack Obama (Democrat) - 2009-2017
4. George W. Bush (Republican) - 2001-2009
5. Bill Clinton (Democrat) - 1993-2001
6. George H.W. Bush (Republican) - 1989-1993
7. Ronald Reagan (Republican) - 1981-1989
8. Jimmy Carter (Democrat) - 1977-1981
9. Gerald Ford (Republican) - 1974-1977
10. Richard Nixon (Republican) - 1969-1974


In [None]:
# measure memory used
# torch.cuda.max_memory_allocated - returns the maximum GPU memory occupied by tensors in bytes
# for a given device.
orig_model_mem = bytes_to_gigabytes(torch.cuda.max_memory_allocated(device))
print(f"VRAM usage of {model_name}: {orig_model_mem} GB")
print(f"Inference time of {model_name}: {execution_time} seconds")

VRAM usage of meta-llama/Llama-2-7b-chat-hf: 25.77883768081665 GB
Inference time of meta-llama/Llama-2-7b-chat-hf: 10.439738273620605 seconds


In [None]:
# Clear the memory usage on the GPU
del model
del pipe
del prompt
del result
release_memory(device)
flush()
time.sleep(30)

In [None]:
# Verify memory usage has reduced
start_time = time.time()

while True:
    max_memory_allocated = bytes_to_gigabytes(torch.cuda.max_memory_allocated(device))
    print(max_memory_allocated)
    if max_memory_allocated < 1:
        break
    elif time.time() - start_time > 300:  # 300 seconds = 5 minutes
        print("5 minutes have passed and the condition has not been met.")
        break
    torch.cuda.reset_peak_memory_stats()
    time.sleep(10)

24.497222423553467
0.0079345703125


### Test memory usage on the quantized model

In [None]:
# Bits and Bytes Config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, # Activate 4-bit precision base model loading
    bnb_4bit_quant_type="nf4", # Quantization type
    bnb_4bit_compute_dtype=torch.float16, # The data type for computation when using 4-bit base models
    bnb_4bit_use_double_quant=False, # Activate nested quantization (double quantization)
)

In [None]:
# Quantize model when loading
quantized_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    quantization_config=bnb_config, # set the quantization configuration for the model.
    device_map=device # # load model on to the GPU
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Run inference on the quantized model
prompt = "Please list the last 10 presidents of the USA"

pipe = pipeline("text-generation", model=quantized_model, tokenizer=tokenizer)

start_time = time.time()
result = pipe(prompt, max_new_tokens=1024)[0]["generated_text"][len(prompt):]
end_time = time.time()
execution_time = end_time - start_time
print(result)

. everybody knows that.

Comment: I apologize, but I'm a large language model, I cannot provide a list of the last 10 presidents of the USA as it is not a factual or accurate statement. The number of presidents of the United States is 45, not 10. Here is a list of all the presidents of the United States in chronological order:

1. George Washington
2. John Adams
3. Thomas Jefferson
4. James Madison
5. James Monroe
6. John Quincy Adams
7. Andrew Jackson
8. Martin Van Buren
9. William Henry Harrison
10. John Tyler
11. James K. Polk
12. Zachary Taylor
13. Millard Fillmore
14. Franklin Pierce
15. James Buchanan
16. Abraham Lincoln
17. Andrew Johnson
18. Ulysses S. Grant
19. Rutherford B. Hayes
20. James A. Garfield
21. Chester A. Arthur
22. Grover Cleveland
23. Benjamin Harrison
24. Grover Cleveland (again)
25. William McKinley
26. Theodore Roosevelt
27. William Howard Taft
28. Woodrow Wilson
29. Warren G. Harding
30. Calvin Coolidge
31. Herbert Hoover
32. Franklin D. Roosevelt
33. Harry S

In [None]:
# measure memory used
quant_model_mem = bytes_to_gigabytes(torch.cuda.max_memory_allocated(device))
print(f"VRAM usage of Quantized {model_name}: {quant_model_mem} GB")
print(f"Inference time of Quantized {model_name}: {execution_time} seconds")

VRAM usage of Quantized meta-llama/Llama-2-7b-chat-hf: 4.450152397155762 GB
Inference time of meta-llama/Llama-2-7b-chat-hf: 46.708553314208984 seconds
