<a href="https://colab.research.google.com/github/JiHa-Kim/quantize-hf-models/blob/main/Quantize_HuggingFace_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Variables
# HuggingFace Hub repository ID of the model to download and quantize.
MODEL_ID = "macadeliccc/WestLake-7B-v2-laser-truthy-dpo"
# Quantization types that are run without an importance matrix.
QUANTIZATION_METHODS = ["q4_k_m", "q5_k_m", "q6_k", "q8_0"]
# Quantization types that are run with an importance matrix (imatrix).
SOTA_QUANTIZATION_METHODS = ["iq2_xxs", "iq2_xs", "iq3_xxs"]

# Dataset used for the imatrix computation, written as "<dataset>/<config>"
# (a HuggingFace dataset name, not a URL); it is split on '/' before loading.
DATASET_NAME = "wikitext/wikitext-2-raw-v1"

In [None]:
# Install the required packages
!pip install -U pip
!pip install -U huggingface_hub
!pip install -U datasets
!pip install -r /content/llama.cpp/requirements.txt

[31mERROR: Operation cancelled by user[0m[31m
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/pip/_vendor/pkg_resources/__init__.py", line 3109, in _dep_map
    return self.__dep_map
  File "/usr/local/lib/python3.10/dist-packages/pip/_vendor/pkg_resources/__init__.py", line 2902, in __getattr__
    raise AttributeError(attr)
AttributeError: _DistInfoDistribution__dep_map

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/cli/base_command.py", line 180, in exc_logging_wrapper
    status = run_func(*args)
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/cli/req_command.py", line 245, in wrapper
    return func(self, options, args)
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/commands/install.py", line 444, in run
    conflicts = self._determine_conflicts(to_install)
  File "/usr/local/lib/python3.10/dist-p

In [None]:
!git clone https://github.com/ggerganov/llama.cpp
!cd /content/llama.cpp && git pull && make clean && LLAMA_CUBLAS=1 make

In [None]:
# Work from the llama.cpp checkout: the relative model paths used by the
# cells below and the relative `convert.py` call resolve against this directory.
%cd /content/llama.cpp
%pwd

/content/llama.cpp


'/content/llama.cpp'

In [None]:
# Constants
# All paths below are relative to the current working directory.
# Last component of the Hub ID, e.g. "org/model" -> "model".
MODEL_NAME = MODEL_ID.split('/')[-1]
# Directory that receives the downloaded HuggingFace snapshot.
ORIG_PATH = f"{MODEL_NAME}/orig"
# Directory that receives the FP16 file and every quantized file.
QUANT_PATH = f"{MODEL_NAME}/quant"
# NOTE: despite the ".bin" extension, convert_model() asks for an FP16 GGUF file here.
MODEL_FP16_PATH = f"{QUANT_PATH}/{MODEL_NAME.lower()}.fp16.bin"
# Same "<model name>.imatrix" name that compute_imatrix() writes and that the
# imatrix quantization cell rebuilds, so the default agrees with both.
IMATRIX_PATH = f"{MODEL_NAME.lower()}.imatrix"
# Not referenced by any of the visible cells.
QUESTIONS_PATH = f"{MODEL_NAME.lower()}/questions.txt"

# Imports
import os
import argparse
from huggingface_hub import snapshot_download
from datasets import load_dataset
import pandas
import torch

# Value passed to the imatrix tool's `-ngl` option (presumably the number of
# model layers to offload to the GPU, going by the name).
N_GPU_LAYERS = 35
# True when PyTorch reports an available CUDA device.
HAS_GPU = torch.cuda.is_available()
# Command-line fragment appended to the imatrix command when HAS_GPU is true.
USE_GPU = f"-ngl {N_GPU_LAYERS}"

# Functions
def download_model(model_id: str=MODEL_ID):
  """Fetches a model snapshot from the HuggingFace Hub into ORIG_PATH.

  The working directories (MODEL_NAME, ORIG_PATH and QUANT_PATH) are created
  first if they are missing. They come from the module-level constants, so
  they do not change with `model_id`.

  Args:
    model_id: The Hub repository ID of the model to download.
  """
  print(f"Downloading {model_id}")

  for directory in (MODEL_NAME, ORIG_PATH, QUANT_PATH):
    os.makedirs(directory, exist_ok=True)

  # Files matching "*.bin" are skipped by the download.
  download_options = dict(
      repo_id=model_id,
      local_dir=ORIG_PATH,
      local_dir_use_symlinks=False,
      revision="main",
      ignore_patterns="*.bin",
  )
  snapshot_download(**download_options)

  print(f"Downloaded {model_id}")

def convert_model(output_path: str=MODEL_FP16_PATH):
  """Converts safetensors to FP16 GGUF.

  Args:
    outfile: The path to the output file.
  """
  !python convert.py {ORIG_PATH} --outtype f16 --outfile {output_path}

  print(f"Outputted the FP16 GGUF file to {output_path}")
  return

def quantize_model(methods: list, imatrix_path: str = None, fp16_path: str=MODEL_FP16_PATH, quant_path: str=QUANT_PATH) -> None:
  """Quantizes a model using different methods and saves the results in a given path.

  Args:
    fp16_path: The path of the model file to quantize.
    quant_path: The path to save the quantized model files.
    methods: The list of quantization methods to use.
    imatrix_path: The path of the importance matrix file to use for SOTA methods. Default is None.
  """
  print(f"Quantizing {fp16_path} using {methods}")
  for method in methods:
    model_name = fp16_path.split('/')[-1].split('.')[0]
    qtype = f"{quant_path}/{model_name.lower()}.{method.upper()}.gguf"
    if imatrix_path:
      !/content/llama.cpp/quantize --imatrix {imatrix_path} {fp16_path} {qtype} {method}

    elif not imatrix_path:
      !/content/llama.cpp/quantize {fp16_path} {qtype} {method}

def prepare_dataset(dataset_name: str=DATASET_NAME):
  """Loads a dataset and writes its training text to 'wiki.train.raw'.

  Args:
    dataset_name: Name of the dataset on HuggingFace,
                  in the format tree/node e.g. "wikitext/wikitext-2-raw-v1".
                  A name without '/' is loaded without a configuration name.
  """
  print("Writing dataset to 'wiki.train.raw'...")
  # Split the argument (not the DATASET_NAME global) so that a caller-supplied
  # dataset is actually the one that gets loaded.
  ds_tree, _, ds_root = dataset_name.partition('/')
  dataset = load_dataset(ds_tree, ds_root or None)
  # Convert the 'text' column of the training split to a raw text file.
  # The encoding is explicit so the output does not depend on the locale.
  with open('wiki.train.raw', 'w', encoding='utf-8') as file:
    for article in dataset['train']['text']:
      file.write(article + '\n')
  print("Wrote dataset to 'wiki.train.raw'")

def compute_imatrix(fp16_path: str=MODEL_FP16_PATH) -> str:
  """Computes the importance matrix for a model using a given dataset.

  Args:
    fp16_path: The path of the model file to use.
    dataset: The path of the dataset file to use.

  Returns:
    The path of the importance matrix file.
  """

  model_name = fp16_path.split('/')[-1].split('.')[0]
  IMATRIX_PATH = f"{model_name.lower()}.imatrix"

  if HAS_GPU:
    !/content/llama.cpp/imatrix -m {fp16_path} -f wiki.train.raw -o {IMATRIX_PATH} --chunks 100 {USE_GPU}
  else:
    !/content/llama.cpp/imatrix -m {fp16_path} -f wiki.train.raw -o {IMATRIX_PATH} --chunks 100
  return IMATRIX_PATH

def generate_questions(prompt: str, quants: list= ["IQ2_XSS","Q4_K_M"], quant_path: str=QUANT_PATH) -> None:
  """Generates questions using a quantized model and a given prompt.

  Args:
    quant_path: The path of the quantized model file to use.
    prompt: The prompt to use for generating questions.
  """
  model_name = quant_path.split('/')[0]
  for quant in quants:
    qtype = f"{quant_path}/{model_name.lower()}.{quant}.gguf"
    print(f"Generating questions using {qtype} and {prompt}...")
    !main -m {qtype} -n 128 -p {prompt}

In [None]:
# Download the model from HuggingFace
# (also creates the orig/ and quant/ directories for the model)
download_model(MODEL_ID)

In [None]:
# Convert the downloaded checkpoint to an FP16 GGUF file at MODEL_FP16_PATH
convert_model()

In [None]:
# Quantize the model using classic methods
# (no importance matrix is passed, so quantize runs without --imatrix)
quantize_model(QUANTIZATION_METHODS)

In [None]:
# Write the training split of DATASET_NAME to 'wiki.train.raw',
# the file the imatrix computation reads
prepare_dataset()

In [None]:
# Compute the imatrix using the wikitext dataset
# (the returned path of the written file replaces the IMATRIX_PATH default)
IMATRIX_PATH = compute_imatrix()

In [None]:
# !cat /proc/cpuinfo

In [None]:
# !/content/llama.cpp/quantize --help

usage: /content/llama.cpp/quantize [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights] model-f32.gguf [model-quant.gguf] type [nthreads]

  --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing
  --pure: Disable k-quant mixtures and quantize all tensors to the same type
  --imatrix file_name: use data in file_name as importance matrix for quant optimizations
  --include-weights tensor_name: use importance matrix for this/these tensor(s)
  --exclude-weights tensor_name: use importance matrix for this/these tensor(s)
Note: --include-weights and --exclude-weights cannot be used together

Allowed quantization types:
   2  or  Q4_0    :  3.56G, +0.2166 ppl @ LLaMA-v1-7B
   3  or  Q4_1    :  3.90G, +0.1585 ppl @ LLaMA-v1-7B
   8  or  Q5_0    :  4.33G, +0.0683 ppl @ LLaMA-v1-7B
   9  or  Q5_1    :  4.70G, +0.0349 ppl @ LLaMA-v1-7B
  19  or 

In [None]:
# Quantize the model using imatrix methods
# The path is rebuilt here instead of reusing the value returned by
# compute_imatrix() -- presumably so this cell can be run on its own when the
# .imatrix file already exists on disk.
IMATRIX_PATH = f"{MODEL_NAME.lower()}.imatrix"
quantize_model(SOTA_QUANTIZATION_METHODS, IMATRIX_PATH)

Quantizing WestLake-7B-v2-laser-truthy-dpo/quant/westlake-7b-v2-laser-truthy-dpo.fp16.bin using ['iq2_xxs', 'iq2_xs', 'iq3_xxs']
load_imatrix: loaded 224 importance matrix entries from westlake-7b-v2-laser-truthy-dpo.imatrix
prepare_imatrix: have 224 importance matrix entries
ggml_init_cublas: GGML_CUDA_FORCE_MMQ:   no
ggml_init_cublas: CUDA_USE_TENSOR_CORES: yes
ggml_init_cublas: found 1 CUDA devices:
  Device 0: Tesla T4, compute capability 7.5, VMM: yes
main: build = 2061 (9392ebd4)
main: built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
main: quantizing 'WestLake-7B-v2-laser-truthy-dpo/quant/westlake-7b-v2-laser-truthy-dpo.fp16.bin' to 'WestLake-7B-v2-laser-truthy-dpo/quant/westlake-7b-v2-laser-truthy-dpo.IQ2_XXS.gguf' as IQ2_XXS
llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from WestLake-7B-v2-laser-truthy-dpo/quant/westlake-7b-v2-laser-truthy-dpo.fp16.bin (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/val

In [None]:
# Generate questions using the prompt
PROMPT = "User: Tell me story about what a quantization is and what we need to build."

# The signature is generate_questions(prompt, quants, quant_path). Passing the
# arguments by keyword keeps the prompt from landing in `quant_path`'s place
# and vice versa. The quant names match files produced by the quantization
# cells (method names are upper-cased in the file names).
generate_questions(
    prompt=PROMPT,
    quants=["IQ2_XXS", "Q4_K_M"],
    quant_path=QUANT_PATH,
)

In [None]:
# HuggingFace account (namespace) that the upload cell pushes the repository to
USER = "Ji-Ha"

In [None]:
# Model card written to README.md before upload.
# The YAML metadata block has to be the very first thing in the file for the
# Hub to parse it, so the string starts directly with '---' and the credit
# line follows the metadata instead of preceding it.
readme_message = f"""---
license: openrail
pipeline_tag: text-generation
library_name: transformers
language:
- en
---

Thanks to @s3nh for the great quantization notebook code.


## Original model card

Buy @s3nh a coffee if you like this project ;)
<a href="https://www.buymeacoffee.com/s3nh"><img src="https://www.buymeacoffee.com/assets/img/guidelines/download-assets-sm-1.svg" alt=""></a>

#### Description

GGUF Format model files for [This project](https://huggingface.co/{MODEL_ID}).

### GGUF Specs

GGUF is a format based on the existing GGJT, but makes a few changes to the format to make it more extensible and easier to use. The following features are desired:

Single-file deployment: they can be easily distributed and loaded, and do not require any external files for additional information.
Extensible: new features can be added to GGML-based executors/new information can be added to GGUF models without breaking compatibility with existing models.
mmap compatibility: models can be loaded using mmap for fast loading and saving.
Easy to use: models can be easily loaded and saved using a small amount of code, with no need for external libraries, regardless of the language used.
Full information: all information needed to load a model is contained in the model file, and no additional information needs to be provided by the user.
The key difference between GGJT and GGUF is the use of a key-value structure for the hyperparameters (now referred to as metadata), rather than a list of untyped values.
This allows for new metadata to be added without breaking compatibility with existing models, and to annotate the model with additional information that may be useful for
inference or for identifying the model.

# Original model card

"""

In [None]:
from huggingface_hub import HfApi, create_repo
import pathlib
import shutil
from google.colab import userdata
# Obtains the HuggingFace token from the colab secrets tab
HF_TOKEN: str = userdata.get("HF_TOKEN")
REPOSITORY_NAME: str = f"{MODEL_NAME}-GGUF"
# One repository ID for both creation and upload, so the repository that gets
# created is the one that receives the files.
REPO_ID: str = f"{USER}/{REPOSITORY_NAME}"
TYPE = 'model'

api = HfApi()

# QUANT_PATH is relative to the current working directory, which is where the
# earlier cells wrote the quantized files. Resolve it once and use the same
# directory for the README, the file check and the upload.
quant_dir = pathlib.Path(QUANT_PATH).resolve()
gguf_files = list(quant_dir.rglob('*.gguf'))

if gguf_files:
    # write() rather than writelines(): readme_message is a single string.
    with open(quant_dir / 'README.md', 'w', encoding='utf-8') as outfile:
        outfile.write(readme_message)

    # exist_ok=True lets a re-run reuse an existing repository, without a
    # catch-all exception handler that would hide unrelated failures.
    create_repo(REPO_ID, token=HF_TOKEN, repo_type=TYPE, exist_ok=True)
    api.upload_folder(
        folder_path=str(quant_dir),
        repo_id=REPO_ID,
        repo_type=TYPE,
        path_in_repo="./",
        token=HF_TOKEN,
    )

    # Only reached when the upload did not raise. Removing the model directory
    # also removes the orig/ and quant/ directories inside it.
    shutil.rmtree(MODEL_NAME, ignore_errors=True)
else:
    print(f"No .gguf files found in {quant_dir}; nothing was uploaded.")