<a href="https://www.kaggle.com/code/william2020/llama-cpp-quantization-llama3-1-8b?scriptVersionId=193455862" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Llama.cpp Quantization Walkthrough with Llama 3.1 8B


We will quantize the Llama 3.1 8B model to 4-bit (Q4_K_M).

Open this notebook in Google Colab for the best experience. If you can, connect the runtime to a GPU.

## Download Model from HuggingFace.

You'll need a Hugging Face (HF) access token.

In [None]:
from huggingface_hub import notebook_login
# Authenticate this notebook session with the Hugging Face Hub before downloading.
notebook_login()

In [None]:
from huggingface_hub import snapshot_download

# Hub repository to fetch and the local folder the snapshot is written to.
model_name, base_model = "meta-llama/Meta-Llama-3.1-8B", "./original_model/"

# Files matching "*.pth" are excluded from the download.
snapshot_download(
    local_dir=base_model,
    repo_id=model_name,
    ignore_patterns=["*.pth"],
)

## Clone llama.cpp Repository

In [None]:
# Fetch the llama.cpp sources into ./llama.cpp; the conversion script and the quantize tool used below come from this checkout.
!git clone https://github.com/ggerganov/llama.cpp

In [None]:
!mkdir models

## Convert model to GGUF format

In [None]:
# Convert the Hugging Face checkpoint in ./original_model/ into a single GGUF file under models/.
!python llama.cpp/convert_hf_to_gguf.py ./original_model/ --outfile models/llama_3.1_FP16.gguf

## Build llama.cpp and quantize the Model

In [None]:
!mkdir llama.cpp/build && cd llama.cpp/build && cmake .. && cmake --build . --config Release

In [None]:
!cd llama.cpp/build/bin && ./llama-quantize /content/models/llama_3.1_FP16.gguf /content/models/llama_3.1-Q4_K_M.gguf q4_K_M

## Run Inference Using the Quantized Model

In [None]:
# Install the llama-cpp-python bindings, pinned to version 0.2.85.
!pip install llama-cpp-python==0.2.85

In [None]:
from llama_cpp import Llama

In [None]:
# Location of the Q4_K_M GGUF file written by the llama-quantize step.
model_path = "/content/models/llama_3.1-Q4_K_M.gguf"

In [None]:
# Load the quantized model; only the path is passed, so every other setting stays at the library default.
llm = Llama(model_path=model_path)

In [None]:
# Settings forwarded to the completion call: token cap, no prompt echo, top_k of 1.
generation_kwargs = dict(max_tokens=300, echo=False, top_k=1)

prompt = "Which country hosted 2018 fifa world cup?"
res = llm(prompt, **generation_kwargs)

# Keep the generated text as the cell's final expression so the notebook displays it.
first_choice = res.get("choices")[0]
first_choice.get("text")

## Save model to Google Drive

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive (this module is only available inside Colab).
drive.mount('/content/drive')

In [None]:
!mkdir "/content/drive/My Drive/llama_models"

In [None]:
import shutil

# Copy the quantized GGUF from the local models folder to the llama_models folder under /content/drive.
destination_file_path = '/content/drive/My Drive/llama_models/llama_3.1-Q4_K_M.gguf'
source_file_path = '/content/models/llama_3.1-Q4_K_M.gguf'
shutil.copy(src=source_file_path, dst=destination_file_path)

## Upload model to Huggingface Hub

In [None]:
from huggingface_hub import login

# Prompt for the access token instead of pasting it into the cell, so the secret is
# never saved or shared together with the notebook.
# The upload that follows needs a token with write permission.
login()

In [None]:
from huggingface_hub import HfApi

api = HfApi()

# Target repository on the Hub ("hf_profile" is presumably a placeholder for your own namespace).
model_id = "hf_profile/llama3.1-Q4_K_M-gguf"

# exist_ok=True is passed so an already-existing repository does not raise.
api.create_repo(repo_id=model_id, repo_type="model", exist_ok=True)

# Upload the local quantized file under the name it will have inside the repository.
upload_args = {
    "path_or_fileobj": '/content/models/llama_3.1-Q4_K_M.gguf',
    "path_in_repo": "llama3.1-Q4_K_M.gguf",
    "repo_id": model_id,
}
api.upload_file(**upload_args)

## Inference Using the GPU

In [None]:
from huggingface_hub import snapshot_download

# Fetch the quantized-model repository snapshot into ./quantized_models/.
model_name, base_model = "hf_profile/llama3.1-Q4_K_M-gguf", "./quantized_models/"
snapshot_download(local_dir=base_model, repo_id=model_name)

In [None]:
# Show the GPU(s) and driver visible to this runtime before loading the model.
!nvidia-smi

In [None]:
# Install llama-cpp-python, adding the project's cu122 wheel index as an extra package source.
# NOTE(review): if llama-cpp-python is already installed in this runtime (for example from an earlier
# pinned install), pip will presumably report the requirement as satisfied and keep that build — confirm
# this cell is run in a fresh runtime.
!pip install llama-cpp-python --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu122

In [None]:
from llama_cpp import Llama
model_path = "./quantized_models/llama3.1-Q4_K_M.gguf"

# n_gpu_layers=-1 offloads all model layers to the GPU.
# n_ctx is raised above the library default of 512 tokens: a later cell requests up to
# 2048 new tokens, and a completion is capped at whatever still fits in the context window.
model = Llama(model_path=model_path, n_gpu_layers=-1, n_ctx=4096)

In [None]:
# Settings forwarded to the completion call: token cap, no prompt echo, top_k of 1.
generation_kwargs = dict(max_tokens=200, echo=False, top_k=1)

prompt = "Which country hosted 2018 fifa world cup?"
res = model(prompt, **generation_kwargs)

# Leave the full response object as the final expression so the notebook displays it.
res

In [None]:
# Request a longer completion: up to 2048 new tokens, prompt not echoed, with a newline as the stop sequence.
# NOTE(review): with "\n" as a stop sequence the reply presumably ends at the first line break, well short
# of the 1000 words the prompt asks for — confirm this is intended.
output = model("Provide Information about world war 2 in 1000 words.", max_tokens=2048, stop=["\n"], echo=False)

In [None]:
# Print only the generated text of the first returned choice.
first_choice = output['choices'][0]
print(first_choice['text'])