In [None]:
!git clone https://github.com/ggerganov/llama.cpp

**Build llama.cpp and Install Dependencies**

 Changes to the llama.cpp directory, enables CUDA support (CUBLAS) for GPU acceleration, compiles the llama.cpp codebase, and installs Python dependencies required for converting Hugging Face models to GGUF format.
 Assumes llama.cpp is a subdirectory in your project or a cloned repository.

In [None]:
!cd llama.cpp && LLAMA_CUBLAS=1 make && pip install -r requirements/requirements-convert-hf-to-gguf.txt

In [3]:
from huggingface_hub import snapshot_download

In [4]:
# Hugging Face Hub repo id of the model to download and quantize.
model_name = "Qwen/Qwen1.5-1.8B"

In [5]:
# Quantization types to produce. Each entry is passed to the quantize tool and,
# upper-cased, also names the resulting .gguf file.
methods = ['q4_k_m']

In [6]:
# Local directory the original Hugging Face snapshot is downloaded into.
base_model = "./original_model/"
# Local directory that receives the GGUF files (FP16 and quantized).
quantized_path = "./quantized_model/"

In [None]:
# Download the whole model repo into `base_model` with symlinks disabled.
snapshot_download(repo_id=model_name, local_dir=base_model, local_dir_use_symlinks=False)
# Path of the FP16 GGUF written by the conversion step. `quantized_path` already
# ends with "/", so strip it before joining to avoid a doubled separator.
original_model = quantized_path.rstrip('/') + '/FP16.gguf'

In [8]:
!mkdir ./quantized_model/

**Convert Hugging Face Model to GGUF Format**

Converts a Hugging Face model (located in ./original_model/) to GGUF format with 16-bit floating-point precision (FP16) and saves it as FP16.gguf in ./quantized_model/.
GGUF is a format optimized for use with llama.cpp, enabling efficient inference.



In [None]:
# Convert the downloaded Hugging Face checkpoint in ./original_model/ into a
# single f16 GGUF file; the quantization loop reads this file as its input.
!python llama.cpp/convert-hf-to-gguf.py ./original_model/ --outtype f16 --outfile ./quantized_model/FP16.gguf

In [12]:
import os

In [13]:
import subprocess

# Quantize the FP16 GGUF once per requested method.
# - Arguments are passed as a list (no shell), so paths are never re-parsed.
# - check=True raises CalledProcessError if the quantize tool exits non-zero,
#   instead of silently continuing as os.system() did.
# - os.path.join avoids the doubled "/" that plain concatenation produced,
#   because `quantized_path` already ends with a separator.
fp16_path = os.path.join(quantized_path, "FP16.gguf")
for m in methods:
    qtype = os.path.join(quantized_path, f"{m.upper()}.gguf")  # e.g. ./quantized_model/Q4_K_M.gguf
    subprocess.run(["./llama.cpp/quantize", fp16_path, qtype, m], check=True)

**Run Interactive Chat with Quantized Model**

 Runs the llama.cpp main executable with the quantized model (Q4_K_M.gguf), generating up to 90 tokens, with the repeat penalty set to 1.0 (which disables the repetition penalty), colored output enabled, and interactive mode seeded with the chat prompt from chat-with-bob.txt.
 The -r "User:" flag sets the reverse prompt: whenever the model emits "User:", generation pauses and control returns to the user.


In [None]:
# Start llama.cpp's `main` binary on the Q4_K_M model, seeded with the
# chat-with-bob prompt file from the llama.cpp checkout; `-i` makes the session interactive.
! ./llama.cpp/main -m ./quantized_model/Q4_K_M.gguf -n 90 --repeat_penalty 1.0 --color -i -r "User:" -f llama.cpp/prompts/chat-with-bob.txt

In [None]:
# Log in to the Hugging Face Hub from within the notebook, ahead of the cells
# that create a repository and upload the quantized model to it.
from huggingface_hub import notebook_login
notebook_login()

In [19]:
from huggingface_hub import HfApi, HfFolder, create_repo, upload_file

In [24]:
model_path = "./quantized_model/Q4_K_M.gguf"  # Your model's local path
repo_name = "qwen1.5-llm"  # Desired HF Hub repository name
# exist_ok=True keeps this cell re-runnable: if the repository was already
# created by an earlier run, the call succeeds instead of raising.
repo_url = create_repo(repo_name, private=False, exist_ok=True)

In [None]:
# Upload the quantized GGUF to the repository created by create_repo().
# The repo id is taken from `repo_url` (the create_repo result) rather than a
# hard-coded "<user>/<name>" string, so the upload targets the repo owned by
# whoever is logged in.
api = HfApi()
api.upload_file(
    path_or_fileobj=model_path,
    path_in_repo="Q4_K_M.gguf",
    repo_id=repo_url.repo_id,
    repo_type="model",
)