In [None]:
import os

# Fetch and build llama.cpp. The later cells rely on the working directory
# being the llama.cpp checkout: they reference convert_hf_to_gguf.py,
# ./build/bin and ./models relative to it.
#
# The guards make this cell safe to re-run. Without them, a second run started
# from inside the checkout would clone a nested llama.cpp/llama.cpp and then
# change into it.
if os.path.basename(os.getcwd()) != "llama.cpp":
    if not os.path.isdir("llama.cpp"):
        # Shallow clone: only the current tree is needed to build and convert.
        !git clone --depth 1 https://github.com/ggerganov/llama.cpp.git
    %cd llama.cpp

# Configure, then build with one job per available CPU core
# (falls back to a single job if the core count cannot be determined).
!cmake -B build
!cmake --build build --config Release -j {os.cpu_count() or 1}

In [None]:
# Python-side dependencies for downloading the model and running inference.
# `%pip` (rather than `!pip`) installs into the environment of the running
# kernel, so the packages are importable from the cells below.
%pip install llama-cpp-python transformers huggingface_hub

In [None]:
from huggingface_hub import snapshot_download

# Download the whole model repository into ./models (relative to the current
# working directory) so the conversion step can read it as a plain folder.
#
# `local_dir_use_symlinks` is intentionally not passed: it has been deprecated
# and ignored since huggingface_hub 0.23, where downloads to `local_dir` no
# longer rely on symlinks.
snapshot_download(
    repo_id="microsoft/Phi-4-mini-instruct",
    local_dir="./models/Phi-4-mini-instruct",
)

In [None]:
# Convert the downloaded Hugging Face checkpoint into a single GGUF file.
# `--outtype f16` selects the f16 output type; the result is written next to
# the downloaded model files.
!python convert_hf_to_gguf.py ./models/Phi-4-mini-instruct/ \
  --outtype f16 \
  --outfile ./models/Phi-4-mini-instruct/ggml-model-f16.gguf

In [None]:
# Quantize the f16 GGUF produced by the conversion step.
# Arguments, in order: input GGUF, output GGUF, quantization type (Q4_K_M).
!./build/bin/llama-quantize ./models/Phi-4-mini-instruct/ggml-model-f16.gguf ./models/Phi-4-mini-instruct/ggml-model-Q4_K_M.gguf Q4_K_M

In [None]:
from llama_cpp import Llama

import os

# ─── 1) Initialize the model ───────────────────────────────────────────────────
# Locate the quantized GGUF written by the llama-quantize step. That step uses
# a path relative to the working directory, so try the same relative location
# first; the absolute Colab path is kept as a fallback for sessions whose
# working directory is no longer the llama.cpp checkout.
model_file = "models/Phi-4-mini-instruct/ggml-model-Q4_K_M.gguf"
candidate_paths = [
    os.path.abspath(model_file),
    os.path.join("/content/llama.cpp", model_file),
]
# If neither file exists, keep the cwd-relative candidate so the loader's
# error message points at the location the earlier cells write to.
model_path = next(
    (path for path in candidate_paths if os.path.isfile(path)),
    candidate_paths[0],
)

llm = Llama(
    model_path=model_path,
    n_ctx=2048,
    verbose=False,
)

# ─── 2) Build a proper chat‐style prompt ────────────────────────────────────────
# Phi-4-mini-instruct expects the Phi chat format (not ChatML):
#
#   <|system|>…<|end|><|user|>…<|end|><|assistant|>
#
# You can also call llm.create_chat_completion to have the library format
# it for you under the hood.

# Conversation to send: a system instruction followed by a single user turn.
# Each entry is a dict with "role" and "content" keys.
messages = [
    dict(role=speaker, content=text)
    for speaker, text in (
        ("system", "You are a thoughtful, detailed AI assistant."),
        ("user", "What does life mean? Describe in great detail."),
    )
]

# ─── 3) Run the chat completion ────────────────────────────────────────────────
# Pass the conversation together with the sampling settings
# (generation length cap, temperature, nucleus sampling threshold).
response = llm.create_chat_completion(
    top_p=0.95,
    temperature=0.7,
    max_tokens=512,
    messages=messages,
)

# ─── 4) Show the assistant's reply ─────────────────────────────────────────────
# The reply text is the message content of the first returned choice.
assistant_message = response["choices"][0]["message"]
print(assistant_message["content"])