# Llama2 cpp

- ポイント
    - cmake のインストール
    - llama-cpp-python をインストールする前に `export CMAKE_ARGS="-DLLAMA_CUBLAS=ON"` を設定（cuBLAS を有効にしてビルドするため）
- cf. [Llama.cpp で Llama 2 を試す](https://note.com/npaka/n/n0ad63134fbe2)
- cf. [llama-cpp-python 0.1.77](https://pypi.org/project/llama-cpp-python/)


In [None]:
import pathlib

# Llama 2 chat checkpoints (ggmlv3 files) this notebook has been tried with,
# keyed by model size and quantization. Change the key below to switch models.
_MODEL_FILES = {
    "7b-q5_K_M": "../backend/data/llama-2-7b-chat.ggmlv3.q5_K_M.bin",
    "13b-q4_K_M": "../backend/data/llama-2-13b-chat.ggmlv3.q4_K_M.bin",
    "70b-q4_K_M": "../backend/data/llama-2-70b-chat.ggmlv3.q4_K_M.bin",
    "70b-q5_K_M": "../backend/data/llama-2-70b-chat.ggmlv3.q5_K_M.bin",
}
llama2_model_file = _MODEL_FILES["70b-q4_K_M"]

# Cell result: True when something exists at the selected (relative) path.
pathlib.Path(llama2_model_file).exists()


In [None]:
from llama_cpp import Llama

# n_gqa is 8 when the selected file name contains "70b", otherwise 1.
# NOTE(review): presumably the grouped-query-attention factor that GGML-era
# llama-cpp-python releases require for Llama 2 70B — confirm against the
# installed version's documentation.
if "70b" in llama2_model_file:
    n_gqa = 8
else:
    n_gqa = 1

# Load the checkpoint. NOTE(review): n_gpu_layers=16 presumably offloads 16
# layers to the GPU and needs the cuBLAS build — confirm.
llm = Llama(
    model_path=llama2_model_file,
    n_gqa=n_gqa,
    n_gpu_layers=16,
)


In [None]:
# Instruction/response-style question that becomes the user turn below.
prompt = "### Instruction: What is the height of Mount Fuji?\n### Response:"

# Wrap the question in the [INST] / <<SYS>> chat envelope: the system prompt
# sits between <<SYS>> and <</SYS>>, and the user turn is closed by [/INST].
# Adjacent literals are concatenated; each one ends at a line break of the
# final prompt text.
prompt_formatted = (
    "[INST] <<SYS>>\n"
    "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\n"
    "\n"
    "If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\n"
    "<</SYS>>\n"
    "\n"
    f"{prompt} [/INST]\n"
)


# Run inference on the formatted prompt.
# NOTE(review): "\n" is one of the stop strings, so generation presumably ends
# at the first line break, and echo=True presumably returns the prompt together
# with the completion — confirm against the llama-cpp-python API docs.
generation_options = {
    "temperature": 0.1,
    "stop": ["Instruction:", "Input:", "Response:", "\n"],
    "echo": True,
}
output = llm(prompt_formatted, **generation_options)


In [None]:
# Print the text of the first entry in the result's "choices" list.
first_choice = output["choices"][0]
print(first_choice["text"])


# LangChain

In [None]:
import pathlib

# Llama 2 chat checkpoints (ggmlv3 files) available to the LangChain section,
# keyed by model size and quantization. Change the key below to switch models.
_MODEL_FILES = {
    "7b-q5_K_M": "../backend/data/llama-2-7b-chat.ggmlv3.q5_K_M.bin",
    "13b-q4_K_M": "../backend/data/llama-2-13b-chat.ggmlv3.q4_K_M.bin",
    "70b-q4_K_M": "../backend/data/llama-2-70b-chat.ggmlv3.q4_K_M.bin",
    "70b-q5_K_M": "../backend/data/llama-2-70b-chat.ggmlv3.q5_K_M.bin",
}
llama2_model_file = _MODEL_FILES["70b-q4_K_M"]

# Cell result: True when something exists at the selected (relative) path.
pathlib.Path(llama2_model_file).exists()


In [None]:
from app.llama2cpp.component.llama2cpp import LlamaCppCustom


# Default to 1 and switch to 8 only when the file name contains "70b".
# NOTE(review): presumably the grouped-query-attention factor needed by the
# 70B checkpoints — confirm against the llama-cpp-python documentation.
n_gqa = 1
if "70b" in llama2_model_file:
    n_gqa = 8

# Settings handed to the project's LlamaCppCustom wrapper.
# NOTE(review): the wrapper is defined outside this notebook; the meaning of
# each option (context size, token cap, GPU offload, streaming) is assumed to
# follow LangChain's LlamaCpp — verify in app.llama2cpp.component.llama2cpp.
llm_settings = dict(
    model_path=llama2_model_file,
    n_ctx=512,
    temperature=0,
    max_tokens=128,
    n_gqa=n_gqa,
    n_gpu_layers=16,
    verbose=True,
    streaming=True,
)
llm = LlamaCppCustom(**llm_settings)


In [None]:
# User question, written in Japanese ("How tall is Mt. Fuji?").
_prompt = "富士山の高さは？"

# Same [INST] / <<SYS>> envelope as the plain llama_cpp example, with one extra
# system-prompt line asking for a Japanese answer. Adjacent literals are
# concatenated; each one ends at a line break of the final prompt text.
prompt = (
    "[INST] <<SYS>>\n"
    "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\n"
    "\n"
    "If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\n"
    "you have to answer in Japanese.\n"
    "<</SYS>>\n"
    "\n"
    f"{_prompt} [/INST]\n"
    "\n"
)

# Non-streaming alternative to the streaming loop at the end of the notebook:
# response = llm(prompt=prompt)

In [None]:
# print(response)

In [None]:
# Iterate over the chunks yielded by llm.stream() and print each chunk's text
# fragment without a trailing newline, so the fragments join into one line of
# output. Each chunk is indexed as chunk["choices"][0]["text"].
for chunk in llm.stream(prompt=prompt, stop=None, run_manager=None):
    print(chunk["choices"][0]["text"], end="")
# Terminate the streamed line.
print()