In [7]:
import os
import llama_cpp
from llama_cpp import Llama
from tokenizer import Tokenizer

# Both artefacts are resolved relative to the directory named by the
# MODELS_PATH environment variable (a KeyError is raised if it is unset).
models_root = os.environ["MODELS_PATH"]
model_path = os.path.join(models_root, "Meta", "LLaMA", "7B", "ggml-model-f16.bin")
tokenizer_path = os.path.join(models_root, "Meta", "LLaMA2", "tokenizer.model")

# The two tokenizers under comparison: the one bundled with the llama.cpp
# model file, and the one built by `Tokenizer` from tokenizer.model.
llama_cpp_llm = Llama(model_path=model_path)
meta_tokenizer = Tokenizer(model_path=tokenizer_path)

AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | VSX = 0 | 


In [8]:
def get_llama_ccp_vocab_id_to_piece():
    """Return the llama.cpp vocabulary as a list of pieces indexed by token id.

    A temporary low-level llama.cpp context is opened on the module-level
    ``model_path``, every vocabulary string is copied out of it through
    ``llama_get_vocab``, and the context is released again before returning.

    Returns:
        list: one entry per token id, obtained by slicing the ctypes
        ``c_char_p`` array that ``llama_get_vocab`` fills in.

    Raises:
        RuntimeError: if llama.cpp could not create a context for ``model_path``.
        AssertionError: if ``llama_get_vocab`` reports a different number of
            entries than ``llama_n_vocab``.
    """
    lparams = llama_cpp.llama_context_default_params()
    # Only the vocabulary is read below, so ask llama.cpp not to load the
    # weights a second time (the Llama object already holds the full model).
    # NOTE(review): relies on the `vocab_only` field of llama_context_params;
    # confirm it exists in the installed llama.cpp build.
    lparams.vocab_only = True

    ctx = llama_cpp.llama_init_from_file(str(model_path).encode("utf8"), lparams)
    if not ctx:
        # A NULL context must not be handed to the other C calls.
        raise RuntimeError(f"llama.cpp failed to open model file: {model_path}")

    try:
        n_vocab = llama_cpp.llama_n_vocab(ctx)

        strings = (llama_cpp.c_char_p * n_vocab)()
        scores = (llama_cpp.c_float * n_vocab)()

        n_copied = llama_cpp.llama_get_vocab(
            llama_cpp.llama_context_p(ctx), strings, scores, llama_cpp.c_int(n_vocab)
        )
        assert n_copied == n_vocab, f"expected {n_vocab} vocabulary entries, got {n_copied}"

        # Slicing a c_char_p array yields Python bytes objects, so the result
        # stays valid after the context is freed in the finally clause.
        return strings[:]
    finally:
        # Always release the native context; the original version leaked it.
        llama_cpp.llama_free(ctx)

# Build the llama.cpp id -> piece table once; later cells index it by token id.
llama_ccp_id_to_piece = get_llama_ccp_vocab_id_to_piece()


In [9]:
# Encode the same prompt with both tokenizers, with and without an extra
# leading space on the llama.cpp side, and print the ids side by side.
for prompt in ("Hello world", " Hello world"):
    prompt_bytes = prompt.encode("utf-8")

    print(f'For prompt "{prompt}":')

    llama_cpp_ids = llama_cpp_llm.tokenize(text=prompt_bytes, add_bos=False)
    print("llama.cpp tokenizer:", llama_cpp_ids)

    llama_cpp_spaced_ids = llama_cpp_llm.tokenize(text=b" " + prompt_bytes, add_bos=False)
    print("llama.cpp tokenizer + space:", llama_cpp_spaced_ids)

    meta_ids = meta_tokenizer.encode(prompt, bos=False, eos=False)
    print("Meta tokenizer:", meta_ids)

    print()

For prompt "Hello world":
llama.cpp tokenizer: [10994, 3186]
llama.cpp tokenizer + space: [15043, 3186]
Meta tokenizer: [15043, 3186]

For prompt " Hello world":
llama.cpp tokenizer: [15043, 3186]
llama.cpp tokenizer + space: [29871, 15043, 3186]
Meta tokenizer: [29871, 15043, 3186]



In [10]:
# Decode fixed token-id sequences with both tokenizers; each result is wrapped
# in "|" so leading/trailing whitespace is visible in the printed output.
print("Doing the reverse process:")
token_sequences = ([10994, 3186], [15043, 3186], [29871, 15043, 3186])
for tokens in token_sequences:
    print(f'For tokens "{tokens}":')

    llama_cpp_text = llama_cpp_llm.detokenize(tokens)
    print(f"llama.cpp tokenizer: |{llama_cpp_text}|")

    meta_text = meta_tokenizer.decode(tokens)
    print(f"Meta tokenizer: |{meta_text}|")

    print()

print("*Adding | to ease visualization.")

Doing the reverse process:
For tokens "[10994, 3186]":
llama.cpp tokenizer: |b'Hello world'|
Meta tokenizer: |Hello world|

For tokens "[15043, 3186]":
llama.cpp tokenizer: |b' Hello world'|
Meta tokenizer: |Hello world|

For tokens "[29871, 15043, 3186]":
llama.cpp tokenizer: |b'  Hello world'|
Meta tokenizer: | Hello world|

*Adding | to ease visualization.


In [11]:
# Look up the raw vocabulary piece stored for each token id on both sides.
piece_ids = (10994, 3186, 15043, 29871)

print("Looking the id_to_piece for llama.cpp:")
for i in piece_ids:
    llama_cpp_piece = llama_ccp_id_to_piece[i]
    print(f"id {i} |{llama_cpp_piece}|")

print("\nLooking the id_to_piece for Meta:")
for i in piece_ids:
    meta_piece = meta_tokenizer.sp_model.id_to_piece(i)
    print(f"id {i} |{meta_piece}|")

print("\n*Adding | to ease visualization.")

print("Note, the 29871 token is not the underline character but \"\\u2581\" (https://github.com/google/sentencepiece#whitespace-is-treated-as-a-basic-symbol).")
# Check that the Meta piece for id 29871 is U+2581 rather than "_".
whitespace_piece = meta_tokenizer.sp_model.id_to_piece(29871)
print(whitespace_piece == "\u2581")

Looking the id_to_piece for llama.cpp:
id 10994 |b'Hello'|
id 3186 |b' world'|
id 15043 |b' Hello'|
id 29871 |b' '|

Looking the id_to_piece for Meta:
id 10994 |Hello|
id 3186 |▁world|
id 15043 |▁Hello|
id 29871 |▁|

*Adding | to ease visualization.
Note, the 29871 token is not the underline character but "\u2581" (https://github.com/google/sentencepiece#whitespace-is-treated-as-a-basic-symbol).
True


In [12]:
# Decode each token id on its own with both tokenizers; "|" delimiters make
# any whitespace in the decoded text visible.
single_ids = (10994, 3186, 15043, 29871)

print("Using the llama.cpp detokenizer:")
for i in single_ids:
    llama_cpp_text = llama_cpp_llm.detokenize([i])
    print(f"id {i} |{llama_cpp_text}|")

print("\nUsing the Meta detokenizer:")
for i in single_ids:
    meta_text = meta_tokenizer.decode([i])
    print(f"id {i} |{meta_text}|")

Using the llama.cpp detokenizer:
id 10994 |b'Hello'|
id 3186 |b' world'|
id 15043 |b' Hello'|
id 29871 |b' '|

Using the Meta detokenizer:
id 10994 |Hello|
id 3186 |world|
id 15043 |Hello|
id 29871 ||
