In [1]:
from llama_cpp import Llama
import time, json

In [2]:
# Location of the quantized GGUF model file, relative to this notebook.
modelPath = '../../data/llama-2-7b.Q4_K_M.gguf'

# Prompt layout (prompter turn, end-of-sequence marker, assistant turn).
# NOTE(review): no cell visible in this notebook reads `template`; the helper
# functions build the same suffix by hand.
template = "<|prompter|>{user_prompt}</s><|assistant|>"

# Load the model with the library's default settings.
llm = Llama(model_path=modelPath)

AVX = 1 | AVX2 = 1 | AVX512 = 1 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


In [3]:
# Display the model's context size and its current token count
# (the recorded output for a freshly loaded model is (512, 0)).
llm.n_ctx(), llm.n_tokens

(512, 0)

In [4]:
def reverse_cumsum(d):
    """Turn a dict of running totals into a dict of per-step increments.

    The first entry (in insertion order) is kept unchanged; every later entry
    becomes its own value minus the value of the entry immediately before it.

    Args:
        d: Mapping of label -> cumulative value, in insertion order.

    Returns:
        A new dict with the same keys in the same order holding the
        increments. The input mapping is not modified. An empty input
        yields an empty dict instead of raising IndexError.
    """
    items = list(d.items())
    if not items:
        return {}

    first_key, first_value = items[0]
    increments = {first_key: first_value}
    # Walk consecutive (previous, current) pairs to take the differences.
    for (_, prev_value), (key, value) in zip(items, items[1:]):
        increments[key] = value - prev_value
    return increments

In [5]:
def askTheQuestion(in_state, prompt, max_tokens=10):
    """Restore a saved model state, ask a question, and stream the answer.

    The saved state is loaded into the module-level ``llm``, the question is
    appended (followed by the end-of-turn / assistant markers) and evaluated,
    then up to ``max_tokens`` tokens are sampled and printed as they arrive.
    Finally the time spent in each phase is printed as JSON.

    Args:
        in_state: A state object previously returned by ``llm.save_state()``.
        prompt: The question text to append to the restored context.
        max_tokens: Upper bound on the number of tokens to generate.

    Returns:
        None. The answer and the timing breakdown are written to stdout.
    """
    d_time = {}
    t0 = time.time()

    llm.load_state(in_state)
    d_time['load_state'] = time.time() - t0
    print("Loaded state. ntokens: ", llm.n_tokens)

    question = " {prompt}</s><|assistant|> ".format(prompt=prompt)
    llm.eval(llm.tokenize(question.encode()))
    d_time['eval_prompt'] = time.time() - t0
    print("Tokenized.")

    eos_token = llm.token_eos()
    token = llm.sample()
    for _ in range(max_tokens):
        # Compare token ids by value: `is not` on ints only works by accident
        # for values that happen to be interned by the interpreter.
        if token == eos_token:
            break
        # A single token may hold only part of a multi-byte character, so
        # decode leniently instead of raising UnicodeDecodeError mid-stream.
        piece = llm.detokenize([token]).decode(errors='replace')
        print(piece, end='', flush=True)
        llm.eval([token])
        token = llm.sample()
    # Terminate the streamed line so the timing JSON starts on its own line.
    print()
    d_time['main_sample'] = time.time() - t0

    # Timestamps above are cumulative from t0; convert to per-phase durations.
    d_time = reverse_cumsum(d_time)
    print(json.dumps(d_time, indent=2))

def createState(prefix):
    """Evaluate a prompt prefix from scratch and return the saved model state.

    The module-level ``llm`` is reset, the prefix is tokenized and evaluated,
    and the time spent in each phase is printed as JSON. The returned state
    can later be handed to ``llm.load_state`` to resume from this prefix.

    Args:
        prefix: Text to evaluate as the start of the context.

    Returns:
        The object returned by ``llm.save_state()``.
    """
    d_time = {}
    t0 = time.time()

    llm.reset()
    d_time['reset'] = time.time() - t0

    # The result of eval() is not needed; only its effect on the model state.
    llm.eval(llm.tokenize(prefix.encode()))
    d_time['eval_time'] = time.time() - t0

    # Timestamps above are cumulative from t0; convert to per-phase durations.
    d_time = reverse_cumsum(d_time)
    print(json.dumps(d_time, indent=2))
    return llm.save_state()

In [6]:
# Evaluate the "Fred" persona prefix once and keep the saved state for reuse.
first_state = createState(
    "<|prompter|>You are a superhero named Fred. You live in Metropolis. Your nemesis is Lex Luthor. Everyone thinks you're Superman, but you're a different hero that wishes he could be recognized on his own merits."
)
print("Saved First State")


{
  "reset": 0.0,
  "eval_time": 7.0601561069488525
}
Saved First State


Llama.save_state: saving llama state
Llama.save_state: got state size: 276181804
Llama.save_state: allocated state
Llama.save_state: copied llama state: 39209248
Llama.save_state: saving 39209248 bytes of llama state


In [7]:

# Evaluate the "Barney" persona prefix once and keep the saved state for reuse.
second_state = createState(
    "<|prompter|>You are a hobbit named Barney that lives in the shire."
)
print("Saved Second State")


{
  "reset": 0.0,
  "eval_time": 2.8706045150756836
}
Saved Second State


Llama.save_state: saving llama state
Llama.save_state: got state size: 276181804
Llama.save_state: allocated state
Llama.save_state: copied llama state: 20334736
Llama.save_state: saving 20334736 bytes of llama state


In [8]:

# Resume from the first saved prefix state and ask a question.
askTheQuestion(first_state, "What is happening today?")


Loaded state. ntokens:  60
Tokenized.
 I am here at the Daily Planet, waiting for{
  "load_state": 0.037410736083984375,
  "eval_prompt": 1.8970098495483398,
  "main_sample": 1.7665233612060547
}


In [11]:
# Switch to the second saved prefix state and ask the same question.
askTheQuestion(second_state, "What is happening today?")

Loaded state. ntokens:  24
Tokenized.
 Today you're going to be doing your ch{
  "load_state": 0.027421951293945312,
  "eval_prompt": 1.9321198463439941,
  "main_sample": 1.6253106594085693
}


In [12]:
# Ask two different questions back-to-back; each call reloads first_state
# via llm.load_state before evaluating its question.
askTheQuestion(first_state, "What is your dream?")
askTheQuestion(first_state, "What is happening today?")

Loaded state. ntokens:  60
Tokenized.
 You are the president of the United States and you{
  "load_state": 0.041242122650146484,
  "eval_prompt": 1.873370885848999,
  "main_sample": 1.822178602218628
}
Loaded state. ntokens:  60
Tokenized.
 I am here at the Daily Planet, waiting for{
  "load_state": 0.033985137939453125,
  "eval_prompt": 1.812589168548584,
  "main_sample": 1.7143278121948242
}


In [13]:

# NOTE(review): the original note here said a failure happens on the line below.
# The output recorded for this run shows both calls completing, so the failure
# may be intermittent or already resolved; confirm before relying on this cell.
askTheQuestion(second_state, "What is happening today?")
askTheQuestion(second_state, "What is your favourite thing?")

Loaded state. ntokens:  24
Tokenized.
 Today you're going to be doing your ch{
  "load_state": 0.029329538345336914,
  "eval_prompt": 1.988227367401123,
  "main_sample": 1.7650163173675537
}
Loaded state. ntokens:  24
Tokenized.
 My favourite things are my pipe and my to{
  "load_state": 0.023984432220458984,
  "eval_prompt": 2.1846811771392822,
  "main_sample": 1.7713751792907715
}
