In [2]:
from llama_cpp import Llama
import time, json

In [12]:
# Relative path to the quantized GGUF model file handed to Llama below.
modelPath = '../../data/llama-2-7b.Q4_K_M.gguf'
# Unused chat-style prompt template, kept for reference only.
# template="<|prompter|>{user_prompt}</s><|assistant|>"
# Shared prompt prefix: an article excerpt that ends with "Question:" so a
# question can be appended directly after it. This prefix is evaluated once
# (see createState) and its state is reused for every question.
# NOTE(review): "The following in an excerpt" looks like a typo for
# "is an excerpt"; left untouched here because editing it changes the prompt.
big_text = """The following in an excerpt of an article.

Alaska Airlines Flight 1282 was bound for Ontario, California when it returned to Portland, Oregon shortly after takeoff on Friday after a pressurization issue was detected. No serious injuries were reported on the flight, according to federal safety officials. The flight returned to Portland, Oregon, shortly after takeoff on Friday after a pressurization issue was reported.

Images and video of Alaska's Boeing 737 Max 9 shared on social media showed a gaping hole on the side of the plane and passengers using oxygen masks before it returned to Portland.

“Safety will continue to drive our decision-making as we assist the NTSB's investigation into Alaska Airlines Flight 1282,” FAA Administrator Mike Whitaker said in a statement.

Question:
"""

# Load the model with library defaults; only the model path is specified.
# No context size is passed, and the later n_ctx() cell reports 512.
llm = Llama(
        model_path=modelPath, 
    )

AVX = 1 | AVX2 = 1 | AVX512 = 1 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


In [13]:
# Count how many tokens the shared prefix occupies once tokenized.
len(llm.tokenize( big_text.encode() ))

194

In [14]:
# Show the model's context size alongside its current n_tokens counter
# (presumably the number of tokens evaluated so far; 0 before any eval).
llm.n_ctx(), llm.n_tokens

(512, 0)

In [15]:
def reverse_cumsum(d):
    """Turn a dict of cumulative values into per-step increments.

    The first key keeps its value unchanged; every later key is mapped to
    the difference between its value and the value of the key before it
    (in the dict's insertion order). This is used to convert "elapsed time
    since t0" checkpoints into the duration of each individual stage.

    Args:
        d: Mapping of label -> cumulative value, in chronological order.

    Returns:
        A new dict with the same keys in the same order holding the
        per-step differences. An empty input yields an empty dict. The
        input mapping is not modified.
    """
    increments = {}
    previous = None
    for key, value in d.items():
        # The first entry has no predecessor, so it is its own increment.
        increments[key] = value if previous is None else value - previous
        previous = value
    return increments

In [16]:
def askTheQuestion(in_state, question_text, max_tokens=10):
    """Restore a saved model state, append a question, and stream an answer.

    Loads ``in_state`` into the module-level ``llm``, evaluates the tokens
    of ``question_text`` on top of it, then samples and prints tokens one at
    a time until the end-of-sequence token is sampled or ``max_tokens``
    tokens have been printed. Finally prints a JSON breakdown of the time
    spent in each stage (state load, prompt eval, sampling loop).

    Args:
        in_state: A state object previously returned by ``llm.save_state()``.
        question_text: Text appended after the restored prefix.
        max_tokens: Maximum number of generated tokens to print.

    Returns:
        None. All results are written to stdout.
    """
    d_time = {}
    # perf_counter is a monotonic, high-resolution clock meant for intervals.
    t0 = time.perf_counter()
    llm.load_state(in_state)
    d_time['load_state'] = time.perf_counter() - t0
    print("Loaded state. ntokens: ", llm.n_tokens)
    llm.eval(llm.tokenize(question_text.encode()))
    d_time['eval_prompt'] = time.perf_counter() - t0
    print("Tokenized.")

    eos_token = llm.token_eos()
    token = llm.sample()
    counter = 0
    # Compare token ids by value: identity (`is`) on ints only works by
    # accident of the interpreter's small-integer caching.
    while token != eos_token:
        counter += 1
        if counter > max_tokens:
            break
        # A single token may hold only part of a multi-byte UTF-8 character;
        # ignore undecodable bytes instead of raising UnicodeDecodeError.
        piece = llm.detokenize([token]).decode('utf-8', errors='ignore')
        print(piece, end='', flush=True)
        llm.eval([token])
        token = llm.sample()
    # Finish the streamed answer line so the timing JSON starts on its own line.
    print()
    d_time['main_sample'] = time.perf_counter() - t0

    # Checkpoints above are cumulative since t0; convert to per-stage durations.
    d_time = reverse_cumsum(d_time)
    print(json.dumps(d_time, indent=2))

def createState(prefix):
    """Evaluate a prompt prefix from scratch and return the saved model state.

    Resets the module-level ``llm``, evaluates the tokens of ``prefix``,
    prints a JSON breakdown of the time spent resetting and evaluating, and
    returns a snapshot that can later be restored with ``llm.load_state``.

    Args:
        prefix: Text to tokenize and evaluate as the shared prompt prefix.

    Returns:
        The object returned by ``llm.save_state()``.
    """
    d_time = {}
    # perf_counter is a monotonic, high-resolution clock meant for intervals.
    t0 = time.perf_counter()
    llm.reset()
    d_time['reset'] = time.perf_counter() - t0
    llm.eval(llm.tokenize(prefix.encode()))
    d_time['eval_time'] = time.perf_counter() - t0
    # Checkpoints above are cumulative since t0; convert to per-stage durations.
    d_time = reverse_cumsum(d_time)
    print(json.dumps(d_time, indent=2))
    return llm.save_state()

In [17]:
# Evaluate the article prefix once and keep the resulting state so each
# question below can start from it instead of re-evaluating the prefix.
first_state = createState(big_text)
print("Saved First State")


{
  "reset": 0.0,
  "eval_time": 21.17530083656311
}
Saved First State


Llama.save_state: saving llama state
Llama.save_state: got state size: 293333804
Llama.save_state: allocated state
Llama.save_state: copied llama state: 126616376
Llama.save_state: saving 126616376 bytes of llama state


In [18]:
# First question, asked on top of the saved prefix state.
q1 = "What was the destination of the plane?"
askTheQuestion(first_state, q1)


Loaded state. ntokens:  194
Tokenized.


Comment: The flight from Portland to{
  "load_state": 0.09851551055908203,
  "eval_prompt": 1.1630008220672607,
  "main_sample": 1.6533801555633545
}


In [19]:
# Second question, again starting from the same saved prefix state.
q2 = "When did the flight occur?"
askTheQuestion(first_state, q2)

Loaded state. ntokens:  194
Tokenized.


Answer: The incident happened on Friday,{
  "load_state": 0.07854390144348145,
  "eval_prompt": 0.8646790981292725,
  "main_sample": 1.63368821144104
}
