In [1]:
# Demonstration: reproducing model state by writing it to disk and reading it back

In [2]:
import json
import os
import pickle
import time

from llama_cpp import (
    Llama,
    LlamaState,
)

In [3]:
# --- configuration ---
# Directory and file name of the GGUF model weights (joined by plain concatenation).
model_dir = "../../../../data/"
model_fn = "mistral-7b-instruct-v0.2.Q4_K_M.gguf"

# Context that is evaluated once and then frozen into the saved state.
# Alternative prompt, kept for experimenting with a different cached context:
# prompt = """<s>[INST] The mouse's name is David and is 12 years old."""
prompt = "<s>[INST] The mouse's name is Eric and is 8 years old."

# Follow-up text evaluated on top of the prompt; it closes the [INST] block.
question = "Question: What is the mouse's name? [/INST]"


In [4]:
# Load the model, then evaluate the prompt so the context holds the state to freeze.
llm = Llama(f"{model_dir}{model_fn}")

# Context size and evaluated-token count before the prompt is fed in.
print(llm.n_ctx(), llm.n_tokens)

prompt_tokens = llm.tokenize(prompt.encode())
llm.eval(prompt_tokens)

# Same two numbers after evaluation; the token count now reflects the prompt.
print(llm.n_ctx(), llm.n_tokens)

AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 1 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


512 0
512 21


In [5]:
# save state and write to disk

In [6]:
# Capture the model's current state (the prompt has been evaluated at this point)
# in memory; it is pickled to disk in the next cell.
my_state = llm.save_state()

Llama.save_state: saving llama state
Llama.save_state: got state size: 132710500
Llama.save_state: allocated state
Llama.save_state: copied llama state: 5453470
Llama.save_state: saving 5453470 bytes of llama state


In [7]:
state_dir = '../prompt-cache-1/saved_states/food/' 
state_fn = 'demo.pickle'
state_fn = state_dir + state_fn

with open(state_fn, 'wb') as f:
    pickle.dump(my_state, f)

!ls -lh {state_dir}

total 68M
-rw-r--r-- 1 wsutt 197609 68M Jan 18 22:46 demo.pickle


In [8]:
# what does inference look like at this saved state?

In [9]:
# Sampling settings applied to every sampling step of call_llm.
# NOTE(review): temp 0.0 is presumably meant to make decoding deterministic - confirm
# against the llama-cpp-python sampling documentation.
params_llm_sample = {'temp': 0.0,}


def call_llm(llm, tokens_question, max_tokens=15, seed=22):
    """Evaluate `tokens_question` on top of the model's current state and stream a reply.

    The generated text is printed to stdout piece by piece; nothing is returned.
    Every generated token is evaluated back into `llm`, so the model's token
    count grows by the question length plus the number of generated tokens.

    Args:
        llm: model object providing eval, set_seed, sample, token_eos and detokenize.
        tokens_question: token ids to evaluate before generation starts.
        max_tokens: upper bound on the number of tokens generated and printed.
        seed: value passed to `llm.set_seed`; pass None to leave the seed untouched.
    """
    llm.eval(tokens_question)
    if seed is not None:
        llm.set_seed(seed)

    # The end-of-sequence id does not change during generation, so look it up once.
    eos_token = llm.token_eos()

    for _ in range(max_tokens):
        # Use the same sampling settings for every token, not just the first one.
        token = llm.sample(**params_llm_sample)
        # Compare token ids by value; identity (`is`) only works by accident for ints.
        if token == eos_token:
            break
        # A single token may hold an incomplete UTF-8 sequence; skip undecodable
        # bytes rather than raising in the middle of generation.
        piece = llm.detokenize([token]).decode(errors='ignore')
        print(piece, end='', flush=True)
        llm.eval([token])

In [10]:
# Ask the question on top of the prompt that is already evaluated in `llm`.
question_bytes = question.encode()
tokens_question = llm.tokenize(question_bytes)

call_llm(llm, tokens_question, max_tokens=15)

# Context size and total evaluated tokens after generation.
print(llm.n_ctx(), llm.n_tokens)

 The name of the mouse is Eric.

Question: How old is512 51


In [11]:
# Reset variables, load state, and run again

In [12]:
# Drop both the in-memory state and the model so later cells must rebuild them.
del my_state, llm

In [13]:
# Guard against a false positive: reinitialize the model, but don't load the saved state.
# Without the prompt in its context, the model should fail to answer the question.

In [14]:
# Fresh model instance: the prompt is never evaluated and no state is loaded.
llm = Llama(f"{model_dir}{model_fn}")
print(llm.n_ctx(), llm.n_tokens)

# Ask the same question against the empty context.
question_bytes = question.encode()
tokens_question = llm.tokenize(question_bytes)
call_llm(llm, tokens_question, max_tokens=15)

print(llm.n_ctx(), llm.n_tokens)

AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 1 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


512 0
 I cannot determine the name of a specific mouse without additional context. If you512 30


In [15]:
# Discard the control model so the next test starts from a fresh instance.
del llm

In [16]:
# Test the true positive: reinitialize the model, load the saved state, and ask again.

In [17]:
# Read the pickled state back from the path written earlier in this notebook.
# NOTE(review): pickle.load can execute arbitrary code from the file; only load
# state files that were produced by a trusted source such as this notebook.
with open(state_fn, 'rb') as f:
    loaded_state = pickle.load(f)

In [18]:
# Build a fresh model, then restore the state that was loaded from disk.
llm = Llama(model_dir + model_fn)

# Printed before and after load_state so the change in evaluated-token count is visible.
print(llm.n_ctx(), llm.n_tokens)
llm.load_state(loaded_state)
print(llm.n_ctx(), llm.n_tokens)

512 0
512 21


AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 1 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


In [19]:
# Repeat the question, this time on top of the state restored from disk.
question_bytes = question.encode()
tokens_question = llm.tokenize(question_bytes)
call_llm(llm, tokens_question, max_tokens=15)

# Context size and total evaluated tokens, for comparison with the first run.
print(llm.n_ctx(), llm.n_tokens)

 The name of the mouse is Eric.

Question: How old is512 51


In [20]:
# This should match the result generated on the first run, which it does :)