In [2]:
# Run an LLM chat model only with OpenVINO (supports only the stateful, KV-caching enabled LLM models)
#  - Without 'optimum-intel', 'PyTorch' and HF-Tokenizers.
#  This program uses sampling method to generate the output text.

import numpy as np
from transformers import LlamaTokenizer, AutoTokenizer
import openvino as ov

In [75]:
#model_id = 'stabilityai/japanese-stablelm-base-alpha-7b'
model_id = 'stabilityai/japanese-stablelm-base-gamma-7b'
model_vendor, model_name = model_id.split('/') 

#tokenizer = LlamaTokenizer.from_pretrained("novelai/nerdstash-tokenizer-v1", additional_special_tokens=['▁▁'])
tokenizer = AutoTokenizer.from_pretrained(model_id)

device = 'CPU'
ov_config={"PERFORMANCE_HINT": "LATENCY", "NUM_STREAMS": "1", "CACHE_DIR": "./cache"}
print('Compiling the model...', end='', flush=True)
#compiled_model = ov.compile_model('stateless-int8/openvino_model.xml', device, ov_config)
compiled_model = ov.compile_model('openvino_model_int8.xml', device, ov_config)
infer_request = compiled_model.create_infer_request()
print('finished.')

print(compiled_model)

Compiling the model...finished.
<CompiledModel:
inputs[
<ConstOutput: names[input_ids] shape[?,?] type: i32>,
<ConstOutput: names[attention_mask, 336] shape[?,?] type: i32>,
<ConstOutput: names[position_ids] shape[?,?] type: i32>,
<ConstOutput: names[42, key_states.1] shape[?,8,?,128] type: f32>,
<ConstOutput: names[43] shape[?,8,?,128] type: f32>,
<ConstOutput: names[44] shape[?,8,?,128] type: f32>,
<ConstOutput: names[45] shape[?,8,?,128] type: f32>,
<ConstOutput: names[46] shape[?,8,?,128] type: f32>,
<ConstOutput: names[47] shape[?,8,?,128] type: f32>,
<ConstOutput: names[48] shape[?,8,?,128] type: f32>,
<ConstOutput: names[49] shape[?,8,?,128] type: f32>,
<ConstOutput: names[50] shape[?,8,?,128] type: f32>,
<ConstOutput: names[51] shape[?,8,?,128] type: f32>,
<ConstOutput: names[52] shape[?,8,?,128] type: f32>,
<ConstOutput: names[53] shape[?,8,?,128] type: f32>,
<ConstOutput: names[54] shape[?,8,?,128] type: f32>,
<ConstOutput: names[55] shape[?,8,?,128] type: f32>,
<ConstOutput:

In [79]:
def build_prompt(user_query, inputs="", sep="\n\n### "):
    sys_msg = "以下は、タスクを説明する指示と、文脈のある入力の組み合わせです。要求を適切に満たす応答を書きなさい。"
    p = sys_msg
    roles = ["指示", "応答"]
    msgs = [": \n" + user_query, ": "]
    if inputs:
        roles.insert(1, "入力")
        msgs.insert(1, ": \n" + inputs)
    for role, msg in zip(roles, msgs):
        p += sep + role + msg
    return p

# Infer with prompt without any additional input
user_inputs = {
    "user_query": "VR とはどのようなものですか？",
    "inputs": ""
}

In [87]:
prompt = build_prompt(**user_inputs)

# Tokenize the input text (text -> token IDs)
# - The model input for the 1st iteration
tokens = tokenizer(
    prompt, 
    add_special_tokens=False, 
    return_tensors="pt"
)
num_tokens     = tokens.input_ids.shape[-1]
input_ids      = tokens.input_ids
attention_mask = tokens.attention_mask
position       = num_tokens
position_ids   = np.array([[ n for n in range(position)]], dtype=np.int32)

# Generates initial KV cache input
seq_len = 0
past_key_values = [ ov.Tensor(type=ov.Type.f32 , shape=(1,8,seq_len,128)) for _ in range(32 * 2) ]

# generate lists that contains kv_cache output node names
#"""
# KV cache input/output name for models that is converted with OpenVINO convert_model() API.
o_past_kv_names = []
for n in range(32 * 2):
    past_kv_name = str(107 + n)
    o_past_kv_names.append(past_kv_name)
i_past_kv_names = []
for n in range(32 * 2):
    past_kv_name = str(42 + n)
    i_past_kv_names.append(past_kv_name)
#"""
    
"""
# KV cache input/output name for models that is converted with optimum-intel.
o_past_kv_names = []
for n in range(32):
    past_kv_name = f'present.{n}.key'
    o_past_kv_names.append(past_kv_name)
    past_kv_name = f'present.{n}.value'
    o_past_kv_names.append(past_kv_name)

i_past_kv_names = []
for n in range(32):
    past_kv_name = f'past_key_values.{n}.key'
    i_past_kv_names.append(past_kv_name)
    past_kv_name = f'past_key_values.{n}.value'
    i_past_kv_names.append(past_kv_name)
"""


"\n# KV cache input/output name for models that is converted with optimum-intel.\no_past_kv_names = []\nfor n in range(32):\n    past_kv_name = f'present.{n}.key'\n    o_past_kv_names.append(past_kv_name)\n    past_kv_name = f'present.{n}.value'\n    o_past_kv_names.append(past_kv_name)\n\ni_past_kv_names = []\nfor n in range(32):\n    past_kv_name = f'past_key_values.{n}.key'\n    i_past_kv_names.append(past_kv_name)\n    past_kv_name = f'past_key_values.{n}.value'\n    i_past_kv_names.append(past_kv_name)\n"

In [88]:
print('*** Start inferencing')

num_max_token_for_generation = 30
generated_text_ids = []
prev_output = ''

#infer_request.reset_state()                                     # Initialize model internal state

for token_count in range(num_max_token_for_generation):

    # Run inference (to generate the logits for the next word prediction)
    inputs={'input_ids'            : input_ids,
            'attention_mask'       : attention_mask,
            'position_ids'         : position_ids,
    }
    for n in range(64):
        input_name = i_past_kv_names[n]
        inputs[input_name] = past_key_values[n]

    response = infer_request.infer(inputs)

    # Sample the predicted token ID
    logits = response['logits'][0, -1, :]
    sampled_token_id = np.argmax(logits)                                                    # Greedy sampling

    if sampled_token_id == tokenizer.eos_token_id:
        print('\n*** EOS token detected.')
        break
    # Display the text of the last generated portion
    generated_text_ids = np.append(generated_text_ids, sampled_token_id).astype(np.int32)   # Append the predicted word to the bottom of the generated text ID array
    output_text = tokenizer.decode(generated_text_ids)                                      # Decode and generate the text from the array of token IDs
    print(output_text[len(prev_output):], end='', flush=True)                               # Print only the last generated word
    prev_output = output_text

    # Setup input data for the next iteration
    input_ids      = np.array([[sampled_token_id]], dtype=np.int32)
    attention_mask = np.array([[1]], dtype=np.int32)
    position_ids   = np.array([[position]], dtype=np.int32)
    position      += 1
    past_key_values = [ response[o_past_kv_names[n]] for n in range(32 * 2) ]

print(f'\n\n*** Completed.')

*** Start inferencing

VR は、��想現実の略です。VR は、コンピューターの画面

*** Completed.
