In [1]:
# Run an LLM chat model only with OpenVINO (supports only the stateful, KV-caching enabled LLM models)
#  - Without 'optimum-intel', 'PyTorch' and HF-Tokenizers.
#  This program uses greedy decoding (argmax over the logits) to generate the output text.

import numpy as np
from transformers import LlamaTokenizer
import openvino as ov

In [120]:
# Tokenizer used for encoding the prompt and decoding generated IDs.
# '▁▁' is registered as an additional special token.
tokenizer = LlamaTokenizer.from_pretrained(
    "novelai/nerdstash-tokenizer-v1",
    additional_special_tokens=['▁▁'],
)

# Target device and the properties handed to ov.compile_model().
device = 'CPU'
ov_config = {
    "PERFORMANCE_HINT": "LATENCY",
    "NUM_STREAMS": "1",
    "CACHE_DIR": "./cache",
}

# Compile the IR and create the infer request used by the generation loop.
# (An alternative IR, 'openvino_model_int8.xml', was used here previously.)
print('Compiling the model...', end='', flush=True)
compiled_model = ov.compile_model('openvino_model_int8_no_kv_out.xml', device, ov_config)
infer_request = compiled_model.create_infer_request()
print('finished.')

# Show the compiled model's input and output ports.
print(compiled_model)

Compiling the model...finished.
<CompiledModel:
inputs[
<ConstOutput: names[input_ids] shape[?,?] type: i32>,
<ConstOutput: names[attention_mask] shape[?,?] type: i32>,
<ConstOutput: names[position_ids] shape[?,?] type: i32>,
<ConstOutput: names[use_cache] shape[] type: i32>,
<ConstOutput: names[output_attentions] shape[] type: i32>,
<ConstOutput: names[output_hidden_states] shape[] type: i32>,
<ConstOutput: names[return_dict] shape[] type: i32>
]
outputs[
<ConstOutput: names[6430] shape[?,?,65536] type: f32>
]>


In [121]:
def build_prompt(user_query, inputs="", sep="\n\n### "):
    """Assemble the instruction-style prompt text fed to the model.

    The prompt starts with a fixed Japanese system message and is followed
    by one section per role, each introduced by ``sep``:
    instruction ("指示"), optional input ("入力"), and an empty response
    ("応答") header that the model is expected to continue.

    Args:
        user_query: Instruction text placed in the "指示" section.
        inputs: Optional context text. When empty (falsy), the "入力"
            section is omitted entirely.
        sep: Separator string inserted before every role header.

    Returns:
        The complete prompt as a single string.
    """
    sections = [("指示", ": \n" + user_query)]
    if inputs:
        sections.append(("入力", ": \n" + inputs))
    # The response header is left open-ended for the model to complete.
    sections.append(("応答", ": "))

    header = "以下は、タスクを説明する指示と、文脈のある入力の組み合わせです。要求を適切に満たす応答を書きなさい。"
    return header + "".join(sep + role + body for role, body in sections)

# Example request: an instruction only, with no supplementary input text.
user_inputs = dict(
    user_query="VR とはどのようなものですか？",
    inputs="",
)

In [None]:
# Build the prompt and tokenize it (text -> token IDs).
# NumPy tensors are requested so that PyTorch is not required, in line with
# the goal stated at the top of the notebook.
prompt = build_prompt(**user_inputs)

tokens = tokenizer(
    prompt,
    add_special_tokens=False,
    return_tensors="np",
)

# Model inputs for the 1st iteration.
# The compiled model reports its input_ids / attention_mask / position_ids
# ports as i32, so the arrays are cast to int32 up front.
input_ids      = tokens.input_ids.astype(np.int32)
attention_mask = tokens.attention_mask.astype(np.int32)
num_tokens     = input_ids.shape[-1]
position       = num_tokens                                      # Position index of the next token to be appended
position_ids   = np.arange(position, dtype=np.int32)[np.newaxis, :]
beam_idx       = np.array([0], dtype=np.int32)

# Generation settings and running state consumed by the generation loop.
num_max_token_for_generation = 20
generated_text_ids = []
prev_output = ''

# --- KV-cache scaffolding -------------------------------------------------
# NOTE(review): the values below are only referenced from disabled code in
# the generation loop as far as this notebook shows; they are kept so that
# the KV-cache experiment can be resumed.
# Empty (sequence length 0) key/value tensor, shared by all 32 (key, value) pairs.
past_key_value = ov.Tensor(type=ov.Type.f32 , shape=(1,32,0,128))
past_key_values = tuple((past_key_value, past_key_value) for _ in range(32))

print(len(past_key_values), len(past_key_values[0]), past_key_values[0][1].shape)

# Hard-coded node names for the kv_cache / hidden-state outputs and the
# kv_cache inputs.
# NOTE(review): presumably auto-generated IDs from the model conversion --
# confirm against the IR before relying on them.
o_past_kv_names       = [str(7204 + n) for n in range(32)]
o_hidden_states_names = [str(7172 + n) for n in range(32)]
i_past_kv_names       = [str(42 + n) for n in range(64)]

print(input_ids.shape)

In [None]:
# Greedy text generation.
# The model is run on the whole token sequence at every step (no KV-cache
# tensors are passed in or read back), the arg-max token of the last position
# is appended to the inputs, and the newly decoded text is streamed to stdout.
print('*** Start inferencing')

# Name of the logits output (the single output listed when the compiled model is printed).
logits_name = '6430'

# Normalise the running inputs to int32 NumPy arrays: the compiled model
# declares these ports as i32, and this also accepts tensors coming from the
# tokenizer in another array type.
input_ids      = np.asarray(input_ids, dtype=np.int32)
attention_mask = np.asarray(attention_mask, dtype=np.int32)
position_ids   = np.asarray(position_ids, dtype=np.int32)

# Keep generated IDs as plain Python ints. Growing this with np.append() from
# an empty list would silently produce a float64 array of token IDs.
generated_text_ids = [int(token_id) for token_id in generated_text_ids]

infer_request.reset_state()                                     # Initialize model internal state
for i in range(num_max_token_for_generation):

    # Run inference (to generate the logits for the next word prediction)
    inputs = {
        'input_ids'            : input_ids,
        'attention_mask'       : attention_mask,
        'position_ids'         : position_ids,
        'use_cache'            : 1,
        'output_attentions'    : 0,
        'output_hidden_states' : 0,
        'return_dict'          : 0,
    }
    response = infer_request.infer(inputs)

    # Logits of the last position of the first (only) sequence; pick the arg-max token.
    logits = response[logits_name][0, -1, :].ravel()
    sampled_token_id = int(np.argmax(logits))

    if sampled_token_id == tokenizer.eos_token_id:
        print('\n*** EOS token detected.')
        break

    generated_text_ids.append(sampled_token_id)                     # Append the predicted token to the generated ID list
    output_text = tokenizer.decode(generated_text_ids)              # Decode the whole generated sequence back to text
    print(output_text[len(prev_output):], end='', flush=True)       # Print only the newly generated part
    prev_output = output_text

    # Extend the model inputs by one position for the next iteration,
    # keeping the int32 dtype instead of letting it be promoted.
    input_ids      = np.concatenate([input_ids, np.array([[sampled_token_id]], dtype=input_ids.dtype)], axis=1)
    attention_mask = np.concatenate([attention_mask, np.ones((1, 1), dtype=attention_mask.dtype)], axis=1)
    position_ids   = np.concatenate([position_ids, np.array([[position]], dtype=position_ids.dtype)], axis=1)
    position      += 1

print('\n\n*** Completed.')