In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig, TextStreamer, set_seed
from optimum.intel import OVModelForCausalLM
import openvino as ov

# Hugging Face repository id in "<vendor>/<name>" form. The <name> part is reused
# further down as the local folder that holds the converted model.
model_id = "stabilityai/japanese-stablelm-base-gamma-7b"
(model_vendor, model_name) = model_id.split("/")

  from .autonotebook import tqdm as notebook_tqdm


INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, onnx, openvino


## モデル変換 (OpenVINOモデル生成)

In [3]:
import os
import nncf

# Export the Hugging Face checkpoint to OpenVINO and compress its weights to INT4,
# unless a converted model is already present on disk.
#
# The guard checks for the saved IR file itself rather than only the output directory:
# a directory left behind by an interrupted or failed export would otherwise suppress
# the conversion for good and make the later load step fail.
int4_model_xml = f'{model_name}/INT4/openvino_model.xml'

if not os.path.exists(int4_model_xml):
    # export=True converts the original checkpoint while loading; compile=False and
    # load_in_8bit=False are passed because the weights are compressed explicitly below.
    ov_model = OVModelForCausalLM.from_pretrained(model_id, export=True, compile=False, load_in_8bit=False)

    # NOTE(review): `_original_model` is a private attribute of the optimum-intel wrapper;
    # confirm it is still available (and is the model that `.half()` converts) in the
    # installed optimum-intel version.
    compressed_model = nncf.compress_weights(
        ov_model.half()._original_model,
        mode=nncf.CompressWeightsMode.INT4_ASYM,
        group_size=128,
        # With ratio=0.8 the NNCF statistics printed below report 80% of the
        # ratio-defining layers at 4 bits and the remainder at 8 bits.
        ratio=0.8,
    )
    ov.save_model(compressed_model, int4_model_xml)


This architecture : mistral was not validated, only :blenderbot, gpt-bigcode, gpt-neox, bart, gpt-neo, marian, opt, gpt2, pegasus, codegen, bloom, llama, blenderbot-small architectures were validated, use at your own risk.
Framework not specified. Using pt to export to ONNX.
Loading checkpoint shards: 100%|██████████| 2/2 [00:42<00:00, 21.09s/it]
Using the export variant default. Available variants are:
    - default: The default ONNX variant.
Using framework PyTorch: 2.2.0+cpu
Overriding 1 configuration item(s)
	- use_cache -> True
  if (input_shape[-1] > 1 or self.sliding_window is not None) and self.is_causal:
  if past_key_values_length > 0:
  if seq_len > self.max_seq_len_cached:
  if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
Exception ignored in: <finalize object at 0x1ec05334820; dead>
Traceback (most recent call last):
  File "C:\Users\yas_s\AppData\Local\Programs\Python\Python310\lib\weakref.py", line 591, in __call__
    return info.func(*info.args, **(info.kwargs

INFO:nncf:Statistics of the bitwidth distribution:
+--------------+---------------------------+-----------------------------------+
| Num bits (N) | % all parameters (layers) |    % ratio-defining parameters    |
|              |                           |             (layers)              |
| 8            | 23% (85 / 226)            | 20% (83 / 224)                    |
+--------------+---------------------------+-----------------------------------+
| 4            | 77% (141 / 226)           | 80% (141 / 224)                   |
+--------------+---------------------------+-----------------------------------+


## OpenVINOモデルの読み込み

In [16]:
# The tokenizer is loaded from the original model repository (model_id).
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Load the INT4 model stored under '<model_name>/INT4' and target the CPU device.
# The model config is loaded from the original repository and passed in explicitly.
ov_model = OVModelForCausalLM.from_pretrained(
    f'{model_name}/INT4',
    config=AutoConfig.from_pretrained(model_id),
    device='CPU',
    ov_config=dict(
        PERFORMANCE_HINT="LATENCY",
        NUM_STREAMS="1",
        CACHE_DIR="./cache",
    ),
)

Compiling the model to CPU ...


In [17]:
def build_prompt(user_query, inputs="", sep="\n\n### "):
    """Build the instruction prompt string that is fed to the model.

    The prompt starts with a fixed Japanese system message, followed by one
    section per role. Each section is rendered as ``sep + role + body``.

    Args:
        user_query: Instruction text placed in the "指示" section.
        inputs: Optional context text. When truthy, an "入力" section is added
            between the instruction and the response sections.
        sep: Separator written in front of every section.

    Returns:
        The assembled prompt, ending with an open "応答" section for the model
        to complete.
    """
    header = "以下は、タスクを説明する指示と、文脈のある入力の組み合わせです。要求を適切に満たす応答を書きなさい。"

    # (role, body) pairs in the order they appear in the prompt.
    sections = [("指示", ": \n" + user_query)]
    if inputs:
        sections.append(("入力", ": \n" + inputs))
    sections.append(("応答", ": "))

    return header + "".join(sep + role + body for role, body in sections)

In [18]:
# Example request: an instruction only, with the optional context ("inputs") left empty.
user_inputs = dict(
    user_query="VR とはどのようなものですか？",
    inputs="",
)
prompt = build_prompt(**user_inputs)

In [20]:
# Seed the random number generators so that the sampled response (do_sample=True)
# can be repeated when the notebook is re-run from a fresh kernel.
set_seed(42)

print(f'** Prompt:\n{prompt}\n-------------------------')
input_tokens = tokenizer(prompt, return_tensors='pt', add_special_tokens=False)

# Print the generated text as it is produced, leaving out the prompt and special tokens.
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

response = ov_model.generate(
    **input_tokens,
    eos_token_id=tokenizer.eos_token_id,
    # Passed explicitly: without it generate() falls back to eos_token_id by itself and
    # logs "Setting `pad_token_id` to `eos_token_id`" on every call.
    pad_token_id=tokenizer.eos_token_id,
    max_new_tokens=300,
    num_return_sequences=1,
    temperature=1.0,
    do_sample=True,
    top_k=5,
    top_p=0.90,
    repetition_penalty=1.2,
    streamer=streamer,
)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


** Prompt:
以下は、タスクを説明する指示と、文脈のある入力の組み合わせです。要求を適切に満たす応答を書きなさい。

### 指示: 
VR とはどのようなものですか？

### 応答: 
-------------------------

バーチャルリアリティ（Virtual Reality, VR）とは、仮想空間へユーザが没頭して疑似体験を行えるコンピュータ技術やそれらシステムの事です。現実世界に存在しない物理的オブジェクトを視認可能な状態まで表示出来るだけでなく、自由に触り動作させ操作可能な様相で提供されることが多々見受けられます。


In [None]:
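# Environment setup (run once). The extras are quoted so that shells which glob
# square brackets (e.g. zsh) pass them through unchanged.
# %pip install --upgrade --upgrade-strategy eager "optimum[openvino,nncf]"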

In [None]:
# !optimum-cli export openvino -m stabilityai/japanese-stablelm-base-gamma-7b --trust-remote-code --weight-format int4_asym_g64 --disable-stateful japanese-stablelm-base-gamma-7b/INT4