In [None]:
!pip install git+https://github.com/mobiusml/hqq

In [None]:
!pip install bitblas

In [None]:
import torch
from transformers import AutoTokenizer
from hqq.models.hf.base import AutoHQQHFModel
from hqq.core.quantize import *
from hqq.utils.patching import *
from hqq.utils.generation_hf import HFGenerator

#Settings
###################################################
backend       = "bitblas" #bitblas or gemlite for 2-bit runtime
compute_dtype = torch.bfloat16 if backend=="torchao_int4" else torch.float16
device        = 'cuda:0'
cache_dir     = '.'

#Load the model
###################################################
model_id = 'mobiuslabsgmbh/Llama-3-8b-instruct_2bitgs64_hqq'
model     = AutoHQQHFModel.from_quantized(model_id, cache_dir=cache_dir, compute_dtype=compute_dtype, device=device, adapter='adapter_v0.1.lora').eval();
tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=cache_dir)

#Use optimized inference kernels
###################################################
prepare_for_inference(model, backend=backend) #It takes a while...

#Generate
###################################################
#For longer context, make sure to allocate enough cache via the cache_size= parameter
#gen = HFGenerator(model, tokenizer, max_new_tokens=1000, do_sample=True, compile=None) #Slower generation but no warm-up
gen = HFGenerator(model, tokenizer, max_new_tokens=10, do_sample=True, compile="partial").warmup() #Faster generation, but warm-up takes a while

gen.generate("Write an essay about large language models", print_tokens=True)
#gen.generate("Tell me a funny joke!", print_tokens=True)
#gen.generate("How to make a yummy chocolate cake?", print_tokens=True)


In [None]:
gen = HFGenerator(model, tokenizer, max_new_tokens=100, do_sample=True, compile="partial").warmup() #Faster generation, but warm-up takes a while

gen.generate("Who is Napoleon Bonaparte?", print_tokens=True)

00%|██████████| 99/99 [03:12<00:00,  1.94s/it]
 41%|████▏     | 41/99 [00:03<00:04, 12.19it/s]
100%|██████████| 99/99 [00:03<00:00, 31.01it/s]
 25%|██▌       | 25/99 [00:00<00:02, 28.88it/s]
100%|██████████| 99/99 [00:03<00:00, 30.41it/s]
Napoleon Bonaparte (1768-1828) was a French military and political leader who rose to power through military conquests and reforms. He was the first French Emperor from 1793 to 1815 and is considered one of the most famous and influential figures in European history.

Born on August 29, 1768, in Corsica, Napoleon Bonaparte was the son of a French aristocrat and a Corsican noble family. He was educated at the École
{'output_text': 'Napoleon Bonaparte (1768-1828) was a French military and political leader who rose to power through military conquests and reforms. He was the first French Emperor from 1793 to 1815 and is considered one of the most famous and influential figures in European history.\n\nBorn on August 29, 1768, in Corsica, Napoleon Bonaparte was the son of a French aristocrat and a Corsican noble family. He was educated at the É',
 'output_tokens': tensor([   45,   391, 60203, 13789,   391, 20430,   320, 10967,    23,    12,
         10828,    23,     8,   574,   264,  8753,  6411,   323,  5054,  7808,
           889, 16392,   311,  2410,  1555,  6411, 62366,    82,   323, 31343,
            13,  1283,   574,   279,  1176,  8753, 35414,   505,   220, 11128,
            18,   311,   220, 10562,    20,   323,   374,  6646,   832,   315,
           279,  1455, 11495,   323, 32549, 12678,   304,  7665,  3925,   382,
         59204,   389,  6287,   220,  1682,    11,   220, 10967,    23,    11,
           304, 53618,  3074,    11, 70527, 13789,   391, 20430,   574,   279,
          4538,   315,   264,  8753, 64838, 73632,   323,   264, 53618,  7210,
         35482,  3070,    13,  1283,   574, 33142,   520,   279, 29124],
        dtype=torch.int32),
 'input_tokens': tensor([128000, 128000, 128006,    882, 128007,    271,  15546,    374,  70527,
          13789,    391,  20430,     30, 128009, 128006,  78191, 128007,    271],
        dtype=torch.int32)}

In [None]:
!pip install vllm

https://docs.vllm.ai/en/latest/getting_started/installation/gpu/index.html?device=cuda

In [None]:
!pip install git+https://github.com/snowflake-labs/vllm.git@swiftkv

In [None]:
!git clone https://github.com/Snowflake-Labs/vllm.git && cd vllm/examples

In [None]:
%cd vllm/examples


In [None]:
!dir


In [None]:
!python /content/vllm/examples/swiftkv/offline_inference_swiftkv.py

In [None]:
!export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
!python /content/vllm/examples/swiftkv/offline_inference_swiftkv.py

In [None]:
!pip install vllm

In [None]:
print("=" * 80)

In [None]:
from vllm import LLM, SamplingParams

In [None]:
from vllm import LLM, SamplingParams

llm = LLM(model="Snowflake/Llama-3.1-SwiftKV-8B-Instruct")

print("=" * 80)

conversation = [
    {
        "role": "user",
        "content": "Hello"
    },
    {
        "role": "assistant",
        "content": "Hello! How can I assist you today?"
    },
    {
        "role": "user",
        "content": "Write an essay about the importance of higher education.",
    },
]

sampling_params = SamplingParams(temperature=0.1, max_tokens=800)

outputs = llm.chat(conversation, sampling_params=sampling_params)

print(outputs[0].outputs[0].text)

In [None]:
#!pip install vllm #Ensure vllm is properly installed
import os
#os.environ["CUDA_VISIBLE_DEVICES"] = "0" #Explicitly set the CUDA device

from vllm import LLM, SamplingParams

llm = LLM(model="Snowflake/Llama-3.1-SwiftKV-8B-Instruct")

print("=" * 80)

conversation = [
    {
        "role": "user",
        "content": "Hello"
    },
    {
        "role": "assistant",
        "content": "Hello! How can I assist you today?"
    },
    {
        "role": "user",
        "content": "Write an essay about the importance of higher education.",
    },
]

sampling_params = SamplingParams(temperature=0.1, max_tokens=800)

outputs = llm.chat(conversation, sampling_params=sampling_params)

print(outputs[0].outputs[0].text)

In [None]:
import argparse
from typing import List, Tuple

from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams
from vllm.utils import FlexibleArgumentParser


def create_test_prompts() -> List[Tuple[str, SamplingParams]]:
    """Create a list of test prompts with their sampling parameters."""
    return [
        ("A robot may not injure a human being",
         SamplingParams(temperature=0.0, logprobs=1, prompt_logprobs=1)),
        ("To be or not to be,",
         SamplingParams(temperature=0.8, top_k=5, presence_penalty=0.2)),
        ("What is the meaning of life?",
         SamplingParams(n=2,
                        best_of=5,
                        temperature=0.8,
                        top_p=0.95,
                        frequency_penalty=0.1)),
    ]


def process_requests(engine: LLMEngine,
                     test_prompts: List[Tuple[str, SamplingParams]]):
    """Continuously process a list of prompts and handle the outputs."""
    request_id = 0

    while test_prompts or engine.has_unfinished_requests():
        if test_prompts:
            prompt, sampling_params = test_prompts.pop(0)
            engine.add_request(str(request_id), prompt, sampling_params)
            request_id += 1

        request_outputs: List[RequestOutput] = engine.step()

        for request_output in request_outputs:
            if request_output.finished:
                print(request_output)


def initialize_engine(args: argparse.Namespace) -> LLMEngine:
    """Initialize the LLMEngine from the command line arguments."""
    engine_args = EngineArgs.from_cli_args(args)
    return LLMEngine.from_engine_args(engine_args)


def main(args: argparse.Namespace):
    """Main function that sets up and runs the prompt processing."""
    engine = initialize_engine(args)
    test_prompts = create_test_prompts()
    process_requests(engine, test_prompts)


if __name__ == '__main__':
    parser = FlexibleArgumentParser(
        description='Demo on using the LLMEngine class directly')
    parser = EngineArgs.add_cli_args(parser)
    args = parser.parse_args()
    main(args)

In [None]:
!python /content/llm_engine_example.py --model openai-community/gpt2

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# حدد الجهاز الذي سيتم استخدامه
device = "cuda" if torch.cuda.is_available() else "cpu"

# تحميل النموذج والوسم الخاص به
tokenizer = AutoTokenizer.from_pretrained("Snowflake/Llama-3.1-SwiftKV-8B-Instruct")
model = AutoModelForCausalLM.from_pretrained(
    "Snowflake/Llama-3.1-SwiftKV-8B-Instruct",
    device_map="auto",  # توزيع النموذج على الأجهزة المتاحة
    torch_dtype=torch.float16  # استخدام float16 لتقليل استهلاك الذاكرة
)

# تعريف الحوار
conversation = [
    {"role": "user", "content": "Hello"},
    {"role": "assistant", "content": "Hello! How can I assist you today?"},
    {"role": "user", "content": "Write an essay about the importance of higher education."},
]

# تحويل الحوار إلى نص واحد
prompt = ""
for msg in conversation:
    prompt += f"{msg['role']}: {msg['content']}\n"

# توليد النص
inputs = tokenizer(prompt, return_tensors="pt").to(device)
outputs = model.generate(**inputs, max_new_tokens=800, temperature=0.1)
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

# طباعة النص المولَّد
print(generated_text)

In [None]:
from vllm import LLM, SamplingParams
import torch

def run_inference():
    # حدد الجهاز الذي سيتم استخدامه
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using device: {device}")

    # تحميل النموذج مع تحديد الجهاز
    llm = LLM(
        model="openai-community/gpt2",
        tokenizer="openai-community/gpt2",
        device=device  # تحديد الجهاز بشكل صريح
    )

    # تعريف الإعدادات
    sampling_params = SamplingParams(temperature=0.1, max_tokens=100)

    # تعريف الإدخال
    prompt = "Write an essay about the importance of higher education."

    # توليد النص
    outputs = llm.generate([prompt], sampling_params=sampling_params)

    # طباعة النص المولَّد
    for output in outputs:
        print(output.outputs[0].text)

if __name__ == "__main__":
    run_inference()

https://docs.vllm.ai/en/latest/getting_started/installation/cpu/index.html

In [None]:
from vllm import LLM, SamplingParams

# تحميل النموذج
model = LLM(model="openai-community/gpt2")

# ضبط معلمات العينة
sampling_params = SamplingParams(temperature=0.7, top_p=0.9, max_tokens=10)

# إدخال النص
prompt = "Once upon a time"

# تشغيل الاستدلال
outputs = model.generate([prompt], sampling_params)

# طباعة النتيجة
for output in outputs:
    print("Generated text:", output.outputs[0].text)


In [None]:
from vllm import LLM, SamplingParams

# تحميل النموذج مع تحديد الجهاز إلى CPU
model = LLM(model="openai-community/gpt2", device="cpu")

# ضبط معلمات العينة
sampling_params = SamplingParams(temperature=0.7, top_p=0.9, max_tokens=100)

# إدخال النص
prompt = "Once upon a time"

# تشغيل الاستدلال
outputs = model.generate([prompt], sampling_params)

# طباعة النتيجة
for output in outputs:
    print("Generated text:", output.outputs[0].text)


In [None]:
from vllm import LLM, SamplingParams
import torch

# التحقق من توفر GPU
#assert torch.cuda.is_available(), "يجب تشغيل هذا الكود على بيئة تحتوي على GPU!"

# تحميل النموذج مع استخدام CUDA
model = LLM(model="openai-community/gpt2")

# ضبط معلمات العينة
sampling_params = SamplingParams(temperature=0.7, top_p=0.9, max_tokens=100)

# إدخال النص
prompt = "Once upon a time"

# تشغيل الاستدلال
outputs = model.generate([prompt], sampling_params)

# طباعة النتيجة
for output in outputs:
    print("Generated text:", output.outputs[0].text)


https://docs.vllm.ai/en/latest/getting_started/installation/cpu/index.html#pre-built-wheels

In [None]:
!sudo apt-get update  -y
!sudo apt-get install -y gcc-12 g++-12 libnuma-dev
!sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12

In [10]:
!git clone https://github.com/Snowflake-Labs/vllm.git

Cloning into 'vllm'...
remote: Enumerating objects: 43378, done.[K
remote: Counting objects: 100% (927/927), done.[K
remote: Compressing objects: 100% (136/136), done.[K
remote: Total 43378 (delta 825), reused 805 (delta 788), pack-reused 42451 (from 1)[K
Receiving objects: 100% (43378/43378), 17.68 MiB | 16.15 MiB/s, done.
Resolving deltas: 100% (35104/35104), done.


In [11]:
%cd /content/vllm
!pip install --upgrade pip
!pip install cmake>=3.26 wheel packaging ninja "setuptools-scm>=8" numpy
!pip install -v -r /content/vllm/requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu

/content/vllm
Using pip 25.0 from /usr/local/lib/python3.11/dist-packages/pip (python 3.11)
Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cpu
Ignoring fastapi: markers 'python_version < "3.9"' don't match your environment
Ignoring six: markers 'python_version > "3.11"' don't match your environment
Ignoring setuptools: markers 'python_version > "3.11"' don't match your environment
Collecting lm-format-enforcer==0.10.6 (from -r /content/vllm/requirements-common.txt (line 20))
  Obtaining dependency information for lm-format-enforcer==0.10.6 from https://files.pythonhosted.org/packages/4f/6e/d140b5eb41541afebea1c27013bc19b5a1cafd0cd330d9aa3458833ee44a/lm_format_enforcer-0.10.6-py3-none-any.whl.metadata
  Using cached lm_format_enforcer-0.10.6-py3-none-any.whl.metadata (16 kB)
Collecting outlines<0.1,>=0.0.43 (from -r /content/vllm/requirements-common.txt (line 21))
  Obtaining dependency information for outlines<0.1,>=0.0.43 from https://files.pythonhosted.

In [2]:
!sudo apt-get install -y libdnnl-dev


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  libdnnl2
The following NEW packages will be installed:
  libdnnl-dev libdnnl2
0 upgraded, 2 newly installed, 0 to remove and 31 not upgraded.
Need to get 6,736 kB of archives.
After this operation, 45.2 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 libdnnl2 amd64 2.2.4+ds-2 [6,649 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 libdnnl-dev amd64 2.2.4+ds-2 [86.7 kB]
Fetched 6,736 kB in 2s (3,734 kB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 78, <> line 2.)
debconf: falling back to frontend: Readline
debconf: unable to initialize frontend: Readline
debconf: (This frontend requires a controlling tty.)
debconf: fa

In [3]:
!wget https://github.com/oneapi-src/oneDNN/releases/download/v3.3/dnnl-3.3.0-linux-x64.tar.gz
!tar -xvzf dnnl-3.3.0-linux-x64.tar.gz
!mv dnnl-3.3.0 /usr/local/onednn


--2025-02-05 22:00:36--  https://github.com/oneapi-src/oneDNN/releases/download/v3.3/dnnl-3.3.0-linux-x64.tar.gz
Resolving github.com (github.com)... 140.82.112.3
Connecting to github.com (github.com)|140.82.112.3|:443... connected.
HTTP request sent, awaiting response... 404 Not Found
2025-02-05 22:00:37 ERROR 404: Not Found.

tar (child): dnnl-3.3.0-linux-x64.tar.gz: Cannot open: No such file or directory
tar (child): Error is not recoverable: exiting now
tar: Child returned status 2
tar: Error is not recoverable: exiting now
mv: cannot stat 'dnnl-3.3.0': No such file or directory


In [None]:
%cd /content/vllm
!VLLM_TARGET_DEVICE=cpu python /content/vllm/setup.py install

In [2]:
!sudo apt-get update -y
!sudo apt-get install -y libdnnl-dev onednn

0% [Working]            Hit:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Get:3 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Hit:4 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:5 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Hit:7 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:8 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:9 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Fetched 257 kB in 2s (137 kB/s)
Reading package lists... Done
W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry miss

In [5]:
!sudo apt-get install libtcmalloc-minimal4 # install TCMalloc library
!find / -name *libtcmalloc* # find the dynamic link library path
!export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD # prepend the library to LD_PRELOAD
!python examples/offline_inference/basic.py # run vLLM

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
libtcmalloc-minimal4 is already the newest version (2.9.1-0ubuntu3).
libtcmalloc-minimal4 set to manually installed.
0 upgraded, 0 newly installed, 0 to remove and 31 not upgraded.
/var/lib/dpkg/info/libtcmalloc-minimal4:amd64.shlibs
/var/lib/dpkg/info/libtcmalloc-minimal4:amd64.md5sums
/var/lib/dpkg/info/libtcmalloc-minimal4:amd64.triggers
/var/lib/dpkg/info/libtcmalloc-minimal4:amd64.list
find: ‘/proc/72/task/72/net’: Invalid argument
find: ‘/proc/72/net’: Invalid argument
/usr/share/doc/libtcmalloc-minimal4
/usr/lib/x86_64-linux-gnu/libtcmalloc_and_profiler.so.4.6.4
/usr/lib/x86_64-linux-gnu/libtcmalloc.so.4
/usr/lib/x86_64-linux-gnu/libtcmalloc_and_profiler.so.4
/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal_debug.so.4.5.9
/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4
/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4.5.9
/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal_debug.so.4
/usr

In [7]:
from vllm import LLM, SamplingParams

llm = LLM(model="openai-community/gpt2")
sampling_params = SamplingParams(temperature=0.5)


def print_outputs(outputs):
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
    print("-" * 80)


print("=" * 80)

# In this script, we demonstrate how to pass input to the chat method:

conversation = [
    {
        "role": "system",
        "content": "You are a helpful assistant"
    },
    {
        "role": "user",
        "content": "Hello"
    },
    {
        "role": "assistant",
        "content": "Hello! How can I assist you today?"
    },
    {
        "role": "user",
        "content": "Write an essay about the importance of higher education.",
    },
]
outputs = llm.chat(conversation,
                   sampling_params=sampling_params,
                   use_tqdm=False)
print_outputs(outputs)

# You can run batch inference with llm.chat API
conversation = [
    {
        "role": "system",
        "content": "You are a helpful assistant"
    },
    {
        "role": "user",
        "content": "Hello"
    },
    {
        "role": "assistant",
        "content": "Hello! How can I assist you today?"
    },
    {
        "role": "user",
        "content": "Write an essay about the importance of higher education.",
    },
]
conversations = [conversation for _ in range(10)]

# We turn on tqdm progress bar to verify it's indeed running batch inference
outputs = llm.chat(messages=conversations,
                   sampling_params=sampling_params,
                   use_tqdm=True)
print_outputs(outputs)

INFO 02-05 22:09:14 config.py:1670] Downcasting torch.float32 to torch.float16.
INFO 02-05 22:09:14 llm_engine.py:237] Initializing an LLM engine (v0.4.0.post2.dev1967+gfd47e57f) with config: model='openai-community/gpt2', speculative_config=None, tokenizer='openai-community/gpt2', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=1024, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cpu, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=openai-community/gpt2, use_v

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 02-05 22:09:18 cpu_executor.py:214] # CPU blocks: 7281


ValueError: As of transformers v4.44, default chat template is no longer allowed, so you must provide a chat template if the tokenizer does not define one.

شغال

In [3]:
from transformers import pipeline

generator = pipeline("text-generation", model="gpt2")
prompts = ["Hello, my name is", "The capital of France is"]
outputs = generator(prompts, max_length=20)

for prompt, output in zip(prompts, outputs):
    print(f"Prompt: {prompt!r}, Generated text: {output[0]['generated_text']!r}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cpu
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Prompt: 'Hello, my name is', Generated text: "Hello, my name is Michael. I'm from Kansas.\n\nWhat time do you live?"
Prompt: 'The capital of France is', Generated text: 'The capital of France is now home to its largest mosque where a congregation of over 1000 Jews is holding'


In [4]:
!pip install vllm[all]

Collecting lm-format-enforcer<0.11,>=0.10.9 (from vllm[all])
  Using cached lm_format_enforcer-0.10.9-py3-none-any.whl.metadata (17 kB)
Collecting outlines==0.1.11 (from vllm[all])
  Using cached outlines-0.1.11-py3-none-any.whl.metadata (17 kB)
Collecting torch==2.5.1 (from vllm[all])
  Using cached torch-2.5.1-cp311-cp311-manylinux1_x86_64.whl.metadata (28 kB)
Collecting torchvision==0.20.1 (from vllm[all])
  Using cached torchvision-0.20.1-cp311-cp311-manylinux1_x86_64.whl.metadata (6.1 kB)
Using cached outlines-0.1.11-py3-none-any.whl (87 kB)
Using cached torch-2.5.1-cp311-cp311-manylinux1_x86_64.whl (906.5 MB)
Using cached torchvision-0.20.1-cp311-cp311-manylinux1_x86_64.whl (7.2 MB)
Using cached lm_format_enforcer-0.10.9-py3-none-any.whl (43 kB)
Installing collected packages: torch, lm-format-enforcer, torchvision, outlines
  Attempting uninstall: torch
    Found existing installation: torch 2.4.0+cpu
    Uninstalling torch-2.4.0+cpu:
      Successfully uninstalled torch-2.4.0+cp

In [2]:
from vllm import LLM, SamplingParams

# Sample prompts.
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

# Create an LLM.
llm = LLM(model="openai-community/gpt2", cpu_offload_gb=7)
# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

RuntimeError: Failed to infer device type

In [None]:
!python examples/offline_inference/basic.py

In [4]:
# SPDX-License-Identifier: Apache-2.0

from vllm import LLM, SamplingParams

# Sample prompts.
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

# Create an LLM.
llm = LLM(model="openai-community/gpt2", cpu_offload_gb=10)
# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


INFO 02-05 22:01:25 config.py:1670] Downcasting torch.float32 to torch.float16.
INFO 02-05 22:01:40 llm_engine.py:237] Initializing an LLM engine (v0.4.0.post2.dev1967+gfd47e57f) with config: model='openai-community/gpt2', speculative_config=None, tokenizer='openai-community/gpt2', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=1024, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cpu, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=openai-community/gpt2, use_v

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 02-05 22:01:45 cpu_executor.py:214] # CPU blocks: 7281


Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

ERROR 02-05 22:01:51 _custom_ops.py:62] Error in calling custom op reshape_and_cache: '_OpNamespace' '_C_cache_ops' object has no attribute 'reshape_and_cache'
ERROR 02-05 22:01:51 _custom_ops.py:62] Possibly you have built or installed an obsolete version of vllm.
ERROR 02-05 22:01:51 _custom_ops.py:62] Please try a clean build and install of vllm,or remove old built files such as vllm/*cpython*.so and build/ .


AttributeError: '_OpNamespace' '_C_cache_ops' object has no attribute 'reshape_and_cache'

In [None]:
!pip uninstall vllm -y # Remove any existing installations
%cd /content/vllm  # Navigate to the vllm directory
!VLLM_TARGET_DEVICE=cpu MAX_JOBS=2 python setup.py install # Reinstall with custom ops for CPU

In [None]:
%cd /content/vllm  # Navigate to the vllm directory
!VLLM_TARGET_DEVICE=cpu MAX_JOBS=2 python /content/vllm/setup.py install

In [None]:
!pip uninstall -y vllm
!rm -rf vllm build *.so


In [None]:
!pip install -U vllm


In [None]:
from vllm import LLM, SamplingParams

In [None]:
llm = LLM(model="facebook/opt-125m")

vllm cpu






https://docs.vllm.ai/en/latest/getting_started/installation/cpu/index.html

In [None]:
!sudo apt-get update  -y
!sudo apt-get install -y gcc-12 g++-12 libnuma-dev
!sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12

In [None]:
!git clone https://github.com/Snowflake-Labs/vllm.git

In [None]:
%cd /content/vllm
!pip install --upgrade pip
!pip install cmake>=3.26 wheel packaging ninja "setuptools-scm>=8" numpy
!pip install -v -r /content/vllm/requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu

In [None]:
!VLLM_USE_CUDA=0 pip install -e .


In [None]:
from vllm import LLM, SamplingParams

# التحقق من أن vLLM يعمل على CPU فقط
llm = LLM(model="openai-community/gpt2", tensor_parallel_size=1)

# إنشاء معلمات التوليد
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

# تجربة توليد نص
outputs = llm.generate(["The future of AI is"], sampling_params)

# طباعة النتيجة
for output in outputs:
    print("Generated text:", output.outputs[0].text)


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import deepspeed

model_id = "microsoft/Phi-3-mini-4k-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16)

# تمكين DeepSpeed
ds_model = deepspeed.init_inference(model, dtype=torch.float16, replace_method="auto")

# تشغيل الاستدلال
prompt = "ما هي الذكاء الاصطناعي؟"
inputs = tokenizer(prompt, return_tensors="pt")
outputs = ds_model.generate(**inputs, max_length=50)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))


In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import deepspeed

model_id = "openai-community/gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16)

# تمكين DeepSpeed
ds_model = deepspeed.init_inference(model, dtype=torch.float16, replace_method="auto")

# تشغيل الاستدلال
prompt = "ما هي الذكاء الاصطناعي؟"
inputs = tokenizer(prompt, return_tensors="pt")
outputs = ds_model.generate(**inputs, max_length=50)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))


ModuleNotFoundError: No module named 'deepspeed'

In [2]:
!pip install deepspeed
#0.16.3

Collecting deepspeed
  Downloading deepspeed-0.16.3.tar.gz (1.4 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.4 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m32.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting hjson (from deepspeed)
  Downloading hjson-3.1.0-py3-none-any.whl.metadata (2.6 kB)
Downloading hjson-3.1.0-py3-none-any.whl (54 kB)
Building wheels for collected packages: deepspeed
  Building wheel for deepspeed (setup.py) ... [?25l[?25hdone
  Created wheel for deepspeed: filename=deepspeed-0.16.3-py3-none-any.whl size=1549934 sha256=ee71651bfbde669b1a0a6ed0752a29cb93bda2ec5fd89299704448e49f1865bf
  Stored in directory: /root/.cache/pip/wheels/ab/dc/d4/7e7e07b11bc7c0e2a1a495b967acf58de61261eed4596fb23b
Successfully built deepspeed
Installing collected packages: hjson, deepspeed
Successfully installed deepspeed-0.16.3 h

### شغال

In [4]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import deepspeed

model_id = "openai-community/gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16)

# تمكين DeepSpeed
ds_model = deepspeed.init_inference(model, dtype=torch.float32, replace_method="auto")

# تشغيل الاستدلال
prompt = "ما هي الذكاء الاصطناعي؟"
inputs = tokenizer(prompt, return_tensors="pt")
outputs = ds_model.generate(**inputs, max_length=50)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

[2025-02-05 22:18:29,067] [INFO] [logging.py:128:log_dist] [Rank -1] DeepSpeed info: version=0.16.3, git-hash=unknown, git-branch=unknown
[2025-02-05 22:18:29,077] [INFO] [logging.py:128:log_dist] [Rank -1] quantize_bits = 8 mlp_extra_grouping = False, quantize_groups = 1


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


ما هي الذكاء الاصطناعي؟ في الأحمد بحمد بحمد بحم


In [3]:
!huggingface-cli login




    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) Y
Token is valid (permission: read).
The token `read` has been saved to /root/.cache/huggingface/stored_tokens
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate w

In [5]:
!pip install hf_transfer

Collecting hf_transfer
  Downloading hf_transfer-0.1.9-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.7 kB)
Downloading hf_transfer-0.1.9-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.6 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m43.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: hf_transfer
Successfully installed hf_transfer-0.1.9


### شغال سريع جدا ع المعالج

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import deepspeed

model_id = "meta-llama/Llama-3.2-1B"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16)

# تمكين DeepSpeed
ds_model = deepspeed.init_inference(model, dtype=torch.float32, replace_method="auto")

# تشغيل الاستدلال
prompt = "ما هي الذكاء الاصطناعي؟"
inputs = tokenizer(prompt, return_tensors="pt")
outputs = ds_model.generate(**inputs, max_length=50)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

[2025-02-05 22:27:43,569] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cpu (auto detect)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/843 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

[2025-02-05 22:28:59,362] [INFO] [logging.py:128:log_dist] [Rank -1] DeepSpeed info: version=0.16.3, git-hash=unknown, git-branch=unknown
[2025-02-05 22:28:59,368] [INFO] [logging.py:128:log_dist] [Rank -1] quantize_bits = 8 mlp_extra_grouping = False, quantize_groups = 1


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


ما هي الذكاء الاصطناعي؟ - 2016
The following is a guest post by Dr. Andrew P. N. Ware, who is a professor of computer science at the University of California, Irvine. Dr. Ware


[2025-02-05 22:27:43,548] [WARNING] [real_accelerator.py:181:get_accelerator] Setting accelerator to CPU. If you have GPU or other accelerator, we were unable to detect it.
[2025-02-05 22:27:43,569] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cpu (auto detect)
/usr/local/lib/python3.11/dist-packages/huggingface_hub/utils/_auth.py:94: UserWarning:
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
  warnings.warn(
tokenizer_config.json: 100%
 50.5k/50.5k [00:00<00:00, 1.88MB/s]
tokenizer.json: 100%
 9.09M/9.09M [00:00<00:00, 19.1MB/s]
special_tokens_map.json: 100%
 301/301 [00:00<00:00, 8.78kB/s]
config.json: 100%
 843/843 [00:00<00:00, 10.8kB/s]
model.safetensors: 100%
 2.47G/2.47G [00:58<00:00, 43.3MB/s]
generation_config.json: 100%
 185/185 [00:00<00:00, 6.63kB/s]
[2025-02-05 22:28:59,362] [INFO] [logging.py:128:log_dist] [Rank -1] DeepSpeed info: version=0.16.3, git-hash=unknown, git-branch=unknown
[2025-02-05 22:28:59,365] [WARNING] [config_utils.py:70:_process_deprecated_field] Config parameter replace_method is deprecated. This parameter is no longer needed, please remove from your call to DeepSpeed-inference
[2025-02-05 22:28:59,368] [INFO] [logging.py:128:log_dist] [Rank -1] quantize_bits = 8 mlp_extra_grouping = False, quantize_groups = 1
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
ما هي الذكاء الاصطناعي؟ - 2016
The following is a guest post by Dr. Andrew P. N. Ware, who is a professor of computer science at th

In [2]:
# تشغيل الاستدلال
prompt = "Who is Pythagoras?"
inputs = tokenizer(prompt, return_tensors="pt")
outputs = ds_model.generate(**inputs, max_length=50)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Who is Pythagoras? What was his life like? How did he create mathematics? Why did he write down his formulae for the first time? And what is it about Pythagoras that makes us so fascinated by him? In this


In [None]:
Who is Pythagoras?

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import deepspeed

model_id = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16)

# تمكين DeepSpeed
ds_model = deepspeed.init_inference(model, dtype=torch.float32, replace_method="auto")

# تشغيل الاستدلال
prompt = "Who is Pythagoras?"
inputs = tokenizer(prompt, return_tensors="pt")
outputs = ds_model.generate(**inputs, max_length=50)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

[2025-02-05 22:35:38,979] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cpu (auto detect)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/3.06k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/679 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.55G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

[2025-02-05 22:37:24,172] [INFO] [logging.py:128:log_dist] [Rank -1] DeepSpeed info: version=0.16.3, git-hash=unknown, git-branch=unknown
[2025-02-05 22:37:24,179] [INFO] [logging.py:128:log_dist] [Rank -1] quantize_bits = 8 mlp_extra_grouping = False, quantize_groups = 1


Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Who is Pythagoras? Explain in detail.
Pythagoras, also known as Pythag, was an ancient Greek mathematician, philosopher, and religious figure. He is best known for his contributions to mathematics, particularly the Pythagorean theorem


[2025-02-05 22:35:38,971] [WARNING] [real_accelerator.py:181:get_accelerator] Setting accelerator to CPU. If you have GPU or other accelerator, we were unable to detect it.
[2025-02-05 22:35:38,979] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cpu (auto detect)
/usr/local/lib/python3.11/dist-packages/huggingface_hub/utils/_auth.py:94: UserWarning:
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
  warnings.warn(
tokenizer_config.json: 100%
 3.06k/3.06k [00:00<00:00, 55.3kB/s]
tokenizer.json: 100%
 7.03M/7.03M [00:00<00:00, 16.2MB/s]
config.json: 100%
 679/679 [00:00<00:00, 19.3kB/s]
model.safetensors: 100%
 3.55G/3.55G [01:25<00:00, 42.8MB/s]
generation_config.json: 100%
 181/181 [00:00<00:00, 8.21kB/s]
[2025-02-05 22:37:24,172] [INFO] [logging.py:128:log_dist] [Rank -1] DeepSpeed info: version=0.16.3, git-hash=unknown, git-branch=unknown
[2025-02-05 22:37:24,174] [WARNING] [config_utils.py:70:_process_deprecated_field] Config parameter replace_method is deprecated. This parameter is no longer needed, please remove from your call to DeepSpeed-inference
[2025-02-05 22:37:24,179] [INFO] [logging.py:128:log_dist] [Rank -1] quantize_bits = 8 mlp_extra_grouping = False, quantize_groups = 1
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Who is Pythagoras? Explain in detail.
Pythagoras, also known as Pythag, was an ancient Greek mathematician, philosopher, and religious figure. He is best known fo

In [3]:
# تشغيل الاستدلال
prompt = "Who is python?"
inputs = tokenizer(prompt, return_tensors="pt")
outputs = ds_model.generate(**inputs, max_length=128)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Who is python? What is its purpose? What is its purpose in the context of data science?
What is a Python script? What is its purpose?
What is a Python variable? What is its purpose?
What is a Python function? What is its purpose?
What is a Python module? What is its purpose?
What is a Python class? What is its purpose?
What is a Python list? What is its purpose?
What is a Python tuple? What is its purpose?
What is a Python file? What is its purpose?
What is a Python string? What is its purpose?
What is a Python integer? What is


 3. ONNX Runtime (أفضل خيار للأداء العام على CPU)
ONNX Runtime هو إطار عام يدعم تشغيل النماذج بسرعة على المعالج مع تحسينات مخصصة للبنية التحتية.

يعمل على جميع الأجهزة (Windows, Linux, Mac).
✅ أسرع من PyTorch على CPU بنسبة 30-50%.
✅ يدعم تشغيل النماذج في بيئات الإنتاج (مثل Flask و FastAPI).

💡 إذا كنت تستخدم Phi-3 أو Llama-2 أو Mistral، فجرب llama.cpp لأنه الأسرع على المعالج!
💡 إذا كنت تريد تشغيل النماذج مع PyTorch، فاستخدم DeepSpeed.
💡 إذا كنت تريد تشغيل النماذج في بيئة إنتاج، فاستخدم ONNX Runtime.

In [2]:
!pip install onnxruntime

Collecting onnxruntime
  Downloading onnxruntime-1.20.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting coloredlogs (from onnxruntime)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime)
  Downloading humanfriendly-10.0-py2.py3-none-any.whl.metadata (9.2 kB)
Downloading onnxruntime-1.20.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (13.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.3/13.3 MB[0m [31m54.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading coloredlogs-15.0.1-py2.py3-none-any.whl (46 kB)
Downloading humanfriendly-10.0-py2.py3-none-any.whl (86 kB)
Installing collected packages: humanfriendly, coloredlogs, onnxruntime
Successfully installed coloredlogs-15.0.1 humanfriendly-10.0 onnxruntime-1.20.1


In [5]:
!pip install optimum

Collecting optimum
  Downloading optimum-1.24.0-py3-none-any.whl.metadata (21 kB)
Downloading optimum-1.24.0-py3-none-any.whl (433 kB)
Installing collected packages: optimum
Successfully installed optimum-1.24.0


### شغال

In [3]:
#!pip install onnxruntime
#!pip install optimum
#!pip install onnx
from transformers import AutoTokenizer, pipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer,GenerationConfig,TextIteratorStreamer
from optimum.onnxruntime import ORTModelForCausalLM
from transformers import logging
logging.set_verbosity_error()
import time
model_id="brianwoo/GPT2-Onnx-Quantized"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = ORTModelForCausalLM.from_pretrained(model_id)
streamer = TextStreamer(tokenizer,skip_prompt=True, skip_special_tokens=True,return_text=True)

onnx_gen = pipeline("text-generation", model=model, tokenizer=tokenizer,streamer=streamer,return_text=True)

while True:
    text = input("\nBrian:")
    print("Bot:\n")
    t0=time.time()
    gen = onnx_gen(text)
    t=time.time()-t0
    text=gen[0]["generated_text"]
    print(t,len(text.split(" "))/t,"words /sec")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/444 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/438 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/891 [00:00<?, ?B/s]

model_quantized.onnx:   0%|          | 0.00/127M [00:00<?, ?B/s]


Brian:hi
Bot:

 "Seed", "PTSC", FSC, FSU, CA, UK, HUB, HSL, US

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21
1.4529685974121094 20.647383607211587 words /sec


KeyboardInterrupt: Interrupted by user

In [None]:
!pip install onnxruntime
!pip install optimum
!pip install onnx

In [2]:
!pip install onnx

Collecting onnx
  Downloading onnx-1.17.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (16 kB)
Downloading onnx-1.17.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.0/16.0 MB[0m [31m91.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: onnx
Successfully installed onnx-1.17.0


In [4]:
from transformers import AutoTokenizer
import onnxruntime as ort
import numpy as np

# تحميل النموذج
tokenizer = AutoTokenizer.from_pretrained("gpt2")
ort_session = ort.InferenceSession("gpt2.onnx")

# تحضير الإدخال
prompt = "The future of AI is"
tokens = tokenizer(prompt, return_tensors="np")["input_ids"]

# تشغيل النموذج
outputs = ort_session.run(None, {"input_ids": tokens})
print(outputs)


NoSuchFile: [ONNXRuntimeError] : 3 : NO_SUCHFILE : Load model from gpt2.onnx failed:Load model gpt2.onnx failed. File doesn't exist

In [None]:
!pip install onnxruntime
!pip install optimum
!pip install onnx

In [1]:
#!pip install onnxruntime
#!pip install optimum
#!pip install onnx
from transformers import AutoTokenizer, pipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer,GenerationConfig,TextIteratorStreamer
from optimum.onnxruntime import ORTModelForCausalLM
from transformers import logging
logging.set_verbosity_error()
import time
model_id="onnx-community/DeepSeek-R1-Distill-Qwen-1.5B-ONNX"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = ORTModelForCausalLM.from_pretrained(model_id)
streamer = TextStreamer(tokenizer,skip_prompt=True, skip_special_tokens=True,return_text=True)

onnx_gen = pipeline("text-generation", model=model, tokenizer=tokenizer,streamer=streamer,return_text=True)

while True:
    text = input("\nBrian:")
    print("Bot:\n")
    t0=time.time()
    gen = onnx_gen(text)
    t=time.time()-t0
    text=gen[0]["generated_text"]
    print(t,len(text.split(" "))/t,"words /sec")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/6.63k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.12M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/485 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/903 [00:00<?, ?B/s]

EntryNotFoundError: 404 Client Error. (Request ID: Root=1-67a3ed6a-69ccb7533a455bc6594eb354;deff888e-541c-464e-80df-ea9500b0de50)

Entry Not Found for url: https://huggingface.co/onnx-community/DeepSeek-R1-Distill-Qwen-1.5B-ONNX/resolve/main/model.onnx.

In [2]:
!vllm serve

2025-02-05 23:02:16.197056: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1738796536.284322   31708 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1738796536.334367   31708 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
INFO 02-05 23:02:22 __init__.py:187] No platform detected, vLLM is running on UnspecifiedPlatform
usage: vllm serve <model_tag> [options]
vllm serve: error: the following arguments are required: model_tag


In [3]:
!vllm serve "onnx-community/YuE-s2-1B-general-ONNX"


2025-02-05 23:02:55.496707: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1738796575.536919   31892 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1738796575.548762   31892 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
INFO 02-05 23:03:00 __init__.py:187] No platform detected, vLLM is running on UnspecifiedPlatform
INFO 02-05 23:03:02 api_server.py:838] vLLM API server version 0.7.1
INFO 02-05 23:03:02 api_server.py:839] args: Namespace(subparser='serve', model_tag='onnx-community/YuE-s2-1B-general-ONNX', config='', host=None, port=8000, uvicorn_log_level='info', allow_credentials=False, allowed_origins=['*'], allowed_methods=['*'], allowed_headers

In [4]:
from transformers import pipeline

pipe = pipeline("text-generation", model="onnx-community/YuE-s2-1B-general-ONNX")

config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

ValueError: Could not load model onnx-community/YuE-s2-1B-general-ONNX with any of the following classes: (<class 'transformers.models.auto.modeling_auto.AutoModelForCausalLM'>, <class 'transformers.models.auto.modeling_tf_auto.TFAutoModelForCausalLM'>, <class 'transformers.models.llama.modeling_llama.LlamaForCausalLM'>). See the original errors:

while loading with AutoModelForCausalLM, an error is thrown:
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/transformers/pipelines/base.py", line 289, in infer_framework_load_model
    model = model_class.from_pretrained(model, **kwargs)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/transformers/models/auto/auto_factory.py", line 564, in from_pretrained
    return model_class.from_pretrained(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/transformers/modeling_utils.py", line 3880, in from_pretrained
    raise EnvironmentError(
OSError: onnx-community/YuE-s2-1B-general-ONNX does not appear to have a file named pytorch_model.bin, model.safetensors, tf_model.h5, model.ckpt or flax_model.msgpack.

while loading with TFAutoModelForCausalLM, an error is thrown:
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/transformers/pipelines/base.py", line 289, in infer_framework_load_model
    model = model_class.from_pretrained(model, **kwargs)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/transformers/models/auto/auto_factory.py", line 567, in from_pretrained
    raise ValueError(
ValueError: Unrecognized configuration class <class 'transformers.models.llama.configuration_llama.LlamaConfig'> for this kind of AutoModel: TFAutoModelForCausalLM.
Model type should be one of BertConfig, CamembertConfig, CTRLConfig, GPT2Config, GPT2Config, GPTJConfig, MistralConfig, OpenAIGPTConfig, OPTConfig, RemBertConfig, RobertaConfig, RobertaPreLayerNormConfig, RoFormerConfig, TransfoXLConfig, XGLMConfig, XLMConfig, XLMRobertaConfig, XLNetConfig.

while loading with LlamaForCausalLM, an error is thrown:
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/transformers/pipelines/base.py", line 289, in infer_framework_load_model
    model = model_class.from_pretrained(model, **kwargs)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/transformers/modeling_utils.py", line 3880, in from_pretrained
    raise EnvironmentError(
OSError: onnx-community/YuE-s2-1B-general-ONNX does not appear to have a file named pytorch_model.bin, model.safetensors, tf_model.h5, model.ckpt or flax_model.msgpack.




https://huggingface.co/docs/transformers/main/en/serialization

https://onnx.ai/supported-tools.html#deployModel

https://github.com/onnx/models

In [5]:
!pip install optimum[exporters]



In [6]:
!optimum-cli export onnx --help

2025-02-05 23:06:19.141097: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1738796779.184388   32760 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1738796779.198103   32760 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
usage: optimum-cli export onnx [-h] -m MODEL [--task TASK] [--opset OPSET] [--device DEVICE]
                               [--fp16] [--dtype {fp32,fp16,bf16}] [--optimize {O1,O2,O3,O4}]
                               [--monolith] [--no-post-process] [--variant VARIANT]
                               [--framework {pt,tf}] [--atol ATOL] [--cache_dir CACHE_DIR]
                               [--trust-remote-code] [--pad_token_id PAD_TO

### شغال

In [8]:
!optimum-cli export onnx --model distilbert/distilbert-base-uncased-distilled-squad distilbert_base_uncased_squad_onnx/

2025-02-05 23:07:06.675129: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1738796826.717005   32976 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1738796826.729904   32976 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
config.json: 100% 451/451 [00:00<00:00, 2.27MB/s]
model.safetensors: 100% 265M/265M [00:04<00:00, 55.2MB/s]
tokenizer_config.json: 100% 48.0/48.0 [00:00<00:00, 201kB/s]
vocab.txt: 100% 232k/232k [00:00<00:00, 3.16MB/s]
tokenizer.json: 100% 466k/466k [00:00<00:00, 3.52MB/s]


### شغال

In [9]:
from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForQuestionAnswering

tokenizer = AutoTokenizer.from_pretrained("distilbert_base_uncased_squad_onnx")
model = ORTModelForQuestionAnswering.from_pretrained("distilbert_base_uncased_squad_onnx")
inputs = tokenizer("What am I using?", "Using DistilBERT with ONNX Runtime!", return_tensors="pt")
outputs = model(**inputs)

In [10]:
print(outputs)

QuestionAnsweringModelOutput(loss=None, start_logits=tensor([[-4.7652, -1.0452, -7.0409, -4.6864, -4.0277, -6.2021, -4.9473,  2.6287,
          7.6111, -1.2488, -2.0551, -0.9350,  4.9758, -0.7707,  2.1493, -2.0703,
         -4.3232, -4.9472]]), end_logits=tensor([[ 0.4382, -1.6502, -6.3654, -6.0661, -4.1482, -3.5779, -0.0774, -3.6168,
         -1.8750, -2.8910,  6.2582,  0.5425, -3.7699,  3.8232, -1.5073,  6.2311,
          3.3604, -0.0772]]), hidden_states=None, attentions=None)


### شغال

In [None]:
from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForQuestionAnswering

tokenizer = AutoTokenizer.from_pretrained("distilbert_base_uncased_squad_onnx")
model = ORTModelForQuestionAnswering.from_pretrained("distilbert_base_uncased_squad_onnx")
inputs = tokenizer("What am I using?", "Using DistilBERT with ONNX Runtime!", return_tensors="pt")
outputs = model(**inputs)
print(outputs)

onnx-community/YuE-s2-1B-general-ONNX

### شغال

In [1]:
from transformers import AutoTokenizer, pipeline
from optimum.onnxruntime import ORTModelForSeq2SeqLM
tokenizer = AutoTokenizer.from_pretrained("echarlaix/t5-small-onnx")
model = ORTModelForSeq2SeqLM.from_pretrained("echarlaix/t5-small-onnx")
translator = pipeline("translation_en_to_fr", model=model, tokenizer=tokenizer)
results = translator("My name is Eustache and I have a pet raccoon")
print(results)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/20.7k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.63k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

encoder_model.onnx:   0%|          | 0.00/141M [00:00<?, ?B/s]

decoder_model_merged.onnx:   0%|          | 0.00/167M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

Device set to use cpu


[{'translation_text': "Mon nom est Eustache et j'ai un raccoon de compagnie."}]


In [2]:
from transformers import AutoTokenizer, pipeline
from optimum.onnxruntime import ORTModelForSeq2SeqLM
tokenizer = AutoTokenizer.from_pretrained("onnx-community/YuE-s2-1B-general-ONNX")
model = ORTModelForSeq2SeqLM.from_pretrained("eonnx-community/YuE-s2-1B-general-ONNX")
translator = pipeline("text-generation", model=model, tokenizer=tokenizer)
results = translator("what is ai?")
print(results)


tokenizer_config.json:   0%|          | 0.00/990 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/1.76M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/5.30M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

RepositoryNotFoundError: 404 Client Error. (Request ID: Root=1-67a3f252-7c067f0a3b2608e2780db895;b348bc76-7ff4-41f5-8c86-28d836616284)

Repository Not Found for url: https://huggingface.co/api/models/eonnx-community/YuE-s2-1B-general-ONNX/tree/main?recursive=True&expand=False.
Please make sure you specified the correct `repo_id` and `repo_type`.
If you are trying to access a private or gated repo, make sure you are authenticated.

In [1]:
from transformers import AutoTokenizer, pipeline
from optimum.onnxruntime import ORTModelForSeq2SeqLM

# Corrected model id
tokenizer = AutoTokenizer.from_pretrained("onnx-community/YuE-s2-1B-general-ONNX")
# Corrected model id
model = ORTModelForSeq2SeqLM.from_pretrained("onnx-community/YuE-s2-1B-general-ONNX")

translator = pipeline("text-generation", model=model, tokenizer=tokenizer)
results = translator("what is ai?")
print(results)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


FileNotFoundError: Could not find any ONNX model file for the regex ['(.*)?decoder((?!(with_past|merged)).)*?\\.onnx'] in onnx-community/YuE-s2-1B-general-ONNX.

In [2]:
from transformers import AutoTokenizer, pipeline
from optimum.onnxruntime import ORTModelForSeq2SeqLM

# Corrected model id
tokenizer = AutoTokenizer.from_pretrained("onnx-community/YuE-s2-1B-general-ONNX")
# Corrected model id
# Assuming the decoder file is named 'decoder_model.onnx'
model = ORTModelForSeq2SeqLM.from_pretrained("onnx-community/YuE-s2-1B-general-ONNX", decoder_file_name="decoder_model.onnx")

translator = pipeline("text-generation", model=model, tokenizer=tokenizer)
results = translator("what is ai?")
print(results)

FileNotFoundError: Could not find any ONNX model file for the regex ['(.*)?decoder((?!(with_past|merged)).)*?\\.onnx'] in onnx-community/YuE-s2-1B-general-ONNX.

In [3]:
from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForQuestionAnswering

tokenizer = AutoTokenizer.from_pretrained("onnx-community/YuE-s2-1B-general-ONNX")
model = ORTModelForQuestionAnswering.from_pretrained("onnx-community/YuE-s2-1B-general-ONNX")
inputs = tokenizer("What am I using?", "Using DistilBERT with ONNX Runtime!", return_tensors="pt")
outputs = model(**inputs)
print(outputs)

RuntimeError: Too many ONNX model files were found in onnx-community/YuE-s2-1B-general-ONNX, specify which one to load by using the file_name argument.

In [4]:
from transformers import pipeline
oracle = pipeline('ner', model='dicta-il/dictabert-ner', aggregation_strategy='simple')
# if we set aggregation_strategy to simple, we need to define a decoder for the tokenizer. Note that the last wordpiece of a group will still be emitted
from tokenizers.decoders import WordPiece
oracle.tokenizer.backend_tokenizer.decoder = WordPiece()
sentence = '''דוד בן-גוריון (16 באוקטובר 1886 - ו' בכסלו תשל"ד) היה מדינאי ישראלי וראש הממשלה הראשון של מדינת ישראל.'''
oracle(sentence)


config.json:   0%|          | 0.00/1.67k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/735M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.35k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.50M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.59M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

Device set to use cpu


[{'entity_group': 'PER',
  'score': 0.9999443,
  'word': 'דוד בן - גוריון',
  'start': 0,
  'end': 13},
 {'entity_group': 'TIMEX',
  'score': 0.99987966,
  'word': '16 באוקטובר 1886',
  'start': 15,
  'end': 31},
 {'entity_group': 'TIMEX',
  'score': 0.9998579,
  'word': 'ו\' בכסלו תשל"ד',
  'start': 34,
  'end': 48},
 {'entity_group': 'TTL',
  'score': 0.99963045,
  'word': 'וראש הממשלה',
  'start': 68,
  'end': 79},
 {'entity_group': 'GPE',
  'score': 0.9997943,
  'word': 'ישראל',
  'start': 96,
  'end': 101}]

In [1]:
from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForQuestionAnswering

tokenizer = AutoTokenizer.from_pretrained("onnx-community/TinyLlama-1.1B-Chat-v1.0-ONNX")
model = ORTModelForQuestionAnswering.from_pretrained("onnx-community/TinyLlama-1.1B-Chat-v1.0-ONNX")
inputs = tokenizer("What am I using?", return_tensors="pt")
outputs = model(**inputs)
print(outputs)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


RuntimeError: Too many ONNX model files were found in onnx-community/TinyLlama-1.1B-Chat-v1.0-ONNX, specify which one to load by using the file_name argument.

In [2]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

torch.random.manual_seed(0)

model = AutoModelForCausalLM.from_pretrained(
    "onnx-community/Phi-3.5-mini-instruct-onnx-web",
    device_map="auto",
    torch_dtype="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained("onnx-community/Phi-3.5-mini-instruct-onnx-web")

messages = [
    {"role": "system", "content": "You are a helpful AI assistant."},
    {"role": "user", "content": "Can you provide ways to eat combinations of bananas and dragonfruits?"},
    {"role": "assistant", "content": "Sure! Here are some ways to eat bananas and dragonfruits together: 1. Banana and dragonfruit smoothie: Blend bananas and dragonfruits together with some milk and honey. 2. Banana and dragonfruit salad: Mix sliced bananas and dragonfruits together with some lemon juice and honey."},
    {"role": "user", "content": "What about solving an 2x + 3 = 7 equation?"},
]

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

generation_args = {
    "max_new_tokens": 500,
    "return_full_text": False,
    "temperature": 0.0,
    "do_sample": False,
}

output = pipe(messages, **generation_args)
print(output[0]['generated_text'])


config.json:   0%|          | 0.00/3.41k [00:00<?, ?B/s]

OSError: onnx-community/Phi-3.5-mini-instruct-onnx-web does not appear to have a file named pytorch_model.bin, model.safetensors, tf_model.h5, model.ckpt or flax_model.msgpack.

https://github.com/microsoft/onnxruntime-inference-examples/tree/main

In [3]:
#!/usr/bin/env python
# coding=utf-8
# Copyright 2023 Microsoft Corp. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and

import argparse
from pathlib import Path
import json
import os

from azure.ai.ml import MLClient, command
from azure.ai.ml.entities import Environment, BuildContext
from azure.identity import AzureCliCredential

# run test on automode workspace
ws_config = json.load(open("ws_config.json"))
subscription_id = ws_config["subscription_id"]
resource_group = ws_config["resource_group"]
workspace_name = ws_config["workspace_name"]
compute = ws_config["compute"]
nproc_per_node = ws_config["nproc_per_node"]

def get_args(raw_args=None):
    parser = argparse.ArgumentParser()

    parser.add_argument("--experiment_name", default="MISTRAL-7B-ORT-CLM-Stage2-Experiment", help="Experiment name for AML Workspace")

    args = parser.parse_args(raw_args)
    return args

def main(raw_args=None):
    args = get_args(raw_args)

    ml_client = MLClient(
        AzureCliCredential(), subscription_id, resource_group, workspace_name
    )

    root_dir = Path(__file__).resolve().parent
    environment_dir = root_dir / "environment"
    code_dir = root_dir / "inference-code"

    model = "mistralai/Mistral-7B-v0.1"

    # https://huggingface.co/datasets/dair-ai/emotion
    dataset_name = "databricks/databricks-dolly-15k"

    inference_job = command(
        code=code_dir,  # local path where the code is stored
        command=f"bash inference_setup.sh",
        environment=Environment(build=BuildContext(path=environment_dir)),
        experiment_name="MISTRAL-7B-Inference-Experiment",
        compute=compute,
        display_name=model.replace(
            "mistral-ai",
            f"Inference-benchmark"
        ),
        description=f"Mistral AI 7B Inference Benchmark",
        tags={"model": model,
              "dataset_name": dataset_name},
        shm_size="16g"
    )

    print("submitting Inference job for " + model)
    inference_returned_job = ml_client.create_or_update(inference_job)
    print("submitted job")

    inference_aml_url = inference_returned_job.studio_url
    print("Inference Benchmark job link:", inference_aml_url)


if __name__ == "__main__":
    main()

ModuleNotFoundError: No module named 'azure'

https://github.com/microsoft/onnxruntime-inference-examples/blob/main/quantization/notebooks/bert/Bert-GLUE_OnnxRuntime_quantization.ipynb

https://onnxruntime.ai/docs/tutorials/accelerate-pytorch/pytorch.html

In [4]:
import torch
import torch.nn as nn
import torchvision.transforms as T
from torchvision.models import resnet18, ResNet18_Weights


class Predictor(nn.Module):

    def __init__(self):
        super().__init__()
        weights = ResNet18_Weights.DEFAULT
        self.resnet18 = resnet18(weights=weights, progress=False).eval()
        self.transforms = weights.transforms()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        with torch.no_grad():
            x = self.transforms(x)
            y_pred = self.resnet18(x)
            return y_pred.argmax(dim=1)


In [5]:
model_name = "bert-large-uncased-whole-word-masking-finetuned-squad"

tokenizer = transformers.BertTokenizer.from_pretrained(model_name)
model = transformers.BertForQuestionAnswering.from_pretrained(model_name)


NameError: name 'transformers' is not defined

In [6]:
import os

cache_dir = os.path.join(".", "cache_models")
if not os.path.exists(cache_dir):
    os.makedirs(cache_dir)

predict_file_url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json"
predict_file = os.path.join(cache_dir, "dev-v1.1.json")
if not os.path.exists(predict_file):
    import wget
    print("Start downloading predict file.")
    wget.download(predict_file_url, predict_file)
    print("Predict file downloaded.")

ModuleNotFoundError: No module named 'wget'

In [7]:
  from transformers import AutoTokenizer, pipeline
- from transformers import AutoModelForCausalLM
+ from optimum.onnxruntime import ORTModelForCausalLM

- model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B) # PyTorch checkpoint
+ model = ORTModelForCausalLM.from_pretrained("onnx-community/Llama-3.2-1B", subfolder="onnx") # ONNX checkpoint
  tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")

  pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
  result = pipe("He never went out without a book under his arm")

SyntaxError: unterminated string literal (detected at line 5) (<ipython-input-7-c5e8fbf80a04>, line 5)

https://huggingface.co/docs/optimum/exporters/onnx/usage_guides/export_a_model

https://huggingface.co/docs/optimum/main/en/onnxruntime/usage_guides/pipelines

https://www.sidefx.com/docs/houdini/copernicus/onnx_inference.html

https://onnxruntime.ai/docs/get-started/with-python.html

In [8]:
from transformers import AutoTokenizer, pipeline
from optimum.onnxruntime import ORTModelForCausalLM
model = ORTModelForCausalLM.from_pretrained("onnx-community/Llama-3.2-1B", subfolder="onnx")
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
result = pipe("He never went out without a book under his arm")

config.json:   0%|          | 0.00/850 [00:00<?, ?B/s]

FileNotFoundError: Could not find any ONNX model file for the regex ['^((?!decoder).)*.onnx', '(.*)?decoder(.*)?with_past(.*)?\\.onnx'] in onnx-community/Llama-3.2-1B/onnx.

https://onnxruntime.ai/

### ayhgشغال

In [9]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch

tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("google-bert/bert-base-uncased")

inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
outputs = model(**inputs, labels=labels)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
  from transformers import AutoTokenizer, pipeline
- from transformers import AutoModelForCausalLM
+ from optimum.onnxruntime import ORTModelForCausalLM

- model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B) # PyTorch checkpoint
+ model = ORTModelForCausalLM.from_pretrained("onnx-community/Llama-3.2-1B", subfolder="onnx") # ONNX checkpoint
  tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")

  pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
  result = pipe("He never went out without a book under his arm")

In [10]:
from transformers import AutoTokenizer, pipeline
#from transformers import AutoModelForCausalLM
from optimum.onnxruntime import ORTModelForCausalLM

In [12]:
from transformers import AutoTokenizer, pipeline
from optimum.onnxruntime import ORTModelForCausalLM

# Update the loading to specify the correct decoder file name
model = ORTModelForCausalLM.from_pretrained("onnx-community/Llama-3.2-1B", file_name="decoder_model.onnx", subfolder="onnx") # ONNX checkpoint
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")

pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
result = pipe("He never went out without a book under his arm")

EntryNotFoundError: 404 Client Error. (Request ID: Root=1-67a3fc95-50c3371f008d9c9021b86bea;7be09e01-10d3-4193-b1a2-ba898a891a52)

Entry Not Found for url: https://huggingface.co/onnx-community/Llama-3.2-1B/resolve/main/onnx/decoder_model.onnx.

In [11]:
model = ORTModelForCausalLM.from_pretrained("onnx-community/Llama-3.2-1B", subfolder="onnx")

FileNotFoundError: Could not find any ONNX model file for the regex ['^((?!decoder).)*.onnx', '(.*)?decoder(.*)?with_past(.*)?\\.onnx'] in onnx-community/Llama-3.2-1B/onnx.

In [None]:
- from diffusers import DiffusionPipeline
+ from optimum.onnxruntime import ORTDiffusionPipeline

  model_id = "runwayml/stable-diffusion-v1-5"
- pipeline = DiffusionPipeline.from_pretrained(model_id)
+ pipeline = ORTDiffusionPipeline.from_pretrained(model_id, revision="onnx")
  prompt = "sailing ship in storm by Leonardo da Vinci"
  image = pipeline(prompt).images[0]

In [15]:
from transformers import AutoTokenizer, pipeline
from optimum.onnxruntime import ORTModelForCausalLM

# Load the ONNX model and tokenizer
model = ORTModelForCausalLM.from_pretrained("onnx-community/Llama-3.2-1B", subfolder="onnx")  # ONNX checkpoint
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")

# Create the text generation pipeline
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Generate text
result = pipe("He never went out without a book under his arm")
print(result)


FileNotFoundError: Could not find any ONNX model file for the regex ['^((?!decoder).)*.onnx', '(.*)?decoder(.*)?with_past(.*)?\\.onnx'] in onnx-community/Llama-3.2-1B/onnx.

In [14]:
!pip install optimum onnxruntime



In [None]:
from transformers import AutoTokenizer, pipeline
from optimum.onnxruntime import ORTModelForCausalLM

# Update the loading to specify the correct decoder file name
model = ORTModelForCausalLM.from_pretrained("onnx-community/Llama-3.2-1B", file_name="decoder_model.onnx", subfolder="onnx") # ONNX checkpoint
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")

pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
result = pipe("He never went out without a book under his arm")
print(result)

In [18]:
from transformers import AutoTokenizer, pipeline
from optimum.onnxruntime import ORTModelForCausalLM

# Update the loading to specify the correct decoder file name
model = ORTModelForCausalLM.from_pretrained("onnx-community/Llama-3.2-1B", file_name="decoder_model.onnx", subfolder="onnx") # ONNX checkpoint
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")

pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
result = pipe("He never went out without a book under his arm")
print(result)

EntryNotFoundError: 404 Client Error. (Request ID: Root=1-67a3fd40-3b065c17005613cf70aa9464;26833411-5531-4e68-aea3-de653427bdd6)

Entry Not Found for url: https://huggingface.co/onnx-community/Llama-3.2-1B/resolve/main/onnx/decoder_model.onnx.

In [17]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load PyTorch model
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B")
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")

# Set model to evaluation mode
model.eval()

# Example input for tracing
input_ids = tokenizer("He never went out without a book under his arm", return_tensors="pt").input_ids

# Convert model to ONNX
torch.onnx.export(model, input_ids, "llama_3.2_1B.onnx", input_names=["input_ids"], output_names=["logits"])


  if sequence_length != 1:


RuntimeError: Only tuples, lists and Variables are supported as JIT inputs/outputs. Dictionaries and strings are also accepted, but their usage is not recommended. Here, received an input of unsupported type: DynamicCache

https://huggingface.co/docs/optimum/onnxruntime/usage_guides/models

In [20]:
  from sentence_transformers import SentenceTransformer
from sentence_transformers import SentenceTransformer

model_id = "sentence-transformers/all-MiniLM-L6-v2"
model = SentenceTransformer(model_id)

sentences = ["This is an example sentence", "Each sentence is converted"]
embeddings = model.encode(sentences)
  sentences = ["This is an example sentence", "Each sentence is converted"]
  embeddings = model.encode(sentences)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

https://huggingface.co/docs/optimum/exporters/onnx/usage_guides/export_a_model

In [21]:
!optimum-cli export onnx --model gpt2 gpt2_onnx/

2025-02-06 00:10:30.791634: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1738800630.823547   48417 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1738800630.832536   48417 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
  if (input_shape[-1] > 1 or self.sliding_window is not None) and self.is_causal:
  if past_key_values_length > 0:
Found different candidate ONNX initializers (likely duplicate) for the tied weights:
	lm_head.weight: {'onnx::MatMul_3510'}
	transformer.wte.weight: {'transformer.wte.weight'}


In [24]:
from transformers import AutoTokenizer, pipeline
from optimum.onnxruntime import ORTModelForCausalLM # Import ORTModelForCausalLM, not ORTModelForQuestionAnswering

tokenizer = AutoTokenizer.from_pretrained("gpt2_onnx")
model = ORTModelForCausalLM.from_pretrained("gpt2_onnx", subfolder="onnx") # Load as a causal language model

# Use the text-generation pipeline
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
result = pipe("What am I using? Using DistilBERT with ONNX Runtime!") # Provide your text as input to the pipeline
print(result)

RepositoryNotFoundError: 404 Client Error. (Request ID: Root=1-67a3fee9-4f16f99b62f00c753380a7ec;ddcc8169-413d-49c0-b976-fa5775a57893)

Repository Not Found for url: https://huggingface.co/api/models/gpt2_onnx/tree/main?recursive=True&expand=False.
Please make sure you specified the correct `repo_id` and `repo_type`.
If you are trying to access a private or gated repo, make sure you are authenticated.

In [None]:
from transformers import AutoTokenizer, pipeline
from optimum.onnxruntime import ORTModelForCausalLM
model = ORTModelForCausalLM.from_pretrained("gpt2_onnx", subfolder="onnx")
tokenizer = AutoTokenizer.from_pretrained("gpt2_onnx")
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
result = pipe("He never went out without a book under his arm")


In [26]:
from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForQuestionAnswering

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = ORTModelForQuestionAnswering.from_pretrained("gpt2_onnx")
inputs = tokenizer("What am I using?", "Using DistilBERT with ONNX Runtime!", return_tensors="pt")
outputs = model(**inputs)

ValueError: Input past_key_values.0.key is required by model but not provided.

In [None]:
meta-llama/Llama-3.2-1B

In [None]:
!optimum-cli export onnx --model meta-llama/Llama-3.2-1B onnx/

2025-02-06 00:16:12.027092: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1738800972.283371   49750 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1738800972.352046   49750 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [None]:
!optimum-cli export onnx --model /content/gpt2_onnx --task question-answering gpt2_onnx /

In [1]:
from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForQuestionAnswering

tokenizer = AutoTokenizer.from_pretrained("gpt2_onnx")
model = ORTModelForQuestionAnswering.from_pretrained("gpt2_onnx")
inputs = tokenizer("What am I using?", "Using gpt2 with ONNX Runtime!", return_tensors="pt")
outputs = model(**inputs)

ValueError: Input past_key_values.0.key is required by model but not provided.

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load PyTorch model
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B")
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")

# Set model to evaluation mode
model.eval()

# Example input for tracing
input_ids = tokenizer("He never went out without a book under his arm", return_tensors="pt").input_ids

# Convert model to ONNX
torch.onnx.export(model, input_ids, "llama_3.2_1B.onnx", input_names=["input_ids"], output_names=["logits"])


In [None]:
!optimum-cli export onnx --model distilbert-base-uncased-distilled-squad distilbert_base_uncased_squad_onnx/

In [None]:
import os
import numpy as np
from onnxruntime import InferenceSession

# Tokens produced by phonemize() and tokenize() in kokoro.py
tokens = [50, 157, 43, 135, 16, 53, 135, 46, 16, 43, 102, 16, 56, 156, 57, 135, 6, 16, 102, 62, 61, 16, 70, 56, 16, 138, 56, 156, 72, 56, 61, 85, 123, 83, 44, 83, 54, 16, 53, 65, 156, 86, 61, 62, 131, 83, 56, 4, 16, 54, 156, 43, 102, 53, 16, 156, 72, 61, 53, 102, 112, 16, 70, 56, 16, 138, 56, 44, 156, 76, 158, 123, 56, 16, 62, 131, 156, 43, 102, 54, 46, 16, 102, 48, 16, 81, 47, 102, 54, 16, 54, 156, 51, 158, 46, 16, 70, 16, 92, 156, 135, 46, 16, 54, 156, 43, 102, 48, 4, 16, 81, 47, 102, 16, 50, 156, 72, 64, 83, 56, 62, 16, 156, 51, 158, 64, 83, 56, 16, 44, 157, 102, 56, 16, 44, 156, 76, 158, 123, 56, 4]

# Context length is 512, but leave room for the pad token 0 at the start & end
assert len(tokens) <= 510, len(tokens)

# Style vector based on len(tokens), ref_s has shape (1, 256)
voices = np.fromfile('./voices/af.bin', dtype=np.float32).reshape(-1, 1, 256)
ref_s = voices[len(tokens)]

# Add the pad ids, and reshape tokens, should now have shape (1, <=512)
tokens = [[0, *tokens, 0]]

model_name = 'model.onnx' # Options: model.onnx, model_fp16.onnx, model_quantized.onnx, model_q8f16.onnx, model_uint8.onnx, model_uint8f16.onnx, model_q4.onnx, model_q4f16.onnx
sess = InferenceSession(os.path.join('onnx', model_name))

audio = sess.run(None, dict(
    input_ids=tokens,
    style=ref_s,
    speed=np.ones(1, dtype=np.float32),
))[0]


@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@

### شغال

In [None]:
!optimum-cli export onnx --model gpt2 gpt2_onnx/

شغال
### /content/gpt2_onnx

In [1]:
from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForCausalLM

# Load the tokenizer and ONNX model
tokenizer = AutoTokenizer.from_pretrained("gpt2_onnx")
model = ORTModelForCausalLM.from_pretrained("gpt2_onnx")

# Tokenize the input text
inputs = tokenizer("What am I using?", return_tensors="pt")

# Generate text
outputs = model.generate(**inputs)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


What am I using?

I'm using the following:

- The following is a list of all the files


@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@

In [2]:
from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForCausalLM

# Load the tokenizer and ONNX model
tokenizer = AutoTokenizer.from_pretrained("distilbert_base_uncased_squad_onnx")
model = ORTModelForCausalLM.from_pretrained("distilbert_base_uncased_squad_onnx")

# Tokenize the input text
inputs = tokenizer("What am I using?", return_tensors="pt")

# Generate text
outputs = model.generate(**inputs)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))


ValueError: `use_cache` was set to `True` but the loaded model only supports `use_cache=False`. Please load your current model with `use_cache=False` or export the original model once again with `use_cache=True` when calling the `from_pretrained` method. To export your model, simply set `export=True`.

In [None]:
use_cache=False

In [3]:
from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForCausalLM

# Load the tokenizer and ONNX model
tokenizer = AutoTokenizer.from_pretrained("distilbert_base_uncased_squad_onnx")
model = ORTModelForCausalLM.from_pretrained("distilbert_base_uncased_squad_onnx", use_cache=False)

# Tokenize the input text
inputs = tokenizer("What am I using?", return_tensors="pt")

# Generate text
outputs = model.generate(**inputs)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))


ValueError: The parameters combination use_cache=False, use_io_binding=True is not supported. Please either pass use_cache=True, use_io_binding=True (default), or use_cache=False, use_io_binding=False.

In [4]:
from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForCausalLM

# Load the tokenizer and ONNX model
tokenizer = AutoTokenizer.from_pretrained("distilbert_base_uncased_squad_onnx")
# Set use_io_binding to False to align with use_cache=False
model = ORTModelForCausalLM.from_pretrained("distilbert_base_uncased_squad_onnx", use_cache=False, use_io_binding=False)

# Tokenize the input text
inputs = tokenizer("What am I using?", return_tensors="pt")

# Generate text
outputs = model.generate(**inputs)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

KeyError: 'logits'

In [None]:
#!pip install transformers optimum onnx onnxruntime

    from transformers import AutoTokenizer
    from optimum.onnxruntime import ORTModelForCausalLM

    # Try GPT2 or another suitable model
    model_id = "gpt2"  # or "facebook/opt-125m", or other causal LM models
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = ORTModelForCausalLM.from_pretrained(model_id, use_cache=False, use_io_binding=False)

    # ... (rest of your code)

In [5]:
#!pip install transformers optimum onnx onnxruntime

from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForCausalLM

# Try GPT2 or another suitable model
model_id = "gpt2"  # or "facebook/opt-125m", or other causal LM models
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = ORTModelForCausalLM.from_pretrained(model_id, use_cache=False, use_io_binding=False)

# Tokenize the input text
inputs = tokenizer("What am I using?", return_tensors="pt")

# Generate text
outputs = model.generate(**inputs)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


EntryNotFoundError: 404 Client Error. (Request ID: Root=1-67a40307-098723232dae63ed4e36147d;8f0d01fe-3088-45d5-8d7d-f809f74ffbe2)

Entry Not Found for url: https://huggingface.co/gpt2/resolve/main/decoder_model.onnx.

In [6]:
from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForCausalLM

# Load the tokenizer and ONNX model
tokenizer = AutoTokenizer.from_pretrained("distilbert_base_uncased_squad_onnx")
model = ORTModelForCausalLM.from_pretrained("distilbert_base_uncased_squad_onnx")

# Tokenize the input text
inputs = tokenizer("What am I using?", return_tensors="pt")

# Generate text
outputs = model.generate(**inputs)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))


ValueError: `use_cache` was set to `True` but the loaded model only supports `use_cache=False`. Please load your current model with `use_cache=False` or export the original model once again with `use_cache=True` when calling the `from_pretrained` method. To export your model, simply set `export=True`.

### aشغال

In [8]:
from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForQuestionAnswering

tokenizer = AutoTokenizer.from_pretrained("distilbert_base_uncased_squad_onnx")
model = ORTModelForQuestionAnswering.from_pretrained("distilbert_base_uncased_squad_onnx")
inputs = tokenizer("What am I using?", "Using DistilBERT with ONNX Runtime!", return_tensors="pt")
outputs = model(**inputs)
print(outputs)

QuestionAnsweringModelOutput(loss=None, start_logits=tensor([[-4.7652, -1.0452, -7.0409, -4.6864, -4.0277, -6.2021, -4.9473,  2.6287,
          7.6111, -1.2488, -2.0551, -0.9350,  4.9758, -0.7707,  2.1493, -2.0703,
         -4.3232, -4.9472]]), end_logits=tensor([[ 0.4382, -1.6502, -6.3654, -6.0661, -4.1482, -3.5779, -0.0774, -3.6168,
         -1.8750, -2.8910,  6.2582,  0.5425, -3.7699,  3.8232, -1.5073,  6.2311,
          3.3604, -0.0772]]), hidden_states=None, attentions=None)


In [9]:
from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForQuestionAnswering
import torch

# Load the tokenizer and ONNX model
tokenizer = AutoTokenizer.from_pretrained("distilbert_base_uncased_squad_onnx")
model = ORTModelForQuestionAnswering.from_pretrained("distilbert_base_uncased_squad_onnx")

# Define the input text and context
question = "What am I using?"
context = "Using DistilBERT with ONNX Runtime!"

# Tokenize the input text
inputs = tokenizer(question, context, return_tensors="pt")

# Get model outputs
outputs = model(**inputs)

# Extract start and end logits
start_logits = outputs.start_logits
end_logits = outputs.end_logits

# Get the most likely start and end positions
start_index = torch.argmax(start_logits)
end_index = torch.argmax(end_logits)

# Convert token positions back to the text answer
answer_tokens = inputs.input_ids[0][start_index:end_index+1]
answer = tokenizer.decode(answer_tokens, skip_special_tokens=True)

print(f"Answer: {answer}")


Answer: distilbert


### ayhgشغال

In [11]:
from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForQuestionAnswering
import torch

# Load the tokenizer and ONNX model
tokenizer = AutoTokenizer.from_pretrained("distilbert_base_uncased_squad_onnx")
model = ORTModelForQuestionAnswering.from_pretrained("distilbert_base_uncased_squad_onnx")

# Define the input text and context
question = "what is use?"
context = "Using DistilBERT with ONNX Runtime!"

# Tokenize the input text
inputs = tokenizer(question, context, return_tensors="pt")

# Get model outputs
outputs = model(**inputs)

# Extract start and end logits
start_logits = outputs.start_logits
end_logits = outputs.end_logits

# Get the most likely start and end positions
start_index = torch.argmax(start_logits)
end_index = torch.argmax(end_logits)

# Convert token positions back to the text answer
answer_tokens = inputs.input_ids[0][start_index:end_index+1]
answer = tokenizer.decode(answer_tokens, skip_special_tokens=True)

print(f"Answer: {answer}")


Answer: distilbert with onnx runtime


شغال

In [12]:
from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForQuestionAnswering
import torch

# تحميل المحول والنموذج ONNX
tokenizer = AutoTokenizer.from_pretrained("distilbert_base_uncased_squad_onnx")
model = ORTModelForQuestionAnswering.from_pretrained("distilbert_base_uncased_squad_onnx")

# النص والسياق
question = "What am I using?"
context = "Using DistilBERT with ONNX Runtime!"

# تحويل النص إلى رموز
inputs = tokenizer(question, context, return_tensors="pt")

# الحصول على النتائج من النموذج
outputs = model(**inputs)

# استخراج قيم بدء ونهاية الإجابة
start_logits = outputs.start_logits
end_logits = outputs.end_logits

# الحصول على أكثر النقاط احتمالًا للبداية والنهاية
start_index = torch.argmax(start_logits)
end_index = torch.argmax(end_logits)

# إذا كانت النقطة النهائية قبل نقطة البداية، نعدلها لتكون بعد النقطة الأخيرة
if end_index < start_index:
    end_index = start_index

# استخراج الإجابة
answer_tokens = inputs.input_ids[0][start_index:end_index + 1]
answer = tokenizer.decode(answer_tokens, skip_special_tokens=True)

# طباعة الإجابة
print(f"Answer: {answer}")


Answer: distilbert


https://huggingface.co/docs/transformers/main/en/serialization

https://huggingface.co/docs/optimum/exporters/onnx/usage_guides/export_a_model

### شغال

In [1]:
from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForCausalLM
from transformers import pipeline

# تحميل المحول (Tokenizer) والنموذج (ONNX)
tokenizer = AutoTokenizer.from_pretrained("/content/gpt2_onnx")
model = ORTModelForCausalLM.from_pretrained("/content/gpt2_onnx")

# إنشاء pipeline للاستدلال بواسطة النموذج
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

# توليد النص
result = pipe("Once upon a time")

# طباعة النتيجة
print(result)


Device set to use cpu
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'Once upon a time of utter destinies, he could be found in a grave, holding a dagger, and a long, unkempt beard, bearing a sword with a red, black, or green cross near that, with a pair'}]


In [2]:
from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForCausalLM
from transformers import pipeline

# تحميل المحول (Tokenizer) والنموذج (ONNX)
tokenizer = AutoTokenizer.from_pretrained("onnx-community/DeepSeek-R1-Distill-Qwen-1.5B-ONNX")
model = ORTModelForCausalLM.from_pretrained("onnx-community/DeepSeek-R1-Distill-Qwen-1.5B-ONNX")

# إنشاء pipeline للاستدلال بواسطة النموذج
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

# توليد النص
result = pipe("Once upon a time")

# طباعة النتيجة
print(result)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


EntryNotFoundError: 404 Client Error. (Request ID: Root=1-67a405ab-2df0552371a1540740d8c9e5;8a008150-ccb9-4523-ac9b-096e2a0a0a18)

Entry Not Found for url: https://huggingface.co/onnx-community/DeepSeek-R1-Distill-Qwen-1.5B-ONNX/resolve/main/model.onnx.

In [4]:
!git clone https://huggingface.co/HuggingFaceTB/SmolVLM-500M-Instruct

Cloning into 'SmolVLM-500M-Instruct'...
remote: Enumerating objects: 91, done.[K
remote: Counting objects: 100% (88/88), done.[K
remote: Compressing objects: 100% (88/88), done.[K
remote: Total 91 (delta 31), reused 0 (delta 0), pack-reused 3 (from 1)[K
Unpacking objects: 100% (91/91), 1.16 MiB | 1.73 MiB/s, done.
Filtering content: 100% (25/25), 6.42 GiB | 37.88 MiB/s, done.


In [None]:
from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForCausalLM
from transformers import pipeline

# تحميل المحول (Tokenizer) والنموذج (ONNX)
tokenizer = AutoTokenizer.from_pretrained("/content/SmolVLM-500M-Instruct/onnx")
model = ORTModelForCausalLM.from_pretrained("/content/SmolVLM-500M-Instruct/onnx")

# إنشاء pipeline للاستدلال بواسطة النموذج
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

# توليد النص
result = pipe("Once upon a time")

# طباعة النتيجة
print(result)


In [None]:
# تحميل المحول (Tokenizer) والنموذج (ONNX)
# Assuming the tokenizer is located in the main directory
tokenizer = AutoTokenizer.from_pretrained("/content/SmolVLM-500M-Instruct")
model = ORTModelForCausalLM.from_pretrained("/content/SmolVLM-500M-Instruct/onnx")

# ... (rest of the code remains the same)

In [7]:
from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForCausalLM
from transformers import pipeline

# تحميل المحول (Tokenizer) والنموذج (ONNX)
tokenizer = AutoTokenizer.from_pretrained("/content/SmolVLM-500M-Instruct")
model = ORTModelForCausalLM.from_pretrained("/content/SmolVLM-500M-Instruct/onnx")


# إنشاء pipeline للاستدلال بواسطة النموذج
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

# توليد النص
result = pipe("Once upon a time")

# طباعة النتيجة
print(result)


ValueError: The library name could not be automatically inferred. If using the command-line, please provide the argument --library {transformers,diffusers,timm,sentence_transformers}. Example: `--library diffusers`.

https://huggingface.co/docs/optimum/exporters/task_manager

In [1]:
from optimum.exporters.tasks import TasksManager

model_type = "distilbert"
# For instance, for the ONNX export.
backend = "onnx"
distilbert_tasks = list(TasksManager.get_supported_tasks_for_model_type(model_type, backend).keys())

print(distilbert_tasks)

Not passing the argument `library_name` to `get_supported_tasks_for_model_type` is deprecated and the support will be removed in a future version of Optimum. Please specify a `library_name`. Defaulting to `"transformers`.


['feature-extraction', 'fill-mask', 'text-classification', 'multiple-choice', 'token-classification', 'question-answering']


In [2]:
import onnxruntime as ort
# Load the model and create InferenceSession
model_path = "/content/gpt2_onnx"
session = ort.InferenceSession(model_path)
# "Load and preprocess the input image inputTensor"
...
# Run inference
outputs = session.run(None, {"input": inputTensor})
print(outputs)

InvalidProtobuf: [ONNXRuntimeError] : 7 : INVALID_PROTOBUF : Load model from /content/gpt2_onnx failed:Protobuf parsing failed.

In [3]:
import os
model_path = "/content/gpt2_onnx"
print(os.path.exists(model_path))  # Should print True if the file exists

True


In [8]:
import onnxruntime as ort
import os

# Check if the model file exists
model_path = "/content/gpt2_onnx"

outputs = session.run(None, {"input": inputTensor})
print(outputs)


NameError: name 'session' is not defined

In [10]:
import onnxruntime as ort
import os
import numpy as np

# Check if the model file exists
model_path = "/content/gpt2_onnx"

if not os.path.exists(model_path):
    print(f"Error: Model file not found at {model_path}")
    # Download or copy the model file to the correct location
else:
    print(f"Model file found at {model_path}")

try:
    # Attempt to load the model
    session = ort.InferenceSession(model_path)
    print("Model loaded successfully!")

    # Preprocess the input
    # Replace with your actual input preprocessing logic
    input_text = "hi."

    # Assuming your model expects a single input named 'input_ids'
    input_ids = np.array([[1, 2, 3

SyntaxError: incomplete input (<ipython-input-10-8070967bc7c3>, line 24)

In [11]:
import onnxruntime as ort
import os
import numpy as np

# Check if the model file exists
model_path = "/content/gpt2_onnx"

if not os.path.exists(model_path):
    print(f"Error: Model file not found at {model_path}")
    # Download or copy the model file to the correct location
else:
    print(f"Model file found at {model_path}")

try:
    # Attempt to load the model
    session = ort.InferenceSession(model_path)
    print("Model loaded successfully!")

    # Preprocess the input
    # Replace with your actual input preprocessing logic
    input_text = "This is a sample input text."

    # Assuming your model expects a single input named 'input_ids'
    input_ids = np.array([[1, 2, 3, 4, 5]])  # Replace with actual token IDs

    # Create the input feed dictionary
    input_feed = {"input_ids": input_ids}

    # Run inference
    outputs = session.run(None, input_feed)
    print(outputs)

except ort.ONNXRuntimeError as e:
    if "INVALID_PROTOBUF" in str(e):
        print("Error loading model: Likely a corrupted or incompatible model file.")
    else:
        print(f"Error loading model: {e}")

Model file found at /content/gpt2_onnx


AttributeError: module 'onnxruntime' has no attribute 'ONNXRuntimeError'

InvalidProtobuf: [ONNXRuntimeError] : 7 : INVALID_PROTOBUF : Load model from /content/gpt2_onnx failed:Protobuf parsing failed.

In [12]:
import onnxruntime as ort
import os

# تحديد مسار النموذج
model_path = "/content/gpt2_onnx"

# إنشاء جلسة (InferenceSession) مع النموذج
session = ort.InferenceSession(model_path)

# تحضير المدخلات (inputTensor يجب أن يكون معرّفًا مسبقًا)
# تأكد من أنك قد قمت بتحويل النص إلى تنسيق Tensor مناسب، مثلاً:
inputTensor = ...

# تشغيل الاستدلال
outputs = session.run(None, {"input": inputTensor})

# طباعة النتائج
print(outputs)


InvalidProtobuf: [ONNXRuntimeError] : 7 : INVALID_PROTOBUF : Load model from /content/gpt2_onnx failed:Protobuf parsing failed.

In [13]:
from huggingface_hub import hf_hub_download
model_path = hf_hub_download(repo_id="onnx-community/gpt2_onnx", filename="model.onnx")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


RepositoryNotFoundError: 404 Client Error. (Request ID: Root=1-67a409b2-4eee46a40f9062a73f53185c;3f030613-c908-48f0-81d4-3c180daafb79)

Repository Not Found for url: https://huggingface.co/onnx-community/gpt2_onnx/resolve/main/model.onnx.
Please make sure you specified the correct `repo_id` and `repo_type`.
If you are trying to access a private or gated repo, make sure you are authenticated.

In [14]:
import onnx
model = onnx.load("/content/gpt2_onnx")
onnx.checker.check_model(model)


IsADirectoryError: [Errno 21] Is a directory: '/content/gpt2_onnx'

In [15]:
import os
print(os.listdir('/content/gpt2_onnx'))


['tokenizer.json', 'tokenizer_config.json', 'generation_config.json', 'merges.txt', 'config.json', 'model.onnx', 'special_tokens_map.json', 'vocab.json']


In [16]:
import onnxruntime as ort
import os

# Check the contents of the directory
model_dir = '/content/gpt2_onnx'
print(os.listdir(model_dir))  # This will show the files in the directory

# Define the full path to the ONNX model
model_path = os.path.join(model_dir, 'model.onnx')  # Adjust if needed

# Load the model
session = ort.InferenceSession(model_path)

# Run inference
inputs = tokenizer("What am I using?", return_tensors="pt")
inputTensor = inputs['input_ids'].numpy()  # Convert to numpy if needed

outputs = session.run(None, {"input_ids": inputTensor})
print(outputs)


['tokenizer.json', 'tokenizer_config.json', 'generation_config.json', 'merges.txt', 'config.json', 'model.onnx', 'special_tokens_map.json', 'vocab.json']


NameError: name 'tokenizer' is not defined

In [17]:
import onnxruntime as ort
import os

# Define the model directory and file path
model_dir = '/content/gpt2_onnx'
model_path = os.path.join(model_dir, 'model.onnx')  # Full path to the ONNX model file

# Load the ONNX model
session = ort.InferenceSession(model_path)

# Example input for the model
inputs = tokenizer("What am I using?", return_tensors="pt")
input_tensor = inputs['input_ids'].numpy()  # Convert to numpy if needed

# Run inference
outputs = session.run(None, {"input_ids": input_tensor})
print(outputs)


NameError: name 'tokenizer' is not defined

In [18]:
import onnxruntime as ort
import os
from transformers import AutoTokenizer

# Define the model directory and file path
model_dir = '/content/gpt2_onnx'
model_path = os.path.join(model_dir, 'model.onnx')  # Full path to the ONNX model file

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_dir)

# Load the ONNX model
session = ort.InferenceSession(model_path)

# Example input for the model
inputs = tokenizer("What am I using?", return_tensors="pt")
input_tensor = inputs['input_ids'].numpy()  # Convert to numpy if needed

# Run inference
outputs = session.run(None, {"input_ids": input_tensor})
print(outputs)


ValueError: Required inputs (['past_key_values.0.key', 'past_key_values.0.value', 'past_key_values.1.key', 'past_key_values.1.value', 'past_key_values.2.key', 'past_key_values.2.value', 'past_key_values.3.key', 'past_key_values.3.value', 'past_key_values.4.key', 'past_key_values.4.value', 'past_key_values.5.key', 'past_key_values.5.value', 'past_key_values.6.key', 'past_key_values.6.value', 'past_key_values.7.key', 'past_key_values.7.value', 'past_key_values.8.key', 'past_key_values.8.value', 'past_key_values.9.key', 'past_key_values.9.value', 'past_key_values.10.key', 'past_key_values.10.value', 'past_key_values.11.key', 'past_key_values.11.value', 'attention_mask', 'position_ids']) are missing from input feed (['input_ids']).

In [19]:
import onnxruntime as ort
import os
from transformers import AutoTokenizer

# Define the model directory and file path
model_dir = '/content/gpt2_onnx'
model_path = os.path.join(model_dir, 'model.onnx')  # Full path to the ONNX model file

# Load the tokenizer from the same directory
tokenizer = AutoTokenizer.from_pretrained(model_dir)

# Load the ONNX model
session = ort.InferenceSession(model_path)

# Example input for the model
inputs = tokenizer("What am I using?", return_tensors="pt")
input_tensor = inputs['input_ids'].numpy()  # Convert to numpy if needed

# Run inference
outputs = session.run(None, {"input_ids": input_tensor})
print(outputs)


ValueError: Required inputs (['past_key_values.0.key', 'past_key_values.0.value', 'past_key_values.1.key', 'past_key_values.1.value', 'past_key_values.2.key', 'past_key_values.2.value', 'past_key_values.3.key', 'past_key_values.3.value', 'past_key_values.4.key', 'past_key_values.4.value', 'past_key_values.5.key', 'past_key_values.5.value', 'past_key_values.6.key', 'past_key_values.6.value', 'past_key_values.7.key', 'past_key_values.7.value', 'past_key_values.8.key', 'past_key_values.8.value', 'past_key_values.9.key', 'past_key_values.9.value', 'past_key_values.10.key', 'past_key_values.10.value', 'past_key_values.11.key', 'past_key_values.11.value', 'attention_mask', 'position_ids']) are missing from input feed (['input_ids']).

In [20]:
import onnxruntime as ort
import os
from transformers import AutoTokenizer
import numpy as np

# Define the model directory and file path
model_dir = '/content/gpt2_onnx'
model_path = os.path.join(model_dir, 'model.onnx')  # Full path to the ONNX model file

# Load the tokenizer from the same directory
tokenizer = AutoTokenizer.from_pretrained(model_dir)

# Load the ONNX model
session = ort.InferenceSession(model_path)

# Example input for the model
input_text = "What am I using?"
inputs = tokenizer(input_text, return_tensors="pt")
input_ids = inputs['input_ids'].numpy()

# Prepare other inputs as needed (setting past_key_values and position_ids)
attention_mask = inputs.get('attention_mask', None)
position_ids = np.arange(input_ids.shape[1]).reshape(1, -1)  # Generate position ids if not available
past_key_values = [None] * 12  # Set past_key_values to None for initial inference

# Prepare the inputs dictionary for ONNX model
onnx_inputs = {
    'input_ids': input_ids,
    'attention_mask': attention_mask if attention_mask is not None else np.ones_like(input_ids),
    'position_ids': position_ids,
    # Include past_key_values if required, or set to None for initial run
    **{f'past_key_values.{i}.key': None for i in range(12)},
    **{f'past_key_values.{i}.value': None for i in range(12)}
}

# Run inference
outputs = session.run(None, onnx_inputs)
print(outputs)


RuntimeError: Input must be a list of dictionaries or a single numpy array for input 'attention_mask'.

In [21]:
import onnxruntime as ort
import os
from transformers import AutoTokenizer
import numpy as np

# Define the model directory and file path
model_dir = '/content/gpt2_onnx'
model_path = os.path.join(model_dir, 'model.onnx')  # Full path to the ONNX model file

# Load the tokenizer from the same directory
tokenizer = AutoTokenizer.from_pretrained(model_dir)

# Load the ONNX model
session = ort.InferenceSession(model_path)

# Example input for the model
input_text = "What am I using?"
inputs = tokenizer(input_text, return_tensors="pt")
input_ids = inputs['input_ids'].numpy()

# Prepare other inputs as needed (setting past_key_values and position_ids)
attention_mask = inputs.get('attention_mask', None)
if attention_mask is None:
    attention_mask = np.ones_like(input_ids)  # Create a mask of 1's if not available
position_ids = np.arange(input_ids.shape[1]).reshape(1, -1)  # Generate position ids if not available
past_key_values = [None] * 12  # Set past_key_values to None for initial inference

# Prepare the inputs dictionary for ONNX model
onnx_inputs = {
    'input_ids': input_ids,
    'attention_mask': attention_mask,  # Ensure this is a numpy array of the correct shape
    'position_ids': position_ids,
    # Include past_key_values if required, or set to None for initial run
    **{f'past_key_values.{i}.key': None for i in range(12)},
    **{f'past_key_values.{i}.value': None for i in range(12)}
}

# Run inference
outputs = session.run(None, onnx_inputs)
print(outputs)


RuntimeError: Input must be a list of dictionaries or a single numpy array for input 'attention_mask'.

In [22]:
import onnxruntime as ort
import os
from transformers import AutoTokenizer
import numpy as np

# Define the model directory and file path
model_dir = '/content/gpt2_onnx'
model_path = os.path.join(model_dir, 'model.onnx')  # Full path to the ONNX model file

# Load the tokenizer from the same directory
tokenizer = AutoTokenizer.from_pretrained(model_dir)

# Load the ONNX model
session = ort.InferenceSession(model_path)

# Example input for the model
input_text = "What am I using?"
inputs = tokenizer(input_text, return_tensors="np")  # Use numpy tensors
input_ids = inputs['input_ids']

# Prepare other inputs as needed (setting past_key_values and position_ids)
attention_mask = inputs.get('attention_mask', None)
if attention_mask is None:
    attention_mask = np.ones_like(input_ids)  # Create a mask of 1's if not available
position_ids = np.arange(input_ids.shape[1]).reshape(1, -1)  # Generate position ids if not available
past_key_values = [None] * 12  # Set past_key_values to None for initial inference

# Prepare the inputs dictionary for ONNX model
onnx_inputs = {
    'input_ids': input_ids,
    'attention_mask': attention_mask,  # Ensure this is a numpy array of the correct shape
    'position_ids': position_ids,
    # Include past_key_values if required, or set to None for initial run
    **{f'past_key_values.{i}.key': None for i in range(12)},
    **{f'past_key_values.{i}.value': None for i in range(12)}
}

# Run inference
outputs = session.run(None, onnx_inputs)
print(outputs)


RuntimeException: [ONNXRuntimeError] : 6 : RUNTIME_EXCEPTION : Non-zero status code returned while running Concat node. Name:'/transformer/h.0/attn/Concat_3' Status Message: /onnxruntime_src/include/onnxruntime/core/framework/op_kernel_context.h:42 const T* onnxruntime::OpKernelContext::Input(int) const [with T = onnxruntime::Tensor] Missing Input: past_key_values.0.key


In [23]:
from transformers import AutoTokenizer
from onnxruntime import InferenceSession

tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
session = InferenceSession("onnx/model.onnx")
# ONNX Runtime expects NumPy arrays as input
inputs = tokenizer("Using DistilBERT with ONNX Runtime!", return_tensors="np")
outputs = session.run(output_names=["last_hidden_state"], input_feed=dict(inputs))

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

ValueError: Required inputs (['position_ids', 'past_key_values.0.key', 'past_key_values.0.value', 'past_key_values.1.key', 'past_key_values.1.value', 'past_key_values.2.key', 'past_key_values.2.value', 'past_key_values.3.key', 'past_key_values.3.value', 'past_key_values.4.key', 'past_key_values.4.value', 'past_key_values.5.key', 'past_key_values.5.value', 'past_key_values.6.key', 'past_key_values.6.value', 'past_key_values.7.key', 'past_key_values.7.value', 'past_key_values.8.key', 'past_key_values.8.value', 'past_key_values.9.key', 'past_key_values.9.value', 'past_key_values.10.key', 'past_key_values.10.value', 'past_key_values.11.key', 'past_key_values.11.value', 'past_key_values.12.key', 'past_key_values.12.value', 'past_key_values.13.key', 'past_key_values.13.value', 'past_key_values.14.key', 'past_key_values.14.value', 'past_key_values.15.key', 'past_key_values.15.value']) are missing from input feed (['input_ids', 'attention_mask']).

لكل بنية نموذج، يمكنك العثور على قائمة المهام المدعومة عبر TasksManager. على سبيل المثال ، بالنسبة إلى DistilBERT ، بالنسبة لتصدير ONNX ، لدينا:

In [2]:
from optimum.exporters.tasks import TasksManager

distilbert_tasks = list(TasksManager.get_supported_tasks_for_model_type("distilbert", "onnx").keys())
print(distilbert_tasks)

Not passing the argument `library_name` to `get_supported_tasks_for_model_type` is deprecated and the support will be removed in a future version of Optimum. Please specify a `library_name`. Defaulting to `"transformers`.


['feature-extraction', 'fill-mask', 'text-classification', 'multiple-choice', 'token-classification', 'question-answering']


In [1]:
from transformers import AutoTokenizer
from onnxruntime import InferenceSession
import numpy as np

tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
session = InferenceSession("onnx/model.onnx")

# ONNX Runtime expects NumPy arrays as input
inputs = tokenizer("Using DistilBERT with ONNX Runtime!", return_tensors="np")

# Add missing inputs with dummy data or appropriate values:
# Replace with actual sequence length or model-specific value if needed
seq_length = inputs['input_ids'].shape[1]
inputs['position_ids'] = np.array([[i for i in range(seq_length)]])

# For past_key_values, provide dummy tensors of the expected shape and type (float32).
# You might need to adjust the shape based on your model's configuration.
num_attention_heads = 12  # Replace with the number of attention heads in your model
hidden_size = 768         # Replace with the hidden size of your model

for i in range(num_attention_heads):
    inputs[f'past_key_values.{i}.key'] = np.zeros((1, seq_length, hidden_size), dtype=np.float32)
    inputs[f'past_key_values.{i}.value'] = np.zeros((1, seq_length, hidden_size), dtype=np.float32)

outputs = session.run(output_names=["last_hidden_state"], input_feed=dict(inputs))
print(outputs)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


ValueError: Required inputs (['past_key_values.12.key', 'past_key_values.12.value', 'past_key_values.13.key', 'past_key_values.13.value', 'past_key_values.14.key', 'past_key_values.14.value', 'past_key_values.15.key', 'past_key_values.15.value']) are missing from input feed (['input_ids', 'attention_mask', 'position_ids', 'past_key_values.0.key', 'past_key_values.0.value', 'past_key_values.1.key', 'past_key_values.1.value', 'past_key_values.2.key', 'past_key_values.2.value', 'past_key_values.3.key', 'past_key_values.3.value', 'past_key_values.4.key', 'past_key_values.4.value', 'past_key_values.5.key', 'past_key_values.5.value', 'past_key_values.6.key', 'past_key_values.6.value', 'past_key_values.7.key', 'past_key_values.7.value', 'past_key_values.8.key', 'past_key_values.8.value', 'past_key_values.9.key', 'past_key_values.9.value', 'past_key_values.10.key', 'past_key_values.10.value', 'past_key_values.11.key', 'past_key_values.11.value']).

### ayhgشغال

In [3]:
from optimum.exporters.onnx import main_export
from optimum.exporters.onnx.model_configs import WhisperOnnxConfig
from transformers import AutoConfig

from optimum.exporters.onnx.base import ConfigBehavior
from typing import Dict

class CustomWhisperOnnxConfig(WhisperOnnxConfig):
    @property
    def outputs(self) -> Dict[str, Dict[int, str]]:
        common_outputs = super().outputs

        if self._behavior is ConfigBehavior.ENCODER:
            for i in range(self._config.encoder_layers):
                common_outputs[f"encoder_attentions.{i}"] = {0: "batch_size"}
        elif self._behavior is ConfigBehavior.DECODER:
            for i in range(self._config.decoder_layers):
                common_outputs[f"decoder_attentions.{i}"] = {
                    0: "batch_size",
                    2: "decoder_sequence_length",
                    3: "past_decoder_sequence_length + 1"
                }
            for i in range(self._config.decoder_layers):
                common_outputs[f"cross_attentions.{i}"] = {
                    0: "batch_size",
                    2: "decoder_sequence_length",
                    3: "encoder_sequence_length_out"
                }

        return common_outputs

    @property
    def torch_to_onnx_output_map(self):
        if self._behavior is ConfigBehavior.ENCODER:
            # The encoder export uses WhisperEncoder that returns the key "attentions"
            return {"attentions": "encoder_attentions"}
        else:
            return {}

model_id = "openai/whisper-tiny.en"
config = AutoConfig.from_pretrained(model_id)

custom_whisper_onnx_config = CustomWhisperOnnxConfig(
        config=config,
        task="automatic-speech-recognition",
)

encoder_config = custom_whisper_onnx_config.with_behavior("encoder")
decoder_config = custom_whisper_onnx_config.with_behavior("decoder", use_past=False)
decoder_with_past_config = custom_whisper_onnx_config.with_behavior("decoder", use_past=True)

custom_onnx_configs={
    "encoder_model": encoder_config,
    "decoder_model": decoder_config,
    "decoder_with_past_model": decoder_with_past_config,
}

main_export(
    model_id,
    output="custom_whisper_onnx",
    no_post_process=True,
    model_kwargs={"output_attentions": True},
    custom_onnx_configs=custom_onnx_configs
)

config.json:   0%|          | 0.00/1.94k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/151M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/805 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.41M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.83k [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

  if input_features.shape[-1] != expected_seq_length:
  if attn_output.size() != (bsz, self.num_heads, tgt_len, self.head_dim):
  if sequence_length != 1:
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
		-[x] values not close enough, max diff: 0.0013041496276855469 (atol: 0.001)
- last_hidden_state: max diff = 0.0013041496276855469.
 The exported model was saved at: custom_whisper_onnx


In [4]:
from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForQuestionAnswering

tokenizer = AutoTokenizer.from_pretrained("custom_whisper_onnx")
model = ORTModelForQuestionAnswering.from_pretrained("custom_whisper_onnx")
inputs = tokenizer("What am I using?", "Using whisper with ONNX Runtime!", return_tensors="pt")
outputs = model(**inputs)

RuntimeError: Too many ONNX model files were found in custom_whisper_onnx, specify which one to load by using the file_name argument.

In [5]:
from optimum.exporters.onnx import main_export

from transformers import AutoConfig

from optimum.exporters.onnx.config import TextDecoderOnnxConfig
from optimum.utils import NormalizedTextConfig, DummyPastKeyValuesGenerator
from typing import Dict


class MPTDummyPastKeyValuesGenerator(DummyPastKeyValuesGenerator):
    """
    MPT swaps the two last dimensions for the key cache compared to usual transformers
    decoder models, thus the redefinition here.
    """
    def generate(self, input_name: str, framework: str = "pt"):
        past_key_shape = (
            self.batch_size,
            self.num_attention_heads,
            self.hidden_size // self.num_attention_heads,
            self.sequence_length,
        )
        past_value_shape = (
            self.batch_size,
            self.num_attention_heads,
            self.sequence_length,
            self.hidden_size // self.num_attention_heads,
        )
        return [
            (
                self.random_float_tensor(past_key_shape, framework=framework),
                self.random_float_tensor(past_value_shape, framework=framework),
            )
            for _ in range(self.num_layers)
        ]

class CustomMPTOnnxConfig(TextDecoderOnnxConfig):
    DUMMY_INPUT_GENERATOR_CLASSES = (MPTDummyPastKeyValuesGenerator,) + TextDecoderOnnxConfig.DUMMY_INPUT_GENERATOR_CLASSES
    DUMMY_PKV_GENERATOR_CLASS = MPTDummyPastKeyValuesGenerator

    DEFAULT_ONNX_OPSET = 14  # aten::tril operator requires opset>=14
    NORMALIZED_CONFIG_CLASS = NormalizedTextConfig.with_args(
        hidden_size="d_model",
        num_layers="n_layers",
        num_attention_heads="n_heads"
    )

    def add_past_key_values(self, inputs_or_outputs: Dict[str, Dict[int, str]], direction: str):
        """
        Adapted from https://github.com/huggingface/optimum/blob/v1.9.0/optimum/exporters/onnx/base.py#L625
        """
        if direction not in ["inputs", "outputs"]:
            raise ValueError(f'direction must either be "inputs" or "outputs", but {direction} was given')

        if direction == "inputs":
            decoder_sequence_name = "past_sequence_length"
            name = "past_key_values"
        else:
            decoder_sequence_name = "past_sequence_length + 1"
            name = "present"

        for i in range(self._normalized_config.num_layers):
            inputs_or_outputs[f"{name}.{i}.key"] = {0: "batch_size", 3: decoder_sequence_name}
            inputs_or_outputs[f"{name}.{i}.value"] = {0: "batch_size", 2: decoder_sequence_name}


model_id = "/home/fxmarty/hf_internship/optimum/tiny-mpt-random-remote-code"
config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)

onnx_config = CustomMPTOnnxConfig(
    config=config,
    task="text-generation",
    use_past_in_inputs=False,
    use_present_in_outputs=True,
)
onnx_config_with_past = CustomMPTOnnxConfig(config, task="text-generation", use_past=True)

custom_onnx_configs = {
    "decoder_model": onnx_config,
    "decoder_with_past_model": onnx_config_with_past,
}

main_export(
    model_id,
    output="mpt_onnx",
    task="text-generation-with-past",
    trust_remote_code=True,
    custom_onnx_configs=custom_onnx_configs,
    no_post_process=True,
)

OSError: Incorrect path_or_model_id: '/home/fxmarty/hf_internship/optimum/tiny-mpt-random-remote-code'. Please provide either the path to a local folder or the repo_id of a model on the Hub.

https://huggingface.co/docs/optimum/onnxruntime/usage_guides/models#optimum-inference-with-onnx-runtime

In [None]:
  from transformers import AutoTokenizer, pipeline
- from transformers import AutoModelForCausalLM
+ from optimum.onnxruntime import ORTModelForCausalLM

- model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B) # PyTorch checkpoint
+ model = ORTModelForCausalLM.from_pretrained("onnx-community/Llama-3.2-1B", subfolder="onnx") # ONNX checkpoint
  tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")

  pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
  result = pipe("He never went out without a book under his arm")

In [6]:
from transformers import AutoTokenizer, pipeline

from optimum.onnxruntime import ORTModelForCausalLM
model = ORTModelForCausalLM.from_pretrained("onnx-community/Llama-3.2-1B", subfolder="onnx")
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
result = pipe("He never went out without a book under his arm")

FileNotFoundError: Could not find any ONNX model file for the regex ['^((?!decoder).)*.onnx', '(.*)?decoder(.*)?with_past(.*)?\\.onnx'] in onnx-community/Llama-3.2-1B/onnx.

In [8]:
# npm i @huggingface/transformers
# import { pipeline } from '@huggingface/transformers';

# npm i @huggingface/transformers
# import { pipeline } from '@huggingface/transformers';

# // Allocate pipeline
# const pipe = await pipeline('text-generation', 'onnx-community/Llama-3.2-1B');
# Allocate pipeline
# const pipe = await pipeline('text-generation', 'onnx-community/Llama-3.2-1B');

SyntaxError: invalid syntax (<ipython-input-8-b33a0f42cfef>, line 6)

In [9]:
!git clone https://huggingface.co/onnx-community/Llama-3.2-1B

Cloning into 'Llama-3.2-1B'...
remote: Enumerating objects: 37, done.[K
remote: Counting objects: 100% (31/31), done.[K
remote: Compressing objects: 100% (31/31), done.[K
remote: Total 37 (delta 5), reused 0 (delta 0), pack-reused 6 (from 1)[K
Unpacking objects: 100% (37/37), 2.24 MiB | 3.40 MiB/s, done.
Filtering content: 100% (10/10), 10.39 GiB | 7.27 MiB/s, done.
Encountered 1 file(s) that may not have been copied correctly on Windows:
	onnx/model.onnx_data

See: `git lfs help smudge` for more details.


In [16]:
!rm -rf /root/.cache

In [20]:
import onnxruntime as ort
import os
from transformers import AutoTokenizer
import numpy as np

# Define the model directory and file path
model_dir = '/content/gpt2_onnx'
model_path = os.path.join(model_dir, 'model.onnx')  # Full path to the ONNX model file

# Load the tokenizer from the same directory
tokenizer = AutoTokenizer.from_pretrained(model_dir)

# Load the ONNX model
session = ort.InferenceSession(model_path)

# Example input for the model
input_text = "What am I using?"
inputs = tokenizer(input_text, return_tensors="np")  # Use numpy tensors
input_ids = inputs['input_ids']

# Prepare other inputs as needed (setting past_key_values and position_ids)
attention_mask = inputs.get('attention_mask', None)
if attention_mask is None:
    attention_mask = np.ones_like(input_ids)  # Create a mask of 1's if not available
position_ids = np.arange(input_ids.shape[1]).reshape(1, -1)  # Generate position ids if not available

# Initialize past_key_values to None or an empty tensor
past_key_values = [np.zeros_like(input_ids)] * 12  # Creating empty past_key_values, adjusted for shape

# Prepare the inputs dictionary for ONNX model
onnx_inputs = {
    'input_ids': input_ids,
    'attention_mask': attention_mask,  # Ensure this is a numpy array of the correct shape
    'position_ids': position_ids,
    # Create placeholders for past_key_values (empty tensors for initial step)
    **{f'past_key_values.{i}.key': past_key_values[i] for i in range(12)},
    **{f'past_key_values.{i}.value': past_key_values[i] for i in range(12)}
}

# Run inference
outputs = session.run(None, onnx_inputs)
print(outputs)


InvalidArgument: [ONNXRuntimeError] : 2 : INVALID_ARGUMENT : Unexpected input data type. Actual: (tensor(int64)) , expected: (tensor(float))

In [19]:
from transformers import AutoTokenizer, pipeline

from optimum.onnxruntime import ORTModelForCausalLM
model = ORTModelForCausalLM.from_pretrained("onnx-community/Llama-3.2-1B", subfolder="onnx")
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
result = pipe("He never went out without a book under his arm")

config.json:   0%|          | 0.00/850 [00:00<?, ?B/s]

FileNotFoundError: Could not find any ONNX model file for the regex ['^((?!decoder).)*.onnx', '(.*)?decoder(.*)?with_past(.*)?\\.onnx'] in onnx-community/Llama-3.2-1B/onnx.

In [21]:
from huggingface_hub import model_info

model_name = "onnx-community/Llama-3.2-1B"
info = model_info(model_name)
print(info)


ModelInfo(id='onnx-community/Llama-3.2-1B', author='onnx-community', sha='aa6f4b3652f4b4530ba4e04b188a21e72f3d595e', created_at=datetime.datetime(2024, 9, 25, 12, 33, 37, tzinfo=datetime.timezone.utc), last_modified=datetime.datetime(2024, 10, 8, 13, 29, 34, tzinfo=datetime.timezone.utc), private=False, disabled=False, downloads=289, downloads_all_time=None, gated=False, gguf=None, inference='pipeline-library-pair-not-supported', likes=9, library_name='transformers.js', tags=['transformers.js', 'onnx', 'llama', 'text-generation', 'base_model:meta-llama/Llama-3.2-1B', 'base_model:quantized:meta-llama/Llama-3.2-1B', 'license:llama3.2', 'region:us'], pipeline_tag='text-generation', mask_token=None, card_data={'base_model': 'meta-llama/Llama-3.2-1B', 'datasets': None, 'eval_results': None, 'language': None, 'library_name': 'transformers.js', 'license': 'llama3.2', 'license_name': None, 'license_link': None, 'metrics': None, 'model_name': None, 'pipeline_tag': None, 'tags': None}, widget_da

In [24]:
from transformers import AutoTokenizer, pipeline
from optimum.onnxruntime import ORTModelForCausalLM

# Change this path to where you want to download the model
local_path = "./Llama-3.2-1B"

# Download model files manually
model = ORTModelForCausalLM.from_pretrained("onnx-community/Llama-3.2-1B", subfolder="onnx", cache_dir=local_path)
tokenizer = AutoTokenizer.from_pretrained("onnx-community/Llama-3.2-1B", cache_dir=local_path)

pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
result = pipe("He never went out without a book under his arm")
print(result)


FileNotFoundError: Could not find any ONNX model file for the regex ['^((?!decoder).)*.onnx', '(.*)?decoder(.*)?with_past(.*)?\\.onnx'] in onnx-community/Llama-3.2-1B/onnx.

In [25]:
import os

model_dir = "./Llama-3.2-1B-onnx/onnx"
if os.path.exists(model_dir):
    print("ONNX model files:", os.listdir(model_dir))
else:
    print("ONNX model directory does not exist!")


ONNX model directory does not exist!


In [26]:
model = ORTModelForCausalLM.from_pretrained("onnx-community/Llama-3-1B", subfolder="onnx")
tokenizer = AutoTokenizer.from_pretrained("onnx-community/Llama-3-1B")


RepositoryNotFoundError: 401 Client Error. (Request ID: Root=1-67a4176c-6fcedcc743bea1fd38a3364f;0e5a32cf-9ef7-4b72-8c95-46940ee3dbd1)

Repository Not Found for url: https://huggingface.co/api/models/onnx-community/Llama-3-1B/tree/main?recursive=True&expand=False.
Please make sure you specified the correct `repo_id` and `repo_type`.
If you are trying to access a private or gated repo, make sure you are authenticated.
Invalid username or password.

In [27]:
from huggingface_hub import hf_hub_list

files = hf_hub_list("onnx-community/Llama-3.2-1B")
print([f.rfilename for f in files])


ImportError: cannot import name 'hf_hub_list' from 'huggingface_hub' (/usr/local/lib/python3.11/dist-packages/huggingface_hub/__init__.py)

In [None]:
from transformers import AutoTokenizer, pipeline
from optimum.onnxruntime import ORTModelForCausalLM

model_id = "onnx-community/Llama-3.2-1B"
model = ORTModelForCausalLM.from_pretrained(
    model_id,
    subfolder="onnx",  # Explicitly set to the ONNX subfolder
    file_name="model.onnx"  # Pick a specific ONNX model
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

result = pipe("He never went out without a book under his arm")
print(result)


model.onnx:   0%|          | 0.00/524k [00:00<?, ?B/s]

model.onnx_data:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

https://onnxruntime.ai/docs/get-started/with-python.html

In [None]:
import os

model_dir = "./Llama-3.2-1B-onnx/onnx"
if os.path.exists(model_dir):
    print("ONNX model files:", os.listdir(model_dir))
else:
    print("ONNX model directory does not exist!")


In [None]:
from huggingface_hub import snapshot_download

snapshot_download(repo_id="onnx-community/Llama-3.2-1B", local_dir="./Llama-3.2-1B-onnx")


In [None]:
model = ORTModelForCausalLM.from_pretrained(
    model_id,
    subfolder="onnx",
    file_name="model_fp16.onnx"  # Use FP16 version
)


### شغال

In [1]:
from transformers import AutoTokenizer, pipeline
from optimum.onnxruntime import ORTModelForCausalLM

model_id = "onnx-community/Llama-3.2-1B"

model = ORTModelForCausalLM.from_pretrained(
    model_id,
    subfolder="onnx",
    file_name="model_q4.onnx"
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

result = pipe("He never went out without a book under his arm")
print(result)


The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


model_q4.onnx:   0%|          | 0.00/1.66G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

Device set to use cpu
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


[{'generated_text': 'He never went out without a book under his arm. He never went out without a book under his arm. He never went out without a book under'}]


In [5]:
!rm -rf /root/.cache

model_fp16.onnx_data

In [1]:
from transformers import AutoTokenizer, pipeline
from optimum.onnxruntime import ORTModelForCausalLM

model_id = "onnx-community/DeepSeek-R1-Distill-Qwen-1.5B-ONNX"

model = ORTModelForCausalLM.from_pretrained(
    model_id,
    subfolder="onnx",
    file_name="model_fp16.onnx"
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

result = pipe("Who is Napoleon Bonaparte?")
print(result)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Device set to use cpu


RuntimeError: Error in execution: Non-zero status code returned while running Cast node. Name:'InsertedPrecisionFreeCast_/model/layers.1/attn/v_proj/repeat_kv/Reshape_4/output_0' Status Message: /onnxruntime_src/onnxruntime/core/framework/op_kernel.cc:83 virtual OrtValue* onnxruntime::OpKernelContext::OutputMLValue(int, const onnxruntime::TensorShape&) status.IsOK() was false. Shape mismatch attempting to re-use buffer. {1,1,1536} != {1,9,1536}. Validate usage of dim_value (values should be > 0) and dim_param (all values with the same string should equate to the same size) in shapes in the model.


📌 ملفات أخرى يمكنك تجربتها:

"model_fp16.onnx" (16-bit)
"model_int8.onnx" (8-bit)
"model_q4.onnx" (4-bit) ⬅ الأفضل لاستهلاك الذاكرة
"model_uint8.onnx" (8-bit غير موقع)

### aشغال

In [1]:
from transformers import AutoTokenizer, pipeline
from optimum.onnxruntime import ORTModelForCausalLM

model_id = "onnx-community/DeepSeek-R1-Distill-Qwen-1.5B-ONNX"

model = ORTModelForCausalLM.from_pretrained(
    model_id,
    subfolder="onnx",
    file_name="model_q4.onnx"
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

result = pipe("Who is Napoleon Bonaparte?")
print(result)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


model_q4.onnx:   0%|          | 0.00/1.97G [00:00<?, ?B/s]

Device set to use cpu


[{'generated_text': 'Who is Napoleon Bonaparte? What is his role in history? What is his legacy? What is his name? What is his'}]


In [None]:
result = pipe("He never went out without a book under his arm", max_new_tokens=50)
print(result)


3️⃣ تشغيل ONNX في وضع use_io_binding
خاصية use_io_binding=True تساعد في تقليل استهلاك الذاكرة عبر تحسين نقل البيانات بين المعالج والـ RAM:

In [None]:
model = ORTModelForCausalLM.from_pretrained(
    model_id,
    subfolder="onnx",
    file_name="model_q4.onnx",
    use_io_binding=True  # يقلل استهلاك الذاكرة
)


In [None]:
import torch
from transformers import AutoTokenizer, pipeline
from optimum.onnxruntime import ORTModelForCausalLM

model_id = "onnx-community/Llama-3.2-1B"

model = ORTModelForCausalLM.from_pretrained(
    model_id,
    subfolder="onnx",
    file_name="model_q4.onnx",
    torch_dtype=torch.float16  # تقليل استهلاك الذاكرة
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0)  # نقل إلى GPU إن أمكن

result = pipe("He never went out without a book under his arm", max_new_tokens=50)
print(result)


In [None]:
import gc
import torch

gc.collect()
torch.cuda.empty_cache()  # إذا كنت تستخدم GPU


In [4]:
import gc


gc.collect()

0

In [None]:
from huggingface_hub import login
login("your_huggingface_token_here")


In [None]:
result = pipe(
    "He never went out without a book under his arm",
    max_new_tokens=50,  # عدد الكلمات الجديدة التي سيتم توليدها
    temperature=0.7,  # يضيف تنوعًا للناتج (كلما زادت القيمة، زاد العشوائية)
    top_k=50,  # يمنع الكلمات منخفضة الاحتمالية من الظهور
    top_p=0.9,  # يحافظ على توليد نص منطقي
    repetition_penalty=1.2  # يمنع التكرار
)
print(result)


In [None]:
result = pipe("Who is Napoleon Bonaparte?", max_new_tokens=50, do_sample=True, temperature=0.7)
print(result)


In [None]:
tokenizer.pad_token_id = tokenizer.eos_token_id


ضبط num_return_sequences
إذا كان التكرار مشكلة، يمكنك تعيين num_return_sequences=1:

In [None]:
result = pipe("Who is Napoleon Bonaparte?", max_new_tokens=50, num_return_sequences=1)


التحقق من جودة النموذج Q4
بعض النماذج المضغوطة (Q4) تفقد الدقة مقارنة بالنماذج الأصلية (fp16). جرب تشغيل:

In [None]:
file_name="model_fp16.onnx"


لماذا هذا الكود أفضل؟
✅ تحسين جودة التوليد عبر max_new_tokens, do_sample, temperature, top_k, وtop_p.
✅ منع المشاكل في التوكنيزر بإضافة pad_token_id.
✅ استخدام نموذج أكثر استقرارًا (Mistral-7B-Instruct-v0.2-ONNX).
✅ تقليل التكرارات العشوائية عبر num_return_sequences=1.

إذا كنت لا تزال تواجه مشاكل، جرب استخدام model_fp16.onnx بدلاً من model_q4.onnx لتحسين الجودة! 🚀

mistral-7b-instruct-v0.2-cpu-int4-rtn-block-32-acc-level-4.onnx.data

In [4]:
from transformers import AutoTokenizer, pipeline
from optimum.onnxruntime import ORTModelForCausalLM
import os

# تحديد المسار الصحيح
model_url = "https://huggingface.co/microsoft/mistral-7b-instruct-v0.2-ONNX/blob/main/onnx/cpu_and_mobile/mistral-7b-instruct-v0.2-cpu-int4-rtn-block-32-acc-level-4/mistral-7b-instruct-v0.2-cpu-int4-rtn-block-32-acc-level-4.onnx"
model_id = "microsoft/mistral-7b-instruct-v0.2-ONNX"
subfolder = "onnx/cpu_and_mobile/mistral-7b-instruct-v0.2-cpu-int4-rtn-block-32-acc-level-4"
file_name = "mistral-7b-instruct-v0.2-cpu-int4-rtn-block-32-acc-level-4.onnx.data"

# التأكد من تحميل النموذج
model_path = f"/content/{file_name}"

if not os.path.exists(model_path):
    # تحميل النموذج من URL إذا لم يكن موجودًا محليًا
    from huggingface_hub import hf_hub_download
    hf_hub_download(
        repo_id=model_id,
        subfolder=subfolder,
        filename=file_name,
        local_dir="/content"
    )

# تحميل النموذج
model = ORTModelForCausalLM.from_pretrained(
    model_id,
    subfolder=subfolder,
    file_name=file_name
)

# تحميل التوكنيزر
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

# إصلاح مشكلة pad_token_id إذا لم يكن موجودًا
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

# إنشاء pipeline للتوليد النصي
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer
)

# توليد نص مع المعاملات المحسنة
result = pipe(
    "Who is Napoleon Bonaparte?",
    max_new_tokens=100,
    do_sample=True,
    temperature=0.7,
    top_k=50,
    top_p=0.9,
    num_return_sequences=1
)

# طباعة النتيجة
print(result)


(…)-int4-rtn-block-32-acc-level-4.onnx.data:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

(…)-int4-rtn-block-32-acc-level-4.onnx.data:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

DecodeError: Error parsing message

In [None]:
from transformers import AutoTokenizer, pipeline
from optimum.onnxruntime import ORTModelForCausalLM

# تحديد المسار الصحيح للنموذج
model_id = "microsoft/mistral-7b-instruct-v0.2-ONNX"
subfolder = "onnx/cpu_and_mobile/mistral-7b-instruct-v0.2-cpu-int4-rtn-block-32-acc-level-4"
file_name = "model.onnx"  # أو أي اسم ملف ONNX موجود في المجلد

# تحميل النموذج
model = ORTModelForCausalLM.from_pretrained(
    model_id,
    subfolder=subfolder,
    file_name=file_name
)

# تحميل التوكنيزر مع تفعيل trust_remote_code
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

# إصلاح مشكلة pad_token_id إذا لم يكن موجودًا
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

# إنشاء pipeline محسّن
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer
)

# توليد نص مع تحسين التحكم في الإخراج
result = pipe(
    "Who is Napoleon Bonaparte?",
    max_new_tokens=100,
    do_sample=True,
    temperature=0.7,
    top_k=50,
    top_p=0.9,
    num_return_sequences=1
)

# طباعة النتيجة
print(result)


In [2]:
from transformers import AutoTokenizer, pipeline
from optimum.onnxruntime import ORTModelForCausalLM

# نموذج بديل أكثر استقرارًا (يمكنك تجربة Mistral 7B Instruct)
model_id = "microsoft/mistral-7b-instruct-v0.2-ONNX"
file_name = "model_q4.onnx"  # استخدم 'model_fp16.onnx' إذا كنت تريد دقة أعلى

# تحميل النموذج
model = ORTModelForCausalLM.from_pretrained(
    model_id,
    subfolder="onnx",
    file_name=file_name
)

# تحميل التوكنيزر مع تفعيل trust_remote_code
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

# إصلاح مشكلة pad_token_id إذا لم يكن موجودًا
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

# إنشاء pipeline محسّن
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer
)

# توليد نص مع تحسين التحكم في الإخراج
result = pipe(
    "Who is Napoleon Bonaparte?",
    max_new_tokens=100,  # زيادة عدد التوكنات المنتجة
    do_sample=True,       # تمكين العشوائية لتنوع الإجابات
    temperature=0.7,      # ضبط درجة الحرارة لتحسين التنوع
    top_k=50,             # تحديد أفضل 50 احتمالًا لاختيار الكلمة التالية
    top_p=0.9,            # تقليل احتمالية الكلمات غير المناسبة
    num_return_sequences=1  # توليد إجابة واحدة فقط
)

# طباعة النتيجة
print(result)


config.json:   0%|          | 0.00/68.0 [00:00<?, ?B/s]

EntryNotFoundError: 404 Client Error. (Request ID: Root=1-67a41f6a-771dedbc2fd287c12ba402fe;d33165ea-b677-4265-b367-00003b0ebb3a)

Entry Not Found for url: https://huggingface.co/microsoft/mistral-7b-instruct-v0.2-ONNX/resolve/main/onnx/model_q4.onnx.

In [None]:
from huggingface_hub import hf_hub_download
model_id = "microsoft/mistral-7b-instruct-v0.2-ONNX"
subfolder = "onnx/cpu_and_mobile/mistral-7b-instruct-v0.2-cpu-int4-rtn-block-32-acc-level-4"
file_name = "mistral-7b-instruct-v0.2-cpu-int4-rtn-block-32-acc-level-4.onnx"

# التأكد من تحميل النموذج والملفات اللازمة
model_path = hf_hub_download(
    repo_id=model_id,
    subfolder=subfolder,
    filename=file_name,
    local_dir="/content"
)
print(f"Model downloaded to: {model_path}")


In [None]:
from huggingface_hub import hf_hub_download

model_id = "microsoft/mistral-7b-instruct-v0.2-ONNX"
subfolder = "onnx/cpu_and_mobile/mistral-7b-instruct-v0.2-cpu-int4-rtn-block-32-acc-level-4"
file_name = "mistral-7b-instruct-v0.2-cpu-int4-rtn-block-32-acc-level-4.onnx"

# تحميل النموذج مع التأكد من وجود جميع الملفات
model_path = hf_hub_download(
    repo_id=model_id,
    subfolder=subfolder,
    filename=file_name,
    local_dir="/content"
)
print(f"Model downloaded to: {model_path}")


In [5]:
from huggingface_hub import hf_hub_download
import shutil
import os

cache_dir = "/root/.cache/huggingface/hub"
model_id = "microsoft/mistral-7b-instruct-v0.2-ONNX"

# حذف الذاكرة المؤقتة للنموذج
shutil.rmtree(os.path.join(cache_dir, "models--microsoft--mistral-7b-instruct-v0.2-ONNX"))


In [6]:
from transformers import AutoTokenizer, pipeline
from optimum.onnxruntime import ORTModelForCausalLM

model_id = "microsoft/mistral-7b-instruct-v0.2-ONNX"
model = ORTModelForCausalLM.from_pretrained(model_id, subfolder="onnx")
tokenizer = AutoTokenizer.from_pretrained(model_id)
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

result = pipe("Who is Napoleon Bonaparte?")
print(result)


config.json:   0%|          | 0.00/68.0 [00:00<?, ?B/s]

FileNotFoundError: Could not find any ONNX model file for the regex ['^((?!decoder).)*.onnx', '(.*)?decoder(.*)?with_past(.*)?\\.onnx'] in microsoft/mistral-7b-instruct-v0.2-ONNX/onnx.

In [7]:
model = ORTModelForCausalLM.from_pretrained(
    model_id,
    subfolder="onnx",
    file_name="mistral-7b-instruct-v0.2-cpu-int4-rtn-block-32-acc-level-4.onnx"
)


EntryNotFoundError: 404 Client Error. (Request ID: Root=1-67a4221e-2063cd3f1afd7b5c5e180c33;580a8aa8-1946-4927-a36a-1b5f0186b64d)

Entry Not Found for url: https://huggingface.co/microsoft/mistral-7b-instruct-v0.2-ONNX/resolve/main/onnx/mistral-7b-instruct-v0.2-cpu-int4-rtn-block-32-acc-level-4.onnx.

In [8]:
from transformers import AutoTokenizer, pipeline
from optimum.onnxruntime import ORTModelForCausalLM

model_id = "microsoft/mistral-7b-instruct-v0.2-ONNX"
onnx_model_path = "onnx/cpu_and_mobile/mistral-7b-instruct-v0.2-cpu-int4-rtn-block-32-acc-level-4/mistral-7b-instruct-v0.2-cpu-int4-rtn-block-32-acc-level-4.onnx"

model = ORTModelForCausalLM.from_pretrained(
    model_id,
    subfolder="onnx",
    file_name=onnx_model_path
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

result = pipe("Who is Napoleon Bonaparte?")
print(result)


EntryNotFoundError: 404 Client Error. (Request ID: Root=1-67a4223a-68afe1bc000c407c02db2b8a;88566fcb-e6a8-4455-a98d-bac0f385ec86)

Entry Not Found for url: https://huggingface.co/microsoft/mistral-7b-instruct-v0.2-ONNX/resolve/main/onnx/onnx/cpu_and_mobile/mistral-7b-instruct-v0.2-cpu-int4-rtn-block-32-acc-level-4/mistral-7b-instruct-v0.2-cpu-int4-rtn-block-32-acc-level-4.onnx.

In [None]:
from transformers import AutoTokenizer, pipeline
from optimum.onnxruntime import ORTModelForCausalLM

# استخدام المسار المحلي للملف ONNX
onnx_model_path = "path/to/your/local/model.onnx"

model = ORTModelForCausalLM.from_pretrained(onnx_model_path)
tokenizer = AutoTokenizer.from_pretrained("microsoft/mistral-7b-instruct-v0.2-ONNX")
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

result = pipe("Who is Napoleon Bonaparte?")
print(result)


In [9]:
!rm -rf /root/.cache

In [1]:
from transformers import AutoTokenizer, pipeline
from optimum.onnxruntime import ORTModelForCausalLM

model_id = "microsoft/mistral-7b-instruct-v0.2-ONNX"

model = ORTModelForCausalLM.from_pretrained(
    model_id,
    subfolder="onnx/cpu_and_mobile/mistral-7b-instruct-v0.2-cpu-int4-rtn-block-32-acc-level-4",
    file_name="mistral-7b-instruct-v0.2-cpu-int4-rtn-block-32-acc-level-4.onnx.data"
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

result = pipe("He never went out without a book under his arm")
print(result)


The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


DecodeError: Error parsing message

In [None]:
/root/.cache/huggingface/hub/models--microsoft--mistral-7b-instruct-v0.2-ONNX/snapshots/6d524105ea40470ca1897c97cc467f1f96a8a077/onnx/cpu_and_mobile/mistral-7b-instruct-v0.2-cpu-int4-rtn-block-32-acc-level-4/mistral-7b-instruct-v0.2-cpu-int4-rtn-block-32-acc-level-4.onnx

In [3]:
from transformers import AutoTokenizer, pipeline
from optimum.onnxruntime import ORTModelForCausalLM

# استخدام المسار المحلي للملف ONNX
model_path = "/root/.cache/huggingface/hub/models--microsoft--mistral-7b-instruct-v0.2-ONNX/snapshots/6d524105ea40470ca1897c97cc467f1f96a8a077/onnx/cpu_and_mobile/mistral-7b-instruct-v0.2-cpu-int4-rtn-block-32-acc-level-4/mistral-7b-instruct-v0.2-cpu-int4-rtn-block-32-acc-level-4.onnx"

model = ORTModelForCausalLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained("microsoft/mistral-7b-instruct-v0.2-ONNX")
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

result = pipe("Who is Napoleon Bonaparte?")
print(result)


HFValidationError: Repo id must be in the form 'repo_name' or 'namespace/repo_name': '/root/.cache/huggingface/hub/models--microsoft--mistral-7b-instruct-v0.2-ONNX/snapshots/6d524105ea40470ca1897c97cc467f1f96a8a077/onnx/cpu_and_mobile/mistral-7b-instruct-v0.2-cpu-int4-rtn-block-32-acc-level-4/mistral-7b-instruct-v0.2-cpu-int4-rtn-block-32-acc-level-4.onnx'. Use `repo_type` argument if needed.

In [4]:
from transformers import AutoTokenizer, pipeline
from optimum.onnxruntime import ORTModelForCausalLM

model_id = "microsoft/mistral-7b-instruct-v0.2-ONNX"  # Model identifier on Hugging Face Hub

model = ORTModelForCausalLM.from_pretrained(
    model_id,
    subfolder="onnx/cpu_and_mobile/mistral-7b-instruct-v0.2-cpu-int4-rtn-block-32-acc-level-4",
    file_name="mistral-7b-instruct-v0.2-cpu-int4-rtn-block-32-acc-level-4.onnx" # or .onnx.data
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

result = pipe("Who is Napoleon Bonaparte?")
print(result)

ORTModelForCausalLM loaded a legacy ONNX model with no position_ids input, although this input is required for batched generation for the architecture mistral. We strongly encourage to re-export the model with optimum>=1.14 for position_ids and batched inference support.


OSError: Can't load tokenizer for 'microsoft/mistral-7b-instruct-v0.2-ONNX'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure 'microsoft/mistral-7b-instruct-v0.2-ONNX' is the correct path to a directory containing all relevant files for a LlamaTokenizerFast tokenizer.

In [None]:
from transformers import AutoTokenizer, pipeline
from optimum.onnxruntime import ORTModelForCausalLM

model_id = "microsoft/mistral-7b-instruct-v0.2-ONNX"  # Model identifier on Hugging Face Hub

model = ORTModelForCausalLM.from_pretrained(
    model_id,
    subfolder="onnx/cpu_and_mobile/mistral-7b-instruct-v0.2-cpu-int4-rtn-block-32-acc-level-4",
    file_name="mistral-7b-instruct-v0.2-cpu-int4-rtn-block-32-acc-level-4.onnx" # or .onnx.data
)

# Assuming the tokenizer is in the same directory as the model:
tokenizer_path = "/root/.cache/huggingface/hub/models--microsoft--mistral-7b-instruct-v0.2-ONNX/snapshots/6d524105ea40470ca1897c97cc467f1f96a8a077/onnx/cpu_and_mobile/mistral-7b-instruct-v0.2-cpu-int4-rtn-block-32-acc-level-4/"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)  # Loading from the local directory

pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

result = pipe("Who is Napoleon Bonaparte?")
print(result)

In [1]:
from transformers import AutoTokenizer, pipeline
from optimum.onnxruntime import ORTModelForCausalLM

model_id = "microsoft/mistral-7b-instruct-v0.2-ONNX"  # Model identifier on Hugging Face Hub

model = ORTModelForCausalLM.from_pretrained(
    model_id,
    subfolder="onnx",
    file_name="cpu_and_mobile/mistral-7b-instruct-v0.2-cpu-int4-rtn-block-32-acc-level-4/mistral-7b-instruct-v0.2-cpu-int4-rtn-block-32-acc-level-4.onnx" # or .onnx.data
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

result = pipe("Who is Napoleon Bonaparte?")
print(result)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
ORTModelForCausalLM loaded a legacy ONNX model with no position_ids input, although this input is required for batched generation for the architecture mistral. We strongly encourage to re-export the model with optimum>=1.14 for position_ids and batched inference support.


OSError: Can't load tokenizer for 'microsoft/mistral-7b-instruct-v0.2-ONNX'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure 'microsoft/mistral-7b-instruct-v0.2-ONNX' is the correct path to a directory containing all relevant files for a LlamaTokenizerFast tokenizer.

In [None]:
from transformers import AutoTokenizer, pipeline
from optimum.onnxruntime import ORTModelForCausalLM

model_id = "microsoft/mistral-7b-instruct-v0.2-ONNX"  # Model identifier on Hugging Face Hub

model = ORTModelForCausalLM.from_pretrained(
    model_id,
    subfolder="onnx",
    file_name="cpu_and_mobile/mistral-7b-instruct-v0.2-cpu-int4-rtn-block-32-acc-level-4/mistral-7b-instruct-v0.2-cpu-int4-rtn-block-32-acc-level-4.onnx" # or .onnx.data
)

# Use the original model ID for the tokenizer, not the ONNX ID:
tokenizer = AutoTokenizer.from_pretrained("microsoft/mistral-7b-instruct-v0.2")

pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

result = pipe("Who is Napoleon Bonaparte?")
print(result)

In [None]:
python model-qa.py -m /*{YourModelPath}*/Phi-3.5-mini-instruct-onnx/cpu_and_mobile/cpu-int4-awq-block-128-acc-level-4 -k 40 -p 0.95 -t 0.8 -r 1.0


In [5]:
from transformers import AutoTokenizer, pipeline
from optimum.onnxruntime import ORTModelForCausalLM

model_id = "microsoft/Phi-3.5-mini-instruct-onnx"  # Model identifier on Hugging Face Hub

model = ORTModelForCausalLM.from_pretrained(
    model_id,
    subfolder="cpu_and_mobile/cpu-int4-awq-block-128-acc-level-4",
    file_name="phi-3.5-mini-instruct-cpu-int4-awq-block-128-acc-level-4.onnx.data" # or .onnx.data
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

result = pipe("Who is Napoleon Bonaparte?")
print(result)

(…)t4-awq-block-128-acc-level-4/config.json:   0%|          | 0.00/3.45k [00:00<?, ?B/s]

(…)int4-awq-block-128-acc-level-4.onnx.data:   0%|          | 0.00/2.73G [00:00<?, ?B/s]

(…)ck-128-acc-level-4/tokenizer_config.json:   0%|          | 0.00/3.36k [00:00<?, ?B/s]

(…)awq-block-128-acc-level-4/tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

(…)-128-acc-level-4/special_tokens_map.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

DecodeError: Error parsing message

In [1]:
from transformers import AutoTokenizer, pipeline
from optimum.onnxruntime import ORTModelForCausalLM

model_id = "microsoft/Phi-3.5-mini-instruct-onnx"  # Model identifier on Hugging Face Hub

# Change file_name to load the .onnx file instead of .onnx.data
model = ORTModelForCausalLM.from_pretrained(
    model_id,
    subfolder="cpu_and_mobile/cpu-int4-awq-block-128-acc-level-4",
    file_name="phi-3.5-mini-instruct-cpu-int4-awq-block-128-acc-level-4.onnx"
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

result = pipe("Who is Napoleon Bonaparte?")
print(result)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
ORTModelForCausalLM loaded a legacy ONNX model with no position_ids input, although this input is required for batched generation for the architecture phi3. We strongly encourage to re-export the model with optimum>=1.14 for position_ids and batched inference support.


OSError: Can't load tokenizer for 'microsoft/Phi-3.5-mini-instruct-onnx'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure 'microsoft/Phi-3.5-mini-instruct-onnx' is the correct path to a directory containing all relevant files for a LlamaTokenizerFast tokenizer.

In [1]:
from transformers import AutoTokenizer, pipeline
   from optimum.onnxruntime import ORTModelForCausalLM

   model_id = "microsoft/Phi-3.5-mini-instruct-onnx"

   # Change file_name to load the .onnx file instead of .onnx.data
   model = ORTModelForCausalLM.from_pretrained(
       model_id,
       subfolder="cpu_and_mobile/cpu-int4-awq-block-128-acc-level-4",
       file_name="phi-3.5-mini-instruct-cpu-int4-awq-block-128-acc-level-4.onnx"
   )

   # Use the original model id (without -onnx):
   tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3.5-mini-instruct")

   pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
   result = pipe("Who is Napoleon Bonaparte?")
   print(result)

IndentationError: unexpected indent (<ipython-input-1-026e7c0f5d9a>, line 2)

### شغال

In [2]:
from transformers import AutoTokenizer, pipeline
from optimum.onnxruntime import ORTModelForCausalLM

model_id = "microsoft/Phi-3.5-mini-instruct-onnx"

# Change file_name to load the .onnx file instead of .onnx.data
model = ORTModelForCausalLM.from_pretrained(
    model_id,
    subfolder="cpu_and_mobile/cpu-int4-awq-block-128-acc-level-4",
    file_name="phi-3.5-mini-instruct-cpu-int4-awq-block-128-acc-level-4.onnx"
)

# Use the original model id (without -onnx):
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3.5-mini-instruct")

pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
result = pipe("Who is Napoleon Bonaparte?")
print(result)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
ORTModelForCausalLM loaded a legacy ONNX model with no position_ids input, although this input is required for batched generation for the architecture phi3. We strongly encourage to re-export the model with optimum>=1.14 for position_ids and batched inference support.


tokenizer_config.json:   0%|          | 0.00/3.98k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Device set to use cpu


[{'generated_text': 'Who is Napoleon Bonaparte?\n\nNapoleon Bonaparte was a French military and political leader who rose to prom'}]


/usr/local/lib/python3.11/dist-packages/huggingface_hub/utils/_auth.py:94: UserWarning:
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
  warnings.warn(
ORTModelForCausalLM loaded a legacy ONNX model with no position_ids input, although this input is required for batched generation for the architecture phi3. We strongly encourage to re-export the model with optimum>=1.14 for position_ids and batched inference support.
tokenizer_config.json: 100%
 3.98k/3.98k [00:00<00:00, 219kB/s]
tokenizer.model: 100%
 500k/500k [00:00<00:00, 6.54MB/s]
tokenizer.json: 100%
 1.84M/1.84M [00:00<00:00, 9.03MB/s]
added_tokens.json: 100%
 306/306 [00:00<00:00, 20.3kB/s]
special_tokens_map.json: 100%
 665/665 [00:00<00:00, 37.4kB/s]
Device set to use cpu
[{'generated_text': 'Who is Napoleon Bonaparte?\n\nNapoleon Bonaparte was a French military and political leader who rose to prom'}]

In [None]:
https://huggingface.co/microsoft/mistral-7b-instruct-v0.2-ONNX/blob/main/onnx/cpu_and_mobile/mistral-7b-instruct-v0.2-cpu-int4-rtn-block-32-acc-level-4/mistral-7b-instruct-v0.2-cpu-int4-rtn-block-32-acc-level-4.onnx

In [2]:
from transformers import AutoTokenizer, pipeline
from optimum.onnxruntime import ORTModelForCausalLM

model_id = "microsoft/mistral-7b-instruct-v0.2-ONNX"

# Change file_name to load the .onnx file instead of .onnx.data
model = ORTModelForCausalLM.from_pretrained(
    model_id,
    subfolder="onnx/cpu_and_mobile/mistral-7b-instruct-v0.2-cpu-int4-rtn-block-32-acc-level-4",
    file_name="mistral-7b-instruct-v0.2-cpu-int4-rtn-block-32-acc-level-4.onnx"
)

# Use the original model id (without -onnx):
tokenizer = AutoTokenizer.from_pretrained("model_id")

pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
result = pipe("Who is Napoleon Bonaparte?")
print(result)

ORTModelForCausalLM loaded a legacy ONNX model with no position_ids input, although this input is required for batched generation for the architecture mistral. We strongly encourage to re-export the model with optimum>=1.14 for position_ids and batched inference support.


OSError: model_id is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`

In [1]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) Y
Token is valid (permission: read).
The token `read` has been saved to /root/.cache/huggingface/stored_tokens
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate w

### ayhgشغال

In [1]:
from transformers import AutoTokenizer, pipeline
from optimum.onnxruntime import ORTModelForCausalLM

model_id = "microsoft/mistral-7b-instruct-v0.2-ONNX"

# Change file_name to load the .onnx file instead of .onnx.data
model = ORTModelForCausalLM.from_pretrained(
    model_id,
    subfolder="onnx/cpu_and_mobile/mistral-7b-instruct-v0.2-cpu-int4-rtn-block-32-acc-level-4",
    file_name="mistral-7b-instruct-v0.2-cpu-int4-rtn-block-32-acc-level-4.onnx",
    use_io_binding=True
)

# Explicitly define the path to the tokenizer.model file
tokenizer = AutoTokenizer.from_pretrained(
    "microsoft/mistral-7b-instruct-v0.2-ONNX",
    subfolder="onnx/cpu_and_mobile/mistral-7b-instruct-v0.2-cpu-int4-rtn-block-32-acc-level-4"
)

pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
#result = pipe("Who is Napoleon Bonaparte?")

result = pipe(
    "Who is Napoleon Bonaparte?",
    max_new_tokens=10,  # زيادة عدد التوكنات المنتجة
    do_sample=True,       # تمكين العشوائية لتنوع الإجابات
    temperature=0.7,      # ضبط درجة الحرارة لتحسين التنوع
    top_k=50,             # تحديد أفضل 50 احتمالًا لاختيار الكلمة التالية
    top_p=0.9,            # تقليل احتمالية الكلمات غير المناسبة
    num_return_sequences=1  # توليد إجابة واحدة فقط
)
print(result)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
ORTModelForCausalLM loaded a legacy ONNX model with no position_ids input, although this input is required for batched generation for the architecture mistral. We strongly encourage to re-export the model with optimum>=1.14 for position_ids and batched inference support.
Device set to use cpu
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[{'generated_text': 'Who is Napoleon Bonaparte?\n\nNapoleon Bonaparte ('}]


### ayhgشغال

In [2]:
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
#result = pipe("Who is Napoleon Bonaparte?")

result = pipe(
    "Who is Napoleon Bonaparte?",
    max_new_tokens=100,  # زيادة عدد التوكنات المنتجة
    do_sample=True,       # تمكين العشوائية لتنوع الإجابات
    temperature=0.7,      # ضبط درجة الحرارة لتحسين التنوع
    top_k=50,             # تحديد أفضل 50 احتمالًا لاختيار الكلمة التالية
    top_p=0.9,            # تقليل احتمالية الكلمات غير المناسبة
    num_return_sequences=1  # توليد إجابة واحدة فقط
)
print(result)

Device set to use cpu
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[{'generated_text': 'Who is Napoleon Bonaparte?\n\nNapoleon Bonaparte (1769-1821) was a French military and political leader who rose to power during the French Revolution and became Emperor of the French. He is best known for his military campaigns, which made him a legendary military figure and expanded the French Empire.\n\nNapoleon was born on the island of Corsica, which was then a possession of the Republic of Genoa. He was educated in France and became a'}]


In [None]:
from transformers import AutoTokenizer, pipeline
from optimum.onnxruntime import ORTModelForCausalLM

model_id = "microsoft/mistral-7b-instruct-v0.2-ONNX"

# Change file_name to load the .onnx file instead of .onnx.data
model = ORTModelForCausalLM.from_pretrained(
    model_id,
    subfolder="onnx/cpu_and_mobile/mistral-7b-instruct-v0.2-cpu-int4-rtn-block-32-acc-level-4",
    file_name="mistral-7b-instruct-v0.2-cpu-int4-rtn-block-32-acc-level-4.onnx",
    use_io_binding=True
)

# Explicitly define the path to the tokenizer.model file
tokenizer = AutoTokenizer.from_pretrained(
    "microsoft/mistral-7b-instruct-v0.2-ONNX",
    subfolder="onnx/cpu_and_mobile/mistral-7b-instruct-v0.2-cpu-int4-rtn-block-32-acc-level-4"
)

pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
#result = pipe("Who is Napoleon Bonaparte?")

result = pipe(
    "Who is Napoleon Bonaparte?",
    max_new_tokens=100,  # زيادة عدد التوكنات المنتجة
    do_sample=True,       # تمكين العشوائية لتنوع الإجابات
    temperature=0.7,      # ضبط درجة الحرارة لتحسين التنوع
    top_k=50,             # تحديد أفضل 50 احتمالًا لاختيار الكلمة التالية
    top_p=0.9,            # تقليل احتمالية الكلمات غير المناسبة
    num_return_sequences=1  # توليد إجابة واحدة فقط
)
print(result)
