In [2]:
import os
import time
import logging

import torch
from transformers import AutoTokenizer, GPT2LMHeadModel, AutoModelForCausalLM, GPT2Config

import hidet
from hidet.utils import benchmark_func

from transformer_deploy.utils.generative_model import GPTModelWrapper
from transformer_deploy.backends.ort_utils import create_model_for_provider, inference_onnx_binding, optimize_onnx
from transformer_deploy.backends.pytorch_utils import convert_to_onnx, get_model_size



In [3]:
model_name = "gpt2"

model: GPT2LMHeadModel = AutoModelForCausalLM.from_pretrained(model_name)
model.eval()
tokenizer = AutoTokenizer.from_pretrained(model_name)
model.config.pad_token_id = tokenizer.eos_token_id

In [4]:
inputs = tokenizer("Here is some text to encode Hello World", return_tensors="pt")
print("input tensors")
print(inputs)
print("input tensor shape")
print(inputs["input_ids"].size())

with torch.no_grad():
    outputs = model(**inputs)

logits = outputs.logits
print("output tensor")
print(logits)
print("output shape")
print(logits.shape)

input tensors
{'input_ids': tensor([[ 4342,   318,   617,  2420,   284, 37773, 18435,  2159]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]])}
input tensor shape
torch.Size([1, 8])
output tensor
tensor([[[ -34.3027,  -33.9891,  -37.5683,  ...,  -42.6734,  -42.0399,
           -34.6136],
         [ -83.3065,  -82.9769,  -86.1204,  ...,  -89.8063,  -89.4546,
           -83.6084],
         [ -91.4901,  -92.5656,  -95.6423,  ...,  -96.6183,  -98.1546,
           -91.5266],
         ...,
         [ -92.8820,  -94.8433,  -98.9224,  ..., -101.4426, -103.2702,
           -95.7642],
         [ -72.6140,  -76.3407,  -79.7973,  ...,  -87.3300,  -85.7930,
           -77.7521],
         [-103.6147, -108.7898, -109.6276,  ..., -116.8557, -116.5565,
          -107.4467]]])
output shape
torch.Size([1, 8, 50257])


In [None]:
input_ids = tokenizer(
    "Here is some text to encode Hello World", add_special_tokens=True, return_attention_mask=False, return_tensors="pt"
)

for k, v in input_ids.items():
    input_ids[k] = v.type(dtype=torch.int32)

convert_to_onnx(
    model_pytorch=model,
    output_path="test-gpt2.onnx",
    inputs_pytorch=dict(input_ids),
    quantization=False,
    var_output_seq=True,
    output_names=["output"],
)

_ = model.eval()

In [None]:
logging.basicConfig()
logging.getLogger().setLevel(logging.INFO)
num_attention_heads, hidden_size = get_model_size(path=model_name)
optimize_onnx(
    onnx_path="test-gpt2.onnx",
    onnx_optim_model_path="test-gpt2-opt.onnx",
    fp16=True,
    use_cuda=True,
    num_attention_heads=num_attention_heads,
    hidden_size=hidden_size,
    architecture="gpt2",
)

In [5]:
def inference_torch(input_ids: torch.Tensor) -> torch.Tensor:
    transformer_outputs: BaseModelOutputWithPastAndCrossAttentions = model.transformer(input_ids=input_ids)
    return model.lm_head(transformer_outputs.last_hidden_state)


model.cuda()
model.eval()
inputs.to("cuda")
with torch.inference_mode():
    logits = inference_torch(inputs.input_ids)
    print(logits)
    print('----\nPytorch Forward Pass: {:.4f} ms'.format(benchmark_func(lambda: inference_torch(inputs.input_ids))))
_ = model.cpu()

tensor([[[ -34.3028,  -33.9892,  -37.5684,  ...,  -42.6735,  -42.0400,
           -34.6137],
         [ -83.3065,  -82.9769,  -86.1204,  ...,  -89.8063,  -89.4546,
           -83.6084],
         [ -91.4901,  -92.5656,  -95.6423,  ...,  -96.6184,  -98.1545,
           -91.5266],
         ...,
         [ -92.8820,  -94.8432,  -98.9224,  ..., -101.4425, -103.2702,
           -95.7642],
         [ -72.6140,  -76.3407,  -79.7973,  ...,  -87.3300,  -85.7930,
           -77.7521],
         [-103.6147, -108.7899, -109.6277,  ..., -116.8558, -116.5565,
          -107.4467]]], device='cuda:0')
----
Pytorch Forward Pass: 11.7891 ms


In [6]:
model_onnx = create_model_for_provider(path="test-gpt2.onnx", provider_to_use="CUDAExecutionProvider")

def inference_onnx(input_ids: torch.Tensor) -> torch.Tensor:
    data = {"input_ids": input_ids}
    return inference_onnx_binding(model_onnx=model_onnx, inputs=data, device="cuda")["output"]

inputs.to("cuda")
logits = inference_onnx(inputs.input_ids)
print(logits)
print('----\nONNX Forward Pass: {:.4f} ms'.format(benchmark_func(lambda: inference_onnx(inputs.input_ids))))

tensor([[[ -34.3027,  -33.9891,  -37.5683,  ...,  -42.6734,  -42.0399,
           -34.6136],
         [ -83.3065,  -82.9769,  -86.1204,  ...,  -89.8062,  -89.4546,
           -83.6083],
         [ -91.4901,  -92.5656,  -95.6423,  ...,  -96.6184,  -98.1545,
           -91.5266],
         ...,
         [ -92.8820,  -94.8432,  -98.9224,  ..., -101.4425, -103.2702,
           -95.7642],
         [ -72.6140,  -76.3407,  -79.7973,  ...,  -87.3300,  -85.7930,
           -77.7521],
         [-103.6147, -108.7898, -109.6277,  ..., -116.8558, -116.5565,
          -107.4467]]], device='cuda:0')
----
ONNX Forward Pass: 5.0749 ms


In [7]:
model_onnx = create_model_for_provider(path="test-gpt2-opt.onnx", provider_to_use="CUDAExecutionProvider")


def inference_onnx_optimized(input_ids: torch.Tensor) -> torch.Tensor:
    data = {"input_ids": input_ids}
    return inference_onnx_binding(model_onnx=model_onnx, inputs=data, device="cuda")["output"]

inputs.to("cuda")
logits = inference_onnx_optimized(inputs.input_ids)
print(logits)
print('----\nONNX Optimized Forward Pass: {:.4f} ms'.format(benchmark_func(lambda: inference_onnx_optimized(inputs.input_ids))))

tensor([[[ -34.3125,  -34.0000,  -37.5938,  ...,  -42.6875,  -42.0625,
           -34.6250],
         [ -83.2500,  -82.9375,  -86.1250,  ...,  -89.7500,  -89.4375,
           -83.5625],
         [ -91.5000,  -92.5625,  -95.6250,  ...,  -96.6250,  -98.1875,
           -91.5000],
         ...,
         [ -92.8750,  -94.8750,  -98.9375,  ..., -101.4375, -103.3125,
           -95.8125],
         [ -72.6250,  -76.3750,  -79.8125,  ...,  -87.3750,  -85.8125,
           -77.8125],
         [-103.6875, -108.8125, -109.6875,  ..., -116.8750, -116.5625,
          -107.5000]]], device='cuda:0')
----
ONNX Optimized Forward Pass: 4.5551 ms


In [8]:
hidet_onnx_module = hidet.graph.frontend.from_onnx("test-gpt2.onnx")

print('Input names:', hidet_onnx_module.input_names)
print('Output names: ', hidet_onnx_module.output_names)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Input names: ['input_ids']
Output names:  ['output']


In [9]:
data = hidet.from_torch(inputs.input_ids)
logits = hidet_onnx_module(data)

In [10]:
print(logits)

Tensor(shape=(1, 8, 50257), dtype='float32', device='cuda:0')
[[[ -34.302917  -33.989315  -37.568527 ...  -42.67358   -42.04013
    -34.613758]
  [ -83.30651   -82.9768    -86.12037  ...  -89.80621   -89.45452
    -83.60836 ]
  [ -91.49005   -92.565575  -95.64226  ...  -96.61834   -98.15459
    -91.5266  ]
  ...
  [ -92.88196   -94.843315  -98.922386 ... -101.44251  -103.27026
    -95.7642  ]
  [ -72.614     -76.340805  -79.797386 ...  -87.33003   -85.79304
    -77.75215 ]
  [-103.61467  -108.789795 -109.62762  ... -116.85566  -116.55652
   -107.446655]]]


In [11]:
print('----\nHidet  Forward Pass: {:.4f} ms'.format(benchmark_func(lambda: hidet_onnx_module(data))))

----
Hidet  Forward Pass: 2084.0274 ms


In [12]:
symbol_data = hidet.symbol_like(data)
symbol_output = hidet_onnx_module(symbol_data)
graph: hidet.FlowGraph = hidet.trace_from(symbol_output)

In [13]:
cuda_graph = graph.cuda_graph()
(output,) = cuda_graph.run([data])

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [14]:
print(output)

Tensor(shape=(1, 8, 50257), dtype='float32', device='cuda:0')
[[[ -34.302917  -33.989315  -37.568527 ...  -42.67358   -42.04013
    -34.613758]
  [ -83.30651   -82.9768    -86.12037  ...  -89.80621   -89.45452
    -83.60836 ]
  [ -91.49005   -92.565575  -95.64226  ...  -96.61834   -98.15459
    -91.5266  ]
  ...
  [ -92.88196   -94.843315  -98.922386 ... -101.44251  -103.27026
    -95.7642  ]
  [ -72.614     -76.340805  -79.797386 ...  -87.33003   -85.79304
    -77.75215 ]
  [-103.61467  -108.789795 -109.62762  ... -116.85566  -116.55652
   -107.446655]]]


In [15]:
print('----\nCuda Graph Forward Pass: {:.4f} ms'.format(benchmark_func(lambda: cuda_graph.run([data]))))

----
Cuda Graph Forward Pass: 11.6329 ms


In [16]:
hidet.option.search_space(2)
with hidet.graph.PassContext() as ctx:
    ctx.save_graph_instrument('./outs/graphs')
    graph_opt: hidet.FlowGraph = hidet.graph.optimize(graph)

In [None]:
cuda_graph = graph_opt.cuda_graph()
(output,) = cuda_graph.run([data])

Compiling cuda task [92mfused(b=float32(4, 192, 2304), y=float32(768,), y=float32(768,), x=float32(1, 8, 768), y=float32(1, 8, 1), y=float32(1, 4, 8, 2304), fused_ops='div mul add reshape broadcast reshape rearrange batch_matmul reshape', anchor='batch_matmul')[0m...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Compiling: 100%|██████████████████████████████| 214/214 [10:47<00:00,  3.03s/it]

	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)





Batch build 214 modules within 647.959 seconds, on average 3.0 seconds per module.


Benchmarking: 100%|██████████████████████████| 214/214 [00:01<00:00, 171.72it/s]


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Compiling cuda task [92mfused(x=float32(1, 4, 8, 2304), y=float32(2304,), y=float32(1, 8, 2304), fused_ops='reduce_sum reshape add reshape', anchor='reduce_sum')[0m...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Compiling cuda task [92mfused(data=float32(1, 8, 2304), y=float32(12, 8, 8, 8), fused_ops='slice reshape rearrange rearrange reshape rearrange slice reshape rearrange rearrange reshape rearrange batch_matmul reshape', anchor='batch_matmul')[0m...
Compiling cpu task [92mcast(x=float64(1, 8, 2304), y=float32(1, 8, 2304))[0m...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Compiling cpu task [92mcast(x=float64(12, 8, 8, 8), y=float32(12, 8, 8, 8))[0m...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Compiling:  63%|██████████████████▉           | 135/214 [05:32<04:31,  3.44s/it]

In [None]:
print(output)

In [None]:
print('----\nCuda Graph Forward Pass: {:.4f} ms'.format(benchmark_func(lambda: cuda_graph.run([data]))))