# CodeGen Model Optimization on GPU
This notebook covers the optimization and quantization of Salesforce's [CodeGen](https://github.com/salesforce/CodeGen) mono model in a Colab VM with hardware acceleration.

### Settings

Check for the GPU model.

In [None]:
# Shell escape: run the nvidia-smi command-line tool to report the GPU attached to this VM.
!nvidia-smi

Install the missing requirements in the Colab VM.

In [None]:
!pip install transformers onnx onnxruntime-gpu

Download the CodeGen pre-trained model and tokenizer.

In [None]:
import torch

# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
if device == "cuda":
  # Set torch.cuda.FloatTensor as the default tensor type.
  # NOTE(review): the IO-binding benchmark binds the tokenizer output as CUDA
  # memory, so it presumably depends on this global setting - confirm before removing.
  torch.set_default_tensor_type(torch.cuda.FloatTensor)

from transformers import AutoTokenizer, AutoModelForCausalLM

# Tokenizer and weights are fetched from the same model identifier.
tokenizer_id = "Salesforce/codegen-350M-mono"
model_id = "Salesforce/codegen-350M-mono"

tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
model = AutoModelForCausalLM.from_pretrained(model_id)
model = model.to(device)

In [None]:
# Save the tokenizer first and then the model, both into the same local directory.
for artifact in (tokenizer, model):
  artifact.save_pretrained("local-pt-checkpoint")

### Conversion to ONNX Format

Convert the pre-trained model to the ONNX format using the tool available in the Transformers library. The command below also performs validation at the end of the conversion process.

In [None]:
# Shell escape: run the transformers.onnx module on the checkpoint stored in
# local-pt-checkpoint, requesting the "causal-lm" feature with the PyTorch
# framework; onnx/ is passed as the output location.
!python -m transformers.onnx --feature "causal-lm" --framework pt --export_with_transformers --model=local-pt-checkpoint onnx/

### Benchmark

In [None]:
# Presumably the output path for a quantized variant of the model.
quantized_model_path = "model.quant.onnx"
# ONNX model file loaded by the ONNX Runtime benchmarks.
onnx_model_path = "onnx/model.onnx"

Define some utility functions to benchmark different versions of the model with different execution providers in ONNX Runtime.

In [None]:
from contextlib import contextmanager
from dataclasses import dataclass
from time import perf_counter, time
from typing import List, Optional

from onnxruntime import (
    GraphOptimizationLevel,
    InferenceSession,
    SessionOptions,
    get_all_providers,
    get_available_providers,
)
from tqdm import trange

def create_model_for_provider(model_path, provider, enable_profiling=False):
  """Create an ONNX Runtime inference session restricted to one execution provider.

  Args:
    model_path: Path of the ONNX model file to load.
    provider: Name of the execution provider, e.g. "CUDAExecutionProvider".
    enable_profiling: Value assigned to ``SessionOptions.enable_profiling``.

  Returns:
    An ``InferenceSession`` created with ``provider`` as its only provider and
    with fallback disabled.

  Raises:
    AssertionError: If ``provider`` is not available in the installed
      ONNX Runtime build.
  """
  # get_all_providers() lists every provider ONNX Runtime knows about, even
  # those not compiled into the installed package; checking against the
  # available ones prevents benchmarking a provider that cannot actually run.
  available_providers = get_available_providers()
  assert provider in available_providers, (
      f"provider {provider} not found, {available_providers}"
  )

  # Few properties that might have an impact on performances (provided by MS)
  options = SessionOptions()
  options.enable_profiling = enable_profiling
  options.intra_op_num_threads = 1
  options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL

  # Load the model as a graph on the requested provider only, and disable
  # fallback so the session does not quietly switch to another provider.
  session = InferenceSession(model_path, options, providers=[provider])
  session.disable_fallback()

  return session


@contextmanager
def track_infer_time(buffer: List[float]):
    """Time the body of a ``with`` block and append the duration to ``buffer``.

    Args:
        buffer: List that receives one elapsed-time sample, in seconds, each
            time the managed block finishes without raising.
    """
    # perf_counter() is monotonic and high-resolution, unlike time(), which
    # follows the system clock and can jump when that clock is adjusted.
    start = perf_counter()
    yield
    buffer.append(perf_counter() - start)


@dataclass
class OnnxInferenceResult:
  """Timing samples collected for one benchmarked configuration."""

  # Duration of each timed run, in seconds (elapsed-time differences).
  model_inference_time: List[float]
  # Path of the optimized model file; None when there is no such file
  # (the PyTorch benchmark stores None here).
  optimized_model_path: Optional[str] = None

Prepare the input to use for benchmarking the original model (PyTorch Tensor) and the ONNX versions (numpy array).

In [None]:
from transformers import CodeGenTokenizerFast

tokenizer = CodeGenTokenizerFast.from_pretrained(model_id)

prompt = "def create_bar_chart_with_matplotlib():"

# Tokenized prompt as PyTorch tensors (used by the PyTorch benchmark).
model_inputs = tokenizer(prompt, return_tensors="pt")

# The same tensors copied to host memory as NumPy arrays, keyed by input name
# (used by the ONNX Runtime benchmarks).
inputs_onnx = dict(
    (input_name, pt_tensor.cpu().detach().numpy())
    for input_name, pt_tensor in model_inputs.items()
)

Benchmark PyTorch on GPU

In [None]:
from transformers import CodeGenModel

# (device string, label under which the timings are stored)
PROVIDERS = {
    ("cuda:0", "PyTorch GPU")
}

# Benchmark label -> OnnxInferenceResult; later cells add their own entries.
results = {}

for device, label in PROVIDERS:

    model_inputs_on_device = {
        arg_name: tensor.to(device)
        for arg_name, tensor in model_inputs.items()
    }
    runs_on_cuda = torch.device(device).type == "cuda"

    # NOTE(review): CodeGenModel is the base model, while the ONNX export was
    # requested with the "causal-lm" feature - confirm the two are comparable.
    model_pt = CodeGenModel.from_pretrained(model_id).to(device)

    # Inference only: skip autograd bookkeeping so it does not inflate timings.
    with torch.no_grad():
        for _ in trange(10, desc="Warming up"):
            model_pt(**model_inputs_on_device)
        if runs_on_cuda:
            # Drain the warm-up work before the first timed run starts.
            torch.cuda.synchronize(device)

        # Compute
        time_buffer = []
        for _ in trange(100, desc="Tracking inference time on PyTorch"):
            with track_infer_time(time_buffer):
                model_pt(**model_inputs_on_device)
                if runs_on_cuda:
                    # CUDA kernels are launched asynchronously; wait for them
                    # so the sample covers execution, not just the launch.
                    torch.cuda.synchronize(device)

    # Store the result
    results[label] = OnnxInferenceResult(
        time_buffer,
        None
    )

Benchmark the ONNX converted model on GPU.

In [None]:
# (execution provider name, label under which the timings are stored)
PROVIDERS = {
    ("CUDAExecutionProvider", "ONNX GPU"),
}

for provider, label in PROVIDERS:
    # NOTE: this rebinds `model`, which previously held the Hugging Face model.
    model = create_model_for_provider(onnx_model_path, provider)

    # Warm up with the same number of runs as the PyTorch benchmark so both
    # measurements start from a comparable state.
    for _ in trange(10, desc="Warming up"):
      model.run(None, inputs_onnx)

    time_buffer = []
    for _ in trange(100, desc=f"Tracking inference time on {provider}"):
      with track_infer_time(time_buffer):
          model.run(None, inputs_onnx)

    results[label] = OnnxInferenceResult(
      time_buffer,
      model.get_session_options().optimized_model_filepath
    )

Benchmark the ONNX-converted model on GPU using IO binding.

In [None]:
import numpy as np

# (execution provider name, label under which the timings are stored)
PROVIDERS = {
    ("CUDAExecutionProvider", "ONNX GPU IO Binding"),
}

for provider, label in PROVIDERS:
    model = create_model_for_provider(onnx_model_path, provider)
    io_binding = model.io_binding()

    # The raw pointers below are declared as CUDA int64 buffers, so make that
    # true explicitly instead of relying on where the tokenizer happened to
    # create the tensors. The dict also keeps the tensors alive for as long
    # as their pointers are bound.
    gpu_inputs = {
        input_name: model_inputs[input_name].to("cuda:0").long().contiguous()
        for input_name in ("input_ids", "attention_mask")
    }
    for input_name, gpu_tensor in gpu_inputs.items():
        io_binding.bind_input(
          name=input_name,
          device_type='cuda',
          device_id=0,
          element_type=np.int64,
          shape=tuple(gpu_tensor.shape),
          buffer_ptr=gpu_tensor.data_ptr(),
        )

    # Bind only the first model output and leave it on the GPU.
    output_name = model.get_outputs()[0].name
    io_binding.bind_output(output_name, 'cuda')

    # Warm up with the same number of runs as the PyTorch benchmark.
    for _ in trange(10, desc="Warming up"):
      model.run_with_iobinding(io_binding)
    io_binding.synchronize_outputs()

    time_buffer = []
    for _ in trange(100, desc=f"Tracking inference time on {provider}"):
      with track_infer_time(time_buffer):
          model.run_with_iobinding(io_binding)
          # Explicit sync so each sample includes completion of the GPU work.
          io_binding.synchronize_outputs()

    results[label] = OnnxInferenceResult(
      time_buffer,
      model.get_session_options().optimized_model_filepath
    )

### Compare Benchmark Results Visually

In [None]:
import plotly.express as px

# Compute average inference time and standard deviation, converted from
# seconds to milliseconds.
time_results = {k: np.mean(v.model_inference_time) * 1e3 for k, v in results.items()}
time_results_std = {k: np.std(v.model_inference_time) * 1e3 for k, v in results.items()}

# Materialize plain lists in one consistent order so that bars, colors and
# error bars line up.
provider_labels = list(time_results)
mean_ms = [time_results[name] for name in provider_labels]
std_ms = [time_results_std[name] for name in provider_labels]

fig = px.bar(x=provider_labels, y=mean_ms,
             # Show the spread that was computed above as error bars.
             error_y=std_ms,
             title="Average inference time (ms) for each provider",
             labels={'x':'Provider', 'y':'Avg Inference time (ms)'},
             color=mean_ms,
             color_continuous_scale=px.colors.sequential.Tealgrn,
             text_auto='.2s')
fig.show()

In [None]:
import pandas as pd

# One record per timed run, converted from seconds to milliseconds. Building
# the frame once from a list avoids the quadratic cost of appending rows
# one at a time with .loc.
records = [
    {'Provider': label, 'Inference_time': seconds * 1e3}
    for label, result in results.items()
    for seconds in result.model_inference_time
]
results_df = pd.DataFrame(records, columns=['Provider', 'Inference_time'])

fig = px.box(results_df, x="Provider", y="Inference_time",
             points="all",
             labels={'Provider':'Provider', 'Inference_time':'Inference durations (ms)'})
fig.show()