In [0]:
import sys
import os
# sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

In [0]:
import importlib
import subprocess
import sys
from utils.environment_specific import is_local_development

def install_if_missing(package_name, pip_name=None):
    """Import ``package_name`` and pip-install it if the import fails.

    Args:
        package_name: Importable module name used to probe for the package.
        pip_name: Distribution name handed to ``pip install``. Falls back to
            ``package_name`` when omitted (module and distribution names can
            differ, e.g. ``dotenv`` vs ``python-dotenv``).

    Raises:
        subprocess.CalledProcessError: If ``pip install`` exits with a
            non-zero status.
    """
    try:
        importlib.import_module(package_name)
    except ImportError:
        # Run pip through the current interpreter so the package is installed
        # into the environment this kernel is running in.
        subprocess.check_call([sys.executable, "-m", "pip", "install", pip_name or package_name])
        # The import system caches finder state; refresh it so the package
        # installed a moment ago can be imported without restarting the kernel.
        importlib.invalidate_caches()

# Outside local development, make sure python-dotenv (imported as `dotenv`)
# is available before the later cells import it.
if not is_local_development():
    install_if_missing("dotenv", "python-dotenv")
    # install_if_missing("onnxruntime")
    # install_if_missing("onnx")
    # install_if_missing("onnxoptimizer")

In [0]:
# !pip install -r requirements.txt
# !pip install -r requirements_gpu.txt
# Install the GPU requirements shipped with the quantization_with_transformers_deploy
# library under ./libraries (the path is relative to the notebook's working directory).
!pip install -r ./libraries/quantization_with_transformers_deploy/requirements_gpu.txt

In [0]:
# !pip install nvidia-pyindex
# !pip install nvidia-tensorrt

In [0]:
# Install the `pytorch-quantization` package from the tools/pytorch-quantization
# subdirectory of NVIDIA's TensorRT repository.
# NOTE(review): no tag or commit is pinned in the URL, so the installed revision can change between runs.
!pip3 install git+https://git@github.com/NVIDIA/TensorRT#egg=pytorch-quantization\&subdirectory=tools/pytorch-quantization/

Check the GPU is enabled and usable.

In [0]:
# Ask the NVIDIA driver which GPUs are visible to this machine.
! nvidia-smi

In [0]:
import random
import os
import argparse
import copy

from dotenv import load_dotenv
from torch.utils.data import DataLoader
import pandas as pd
import numpy as np
import torch
import mlflow
import onnxruntime as ort


from utils.dataset import URLDatasetPart, shorten_df_in_smart_way, get_df_by_folds_from_args
from utils.environment_specific import is_local_development
from utils.transformers import OnnxTransformerModelManager, TransformerModelManager, get_huggingface_model_checkpoint
from utils.experiments import setup_mlflow_client, log_dict_mlflow, get_file_size_mb, load_model_from_run
from utils.base_models import log_persistent_performance, OnnxModelManager
from utils.output import print_dict_level1_inline

In [0]:
import logging
import os
from collections import OrderedDict
from typing import Dict, List
from typing import OrderedDict as OD
from typing import Union
import onnx

import tensorrt as trt
from tensorrt import IExecutionContext, Logger, Runtime

import transformers
from transformers import (
    AutoModelForSequenceClassification,
)

from libraries.quantization_with_transformers_deploy.pytorch_utils import convert_to_onnx
from libraries.quantization_with_transformers_deploy.trt_utils import build_engine, get_binding_idxs, infer_tensorrt
from libraries.quantization_with_transformers_deploy.calibration_utils import QATCalibrate

In [0]:
from pytorch_quantization import nn as quant_nn

In [0]:
# Try the .env file in the current directory first; fall back to the one two
# directories up. The second lookup only happens when the first one fails.
loaded = load_dotenv(".env") or load_dotenv("../../.env")
assert loaded is True

In [0]:
# Process-wide switches read by the tokenizers library and by NCCL.
os.environ["TOKENIZERS_PARALLELISM"] = "true"
os.environ["NCCL_SHM_DISABLE"] = "1"
if is_local_development():
    # The message used to be printed in the Databricks branch, so the remote
    # output claimed both environments were running.
    print("Local development is running")
    # No Spark session locally. NOTE(review): the other branch never assigns
    # `spark`, so it presumably comes from the Databricks runtime; confirm.
    spark = None
else:
    print("Databricks is running")
    # The remote run needs a GPU for both PyTorch and ONNX Runtime.
    assert torch.cuda.is_available(), "GPU is not available!"
    assert "CUDAExecutionProvider" in ort.get_available_providers()

In [0]:
# Raise pandas' display limits: up to 120 characters per column and up to
# 800 rows are shown before output is truncated.
# Optional: uncomment the `warnings` lines below to silence FutureWarning and
# the `_distutils_hack` UserWarning if they clutter the output.
# import warnings
pd.set_option("display.max_colwidth", 120)
pd.set_option("display.max_rows", 800)
# warnings.filterwarnings("ignore", category=FutureWarning)
# warnings.filterwarnings("ignore", category=UserWarning, module="_distutils_hack")

In [0]:
# Per-architecture settings, looked up by `args.model_type`.
# `enough_samples_gpu` is forwarded to `measure_performance` as
# `enough_samples_to_measure` when timing the TensorRT model. The remaining
# keys are not read in the cells visible here.
MODEL_CONFIGS = dict(
    bert_tiny=dict(
        num_attention_heads=2,
        encoder_layers=2,
        hidden_size=128,
        enough_samples_gpu=60000,
    ),
    bert_mini=dict(
        num_attention_heads=4,
        encoder_layers=4,
        hidden_size=256,
        enough_samples_gpu=30000,
    ),
    bert_small=dict(
        num_attention_heads=8,
        encoder_layers=4,
        hidden_size=512,
        enough_samples_gpu=10000,
    ),
)

In [0]:
# Configure MLflow through the project helper (utils.experiments) before any
# run is started further down.
setup_mlflow_client()

In [0]:
# Directory the exported ONNX files (`static-ptq.onnx`, `qat.onnx`) are written to.
# NOTE(review): nothing in the visible cells creates this directory; confirm it
# exists (or that the exporter creates it) before running the export functions.
working_dir = "./working_dir"

Set logging to the `error` level to keep this notebook readable on GitHub.

In [0]:
# Only ERROR and above from the root logger and from transformers.
log_level = logging.ERROR
logging.getLogger().setLevel(log_level)
transformers.utils.logging.set_verbosity(log_level)
transformers.utils.logging.enable_default_handler()
transformers.utils.logging.enable_explicit_format()
# TensorRT takes its own logger object instead of using the `logging` module.
trt_logger: Logger = trt.Logger(trt.Logger.ERROR)
# NOTE(review): this repeats the transformers verbosity already set above.
transformers.logging.set_verbosity_error()
# Module-level TensorRT state.
# NOTE(review): get_trt_model() creates its own local logger, runtime and
# profile index, and `timings` is not read in the visible cells, so these
# globals look unused.
timings: Dict[str, List[float]] = dict()
runtime: Runtime = trt.Runtime(trt_logger)
profile_index = 0

In [0]:
def get_model_size_stats(path):
    """Summarise an ONNX file by on-disk size and initializer element count.

    Args:
        path: Location of the ONNX model file.

    Returns:
        dict with two entries: ``onnx_model_size_mb`` (file size as reported
        by ``get_file_size_mb``, rounded to two decimals) and
        ``total_parameters`` (sum of the element counts of all graph
        initializers).
    """
    file_size_mb = get_file_size_mb(path)
    graph = onnx.load(path).graph

    # An initializer's element count is the product of its dimensions.
    parameter_count = 0
    for initializer in graph.initializer:
        parameter_count += int(np.prod(initializer.dims))

    stats = {}
    stats["onnx_model_size_mb"] = round(file_size_mb, 2)
    stats["total_parameters"] = parameter_count
    return stats

In [0]:
def get_calibrated_model(percentile, base_model_manager: TransformerModelManager, train_dataset, calib_length = 1000):
    """Build a quantization-enabled copy of the base model and calibrate it.

    A new manager is created from a deep copy of the base manager's args
    inside a ``QATCalibrate`` context. It receives the base model's weights
    and tokenizer, then runs forward passes (without gradients) over
    ``train_dataset``.

    Args:
        percentile: Percentile passed to the histogram calibrator.
        base_model_manager: Manager holding the trained, unquantized model.
        train_dataset: Dataset the calibration batches are drawn from, in order.
        calib_length: Calibration stops after the first batch that brings the
            number of processed samples to at least this value.

    Returns:
        The newly created, calibrated model manager.
    """
    calibration_args = copy.deepcopy(base_model_manager.args)
    print(f"percentile: {percentile}")
    with QATCalibrate(method="histogram", percentile=percentile) as calibrator:
        # Fresh manager that shares the weights and tokenizer of the base one.
        model_q_manager = TransformerModelManager.from_args(calibration_args)
        base_weights = base_model_manager.model.state_dict()
        model_q_manager.model.load_state_dict(base_weights)
        model_q_manager.tokenizer = base_model_manager.tokenizer
        calibrator.setup_model_qat(model_q_manager.model)

        calibration_loader = DataLoader(
            train_dataset,
            batch_size=model_q_manager.args.batch_size,
            shuffle=False,
            collate_fn=model_q_manager.prepare_batch,
        )
        samples_seen = 0
        with torch.no_grad():
            for batch in calibration_loader:
                # The collate function yields (inputs, labels); only inputs are needed.
                device_inputs = model_q_manager.get_moved_batch_inputs_to_device(batch_input=batch[0])
                model_q_manager.forward(device_inputs)
                samples_seen += device_inputs["input_ids"].shape[0]
                if samples_seen >= calib_length:
                    break
    return model_q_manager

In [0]:
def do_percentile_evaluation(base_model_manager: TransformerModelManager, train_dataset, eval_dataset):
    """Calibrate with several histogram percentiles and evaluate each result.

    For every candidate percentile a calibrated model is built from
    ``train_dataset`` and evaluated on ``eval_dataset``. The metrics are
    logged under the ``percentile_<value>/`` prefix and printed.

    Args:
        base_model_manager: Manager holding the trained, unquantized model.
        train_dataset: Dataset used for calibration.
        eval_dataset: Dataset used for evaluation.
    """
    for percentile in (99.9, 99.99, 99.999, 99.9999):
        calibrated_manager = get_calibrated_model(percentile, base_model_manager, train_dataset)
        # TODO
        loader = DataLoader(
            eval_dataset,
            batch_size=calibrated_manager.args.batch_size,
            shuffle=False,
            collate_fn=calibrated_manager.prepare_batch,
        )
        evaluation = calibrated_manager.evaluate(data_loader=loader)
        metrics, output, best_threshold_metrics = evaluation
        log_persistent_performance(
            metrics=metrics,
            best_threshold_metrics=best_threshold_metrics,
            true_labels=output["true_labels"],
            class_probabilities=output["class_probabilities"],
            predictions=output["predictions"],
            prefix=f"percentile_{percentile}/",
            # store_predictions=True,
        )
        print_dict_level1_inline(metrics)

In [0]:
# def keep_only_operations(model_q_manager: TransformerModelManager, ops_to_enable):
#     for name, module in model_q_manager.model.named_modules():
#         if isinstance(module, quant_nn.TensorQuantizer):
#             if any(op in name for op in ops_to_enable):
#                 module.enable_quant()
#             else:
#                 module.disable_quant()

In [0]:
def convert_quantized_model_to_onnx(quantized_model_manager: TransformerModelManager, train_dataset, output_path):
    """Export the quantized model to ONNX using the first training batch as sample input.

    Args:
        quantized_model_manager: Manager whose ``model`` is exported. Its
            ``prepare_batch`` and ``get_moved_batch_inputs_to_device`` are
            used to build the sample input.
        train_dataset: Dataset the sample batch is taken from (first batch,
            unshuffled).
        output_path: Destination of the ONNX file.

    Raises:
        ValueError: If ``train_dataset`` yields no batch. Previously the
            function returned without exporting anything, so callers went on
            to log a missing or stale file at ``output_path``.
    """
    train_loader = DataLoader(
        train_dataset,
        batch_size=quantized_model_manager.args.batch_size,
        shuffle=False,
        collate_fn=quantized_model_manager.prepare_batch,
    )
    # Only one batch is needed for the export.
    first_batch = next(iter(train_loader), None)
    if first_batch is None:
        raise ValueError("train_dataset is empty: a sample batch is required to export the model to ONNX")

    # The collate function yields (inputs, labels); only inputs are needed.
    inputs = quantized_model_manager.get_moved_batch_inputs_to_device(batch_input=first_batch[0])
    convert_to_onnx(
        model_pytorch=quantized_model_manager.model,
        output_path=output_path,
        output_names=["logits"],
        inputs_pytorch=inputs,
        quantization=True,
        var_output_seq=False,
    )

In [0]:
# def do_operation_evaluation(model_q_manager: TransformerModelManager, eval_dataset):
#     for op in ["matmul", "add", "layernorm"]:
#         keep_only_operations(model_q_manager=model_q_manager, ops_to_enable=[op])
#         eval_data_loader = DataLoader(
#             eval_dataset,
#             batch_size=model_q_manager.args.batch_size,
#             shuffle=False,
#             collate_fn=model_q_manager.prepare_batch,
#         )
#         metrics, output, best_threshold_metrics = model_q_manager.evaluate(data_loader=eval_data_loader)
#         log_persistent_performance(
#             metrics=metrics,
#             best_threshold_metrics=best_threshold_metrics,
#             true_labels=output["true_labels"],
#             class_probabilities=output["class_probabilities"],
#             predictions=output["predictions"],
#             prefix=f"operation_{op}/",
#             # store_predictions=True,
#         )
#         print_dict_level1_inline(metrics)


In [0]:
def do_static_quant(base_model_manager, train_dataset, eval_dataset, quant_config):
    """Run static post-training quantization: calibrate, evaluate, export, log.

    Args:
        base_model_manager: Manager holding the trained, unquantized model.
        train_dataset: Dataset used for calibration and for the ONNX export sample.
        eval_dataset: Dataset the calibrated model is evaluated on.
        quant_config: dict; ``chosen_percentile`` selects the calibration
            percentile. The whole dict is logged to MLflow.

    Returns:
        The calibrated model manager.
    """
    output_folder = "static-ptq"
    quantized_manager = get_calibrated_model(
        percentile=quant_config["chosen_percentile"],
        base_model_manager=base_model_manager,
        train_dataset=train_dataset,
    )

    # keep_only_operations(quantized_manager)

    # Evaluate the calibrated model and persist its metrics.
    evaluation_loader = DataLoader(
        eval_dataset,
        batch_size=quantized_manager.args.batch_size,
        shuffle=False,
        collate_fn=quantized_manager.prepare_batch,
    )
    metrics, output, best_threshold_metrics = quantized_manager.evaluate(data_loader=evaluation_loader)
    log_persistent_performance(
        metrics=metrics,
        best_threshold_metrics=best_threshold_metrics,
        true_labels=output["true_labels"],
        class_probabilities=output["class_probabilities"],
        predictions=output["predictions"],
        prefix=f"{output_folder}/",
        # store_predictions=True,
    )
    print_dict_level1_inline(metrics)
    log_dict_mlflow(quant_config, f"{output_folder}/quant_config.json")

    # Export the PTQ model to ONNX and log both the file and its size statistics.
    onnx_path = f"{working_dir}/{output_folder}.onnx"
    convert_quantized_model_to_onnx(
        quantized_model_manager=quantized_manager,
        train_dataset=train_dataset,
        output_path=onnx_path,
    )
    mlflow.log_artifact(onnx_path, artifact_path=output_folder)
    size_stats = get_model_size_stats(onnx_path)
    print(size_stats)
    log_dict_mlflow(size_stats, f"{output_folder}/model_size_stats.json")
    return quantized_manager

In [0]:
def do_qat(model_q_manager: TransformerModelManager, train_dataset, eval_dataset):
    """Run one epoch of quantization-aware training, export to ONNX and evaluate.

    Side effects:
        * Overwrites ``classifier_learning_rate`` and ``bert_learning_rate``
          on ``model_q_manager.args`` in place.
        * Writes ``{working_dir}/qat.onnx`` and logs it, the size statistics
          and the evaluation metrics through MLflow under the ``qat`` prefix.

    Args:
        model_q_manager: Calibrated, quantization-enabled model manager.
        train_dataset: Dataset used for the training epoch and the export sample.
        eval_dataset: Dataset the fine-tuned model is evaluated on.

    Returns:
        Path of the exported ONNX file.
    """
    # Very small learning rates for the fine-tuning epoch.
    model_q_manager.args.classifier_learning_rate = 2e-7
    model_q_manager.args.bert_learning_rate = 2e-7
    # model_q_manager.args.epochs_max = 1
    train_data_loader = DataLoader(
        train_dataset,
        batch_size=model_q_manager.args.batch_size,
        shuffle=True,
        collate_fn=model_q_manager.prepare_batch,
    )
    output_folder = "qat"
    output_path = f"{working_dir}/{output_folder}.onnx"
    model_q_manager.train_epoch(epoch=0, train_data_loader=train_data_loader)
    model_q_manager.run_before_eval()
    convert_quantized_model_to_onnx(quantized_model_manager=model_q_manager, train_dataset=train_dataset, output_path=output_path)
    mlflow.log_artifact(output_path, artifact_path=output_folder)
    model_size_stats = get_model_size_stats(output_path)
    print(model_size_stats)
    log_dict_mlflow(model_size_stats, f"{output_folder}/model_size_stats.json")
    # Evaluation data is iterated in dataset order, like every other evaluation
    # loader in this notebook. Shuffling here only made the order of the
    # returned predictions non-deterministic.
    eval_data_loader = DataLoader(
        eval_dataset,
        batch_size=model_q_manager.args.batch_size,
        shuffle=False,
        collate_fn=model_q_manager.prepare_batch,
    )

    metrics, output, best_threshold_metrics = model_q_manager.evaluate(data_loader=eval_data_loader)
    log_persistent_performance(
        metrics=metrics,
        best_threshold_metrics=best_threshold_metrics,
        true_labels=output["true_labels"],
        class_probabilities=output["class_probabilities"],
        predictions=output["predictions"],
        prefix=f"{output_folder}/",
        # store_predictions=True,
    )
    print_dict_level1_inline(metrics)

    return output_path

In [0]:
def print_model_graph(model):
    """Print the name of every entry yielded by ``model.named_modules()``, one per line."""
    for module_name, _ in model.named_modules():
        print(module_name)

# Evaluate on TensorRT

In [0]:
class TensorrtTransformerModelManager(TransformerModelManager):
    """Model manager whose "model" is a TensorRT execution context.

    The base class receives the bound ``infer_trt`` method as its model, so
    calling ``self.model(inputs)`` runs inference through ``infer_tensorrt``.
    """

    def __init__(self, args, tokenizer, context, input_binding_idxs, output_binding_idxs):
        """Store the TensorRT handles and initialise the base manager.

        Args:
            args: Run arguments forwarded to the base class.
            tokenizer: Tokenizer used by ``prepare_batch``.
            context: TensorRT execution context used for inference.
            input_binding_idxs: Engine input binding indices.
            output_binding_idxs: Engine output binding indices.
        """
        # Kept on the instance for `infer_trt`, which is handed to the base
        # class as the model callable.
        self.context = context
        self.input_binding_idxs = input_binding_idxs
        self.output_binding_idxs = output_binding_idxs
        super().__init__(args, model=self.infer_trt, tokenizer=tokenizer, device=None, move_to_device=False)

    def run_before_eval(self):
        """Pre-evaluation hook; only prints a marker for this manager."""
        print("Running test")

    def forward(self, batch_input):
        """Run inference on ``batch_input`` and return the ``logits`` entry of the result."""
        return self.model(batch_input)["logits"]

    def infer_trt(self, inputs):
        """Delegate to ``infer_tensorrt`` with the stored context and binding indices."""
        trt_call_kwargs = dict(
            context=self.context,
            inputs=inputs,
            input_binding_idxs=self.input_binding_idxs,
            output_binding_idxs=self.output_binding_idxs,
        )
        return infer_tensorrt(**trt_call_kwargs)

    def prepare_batch(self, batch):
        """Collate ``(url, label)`` pairs into model inputs and a label tensor.

        Returns:
            ``(input_dict, labels_tensor)``: ``input_dict`` holds
            ``input_ids``, ``attention_mask`` and ``token_type_ids`` cast with
            ``.int()``; ``labels_tensor`` has dtype ``torch.long``.
        """
        urls, labels = zip(*batch)
        encoded = self.tokenizer(
            urls,
            padding=True,
            truncation=True,
            max_length=self.args.max_sequence_length,
            return_tensors="pt",
        )
        # NOTE(review): the int32 cast is presumably what the TensorRT
        # bindings expect; confirm against the engine inputs.
        input_dict = {
            key: encoded[key].int()
            for key in ("input_ids", "attention_mask", "token_type_ids")
        }
        labels_tensor = torch.tensor(labels, dtype=torch.long)
        return input_dict, labels_tensor

In [0]:
def get_trt_model(model_onnx_path, args, tokenizer):
    """Build an INT8 TensorRT engine from an ONNX file and wrap it in a model manager.

    Args:
        model_onnx_path: Path of the ONNX model the engine is built from.
        args: Run arguments; ``batch_size`` and ``max_sequence_length`` define
            the optimal and maximal input shape of the engine.
        tokenizer: Tokenizer handed to the resulting manager.

    Returns:
        A ``TensorrtTransformerModelManager`` bound to a fresh execution context.
    """
    # NOTE(review): a VERBOSE logger is created here although the module-level
    # TensorRT logger is set to ERROR. Confirm this is intended.
    verbose_logger = trt.Logger(trt.Logger.VERBOSE)
    engine_runtime = trt.Runtime(verbose_logger)
    profile_idx = 0

    # The optimal and maximal shapes are the same full-size batch; the
    # minimal shape is a single token of a single sample.
    full_shape = (args.batch_size, args.max_sequence_length)
    engine = build_engine(
        runtime=engine_runtime,
        onnx_file_path=model_onnx_path,
        logger=verbose_logger,
        # min_shape=(1, args.max_sequence_length),
        min_shape=(1, 1),
        optimal_shape=full_shape,
        max_shape=full_shape,
        workspace_size=4000 * 1024 * 1024,
        fp16=False,
        int8=True,
    )

    execution_context = engine.create_execution_context()
    execution_context.set_optimization_profile_async(
        profile_index=profile_idx,
        stream_handle=torch.cuda.current_stream().cuda_stream,
    )
    input_idxs, output_idxs = get_binding_idxs(engine, profile_idx)

    return TensorrtTransformerModelManager(
        args=args,
        tokenizer=tokenizer,
        context=execution_context,
        input_binding_idxs=input_idxs,
        output_binding_idxs=output_idxs,
    )

In [0]:
def do_trt_eval(trt_model: TensorrtTransformerModelManager, eval_dataset, output_name="txt"):
    """Evaluate a TensorRT-backed manager and log quality and speed results.

    Args:
        trt_model: Manager wrapping the TensorRT engine.
        eval_dataset: Dataset used for both evaluation and speed measurement.
        output_name: Prefix the results are logged under.
            NOTE(review): the default "txt" may be a typo for "trt"; it is
            left unchanged because callers rely on the default.
    """
    loader = DataLoader(
        eval_dataset,
        batch_size=trt_model.args.batch_size,
        shuffle=False,
        collate_fn=trt_model.prepare_batch,
    )

    # Prediction quality.
    evaluation = trt_model.evaluate(data_loader=loader)
    metrics, output, best_threshold_metrics = evaluation
    print_dict_level1_inline(metrics)
    log_persistent_performance(
        metrics=metrics,
        best_threshold_metrics=best_threshold_metrics,
        true_labels=output["true_labels"],
        class_probabilities=output["class_probabilities"],
        predictions=output["predictions"],
        prefix=f"{output_name}/",
        # store_predictions=True,
    )

    # Inference speed; the sample budget depends on the model type.
    sample_budget = MODEL_CONFIGS[trt_model.args.model_type]["enough_samples_gpu"]
    perf = trt_model.measure_performance(
        data_loader=loader,
        enough_samples_to_measure=sample_budget,
        log_to_mlflow=False,
    )
    print_dict_level1_inline(perf)
    log_dict_mlflow(perf, f"{output_name}/performance.json")

In [0]:

from mlflow import artifacts

def eval_speed(model_run_id, qat_run_id):
    """Evaluate a previously exported QAT ONNX model through TensorRT.

    Args, tokenizer and dataset settings come from the run ``model_run_id``.
    The ONNX file is downloaded from the ``qat/qat.onnx`` artifact of the run
    ``qat_run_id``. Results are logged under the prefix
    ``<model_type>_<dataset_name>_<qat_run_id>``.

    Side effects:
        Sets ``shorten_to_train`` to ``None`` and ``shorten_to_eval`` to
        ``"20000u"`` on the loaded manager's ``args``. Relies on the
        notebook-level ``spark`` and ``working_dir`` variables.

    Args:
        model_run_id: MLflow run id of the base model.
        qat_run_id: MLflow run id holding the ``qat/qat.onnx`` artifact.
    """
    base_model_manager = load_model_from_run(model_run_id)
    args = base_model_manager.args
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    # in case any standard library uses some random function
    random.seed(args.seed)
    # The previous code reset both fields to None and then applied
    # `or "20000u"`; these are the values that resulted.
    args.shorten_to_train = None
    args.shorten_to_eval = "20000u"

    # Only the evaluation split is used here.
    _, full_eval_df = get_df_by_folds_from_args(args, spark)

    # Use the location reported by MLflow rather than a hand-built path.
    onnx_file = artifacts.download_artifacts(
        run_id=qat_run_id,
        artifact_path="qat/qat.onnx",
        dst_path=working_dir,
    )

    full_eval_dataset = URLDatasetPart.from_pandas(full_eval_df)
    trt_module = get_trt_model(onnx_file, args, tokenizer=base_model_manager.tokenizer)
    do_trt_eval(trt_module, eval_dataset=full_eval_dataset, output_name=f"{args.model_type}_{args.dataset_name}_{qat_run_id}")


- Loads the ONNX file, parameters and tokenizer from a Databricks run.
- Experimental: this can be run only once; after that it has consumed all the memory and cannot be run again. TODO: figure out why.

In [0]:
def do_param_search_run(model_manager, spark):
    """Search calibration percentiles for static quantization inside one MLflow run.

    Seeds numpy, torch and ``random`` from ``args.seed``, sets
    ``shorten_to_train`` to ``"100000u"`` and ``shorten_to_eval`` to ``None``
    on ``model_manager.args``, loads the data folds, and evaluates every
    candidate percentile.

    Args:
        model_manager: Manager holding the trained, unquantized model.
        spark: Spark session (or ``None``) forwarded to the data loader helper.
    """
    args = model_manager.args
    # Seed numpy, torch and the standard library RNG (in case any standard
    # library code uses a random function).
    for seed_rng in (np.random.seed, torch.manual_seed, random.seed):
        seed_rng(args.seed)
    args.shorten_to_eval = None
    args.shorten_to_train = "100000u"
    # args.shorten_to_eval = args.shorten_to_eval or "40000u"

    train_df, eval_df = get_df_by_folds_from_args(args, spark)
    param_train_dataset = URLDatasetPart.from_pandas(train_df)
    param_eval_dataset = URLDatasetPart.from_pandas(eval_df)

    run_name = f"Quantization-static-params-{args.model_type}-{args.dataset_name}"
    with mlflow.start_run(run_name=run_name):
        do_percentile_evaluation(
            base_model_manager=model_manager,
            train_dataset=param_train_dataset,
            eval_dataset=param_eval_dataset,
        )
        # do_baseline_evaluation(model_manager=model_manager, eval_dataset=param_eval_dataset)
        chosen_percentile = 99.9
        # NOTE(review): the calibrated model is only consumed by the
        # commented-out operation evaluation below.
        model_q_manager = get_calibrated_model(
            percentile=chosen_percentile,
            base_model_manager=model_manager,
            train_dataset=param_train_dataset,
        )
        # do_operation_evaluation(model_q_manager=model_q_manager, eval_dataset=param_eval_dataset)
        
        
def do_run(base_model_manager, spark, should_do_trt_eval=True):
    """Run the quantization pipeline inside one MLflow run.

    Steps: static PTQ (calibration, evaluation, ONNX export), one epoch of
    QAT (with ONNX export and evaluation) and, optionally, evaluation of the
    QAT model through TensorRT.

    Args:
        base_model_manager: Manager holding the trained, unquantized model.
            Its ``args`` are modified: ``shorten_to_train`` becomes
            ``"100000u"`` and ``shorten_to_eval`` becomes ``None``.
        spark: Spark session (or ``None``) forwarded to the data loader helper.
        should_do_trt_eval: When true, build a TensorRT engine from the QAT
            ONNX file and evaluate it.
    """
    args = base_model_manager.args
    # Seed numpy, torch and the standard library RNG (in case any standard
    # library code uses a random function).
    for seed_rng in (np.random.seed, torch.manual_seed, random.seed):
        seed_rng(args.seed)
    args.shorten_to_eval = None
    args.shorten_to_train = "100000u"
    # args.shorten_to_eval = args.shorten_to_eval or "4000u"

    train_df, eval_df = get_df_by_folds_from_args(args, spark)
    full_train_dataset = URLDatasetPart.from_pandas(train_df)
    full_eval_dataset = URLDatasetPart.from_pandas(eval_df)

    quant_config = {"chosen_percentile": 99.9}
    run_name = f"Quantization-static-params-{args.model_type}-{args.dataset_name}"
    with mlflow.start_run(run_name=run_name):
        # do_baseline_evaluation(model_manager=base_model_manager, eval_dataset=full_eval_dataset)
        # print_model_graph(base_model_manager.model)
        model_q_manager = do_static_quant(
            base_model_manager=base_model_manager,
            train_dataset=full_train_dataset,
            eval_dataset=full_eval_dataset,
            quant_config=quant_config,
        )
        print_model_graph(model_q_manager.model)
        onnx_file = do_qat(
            model_q_manager=model_q_manager,
            train_dataset=full_train_dataset,
            eval_dataset=full_eval_dataset,
        )

        # Optionally load the QAT model into TensorRT and evaluate both
        # prediction quality and speed.
        if should_do_trt_eval:
            trt_module = get_trt_model(onnx_file, args, tokenizer=model_q_manager.tokenizer)
            do_trt_eval(trt_module, eval_dataset=full_eval_dataset)

# onnx_baseline_model_path = os.path.join(working_dir, 'onnx_baseline.onnx')
# global_torch_model_manager.create_onnx_file(onnx_baseline_model_path)

# Runs

In [0]:
# load_run_id = "14aa0dcdb9344606a6b84778c9bcd69a"
# global_torch_manager = load_model_from_run(load_run_id)
# do_param_search_run(global_torch_manager, spark)

In [0]:
# Static PTQ + QAT for the model stored in this MLflow run; the TensorRT
# evaluation is skipped.
load_run_id = "94c1fa2098824e10be4fad32399fbc2b"
base_model_manager = load_model_from_run(load_run_id)
# `do_run` has no `do_trt_eval` parameter (that is the name of a function);
# the flag is `should_do_trt_eval`. The old keyword raised a TypeError.
do_run(base_model_manager, spark, should_do_trt_eval=False)

In [0]:
# Full pipeline for the model stored in this MLflow run: static PTQ, QAT and
# the TensorRT evaluation (`should_do_trt_eval` defaults to True).
load_run_id = "bf2a1ff5fe234761bc77c5cfa1077998"
global_torch_manager = load_model_from_run(load_run_id)
do_run(global_torch_manager, spark)

In [0]:
# load_run_id = "14aa0dcdb9344606a6b84778c9bcd69a"
# global_torch_manager = load_model_from_run(load_run_id)
# do_run(global_torch_manager, spark, should_do_trt_eval=False)

In [0]:
# load_run_id = "11d9a273d9204e928736c5482ada67b2"
# global_torch_manager.args.dataset_name = "private_data"
# do_run(global_torch_manager, spark)