In [None]:
import importlib
import subprocess
import sys
from utils.environment_specific import is_local_development

def install_if_missing(package_name, pip_name=None):
    """Import ``package_name``; if that fails, pip-install it into the running interpreter.

    Args:
        package_name: Importable module name (e.g. ``"dotenv"``).
        pip_name: Requirement string handed to ``pip install`` (e.g.
            ``"python-dotenv"`` or a pinned ``"onnx==1.16.2"``). Falls back to
            ``package_name`` when omitted.

    Raises:
        subprocess.CalledProcessError: If ``pip install`` exits with a non-zero status.
    """
    try:
        importlib.import_module(package_name)
    except ImportError:
        requirement = pip_name or package_name
        # Run pip through the kernel's own interpreter so the package lands in
        # the environment this notebook is actually executing in.
        subprocess.check_call([sys.executable, "-m", "pip", "install", requirement])
        # Finders cache directory contents; drop those caches so the freshly
        # installed package is importable without restarting the kernel.
        importlib.invalidate_caches()

# Outside local development, make sure the runtime dependencies are present.
# Each entry is (import name, pip requirement); they are processed in this order.
if not is_local_development():
    for module_name, requirement in (
        ("dotenv", "python-dotenv"),
        ("onnx", "onnx==1.16.2"),
        ("onnxruntime", "onnxruntime-gpu==1.17.0"),
    ):
        install_if_missing(module_name, requirement)

# Must not be installed together with onnxruntime-transformers: the GPU is then not detected on Databricks.
# %pip install --upgrade --force-reinstall --ignore-installed onnxruntime-gpu==1.17.0

In [None]:
import random
import os

from dotenv import load_dotenv
from torch.utils.data import DataLoader
import pandas as pd
import numpy as np
import torch
import mlflow
import onnx
import onnxruntime as ort

from utils.dataset import URLDatasetPart, shorten_df_in_smart_way, get_df_by_folds_from_args
from utils.environment_specific import is_local_development
from utils.transformers import OnnxTransformerModelManager
from utils.experiments import setup_mlflow_client, log_dict_mlflow, get_file_size_mb, load_model_from_run
from utils.base_models import log_persistent_performance, OnnxModelManager
from utils.output import print_dict_level1_inline

In [None]:
# Load environment variables from a .env file: try the current directory first,
# then fall back to the file two directories up. Stop here if neither was loaded.
loaded = load_dotenv(".env") or load_dotenv("../../.env")
assert loaded is True

In [None]:
# Process-wide environment flags, set before any model/tokenizer work happens.
os.environ["TOKENIZERS_PARALLELISM"] = "true"
os.environ["NCCL_SHM_DISABLE"] = "1"
if is_local_development():
    print("Local development is running")
    # `spark` is handed to `do_run(spark)` later; locally there is no session.
    # NOTE(review): on Databricks `spark` is never assigned in this notebook, so it is
    # presumably injected by the runtime - confirm.
    spark = None
else:
    print("Databricks is running")
    # Fail fast: the GPU parts of the notebook need both CUDA in torch and the
    # CUDA execution provider in ONNX Runtime.
    assert torch.cuda.is_available(), "GPU is not available!"
    assert "CUDAExecutionProvider" in ort.get_available_providers(), (
        "CUDAExecutionProvider is not available in onnxruntime!"
    )

- threads, parallelism env, ...

In [None]:
# TODO: check if useful
# num_threads = os.cpu_count()
# torch.set_num_threads(num_threads)
# if not "called" in locals():
#     called = True
#     torch.set_num_interop_threads(num_threads)
# print(f"Using {num_threads} CPU threads.")
# Set TOKENIZERS_PARALLELISM and NCCL_SHM_DISABLE for this process, then report which
# environment the notebook runs in (outside local development a CUDA device is required).
# NOTE(review): this repeats the setup done in an earlier cell of this notebook.
os.environ.update({"TOKENIZERS_PARALLELISM": "true", "NCCL_SHM_DISABLE": "1"})
if is_local_development():
    print("Local development is running")
else:
    print("Databricks is running")
    assert torch.cuda.is_available(), "GPU is not available!"



In [None]:
# Pandas display limits used when frames are shown in this notebook.
pd.options.display.max_colwidth = 120
pd.options.display.max_rows = 800

# Optional: if the version / deprecation warnings get noisy, uncomment the lines below.
# import warnings
# warnings.filterwarnings("ignore", category=FutureWarning)
# warnings.filterwarnings("ignore", category=UserWarning, module="_distutils_hack")

# Dataset load

In [None]:
# Per-architecture settings, keyed by model type (looked up with `global_args.model_type`
# or an explicit `model_type` argument):
#   num_attention_heads, hidden_size       -> passed to `optimize_model` as num_heads / hidden_size
#   enough_samples_cpu, enough_samples_gpu -> sample budget handed to `measure_performance`
#   encoder_layers                         -> informational; not read elsewhere in this notebook
MODEL_CONFIGS = {
    "bert_tiny": dict(
        num_attention_heads=2,
        encoder_layers=2,
        hidden_size=128,
        enough_samples_cpu=10000,
        enough_samples_gpu=60000,
    ),
    "bert_mini": dict(
        num_attention_heads=4,
        encoder_layers=4,
        hidden_size=256,
        enough_samples_cpu=5000,
        enough_samples_gpu=30000,
    ),
    "bert_small": dict(
        num_attention_heads=8,
        encoder_layers=4,
        hidden_size=512,
        enough_samples_cpu=1000,
        enough_samples_gpu=10000,
    ),
}

In [None]:
# Project helper imported from utils.experiments; called once, before any MLflow run is
# started further down. NOTE(review): presumably configures tracking URI / experiment -
# confirm in utils.experiments.
setup_mlflow_client()

In [None]:
# Directory that receives every ONNX file produced in this notebook
# (baseline export, optimized, float16 and int8 variants).
working_dir = "./working_dir"
# Create it up front so exporting / saving models does not fail on a fresh checkout.
os.makedirs(working_dir, exist_ok=True)

# Notebook-wide state; filled in by the "load run" cells before `do_run` is called
# and read by the helper functions defined below.
global_tokenizer = None
global_args = None
global_torch_model = None

In [None]:
def get_model_size_stats(session):
    """Report file size and parameter count of the ONNX model behind ``session``.

    Args:
        session: Object exposing ``_model_path``, the path of the ONNX file.

    Returns:
        dict with ``onnx_model_size_mb`` (result of ``get_file_size_mb`` rounded to
        two decimals) and ``total_parameters`` (sum of the element counts of all
        graph initializers).
    """
    model_path = session._model_path
    file_size_mb = get_file_size_mb(model_path)

    graph = onnx.load(model_path).graph
    # The element count of an initializer is the product of its dims.
    parameter_count = 0
    for initializer in graph.initializer:
        parameter_count += int(np.prod(initializer.dims))

    return {
        "onnx_model_size_mb": round(file_size_mb, 2),
        "total_parameters": parameter_count,
    }


def get_enough_samples_perf(session):
    """Pick the performance-measurement sample budget for ``session``.

    Returns the ``enough_samples_gpu`` entry of the current model type's config when
    ``"CUDAExecutionProvider"`` is among ``session.get_providers()``, otherwise the
    ``enough_samples_cpu`` entry. The model type comes from ``global_args.model_type``.
    """
    runs_on_cuda = "CUDAExecutionProvider" in session.get_providers()
    budget_key = "enough_samples_gpu" if runs_on_cuda else "enough_samples_cpu"
    return MODEL_CONFIGS[global_args.model_type][budget_key]


def evaluate_metrics_and_performance(
    session: OnnxModelManager,
    dataset,
    output_folder: str,
    params_to_log,
    measure_performance=False,
    evaluate=False,
    log_model_stats=True,
    log_model=False,
):
    """Optionally benchmark, size, upload and evaluate one ONNX session, logging under ``output_folder``.

    The session is wrapped in an ``OnnxTransformerModelManager`` together with the
    notebook-level ``global_tokenizer`` and ``global_args``. The enabled steps run in
    this order: performance, model size stats, model artifact, evaluation, config.

    Args:
        session: ONNX session to wrap. NOTE(review): annotated as ``OnnxModelManager``,
            but the code only uses ``_model_path`` / ``get_providers()`` on it - confirm
            the intended type.
        dataset: Dataset fed to the ``DataLoader`` (unshuffled, batch size from the manager's args).
        output_folder: Prefix for every logged file (``performance.json``,
            ``model_size_stats.json``, ``config.json``) and artifact.
        params_to_log: Dict logged as ``config.json``; skipped when ``None``.
        measure_performance: Run ``measure_performance`` and log its result.
        evaluate: Run ``evaluate`` and persist metrics via ``log_persistent_performance``.
        log_model_stats: Log the output of ``get_model_size_stats``.
        log_model: Upload the ONNX file itself as an MLflow artifact.

    Returns:
        ``(metrics, perf)``; each element stays ``None`` when its step was not enabled.
    """
    manager = OnnxTransformerModelManager(session, global_tokenizer, global_args)
    metrics = perf = None
    loader = DataLoader(
        dataset,
        batch_size=manager.args.batch_size,
        shuffle=False,
        collate_fn=manager.prepare_batch,
    )

    if measure_performance:
        perf = manager.measure_performance(
            data_loader=loader,
            enough_samples_to_measure=get_enough_samples_perf(session),
            log_to_mlflow=False,
        )
        print_dict_level1_inline(perf)
        log_dict_mlflow(perf, f"{output_folder}/performance.json")

    if log_model_stats:
        size_stats = get_model_size_stats(session)
        print(size_stats)
        log_dict_mlflow(size_stats, f"{output_folder}/model_size_stats.json")

    if log_model:
        mlflow.log_artifact(session._model_path, artifact_path=output_folder)

    if evaluate:
        metrics, output, best_threshold_metrics = manager.evaluate(data_loader=loader)
        log_persistent_performance(
            metrics=metrics,
            best_threshold_metrics=best_threshold_metrics,
            true_labels=output["true_labels"],
            class_probabilities=output["class_probabilities"],
            predictions=output["predictions"],
            prefix=f"{output_folder}/",
            # store_predictions=True,
        )
        print_dict_level1_inline(metrics)

    if params_to_log is not None:
        log_dict_mlflow(params_to_log, f"{output_folder}/config.json")

    return metrics, perf

# ONNX Baseline

- Purpose of this part:
    - Get baseline metrics (evaluated on GPU) and CPU performance scores on the limited dataset, to compare against dynamic quantization
    - Get GPU performance to check that it is similar to PyTorch

In [None]:
def do_baseline_evaluation(onnx_baseline_model_path, cpu_eval_dataset, full_eval_dataset):
    """Benchmark and evaluate the unmodified ONNX export as the reference point.

    Three passes are made over the baseline model:
      1. CPU session, CPU dataset: performance only  -> logged under ``onnx_baseline_cpu``
      2. GPU session, CPU dataset: metrics only      -> logged under ``onnx_baseline_cpu``
      3. GPU session, full dataset: performance only -> logged under ``onnx_baseline_gpu``

    Args:
        onnx_baseline_model_path: Path of the baseline ONNX file.
        cpu_eval_dataset: Reduced evaluation dataset used for the CPU comparison.
        full_eval_dataset: Full evaluation dataset used for the GPU benchmark.

    Returns:
        dict with keys ``"cpu_perf"``, ``"cpu_metrics"`` and ``"gpu_perf"``.
    """
    # NOTE: evaluate_metrics_and_performance returns (metrics, perf), in that order.

    # CPU performance
    cpu_session = OnnxTransformerModelManager.get_cpu_session(onnx_baseline_model_path)
    _, baseline_cpu_perf = evaluate_metrics_and_performance(
        cpu_session, cpu_eval_dataset, output_folder="onnx_baseline_cpu", params_to_log=None, measure_performance=True
    )

    # We want the metric comparison on the CPU dataset, but it can be computed on GPU to
    # do it faster. Results are stored inside the same folder (model stats were already
    # logged by the CPU pass, hence log_model_stats=False).
    gpu_session = OnnxTransformerModelManager.get_gpu_session(onnx_baseline_model_path)
    baseline_cpu_metrics, _ = evaluate_metrics_and_performance(
        gpu_session, cpu_eval_dataset, output_folder="onnx_baseline_cpu", params_to_log=None, evaluate=True, log_model_stats=False
    )

    # GPU performance on the full dataset, measured with a fresh session as before.
    gpu_session = OnnxTransformerModelManager.get_gpu_session(onnx_baseline_model_path)
    _, baseline_gpu_perf = evaluate_metrics_and_performance(
        gpu_session, full_eval_dataset, output_folder="onnx_baseline_gpu", params_to_log=None, measure_performance=True
    )

    return {
        "cpu_perf": baseline_cpu_perf,
        "cpu_metrics": baseline_cpu_metrics,
        "gpu_perf": baseline_gpu_perf,
    }

# Optimized model

In [None]:
from onnxruntime.transformers.optimizer import optimize_model

In [None]:
def do_optimized_model_part(baseline_model_path, full_eval_dataset, model_type=None):
    """Run the ONNX Runtime transformer optimizer (``opt_level=2``) on the baseline and evaluate it on GPU.

    The optimized model is saved to ``<working_dir>/onnx_baseline_optimized.onnx``;
    performance, metrics, size stats and the model file are logged under
    ``onnx_baseline_optimized``.

    Args:
        baseline_model_path: Path of the baseline ONNX file.
        full_eval_dataset: Dataset used for both the benchmark and the evaluation.
        model_type: Key into ``MODEL_CONFIGS``. Optional, for parity with the
            ``do_float16_*`` functions; defaults to ``global_args.model_type``.
    """
    if model_type is None:
        model_type = global_args.model_type
    model_config = MODEL_CONFIGS[model_type]

    model_name = "onnx_baseline_optimized"
    optimized_fp32_path = os.path.join(working_dir, f"{model_name}.onnx")
    optimized_model = optimize_model(
        baseline_model_path,
        model_type="bert",
        use_gpu=True,
        opt_level=2,
        num_heads=model_config["num_attention_heads"],
        hidden_size=model_config["hidden_size"],
    )
    optimized_model.save_model_to_file(optimized_fp32_path)

    session_fp32_opt = OnnxTransformerModelManager.get_gpu_session(optimized_fp32_path)
    evaluate_metrics_and_performance(
        session_fp32_opt, full_eval_dataset, model_name, params_to_log=None, measure_performance=True, evaluate=True, log_model=True
    )

# Float16

- onnxruntime optimizer variant

## Without optimizations

In [None]:
def do_float16_part_without_optimizations(baseline_model_path, model_type, full_eval_dataset):
    """Convert the baseline to float16 with ``opt_level=0`` and evaluate it on GPU.

    The converted model is written to
    ``<working_dir>/onnx_baseline_opt_fp16_without_optimizations.onnx`` and everything is
    logged under ``onnx_baseline_opt_fp16_without_optimizations``.

    Args:
        baseline_model_path: Path of the baseline ONNX file.
        model_type: Key into ``MODEL_CONFIGS`` (supplies head count and hidden size).
        full_eval_dataset: Dataset used for both the benchmark and the evaluation.
    """
    model_name = "onnx_baseline_opt_fp16_without_optimizations"
    fp16_path = os.path.join(working_dir, f"{model_name}.onnx")
    config = MODEL_CONFIGS[model_type]

    fp16_model = optimize_model(
        baseline_model_path,
        model_type="bert",
        use_gpu=True,
        opt_level=0,
        num_heads=config["num_attention_heads"],
        hidden_size=config["hidden_size"],
    )
    fp16_model.convert_float_to_float16(keep_io_types=True)
    fp16_model.save_model_to_file(fp16_path)

    _metrics, _perf = evaluate_metrics_and_performance(
        session=OnnxTransformerModelManager.get_gpu_session(fp16_path),
        dataset=full_eval_dataset,
        output_folder=model_name,
        params_to_log=None,
        measure_performance=True,
        evaluate=True,
        log_model=True,
    )

## With optimizations

In [None]:
def do_float16_part(baseline_model_path, model_type, full_eval_dataset):
    """Optimize the baseline with ``opt_level=2``, convert it to float16 and evaluate it on GPU.

    The converted model is written to ``<working_dir>/onnx_baseline_opt_fp16.onnx`` and
    everything is logged under ``onnx_baseline_opt_fp16``.

    Args:
        baseline_model_path: Path of the baseline ONNX file.
        model_type: Key into ``MODEL_CONFIGS`` (supplies head count and hidden size).
        full_eval_dataset: Dataset used for both the benchmark and the evaluation.
    """
    model_name = "onnx_baseline_opt_fp16"
    fp16_path = os.path.join(working_dir, f"{model_name}.onnx")
    config = MODEL_CONFIGS[model_type]

    fp16_model = optimize_model(
        baseline_model_path,
        model_type="bert",
        use_gpu=True,
        opt_level=2,
        num_heads=config["num_attention_heads"],
        hidden_size=config["hidden_size"],
    )
    fp16_model.convert_float_to_float16(keep_io_types=True)
    fp16_model.save_model_to_file(fp16_path)

    _metrics, _perf = evaluate_metrics_and_performance(
        session=OnnxTransformerModelManager.get_gpu_session(fp16_path),
        dataset=full_eval_dataset,
        output_folder=model_name,
        params_to_log=None,
        measure_performance=True,
        evaluate=True,
        log_model=True,
    )

# Dynamic quantization

In [None]:
from onnxruntime.quantization import quantize_dynamic, QuantType

def do_dynamic_quant(onnx_baseline_model_path, cpu_eval_dataset):
    """Dynamically quantize the baseline to int8 and benchmark / evaluate it on CPU.

    The quantized model is written to ``<working_dir>/onnx_baseline_dynq_int8.onnx``.
    The quantization settings are also logged as ``config.json`` under
    ``onnx_baseline_dynq_int8``.

    Args:
        onnx_baseline_model_path: Path of the baseline ONNX file.
        cpu_eval_dataset: Reduced evaluation dataset used for the CPU run.
    """
    model_name = "onnx_baseline_dynq_int8"
    quantized_path = os.path.join(working_dir, f"{model_name}.onnx")

    # Single source of truth: the same dict drives quantize_dynamic and is logged to MLflow.
    quant_config = dict(
        weight_type=QuantType.QInt8,
        op_types_to_quantize=["MatMul", "Gemm"],
        reduce_range=True,
        per_channel=True,
    )
    quantize_dynamic(
        model_input=onnx_baseline_model_path,
        model_output=quantized_path,
        **quant_config,
    )

    cpu_session = OnnxTransformerModelManager.get_cpu_session(quantized_path)
    _metrics, _perf = evaluate_metrics_and_performance(
        session=cpu_session,
        dataset=cpu_eval_dataset,
        output_folder=model_name,
        params_to_log=quant_config,
        measure_performance=True,
        evaluate=True,
        log_model=True,
    )

In [None]:
def do_run(spark, cpu_dataset_max_length=16000):
    """Run every optimization / quantization experiment for the currently loaded model in one MLflow run.

    Reads the notebook globals ``global_args`` and ``global_torch_model``, which must be
    set before calling (the helper functions also read ``global_tokenizer``).

    Args:
        spark: Passed through to ``get_df_by_folds_from_args``.
        cpu_dataset_max_length: Upper bound on the number of evaluation rows kept
            for the CPU-side experiments.
    """
    model_type = global_args.model_type
    seed = global_args.seed
    np.random.seed(seed)
    torch.manual_seed(seed)
    # in case any standard library uses some random function
    random.seed(seed)

    if is_local_development():
        # Local runs always use a shortened dataset, whatever the loaded args say.
        # NOTE(review): the previous code reset both fields to None and then applied
        # `x or default`, so the defaults always won; that effective behavior is kept.
        # Drop these assignments if the loaded values should be honored instead.
        global_args.shorten_to_train = "10000u"
        global_args.shorten_to_eval = "1000u"

    # Only the evaluation split is used below; the train frame is returned by the
    # helper but no dataset is built from it.
    _, full_eval_df = get_df_by_folds_from_args(global_args, spark)
    full_eval_dataset = URLDatasetPart.from_pandas(full_eval_df)

    cpu_row_count = min(cpu_dataset_max_length, len(full_eval_df))
    cpu_eval_df = shorten_df_in_smart_way(full_eval_df, cpu_row_count, seed)
    cpu_eval_dataset = URLDatasetPart.from_pandas(cpu_eval_df)

    onnx_baseline_model_path = os.path.join(working_dir, 'onnx_baseline.onnx')
    global_torch_model.create_onnx_file(onnx_baseline_model_path)

    with mlflow.start_run(run_name=f"Quantization-just-unoptimized-float16-{model_type}-{global_args.dataset_name}") as run:
        print(run.info.run_id)
        do_baseline_evaluation(onnx_baseline_model_path, cpu_eval_dataset=cpu_eval_dataset, full_eval_dataset=full_eval_dataset)
        do_float16_part_without_optimizations(onnx_baseline_model_path, model_type=model_type, full_eval_dataset=full_eval_dataset)
        do_optimized_model_part(onnx_baseline_model_path, full_eval_dataset=full_eval_dataset)
        do_float16_part(onnx_baseline_model_path, model_type=model_type, full_eval_dataset=full_eval_dataset)
        do_dynamic_quant(onnx_baseline_model_path, cpu_eval_dataset)

In [None]:
# Load the trained model from its MLflow run, publish it through the notebook
# globals that the helper functions read, then run all experiments for it.
load_run_id = "0997a077284848c094e5e6fa20e3f9a8"
global_torch_model = load_model_from_run(load_run_id)
global_args, global_tokenizer = global_torch_model.args, global_torch_model.tokenizer
print(global_args)
do_run(spark)

In [None]:
# Load the trained model from its MLflow run, publish it through the notebook
# globals that the helper functions read, then run all experiments for it.
load_run_id = "14aa0dcdb9344606a6b84778c9bcd69a"
global_torch_model = load_model_from_run(load_run_id)
global_args, global_tokenizer = global_torch_model.args, global_torch_model.tokenizer
print(global_args)
do_run(spark)

In [None]:
# Load the trained model from its MLflow run, publish it through the notebook
# globals that the helper functions read, then run all experiments for it.
# NOTE(review): this run id is identical to the one in the preceding cell, so the same
# experiments are executed twice - confirm whether a different run id was intended.
load_run_id = "14aa0dcdb9344606a6b84778c9bcd69a"
global_torch_model = load_model_from_run(load_run_id)
global_args, global_tokenizer = global_torch_model.args, global_torch_model.tokenizer
print(global_args)
do_run(spark)

In [None]:
# Load the trained model from its MLflow run, publish it through the notebook
# globals that the helper functions read, then run all experiments for it.
load_run_id = "94c1fa2098824e10be4fad32399fbc2b"
global_torch_model = load_model_from_run(load_run_id)
global_args, global_tokenizer = global_torch_model.args, global_torch_model.tokenizer
print(global_args)
do_run(spark)

In [None]:
# Load the trained model from its MLflow run, publish it through the notebook
# globals that the helper functions read, then run all experiments for it.
load_run_id = "11d9a273d9204e928736c5482ada67b2"
global_torch_model = load_model_from_run(load_run_id)
global_args, global_tokenizer = global_torch_model.args, global_torch_model.tokenizer
print(global_args)
do_run(spark)

- how to load from filesystem

In [None]:
# from transformers import BertForSequenceClassification, BertTokenizer
# model_type = 
# local_output_dir = f'./myScriptsNotShare/joined_small_model'

# # Model args
# parser = argparse.ArgumentParser()
# parser.add_argument("--seed", default=42)

# parser.add_argument("--model_type", default=f"bert_small", type=str, help="Type of BERT model to use. see 'get_model_checkpoint' method")
# parser.add_argument("--dropout", default=0, type=float, help="Dropout rate on final classification layer")
# parser.add_argument("--max_sequence_length", default=256)

# # Training params
# parser.add_argument("--batch_size", default=128, type=int, help="Batch size")
# parser.add_argument("--epochs_max", default=5, type=int, help="Maximum number of epochs. Can stop early, however")
# parser.add_argument("--patience", default=3, type=int, help="Number of epochs to wait for validation accuracy increase before stopping. If it is set to None, then early stopping is not used")
# parser.add_argument("--freeze_epochs", default=1, type=int, help="Number of epochs to freeze BERT non-final layers initially.")
# parser.add_argument("--loss", default="focal", choices=["cross_entropy", "focal"], type=str, help="Loss function used")
# parser.add_argument("--focal_loss_gamma", default=2, help="Pass gama parameter if focal loss is being used. Otherwise has no effect")
# parser.add_argument("--focal_loss_alpha", default=-1, help="Pass alpha parameter if focal loss is being used. Otherwise has no effect")
# parser.add_argument("--decision_threshold", default=0.5, type=float, help="If probability of a class 1 is higher than this, then the sample is classified as class 1")
# parser.add_argument("--weight_decay", default=0.00, type=float, help="AdamW weight decay for both parts of the model")
# parser.add_argument("--bert_learning_rate", default=1e-5, type=float, help="AdamW learning rate for everything in model except final classifaction layer")
# parser.add_argument("--classifier_learning_rate", default=1e-5, type=float, help="AdamW learning rate for classification layer of model")

# # Dataset args
# parser.add_argument("--dataset_name", default="joined", choices=["private_data", "any_public_dataset_name"])
# parser.add_argument("--train_folds", default=None, type=str, help="Which folds of the dataset should be used for training")
# parser.add_argument("--eval_folds", default=[4], type=str, help="Which folds of the dataset should be used for evaluation")
# parser.add_argument("--shorten_to_train", default=None, help="How much should train dataset be shortened (400u - 400 records), (10% - 10 percent of all records)")
# parser.add_argument("--shorten_to_eval", default=None, help="How much should test or validation set be shortened")
# parser.add_argument("--label_count", default=2)


# default_args = parser.parse_args([])
# raw_model = BertForSequenceClassification.from_pretrained(local_output_dir)
# tokenizer = BertTokenizer.from_pretrained(local_output_dir)
# torch_model = TransformerModelManager(default_args, raw_model, tokenizer, None)