Refined QDQ recipes of BERT/CLIP/VIT for QC and AMD. #1797
@@ -0,0 +1 @@
3.12.9
@@ -0,0 +1,123 @@
{
  "input_model": {
    "type": "PytorchModel",
    "model_path": "google-bert/bert-base-multilingual-cased",
    "io_config": {
      "input_names": [ "input_ids", "attention_mask", "token_type_ids" ],
      "input_shapes": [ [ 1, 512 ], [ 1, 512 ], [ 1, 512 ] ],
      "input_types": [ "int32", "int32", "int32" ],
      "output_names": [ "logits" ]
    },
    "model_loader": "load_bert_nsp_model",
    "model_script": "google_bert_script.py"
  },
  "passes": {
    "conversion": { "type": "OnnxConversion", "target_opset": 20, "dynamic": true, "use_dynamo_exporter": false },
Review comment: the dynamic and use_dynamo_exporter options are not required.
"to_fixed_shape": { | ||
"type": "DynamicToFixedShape", | ||
"dim_param": [ "batch_size", "sequence_length" ], | ||
"dim_value": [ 1, 512 ] | ||
}, | ||
"surgery": { | ||
"type": "GraphSurgeries", | ||
"surgeries": [ | ||
{ "surgeon": "ReplaceAttentionMaskValue", "replacement": -100.0 }, | ||
{ "surgeon": "MatMulAddToGemm" } | ||
Review comment: this works well for QC; not sure if it is recommended for AMD.
      ]
    },
    "transformer_optimizer": {
Review comment: I think this might not be required? With opset 20 the model is exported with the LayerNorm operator, and quant_preprocess in OnnxStaticQuantization performs Gelu fusion automatically.
"type": "OrtTransformersOptimization", | ||
"model_type": "bert", | ||
"opt_level": 1, | ||
"optimization_options": { | ||
"enable_gelu": true, | ||
"enable_bias_gelu": false, | ||
"enable_layer_norm": true, | ||
"enable_skip_layer_norm": false, | ||
"enable_bias_skip_layer_norm": false, | ||
"enable_attention": false | ||
} | ||
}, | ||
"quantization": { | ||
"type": "OnnxStaticQuantization", | ||
"data_config": "calib_data", | ||
"quant_preprocess": true, | ||
"activation_type": "uint16", | ||
"precision": "uint8" | ||
}, | ||
"addmetadata": { | ||
"type": "VitisAIAddMetaData", | ||
"config_meta_data_keys": [ "architectures", "model_type" ], | ||
"activation_type": "uint16", | ||
"weight_type": "uint8", | ||
"quant_type": "OnnxStaticQuantization" | ||
} | ||
}, | ||
"data_configs": [ | ||
{ | ||
"name": "calib_data", | ||
"type": "HuggingfaceContainer", | ||
"load_dataset_config": { "data_name": "glue", "subset": "mrpc", "split": "train[:12]" }, | ||
"pre_process_data_config": { | ||
"model_name": "google-bert/bert-base-multilingual-cased", | ||
"input_cols": [ "sentence1", "sentence2" ], | ||
"max_length": 512, | ||
"padding": "max_length" | ||
}, | ||
"dataloader_config": { "batch_size": 1 } | ||
}, | ||
{ | ||
"name": "wiki_data", | ||
"type": "HuggingfaceContainer", | ||
"load_dataset_config": { | ||
"type": "dataset_to_nsp_dataset", | ||
"data_name": "wikitext", | ||
"subset": "wikitext-2-raw-v1", | ||
"split": "test", | ||
"input_cols": [ "sentence1", "sentence2" ], | ||
"label_col": "label" | ||
}, | ||
"pre_process_data_config": { | ||
"model_name": "google-bert/bert-base-multilingual-cased", | ||
"input_cols": [ "sentence1", "sentence2" ], | ||
"label_col": "label", | ||
"max_length": 512, | ||
"padding": "max_length" | ||
}, | ||
"post_process_data_config": { "type": "bert_scl_post_process" }, | ||
"dataloader_config": { "batch_size": 1 }, | ||
"user_script": "google_bert_script.py", | ||
"script_dir": "." | ||
} | ||
], | ||
"evaluators": { | ||
"nsp_evaluator": { | ||
"metrics": [ | ||
{ | ||
"name": "nsp", | ||
"type": "accuracy", | ||
"backend": "huggingface_metrics", | ||
"data_config": "wiki_data", | ||
"sub_types": [ { "name": "accuracy", "priority": 1 }, { "name": "f1" } ] | ||
}, | ||
{ "name": "latency", "type": "latency", "sub_types": [ { "name": "avg" } ] } | ||
] | ||
}, | ||
"performance": { | ||
"metrics": [ | ||
{ | ||
"name": "latency", | ||
"type": "latency", | ||
"sub_types": [ | ||
{ "name": "avg", "priority": 1, "metric_config": { "warmup_num": 20, "repeat_test_num": 100 } }, | ||
{ "name": "p90", "metric_config": { "warmup_num": 20, "repeat_test_num": 100 } } | ||
] | ||
} | ||
] | ||
} | ||
}, | ||
"clean_cache": true, | ||
"clean_evaluation_cache": true, | ||
"evaluate_input_model": false, | ||
"output_dir": "models/google/bert_base_multilingual_cased" | ||
} |
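The recipe above loads the PyTorch model through a custom loader, load_bert_nsp_model in google_bert_script.py, whose contents are not shown in this diff. As a rough illustration of how such a loader plugs into Olive's PytorchModel handler, here is a minimal sketch; it assumes the loader receives the config's model_path and simply wraps the transformers next-sentence-prediction head, which may differ from the actual script:

# Sketch of the loader in google_bert_script.py -- hypothetical, for
# illustration only; the real script in this PR may do more (e.g. set up
# the dataset_to_nsp_dataset and bert_scl_post_process helpers).
import torch
from transformers import BertForNextSentencePrediction

def load_bert_nsp_model(model_path: str) -> torch.nn.Module:
    # Olive passes in the "model_path" from the input_model section, here
    # the Hugging Face hub id "google-bert/bert-base-multilingual-cased".
    model = BertForNextSentencePrediction.from_pretrained(model_path)
    model.eval()  # inference mode for export and calibration
    return model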
@@ -0,0 +1,109 @@
{
  "input_model": {
    "type": "HfModel",
    "model_path": "google-bert/bert-large-cased-whole-word-masking-finetuned-squad",
    "task": "question-answering",
    "io_config": {
      "input_names": [ "input_ids", "attention_mask" ],
      "input_shapes": [ [ 1, 512 ], [ 1, 512 ] ],
      "input_types": [ "int32", "int32" ],
      "output_names": [ "start_logits", "end_logits" ]
    }
  },
  "passes": {
    "conversion": { "type": "OnnxConversion", "target_opset": 20, "dynamic": true, "use_dynamo_exporter": false },
    "to_fixed_shape": {
      "type": "DynamicToFixedShape",
      "dim_param": [ "batch_size", "sequence_length" ],
      "dim_value": [ 1, 512 ]
    },
    "surgery": {
      "type": "GraphSurgeries",
      "surgeries": [
        { "surgeon": "ReplaceAttentionMaskValue", "replacement": -100.0 },
        { "surgeon": "MatMulAddToGemm" }
      ]
    },
    "transformer_optimizer": {
      "type": "OrtTransformersOptimization",
      "model_type": "bert",
      "opt_level": 1,
      "optimization_options": {
        "enable_gelu": true,
        "enable_bias_gelu": false,
        "enable_layer_norm": true,
        "enable_skip_layer_norm": false,
        "enable_bias_skip_layer_norm": false,
        "enable_attention": false
      }
    },
    "quantization": {
      "type": "OnnxStaticQuantization",
      "data_config": "calib_data",
      "quant_preprocess": true,
      "activation_type": "uint16",
      "precision": "uint8"
    },
    "addmetadata": {
      "type": "VitisAIAddMetaData",
      "config_meta_data_keys": [ "architectures", "model_type" ],
      "activation_type": "uint16",
      "weight_type": "uint8",
      "quant_type": "OnnxStaticQuantization"
    }
  },
  "data_configs": [
    {
      "name": "calib_data",
      "type": "HuggingfaceContainer",
      "load_dataset_config": { "data_name": "squad", "split": "train[:12]" },
      "pre_process_data_config": {
        "input_cols": [ "question", "context" ],
        "label_col": "id",
        "padding": "max_length",
        "max_length": 512
      },
      "dataloader_config": { "batch_size": 1 },
      "user_script": "google_bert_script.py"
    }
  ],
  "evaluators": {
    "squad_evaluator": {
      "metrics": [
        {
          "name": "squad",
          "type": "custom",
          "sub_types": [
            { "name": "exact_match", "priority": 1, "higher_is_better": true },
            { "name": "f1", "higher_is_better": true }
          ],
          "user_config": {
            "evaluate_func": "eval_squad",
            "evaluate_func_kwargs": {
              "model_name": "google-bert/bert-large-cased-whole-word-masking-finetuned-squad",
              "dataset_config": { "data_name": "squad", "split": "validation" }
            },
            "user_script": "google_bert_script.py"
          }
        },
        { "name": "latency", "type": "latency", "sub_types": [ { "name": "avg" } ] }
      ]
    },
    "performance": {
      "metrics": [
        {
          "name": "latency",
          "type": "latency",
          "sub_types": [
            { "name": "avg", "priority": 1, "metric_config": { "warmup_num": 20, "repeat_test_num": 100 } },
            { "name": "p90", "metric_config": { "warmup_num": 20, "repeat_test_num": 100 } }
          ]
        }
      ]
    }
  },
  "clean_cache": true,
  "clean_evaluation_cache": true,
  "evaluate_input_model": false,
  "output_dir": "models/google/bert_large_cased_qa"
}
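Both recipes are self-contained Olive workflow configs, so once the user scripts are in place they can be executed directly. A minimal sketch, assuming Olive's Python entry point olive.workflows.run and a placeholder config file name (the actual file paths are not shown in this diff):

# run_recipe.py -- sketch; "bert_qdq_config.json" is a placeholder name.
from olive.workflows import run as olive_run

# Runs the pass pipeline (conversion -> fixed shapes -> graph surgery ->
# transformer optimization -> static QDQ quantization -> VitisAI metadata)
# and then the configured evaluators, writing results under "output_dir".
olive_run("bert_qdq_config.json")

The equivalent CLI form, olive run --config bert_qdq_config.json, should behave the same way.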
Review comment: since these are all new files, are we still keeping the old QDQ config JSONs?
Review comment: bumping this again. Are we keeping both the old and new QDQ configs?
Review comment: @tezheng
Review comment: We have already released parts of these in AITK, and I created PR #1893 to align the recipes. It covers what is part of this PR: CLIP text + vision, BERT, and ViT QDQ. Please take a look @jambayk, CC @tezheng. I updated the previous QDQ configs directly.
Review comment: For the others, we can take time to merge if that feels needed.