// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <onnx/onnx_pb.h>
#include <cmath>
#include <fstream>
#include <iomanip>
#include "paddle2onnx/mapper/mapper.h"
#include "paddle2onnx/parser/parser.h"
namespace paddle2onnx {
struct QuantizeModelProcessor {
public:
std::vector<QuantizeInfo> quantize_info;
const PaddleParser* parser_;
OnnxHelper* helper_;
std::vector<std::shared_ptr<ONNX_NAMESPACE::NodeProto>>* parameters_;
std::vector<std::shared_ptr<ONNX_NAMESPACE::ValueInfoProto>>* inputs_;
std::vector<std::shared_ptr<ONNX_NAMESPACE::ValueInfoProto>>* outputs_;
std::vector<std::shared_ptr<ONNX_NAMESPACE::NodeProto>>* nodes_;
  // All op types that support quantization
std::vector<std::string> supported_quantize_type_;
std::map<std::string, std::vector<std::shared_ptr<ONNX_NAMESPACE::NodeProto>>>
name2node_dict_;
  // Tensors that need both Quantize and Dequantize ops added
  std::vector<std::string> tensors_to_be_quantize;
  // Tensors that only need a Dequantize op added
  std::vector<std::string> only_dequantize_tensors;
  // Convert the model for a specific deployment backend; deploy_backend can
  // be TensorRT, ONNXRuntime, or others
void ProcessQuantizeModel(
std::vector<std::shared_ptr<ONNX_NAMESPACE::NodeProto>>* parameters,
std::vector<std::shared_ptr<ONNX_NAMESPACE::ValueInfoProto>>* inputs,
std::vector<std::shared_ptr<ONNX_NAMESPACE::ValueInfoProto>>* outputs,
std::vector<std::shared_ptr<ONNX_NAMESPACE::NodeProto>>* nodes,
OnnxHelper* helper, const std::string& deploy_backend,
const PaddleParser& parser, std::string* calibration_cache = nullptr);
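  // A minimal call-site sketch (hypothetical; assumes the surrounding
  // exporter already owns the graph pieces, and that "onnxruntime" is a
  // valid deploy_backend value):
  //   QuantizeModelProcessor processor;
  //   std::string calibration_cache;
  //   processor.ProcessQuantizeModel(&parameters, &inputs, &outputs, &nodes,
  //                                  &helper, "onnxruntime", parser,
  //                                  &calibration_cache);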
// Remove all Quantize and Dequantize ops
void RemoveAllQuantizeOps();
  // Return true if every tensor in tensor_names has quantize info and all of
  // its consumer nodes can be quantized; otherwise return false
bool CanBeQuantize(const std::vector<std::string>& tensor_names,
const std::vector<int64_t>& output_index = {-1});
  // If only_dequantize is true, the tensor only needs a Dequantize op added
void AppendQuantizeTensor(const std::string& tensor,
const bool& only_dequantize = false);
// Add QDQ for ORT according to:
// https://github.com/microsoft/onnxruntime/blob/master/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.cc
void AddQDQForORT();
  // Determine whether the tensor is connected to a graph output through
  // Identity ops
bool ConnectToOutput(const std::string& output_name);
  // Generate a calibration cache file for TensorRT 8.x INT8 deployment
void GenerateCache(std::string* calibration_cache);
// Add QDQ for TRT according to:
// https://github.com/NVIDIA/TensorRT/tree/main/tools/pytorch-quantization/pytorch_quantization/nn/modules
void AddTrtQDQ();
// Add QDQ for RKNN
void AddQDQForRKNN();
  // Remove Identity ops from the graph
  void RemoveIdentityOp();
  // Add quantize-related ops to the model according to the given tensor names
void AddQDQInModel(const std::vector<std::string>& tensors_to_be_quantize);
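  // Conceptually, for each listed tensor T with quantize info (scale, zero
  // point), the edge T -> consumer is rewritten as a QDQ pair; a sketch of
  // the standard ONNX pattern, not the exact node/tensor names used here:
  //   T -> QuantizeLinear(T, scale, zero) -> DequantizeLinear -> consumer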
void QuantizeInfoBroadcast();
  // Merge Conv + Add
  void MergeConvAdd();
  // Merge Conv + BN (BatchNorm)
  void MergeConvBN();
  // Determine whether a tensor is a graph output
bool IsGraphOutput(const std::string& name);
  // Processing the quantized model adds new nodes, which breaks the
  // topological ordering of the graph; this function re-sorts the nodes
void SortNodes();
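  // A Kahn-style topological sort is one way to restore the ordering (a
  // sketch of the idea, not necessarily the implementation used here):
  //   1. Count, for every node, how many of its inputs are produced by other
  //      nodes (graph inputs and initializers count as already ready).
  //   2. Repeatedly emit any node whose inputs are all ready, marking its
  //      outputs as ready, until every node has been emitted.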
  // Get the shape of a tensor by name; returns false on failure
  bool GetTensorShape(const std::string& name, std::vector<int64_t>* shape);
  // Return the value of a tensor by name
template <typename T>
bool GetTensorByName(const std::string& name, std::vector<T>* value);
  // Perform tensor-wise quantization, returning scale and zero point
void GetTensorWiseQuantizeInfo(const std::vector<float>& tensor,
std::vector<float>* scale,
std::vector<int64_t>* zero);
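  // A minimal sketch of one common convention (symmetric INT8 with the zero
  // point pinned to 0; the actual implementation may differ):
  //   float max_abs = 1e-6f;
  //   for (float v : tensor) max_abs = std::max(max_abs, std::fabs(v));
  //   scale->assign(1, max_abs / 127.0f);
  //   zero->assign(1, 0);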
  // Perform channel-wise quantization, returning scale and zero point
void GetChannelWiseQuantizeInfo(const std::vector<float>& tensor,
const std::vector<int64_t>& shape,
const int64_t& quant_axis,
std::vector<float>* scale,
std::vector<int64_t>* zero);
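  // A minimal sketch for the common quant_axis == 0 case (e.g. Conv weights
  // laid out [out_c, in_c, kh, kw]; symmetric INT8 with zero point 0, and the
  // actual implementation may differ):
  //   int64_t channels = shape[0];
  //   int64_t stride = tensor.size() / channels;
  //   scale->resize(channels);
  //   zero->assign(channels, 0);
  //   for (int64_t c = 0; c < channels; ++c) {
  //     float max_abs = 1e-6f;
  //     for (int64_t i = 0; i < stride; ++i)
  //       max_abs = std::max(max_abs, std::fabs(tensor[c * stride + i]));
  //     (*scale)[c] = max_abs / 127.0f;
  //   }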
  // Build name2node_dict_, mapping each input name to its consumer nodes
void UpdateInputNameToNodes();
  // Remove the node with the given name; if update_io is true, also update
  // the graph inputs/outputs accordingly
  void RemoveNodeByName(const std::string& name, const bool& update_io = true);
void ReplaceInputOfAllNodes(
const std::string& old_name, const std::string& new_name,
const std::vector<std::shared_ptr<ONNX_NAMESPACE::NodeProto>>&
except_nodes = {});
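  // A sketch of the rewiring this performs (assuming nodes_ is populated;
  // input_size/input/set_input are NodeProto's repeated-field accessors):
  //   for (auto& node : *nodes_) {
  //     if (/* node is in except_nodes */) continue;
  //     for (int i = 0; i < node->input_size(); ++i)
  //       if (node->input(i) == old_name) node->set_input(i, new_name);
  //   }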
};
} // namespace paddle2onnx