Merge pull request #2580 from alibaba/feature/sync
[MNN:Sync] Sync Internal 2.7.0
jxt1234 committed Sep 4, 2023
2 parents c442ff3 + ea4f13d commit 9e3cc72
Showing 138 changed files with 4,183 additions and 2,414 deletions.
4 changes: 1 addition & 3 deletions CMakeLists.txt
@@ -715,9 +715,7 @@ elseif(CMAKE_SYSTEM_NAME MATCHES "^Android")
else()
endif()
if (NOT MNN_BUILD_SHARED_LIBS)
if(APPLE)
set(MNN_DEPS -Wl,-all_load ${MNN_DEPS} -Wl,-noall_load)
elseif (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
        # Static linking will not replace the thread-related weak symbols in glibc with the strong symbols
        # in the pthread library, so we need to pass --whole-archive for pthread
# https://stackoverflow.com/questions/35116327/when-g-static-link-pthread-cause-segmentation-fault-why
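For reference, the failure this guards against is the static-link pthread segfault described in the linked question; at the plain linker level the workaround looks roughly like the sketch below (an illustration only, not part of this commit — the source file and output names are placeholders):

```bash
# Without --whole-archive, `g++ -static` can keep glibc's weak pthread stubs and
# crash at runtime; wrapping libpthread pulls in the strong symbols instead.
g++ -static main.cpp -Wl,--whole-archive -lpthread -Wl,--no-whole-archive -o demo
```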
File renamed without changes.
File renamed without changes.
84 changes: 66 additions & 18 deletions codegen/cuda/CUDATarget.cpp
@@ -473,15 +473,22 @@ std::string CUDATarget::codegen(std::vector<std::string>& inputs, const Command*
break;
case UnaryOpOperation_LOG1P:
if(mVectorize) {
ss << inpName << ".x=(log(1.0+" << operand << ".x));\n";
ss << inpName << ".y=(log(1.0+" << operand << ".y))";
if(mPrecision != BackendConfig::Precision_Low) {
if(mPrecision == BackendConfig::Precision_Low) {
ss << inpName << ".x=(half)(log(1.0+(float)" << operand << ".x));\n";
ss << inpName << ".y=(half)(log(1.0+(float)" << operand << ".y))";
} else {
ss << inpName << ".x=(log(1.0+" << operand << ".x));\n";
ss << inpName << ".y=(log(1.0+" << operand << ".y))";
ss << ";\n";
ss << inpName << ".z=(log(1.0+" << operand << ".z));\n";
ss << inpName << ".w=(log(1.0+" << operand << ".w))";
}
} else {
ss << inpName << "=(log(1.0+" << operand << "))";
if(mPrecision == BackendConfig::Precision_Low) {
ss << inpName << "=(log((half)1.0+" << operand << "))";
} else {
ss << inpName << "=(log(1.0+" << operand << "))";
}
}
break;
case UnaryOpOperation_FLOOR:
@@ -512,15 +519,22 @@ std::string CUDATarget::codegen(std::vector<std::string>& inputs, const Command*
break;
case UnaryOpOperation_SIGMOID:
if(mVectorize) {
ss << inpName << ".x=(1.0/(1.0+exp(-" << operand << ".x)));\n";
ss << inpName << ".y=(1.0/(1.0+exp(-" << operand << ".y)))";
if(mPrecision != BackendConfig::Precision_Low) {
if(mPrecision == BackendConfig::Precision_Low) {
ss << inpName << ".x=(half)(1.0/(1.0+(float)exp(-" << operand << ".x)));\n";
ss << inpName << ".y=(half)(1.0/(1.0+(float)exp(-" << operand << ".y)))";
} else {
ss << inpName << ".x=(1.0/(1.0+exp(-" << operand << ".x)));\n";
ss << inpName << ".y=(1.0/(1.0+exp(-" << operand << ".y)))";
ss << ";\n";
ss << inpName << ".z=(1.0/(1.0+exp(-" << operand << ".z)));\n";
ss << inpName << ".w=(1.0/(1.0+exp(-" << operand << ".w)))";
}
} else {
ss << inpName << "=(1.0/(1.0+exp(-" << operand << ")))";
if(mPrecision == BackendConfig::Precision_Low) {
ss << inpName << "=(half)(1.0/(1.0+(float)exp(-" << operand << ")))";
} else {
ss << inpName << "=(1.0/(1.0+exp(-" << operand << ")))";
}
}
break;
case UnaryOpOperation_TANH:
@@ -538,15 +552,22 @@ std::string CUDATarget::codegen(std::vector<std::string>& inputs, const Command*
break;
case UnaryOpOperation_RECIPROCAL:
if(mVectorize) {
ss << inpName << ".x=(1.0/" << operand << ".x);\n";
ss << inpName << ".y=(1.0/" << operand << ".y)";
if(mPrecision != BackendConfig::Precision_Low) {
if(mPrecision == BackendConfig::Precision_Low) {
ss << inpName << ".x=(half)(1.0/(float)" << operand << ".x);\n";
ss << inpName << ".y=(half)(1.0/(float)" << operand << ".y)";
} else {
ss << inpName << ".x=(1.0/" << operand << ".x);\n";
ss << inpName << ".y=(1.0/" << operand << ".y)";
ss << ";\n";
ss << inpName << ".z=(1.0/" << operand << ".z);\n";
ss << inpName << ".w=(1.0/" << operand << ".w)";
}
} else {
ss << inpName << "=(1.0/" << operand << ")";
if(mPrecision == BackendConfig::Precision_Low) {
ss << inpName << "=(half)(1.0/(float)" << operand << ")";
} else {
ss << inpName << "=(1.0/" << operand << ")";
}
}
break;
case UnaryOpOperation_LOG:
@@ -564,17 +585,44 @@ std::string CUDATarget::codegen(std::vector<std::string>& inputs, const Command*
break;
case UnaryOpOperation_GELU:
if(mVectorize) {
ss << inpName << ".x=(1.0f + tanh(0.79788458f * (0.044715f * " << operand << ".x*" << operand << ".x*" << operand << ".x+" << operand + ".x))) * " << operand << ".x* 0.5f);\n";
ss << inpName << ".y=(1.0f + tanh(0.79788458f * (0.044715f * " << operand << ".y*" << operand << ".y*" << operand << ".y+" << operand + ".y))) * " << operand << ".y* 0.5f)";
if(mPrecision != BackendConfig::Precision_Low) {
if(mPrecision == BackendConfig::Precision_Low) {
ss << inpName << ".x=(half)((1.0f + tanh(0.79788458f * (0.044715f * (float)" << operand << ".x*(float)" << operand << ".x*(float)" << operand << ".x+(float)" << operand + ".x))) * (float)" << operand << ".x* 0.5f);\n";
ss << inpName << ".y=(half)((1.0f + tanh(0.79788458f * (0.044715f * (float)" << operand << ".y*(float)" << operand << ".y*(float)" << operand << ".y+(float)" << operand + ".y))) * (float)" << operand << ".y* 0.5f)";
} else {
ss << inpName << ".x=((1.0f + tanh(0.79788458f * (0.044715f * " << operand << ".x*" << operand << ".x*" << operand << ".x+" << operand + ".x))) * " << operand << ".x* 0.5f);\n";
ss << inpName << ".y=((1.0f + tanh(0.79788458f * (0.044715f * " << operand << ".y*" << operand << ".y*" << operand << ".y+" << operand + ".y))) * " << operand << ".y* 0.5f)";
ss << ";\n";
ss << inpName << ".z=(1.0f + tanh(0.79788458f * (0.044715f * " << operand << ".z*" << operand << ".z*" << operand << ".z+" << operand + ".z))) * " << operand << ".z* 0.5f);\n";
ss << inpName << ".w=(1.0f + tanh(0.79788458f * (0.044715f * " << operand << ".w*" << operand << ".w*" << operand << ".w+" << operand + ".w))) * " << operand << ".w* 0.5f)";
ss << inpName << ".z=((1.0f + tanh(0.79788458f * (0.044715f * " << operand << ".z*" << operand << ".z*" << operand << ".z+" << operand + ".z))) * " << operand << ".z* 0.5f);\n";
ss << inpName << ".w=((1.0f + tanh(0.79788458f * (0.044715f * " << operand << ".w*" << operand << ".w*" << operand << ".w+" << operand + ".w))) * " << operand << ".w* 0.5f)";
}
} else {
ss << inpName << "=(1.0f + tanh(0.79788458f * (0.044715f * " << operand << "*" << operand << "*" << operand << "+" << operand + "))) * " << operand << "* 0.5f)";
if(mPrecision == BackendConfig::Precision_Low) {
ss << inpName << "=(half)((1.0f + tanh(0.79788458f * (0.044715f * (float)" << operand << "*(float)" << operand << "*(float)" << operand << "+(float)" << operand + "))) * (float)" << operand << "* 0.5f)";
} else {
ss << inpName << "=((1.0f + tanh(0.79788458f * (0.044715f * " << operand << "*" << operand << "*" << operand << "+" << operand + "))) * " << operand << "* 0.5f)";
}
}
break;
case UnaryOpOperation_GELU_STANDARD:
if(mVectorize) {
if(mPrecision == BackendConfig::Precision_Low) {
ss << inpName << ".x=(half)((erf((float)" << operand << ".x*0.7071067932881648f)+1.f)*(float)" << operand << ".x*0.5f);\n";
ss << inpName << ".y=(half)((erf((float)" << operand << ".y*0.7071067932881648f)+1.f)*(float)" << operand << ".y*0.5f)";
} else {
ss << inpName << ".x=((erf(" << operand << ".x*0.7071067932881648f)+1.f)*" << operand << ".x*0.5f);\n";
ss << inpName << ".y=((erf(" << operand << ".y*0.7071067932881648f)+1.f)*" << operand << ".y*0.5f)";
ss << ";\n";
ss << inpName << ".z=((erf(" << operand << ".z*0.7071067932881648f)+1.f)*" << operand << ".z*0.5f);\n";
ss << inpName << ".w=((erf(" << operand << ".w*0.7071067932881648f)+1.f)*" << operand << ".w*0.5f)";
}
} else {
if(mPrecision == BackendConfig::Precision_Low) {
ss << inpName << "=(half)((erf((float)" << operand << "*0.7071067932881648f)+1.f)*(float)" << operand << "*0.5f)";
} else {
ss << inpName << "=((erf(" << operand << "*0.7071067932881648f)+1.f)*" << operand << "*0.5f)";
}
}
break;
default:
MNN_PRINT("Error: CUDA CodeGen not support Unary type:%d\n", type);
break;
3 changes: 0 additions & 3 deletions demo/exec/nluDemo.cpp
@@ -104,12 +104,9 @@ int main(int argc, char* argv[]) {
for (int i = 0; i < 3; ++i) {
outputs = module->onForward(inputs);
}
globalExecutor->resetProfile();
outputs = module->onForward(inputs);
globalExecutor->dumpProfile();
{
MNN::Timer autoTime;
globalExecutor->resetProfile();
for (int i = 0; i < benchTime; ++i) {
MNN::AutoTime _t(0, "Once time");
// std::cout << i << std::endl;
2 changes: 0 additions & 2 deletions demo/exec/transformerDemo.cpp
@@ -42,9 +42,7 @@ int main(int argc, const char* argv[]) {
for (int i = 0; i < 2; ++i) {
{
AUTOTIME;
Executor::getGlobalExecutor()->resetProfile();
outputs = model->onForward({first, second});
Executor::getGlobalExecutor()->dumpProfile();
}
std::ostringstream fileNameOs;
std::ostringstream dimInfo;
2 changes: 1 addition & 1 deletion docs/tools/benchmark.md
@@ -10,7 +10,7 @@
- warm_up_count: number of warm-up runs
- forwardtype: optional, defaults to 0 (CPU); valid values are 0->CPU, 1->Metal, 3->OpenCL, 6->OpenGL, 7->Vulkan
- numberThread: optional, defaults to 4; the number of CPU threads, or the run mode for GPU backends
- precision: optional, defaults to 2 (precision_low)
- precision: optional, defaults to 2; valid values are 0 (Normal), 1 (High), 2 (Low_FP16), 3 (Low_BF16)
- weightSparsity: optional, defaults to 0.0; sparse computation is enabled when weightSparsity > 0.5 and the backend supports it
- weightSparseBlockNumber: optional, defaults to 1; only effective when weightSparsity > 0.5. It is the block size for sparse computation; larger values favor sparse acceleration, and 1, 4, 8 or 16 are the usual choices
- testQuantizedModel: optional, defaults to 0, i.e. only the float model is tested; when set to 1, the quantized model is tested after the float model
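
A hedged invocation sketch follows (the positional argument order is assumed from the parameter list above, and the model folder path is a placeholder):

```bash
# Assumed order: models_folder loopCount warm_up_count forwardtype numberThread \
#                precision weightSparsity weightSparseBlockNumber testQuantizedModel
# CPU backend (0), 4 threads, precision 2 (Low_FP16), dense weights, float + quantized test.
./benchmark.out ./models 50 10 0 4 2 0.0 1 1
```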
48 changes: 1 addition & 47 deletions docs/tools/test.md
@@ -68,7 +68,7 @@ Avg= 5.570600 ms, OpSum = 7.059200 ms min= 3.863000 ms, max= 11.596001 ms
### Parameters
`./ModuleBasic.out model dir [runMask forwardType runLoops numberThread precision_memory cacheFile]`
- `model:str` path to the model file
- `dir:str` folder with the input/output information; it can be generated with scripts such as fastTestOnnx.py / fastTestTf.py / fastTestTflite.py, see the correctness-verification section of the model-conversion docs.
- `dir:str` folder with the input/output information; it can be generated with scripts such as testMNNFromTf.py / testMNNFromOnnx.py / testMNNFromTflite.py, see the correctness-verification section of the model-conversion docs.
- `runMask:int` defaults to 0; a set of feature switches. To enable several features, add the corresponding mask values together (cases that cannot be combined are noted separately); see the runMask breakdown below for details
- `forwardType:int` compute device used for inference; valid values are 0 (CPU), 1 (Metal), 2 (CUDA), 3 (OpenCL), 6 (OpenGL), 7 (Vulkan), 9 (TensorRT); optional, defaults to `0`
- `runLoops:int` number of iterations for the performance test; optional, defaults to `0`, i.e. no performance test is run
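
A hedged example invocation (the model file and test folder names are placeholders; the argument order follows the signature above):

```bash
# temp.mnn: model file; mnn_test/: folder produced by one of the testMNNFrom*.py scripts.
# runMask=0, forwardType=0 (CPU), runLoops=10, numberThread=4.
./ModuleBasic.out temp.mnn mnn_test 0 0 10 4
```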
@@ -456,49 +456,3 @@ Matrix:
0.0000000 0.0000000 1.0000000
```

## winogradGenerateCL.out
### Description
Generates the Winograd transform matrices and emits the corresponding OpenCL transform code
### Parameters
`./winogradGenerateCL.out unit kernelSize`
- `unit:int` tile size
- `kernelSize:int` convolution kernel size
### Example
```bash
$ ./winogradGenerateCL.out 2 2
A
1.0000000 0.0000000
1.0000000 0.5000000
0.0000000 1.0000000
B
1.0000000 0.0000000 -0.0000000
-2.0000000 2.0000000 -0.5000000
0.0000000 0.0000000 1.0000000
G
1.0000000 0.0000000
1.0000000 0.5000000
0.0000000 1.0000000
Generate winogradTransformSource2_2_0.5.cl
Generate winogradTransformDest2_2_0.5.cl
```

## winogradGenerateGLSL.out
### Description
Generates the Winograd transform matrices and emits the corresponding OpenGL (GLSL) transform code
### Parameters
`./winogradGenerateGLSL.out unit kernelSize`
- `unit:int` tile size
- `kernelSize:int` convolution kernel size
### Example
```bash
$ ./winogradGenerateGLSL.out 1 2
A
1.0000000
B
1.0000000 -0.0000000
0.0000000 1.0000000
G
1.0000000
Generate winogradTransformSource1_2_0.5.comp
Generate winogradTransformDest1_2_0.5.comp
```
4 changes: 0 additions & 4 deletions express/CMakeLists.txt
@@ -13,11 +13,7 @@ if(MNN_CUDA_PROFILE)
endif()

file(GLOB_RECURSE MNN_EXPR_SRCS "${CMAKE_CURRENT_LIST_DIR}/*.*")
option(MNN_EXPR_ENABLE_PROFILER "Support profile Expr's op cost" OFF)
option(MNN_EXPR_SHAPE_EAGER "Force compute Expr's shape directly cost" OFF)
IF (MNN_EXPR_ENABLE_PROFILER)
add_definitions(-DMNN_EXPR_ENABLE_PROFILER)
ENDIF()
IF (MNN_EXPR_SHAPE_EAGER)
add_definitions(-DMNN_EXPR_SHAPE_EAGER)
ENDIF()
76 changes: 3 additions & 73 deletions express/Executor.cpp
@@ -21,55 +21,9 @@
#ifdef MNN_EXPR_ENABLE_PROFILER
#define MNN_EXPRESS_ERROR_REPORT
#endif
#define MNN_EXPRESS_OPEN_MEMORY_REUSE

namespace MNN {
namespace Express {
#ifdef MNN_EXPR_ENABLE_PROFILER
class Executor::Profiler {
public:
void reset();
void dump() const;
void add(const std::string& opType, float timeInMs);
void addFlops(const std::string& opType, float flops);
private:
std::map<std::string, float> mTimes;
std::map<std::string, float> mFlops;
};
void Executor::Profiler::reset() {
mTimes.clear();
mFlops.clear();
}
void Executor::Profiler::dump() const {
float sumValue = 0.0f;
for (auto iter : mTimes) {
MNN_PRINT("%s: %f ms\n", iter.first.c_str(), iter.second);
sumValue += iter.second;
}
MNN_PRINT("Total: %f ms\n", sumValue);
sumValue = 0.0f;
for (auto iter : mFlops) {
MNN_PRINT("%s: %f \n", iter.first.c_str(), iter.second);
sumValue += iter.second;
}
MNN_PRINT("Total flops: %f M\n", sumValue);
}
void Executor::Profiler::add(const std::string& opType, float timeInMs) {
auto iter = mTimes.find(opType);
if (iter == mTimes.end()) {
mTimes[opType] = timeInMs;
return;
}
iter->second += timeInMs;
}
void Executor::Profiler::addFlops(const std::string& opType, float flops) {
auto iter = mFlops.find(opType);
if (iter == mFlops.end()) {
mFlops[opType] = flops;
return;
}
iter->second += flops;
}
#endif

void Executor::setGlobalExecutorConfig(MNNForwardType type, const BackendConfig& config, int numberThread) {
std::lock_guard<std::mutex> _l(mMutex);
@@ -648,36 +602,12 @@ void Executor::makeCache(const std::vector<EXPRP>& expr, bool forceCPU) {
//FUNC_PRINT(mCaches.size());
_makeCache(expr, forceCPU);
}
void Executor::addOpCostTime(int op, float costTime) {
#ifdef MNN_EXPR_ENABLE_PROFILER
auto opType = MNN::EnumNameOpType((OpType)op);
if (nullptr == opType) {
return;
}
mProfiler->add(opType, costTime);
#endif
}
void Executor::addOpCostTime(const std::string& type, float costTime) {
#ifdef MNN_EXPR_ENABLE_PROFILER
mProfiler->add(type, costTime);
#endif
}
void Executor::addOpFlops(const std::string& type, float flops) {
#ifdef MNN_EXPR_ENABLE_PROFILER
mProfiler->addFlops(type, flops);
#endif
}


void Executor::resetProfile() {
#ifdef MNN_EXPR_ENABLE_PROFILER
mProfiler->reset();
#endif
// Depercated
}
void Executor::dumpProfile() {
#ifdef MNN_EXPR_ENABLE_PROFILER
mProfiler->dump();
#endif
    // Deprecated
}

bool Executor::registerSubGraph(const std::string& submoduleName, VARPS outputs, VARPS inputs) {