Merge pull request #2580 from alibaba/feature/sync
[MNN:Sync] Sync Internal 2.7.0
jxt1234 committed Sep 4, 2023
2 parents c442ff3 + ea4f13d commit 9e3cc72
Showing 138 changed files with 4,183 additions and 2,414 deletions.
4 changes: 1 addition & 3 deletions CMakeLists.txt
@@ -715,9 +715,7 @@ elseif(CMAKE_SYSTEM_NAME MATCHES "^Android")
else()
endif()
if (NOT MNN_BUILD_SHARED_LIBS)
if(APPLE)
set(MNN_DEPS -Wl,-all_load ${MNN_DEPS} -Wl,-noall_load)
elseif (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
        # Static linking will not replace the thread-related weak symbols in glibc with the strong symbols
        # in the pthread library, so we need to pass --whole-archive for pthread
# https://stackoverflow.com/questions/35116327/when-g-static-link-pthread-cause-segmentation-fault-why
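For reference, the failure this guards against is the static-link pthread segfault described in the linked question; at the plain linker level the workaround looks roughly like the sketch below (an illustration only, not part of this commit — the source file and output names are placeholders):

```bash
# Without --whole-archive, `g++ -static` can keep glibc's weak pthread stubs and
# crash at runtime; wrapping libpthread pulls in the strong symbols instead.
g++ -static main.cpp -Wl,--whole-archive -lpthread -Wl,--no-whole-archive -o demo
```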
File renamed without changes.
File renamed without changes.
84 changes: 66 additions & 18 deletions codegen/cuda/CUDATarget.cpp
@@ -473,15 +473,22 @@ std::string CUDATarget::codegen(std::vector<std::string>& inputs, const Command*
break;
case UnaryOpOperation_LOG1P:
if(mVectorize) {
ss << inpName << ".x=(log(1.0+" << operand << ".x));\n";
ss << inpName << ".y=(log(1.0+" << operand << ".y))";
if(mPrecision != BackendConfig::Precision_Low) {
if(mPrecision == BackendConfig::Precision_Low) {
ss << inpName << ".x=(half)(log(1.0+(float)" << operand << ".x));\n";
ss << inpName << ".y=(half)(log(1.0+(float)" << operand << ".y))";
} else {
ss << inpName << ".x=(log(1.0+" << operand << ".x));\n";
ss << inpName << ".y=(log(1.0+" << operand << ".y))";
ss << ";\n";
ss << inpName << ".z=(log(1.0+" << operand << ".z));\n";
ss << inpName << ".w=(log(1.0+" << operand << ".w))";
}
} else {
ss << inpName << "=(log(1.0+" << operand << "))";
if(mPrecision == BackendConfig::Precision_Low) {
ss << inpName << "=(log((half)1.0+" << operand << "))";
} else {
ss << inpName << "=(log(1.0+" << operand << "))";
}
}
break;
case UnaryOpOperation_FLOOR:
@@ -512,15 +519,22 @@ std::string CUDATarget::codegen(std::vector<std::string>& inputs, const Command*
break;
case UnaryOpOperation_SIGMOID:
if(mVectorize) {
ss << inpName << ".x=(1.0/(1.0+exp(-" << operand << ".x)));\n";
ss << inpName << ".y=(1.0/(1.0+exp(-" << operand << ".y)))";
if(mPrecision != BackendConfig::Precision_Low) {
if(mPrecision == BackendConfig::Precision_Low) {
ss << inpName << ".x=(half)(1.0/(1.0+(float)exp(-" << operand << ".x)));\n";
ss << inpName << ".y=(half)(1.0/(1.0+(float)exp(-" << operand << ".y)))";
} else {
ss << inpName << ".x=(1.0/(1.0+exp(-" << operand << ".x)));\n";
ss << inpName << ".y=(1.0/(1.0+exp(-" << operand << ".y)))";
ss << ";\n";
ss << inpName << ".z=(1.0/(1.0+exp(-" << operand << ".z)));\n";
ss << inpName << ".w=(1.0/(1.0+exp(-" << operand << ".w)))";
}
} else {
ss << inpName << "=(1.0/(1.0+exp(-" << operand << ")))";
if(mPrecision == BackendConfig::Precision_Low) {
ss << inpName << "=(half)(1.0/(1.0+(float)exp(-" << operand << ")))";
} else {
ss << inpName << "=(1.0/(1.0+exp(-" << operand << ")))";
}
}
break;
case UnaryOpOperation_TANH:
@@ -538,15 +552,22 @@ std::string CUDATarget::codegen(std::vector<std::string>& inputs, const Command*
break;
case UnaryOpOperation_RECIPROCAL:
if(mVectorize) {
ss << inpName << ".x=(1.0/" << operand << ".x);\n";
ss << inpName << ".y=(1.0/" << operand << ".y)";
if(mPrecision != BackendConfig::Precision_Low) {
if(mPrecision == BackendConfig::Precision_Low) {
ss << inpName << ".x=(half)(1.0/(float)" << operand << ".x);\n";
ss << inpName << ".y=(half)(1.0/(float)" << operand << ".y)";
} else {
ss << inpName << ".x=(1.0/" << operand << ".x);\n";
ss << inpName << ".y=(1.0/" << operand << ".y)";
ss << ";\n";
ss << inpName << ".z=(1.0/" << operand << ".z);\n";
ss << inpName << ".w=(1.0/" << operand << ".w)";
}
} else {
ss << inpName << "=(1.0/" << operand << ")";
if(mPrecision == BackendConfig::Precision_Low) {
ss << inpName << "=(half)(1.0/(float)" << operand << ")";
} else {
ss << inpName << "=(1.0/" << operand << ")";
}
}
break;
case UnaryOpOperation_LOG:
@@ -564,17 +585,44 @@ std::string CUDATarget::codegen(std::vector<std::string>& inputs, const Command*
break;
case UnaryOpOperation_GELU:
if(mVectorize) {
ss << inpName << ".x=(1.0f + tanh(0.79788458f * (0.044715f * " << operand << ".x*" << operand << ".x*" << operand << ".x+" << operand + ".x))) * " << operand << ".x* 0.5f);\n";
ss << inpName << ".y=(1.0f + tanh(0.79788458f * (0.044715f * " << operand << ".y*" << operand << ".y*" << operand << ".y+" << operand + ".y))) * " << operand << ".y* 0.5f)";
if(mPrecision != BackendConfig::Precision_Low) {
if(mPrecision == BackendConfig::Precision_Low) {
ss << inpName << ".x=(half)((1.0f + tanh(0.79788458f * (0.044715f * (float)" << operand << ".x*(float)" << operand << ".x*(float)" << operand << ".x+(float)" << operand + ".x))) * (float)" << operand << ".x* 0.5f);\n";
ss << inpName << ".y=(half)((1.0f + tanh(0.79788458f * (0.044715f * (float)" << operand << ".y*(float)" << operand << ".y*(float)" << operand << ".y+(float)" << operand + ".y))) * (float)" << operand << ".y* 0.5f)";
} else {
ss << inpName << ".x=((1.0f + tanh(0.79788458f * (0.044715f * " << operand << ".x*" << operand << ".x*" << operand << ".x+" << operand + ".x))) * " << operand << ".x* 0.5f);\n";
ss << inpName << ".y=((1.0f + tanh(0.79788458f * (0.044715f * " << operand << ".y*" << operand << ".y*" << operand << ".y+" << operand + ".y))) * " << operand << ".y* 0.5f)";
ss << ";\n";
ss << inpName << ".z=(1.0f + tanh(0.79788458f * (0.044715f * " << operand << ".z*" << operand << ".z*" << operand << ".z+" << operand + ".z))) * " << operand << ".z* 0.5f);\n";
ss << inpName << ".w=(1.0f + tanh(0.79788458f * (0.044715f * " << operand << ".w*" << operand << ".w*" << operand << ".w+" << operand + ".w))) * " << operand << ".w* 0.5f)";
ss << inpName << ".z=((1.0f + tanh(0.79788458f * (0.044715f * " << operand << ".z*" << operand << ".z*" << operand << ".z+" << operand + ".z))) * " << operand << ".z* 0.5f);\n";
ss << inpName << ".w=((1.0f + tanh(0.79788458f * (0.044715f * " << operand << ".w*" << operand << ".w*" << operand << ".w+" << operand + ".w))) * " << operand << ".w* 0.5f)";
}
} else {
ss << inpName << "=(1.0f + tanh(0.79788458f * (0.044715f * " << operand << "*" << operand << "*" << operand << "+" << operand + "))) * " << operand << "* 0.5f)";
if(mPrecision == BackendConfig::Precision_Low) {
ss << inpName << "=(half)((1.0f + tanh(0.79788458f * (0.044715f * (float)" << operand << "*(float)" << operand << "*(float)" << operand << "+(float)" << operand + "))) * (float)" << operand << "* 0.5f)";
} else {
ss << inpName << "=((1.0f + tanh(0.79788458f * (0.044715f * " << operand << "*" << operand << "*" << operand << "+" << operand + "))) * " << operand << "* 0.5f)";
}
}
break;
case UnaryOpOperation_GELU_STANDARD:
if(mVectorize) {
if(mPrecision == BackendConfig::Precision_Low) {
ss << inpName << ".x=(half)((erf((float)" << operand << ".x*0.7071067932881648f)+1.f)*(float)" << operand << ".x*0.5f);\n";
ss << inpName << ".y=(half)((erf((float)" << operand << ".y*0.7071067932881648f)+1.f)*(float)" << operand << ".y*0.5f)";
} else {
ss << inpName << ".x=((erf(" << operand << ".x*0.7071067932881648f)+1.f)*" << operand << ".x*0.5f);\n";
ss << inpName << ".y=((erf(" << operand << ".y*0.7071067932881648f)+1.f)*" << operand << ".y*0.5f)";
ss << ";\n";
ss << inpName << ".z=((erf(" << operand << ".z*0.7071067932881648f)+1.f)*" << operand << ".z*0.5f);\n";
ss << inpName << ".w=((erf(" << operand << ".w*0.7071067932881648f)+1.f)*" << operand << ".w*0.5f)";
}
} else {
if(mPrecision == BackendConfig::Precision_Low) {
ss << inpName << "=(half)((erf((float)" << operand << "*0.7071067932881648f)+1.f)*(float)" << operand << "*0.5f)";
} else {
ss << inpName << "=((erf(" << operand << "*0.7071067932881648f)+1.f)*" << operand << "*0.5f)";
}
}
break;
default:
MNN_PRINT("Error: CUDA CodeGen not support Unary type:%d\n", type);
break;
3 changes: 0 additions & 3 deletions demo/exec/nluDemo.cpp
@@ -104,12 +104,9 @@ int main(int argc, char* argv[]) {
for (int i = 0; i < 3; ++i) {
outputs = module->onForward(inputs);
}
globalExecutor->resetProfile();
outputs = module->onForward(inputs);
globalExecutor->dumpProfile();
{
MNN::Timer autoTime;
globalExecutor->resetProfile();
for (int i = 0; i < benchTime; ++i) {
MNN::AutoTime _t(0, "Once time");
// std::cout << i << std::endl;
2 changes: 0 additions & 2 deletions demo/exec/transformerDemo.cpp
@@ -42,9 +42,7 @@ int main(int argc, const char* argv[]) {
for (int i = 0; i < 2; ++i) {
{
AUTOTIME;
Executor::getGlobalExecutor()->resetProfile();
outputs = model->onForward({first, second});
Executor::getGlobalExecutor()->dumpProfile();
}
std::ostringstream fileNameOs;
std::ostringstream dimInfo;
2 changes: 1 addition & 1 deletion docs/tools/benchmark.md
@@ -10,7 +10,7 @@
- warm_up_count: number of warm-up runs
- forwardtype: optional, defaults to 0 (CPU); valid values are 0->CPU, 1->Metal, 3->OpenCL, 6->OpenGL, 7->Vulkan
- numberThread: optional, defaults to 4; the number of CPU threads, or the run mode for GPU backends
- precision: optional, defaults to 2 (precision_low)
- precision: optional, defaults to 2; valid values are 0 (Normal), 1 (High), 2 (Low_FP16), 3 (Low_BF16)
- weightSparsity: optional, defaults to 0.0; sparse computation is enabled when weightSparsity > 0.5 and the backend supports it
- weightSparseBlockNumber: optional, defaults to 1; only effective when weightSparsity > 0.5. It is the block size for sparse computation; larger values favor sparse acceleration, and 1, 4, 8 or 16 are the usual choices
- testQuantizedModel: optional, defaults to 0, i.e. only the float model is tested; when set to 1, the quantized model is tested after the float model
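
A hedged invocation sketch follows (the positional argument order is assumed from the parameter list above, and the model folder path is a placeholder):

```bash
# Assumed order: models_folder loopCount warm_up_count forwardtype numberThread \
#                precision weightSparsity weightSparseBlockNumber testQuantizedModel
# CPU backend (0), 4 threads, precision 2 (Low_FP16), dense weights, float + quantized test.
./benchmark.out ./models 50 10 0 4 2 0.0 1 1
```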
48 changes: 1 addition & 47 deletions docs/tools/test.md
@@ -68,7 +68,7 @@ Avg= 5.570600 ms, OpSum = 7.059200 ms min= 3.863000 ms, max= 11.596001 ms
### Parameters
`./ModuleBasic.out model dir [runMask forwardType runLoops numberThread precision_memory cacheFile]`
- `model:str` path to the model file
- `dir:str` folder with the input/output information; it can be generated with scripts such as fastTestOnnx.py / fastTestTf.py / fastTestTflite.py, see the correctness-verification section of the model-conversion docs.
- `dir:str` folder with the input/output information; it can be generated with scripts such as testMNNFromTf.py / testMNNFromOnnx.py / testMNNFromTflite.py, see the correctness-verification section of the model-conversion docs.
- `runMask:int` defaults to 0; a set of feature switches. To enable several features, add the corresponding mask values together (cases that cannot be combined are noted separately); see the runMask breakdown below for details
- `forwardType:int` compute device used for inference; valid values are 0 (CPU), 1 (Metal), 2 (CUDA), 3 (OpenCL), 6 (OpenGL), 7 (Vulkan), 9 (TensorRT); optional, defaults to `0`
- `runLoops:int` number of iterations for the performance test; optional, defaults to `0`, i.e. no performance test is run
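
A hedged example invocation (the model file and test folder names are placeholders; the argument order follows the signature above):

```bash
# temp.mnn: model file; mnn_test/: folder produced by one of the testMNNFrom*.py scripts.
# runMask=0, forwardType=0 (CPU), runLoops=10, numberThread=4.
./ModuleBasic.out temp.mnn mnn_test 0 0 10 4
```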
@@ -456,49 +456,3 @@ Matrix:
0.0000000 0.0000000 1.0000000
```

## winogradGenerateCL.out
### Description
Generates the Winograd transform matrices and emits the corresponding OpenCL transform code
### Parameters
`./winogradGenerateCL.out unit kernelSize`
- `unit:int` tile size
- `kernelSize:int` convolution kernel size
### Example
```bash
$ ./winogradGenerateCL.out 2 2
A
1.0000000 0.0000000
1.0000000 0.5000000
0.0000000 1.0000000
B
1.0000000 0.0000000 -0.0000000
-2.0000000 2.0000000 -0.5000000
0.0000000 0.0000000 1.0000000
G
1.0000000 0.0000000
1.0000000 0.5000000
0.0000000 1.0000000
Generate winogradTransformSource2_2_0.5.cl
Generate winogradTransformDest2_2_0.5.cl
```

## winogradGenerateGLSL.out
### Description
Generates the Winograd transform matrices and emits the corresponding OpenGL (GLSL) transform code
### Parameters
`./winogradGenerateGLSL.out unit kernelSize`
- `unit:int` tile size
- `kernelSize:int` convolution kernel size
### Example
```bash
$ ./winogradGenerateGLSL.out 1 2
A
1.0000000
B
1.0000000 -0.0000000
0.0000000 1.0000000
G
1.0000000
Generate winogradTransformSource1_2_0.5.comp
Generate winogradTransformDest1_2_0.5.comp
```
4 changes: 0 additions & 4 deletions express/CMakeLists.txt
@@ -13,11 +13,7 @@ if(MNN_CUDA_PROFILE)
endif()

file(GLOB_RECURSE MNN_EXPR_SRCS "${CMAKE_CURRENT_LIST_DIR}/*.*")
option(MNN_EXPR_ENABLE_PROFILER "Support profile Expr's op cost" OFF)
option(MNN_EXPR_SHAPE_EAGER "Force compute Expr's shape directly cost" OFF)
IF (MNN_EXPR_ENABLE_PROFILER)
add_definitions(-DMNN_EXPR_ENABLE_PROFILER)
ENDIF()
IF (MNN_EXPR_SHAPE_EAGER)
add_definitions(-DMNN_EXPR_SHAPE_EAGER)
ENDIF()
76 changes: 3 additions & 73 deletions express/Executor.cpp
@@ -21,55 +21,9 @@
#ifdef MNN_EXPR_ENABLE_PROFILER
#define MNN_EXPRESS_ERROR_REPORT
#endif
#define MNN_EXPRESS_OPEN_MEMORY_REUSE

namespace MNN {
namespace Express {
#ifdef MNN_EXPR_ENABLE_PROFILER
class Executor::Profiler {
public:
void reset();
void dump() const;
void add(const std::string& opType, float timeInMs);
void addFlops(const std::string& opType, float flops);
private:
std::map<std::string, float> mTimes;
std::map<std::string, float> mFlops;
};
void Executor::Profiler::reset() {
mTimes.clear();
mFlops.clear();
}
void Executor::Profiler::dump() const {
float sumValue = 0.0f;
for (auto iter : mTimes) {
MNN_PRINT("%s: %f ms\n", iter.first.c_str(), iter.second);
sumValue += iter.second;
}
MNN_PRINT("Total: %f ms\n", sumValue);
sumValue = 0.0f;
for (auto iter : mFlops) {
MNN_PRINT("%s: %f \n", iter.first.c_str(), iter.second);
sumValue += iter.second;
}
MNN_PRINT("Total flops: %f M\n", sumValue);
}
void Executor::Profiler::add(const std::string& opType, float timeInMs) {
auto iter = mTimes.find(opType);
if (iter == mTimes.end()) {
mTimes[opType] = timeInMs;
return;
}
iter->second += timeInMs;
}
void Executor::Profiler::addFlops(const std::string& opType, float flops) {
auto iter = mFlops.find(opType);
if (iter == mFlops.end()) {
mFlops[opType] = flops;
return;
}
iter->second += flops;
}
#endif

void Executor::setGlobalExecutorConfig(MNNForwardType type, const BackendConfig& config, int numberThread) {
std::lock_guard<std::mutex> _l(mMutex);
@@ -648,36 +602,12 @@ void Executor::makeCache(const std::vector<EXPRP>& expr, bool forceCPU) {
//FUNC_PRINT(mCaches.size());
_makeCache(expr, forceCPU);
}
void Executor::addOpCostTime(int op, float costTime) {
#ifdef MNN_EXPR_ENABLE_PROFILER
auto opType = MNN::EnumNameOpType((OpType)op);
if (nullptr == opType) {
return;
}
mProfiler->add(opType, costTime);
#endif
}
void Executor::addOpCostTime(const std::string& type, float costTime) {
#ifdef MNN_EXPR_ENABLE_PROFILER
mProfiler->add(type, costTime);
#endif
}
void Executor::addOpFlops(const std::string& type, float flops) {
#ifdef MNN_EXPR_ENABLE_PROFILER
mProfiler->addFlops(type, flops);
#endif
}


void Executor::resetProfile() {
#ifdef MNN_EXPR_ENABLE_PROFILER
mProfiler->reset();
#endif
// Depercated
}
void Executor::dumpProfile() {
#ifdef MNN_EXPR_ENABLE_PROFILER
mProfiler->dump();
#endif
    // Deprecated
}

bool Executor::registerSubGraph(const std::string& submoduleName, VARPS outputs, VARPS inputs) {