update xnnpack to newer version and update API usage in pytorch (pytorch#94330)

Summary:
Pull Request resolved: pytorch#94330

Update XNNPACK to 51a987591a6fc9f0fc0707077f53d763ac132cbf (https://github.com/google/XNNPACK/commits/51a987591a6fc9f0fc0707077f53d763ac132cbf)

Update the corresponding CMake and BUCK rules, as well as generate_wrapper.py, for the new version.

XNNPACK has changed substantially since the revision we were pinned to: the upstream community has refactored the code and changed several APIs, as the changes to its CMakeLists.txt make clear. Updating now keeps us in sync with upstream, which is important for our future development, and many projects already depend on newer XNNPACK versions. Because the new revision changes some XNNPACK APIs, the corresponding call sites in PyTorch are updated as well; a brief call-site sketch is included below. The target build files and generate_wrapper.py are also updated to make wrapper generation more automatic, and source files that were missing from the original targets are added to the buck2 build files so that XNNPACK builds and its tests pass.

Committing this change requires allowing large files, because the newer XNNPACK version brings in some large files.
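
For illustration, a minimal sketch of the call-site change (the leading padding/shape/quantization arguments are elided here; the authoritative parameter lists are in the updated XNNPACK headers and in the diffs below): each affected xnn_create_* call now takes an xnn_caches_t argument immediately before the output operator pointer, and PyTorch passes nullptr because these call sites do not use XNNPACK's caches.

    // Sketch only: trailing arguments of an updated XNNPACK operator-creation call.
    enum xnn_status status = xnn_create_convolution2d_nhwc_qs8(
        /* ...padding, kernel, stride, dilation, group, and channel arguments... */
        output_min,       /* int8_t output_min */
        output_max,       /* int8_t output_max */
        0u,               /* uint32_t flags */
        nullptr,          /* xnn_caches_t caches -- new argument in this XNNPACK revision */
        &convolution_op); /* xnn_operator_t* convolution_op_out */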

Test Plan:
buck2 build //xplat/third-party/XNNPACK:operators
buck2 build //xplat/third-party/XNNPACK:XNNPACK
buck2 test fbcode//caffe2/test:xnnpack_integration

Reviewed By: digantdesai

Differential Revision: D43092938

fbshipit-source-id: 6fa5028544533ce40ce1b4b91d57524bc88a3f8c
Cuiqing Li authored and facebook-github-bot committed Feb 8, 2023
1 parent 3ce1ebb commit 6b67439
Showing 7 changed files with 13,608 additions and 1,553 deletions.
4 changes: 4 additions & 0 deletions aten/src/ATen/native/quantized/cpu/XnnpackUtils.h
@@ -99,6 +99,7 @@ enum xnn_status xnnp_create_convolution2d_nhwc(
op_min, /* int8_t output_min */
op_max, /* int8_t output_max */
flags, /* uint32_t flags */
nullptr, /* xnn_caches_t caches */
op); /* xnn_operator_t* deconvolution_op_out */

}
@@ -130,6 +131,7 @@ enum xnn_status xnnp_create_convolution2d_nhwc(
op_min, /* int8_t output_min */
op_max, /* int8_t output_max */
flags, /* uint32_t flags */
nullptr, /* xnn_caches_t caches */
op); /* xnn_operator_t* convolution_op_out */
} else { /* per_channel */
return xnn_create_convolution2d_nhwc_qc8(
@@ -158,6 +160,7 @@ enum xnn_status xnnp_create_convolution2d_nhwc(
op_min, /* int8_t output_min */
op_max, /* int8_t output_max */
flags, /* uint32_t flags */
nullptr, /* xnn_caches_t caches */
op); /* xnn_operator_t* convolution_op_out */
}
}
@@ -254,6 +257,7 @@ enum xnn_status xnnp_create_fully_connected_nc(
output_min, /* int8_t output_min */
output_max, /* int8_t output_max */
flags, /* uint32_t flags */
nullptr, /* xnn_caches_t caches */
fully_connected_op_out); /* xnn_operator_t* fully_connected_op_out */
}

2 changes: 2 additions & 0 deletions aten/src/ATen/native/xnnpack/Convolution.cpp
@@ -236,6 +236,7 @@ ContextConv2D create(
output_min, // output_min
output_max, // output_max
0u, // flags
nullptr, // xnn_caches_t
&convolution_op); // operator
} else {
for (const auto i : c10::irange(4)) {
@@ -264,6 +265,7 @@ ContextConv2D create(
output_min, // output_min
output_max, // output_max
0u, // flags
nullptr, // xnn_caches_t
&convolution_op); // operator
}

1 change: 1 addition & 0 deletions aten/src/ATen/native/xnnpack/Linear.cpp
@@ -97,6 +97,7 @@ ContextLinear create(
output_min, // output_min
output_max, // output_max
0u, // flags
nullptr, // xnn_caches_t
&linear_op); // operator

TORCH_CHECK(
2 changes: 1 addition & 1 deletion third_party/XNNPACK
Submodule XNNPACK updated 13838 files
125 changes: 119 additions & 6 deletions third_party/xnnpack.buck.bzl
@@ -35,6 +35,9 @@ load(
"PROD_SSE_MICROKERNEL_SRCS",
"PROD_SSSE3_MICROKERNEL_SRCS",
"PROD_XOP_MICROKERNEL_SRCS",
"ALL_NEONFMA_AARCH64_MICROKERNEL_SRCS",
"ALL_NEON_AARCH64_MICROKERNEL_SRCS",
"PROD_AVX512VBMI_MICROKERNEL_SRCS",
)

# This defines XNNPACK targets for both fbsource BUCK and OSS BUCK
@@ -99,6 +102,7 @@ def define_xnnpack(third_party, labels = [], XNNPACK_WINDOWS_AVX512F_ENABLED = F
preferred_linkage = "static",
preprocessor_flags = [
"-DXNN_LOG_LEVEL=0",
"-DXNN_ENABLE_GEMM_M_SPECIALIZATION=0",
],
visibility = ["PUBLIC"],
windows_clang_compiler_flags_override = WINDOWS_FLAGS + WINDOWS_CLANG_COMPILER_FLAGS,
@@ -131,6 +135,9 @@ def define_xnnpack(third_party, labels = [], XNNPACK_WINDOWS_AVX512F_ENABLED = F
preferred_linkage = "static",
preprocessor_flags = [
"-DXNN_LOG_LEVEL=0",
"-DXNN_ENABLE_JIT=0",
"-DXNN_ENABLE_SPARSE=0",
"-DXNN_ENABLE_MEMOPT",
],
visibility = ["PUBLIC"],
windows_clang_compiler_flags_override = WINDOWS_FLAGS + WINDOWS_CLANG_COMPILER_FLAGS,
@@ -1088,6 +1095,53 @@ def define_xnnpack(third_party, labels = [], XNNPACK_WINDOWS_AVX512F_ENABLED = F
],
)

fb_xplat_cxx_library(
name = "prod_avx512vbmi",
srcs = (select({
"DEFAULT": [],
"ovr_config//os:macos-x86_64": PROD_AVX512VBMI_MICROKERNEL_SRCS,
}) if is_arvr_mode() else []),
headers = subdir_glob([
("XNNPACK/src", "**/*.c"),
("XNNPACK/src", "**/*.h"),
]),
header_namespace = "",
apple_sdks = (IOS, MACOSX, APPLETVOS),
compiler_flags = [
"-O2",
"-mavx512f",
],
fbobjc_preprocessor_flags = [
"-DXNN_PRIVATE=",
"-DXNN_INTERNAL=",
],
labels = labels,
platform_compiler_flags = [
(
"x86",
[
"-mavx512f",
],
),
],
platform_srcs = ([
(
"x86|x86_64|platform009|platform010",
PROD_AVX512F_MICROKERNEL_SRCS,
),
] if not is_arvr_mode() else []),
preferred_linkage = "static",
preprocessor_flags = [
"-DXNN_LOG_LEVEL=0",
],
visibility = ["PUBLIC"],
windows_clang_compiler_flags_override = WINDOWS_FLAGS + WINDOWS_CLANG_COMPILER_FLAGS + ["-mavx512f"],
windows_compiler_flags_override = WINDOWS_FLAGS + ["-mavx512f"],
deps = [
":interface",
],
)

fb_xplat_cxx_library(
name = "ukernels_avx512_ovr_win32",
headers = subdir_glob([
@@ -1474,7 +1528,7 @@ def define_xnnpack(third_party, labels = [], XNNPACK_WINDOWS_AVX512F_ENABLED = F

fb_xplat_cxx_library(
name = "ukernels_neon_aarch64",
srcs = PROD_AARCH64_NEON_MICROKERNEL_SRCS,
srcs = ALL_NEON_AARCH64_MICROKERNEL_SRCS,
headers = subdir_glob([
("XNNPACK/src", "**/*.c"),
("XNNPACK/src", "**/*.h"),
@@ -1589,6 +1643,47 @@ def define_xnnpack(third_party, labels = [], XNNPACK_WINDOWS_AVX512F_ENABLED = F
],
)

fb_xplat_cxx_library(
name = "ukernels_neonfma_aarch64",
srcs = ALL_NEONFMA_AARCH64_MICROKERNEL_SRCS,
headers = subdir_glob([
("XNNPACK/src", "**/*.h"),
("XNNPACK/src", "**/*.c"),
]),
header_namespace = "",
apple_sdks = (IOS, MACOSX, APPLETVOS),
compiler_flags = [
"-O2",
],
fbobjc_preprocessor_flags = [
"-DXNN_PRIVATE=",
"-DXNN_INTERNAL=",
],
labels = labels,
platform_compiler_flags = [
(
"^(android-armv8|iphoneos-armv8)$",
[
"-march=armv8-a",
"-mfpu=neon-fp-armv8",
"-mfloat-abi=softfp",
],
),
],
platforms = (APPLE, ANDROID, CXX, WINDOWS),
preferred_linkage = "static",
preprocessor_flags = [
"-DXNN_LOG_LEVEL=0",
],
visibility = ["PUBLIC"],
windows_clang_compiler_flags_override = WINDOWS_FLAGS + WINDOWS_CLANG_COMPILER_FLAGS,
windows_compiler_flags_override = WINDOWS_FLAGS,
deps = [
":interface",
third_party("FP16"),
],
)

fb_xplat_cxx_library(
name = "ukernels_asm_aarch32",
srcs = AARCH32_ASM_MICROKERNEL_SRCS,
@@ -1686,6 +1781,7 @@ def define_xnnpack(third_party, labels = [], XNNPACK_WINDOWS_AVX512F_ENABLED = F
":ukernels_neon_fp16",
":ukernels_neon_fp16arith_aarch64",
":ukernels_neon_v8",
":ukernels_neonfma_aarch64",
],
)

@@ -1707,6 +1803,7 @@ def define_xnnpack(third_party, labels = [], XNNPACK_WINDOWS_AVX512F_ENABLED = F
":ukernels_sse41",
":ukernels_ssse3",
":ukernels_xop",
":prod_avx512vbmi",
],
)

@@ -1749,6 +1846,7 @@ def define_xnnpack(third_party, labels = [], XNNPACK_WINDOWS_AVX512F_ENABLED = F
":ukernels_neon_fp16arith_aarch64",
":ukernels_neon_v8",
":ukernels_scalar_aarch32",
":ukernels_neonfma_aarch64",
],
)

@@ -1820,15 +1918,30 @@ def define_xnnpack(third_party, labels = [], XNNPACK_WINDOWS_AVX512F_ENABLED = F
"-DXNN_NO_X8_OPERATORS",
"-DXNN_NO_XX_OPERATORS",
"-DXNN_ENABLE_MEMOPT",
"-DXNN_ENABLE_SPARSE=0",
"-DXNN_ENABLE_JIT=0",
"-DXNN_ENABLE_ASSEMBLY",
"-DXNN_ENABLE_GEMM_M_SPECIALIZATION",
"-DXNN_ENABLE_ARM_DOTPROD",
],
srcs = [
"XNNPACK/src/allocator.c",
"XNNPACK/src/init.c",
"XNNPACK/src/memory-planner.c",
"XNNPACK/src/operator-delete.c",
"XNNPACK/src/runtime.c",
"XNNPACK/src/subgraph.c",
"XNNPACK/src/tensor.c",
"XNNPACK/src/params.c",
"XNNPACK/src/operator-run.c",
"XNNPACK/src/microparams-init.c",
"XNNPACK/src/binary-elementwise-config.c",
"XNNPACK/src/packing.c",
"XNNPACK/src/indirection.c",
"XNNPACK/src/cache.c",
"XNNPACK/src/mutex.c",
"XNNPACK/src/operator-utils.c",
"XNNPACK/src/memory.c",
"XNNPACK/src/hardware-config.c",
"XNNPACK/src/x8-lut-config.c",
"XNNPACK/src/normalization.c",
"XNNPACK/src/transpose-config.c",
"XNNPACK/src/amalgam/scalar.c",
] + LOGGING_SRCS,
visibility = ["PUBLIC"],
windows_clang_compiler_flags_override = (WINDOWS_FLAGS + WINDOWS_CLANG_COMPILER_FLAGS) if XNNPACK_WINDOWS_AVX512F_ENABLED else WINDOWS_FLAGS,
(Diffs for the remaining changed files are not rendered here.)
