From 9c3a35b9a92919a06c25eaf30d21872e80bba83d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=A7=9C=E6=B0=B8=E4=B9=85?= <34344716+yjjiang11@users.noreply.github.com> Date: Tue, 31 Jan 2023 10:11:29 +0800 Subject: [PATCH 01/89] rm flags retain grad in pybind (#49888) * rm flags_retain grad in pybind * retain grads for xpu test * set retain grad for xpu * rm flag * lint --------- Co-authored-by: wanghuancoder --- .../eager_manual/forwards/add_n_fwd_func.cc | 1 - .../forwards/conv2d_fwd_function.cc | 1 - .../manual/eager_manual/nodes/conv2d_nodes.cc | 2 - .../forwards/fused_attention_fwd_func.cc | 14 ------ ...as_dropout_residual_layer_norm_fwd_func.cc | 1 - .../forwards/fused_feedforward_fwd_func.cc | 1 - .../forwards/fused_gate_attention_fwd_func.cc | 1 - .../forwards/fused_gemm_epilogue_fwd_func.cc | 1 - .../auto_code_generator/eager_generator.cc | 18 ------- .../generator/eager_gen.py | 14 +----- .../custom_operator/custom_operator_node.cc | 1 - .../eager/to_static/run_program_op_func.h | 1 - paddle/fluid/eager/utils.cc | 33 ------------ paddle/fluid/eager/utils.h | 8 --- paddle/fluid/pybind/eager_functions.cc | 1 - paddle/fluid/pybind/eager_py_layer.cc | 2 - .../paddle/fluid/tests/unittests/test_flip.py | 2 - .../unittests/xpu/test_zero_dim_tensor_xpu.py | 50 ++++++++++++++++++- 18 files changed, 51 insertions(+), 101 deletions(-) diff --git a/paddle/fluid/eager/api/manual/eager_manual/forwards/add_n_fwd_func.cc b/paddle/fluid/eager/api/manual/eager_manual/forwards/add_n_fwd_func.cc index a6bc082715a39..a6f1b99e1f022 100644 --- a/paddle/fluid/eager/api/manual/eager_manual/forwards/add_n_fwd_func.cc +++ b/paddle/fluid/eager/api/manual/eager_manual/forwards/add_n_fwd_func.cc @@ -99,7 +99,6 @@ paddle::experimental::Tensor add_n_ad_func( egr::EagerUtils::SetHistory(out_autograd_meta, grad_node); } grad_node->SetGradInMeta(out, 0); - egr::EagerUtils::CheckAndRetainGrad(out); // Set TensorWrappers for Forward Outputs if needed } diff --git a/paddle/fluid/eager/api/manual/eager_manual/forwards/conv2d_fwd_function.cc b/paddle/fluid/eager/api/manual/eager_manual/forwards/conv2d_fwd_function.cc index 52a2bd12f0a05..df5feab911f4f 100644 --- a/paddle/fluid/eager/api/manual/eager_manual/forwards/conv2d_fwd_function.cc +++ b/paddle/fluid/eager/api/manual/eager_manual/forwards/conv2d_fwd_function.cc @@ -162,7 +162,6 @@ paddle::experimental::Tensor conv2d_ad_func( egr::EagerUtils::SetHistory(out_autograd_meta, grad_node); } grad_node->SetGradInMeta(out, 0); - egr::EagerUtils::CheckAndRetainGrad(out); // Set TensorWrappers for Forward Outputs if needed } diff --git a/paddle/fluid/eager/api/manual/eager_manual/nodes/conv2d_nodes.cc b/paddle/fluid/eager/api/manual/eager_manual/nodes/conv2d_nodes.cc index 647f6768bc6b1..8ba19d99cf458 100644 --- a/paddle/fluid/eager/api/manual/eager_manual/nodes/conv2d_nodes.cc +++ b/paddle/fluid/eager/api/manual/eager_manual/nodes/conv2d_nodes.cc @@ -159,8 +159,6 @@ Conv2dGradNodeFinal::operator()( } grad_node->SetGradInMeta(grad_input, 0); grad_node->SetGradInMeta(grad_filter, 1); - egr::EagerUtils::CheckAndRetainGrad(grad_input); - egr::EagerUtils::CheckAndRetainGrad(grad_filter); // Set TensorWrappers for Forward Outputs if needed } diff --git a/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_attention_fwd_func.cc b/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_attention_fwd_func.cc index a84c53e33a106..db1d6c1d409d7 100644 --- a/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_attention_fwd_func.cc +++ 
b/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_attention_fwd_func.cc @@ -432,7 +432,6 @@ fused_attention_dygraph_function( egr::EagerUtils::SetHistory(p_autograd_QKVBiasOut, QKVBiasOut_accumulation_node); QKVBiasOut_accumulation_node->SetGradInMeta(QKVBiasOut, 0); - egr::EagerUtils::CheckAndRetainGrad(QKVBiasOut); grad_node->SetGradOutMeta(QKVBiasOut, 11); } @@ -446,7 +445,6 @@ fused_attention_dygraph_function( egr::EagerUtils::SetHistory(p_autograd_SrcMaskOut, SrcMaskOut_accumulation_node); SrcMaskOut_accumulation_node->SetGradInMeta(SrcMaskOut, 0); - egr::EagerUtils::CheckAndRetainGrad(SrcMaskOut); grad_node->SetGradOutMeta(SrcMaskOut, 12); } @@ -473,7 +471,6 @@ fused_attention_dygraph_function( egr::EagerUtils::SetHistory(p_autograd_LnOut, LnOut_accumulation_node); LnOut_accumulation_node->SetGradInMeta(LnOut, 0); - egr::EagerUtils::CheckAndRetainGrad(LnOut); grad_node->SetGradOutMeta(LnOut, 13); } if (LnMean.initialized()) { @@ -505,7 +502,6 @@ fused_attention_dygraph_function( BiasDropoutResidualOut_accumulation_node); BiasDropoutResidualOut_accumulation_node->SetGradInMeta( BiasDropoutResidualOut, 0); - egr::EagerUtils::CheckAndRetainGrad(BiasDropoutResidualOut); grad_node->SetGradOutMeta(BiasDropoutResidualOut, 14); } @@ -524,17 +520,14 @@ fused_attention_dygraph_function( egr::EagerUtils::SetOutRankWithSlot(p_autograd_CacheKVOut, 18); egr::EagerUtils::SetHistory(p_autograd_CacheKVOut, grad_node); grad_node->SetGradInMeta(CacheKVOut, 18); - egr::EagerUtils::CheckAndRetainGrad(CacheKVOut); egr::EagerUtils::SetOutRankWithSlot(p_autograd_Y, 19); egr::EagerUtils::SetHistory(p_autograd_Y, grad_node); grad_node->SetGradInMeta(Y, 19); - egr::EagerUtils::CheckAndRetainGrad(Y); auto QKVOut_accumulation_node = std::make_shared(p_autograd_QKVOut); egr::EagerUtils::SetOutRankWithSlot(p_autograd_QKVOut, 0); egr::EagerUtils::SetHistory(p_autograd_QKVOut, QKVOut_accumulation_node); QKVOut_accumulation_node->SetGradInMeta(QKVOut, 0); - egr::EagerUtils::CheckAndRetainGrad(QKVOut); grad_node->SetGradOutMeta(QKVOut, 15); auto QKTVOut_accumulation_node = @@ -543,7 +536,6 @@ fused_attention_dygraph_function( egr::EagerUtils::SetHistory(p_autograd_QKTVOut, QKTVOut_accumulation_node); QKTVOut_accumulation_node->SetGradInMeta(QKTVOut, 0); - egr::EagerUtils::CheckAndRetainGrad(QKTVOut); grad_node->SetGradOutMeta(QKTVOut, 16); auto TransposeOut2_accumulation_node = @@ -552,7 +544,6 @@ fused_attention_dygraph_function( egr::EagerUtils::SetHistory(p_autograd_TransposeOut2, TransposeOut2_accumulation_node); TransposeOut2_accumulation_node->SetGradInMeta(TransposeOut2, 0); - egr::EagerUtils::CheckAndRetainGrad(TransposeOut2); grad_node->SetGradOutMeta(TransposeOut2, 17); auto QKOut_accumulation_node = @@ -560,7 +551,6 @@ fused_attention_dygraph_function( egr::EagerUtils::SetOutRankWithSlot(p_autograd_QKOut, 0); egr::EagerUtils::SetHistory(p_autograd_QKOut, QKOut_accumulation_node); QKOut_accumulation_node->SetGradInMeta(QKOut, 0); - egr::EagerUtils::CheckAndRetainGrad(QKOut); grad_node->SetGradOutMeta(QKOut, 18); auto SoftmaxOut_accumulation_node = @@ -569,7 +559,6 @@ fused_attention_dygraph_function( egr::EagerUtils::SetHistory(p_autograd_SoftmaxOut, SoftmaxOut_accumulation_node); SoftmaxOut_accumulation_node->SetGradInMeta(SoftmaxOut, 0); - egr::EagerUtils::CheckAndRetainGrad(SoftmaxOut); grad_node->SetGradOutMeta(SoftmaxOut, 19); if (AttnDropoutOut.initialized()) { @@ -580,7 +569,6 @@ fused_attention_dygraph_function( egr::EagerUtils::SetHistory(p_autograd_AttnDropoutOut, 
AttnDropoutOut_accumulation_node); AttnDropoutOut_accumulation_node->SetGradInMeta(AttnDropoutOut, 0); - egr::EagerUtils::CheckAndRetainGrad(AttnDropoutOut); grad_node->SetGradOutMeta(AttnDropoutOut, 20); } @@ -590,7 +578,6 @@ fused_attention_dygraph_function( egr::EagerUtils::SetHistory(p_autograd_FMHAOut, FMHAOut_accumulation_node); FMHAOut_accumulation_node->SetGradInMeta(FMHAOut, 0); - egr::EagerUtils::CheckAndRetainGrad(FMHAOut); grad_node->SetGradOutMeta(FMHAOut, 21); auto OutLinearOut_accumulation_node = @@ -599,7 +586,6 @@ fused_attention_dygraph_function( egr::EagerUtils::SetHistory(p_autograd_OutLinearOut, OutLinearOut_accumulation_node); OutLinearOut_accumulation_node->SetGradInMeta(OutLinearOut, 0); - egr::EagerUtils::CheckAndRetainGrad(OutLinearOut); grad_node->SetGradOutMeta(OutLinearOut, 22); } } diff --git a/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_bias_dropout_residual_layer_norm_fwd_func.cc b/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_bias_dropout_residual_layer_norm_fwd_func.cc index 2e87d0b740cc7..2544ad7b6e2da 100644 --- a/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_bias_dropout_residual_layer_norm_fwd_func.cc +++ b/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_bias_dropout_residual_layer_norm_fwd_func.cc @@ -221,7 +221,6 @@ fused_bias_dropout_residual_layer_norm_dygraph_function( egr::EagerUtils::SetOutRankWithSlot(p_autograd_Y, 4); egr::EagerUtils::SetHistory(p_autograd_Y, grad_node); grad_node->SetGradInMeta(Y, 4); - egr::EagerUtils::CheckAndRetainGrad(Y); } } diff --git a/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_feedforward_fwd_func.cc b/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_feedforward_fwd_func.cc index 5b66eea7abb62..dce620fd32a4a 100644 --- a/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_feedforward_fwd_func.cc +++ b/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_feedforward_fwd_func.cc @@ -363,7 +363,6 @@ fused_feedforward_dygraph_function( egr::EagerUtils::SetOutRankWithSlot(p_autograd_Out, 0); egr::EagerUtils::SetHistory(p_autograd_Out, grad_node); grad_node->SetGradInMeta(Out, 0); - egr::EagerUtils::CheckAndRetainGrad(Out); egr::EagerUtils::SetOutRankWithSlot(p_autograd_Dropout1Mask, 1); grad_node->SetGradInMeta(Dropout1Mask, 1); egr::EagerUtils::SetOutRankWithSlot(p_autograd_Dropout2Mask, 2); diff --git a/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_gate_attention_fwd_func.cc b/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_gate_attention_fwd_func.cc index 641d62a58e864..1ad201a8f81ac 100644 --- a/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_gate_attention_fwd_func.cc +++ b/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_gate_attention_fwd_func.cc @@ -372,7 +372,6 @@ fused_gate_attention_dygraph_function( egr::EagerUtils::SetOutRankWithSlot(p_autograd_Out, 7); egr::EagerUtils::SetHistory(p_autograd_Out, grad_node); grad_node->SetGradInMeta(Out, 7); - egr::EagerUtils::CheckAndRetainGrad(Out); } } diff --git a/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_gemm_epilogue_fwd_func.cc b/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_gemm_epilogue_fwd_func.cc index 8d9f6d769a37c..72dccb3bb0d15 100644 --- a/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_gemm_epilogue_fwd_func.cc +++ b/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_gemm_epilogue_fwd_func.cc @@ -120,7 +120,6 @@ paddle::experimental::Tensor fused_gemm_epilogue_dygraph_function( 
egr::EagerUtils::SetOutRankWithSlot(p_autograd_Out, 0); egr::EagerUtils::SetHistory(p_autograd_Out, grad_node); grad_node->SetGradInMeta(Out, 0); - egr::EagerUtils::CheckAndRetainGrad(Out); } } diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc index 5305f4f984f62..5915494ebc3cd 100644 --- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc +++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc @@ -1305,15 +1305,6 @@ static std::string GenerateGradNodeCreationContent( paddle::string::Sprintf(SET_GRAD_IN_META_TEMPLATE, LegalizeVarName(inplace_input_name), output_position); - - // Intermediate Tensor does not require CheckAndRetainGrad - if (!output.intermediate()) { - VLOG(6) << "Generated Call RetainGradForTensor"; - const char* RETAIN_GRAD_TEMPLATE = - " egr::EagerUtils::CheckAndRetainGrad(%s);\n"; - grad_node_creation_str += paddle::string::Sprintf( - RETAIN_GRAD_TEMPLATE, LegalizeVarName(inplace_input_name)); - } } else { const std::string& output_autograd_name = "p_autograd_" + LegalizeVarName(output_name); @@ -1363,15 +1354,6 @@ static std::string GenerateGradNodeCreationContent( LegalizeVarName(output_name), output_position); } - - // Intermediate Tensor does not require CheckAndRetainGrad - if (!output.intermediate()) { - VLOG(6) << "Generated Call RetainGradForTensor"; - const char* RETAIN_GRAD_TEMPLATE = - " egr::EagerUtils::CheckAndRetainGrad(%s);\n"; - grad_node_creation_str += paddle::string::Sprintf( - RETAIN_GRAD_TEMPLATE, LegalizeVarName(output_name)); - } } } VLOG(6) << "Generated SetGradIn/OutMeta"; diff --git a/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py index 499eb42ea5ca3..650bf0626f1ad 100644 --- a/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py @@ -280,8 +280,7 @@ class {} : public egr::GradNodeBase {{ {} // SetGradOutMeta & SetEdges {} - // SetOutRank & SetHistory & SetGradInMeta & RetainGrad -{} + // SetOutRank & SetHistory & SetGradInMeta {} {} {} @@ -300,8 +299,7 @@ class {} : public egr::GradNodeBase {{ {} // SetGradOutMeta & SetEdges {} - // SetOutRank & SetHistory & SetGradInMeta & RetainGrad -{} + // SetOutRank & SetHistory & SetGradInMeta {} {} {} @@ -987,7 +985,6 @@ def GenerateNodeCreationCodes(self, for_backward=False): set_out_rank_list = [] set_history_list = [] set_grad_in_meta_list = [] - set_retain_grad_list = [] num_outputs = len(forward_outputs_position_map.keys()) for name, (_, pos) in forward_outputs_position_map.items(): output_autograd_meta_name = GetAutoGradMetaName(name) @@ -1002,19 +999,14 @@ def GenerateNodeCreationCodes(self, for_backward=False): set_grad_in_meta = ( f"{indent}grad_node->SetGradInMeta({name}, {pos});" ) - set_retain_grad = ( - f"{indent}egr::EagerUtils::CheckAndRetainGrad({name});" - ) set_out_rank_list.append(set_out_rank) set_history_list.append(set_history) set_grad_in_meta_list.append(set_grad_in_meta) - set_retain_grad_list.append(set_retain_grad) set_out_rank_str = "\n".join(set_out_rank_list) set_history_str = "\n".join(set_history_list) set_grad_in_meta_str = "\n".join(set_grad_in_meta_list) - set_retain_grad_str = "\n".join(set_retain_grad_list) node_event_name = forward_api_name + " node_creation" node_creation_event_str = f"{indent}paddle::platform::RecordEvent node_creation_record_event(\"{node_event_name}\", 
paddle::platform::TracerEventType::OperatorInner, 1);\n" @@ -1029,7 +1021,6 @@ def GenerateNodeCreationCodes(self, for_backward=False): set_out_rank_str, set_history_str, set_grad_in_meta_str, - set_retain_grad_str, set_output_tensor_wrappers_str, ) else: @@ -1043,7 +1034,6 @@ def GenerateNodeCreationCodes(self, for_backward=False): set_out_rank_str, set_history_str, set_grad_in_meta_str, - set_retain_grad_str, set_output_tensor_wrappers_str, ) ) diff --git a/paddle/fluid/eager/custom_operator/custom_operator_node.cc b/paddle/fluid/eager/custom_operator/custom_operator_node.cc index 57932ec4c1e69..f70b402b566dd 100644 --- a/paddle/fluid/eager/custom_operator/custom_operator_node.cc +++ b/paddle/fluid/eager/custom_operator/custom_operator_node.cc @@ -310,7 +310,6 @@ RunCustomOpNode::operator()( egr::EagerUtils::SetOutRankWithSlot(&(outs_auto_grad_metas[i]), i); egr::EagerUtils::SetHistory(&(outs_auto_grad_metas[i]), grad_node); grad_node->SetGradInMeta(out_tensors, i); - egr::EagerUtils::CheckAndRetainGrad(out_tensors); } // Prepare Grad inputs with fwd outputs diff --git a/paddle/fluid/eager/to_static/run_program_op_func.h b/paddle/fluid/eager/to_static/run_program_op_func.h index e58c9bd0c4e07..7305e79cd73fb 100644 --- a/paddle/fluid/eager/to_static/run_program_op_func.h +++ b/paddle/fluid/eager/to_static/run_program_op_func.h @@ -122,6 +122,5 @@ inline void run_program_ad_func( // Set History for output set current Grad Node for egr::EagerUtils::SetHistory(&p_autograd_outs, grad_node); - egr::EagerUtils::CheckAndRetainGrad(deref_out); } } diff --git a/paddle/fluid/eager/utils.cc b/paddle/fluid/eager/utils.cc index 777929bbc7536..4b992f5acaaa7 100644 --- a/paddle/fluid/eager/utils.cc +++ b/paddle/fluid/eager/utils.cc @@ -27,10 +27,6 @@ #include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/framework/variable.h" -PADDLE_DEFINE_EXPORTED_bool(retain_grad_for_all_tensor, - false, - "retain grad for all tensor"); - namespace egr { /** * Implementation of Eager Utils. @@ -409,35 +405,6 @@ std::vector EagerUtils::RecoverTensorWrapper( } return ret; } -// TODO(jiabin): remove all this when we fix all test using tmp grad -void EagerUtils::CheckAndRetainGrad( - const paddle::experimental::Tensor& tensor) { - VLOG(6) << "Check RetainGradForTensor: " << tensor.name(); - if (FLAGS_retain_grad_for_all_tensor) { - VLOG(6) << "RetainGradForTensor: " << tensor.name(); - egr::egr_utils_api::RetainGradForTensor(tensor); - } -} - -void EagerUtils::CheckAndRetainGrad( - const std::vector& tensors) { - if (FLAGS_retain_grad_for_all_tensor) { - for (auto& tensor : tensors) { - VLOG(6) << "RetainGradForTensor: " << tensor.name(); - egr::egr_utils_api::RetainGradForTensor(tensor); - } - } -} - -void EagerUtils::CheckAndRetainGrad( - const std::vector& tensors) { - if (FLAGS_retain_grad_for_all_tensor) { - for (auto& tensor : tensors) { - VLOG(6) << "RetainGradForTensor: " << tensor->name(); - egr::egr_utils_api::RetainGradForTensor(*tensor); - } - } -} std::shared_ptr EagerUtils::GetGradAccumulationNode( const paddle::experimental::Tensor& tensor) { diff --git a/paddle/fluid/eager/utils.h b/paddle/fluid/eager/utils.h index 339f7af80364b..a726528f53d05 100644 --- a/paddle/fluid/eager/utils.h +++ b/paddle/fluid/eager/utils.h @@ -223,14 +223,6 @@ class EagerUtils { const std::vector& out_var, std::vector* result); - // end Intermidate needed. 
- - static void CheckAndRetainGrad(const paddle::experimental::Tensor& tensor); - static void CheckAndRetainGrad( - const std::vector& tensors); - static void CheckAndRetainGrad( - const std::vector& tensors); - static std::shared_ptr GetGradAccumulationNode( const paddle::experimental::Tensor& tensor); diff --git a/paddle/fluid/pybind/eager_functions.cc b/paddle/fluid/pybind/eager_functions.cc index 2874c7b90f437..cc5a8d64e1234 100644 --- a/paddle/fluid/pybind/eager_functions.cc +++ b/paddle/fluid/pybind/eager_functions.cc @@ -575,7 +575,6 @@ static PyObject* eager_api_run_custom_op(PyObject* self, egr::EagerUtils::SetOutRankWithSlot(&(outs_auto_grad_metas[i]), i); egr::EagerUtils::SetHistory(&(outs_auto_grad_metas[i]), grad_node); grad_node->SetGradInMeta(out_tensors, i); - egr::EagerUtils::CheckAndRetainGrad(out_tensors); } // Prepare Grad inputs with fwd outputs diff --git a/paddle/fluid/pybind/eager_py_layer.cc b/paddle/fluid/pybind/eager_py_layer.cc index de3cf80cef6cc..89bad2bfc924d 100644 --- a/paddle/fluid/pybind/eager_py_layer.cc +++ b/paddle/fluid/pybind/eager_py_layer.cc @@ -432,12 +432,10 @@ PyObject* pylayer_method_apply(PyObject* cls, for (auto t : outputs_tensor[i]) { grad_node->SetGradInMeta(*t, i); } - egr::EagerUtils::CheckAndRetainGrad(outputs_tensor[i]); } else { egr::EagerUtils::SetOutRankWithSlot(outputs_autograd_meta[i][0], i); egr::EagerUtils::SetHistory(outputs_autograd_meta[i][0], grad_node); grad_node->SetGradInMeta(*outputs_tensor[i][0], i); - egr::EagerUtils::CheckAndRetainGrad(*outputs_tensor[i][0]); } } VLOG(6) << "PyLayer construct backward node finish..."; diff --git a/python/paddle/fluid/tests/unittests/test_flip.py b/python/paddle/fluid/tests/unittests/test_flip.py index 1807199821eb7..4f095493f007b 100644 --- a/python/paddle/fluid/tests/unittests/test_flip.py +++ b/python/paddle/fluid/tests/unittests/test_flip.py @@ -152,7 +152,6 @@ def func(self, place): gradient_checker.double_grad_check( [data], out, x_init=[data_arr], place=place, eps=eps ) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) gradient_checker.double_grad_check_for_dygraph( self.flip_wrapper, [data], out, x_init=[data_arr], place=place ) @@ -184,7 +183,6 @@ def func(self, place): gradient_checker.triple_grad_check( [data], out, x_init=[data_arr], place=place, eps=eps ) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) gradient_checker.triple_grad_check_for_dygraph( self.flip_wrapper, [data], out, x_init=[data_arr], place=place ) diff --git a/python/paddle/fluid/tests/unittests/xpu/test_zero_dim_tensor_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_zero_dim_tensor_xpu.py index 43b44a07dcbb8..c0597d0ad53ea 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_zero_dim_tensor_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_zero_dim_tensor_xpu.py @@ -20,7 +20,6 @@ import paddle.nn.functional as F paddle.set_device('xpu') -paddle.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) unary_api_list = [ paddle.nn.functional.elu, @@ -102,6 +101,7 @@ def test_dygraph_unary(self): x = paddle.rand([]) x.stop_gradient = False out = api(x) + out.retain_grads() out.backward() self.assertEqual(x.shape, []) @@ -147,6 +147,7 @@ def test_dygraph_reduce(self): x = paddle.rand([]) x.stop_gradient = False out = api(x, None) + out.retain_grads() out.backward() @@ -201,12 +202,15 @@ def test_dygraph_binary(self): y = paddle.rand([]) x.stop_gradient = False y.stop_gradient = False + x.retain_grads() + y.retain_grads() if isinstance(api, dict): out = api['func'](x, 
y) out_cls = getattr(paddle.Tensor, api['cls_method'])(x, y) np.testing.assert_array_equal(out_cls.numpy(), out.numpy()) else: out = api(x, y) + out.retain_grads() out.backward() self.assertEqual(x.shape, []) @@ -228,6 +232,7 @@ def test_dygraph_binary(self): np.testing.assert_array_equal(out_cls.numpy(), out.numpy()) else: out = api(x, y) + out.retain_grads() out.backward() self.assertEqual(x.shape, [2, 3, 4]) @@ -243,12 +248,15 @@ def test_dygraph_binary(self): y = paddle.rand([2, 3, 4]) x.stop_gradient = False y.stop_gradient = False + x.retain_grads() + y.retain_grads() if isinstance(api, dict): out = api['func'](x, y) out_cls = getattr(paddle.Tensor, api['cls_method'])(x, y) np.testing.assert_array_equal(out_cls.numpy(), out.numpy()) else: out = api(x, y) + out.retain_grads() out.backward() self.assertEqual(x.shape, []) @@ -265,6 +273,7 @@ def test_dygraph_binary(self): y = 0.5 if isinstance(api, dict): out = getattr(paddle.Tensor, api['cls_method'])(x, y) + out.retain_grads() out.backward() self.assertEqual(x.shape, []) @@ -381,7 +390,9 @@ def test_shape(self): def test_pow_factor(self): x = paddle.rand([]) x.stop_gradient = False + x.retain_grads() out = paddle.pow(x, 2.0) + out.retain_grads() out.backward() self.assertEqual(out.shape, []) @@ -391,7 +402,9 @@ def test_pow_factor(self): def test_cast(self): x = paddle.full([], 1.0, 'float32') x.stop_gradient = False + x.retain_grads() out = paddle.cast(x, 'int32') + out.retain_grads() out.backward() self.assertEqual(out.shape, []) @@ -401,7 +414,9 @@ def test_cast(self): def test_clip(self): x = paddle.uniform([], None, -10, 10) x.stop_gradient = False + x.retain_grads() out = paddle.clip(x, -5, 5) + out.retain_grads() out.backward() self.assertEqual(out.shape, []) @@ -446,6 +461,7 @@ def test_transpose(self): x = paddle.rand([]) x.stop_gradient = False out = paddle.transpose(x, []) + out.retain_grads() out.backward() self.assertEqual(out.shape, []) @@ -461,6 +477,7 @@ def test_moveaxis(self): x = paddle.rand([]) x.stop_gradient = False out = paddle.moveaxis(x, [], []) + out.retain_grads() out.backward() self.assertEqual(out.shape, []) @@ -476,6 +493,7 @@ def test_gather_1D(self): x = paddle.to_tensor([1.0, 3.0, 5.0, 7.0, 9.0], stop_gradient=False) index = paddle.full([], 2, 'int64') out = paddle.gather(x, index) + out.retain_grads() out.backward() self.assertEqual(out.shape, []) @@ -489,6 +507,7 @@ def test_gather_xD_axis_0(self): ) index = paddle.full([], 1, 'int64') out = paddle.gather(x, index) + out.retain_grads() out.backward() self.assertEqual(out.shape, [3]) @@ -541,10 +560,18 @@ def test_diagflat(self): x2.stop_gradient = False x3.stop_gradient = False + x1.retain_grads() + x2.retain_grads() + x3.retain_grads() + out1 = paddle.diagflat(x1, 1) out2 = paddle.diagflat(x2, -1) out3 = paddle.diagflat(x3, 0) + out1.retain_grads() + out2.retain_grads() + out3.retain_grads() + out1.backward() out2.backward() out3.backward() @@ -592,7 +619,9 @@ def test_flatten(self): def test_scale(self): x = paddle.rand([]) x.stop_gradient = False + x.retain_grads() out = paddle.scale(x, scale=2.0, bias=1.0) + out.retain_grads() out.backward() self.assertEqual(out.shape, []) @@ -674,24 +703,28 @@ def test_reshape_list(self): x.stop_gradient = False out = paddle.reshape(x, []) + out.retain_grads() out.backward() self.assertEqual(x.grad.shape, []) self.assertEqual(out.shape, []) self.assertEqual(out.grad.shape, []) out = paddle.reshape(x, [1]) + out.retain_grads() out.backward() self.assertEqual(x.grad.shape, []) self.assertEqual(out.shape, [1]) 
self.assertEqual(out.grad.shape, [1]) out = paddle.reshape(x, [-1]) + out.retain_grads() out.backward() self.assertEqual(x.grad.shape, []) self.assertEqual(out.shape, [1]) self.assertEqual(out.grad.shape, [1]) out = paddle.reshape(x, [-1, 1]) + out.retain_grads() out.backward() self.assertEqual(x.grad.shape, []) self.assertEqual(out.shape, [1, 1]) @@ -702,6 +735,7 @@ def test_reshape_tensor(self): x.stop_gradient = False out = paddle.reshape(x, []) + out.retain_grads() out.backward() self.assertEqual(x.grad.shape, [1, 1]) self.assertEqual(out.shape, []) @@ -709,6 +743,7 @@ def test_reshape_tensor(self): new_shape = paddle.to_tensor([1, 1, 1], "int32") out = paddle.reshape(x, new_shape) + out.retain_grads() out.backward() self.assertEqual(x.grad.shape, [1, 1]) self.assertEqual(out.shape, [1, 1, 1]) @@ -716,6 +751,7 @@ def test_reshape_tensor(self): new_shape = paddle.to_tensor([-1], "int32") out = paddle.reshape(x, new_shape) + out.retain_grads() out.backward() self.assertEqual(x.grad.shape, [1, 1]) self.assertEqual(out.shape, [1]) @@ -723,6 +759,7 @@ def test_reshape_tensor(self): new_shape = [paddle.full([], -1, "int32"), paddle.full([], 1, "int32")] out = paddle.reshape(x, new_shape) + out.retain_grads() out.backward() self.assertEqual(x.grad.shape, [1, 1]) self.assertEqual(out.shape, [1, 1]) @@ -765,9 +802,15 @@ def test_sort(self): x1.stop_gradient = False x2.stop_gradient = False + x1.retain_grads() + x2.retain_grads() + out1 = paddle.sort(x1, axis=-1) out2 = paddle.sort(x2, axis=0) + out1.retain_grads() + out2.retain_grads() + out1.backward() out2.backward() @@ -787,10 +830,15 @@ def test_argsort(self): x2 = paddle.rand([]) x1.stop_gradient = False x2.stop_gradient = False + x1.retain_grads() + x2.retain_grads() out1 = paddle.argsort(x1, axis=-1) out2 = paddle.argsort(x2, axis=0) + out1.retain_grads() + out2.retain_grads() + out1.backward() out2.backward() From db83b53ae14f44654e74350f35a34a35746c1557 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=A7=9C=E6=B0=B8=E4=B9=85?= <34344716+yjjiang11@users.noreply.github.com> Date: Tue, 31 Jan 2023 10:12:14 +0800 Subject: [PATCH 02/89] update erf gumbel_softmax ..ops (#50077) * update erf gumbel_softmax ..ops * lint * reset sequence_conv * reset exponetial&interp nearest --- python/paddle/fluid/tests/unittests/test_erf_op.py | 3 ++- .../fluid/tests/unittests/test_expand_v2_op.py | 11 ++++++++--- .../tests/unittests/test_gumbel_softmax_op.py | 6 +++++- .../fluid/tests/unittests/test_is_empty_op.py | 4 +++- .../fluid/tests/unittests/test_multiplex_op.py | 3 ++- .../tests/unittests/test_transfer_layout_op.py | 7 ++++++- .../fluid/tests/unittests/test_transpose_op.py | 14 ++++++++------ 7 files changed, 34 insertions(+), 14 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_erf_op.py b/python/paddle/fluid/tests/unittests/test_erf_op.py index 06b7f55069fb9..db5c48151c505 100644 --- a/python/paddle/fluid/tests/unittests/test_erf_op.py +++ b/python/paddle/fluid/tests/unittests/test_erf_op.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from op_test import OpTest +from eager_op_test import OpTest from scipy.special import erf import paddle @@ -26,6 +26,7 @@ class TestErfOp(OpTest): def setUp(self): self.op_type = "erf" + self.python_api = paddle.erf self.dtype = self._init_dtype() self.x_shape = [11, 17] x = np.random.uniform(-1, 1, size=self.x_shape).astype(self.dtype) diff --git a/python/paddle/fluid/tests/unittests/test_expand_v2_op.py b/python/paddle/fluid/tests/unittests/test_expand_v2_op.py index 
0a5eda417e95c..0565be630a942 100644 --- a/python/paddle/fluid/tests/unittests/test_expand_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_expand_v2_op.py @@ -17,7 +17,7 @@ import gradient_checker import numpy as np from decorator_helper import prog_scope -from op_test import OpTest +from eager_op_test import OpTest import paddle import paddle.fluid as fluid @@ -42,10 +42,10 @@ def init_data(self): self.expand_times = [1] def test_check_output(self): - self.check_output(check_eager=True) + self.check_output() def test_check_grad(self): - self.check_grad(['X'], 'Out', check_eager=True) + self.check_grad(['X'], 'Out') class TestExpandV2OpRank2_DimExpanding(TestExpandV2OpRank1): @@ -80,6 +80,7 @@ def init_data(self): class TestExpandV2OpRank1_tensor_attr(OpTest): def setUp(self): self.op_type = "expand_v2" + self.python_api = paddle.expand self.init_data() expand_shapes_tensor = [] for index, ele in enumerate(self.expand_shape): @@ -120,6 +121,7 @@ def init_data(self): class TestExpandV2OpRank1_tensor(OpTest): def setUp(self): self.op_type = "expand_v2" + self.python_api = paddle.expand self.init_data() self.inputs = { @@ -146,6 +148,7 @@ def test_check_grad(self): class TestExpandV2OpInteger(OpTest): def setUp(self): self.op_type = "expand_v2" + self.python_api = paddle.expand self.inputs = { 'X': np.random.randint(10, size=(2, 4, 5)).astype("int32") } @@ -161,6 +164,7 @@ def test_check_output(self): class TestExpandV2OpBoolean(OpTest): def setUp(self): self.op_type = "expand_v2" + self.python_api = paddle.expand self.inputs = {'X': np.random.randint(2, size=(2, 4, 5)).astype("bool")} self.attrs = {'shape': [2, 4, 5]} output = np.tile(self.inputs['X'], (1, 1, 1)) @@ -174,6 +178,7 @@ def test_check_output(self): class TestExpandV2OpInt64_t(OpTest): def setUp(self): self.op_type = "expand_v2" + self.python_api = paddle.expand self.inputs = { 'X': np.random.randint(10, size=(2, 4, 5)).astype("int64") } diff --git a/python/paddle/fluid/tests/unittests/test_gumbel_softmax_op.py b/python/paddle/fluid/tests/unittests/test_gumbel_softmax_op.py index 60b4cd5783347..30e4d7943ff24 100644 --- a/python/paddle/fluid/tests/unittests/test_gumbel_softmax_op.py +++ b/python/paddle/fluid/tests/unittests/test_gumbel_softmax_op.py @@ -13,10 +13,11 @@ import unittest import numpy as np -from op_test import OpTest +from eager_op_test import OpTest import paddle import paddle.fluid as fluid +import paddle.nn.functional as F paddle.enable_static() @@ -36,6 +37,7 @@ def verify_output(self, outs): def setUp(self): self.op_type = "gumbel_softmax" + self.python_api = F.gumbel_softmax self.init_attrs() np.random.seed(0) x = np.random.uniform(0.1, 1, self.shape).astype(self.dtype) @@ -53,6 +55,7 @@ def test_check_grad(self): class TestGumbelSoftmax_ZeroDim(OpTest): def setUp(self): self.op_type = "gumbel_softmax" + self.python_api = F.gumbel_softmax self.dtype = "float64" x = np.random.uniform(0.1, 1, []).astype(self.dtype) out = np.array(1.0).astype(self.dtype) @@ -123,6 +126,7 @@ def accumulate_output(self, outs): def setUp(self): self.op_type = "gumbel_softmax" + self.python_api = F.gumbel_softmax self.init_attrs() single_x = np.array([0.2, 0.3, 0.5]) batch_x = np.ones(self.shape) * single_x diff --git a/python/paddle/fluid/tests/unittests/test_is_empty_op.py b/python/paddle/fluid/tests/unittests/test_is_empty_op.py index 6cf410eaede84..f771c33cb67e6 100644 --- a/python/paddle/fluid/tests/unittests/test_is_empty_op.py +++ b/python/paddle/fluid/tests/unittests/test_is_empty_op.py @@ -15,7 +15,7 @@ import unittest 
import numpy as np -from op_test import OpTest +from eager_op_test import OpTest import paddle @@ -23,6 +23,7 @@ class TestEmpty(OpTest): def setUp(self): self.op_type = "is_empty" + self.python_api = paddle.is_empty self.inputs = {'X': np.array([1, 2, 3])} self.outputs = {'Out': np.array([False])} @@ -33,6 +34,7 @@ def test_check_output(self): class TestNotEmpty(TestEmpty): def setUp(self): self.op_type = "is_empty" + self.python_api = paddle.is_empty self.inputs = {'X': np.array([])} self.outputs = {'Out': np.array([True])} diff --git a/python/paddle/fluid/tests/unittests/test_multiplex_op.py b/python/paddle/fluid/tests/unittests/test_multiplex_op.py index 563a9fdb34b34..a0f8932ba23ab 100644 --- a/python/paddle/fluid/tests/unittests/test_multiplex_op.py +++ b/python/paddle/fluid/tests/unittests/test_multiplex_op.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from op_test import OpTest +from eager_op_test import OpTest import paddle import paddle.fluid as fluid @@ -24,6 +24,7 @@ class TestMultiplexOp(OpTest): def setUp(self): self.op_type = "multiplex" + self.python_api = paddle.tensor.multiplex rows = 4 index = np.arange(0, rows).astype('int32') np.random.shuffle(index) diff --git a/python/paddle/fluid/tests/unittests/test_transfer_layout_op.py b/python/paddle/fluid/tests/unittests/test_transfer_layout_op.py index 6c26a70694ac2..fd65ddfa48527 100644 --- a/python/paddle/fluid/tests/unittests/test_transfer_layout_op.py +++ b/python/paddle/fluid/tests/unittests/test_transfer_layout_op.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from op_test import OpTest +from eager_op_test import OpTest import paddle import paddle.fluid as fluid @@ -24,6 +24,10 @@ from paddle.fluid.layer_helper import LayerHelper +def transpose_layout(x, src_layout, dst_layout): + return x.transpose([0, 2, 3, 1]) + + # default kNCHW class TestTransferLayoutOpkNCHWTokNHWC(OpTest): def setUp(self): @@ -31,6 +35,7 @@ def setUp(self): self.inputs = {'X': ipt.astype('float32')} self.outputs = {'Out': ipt.transpose([0, 2, 3, 1])} self.attrs = {'src_layout': 0, 'dst_layout': 1} # kNHWC + self.python_api = transpose_layout self.op_type = 'transfer_layout' def test_check_output(self): diff --git a/python/paddle/fluid/tests/unittests/test_transpose_op.py b/python/paddle/fluid/tests/unittests/test_transpose_op.py index a2f922dcd8db4..a05b2a5c0a30f 100644 --- a/python/paddle/fluid/tests/unittests/test_transpose_op.py +++ b/python/paddle/fluid/tests/unittests/test_transpose_op.py @@ -17,12 +17,12 @@ import gradient_checker import numpy as np from decorator_helper import prog_scope +from eager_op_test import OpTest, convert_float_to_uint16 import paddle import paddle.fluid as fluid import paddle.fluid.core as core from paddle.fluid import Program, program_guard -from paddle.fluid.tests.unittests.op_test import OpTest, convert_float_to_uint16 paddle.enable_static() @@ -47,10 +47,10 @@ def init_op_type(self): self.use_mkldnn = False def test_check_output(self): - self.check_output(no_check_set=['XShape'], check_eager=True) + self.check_output(no_check_set=['XShape']) def test_check_grad(self): - self.check_grad(['X'], 'Out', check_eager=True) + self.check_grad(['X'], 'Out') def initTestCase(self): self.shape = (3, 40) @@ -150,11 +150,11 @@ def init_op_type(self): self.use_mkldnn = False def test_check_output(self): - self.check_output(no_check_set=['XShape'], check_eager=True) + self.check_output(no_check_set=['XShape']) fluid.core.disable_autotune() def test_check_grad(self): - self.check_grad(['X'], 'Out', 
check_eager=True) + self.check_grad(['X'], 'Out') class TestTransposeBF16Op(OpTest): @@ -162,6 +162,7 @@ def setUp(self): self.init_op_type() self.initTestCase() self.dtype = np.uint16 + self.python_api = paddle.transpose x = np.random.random(self.shape).astype("float32") self.inputs = {'X': convert_float_to_uint16(x)} @@ -580,7 +581,8 @@ def test_dygraph(self): x = paddle.rand([]) x.stop_gradient = False out = paddle.transpose(x, []) - out.retain_grads() + if hasattr(out, 'retain_grads'): + out.retain_grads() out.backward() self.assertEqual(out.shape, []) From a8078bbd7e00733cd8bfd7b553c78288f3469a26 Mon Sep 17 00:00:00 2001 From: LiYuRio <63526175+LiYuRio@users.noreply.github.com> Date: Tue, 31 Jan 2023 10:31:06 +0800 Subject: [PATCH 03/89] add multi fetch (#50070) --- .../distributed/fleet_executor/carrier.cc | 25 ++++-- .../distributed/fleet_executor/carrier.h | 4 +- .../fleet_executor/fleet_executor.cc | 14 +++- .../fleet_executor/fleet_executor.h | 7 +- python/paddle/fluid/executor.py | 24 ++++++ .../test_fleet_executor_cond_interceptor.py | 79 ++++++++++++++----- 6 files changed, 120 insertions(+), 33 deletions(-) diff --git a/paddle/fluid/distributed/fleet_executor/carrier.cc b/paddle/fluid/distributed/fleet_executor/carrier.cc index 094afff577a9e..2b75c3ba066ec 100644 --- a/paddle/fluid/distributed/fleet_executor/carrier.cc +++ b/paddle/fluid/distributed/fleet_executor/carrier.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/distributed/fleet_executor/carrier.h" #include +#include #include "paddle/fluid/distributed/fleet_executor/global.h" #include "paddle/fluid/distributed/fleet_executor/interceptor.h" @@ -24,6 +25,7 @@ #include "paddle/fluid/framework/garbage_collector.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/variable.h" #include "paddle/fluid/framework/variable_helper.h" namespace paddle { @@ -55,23 +57,34 @@ void Carrier::Init( framework::Scope* scope, int64_t num_micro_batches, const platform::Place& place, - const std::vector& inference_root_scope_vars) { + const std::vector& inference_root_scope_vars, + const std::vector& micro_scope_list) { rank_ = rank; interceptor_id_to_rank_ = interceptor_id_to_rank; interceptor_id_to_node_ = interceptor_id_to_node; place_ = place; root_scope_ = scope; dev_ctx_ = platform::DeviceContextPool::Instance().Get(place_); + bool need_create_scope = micro_scope_list.empty(); PADDLE_ENFORCE_NOT_NULL( root_scope_, platform::errors::InvalidArgument("root_scope can not be nullptr")); - minibatch_scope_ = &root_scope_->NewScope(); - microbatch_scopes_.resize(num_micro_batches); - for (int i = 0; i < num_micro_batches; ++i) { - microbatch_scopes_[i] = &minibatch_scope_->NewScope(); - CopyParameters(i, program, inference_root_scope_vars); + + if (need_create_scope) { + minibatch_scope_ = &root_scope_->NewScope(); + microbatch_scopes_.resize(num_micro_batches); + for (int i = 0; i < num_micro_batches; ++i) { + microbatch_scopes_[i] = &minibatch_scope_->NewScope(); + CopyParameters(i, program, inference_root_scope_vars); + } + } else { + microbatch_scopes_ = micro_scope_list; + for (int i = 0; i < num_micro_batches; ++i) { + CopyParameters(i, program, inference_root_scope_vars); + } } + // Add source and sink interceptor id to rank interceptor_id_to_rank_.emplace(SOURCE_ID, rank); interceptor_id_to_rank_.emplace(SINK_ID, rank); diff --git a/paddle/fluid/distributed/fleet_executor/carrier.h b/paddle/fluid/distributed/fleet_executor/carrier.h index 
2523942e06223..8e7fad3e892d8 100644 --- a/paddle/fluid/distributed/fleet_executor/carrier.h +++ b/paddle/fluid/distributed/fleet_executor/carrier.h @@ -25,6 +25,7 @@ #include "paddle/fluid/distributed/fleet_executor/interceptor.h" #include "paddle/fluid/distributed/fleet_executor/interceptor_message.pb.h" #include "paddle/fluid/distributed/fleet_executor/task_loop_thread_pool.h" +#include "paddle/fluid/framework/variable.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/errors.h" @@ -60,7 +61,8 @@ class Carrier final { framework::Scope* scope, int64_t num_micro_batches, const platform::Place& place, - const std::vector& inference_root_scope_vars = {}); + const std::vector& inference_root_scope_vars = {}, + const std::vector& micro_scope_list = {}); void CopyParameters( int microbatch_id, diff --git a/paddle/fluid/distributed/fleet_executor/fleet_executor.cc b/paddle/fluid/distributed/fleet_executor/fleet_executor.cc index 1f397a91746b9..88363696ede25 100644 --- a/paddle/fluid/distributed/fleet_executor/fleet_executor.cc +++ b/paddle/fluid/distributed/fleet_executor/fleet_executor.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/distributed/fleet_executor/fleet_executor.h" #include +#include #include "paddle/fluid/distributed/fleet_executor/global.h" #include "paddle/fluid/distributed/fleet_executor/message_bus.h" @@ -24,6 +25,7 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/variable.h" namespace paddle { namespace distributed { @@ -59,7 +61,8 @@ void FleetExecutor::Init( int64_t num_micro_batches, const std::vector& task_nodes, const std::unordered_map& task_id_to_rank, - const std::vector& inference_root_scope_vars) { + const std::vector& inference_root_scope_vars, + const std::vector& micro_scope_list) { PADDLE_ENFORCE_GT(task_nodes.size(), 0, platform::errors::InvalidArgument( @@ -144,7 +147,8 @@ void FleetExecutor::Init( place, num_micro_batches, program_desc, - inference_root_scope_vars); + inference_root_scope_vars, + micro_scope_list); GlobalVal::Get()->Barrier(); } @@ -154,7 +158,8 @@ void FleetExecutor::InitCarrier( const platform::Place& place, int64_t num_micro_batches, const framework::ProgramDesc& program_desc, - const std::vector& inference_root_scope_vars) { + const std::vector& inference_root_scope_vars, + const std::vector& micro_scope_list) { carrier->Init(exe_desc_.cur_rank(), runtime_graph_->interceptor_id_to_rank(), runtime_graph_->interceptor_id_to_node(), @@ -162,7 +167,8 @@ void FleetExecutor::InitCarrier( scope, num_micro_batches, place, - inference_root_scope_vars); + inference_root_scope_vars, + micro_scope_list); } void FleetExecutor::InitMessageBus() { diff --git a/paddle/fluid/distributed/fleet_executor/fleet_executor.h b/paddle/fluid/distributed/fleet_executor/fleet_executor.h index f633dbbc3600f..e8123bea1e19f 100644 --- a/paddle/fluid/distributed/fleet_executor/fleet_executor.h +++ b/paddle/fluid/distributed/fleet_executor/fleet_executor.h @@ -18,6 +18,7 @@ #include "paddle/fluid/distributed/fleet_executor/carrier.h" #include "paddle/fluid/distributed/fleet_executor/fleet_executor_desc.pb.h" +#include "paddle/fluid/framework/variable.h" #include "paddle/fluid/platform/macros.h" #include "paddle/fluid/platform/place.h" @@ -45,7 +46,8 @@ class FleetExecutor final { int64_t num_micro_batches, const std::vector& task_nodes, const std::unordered_map& 
task_id_to_rank, - const std::vector& inference_root_scope_vars = {}); + const std::vector& inference_root_scope_vars = {}, + const std::vector& micro_scope_list = {}); void Run(const std::string& carrier_id); private: @@ -57,7 +59,8 @@ class FleetExecutor final { const platform::Place& place, int64_t num_micro_batches, const framework::ProgramDesc& program_desc, - const std::vector& inference_root_scope_vars = {}); + const std::vector& inference_root_scope_vars = {}, + const std::vector& micro_scope_list = {}); FleetExecutorDesc exe_desc_; std::shared_ptr runtime_graph_; std::unordered_set carrier_ids_; diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index 2822a87a02172..da9d12802434f 100755 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -2464,6 +2464,7 @@ def _prepare_fleet_executor_carrier( program=None, scope=None, fleet_opt=None, + micro_scope_list=[], with_standalone_executor=False, ): num_micro_batches = ( @@ -2532,6 +2533,7 @@ def _prepare_fleet_executor_carrier( fleet_opt['task_id_to_rank'] = task_id_to_rank place = core.Place() place.set_place(self.place) + # NOTE: the last argument is used to force create some vars in root scope, # won't be used during train. self._fleet_executor.init( @@ -2543,6 +2545,7 @@ def _prepare_fleet_executor_carrier( tasks, task_id_to_rank, [], + micro_scope_list, ) def _run_using_fleet_executor( @@ -2624,11 +2627,20 @@ def _run_using_fleet_executor( ) fetch_task.set_program(fetch_program) + micro_scope_list = [] + if ( + "inference_generation" in fleet_opt + and fleet_opt["inference_generation"] + ): + for i in range(int(fleet_opt["num_micro_batches"])): + micro_scope_list.append(cached_scope.new_scope()) + self._prepare_fleet_executor_carrier( cache_key, program=cached_program, scope=cached_scope, fleet_opt=fleet_opt, + micro_scope_list=micro_scope_list, with_standalone_executor=with_standalone_executor, ) @@ -2653,6 +2665,18 @@ def _run_using_fleet_executor( self._fleet_executor.run(cache_key) + if "fetch_var" in fleet_opt: + # If we speed up the generation in evaluation, we need to generate + # multiple queries at the same time. Each query will in separate scope in order + # not mix up. It indicate that final result will in multiple scopes and need to + # fetch each. 
+ result_list = [] + for scope in micro_scope_list: + for var in fleet_opt["fetch_var"]: + tensor = core.get_variable_tensor(scope, var) + result_list.append(as_numpy(tensor)) + return result_list + if fetch_list: arr = cached_scope.find_var(fetch_var_name).get_fetch_list() tensors = arr._move_to_list() diff --git a/python/paddle/fluid/tests/unittests/test_fleet_executor_cond_interceptor.py b/python/paddle/fluid/tests/unittests/test_fleet_executor_cond_interceptor.py index d3a57898a0dce..1ca8c869a96bd 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_executor_cond_interceptor.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_executor_cond_interceptor.py @@ -14,6 +14,8 @@ import unittest +import numpy as np + import paddle import paddle.fluid.core as core from paddle.distributed.fleet.fleet_executor_utils import TaskNode @@ -21,13 +23,26 @@ paddle.enable_static() -def cond(i, ten): +def cond(i, ten, data): return i < ten -def body(i, ten): +def body(i, ten, data): i = i + 1 - return [i, ten] + data = data + 1 + return [i, ten, data] + + +num_micro_batches = 3 + + +def batch_generator_creator(): + def __reader__(): + for i in range(num_micro_batches): + data = np.full(shape=[1, 1], fill_value=i, dtype=np.float32) + yield data + + return __reader__ class TestFleetExecutor(unittest.TestCase): @@ -41,7 +56,16 @@ def test_cond_interceptor(self): ten = paddle.full( shape=[1], fill_value=10, dtype='int64' ) # loop length - i, ten = paddle.static.nn.while_loop(cond, body, [i, ten]) + data = paddle.static.data(name='x', shape=[1]) + + loader = paddle.fluid.io.DataLoader.from_generator( + feed_list=[data], capacity=num_micro_batches * 4, iterable=False + ) + loader.set_batch_generator( + batch_generator_creator(), paddle.CUDAPlace(0) + ) + + paddle.static.nn.while_loop(cond, body, [i, ten, data]) program_a = paddle.static.Program() program_b = paddle.static.Program() @@ -49,18 +73,27 @@ def test_cond_interceptor(self): for var_name in main_program.block(0).vars: if var_name != "_generated_var_0": var = main_program.block(0).var(var_name) - program_a.block(0).create_var( - name=var_name, - shape=var.shape, - dtype=var.dtype, - stop_gradient=var.stop_gradient, - ) - program_b.block(0).create_var( - name=var_name, - shape=var.shape, - dtype=var.dtype, - stop_gradient=var.stop_gradient, - ) + if ( + var_name == "create_py_reader_0" + or var_name == "double_buffer_0" + ): + program_a.block(0).create_var( + name=var_name, + persistable=var.persistable, + ) + else: + program_a.block(0).create_var( + name=var_name, + shape=var.shape, + dtype=var.dtype, + stop_gradient=var.stop_gradient, + ) + program_b.block(0).create_var( + name=var_name, + shape=var.shape, + dtype=var.dtype, + stop_gradient=var.stop_gradient, + ) for op in main_program.block(0).ops: if op.type != "while": @@ -89,7 +122,6 @@ def test_cond_interceptor(self): ) cond_var_name = "tmp_0" - num_micro_batches = 3 task_a = TaskNode( 0, @@ -159,12 +191,19 @@ def test_cond_interceptor(self): task_e.task_id(): 0, }, 'num_micro_batches': num_micro_batches, + 'inference_generation': True, + 'fetch_var': ['x'], }, } - place = paddle.fluid.CUDAPlace(0) - exe = paddle.fluid.Executor(place) - exe.run(main_program) + place = paddle.CUDAPlace(0) + exe = paddle.static.Executor(place) + loader.start() + res = exe.run(main_program) + ref_res = np.full([1], 10, dtype="float32") + for data in res: + np.testing.assert_allclose(data, ref_res, rtol=1e-05) + ref_res = ref_res + 1 if __name__ == "__main__": From a34d85d9be073648c4e34ff8aba507203d09c2ee 
Mon Sep 17 00:00:00 2001 From: mjxs <52824616+kk-2000@users.noreply.github.com> Date: Tue, 31 Jan 2023 10:31:30 +0800 Subject: [PATCH 04/89] np.unicode_ => np.str_ (#49975) --- .../fluid/tests/unittests/test_egr_string_tensor_api.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_egr_string_tensor_api.py b/python/paddle/fluid/tests/unittests/test_egr_string_tensor_api.py index 3032dc5810dd6..0c2ad7517edda 100644 --- a/python/paddle/fluid/tests/unittests/test_egr_string_tensor_api.py +++ b/python/paddle/fluid/tests/unittests/test_egr_string_tensor_api.py @@ -41,7 +41,7 @@ def test_constructor_with_args(self): self.assertEqual(ST2.name, "ST2") self.assertEqual(ST2.shape, shape) np.testing.assert_array_equal( - ST2.numpy(), np.empty(shape, dtype=np.unicode_) + ST2.numpy(), np.empty(shape, dtype=np.str_) ) ST3 = core.eager.StringTensor(self.str_arr, "ST3") # constructor 3 @@ -74,7 +74,7 @@ def test_constructor_with_kwargs(self): self.assertEqual(ST1.name, "ST1") self.assertEqual(ST1.shape, shape) np.testing.assert_array_equal( - ST1.numpy(), np.empty(shape, dtype=np.unicode_) + ST1.numpy(), np.empty(shape, dtype=np.str_) ) ST2 = core.eager.StringTensor(self.str_arr, name="ST2") # constructor 3 From e7deae2129583d5eb19d975a07904a6bfce7026f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=98=A5=E4=B9=94?= <83450930+Liyulingyue@users.noreply.github.com> Date: Tue, 31 Jan 2023 10:33:39 +0800 Subject: [PATCH 05/89] modify np.int with np.int64 (#49967) * change int -> int64 * Update python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_one_hot.py --- .../tests/unittests/ir/inference/test_trt_convert_one_hot.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_one_hot.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_one_hot.py index 53574a3fd27dc..30446265a431d 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_one_hot.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_one_hot.py @@ -59,7 +59,7 @@ def generate_depth(dims, batch): }, "op_outputs": {"Out": ["output_data"]}, "op_attrs": dics[0], - "outputs_dtype": {"output_data": np.int64}, + "outputs_dtype": {"output_data": np.int_}, }, ] ops = self.generate_op_config(ops_config) From 3586e856c581f8e1ee1d924152a037357e3ccfb8 Mon Sep 17 00:00:00 2001 From: Yiqun Liu Date: Tue, 31 Jan 2023 10:35:17 +0800 Subject: [PATCH 06/89] Unify the gpu implementation of stack and unstack to reuse the optimization. (#49748) * Unify the gpu implementation of stack and unstack to reuse the optimization. * Optimize the cuda implementation of unstack. * Use GpuMemcpyAsync instead of memory::Copy. * Fix error of calculating the index. * Use FastDivMod to further imporve the performance of unstack. 
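The speedup described above comes from replacing per-element integer division with a precomputed fast division: the divisor (for example the per-input column count in the stack kernel, or the output column count in unstack) is fixed for a whole launch, so the GeneralDivMod/FastDivMod helpers from paddle/phi/kernels/funcs/fast_divmod.h compute a multiplier and shift once and each thread then splits its flat index into (input index, column offset) without a hardware divide. The sketch below is a simplified, host-only illustration of that trick, not the Paddle device implementation; FastDivModSketch and the sizes in main() are invented for the demo, and the demo verifies itself against plain / and %.

#include <cassert>
#include <cstdint>
#include <cstdio>

// Host-side sketch of the multiply-and-shift division trick. The divisor is
// known once per kernel launch, so the multiplier/shift pair is computed once
// and every element index is split with a multiply-high, an add and a shift
// instead of a per-element `/` and `%`.
struct FastDivModSketch {
  explicit FastDivModSketch(uint32_t d) : divisor(d) {
    assert(d >= 1);
    uint64_t one = 1;
    for (shift = 0; (one << shift) < d; ++shift) {
    }
    multiplier =
        static_cast<uint32_t>((one << 32) * ((one << shift) - d) / d + 1);
  }

  uint32_t Div(uint32_t n) const {
    // Done in 64 bits here for clarity; a device version would typically use
    // __umulhi and keep everything in 32-bit registers.
    uint64_t hi = (static_cast<uint64_t>(n) * multiplier) >> 32;
    return static_cast<uint32_t>((hi + n) >> shift);
  }

  void DivMod(uint32_t n, uint32_t* q, uint32_t* r) const {
    *q = Div(n);
    *r = n - *q * divisor;
  }

  uint32_t divisor;
  uint32_t multiplier;
  uint32_t shift;
};

int main() {
  // The stack kernel lays num_splits inputs of split_size columns side by
  // side, so each thread must turn a flat output column index into
  // (which input, which column inside it). Check the fast path against the
  // plain operators for every flat index. The sizes are arbitrary demo values.
  const uint32_t split_size = 7;
  const uint32_t num_splits = 5;
  FastDivModSketch divmod(split_size);
  for (uint32_t grid_x = 0; grid_x < split_size * num_splits; ++grid_x) {
    uint32_t split = 0;
    uint32_t col_offset = 0;
    divmod.DivMod(grid_x, &split, &col_offset);
    assert(split == grid_x / split_size);
    assert(col_offset == grid_x % split_size);
  }
  std::printf("fast divmod matches / and %% for all flat column indices\n");
  return 0;
}
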
--- paddle/phi/kernels/funcs/segmented_array.h | 13 +- paddle/phi/kernels/funcs/stack_and_unstack.h | 276 ++++++++++++++++++ paddle/phi/kernels/gpu/stack_grad_kernel.cu | 153 +--------- paddle/phi/kernels/gpu/stack_kernel.cu | 78 +---- paddle/phi/kernels/gpu/unstack_grad_kernel.cu | 17 +- paddle/phi/kernels/gpu/unstack_kernel.cu | 31 +- paddle/phi/kernels/stack_grad_kernel.h | 2 +- paddle/phi/kernels/unstack_grad_kernel.h | 2 +- 8 files changed, 338 insertions(+), 234 deletions(-) create mode 100644 paddle/phi/kernels/funcs/stack_and_unstack.h diff --git a/paddle/phi/kernels/funcs/segmented_array.h b/paddle/phi/kernels/funcs/segmented_array.h index 0f03dbac591ec..aa03eb4e9fcd2 100644 --- a/paddle/phi/kernels/funcs/segmented_array.h +++ b/paddle/phi/kernels/funcs/segmented_array.h @@ -14,7 +14,7 @@ #pragma once -#include "paddle/phi/kernels/funcs/fast_divmod.h" +#include "paddle/phi/core/dense_tensor.h" namespace phi { namespace funcs { @@ -89,12 +89,11 @@ struct ArraySetterBase { ctx.GetPlace(), num_bytes, phi::Stream(reinterpret_cast(ctx.stream()))); - paddle::memory::Copy(ctx.GetPlace(), - allocation->ptr(), - phi::CPUPlace(), - src, - num_bytes, - ctx.stream()); + phi::backends::gpu::GpuMemcpyAsync(allocation->ptr(), + src, + num_bytes, + phi::gpuMemcpyHostToDevice, + ctx.stream()); return allocation->ptr(); } diff --git a/paddle/phi/kernels/funcs/stack_and_unstack.h b/paddle/phi/kernels/funcs/stack_and_unstack.h new file mode 100644 index 0000000000000..c516d4892bf62 --- /dev/null +++ b/paddle/phi/kernels/funcs/stack_and_unstack.h @@ -0,0 +1,276 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/kernels/funcs/fast_divmod.h" +#include "paddle/phi/kernels/funcs/segmented_array.h" + +namespace phi { +namespace funcs { + +template +__global__ void StackCudaKernel(ArrayT array, + GeneralDivMod divmoder, + IndexT split_size, + IndexT rows, + IndexT cols, + T* __restrict__ output) { + IndexT grid_x = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + IndexT grid_x_stride = static_cast(blockDim.x) * gridDim.x; + IndexT grid_y_stride = static_cast(blockDim.y) * gridDim.y; + + for (; grid_x < cols; grid_x += grid_x_stride) { + IndexT grid_y = static_cast(blockIdx.y) * blockDim.y + threadIdx.y; + + auto divmod_rslt = divmoder.div_mod(grid_x); + IndexT split = divmod_rslt[0]; // grid_x / split_size + IndexT col_offset = divmod_rslt[1]; // grid_x % split_size + const T* input_ptr = array.data[split]; +#pragma unroll + for (; grid_y < rows; grid_y += grid_y_stride) { + output[grid_y * cols + grid_x] = + input_ptr[grid_y * split_size + col_offset]; + } + } +} + +template +void LaunchStackKernel(const Context& ctx, + const IndexT x_col, + const IndexT x_row, + const IndexT out_col, + const std::vector& x, + DenseTensor* out) { + T* out_ptr = ctx.template Alloc(out); + auto config = phi::backends::gpu::GetGpuLaunchConfig2D(ctx, out_col, x_row); + + ConstPointerArraySetter setter(ctx, x); + GeneralDivMod divmoder(x_col); + StackCudaKernel + <<>>( + setter.array, divmoder, x_col, x_row, out_col, out_ptr); +} + +template +void StackRawKernel(const Context& ctx, + const std::vector& x, + int axis, + DenseTensor* out) { + if (axis < 0) axis += (x[0]->dims().size() + 1); + int num = static_cast(x.size()); + + // Split x dim from axis to matrix of shape [x_row, x_col], and the output + // tensor's shape is [x_row, out_col]. 
+ int64_t x_row = 1; + for (int i = 0; i < axis; ++i) { + x_row *= x[0]->dims()[i]; + } + int64_t x_col = x[0]->numel() / x_row; + int64_t out_col = x_col * num; + + if (out->numel() < std::numeric_limits::max()) { + switch (CalcArraySize(num)) { + SEGMENTED_ARRAY_KERNEL_HELPER( + LaunchStackKernel( + ctx, x_col, x_row, out_col, x, out)); + } + } else { + switch (CalcArraySize(num)) { + SEGMENTED_ARRAY_KERNEL_HELPER( + LaunchStackKernel( + ctx, x_col, x_row, out_col, x, out)); + } + } +} + +template +__global__ void UnStackCudaKernel(const T* __restrict__ input, + IndexT out_row, + IndexT split_dim, + IndexT out_col, + IndexT num_splits, + GeneralDivMod col_divmoder, + ArrayT array) { + assert(blockDim.y == 1); + assert(blockDim.z == 1); + // In this case they are equal + assert(split_dim % num_splits == 0); + + IndexT numel = out_row * split_dim * out_col; + IndexT each_dim_size = split_dim / num_splits; + IndexT split_dim_with_out_col = split_dim * out_col; + + IndexT offset = blockIdx.x * blockDim.x + threadIdx.x; + if (each_dim_size == 1) { + for (; offset < numel; offset += blockDim.x * gridDim.x) { + auto col_divmod_rslt = col_divmoder.div_mod(offset); + + IndexT i = offset / split_dim_with_out_col; + IndexT j = col_divmod_rslt[0] - i * split_dim; + IndexT k = col_divmod_rslt[1]; // offset % out_col + + T* output = array.data[j]; + if (output) { + IndexT output_idx = i * out_col + k; + *(output + output_idx) = input[offset]; + } + } + } else { + for (; offset < numel; offset += blockDim.x * gridDim.x) { + auto col_divmod_rslt = col_divmoder.div_mod(offset); + + IndexT i = offset / split_dim_with_out_col; + IndexT j = col_divmod_rslt[0] - i * split_dim; + IndexT k = col_divmod_rslt[1]; // offset % out_col + + T* output = array.data[j / each_dim_size]; + if (output) { + IndexT output_idx = (i + j % each_dim_size) * out_col + k; + *(output + output_idx) = input[offset]; + } + } + } +} + +template +__global__ void UnStackCudaKernelForLastDim(const T* __restrict__ in_data, + const IndexT cols, + const IndexT rows, + const IndexT tile_x_num, + ArrayT array) { + constexpr int buffer_size = 512; + __shared__ T s_buf[buffer_size]; + + for (IndexT tile_x = blockIdx.x; tile_x < tile_x_num; tile_x += gridDim.x) { + IndexT row_idx = tile_x * blockDim.x + threadIdx.x; + IndexT col_idx = blockIdx.y * blockDim.y + threadIdx.y; + int s_idx = threadIdx.y * blockDim.x + threadIdx.x; + bool is_valid = (col_idx < cols && row_idx < rows); + + if (is_valid) { + T data = in_data[row_idx * cols + col_idx]; + s_buf[s_idx] = data; + } + __syncthreads(); + if (is_valid) { + if (array.data[col_idx]) { + array.data[col_idx][row_idx] = s_buf[s_idx]; + } + } + } +} + +template +void LaunchUnStackKernel(const Context& ctx, + const IndexT out_row, + const IndexT split_dim, + const IndexT out_col, + const IndexT num_splits, + const DenseTensor& x, + std::vector* outs) { + // each tensor in outs should have same shape. 
+ VLOG(6) << "out_row=" << out_row << ", split_dim=" << split_dim + << ", out_col=" << out_col << ", num_splits=" << num_splits; + + auto x_ptr = x.data(); + PointerArraySetter setter(ctx, outs); + + if (out_col == 1) { + // For the case axis == (x.dims().size() - 1) + constexpr int kThreads = 512; + constexpr int kWarpSize = 32; + constexpr int kMaxOut = 16; + + int tid_x = 0, tid_y = 0, bid_x = 0, bid_y = 1; + if (split_dim < kMaxOut) { + tid_y = split_dim; + tid_x = + std::min(backends::gpu::RoundToNextHighPowOfTwo(out_row, kWarpSize), + kThreads / backends::gpu::RoundToNextHighPowOfTwo(tid_y)); + } else { + tid_y = kMaxOut; + tid_x = kWarpSize; + bid_y = backends::gpu::DivUp(split_dim, kMaxOut); + } + int tile_x_num = backends::gpu::DivUp(out_row, tid_x); + bid_x = std::min(tile_x_num, backends::gpu::kMultiDimslimit); + dim3 blocks(tid_x, tid_y, 1); + dim3 grids(bid_x, bid_y, 1); + + UnStackCudaKernelForLastDim + <<>>( + x_ptr, split_dim, out_row, tile_x_num, setter.array); + } else { + GeneralDivMod col_divmoder(out_col); + auto config = phi::backends::gpu::GetGpuLaunchConfig1D( + ctx, out_row * split_dim * out_col); + + UnStackCudaKernel + <<>>(x_ptr, + out_row, + split_dim, + out_col, + num_splits, + col_divmoder, + setter.array); + } +} + +template +void UnStackRawKernel(const Context& ctx, + const DenseTensor& x, + int axis, + std::vector* outs) { + auto x_dims = x.dims(); + + // Input tensor is splited to split_dim tensors along split_dim dimension. + int64_t split_dim = x_dims[axis]; + + // Treat outs[i] as [out_row, out_col], and x as [out_row, split_dim, + // out_col]. + int64_t out_row = 1; + for (int i = 0; i < axis; ++i) { + out_row *= x_dims[i]; + } + + int64_t out_col = x.numel() / (split_dim * out_row); + + if (x.numel() < std::numeric_limits::max()) { + switch (CalcArraySize(split_dim)) { + SEGMENTED_ARRAY_KERNEL_HELPER( + LaunchUnStackKernel( + ctx, out_row, split_dim, out_col, split_dim, x, outs)); + } + } else { + switch (CalcArraySize(split_dim)) { + SEGMENTED_ARRAY_KERNEL_HELPER( + LaunchUnStackKernel( + ctx, out_row, split_dim, out_col, split_dim, x, outs)); + } + } +} + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/gpu/stack_grad_kernel.cu b/paddle/phi/kernels/gpu/stack_grad_kernel.cu index 572ed4a361b4e..6c72a3562e6a7 100644 --- a/paddle/phi/kernels/gpu/stack_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/stack_grad_kernel.cu @@ -13,125 +13,13 @@ // limitations under the License. 
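The rewritten StackGradKernel below simply forwards to funcs::UnStackRawKernel, since the gradient of stack is an unstack of out_grad along the same axis. A small dygraph sketch of that identity (illustrative only):

import paddle

xs = [paddle.rand([2, 3]) for _ in range(4)]
for t in xs:
    t.stop_gradient = False
y = paddle.stack(xs, axis=1)        # shape [2, 4, 3]
y.sum().backward()
# d(sum)/dy is all ones, and each xs[i].grad is exactly the i-th slice of
# that gradient along axis 1, i.e. a [2, 3] tensor of ones.
print(xs[0].grad.shape)             # [2, 3]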
#include "paddle/phi/kernels/stack_grad_kernel.h" -#include "paddle/fluid/memory/memory.h" -#include "paddle/phi/backends/gpu/gpu_launch_config.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/funcs/segmented_array.h" +#include "paddle/phi/kernels/funcs/stack_and_unstack.h" namespace phi { -template -__global__ void UnStackCudaKernel(const T* __restrict__ input, - IndexT pre_dim_size, - IndexT split_dim_size, - IndexT suf_dim_size, - IndexT num_split, - ArrayT array) { - assert(blockDim.y == 1); - assert(blockDim.z == 1); - // In this case they are equal - assert(split_dim_size % num_split == 0); - - IndexT size = pre_dim_size * split_dim_size * suf_dim_size; - IndexT each_dim_size = split_dim_size / num_split; - - for (IndexT offset = blockIdx.x * blockDim.x + threadIdx.x; offset < size; - offset += blockDim.x * gridDim.x) { - IndexT i = offset / (split_dim_size * suf_dim_size); - IndexT j = (offset % (split_dim_size * suf_dim_size)) / suf_dim_size; - IndexT k = offset % suf_dim_size; - - T* output = array.data[j / each_dim_size]; - if (output == nullptr) { - return; - } - IndexT output_ind = i * each_dim_size * suf_dim_size + - (j % each_dim_size) * suf_dim_size + k; - *(output + output_ind) = input[offset]; - } -} - -template -__global__ void UnStackCudaKernelForLastDim(const T* __restrict__ in_data, - const IndexT cols, - const IndexT rows, - const IndexT tile_x_num, - ArrayT array) { - constexpr int buffer_size = 512; - __shared__ T s_buf[buffer_size]; - - for (IndexT tile_x = blockIdx.x; tile_x < tile_x_num; tile_x += gridDim.x) { - IndexT row_idx = tile_x * blockDim.x + threadIdx.x; - IndexT col_idx = blockIdx.y * blockDim.y + threadIdx.y; - int s_idx = threadIdx.y * blockDim.x + threadIdx.x; - bool is_valid = (col_idx < cols && row_idx < rows); - - if (is_valid) { - T data = in_data[row_idx * cols + col_idx]; - s_buf[s_idx] = data; - } - __syncthreads(); - if (is_valid) { - if (array.data[col_idx]) { - array.data[col_idx][row_idx] = s_buf[s_idx]; - } - } - } -} - -template -void LaunchUnStackKernel(const Context& ctx, - const IndexT pre_dim, - const IndexT split_dim, - const IndexT suf_dim, - const IndexT num_splits, - const DenseTensor& out_grad, - std::vector* x_grad) { - // each x_grad should have same shape - auto dout_ptr = out_grad.data(); - funcs::PointerArraySetter setter(ctx, x_grad); - - if (suf_dim == 1) { - // For the case axis == (out_grad.dims().size() - 1) - constexpr int kThreads = 512; - constexpr int kWarpSize = 32; - constexpr int kMaxOut = 16; - - int tid_x = 0, tid_y = 0, bid_x = 0, bid_y = 1; - if (split_dim < kMaxOut) { - tid_y = split_dim; - tid_x = - std::min(backends::gpu::RoundToNextHighPowOfTwo(pre_dim, kWarpSize), - kThreads / backends::gpu::RoundToNextHighPowOfTwo(tid_y)); - } else { - tid_y = kMaxOut; - tid_x = kWarpSize; - bid_y = backends::gpu::DivUp(split_dim, kMaxOut); - } - int tile_x_num = backends::gpu::DivUp(pre_dim, tid_x); - bid_x = std::min(tile_x_num, backends::gpu::kMultiDimslimit); - dim3 blocks(tid_x, tid_y, 1); - dim3 grids(bid_x, bid_y, 1); - - UnStackCudaKernelForLastDim - <<>>( - dout_ptr, split_dim, pre_dim, tile_x_num, setter.array); - } else { - auto config = phi::backends::gpu::GetGpuLaunchConfig1D( - ctx, pre_dim * split_dim * suf_dim); - - UnStackCudaKernel - <<>>( - dout_ptr, pre_dim, split_dim, suf_dim, num_splits, setter.array); - } -} - template void StackGradKernel(const Context& ctx, const DenseTensor& out_grad, @@ -144,41 +32,12 @@ void 
StackGradKernel(const Context& ctx, split_dim, x_grad.size(), phi::errors::InvalidArgument( - "Output x_grad size should be equal to the split_dim, but" - " received split_dim is:%d x_grad size is:%d.", + "Output x_grad's size should be equal to the split_dim, but" + " received split_dim is:%d x_grad's size is:%d.", split_dim, x_grad.size())); - auto dout_dims = out_grad.dims(); - int64_t dout_pre = 1; - for (int i = 0; i < axis; ++i) { - dout_pre *= dout_dims[i]; - } - int64_t dout_suf = out_grad.numel() / (split_dim * dout_pre); - - if (out_grad.numel() < std::numeric_limits::max()) { - switch (funcs::CalcArraySize(split_dim)) { - SEGMENTED_ARRAY_KERNEL_HELPER( - LaunchUnStackKernel(ctx, - dout_pre, - split_dim, - dout_suf, - split_dim, - out_grad, - &x_grad)); - } - } else { - switch (funcs::CalcArraySize(split_dim)) { - SEGMENTED_ARRAY_KERNEL_HELPER( - LaunchUnStackKernel(ctx, - dout_pre, - split_dim, - dout_suf, - split_dim, - out_grad, - &x_grad)); - } - } + funcs::UnStackRawKernel(ctx, out_grad, axis, &x_grad); } } // namespace phi diff --git a/paddle/phi/kernels/gpu/stack_kernel.cu b/paddle/phi/kernels/gpu/stack_kernel.cu index a50396e7c9729..e1d7d4e6f389c 100644 --- a/paddle/phi/kernels/gpu/stack_kernel.cu +++ b/paddle/phi/kernels/gpu/stack_kernel.cu @@ -13,89 +13,19 @@ // limitations under the License. #include "paddle/phi/kernels/stack_kernel.h" -#include "paddle/fluid/memory/memory.h" -#include "paddle/phi/backends/gpu/gpu_launch_config.h" -#include "paddle/phi/core/dense_tensor.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/funcs/segmented_array.h" +#include "paddle/phi/kernels/funcs/stack_and_unstack.h" namespace phi { -template -__global__ void StackCUDAKernel(ArrayT array, - funcs::GeneralDivMod divmoder, - IndexT split_size, - IndexT rows, - IndexT cols, - T* __restrict__ output) { - IndexT grid_x = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; - IndexT grid_x_stride = static_cast(blockDim.x) * gridDim.x; - IndexT grid_y_stride = static_cast(blockDim.y) * gridDim.y; - - for (; grid_x < cols; grid_x += grid_x_stride) { - IndexT grid_y = static_cast(blockIdx.y) * blockDim.y + threadIdx.y; - - auto divmod_rslt = divmoder.div_mod(grid_x); - IndexT split = divmod_rslt[0]; // grid_x / split_size - IndexT col_offset = divmod_rslt[1]; // grid_x % split_size - const T* input_ptr = array.data[split]; -#pragma unroll - for (; grid_y < rows; grid_y += grid_y_stride) { - output[grid_y * cols + grid_x] = - input_ptr[grid_y * split_size + col_offset]; - } - } -} - -template -void LaunchStackKernel(const Context& ctx, - const IndexT x_col, - const IndexT x_row, - const IndexT out_col, - const std::vector& x, - DenseTensor* out) { - T* out_ptr = ctx.template Alloc(out); - auto config = phi::backends::gpu::GetGpuLaunchConfig2D(ctx, out_col, x_row); - - funcs::ConstPointerArraySetter setter(ctx, x); - funcs::GeneralDivMod divmoder(x_col); - StackCUDAKernel - <<>>( - setter.array, divmoder, x_col, x_row, out_col, out_ptr); -} - template void StackKernel(const Context& ctx, const std::vector& x, int axis, DenseTensor* out) { - if (axis < 0) axis += (x[0]->dims().size() + 1); - int num = static_cast(x.size()); - - // Split x dim from axis to matrix - int64_t x_row = 1; - for (int i = 0; i < axis; ++i) { - x_row *= x[0]->dims()[i]; - } - int64_t x_col = x[0]->numel() / x_row; - int64_t out_col = x_col * num; - - if (out->numel() < std::numeric_limits::max()) { - switch (funcs::CalcArraySize(num)) { - 
SEGMENTED_ARRAY_KERNEL_HELPER( - LaunchStackKernel( - ctx, x_col, x_row, out_col, x, out)); - } - } else { - switch (funcs::CalcArraySize(num)) { - SEGMENTED_ARRAY_KERNEL_HELPER( - LaunchStackKernel( - ctx, x_col, x_row, out_col, x, out)); - } - } + funcs::StackRawKernel(ctx, x, axis, out); } } // namespace phi diff --git a/paddle/phi/kernels/gpu/unstack_grad_kernel.cu b/paddle/phi/kernels/gpu/unstack_grad_kernel.cu index b7c349de0df32..88bf155606c1b 100644 --- a/paddle/phi/kernels/gpu/unstack_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/unstack_grad_kernel.cu @@ -16,7 +16,19 @@ #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/impl/unstack_grad_kernel_impl.h" +#include "paddle/phi/kernels/funcs/stack_and_unstack.h" + +namespace phi { + +template +void UnStackGradKernel(const Context& ctx, + const std::vector& out_grad, + int axis, + DenseTensor* x_grad) { + funcs::StackRawKernel(ctx, out_grad, axis, x_grad); +} + +} // namespace phi PD_REGISTER_KERNEL(unstack_grad, GPU, @@ -26,4 +38,5 @@ PD_REGISTER_KERNEL(unstack_grad, double, int64_t, int, - phi::dtype::float16) {} + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpu/unstack_kernel.cu b/paddle/phi/kernels/gpu/unstack_kernel.cu index f147f4c0f0edf..4331322bdc202 100644 --- a/paddle/phi/kernels/gpu/unstack_kernel.cu +++ b/paddle/phi/kernels/gpu/unstack_kernel.cu @@ -16,7 +16,33 @@ #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/impl/unstack_kernel_impl.h" +#include "paddle/phi/kernels/funcs/stack_and_unstack.h" + +namespace phi { + +template +void UnStackKernel(const Context& ctx, + const DenseTensor& x, + int axis, + int num, + std::vector outs) { + if (x.numel() == 0) return; + if (axis < 0) axis += x.dims().size(); + + int64_t split_dim = x.dims()[axis]; + PADDLE_ENFORCE_EQ( + split_dim, + outs.size(), + phi::errors::InvalidArgument( + "Output outs's size should be equal to the split_dim, but" + " received split_dim is:%d outs's size is:%d.", + split_dim, + outs.size())); + + funcs::UnStackRawKernel(ctx, x, axis, &outs); +} + +} // namespace phi PD_REGISTER_KERNEL(unstack, GPU, @@ -26,4 +52,5 @@ PD_REGISTER_KERNEL(unstack, double, int64_t, int, - phi::dtype::float16) {} + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/stack_grad_kernel.h b/paddle/phi/kernels/stack_grad_kernel.h index 32451e606f26a..1e8f2d68399f8 100644 --- a/paddle/phi/kernels/stack_grad_kernel.h +++ b/paddle/phi/kernels/stack_grad_kernel.h @@ -20,7 +20,7 @@ namespace phi { template void StackGradKernel(const Context& dev_ctx, - const DenseTensor& out, + const DenseTensor& out_grad, int axis, std::vector x_grad); diff --git a/paddle/phi/kernels/unstack_grad_kernel.h b/paddle/phi/kernels/unstack_grad_kernel.h index de0e3004d8038..cb50f5ec9240c 100644 --- a/paddle/phi/kernels/unstack_grad_kernel.h +++ b/paddle/phi/kernels/unstack_grad_kernel.h @@ -20,7 +20,7 @@ namespace phi { template void UnStackGradKernel(const Context& dev_ctx, - const std::vector& x, + const std::vector& out_grad, int axis, DenseTensor* x_grad); From 4976153dd53605e0be3fecc4fec393396ab181c2 Mon Sep 17 00:00:00 2001 From: RedContritio Date: Tue, 31 Jan 2023 10:38:32 +0800 Subject: [PATCH 07/89] add dims check for nms_kernel (#49993) --- paddle/phi/kernels/cpu/nms_kernel.cc | 12 ++++++++++++ paddle/phi/kernels/gpu/nms_kernel.cu | 12 ++++++++++++ 2 files changed, 24 insertions(+) diff --git 
a/paddle/phi/kernels/cpu/nms_kernel.cc b/paddle/phi/kernels/cpu/nms_kernel.cc index 4b56f6bb95105..6743f13fff719 100644 --- a/paddle/phi/kernels/cpu/nms_kernel.cc +++ b/paddle/phi/kernels/cpu/nms_kernel.cc @@ -69,6 +69,18 @@ void NMSKernel(const Context& dev_ctx, const DenseTensor& boxes, float threshold, DenseTensor* output) { + PADDLE_ENFORCE_EQ( + boxes.dims().size(), + 2, + phi::errors::InvalidArgument("The shape [%s] of boxes must be (N, 4).", + boxes.dims())); + + PADDLE_ENFORCE_EQ( + boxes.dims()[1], + 4, + phi::errors::InvalidArgument("The shape [%s] of boxes must be (N, 4).", + boxes.dims())); + int64_t num_boxes = boxes.dims()[0]; DenseTensor output_tmp; output_tmp.Resize(phi::make_ddim({num_boxes})); diff --git a/paddle/phi/kernels/gpu/nms_kernel.cu b/paddle/phi/kernels/gpu/nms_kernel.cu index 79b0b8dfb1825..81f5ca8d1619f 100644 --- a/paddle/phi/kernels/gpu/nms_kernel.cu +++ b/paddle/phi/kernels/gpu/nms_kernel.cu @@ -59,6 +59,18 @@ void NMSKernel(const Context& dev_ctx, const DenseTensor& boxes, float threshold, DenseTensor* output) { + PADDLE_ENFORCE_EQ( + boxes.dims().size(), + 2, + phi::errors::InvalidArgument("The shape [%s] of boxes must be (N, 4).", + boxes.dims())); + + PADDLE_ENFORCE_EQ( + boxes.dims()[1], + 4, + phi::errors::InvalidArgument("The shape [%s] of boxes must be (N, 4).", + boxes.dims())); + const int64_t num_boxes = boxes.dims()[0]; const auto blocks_per_line = CeilDivide(num_boxes, threadsPerBlock); dim3 block(threadsPerBlock); From baf96a123b09a4755a3b4c787efaf256bf1f4cb0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=98=A5=E4=B9=94?= <83450930+Liyulingyue@users.noreply.github.com> Date: Tue, 31 Jan 2023 10:42:40 +0800 Subject: [PATCH 08/89] fix the div 0 error of pixel_shuffle (#49996) --- paddle/phi/infermeta/unary.cc | 4 ++++ python/paddle/fluid/tests/unittests/test_pixel_shuffle.py | 7 +++++++ 2 files changed, 11 insertions(+) diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 55e895c6622a6..8cea16f770631 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -2533,6 +2533,10 @@ void PixelShuffleInferMeta(const MetaTensor& x, "Input should be a 4-D tensor of format [N, C, H, W] " "or [N, H, W, C], but got %u.", input_dims.size())); + PADDLE_ENFORCE_NE( + upscale_factor, + 0, + phi::errors::InvalidArgument("upscale_factor should not be 0.")); const bool channel_last = (data_format == "NHWC"); diff --git a/python/paddle/fluid/tests/unittests/test_pixel_shuffle.py b/python/paddle/fluid/tests/unittests/test_pixel_shuffle.py index 196a4ddbd4005..9600f5a872c56 100644 --- a/python/paddle/fluid/tests/unittests/test_pixel_shuffle.py +++ b/python/paddle/fluid/tests/unittests/test_pixel_shuffle.py @@ -227,6 +227,13 @@ def error_upscale_factor(): self.assertRaises(TypeError, error_upscale_factor) + def error_0_upscale_factor(): + with paddle.fluid.dygraph.guard(): + x = paddle.uniform([1, 1, 1, 1], dtype='float64') + pixel_shuffle = F.pixel_shuffle(x, 0) + + self.assertRaises(ValueError, error_0_upscale_factor) + def error_data_format(): with paddle.fluid.dygraph.guard(): x = np.random.random([2, 9, 4, 4]).astype("float64") From 66682be0c0f0ca80f115f053977a971951e736d3 Mon Sep 17 00:00:00 2001 From: RedContritio Date: Tue, 31 Jan 2023 10:43:40 +0800 Subject: [PATCH 09/89] =?UTF-8?q?Fix=20=E5=A0=86=E6=A0=88=E6=BA=A2?= =?UTF-8?q?=E5=87=BA=20(stack=20overflow)=20of=20case9:=20paddle.repeat=5F?= =?UTF-8?q?interleave=20(#49982)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit * support negative index in repeat_interleave * add unittest --- paddle/phi/infermeta/unary.cc | 39 ++++++++++++------- .../unittests/test_repeat_interleave_op.py | 20 ++++++++++ 2 files changed, 46 insertions(+), 13 deletions(-) diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 8cea16f770631..e08f1769bef48 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -3075,27 +3075,40 @@ void RepeatInterleaveInferMeta(const MetaTensor& x, MetaTensor* out) { const auto& input_dim = x.dims(); auto output_dim = phi::vectorize(input_dim); + auto n_dim = dim; - PADDLE_ENFORCE_EQ( - dim < input_dim.size() && dim >= (0 - input_dim.size()), - true, + if (n_dim < 0) n_dim += input_dim.size(); + + PADDLE_ENFORCE_LT( + dim, + input_dim.size(), phi::errors::OutOfRange( "Attr(dim) is out of range, It's expected " - "to be in range of [-%d, %d]. But received Attr(dim) = %d.", - input_dim.size(), + "to be in range of [%d, %d]. But received Attr(dim) = %d.", + -input_dim.size(), input_dim.size() - 1, dim)); - PADDLE_ENFORCE_EQ( - repeats > 0, - true, + PADDLE_ENFORCE_GE( + dim, + (0 - input_dim.size()), + phi::errors::OutOfRange( + "Attr(dim) is out of range, It's expected " + "to be in range of [%d, %d]. But received Attr(dim) = %d.", + -input_dim.size(), + input_dim.size() - 1, + dim)); + + PADDLE_ENFORCE_GT( + repeats, + 0, phi::errors::InvalidArgument("repeats should be larger than zero")); - PADDLE_ENFORCE_NE(out, - nullptr, - phi::errors::InvalidArgument( - "repeat_interleave's output tensor can't be nullptr")); + PADDLE_ENFORCE_NOT_NULL( + out, + phi::errors::InvalidArgument( + "repeat_interleave's output tensor can't be nullptr")); - output_dim[dim] = input_dim[dim] * repeats; + output_dim[n_dim] = input_dim[n_dim] * repeats; out->set_dims(phi::make_ddim(output_dim)); out->share_lod(x); out->set_dtype(x.dtype()); diff --git a/python/paddle/fluid/tests/unittests/test_repeat_interleave_op.py b/python/paddle/fluid/tests/unittests/test_repeat_interleave_op.py index 90877a3047e2c..4b5272c5a4bdf 100644 --- a/python/paddle/fluid/tests/unittests/test_repeat_interleave_op.py +++ b/python/paddle/fluid/tests/unittests/test_repeat_interleave_op.py @@ -188,6 +188,26 @@ def test_repeat_interleave_api(self): expect_out = np.repeat(self.data_zero_dim_x, repeats) np.testing.assert_allclose(expect_out, np.array(res), rtol=1e-05) + # case 4 negative axis: + with program_guard(Program(), Program()): + x = paddle.static.data(name='x', shape=[-1, 4], dtype='float32') + x.desc.set_need_check_feed(False) + index = paddle.static.data( + name='repeats_', + shape=[4], + dtype='int32', + ) + index.desc.set_need_check_feed(False) + z = paddle.repeat_interleave(x, index, axis=-1) + exe = fluid.Executor(fluid.CPUPlace()) + (res,) = exe.run( + feed={'x': self.data_x, 'repeats_': self.data_index}, + fetch_list=[z.name], + return_numpy=False, + ) + expect_out = np.repeat(self.data_x, self.data_index, axis=-1) + np.testing.assert_allclose(expect_out, np.array(res), rtol=1e-05) + def test_dygraph_api(self): self.input_data() # case axis none From fb74147c6aa3ae8a1256f8e84f46af3632190f44 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=98=A5=E4=B9=94?= <83450930+Liyulingyue@users.noreply.github.com> Date: Tue, 31 Jan 2023 10:44:57 +0800 Subject: [PATCH 10/89] Fix the div 0 error of matrix_power (#49942) * add zero size check in matrix_power_kernel_impl.h * add zero size check in matrix_power_kernel_impl.h * add zero size check in unittest * 
bug_fix * bug_fix * bug_fix * bug_fix * bug_fix * bug fix * bug_fix * bug_fix * add static check * delete the dy codes --- paddle/phi/infermeta/unary.cc | 5 +++++ python/paddle/fluid/tests/unittests/test_matrix_power_op.py | 4 ++++ 2 files changed, 9 insertions(+) diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index e08f1769bef48..8a3c33a4d6c72 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -1889,6 +1889,11 @@ void MatrixPowerInferMeta(const MetaTensor& x, int n, MetaTensor* out) { "The Input(X) should have at least 2 dimensions. But " "received a %d dimension tensor.", n_dim)); + for (int i = 0; i < n_dim; ++i) + PADDLE_ENFORCE_NE( + dims[i], + 0, + phi::errors::InvalidArgument("The size of Input(X) should not be 0.")); PADDLE_ENFORCE_EQ(dims[n_dim - 2], dims[n_dim - 1], phi::errors::InvalidArgument( diff --git a/python/paddle/fluid/tests/unittests/test_matrix_power_op.py b/python/paddle/fluid/tests/unittests/test_matrix_power_op.py index 29f82b0350d65..7f26a7170191f 100644 --- a/python/paddle/fluid/tests/unittests/test_matrix_power_op.py +++ b/python/paddle/fluid/tests/unittests/test_matrix_power_op.py @@ -312,6 +312,10 @@ def test_errors(self): input = fluid.data(name="input_3", shape=[4, 5], dtype="float32") self.assertRaises(ValueError, paddle.linalg.matrix_power, input, 2) + # The size of input should not be 0 + input = fluid.data(name="input_4", shape=[1, 1, 0, 0], dtype="float32") + self.assertRaises(ValueError, paddle.linalg.matrix_power, input, 2) + class TestMatrixPowerSingularAPI(unittest.TestCase): def setUp(self): From 7bb67db3d53297df8cce4b30992bb1035ba3bf62 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=98=A5=E4=B9=94?= <83450930+Liyulingyue@users.noreply.github.com> Date: Tue, 31 Jan 2023 10:48:58 +0800 Subject: [PATCH 11/89] fix the div 0 errors in psroi_pool (#49965) * fix the div 0 errors in psroi_pool * fix case 7 * rool back sth. 
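The check added in python/paddle/vision/ops.py below rejects an output_size containing 0, because output_channels is inferred as C / (pooled_height * pooled_width). A dygraph sketch of the guarded call, similar to the new unit test (error text matches the added check):

import paddle

x = paddle.uniform([2, 490, 28, 28], dtype='float32')
boxes = paddle.to_tensor([[1, 5, 8, 10], [4, 2, 6, 7]], dtype='float32')
boxes_num = paddle.to_tensor([1, 1], dtype='int32')
try:
    paddle.vision.ops.psroi_pool(x, boxes, boxes_num, output_size=0)
except ValueError as e:
    print(e)    # "output_size should not contain 0."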
--- .../fluid/tests/unittests/test_psroi_pool_op.py | 16 ++++++++++++++++ python/paddle/vision/ops.py | 2 ++ 2 files changed, 18 insertions(+) diff --git a/python/paddle/fluid/tests/unittests/test_psroi_pool_op.py b/python/paddle/fluid/tests/unittests/test_psroi_pool_op.py index 40f3c52d4fc03..c33d218cd8621 100644 --- a/python/paddle/fluid/tests/unittests/test_psroi_pool_op.py +++ b/python/paddle/fluid/tests/unittests/test_psroi_pool_op.py @@ -339,6 +339,22 @@ def test_channel_error(): self.assertRaises(ValueError, test_channel_error) +class TestPSROIPoolZeroDivError(unittest.TestCase): + def setUp(self): + paddle.disable_static() + self.x = paddle.uniform([2, 490, 28, 28], dtype='float32') + self.boxes = paddle.to_tensor( + [[1, 5, 8, 10], [4, 2, 6, 7], [12, 12, 19, 21]], dtype='float32' + ) + self.boxes_num = paddle.to_tensor([1, 2], dtype='int32') + + def test_errors(self): + def test_zero_div_error(): + paddle.vision.ops.psroi_pool(self.x, self.boxes, self.boxes_num, 0) + + self.assertRaises(ValueError, test_zero_div_error) + + class TestPSROIPoolStaticAPI(unittest.TestCase): def setUp(self): paddle.enable_static() diff --git a/python/paddle/vision/ops.py b/python/paddle/vision/ops.py index 0696b5f7cc6a1..0d43bd0fc54ce 100755 --- a/python/paddle/vision/ops.py +++ b/python/paddle/vision/ops.py @@ -1424,6 +1424,8 @@ def psroi_pool(x, boxes, boxes_num, output_size, spatial_scale=1.0, name=None): output_size = (output_size, output_size) pooled_height, pooled_width = output_size assert len(x.shape) == 4, "Input features with shape should be (N, C, H, W)" + if pooled_height * pooled_width == 0: + raise ValueError('output_size should not contain 0.') output_channels = int(x.shape[1] / (pooled_height * pooled_width)) if in_dygraph_mode(): return _C_ops.psroi_pool( From 82edc65ba2e533c25cf6cd34117f43268043ba44 Mon Sep 17 00:00:00 2001 From: RedContritio Date: Tue, 31 Jan 2023 10:50:16 +0800 Subject: [PATCH 12/89] =?UTF-8?q?Fix=20=E7=A9=BA=E6=8C=87=E9=92=88=20(Null?= =?UTF-8?q?=20pointer)=20of=20case=2014=20paddle.atan2=20(#49973)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add elements count check in atan2 * add unittest and pre-check in inferMeta * add dimension check --- paddle/phi/infermeta/binary.cc | 20 +++++++++++++++++++ paddle/phi/kernels/impl/atan2_kernel_impl.h | 8 ++++++++ .../fluid/tests/unittests/test_atan2_op.py | 12 +++++++++++ 3 files changed, 40 insertions(+) diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc index 561938adca80a..3ca56e0602c1d 100644 --- a/paddle/phi/infermeta/binary.cc +++ b/paddle/phi/infermeta/binary.cc @@ -142,6 +142,26 @@ void KLDivInferMeta(const MetaTensor& x, } void Atan2InferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out) { + auto x_dims = x.dims(); + auto y_dims = y.dims(); + + PADDLE_ENFORCE_EQ( + x_dims.size(), + y_dims.size(), + phi::errors::InvalidArgument("The rank (%d) of X shall be same as " + "rank (%d) of Y.", + x_dims.size(), + y_dims.size())); + + if (x_dims.size() > 0) + PADDLE_ENFORCE_LE(x_dims[0], + y_dims[0], + phi::errors::InvalidArgument( + "The count (%d) of elements of X shall not " + "greater than count (%d) of elements of Y.", + x_dims[0], + y_dims[0])); + out->share_meta(x); if (x.dtype() == DataType::INT32 || x.dtype() == DataType::INT64 || y.dtype() == DataType::INT32 || y.dtype() == DataType::INT64) { diff --git a/paddle/phi/kernels/impl/atan2_kernel_impl.h b/paddle/phi/kernels/impl/atan2_kernel_impl.h index 2cae914e2f615..b7799a777046f 
100644 --- a/paddle/phi/kernels/impl/atan2_kernel_impl.h +++ b/paddle/phi/kernels/impl/atan2_kernel_impl.h @@ -77,6 +77,14 @@ void Atan2Kernel(const Context& ctx, auto x_data = x.data(); auto y_data = y.data(); + PADDLE_ENFORCE_LE( + numel, + y.numel(), + phi::errors::InvalidArgument("The count (%d) of elements of X shall not " + "greater than count (%d) of elements of Y.", + numel, + y.numel())); + auto* out_data = ctx.template Alloc::type>( out, size_t(x.numel() * sizeof(typename Atan2Out::type))); diff --git a/python/paddle/fluid/tests/unittests/test_atan2_op.py b/python/paddle/fluid/tests/unittests/test_atan2_op.py index 77ad77e3252b8..6b62b25ac5d8a 100644 --- a/python/paddle/fluid/tests/unittests/test_atan2_op.py +++ b/python/paddle/fluid/tests/unittests/test_atan2_op.py @@ -130,6 +130,18 @@ def run(place): run(place) +class TestAtan2Error(unittest.TestCase): + def test_mismatch(self): + paddle.enable_static() + + def test_mismatch_numel(): + X = paddle.fluid.data('X', (1,), dtype=np.float64) + Y = paddle.fluid.data('Y', (0,), dtype=np.float64) + out = paddle.atan2(X, Y) + + self.assertRaises(ValueError, test_mismatch_numel) + + if __name__ == '__main__': paddle.enable_static() unittest.main() From dbfdefa750bb7f8148b6b59277310888df89447a Mon Sep 17 00:00:00 2001 From: RedContritio Date: Tue, 31 Jan 2023 10:51:23 +0800 Subject: [PATCH 13/89] =?UTF-8?q?Fix=20=E5=A0=86=E6=A0=88=E6=BA=A2?= =?UTF-8?q?=E5=87=BA=20(stack=20overflow)=20of=20case10:=20paddle.unique?= =?UTF-8?q?=20(#49981)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add axis check in UniqueRawInferMeta * add unittest for negative axis * simplify check for unique --- paddle/phi/infermeta/unary.cc | 9 +++++++ .../fluid/tests/unittests/test_unique.py | 26 +++++++++++++++++++ 2 files changed, 35 insertions(+) diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 8a3c33a4d6c72..3b3202c291725 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -4648,6 +4648,7 @@ void UniqueRawInferMeta(const MetaTensor& x, if (axis_value < 0) { axis_value += x.dims().size(); } + PADDLE_ENFORCE_LT( axis_value, x.dims().size(), @@ -4655,6 +4656,14 @@ void UniqueRawInferMeta(const MetaTensor& x, "the dimension size(%d) of x.", axis_value, x.dims().size())); + PADDLE_ENFORCE_GE( + axis_value, + 0, + phi::errors::InvalidArgument( + "The axis(%d) + rank(x) (%d) should be greater than or equal to 0.", + axis_value, + -x.dims().size())); + auto out_dims = x.dims(); out_dims[axis_value] = -1; out->set_dims(out_dims); diff --git a/python/paddle/fluid/tests/unittests/test_unique.py b/python/paddle/fluid/tests/unittests/test_unique.py index 9183c1bd676bb..b3ae10a6c335e 100644 --- a/python/paddle/fluid/tests/unittests/test_unique.py +++ b/python/paddle/fluid/tests/unittests/test_unique.py @@ -190,6 +190,32 @@ def init_config(self): } +class TestUniqueOpAxisNeg(TestUniqueOp): + def init_config(self): + self.inputs = {'X': np.random.random((6, 1, 8)).astype('float64')} + unique, indices, inverse, counts = np.unique( + self.inputs['X'], + return_index=True, + return_inverse=True, + return_counts=True, + axis=-1, + ) + self.attrs = { + 'dtype': int(core.VarDesc.VarType.INT32), + "return_index": True, + "return_inverse": True, + "return_counts": True, + "axis": [-1], + "is_sorted": True, + } + self.outputs = { + 'Out': unique, + 'Indices': indices, + "Index": inverse, + "Counts": counts, + } + + class TestUniqueOpAxis1(TestUniqueOp): def init_config(self): 
self.inputs = {'X': np.random.random((3, 8, 8)).astype('float64')} From 52672ea5eeee47e57d1389528c48e2aae3c159ae Mon Sep 17 00:00:00 2001 From: RedContritio Date: Tue, 31 Jan 2023 10:52:06 +0800 Subject: [PATCH 14/89] Fix Python IndexError of case17: paddle.nn.functional.interpolate (#49992) * add dimension check for interpolate * modify dimension check for interpolate * add unittest to size check for interpolate * fix incorrect shape check for interpolate * split size check and add unittests --- .../tests/unittests/test_bicubic_interp_op.py | 7 +++++++ .../unittests/test_bicubic_interp_v2_op.py | 17 +++++++++++++++++ python/paddle/nn/functional/common.py | 17 +++++++++++++++++ 3 files changed, 41 insertions(+) diff --git a/python/paddle/fluid/tests/unittests/test_bicubic_interp_op.py b/python/paddle/fluid/tests/unittests/test_bicubic_interp_op.py index b5d1a7d0dfd5c..6482a5fddf9a8 100644 --- a/python/paddle/fluid/tests/unittests/test_bicubic_interp_op.py +++ b/python/paddle/fluid/tests/unittests/test_bicubic_interp_op.py @@ -390,6 +390,12 @@ def test_input_shape(): x, size=[12, 12], mode='BICUBIC', align_corners=False ) + def test_size_shape(): + x = fluid.data(name="x", shape=[2, 3, 6, 6], dtype="float32") + out = interpolate( + x, size=[12], mode='BICUBIC', align_corners=False + ) + def test_align_corcers(): x = fluid.data(name="x", shape=[2, 3, 6, 6], dtype="float32") interpolate(x, size=[12, 12], mode='BICUBIC', align_corners=3) @@ -481,6 +487,7 @@ def test_outshape_and_scale(): self.assertRaises(ValueError, test_mode_type) self.assertRaises(ValueError, test_input_shape) + self.assertRaises(ValueError, test_size_shape) self.assertRaises(TypeError, test_align_corcers) self.assertRaises(ValueError, test_attr_data_format) self.assertRaises(TypeError, test_actual_shape) diff --git a/python/paddle/fluid/tests/unittests/test_bicubic_interp_v2_op.py b/python/paddle/fluid/tests/unittests/test_bicubic_interp_v2_op.py index fed25ad18d258..a52a5b3f36d13 100644 --- a/python/paddle/fluid/tests/unittests/test_bicubic_interp_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_bicubic_interp_v2_op.py @@ -610,6 +610,20 @@ def test_size_type(): x, size={2, 2}, mode='bicubic', align_corners=False ) + def test_size_length(): + x = fluid.data(name="x", shape=[2, 3, 6, 6], dtype="float32") + out = interpolate(x, size=[2], mode='bicubic', align_corners=False) + + def test_size_tensor_ndim(): + x = fluid.data(name="x", shape=[2, 3, 6, 6], dtype="float32") + size = paddle.to_tensor(np.array([[2, 2]])) + out = interpolate(x, size=size, mode='bicubic', align_corners=False) + + def test_size_tensor_length(): + x = fluid.data(name="x", shape=[2, 3, 6, 6], dtype="float32") + size = paddle.to_tensor(np.array([2])) + out = interpolate(x, size=size, mode='bicubic', align_corners=False) + def test_input_shape_1(): x = fluid.data(name="x", shape=[2, 1, 0, 0], dtype="float32") out = interpolate( @@ -633,6 +647,9 @@ def test_input_shape_1(): self.assertRaises(ValueError, test_size_and_scale) self.assertRaises(ValueError, test_size_and_scale2) self.assertRaises(TypeError, test_size_type) + self.assertRaises(ValueError, test_size_length) + self.assertRaises(ValueError, test_size_tensor_ndim) + self.assertRaises(ValueError, test_size_tensor_length) self.assertRaises(ValueError, test_input_shape_1) def test_errors(self): diff --git a/python/paddle/nn/functional/common.py b/python/paddle/nn/functional/common.py index 6631682d0e949..d9f5b0b160dc0 100644 --- a/python/paddle/nn/functional/common.py +++ 
b/python/paddle/nn/functional/common.py @@ -397,6 +397,23 @@ def interpolate( if size is None and scale_factor is None: raise ValueError("One of size and scale_factor must not be None.") + if (isinstance(size, list) or isinstance(size, tuple)) and len( + size + ) != x.ndim - 2: + raise ValueError( + 'The x and size should satisfy rank(x) - 2 == len(size).' + ) + + if isinstance(size, Variable): + if size.ndim != 1: + raise ValueError( + f"If size is a tensor, it's rank must be 1, but received {size.ndim}." + ) + if size.shape[0] != x.ndim - 2: + raise ValueError( + 'The x and size should satisfy rank(x) - 2 == size.shape[0].' + ) + if not isinstance(align_corners, bool): raise TypeError("Attr align_corners should be a bool value") From 1755a1549987601af10a3e6228bfbe41b796ff2e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=98=A5=E4=B9=94?= <83450930+Liyulingyue@users.noreply.github.com> Date: Tue, 31 Jan 2023 11:00:05 +0800 Subject: [PATCH 15/89] fix div 0 error in conv1_transpose (#50000) --- paddle/phi/kernels/funcs/concat_and_split_functor.cc | 5 +++++ .../unittests/test_functional_conv1d_transpose.py | 12 ++++++++++++ 2 files changed, 17 insertions(+) diff --git a/paddle/phi/kernels/funcs/concat_and_split_functor.cc b/paddle/phi/kernels/funcs/concat_and_split_functor.cc index aa73ba5f68990..fd61484eb8526 100644 --- a/paddle/phi/kernels/funcs/concat_and_split_functor.cc +++ b/paddle/phi/kernels/funcs/concat_and_split_functor.cc @@ -37,6 +37,11 @@ struct ConcatFunctor { } int64_t out_rows = rows, out_cols = 0; + PADDLE_ENFORCE_NE( + rows, + 0, + phi::errors::InvalidArgument("The input size should not be 0.")); + std::vector input_cols(input.size()); for (size_t i = 0; i < num; ++i) { int64_t t_cols = input[i].numel() / rows; diff --git a/python/paddle/fluid/tests/unittests/test_functional_conv1d_transpose.py b/python/paddle/fluid/tests/unittests/test_functional_conv1d_transpose.py index 1d4e079f9f84a..865c848f8ba1d 100644 --- a/python/paddle/fluid/tests/unittests/test_functional_conv1d_transpose.py +++ b/python/paddle/fluid/tests/unittests/test_functional_conv1d_transpose.py @@ -82,5 +82,17 @@ def setUp(self): self.data_format = "NCL" +class TestFunctionalConv1DErrorCase3(TestFunctionalConv1DError): + def setUp(self): + self.input = np.random.randn(6, 0, 6) + self.filter = np.random.randn(6, 0, 0) + self.bias = None + self.padding = 0 + self.stride = 1 + self.dilation = 1 + self.groups = 1 + self.data_format = "NCL" + + if __name__ == "__main__": unittest.main() From 0d32f554c17a97aa534b4ff9901dcfa9a9c77f97 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=98=A5=E4=B9=94?= <83450930+Liyulingyue@users.noreply.github.com> Date: Tue, 31 Jan 2023 11:01:20 +0800 Subject: [PATCH 16/89] fix the indexerror of conv2d_transpose (#50005) --- .../fluid/tests/unittests/test_conv2d_transpose_op.py | 11 +++++++++++ python/paddle/static/nn/common.py | 3 +++ 2 files changed, 14 insertions(+) diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py index afbce517f6243..89339303567f2 100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py @@ -989,6 +989,17 @@ def error_groups(): self.assertRaises(ValueError, error_groups) + def error_0_filter_number(): + out = paddle.static.nn.conv2d_transpose( + input=data, + groups=1, + num_filters=0, + filter_size=3, + data_format='NCHW', + ) + + self.assertRaises(ValueError, 
error_0_filter_number) + class TestConv2DTransposeRepr(unittest.TestCase): def test_case(self): diff --git a/python/paddle/static/nn/common.py b/python/paddle/static/nn/common.py index 5da81feb3369d..3b40153cbb797 100644 --- a/python/paddle/static/nn/common.py +++ b/python/paddle/static/nn/common.py @@ -1542,6 +1542,9 @@ def conv2d_transpose( "but received {}".format(len(input.shape)) ) + if num_filters == 0: + raise ValueError("num of filters should not be 0.") + if data_format not in ['NCHW', 'NHWC']: raise ValueError( "Attr(data_format) of Op(paddle.static.nn.layers.conv2d_transpose) got wrong value: received " From da11aa40efa0e6ef4bfbcd72c9e3f8f86c39cd06 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=AD=A3=E6=B5=B7?= <65210872+ccsuzzh@users.noreply.github.com> Date: Tue, 31 Jan 2023 11:02:11 +0800 Subject: [PATCH 17/89] Fix Python IndexError of case13: paddle.static.nn.batch_norm (#50011) * add channel_num check for paddle.static.nn.batch_norm * fix bugs * fix bugs --- python/paddle/fluid/tests/unittests/test_batch_norm_op.py | 4 ++++ python/paddle/fluid/tests/unittests/test_fold_op.py | 2 +- python/paddle/static/nn/common.py | 6 ++++++ 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py index c2a6c468e5c8f..02171db3fca75 100644 --- a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py @@ -768,6 +768,10 @@ def test_errors(self): ) self.assertRaises(TypeError, paddle.static.nn.batch_norm, x2) + # the first dimension of input for batch_norm must between [2d, 5d]. + x3 = paddle.static.data("", shape=[0], dtype="float32") + self.assertRaises(ValueError, paddle.static.nn.batch_norm, x3) + class TestDygraphBatchNormAPIError(unittest.TestCase): def test_errors(self): diff --git a/python/paddle/fluid/tests/unittests/test_fold_op.py b/python/paddle/fluid/tests/unittests/test_fold_op.py index 1f3193fa1fd49..a86161cc45023 100644 --- a/python/paddle/fluid/tests/unittests/test_fold_op.py +++ b/python/paddle/fluid/tests/unittests/test_fold_op.py @@ -179,7 +179,7 @@ def test_errors(self): with program_guard(Program(), Program()): def test_input_shape(): - # input_shpae must be 3-D + # input_shape must be 3-D x = paddle.randn(shape=[2, 3, 6, 7], dtype="float32") out = fold(x, output_sizes=[2, 3], kernel_sizes=[2, 2]) diff --git a/python/paddle/static/nn/common.py b/python/paddle/static/nn/common.py index 3b40153cbb797..c43385a8e9140 100644 --- a/python/paddle/static/nn/common.py +++ b/python/paddle/static/nn/common.py @@ -2731,6 +2731,12 @@ def batch_norm( dtype = core.VarDesc.VarType.FP32 input_shape = input.shape + if len(input.shape) < 2 or len(input.shape) > 5: + raise ValueError( + 'expected 2D or 3D or 4D or 5D input (got {}D input, input shape is: {})'.format( + len(input.shape), input_shape + ) + ) if data_layout == 'NCHW': channel_num = input_shape[1] else: From a48ef36002c9f52bb7b4b6f6c3426cc913433ce1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=98=A5=E4=B9=94?= <83450930+Liyulingyue@users.noreply.github.com> Date: Tue, 31 Jan 2023 11:04:07 +0800 Subject: [PATCH 18/89] fix the NullPointerError of median (#50017) --- python/paddle/fluid/tests/unittests/test_median.py | 1 + python/paddle/tensor/stat.py | 3 +++ 2 files changed, 4 insertions(+) diff --git a/python/paddle/fluid/tests/unittests/test_median.py b/python/paddle/fluid/tests/unittests/test_median.py index 
a62e722dd0496..1f90faeac018b 100644 --- a/python/paddle/fluid/tests/unittests/test_median.py +++ b/python/paddle/fluid/tests/unittests/test_median.py @@ -86,6 +86,7 @@ def test_median_exception(self): x = paddle.arange(12).reshape([3, 4]) self.assertRaises(ValueError, paddle.median, x, 1.0) self.assertRaises(ValueError, paddle.median, x, 2) + self.assertRaises(ValueError, paddle.median, paddle.to_tensor([])) if __name__ == '__main__': diff --git a/python/paddle/tensor/stat.py b/python/paddle/tensor/stat.py index e23f28aa76b1b..cc94aee415541 100644 --- a/python/paddle/tensor/stat.py +++ b/python/paddle/tensor/stat.py @@ -406,6 +406,9 @@ def median(x, axis=None, keepdim=False, name=None): if not isinstance(x, Variable): raise TypeError("In median, the input x should be a Tensor.") + if x.size == 0: + raise ValueError("In median, the size of input x should not be 0.") + if len(x.shape) == 0: return x.clone() From 48b3e86956fd8b25f11be60e04df0b63df857d4c Mon Sep 17 00:00:00 2001 From: HongyuJia Date: Tue, 31 Jan 2023 11:27:46 +0800 Subject: [PATCH 19/89] [Decouple phi] Decouple custom_op in fluid and phi (#49866) * decouple phi custom_op * decouple phi custom_op, remove codes * delete custom symbol of inference --- paddle/fluid/inference/paddle_inference.map | 3 --- .../inference/paddle_inference_custom_device.map | 3 --- paddle/phi/api/ext/op_meta_info.h | 10 ---------- paddle/phi/api/lib/op_meta_info.cc | 12 ------------ 4 files changed, 28 deletions(-) diff --git a/paddle/fluid/inference/paddle_inference.map b/paddle/fluid/inference/paddle_inference.map index acbdcf5d78358..3d2dc85cb8368 100644 --- a/paddle/fluid/inference/paddle_inference.map +++ b/paddle/fluid/inference/paddle_inference.map @@ -39,9 +39,6 @@ *paddle::GPUContextResource*; *paddle::CPUContextResource*; - *paddle::LoadCustomOperatorLib*; - *paddle::RegisterAllCustomOperator*; - /* ut needs the following symbol, we need to modify all the ut to hidden such symbols */ /* Another question: the ut size will grow from 50M to 80M, why? */ diff --git a/paddle/fluid/inference/paddle_inference_custom_device.map b/paddle/fluid/inference/paddle_inference_custom_device.map index 83de7d9a77566..7434678d8679a 100644 --- a/paddle/fluid/inference/paddle_inference_custom_device.map +++ b/paddle/fluid/inference/paddle_inference_custom_device.map @@ -39,9 +39,6 @@ *paddle::GPUContextResource*; *paddle::CPUContextResource*; - *paddle::LoadCustomOperatorLib*; - *paddle::RegisterAllCustomOperator*; - /* ut needs the following symbol, we need to modify all the ut to hidden such symbols */ /* Another question: the ut size will grow from 50M to 80M, why? */ diff --git a/paddle/phi/api/ext/op_meta_info.h b/paddle/phi/api/ext/op_meta_info.h index 7d2be9c90d79e..978599c3bc299 100644 --- a/paddle/phi/api/ext/op_meta_info.h +++ b/paddle/phi/api/ext/op_meta_info.h @@ -619,16 +619,6 @@ class PADDLE_API OpMetaInfoBuilder { size_t index_; }; -/////////////////////// Op register API ///////////////////////// - -// For inference: compile directly with framework -// Call after PD_BUILD_OP(...) 
-void RegisterAllCustomOperator(); - -// Using this api to load compiled custom operator's dynamic library and -// register Custom Operator into it -void LoadCustomOperatorLib(const std::string& dso_name); - /////////////////////// Op register Macro ///////////////////////// #define PD_BUILD_OP(op_name) \ diff --git a/paddle/phi/api/lib/op_meta_info.cc b/paddle/phi/api/lib/op_meta_info.cc index 8d64246bdb69f..a6b7921c30c61 100644 --- a/paddle/phi/api/lib/op_meta_info.cc +++ b/paddle/phi/api/lib/op_meta_info.cc @@ -19,7 +19,6 @@ limitations under the License. */ #include #include "glog/logging.h" -#include "paddle/fluid/framework/custom_operator.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/enforce.h" @@ -244,17 +243,6 @@ OpMetaInfoBuilder& OpMetaInfoBuilder::SetInferDtypeFn(InferDtypeFunc func) { info_ptr_->SetInferDtypeFn(std::forward(func)); return *this; } - -/////////////////////// Op register API ///////////////////////// - -void RegisterAllCustomOperator() { - auto& op_meta_info_map = OpMetaInfoMap::Instance(); - framework::RegisterOperatorWithMetaInfoMap(op_meta_info_map); -} - -void LoadCustomOperatorLib(const std::string& dso_name) { - paddle::framework::LoadOpMetaInfoAndRegisterOp(dso_name); -} } // namespace paddle #ifdef __cplusplus From eba7b584fed4a1f55eb9422b390f2d90b8b505e1 Mon Sep 17 00:00:00 2001 From: pangyoki Date: Tue, 31 Jan 2023 12:02:38 +0800 Subject: [PATCH 20/89] change no_event GC to fast GC for xpu (#49871) --- .../garbage_collector/garbage_collector.cc | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/framework/new_executor/garbage_collector/garbage_collector.cc b/paddle/fluid/framework/new_executor/garbage_collector/garbage_collector.cc index 73e6664f66f1e..8ff8b9528322f 100644 --- a/paddle/fluid/framework/new_executor/garbage_collector/garbage_collector.cc +++ b/paddle/fluid/framework/new_executor/garbage_collector/garbage_collector.cc @@ -60,7 +60,16 @@ CreateInterpreterCoreGarbageCollector( return std::unique_ptr( new InterpreterCoreEventGarbageCollector(vec_instruction)); } - } else if (platform::is_xpu_place(place) || platform::is_ipu_place(place)) { + } else if (platform::is_xpu_place(place)) { + // Because there is no multi-stream on XPU device, fast GC can + // be used. + // Previously, XPU used no_event GC. But `Wait` in no_event GC + // may cause GC delayed, causing no enough memory problem. + // TODO(pangyoki): Multi-stream allocator and multi-stream GC + // are needed to be adapted for XPU. 
+ return std::unique_ptr( + new InterpreterCoreFastGarbageCollector()); + } else if (platform::is_ipu_place(place)) { return std::unique_ptr( new InterpreterCoreNoEventGarbageCollector()); } else { From 7122760a5506fa1844893c0c3e97a23376ff855b Mon Sep 17 00:00:00 2001 From: LoneRanger <836253168@qq.com> Date: Tue, 31 Jan 2023 12:06:19 +0800 Subject: [PATCH 21/89] [Fix deprecation warning in test] np.float => np.float64 (#49970) --- python/paddle/optimizer/lr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/optimizer/lr.py b/python/paddle/optimizer/lr.py index 258e69978a2ec..07420be8915d3 100644 --- a/python/paddle/optimizer/lr.py +++ b/python/paddle/optimizer/lr.py @@ -1406,7 +1406,7 @@ def step(self, metrics, epoch=None): metrics, (int, float, numpy.float32, numpy.float64) ): raise TypeError( - "metrics must be 'int', 'float', 'np.float', 'numpy.ndarray' or 'paddle.Tensor', but receive {}".format( + "metrics must be 'int', 'float', 'np.float64', 'numpy.ndarray' or 'paddle.Tensor', but receive {}".format( type(metrics) ) ) From 754ab7050282606b861fa27291561c3de12af2fd Mon Sep 17 00:00:00 2001 From: TeFeng Chen Date: Tue, 31 Jan 2023 12:38:07 +0800 Subject: [PATCH 22/89] support inplaced variable in cinn_launch (#49912) * support inplaced variable in cinn_launch * fix error hint when compiling * fix inplaced output variable of the subgraph * skip CinnCompiler check * using existed definition * fix namespace reference error * modify error message * update cinn tage * fix namespace * skip enforce check * fix unittest attribute throw --- cmake/external/cinn.cmake | 4 +- .../framework/paddle2cinn/cinn_compiler.cc | 10 ++- .../operators/cinn/cinn_instruction_run_op.h | 3 +- .../operators/cinn/cinn_launch_context.cc | 82 ++++++++++--------- .../operators/cinn/cinn_launch_context.h | 11 ++- .../cinn/cinn_launch_context_test.cc | 2 + paddle/fluid/operators/cinn/test_helper.h | 2 + 7 files changed, 69 insertions(+), 45 deletions(-) diff --git a/cmake/external/cinn.cmake b/cmake/external/cinn.cmake index 742219b53f19e..3ec194a6bfb37 100644 --- a/cmake/external/cinn.cmake +++ b/cmake/external/cinn.cmake @@ -17,8 +17,8 @@ if(NOT WITH_CINN) endif() if(NOT CINN_GIT_TAG) - # 2023.01.12 commit - set(CINN_GIT_TAG 5d1ae0f4b8e3f7cd5b16dfc76d2161bf77e938ac) + # 2023.01.28 commit + set(CINN_GIT_TAG 1449890f7724babf2a343c6f8073bd28a7bbc683) endif() message(STATUS "CINN version: " ${CINN_GIT_TAG}) diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc index 899d4177271c6..c01624a554961 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc @@ -26,6 +26,7 @@ #include "cinn/auto_schedule/tuning.h" #include "cinn/common/target.h" #include "cinn/common/type.h" +#include "cinn/frontend/op_mapper_registry.h" #include "cinn/frontend/optimize.h" #include "cinn/frontend/syntax.h" #include "cinn/hlir/framework/graph.h" @@ -54,6 +55,7 @@ namespace paddle2cinn { using ::cinn::auto_schedule::AutoTuner; using ::cinn::common::Target; using ::cinn::frontend::Optimize; +using ::cinn::frontend::paddle::InplaceOutSuffix; using ::cinn::hlir::framework::BuildScope; using ::cinn::hlir::framework::GraphCompiler; using inference::analysis::Dot; @@ -239,11 +241,17 @@ void CinnCompiler::CheckCompiledValid( const std::map &input_tensors, const CinnCompiledObject &compiled_obj) const { const auto &input_var_names = graph.Get>(kInputVars); + const auto &inplace_var_names = + 
graph.Get>(kInplaceVarNames); const auto &output_var_names = graph.Get>(kOutputVars); auto *launch_context = compiled_obj.launch_context.get(); // 1. check all of the output variables will be assigned by compiled program - for (auto &&var_name : output_var_names) { + for (auto var_name : output_var_names) { + // inplace variables are renamed with a specified suffix + if (inplace_var_names.count(var_name)) { + var_name += InplaceOutSuffix; + } PADDLE_ENFORCE_EQ(launch_context->IsVariableUsed(var_name), true, platform::errors::PreconditionNotMet( diff --git a/paddle/fluid/operators/cinn/cinn_instruction_run_op.h b/paddle/fluid/operators/cinn/cinn_instruction_run_op.h index d0011eec0d68f..f9d0002883ae0 100644 --- a/paddle/fluid/operators/cinn/cinn_instruction_run_op.h +++ b/paddle/fluid/operators/cinn/cinn_instruction_run_op.h @@ -59,7 +59,8 @@ class CinnInstructionRunOpKernel : public framework::OpKernel { auto share_argument_buffer_fn = [launch_context, &ctx](const std::string& var_name) { cinn_buffer_t* buffer = launch_context->GetCinnBufferOfVar(var_name); - framework::Variable* var = ctx.scope().GetVar(var_name); + std::string revise_var_name = launch_context->RedirectVarName(var_name); + framework::Variable* var = ctx.scope().GetVar(revise_var_name); auto* tensor = var->template GetMutable(); buffer->memory = reinterpret_cast(tensor->mutable_data( ctx.GetPlace(), diff --git a/paddle/fluid/operators/cinn/cinn_launch_context.cc b/paddle/fluid/operators/cinn/cinn_launch_context.cc index 2aa8c1b8b89ba..af429e0f01e33 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_context.cc +++ b/paddle/fluid/operators/cinn/cinn_launch_context.cc @@ -19,6 +19,7 @@ #include #include +#include "cinn/frontend/op_mapper_registry.h" #include "cinn/hlir/framework/graph_compiler.h" #include "cinn/hlir/framework/instruction.h" #include "cinn/hlir/framework/scope.h" @@ -50,6 +51,8 @@ using framework::ParallelExecutor; using framework::Scope; using CinnInstruction = ::cinn::hlir::framework::Instruction; using CinnRuntimeProgram = ::cinn::hlir::framework::Program; +using ::cinn::frontend::paddle::InplaceOutSuffix; +using framework::paddle2cinn::kInplaceVarNames; using framework::paddle2cinn::kMemOptVarInfoFromMainGraph; using framework::paddle2cinn::kSkipGcVarNames; using framework::paddle2cinn::Name2VarInfoMap; @@ -72,6 +75,8 @@ CinnLaunchContext::CinnLaunchContext(const framework::ir::Graph& graph, graph.Get>(framework::paddle2cinn::kInputVars); const auto& output_var_names = graph.Get>(framework::paddle2cinn::kOutputVars); + inplace_var_names_ = + graph.Get>(kInplaceVarNames); internal_var_names_ = ExtractInternalVarNames(input_var_names, output_var_names); // initialize all execution arguments @@ -83,7 +88,13 @@ CinnLaunchContext::CinnLaunchContext(const framework::ir::Graph& graph, } } for (auto&& var_name : output_var_names) { - AssignExternalVariable(var_name); + if (inplace_var_names_.count(var_name)) { + VLOG(4) << "Inplaced variable:" << var_name << " -> " + << var_name + InplaceOutSuffix << " as paddle2cinn varmap key"; + AssignExternalVariable(var_name + InplaceOutSuffix); + } else { + AssignExternalVariable(var_name); + } } for (auto&& var_name : internal_var_names_) { AssignInternalVariable(var_name); @@ -124,14 +135,13 @@ CinnLaunchContext::CinnLaunchContext(const framework::ir::Graph& graph, "Distribution of variables in the graph compiled:" "input[%lu],internal[%lu],output[%lu]," "outer_eager_deletion[%lu],skip_eager_deletion[%lu]," - "skip_gc_vars_[%lu],initialized_beforehand[%lu]", + 
"skip_gc_vars_[%lu]", input_var_names.size(), internal_var_names_.size(), output_var_names.size(), outer_varinfo.size(), skip_eager_vars_.size(), - skip_gc_vars_.size(), - initialized_beforehand_vars_.size()); + skip_gc_vars_.size()); } void CinnLaunchContext::BuildVarNameMap( @@ -214,8 +224,12 @@ std::unordered_set CinnLaunchContext::ExtractInternalVarNames( [](const auto& name_pair) { return name_pair.first; }); // exclude the input variables and output variables - auto exclude_names_fn = [&remain_var_names](const std::string& var_name) { + auto exclude_names_fn = [this, + &remain_var_names](const std::string& var_name) { remain_var_names.erase(var_name); + if (inplace_var_names_.count(var_name)) { + remain_var_names.erase(var_name + InplaceOutSuffix); + } }; std::for_each( input_var_names.begin(), input_var_names.end(), exclude_names_fn); @@ -281,11 +295,12 @@ void CinnLaunchContext::AssignExternalVariable(const std::string& var_name) { platform::errors::InvalidArgument( "Variable(%s) not applied in cinn", var_name)); auto* cinn_buffer = GetCinnBufferOfVar(var_name); + std::string revise_var_name = RedirectVarName(var_name); // assign external malloc/free callbacks of cinn_buffer_t cinn_buffer->external_malloc = new std::function( - [this, var_name](void* ctx, cinn_buffer_t* buffer) { - auto* tensor = - cached_scope_->GetVar(var_name)->GetMutable(); + [this, revise_var_name](void* ctx, cinn_buffer_t* buffer) { + auto* tensor = cached_scope_->GetVar(revise_var_name) + ->GetMutable(); tensor->Resize(framework::DDim(buffer->dims, buffer->dimensions)); buffer->memory = reinterpret_cast(tensor->mutable_data( *cached_place_, @@ -307,11 +322,12 @@ void CinnLaunchContext::AssignInternalVariable(const std::string& var_name) { platform::errors::InvalidArgument( "Variable(%s) not applied in cinn", var_name)); auto* cinn_buffer = GetCinnBufferOfVar(var_name); + std::string revise_var_name = RedirectVarName(var_name); // assign external malloc/free callbacks of cinn_buffer_t cinn_buffer->external_malloc = new std::function( - [this, var_name](void* ctx, cinn_buffer_t* buffer) { - auto* tensor = - cached_temp_scope_->Var(var_name)->GetMutable(); + [this, revise_var_name](void* ctx, cinn_buffer_t* buffer) { + auto* tensor = cached_temp_scope_->Var(revise_var_name) + ->GetMutable(); tensor->Resize(framework::DDim(buffer->dims, buffer->dimensions)); buffer->memory = reinterpret_cast(tensor->mutable_data( *cached_place_, @@ -322,8 +338,8 @@ void CinnLaunchContext::AssignInternalVariable(const std::string& var_name) { // internal variables should release its buffer immediately // if no instruction use it cinn_buffer->external_free = new std::function( - [this, var_name](void* ctx, cinn_buffer_t* buffer) { - auto* tensor = cached_temp_scope_->GetVar(var_name) + [this, revise_var_name](void* ctx, cinn_buffer_t* buffer) { + auto* tensor = cached_temp_scope_->GetVar(revise_var_name) ->GetMutable(); tensor->clear(); return 0; @@ -359,7 +375,6 @@ std::unique_ptr CinnLaunchContext::BuildCompiledProgram( // are set by values of the corresponding compiled tensors, // including the in/out variables where the equiality between their tensors // and the CINN compiled ones is verified in corresponding cinn_launch_op. 
- std::unordered_set has_refer_vars; for (auto&& arg : cinn_argument_names_) { const std::string& var_name = cinn2paddle_varmap_.at(arg); framework::VarDesc* var_desc = block->Var(var_name); @@ -370,7 +385,6 @@ std::unique_ptr CinnLaunchContext::BuildCompiledProgram( auto* ori_desc = res->second; var_desc->SetPersistable(ori_desc->Persistable()); var_desc->SetIsParameter(ori_desc->IsParameter()); - has_refer_vars.insert(var_name); } auto cinn_tensor = GetCinnTensorOfVar(var_name); @@ -404,13 +418,6 @@ std::unique_ptr CinnLaunchContext::BuildCompiledProgram( auto* ins = instructions.at(ins_idx).get(); auto in_args = trans_and_pack_args_fn(ins->GetInArgs()); auto out_args = trans_and_pack_args_fn(ins->GetOutArgs()); - for (auto&& var_name : in_args) { - if (!has_refer_vars.count(var_name)) { - initialized_beforehand_vars_.emplace_back(var_name); - } - } - has_refer_vars.insert(out_args.begin(), out_args.end()); - auto* op_desc = block->AppendOp(); op_desc->SetType("cinn_instruction_run"); op_desc->SetInput(kX, in_args); @@ -453,14 +460,6 @@ ParallelExecutor* CinnLaunchContext::InitializePE(const platform::Place& place, framework::proto::VarType::LOD_TENSOR); } - for (auto&& var_name : initialized_beforehand_vars_) { - auto* var = scope->GetVar(var_name); - auto* buffer = GetCinnBufferOfVar(var_name); - auto dim = framework::DDim(buffer->dims, buffer->dimensions); - var->GetMutable()->Resize(dim); - var->GetMutable()->mutable_data( - place, framework::paddle2cinn::TransToPaddleDataType(buffer->type)); - } return parallel_executor_.get(); } @@ -493,17 +492,24 @@ framework::InterpreterCore* CinnLaunchContext::InitializeInterpreterCore( } UpdateCapturedEnv(*scope, place); } - for (auto&& var_name : initialized_beforehand_vars_) { - auto* var = scope->GetVar(var_name); - auto* buffer = GetCinnBufferOfVar(var_name); - auto dim = framework::DDim(buffer->dims, buffer->dimensions); - var->GetMutable()->Resize(dim); - var->GetMutable()->mutable_data( - place, framework::paddle2cinn::TransToPaddleDataType(buffer->type)); - } return interpreter_core_.get(); } +std::string CinnLaunchContext::RedirectVarName(const std::string& var_name) { + auto pos = var_name.find(InplaceOutSuffix); + if (pos == std::string::npos) { + return var_name; + } + std::string remove_suffix_name = var_name.substr(0, pos); + if (!inplace_var_names_.count(remove_suffix_name)) { + LOG(WARNING) << "Variable:" << remove_suffix_name + << " was not marked as inplaced by Paddle, but CINN does"; + } + VLOG(4) << "Inplaced variable:" << var_name << " redirect to " + << remove_suffix_name; + return remove_suffix_name; +} + cinn_buffer_t* CinnLaunchContext::GetCinnBufferOfVar( const std::string& var_name) { auto res = paddle2argument_.find(var_name); diff --git a/paddle/fluid/operators/cinn/cinn_launch_context.h b/paddle/fluid/operators/cinn/cinn_launch_context.h index f4794e6335bb6..97016cc7f56f3 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_context.h +++ b/paddle/fluid/operators/cinn/cinn_launch_context.h @@ -96,6 +96,9 @@ class CinnLaunchContext { return skip_eager_vars_; } + // Redirect the name of a Paddle variable to the orignal if it was inplaced + std::string RedirectVarName(const std::string& var_name); + // Return internal variable names list const std::unordered_set& GetInternalVarNames() const { return internal_var_names_; @@ -151,11 +154,13 @@ class CinnLaunchContext { std::unordered_map cinn2paddle_varmap_; // a list of internal variable names in Paddle std::unordered_set internal_var_names_; + // In CINN, there are two 
variables(in/out) mapped to the one inplaced + // variable of Paddle. To resovle this conflict, we add a output counterpart + // in Paddle with the name suffixed by @InplaceOut. + // This set stores which Paddle variable names are inplaced. + std::unordered_set inplace_var_names_; // the names of the cinn arguments used in compiled executable program std::unordered_set cinn_argument_names_; - // TODO(CtfGo): remove this list after fixing batch_norm bug - // due to duplicate association in the same variable. - std::vector initialized_beforehand_vars_; // the variable scope compiled from cinn const std::shared_ptr cinn_scope_; diff --git a/paddle/fluid/operators/cinn/cinn_launch_context_test.cc b/paddle/fluid/operators/cinn/cinn_launch_context_test.cc index 2b4bc9acf1284..c362650c15d71 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_context_test.cc +++ b/paddle/fluid/operators/cinn/cinn_launch_context_test.cc @@ -90,6 +90,8 @@ const Graph& InitDefaultSubgraph() { new std::vector({"var5"})); graph->GetOrInit( framework::paddle2cinn::kMemOptVarInfoFromMainGraph); + graph->GetOrInit>( + framework::paddle2cinn::kInplaceVarNames); }); return *graph.get(); } diff --git a/paddle/fluid/operators/cinn/test_helper.h b/paddle/fluid/operators/cinn/test_helper.h index d35996771b4d9..040a185810136 100644 --- a/paddle/fluid/operators/cinn/test_helper.h +++ b/paddle/fluid/operators/cinn/test_helper.h @@ -84,6 +84,8 @@ std::unique_ptr CreateOnlyElementwiseAddGraph( new std::vector({out_name})); g->GetOrInit( framework::paddle2cinn::kMemOptVarInfoFromMainGraph); + g->GetOrInit>( + framework::paddle2cinn::kInplaceVarNames); return g; } From e24745957735baf64d6d03230e86b304da11fcbf Mon Sep 17 00:00:00 2001 From: Leo Guo <58431564+ZibinGuo@users.noreply.github.com> Date: Tue, 31 Jan 2023 12:55:24 +0800 Subject: [PATCH 23/89] [XPU] Add unitest for set_value_grad. (#50049) --- .../unittests/xpu/test_set_value_op_xpu.py | 563 +++++++++++++++++- 1 file changed, 558 insertions(+), 5 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/xpu/test_set_value_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_set_value_op_xpu.py index cb9bacb48d7c0..72bb45da7ec9d 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_set_value_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_set_value_op_xpu.py @@ -16,12 +16,10 @@ import sys import unittest +from functools import reduce import numpy as np -# from functools import reduce - - sys.path.append("../") from op_test_xpu import XPUOpTest from xpu.get_test_cover_info import ( @@ -31,8 +29,7 @@ ) import paddle - -# from paddle.fluid.layer_helper import LayerHelper +from paddle.fluid.layer_helper import LayerHelper class XPUTestSetValueOp(XPUOpTestWrapper): @@ -927,6 +924,562 @@ def test_error(self): self._bool_tensor_error() self._broadcast_mismatch() + # 5. 
Test backward + class XPUTestBackward(XPUOpTest): + def setUp(self): + self.__class__.op_type = "set_value" + self.__class__.no_need_check_grad = True + self.place = paddle.XPUPlace(0) + + def test_static(self): + paddle.enable_static() + main_program = paddle.static.Program() + startup_program = paddle.static.Program() + + x_np = np.random.random(size=(4, 4)).astype('float32') + y_np = np.random.random(size=(4, 4)).astype('float32') + label_np = np.random.randint(2, size=(4, 1)).astype('int64') + + with paddle.static.program_guard(main_program, startup_program): + x = paddle.static.data(name="x", shape=[4, 4], dtype='float32') + y = paddle.static.data(name="y", shape=[4, 4], dtype='float32') + + label = paddle.static.data( + name="label", shape=[4, 1], dtype='int64' + ) + + z = paddle.add(x, y) + var = y[0, :] + z[0, :] = var + + prediction = paddle.static.nn.fc( + x=z, size=2, activation='softmax' + ) + + cost = paddle.nn.functional.cross_entropy( + input=prediction, label=label + ) + loss = paddle.mean(cost) + sgd = paddle.optimizer.SGD(learning_rate=0.01) + sgd.minimize(loss) + + exe = paddle.static.Executor(self.place) + exe.run(startup_program) + + var_grad, z_grad = exe.run( + main_program, + feed={"x": x_np, "y": y_np, "label": label_np}, + fetch_list=[var.name + "@GRAD", z.name + "@GRAD"], + ) + + self.assertTrue((var_grad == z_grad[0, :]).all()) + paddle.disable_static() + + class XPUTestGradientTruncated(XPUOpTest): + def setUp(self): + self.__class__.op_type = "set_value" + self.__class__.no_need_check_grad = True + self.place = paddle.XPUPlace(0) + + def test_consistent_with_competitor(self): + paddle.disable_static() + + def set_value(t, value): + a = t * t + a[0, 1] = value + y = a * a + return y.sum() + + # case 1 + array = np.arange(1, 1 + 2 * 3 * 4, dtype="float32").reshape( + [1, 2, 1, 3, 1, 4] + ) + value = np.arange(100, 104, dtype="float32").reshape(1, 4) + + inps = paddle.to_tensor(array, stop_gradient=False) + value = paddle.to_tensor(value, stop_gradient=False) + + loss = set_value(inps, value) + loss.backward() + + value_grad = np.array([[600.0, 606.0, 612.0, 618.0]]) + input_grad = np.array( + [ + [ + [ + [ + [[4.0, 32.0, 108.0, 256.0]], + [[500.0, 864.0, 1372.0, 2048.0]], + [[2916.0, 4000.0, 5324.0, 6912.0]], + ] + ], + [ + [ + [[0.0, 0.0, 0.0, 0.0]], + [[0.0, 0.0, 0.0, 0.0]], + [[0.0, 0.0, 0.0, 0.0]], + ] + ], + ] + ] + ) + np.testing.assert_array_equal( + inps.grad.numpy(), + input_grad, + err_msg='The gradient of value should be \n{},\n but reveived {}'.format( + input_grad, inps.grad.numpy() + ), + ) + np.testing.assert_array_equal( + value.grad.numpy(), + value_grad, + err_msg='The gradient of input should be \n{},\n but reveived {}'.format( + value_grad, value.grad.numpy() + ), + ) + + # case 2 + array = np.arange(1, 2 * 3 * 4 + 1, dtype="float32").reshape( + [4, 2, 3] + ) + value = np.arange(100, 100 + 1, dtype="float32") + + inps2 = paddle.to_tensor(array, stop_gradient=False) + value2 = paddle.to_tensor(value, stop_gradient=False) + + loss = set_value(inps2, value2) + loss.backward() + + value_grad2 = np.array([600.0]) + input_grad2 = np.array( + [ + [[4.0, 32.0, 108.0], [0.0, 0.0, 0.0]], + [[1372.0, 2048.0, 2916.0], [4000.0, 5324.0, 6912.0]], + [[8788.0, 10976.0, 13500.0], [16384.0, 19652.0, 23328.0]], + [[27436.0, 32000.0, 37044.0], [42592.0, 48668.0, 55296.0]], + ] + ) + np.testing.assert_array_equal( + inps2.grad.numpy(), + input_grad2, + err_msg='The gradient of value should be \n{},\n but reveived {}'.format( + input_grad, inps2.grad.numpy() + ), 
+ ) + np.testing.assert_array_equal( + value2.grad.numpy(), + value_grad2, + err_msg='The gradient of input should be \n{},\n but reveived {}'.format( + value_grad, value2.grad.numpy() + ), + ) + + # case 3 + def set_value3(t, value): + a = t * t + a[0, :, 0, :] = value + y = a * a + return y.sum() + + array = np.arange(1, 1 + 2 * 3 * 4, dtype="float32").reshape( + [4, 3, 1, 1, 2, 1] + ) + value = np.arange(100, 100 + 2, dtype="float32").reshape(1, 2, 1) + + inps = paddle.to_tensor(array, stop_gradient=False) + value = paddle.to_tensor(value, stop_gradient=False) + + loss = set_value3(inps, value) + loss.backward() + + value_grad = np.array([[[600.0], [606.0]]]) + input_grad = np.array( + [ + [ + [[[[0.0], [0.0]]]], + [[[[0.0], [0.0]]]], + [[[[0.0], [0.0]]]], + ], + [ + [[[[1372.0], [2048.0]]]], + [[[[2916.0], [4000.0]]]], + [[[[5324.0], [6912.0]]]], + ], + [ + [[[[8788.0], [10976.0]]]], + [[[[13500.0], [16384.0]]]], + [[[[19652.0], [23328.0]]]], + ], + [ + [[[[27436.0], [32000.0]]]], + [[[[37044.0], [42592.0]]]], + [[[[48668.0], [55296.0]]]], + ], + ] + ) + np.testing.assert_array_equal( + inps.grad.numpy(), + input_grad, + err_msg='The gradient of value should be \n{},\n but reveived {}'.format( + input_grad, inps.grad.numpy() + ), + ) + np.testing.assert_array_equal( + value.grad.numpy(), + value_grad, + err_msg='The gradient of input should be \n{},\n but reveived {}'.format( + value_grad, value.grad.numpy() + ), + ) + + # case 4: step >0 + def set_value4(t, value): + a = t * t + a[0, :, 0, ::3] = value + y = a * a + return y.sum() + + array = np.arange(1, 1 + 2 * 3 * 4, dtype="float32").reshape( + [2, 3, 1, 4, 1] + ) + value = np.arange(100, 100 + 2, dtype="float32").reshape(1, 2, 1) + + inps = paddle.to_tensor(array, stop_gradient=False) + value = paddle.to_tensor(value, stop_gradient=False) + + loss = set_value4(inps, value) + loss.backward() + + value_grad = np.array([[[600.0], [606.0]]]) + input_grad = np.array( + [ + [ + [[[0.0], [32.0], [108.0], [0.0]]], + [[[0.0], [864.0], [1372.0], [0.0]]], + [[[0.0], [4000.0], [5324.0], [0.0]]], + ], + [ + [[[8788.0], [10976.0], [13500.0], [16384.0]]], + [[[19652.0], [23328.0], [27436.0], [32000.0]]], + [[[37044.0], [42592.0], [48668.0], [55296.0]]], + ], + ] + ) + np.testing.assert_array_equal( + inps.grad.numpy(), + input_grad, + err_msg='The gradient of value should be \n{},\n but reveived {}'.format( + input_grad, inps.grad.numpy() + ), + ) + np.testing.assert_array_equal( + value.grad.numpy(), + value_grad, + err_msg='The gradient of input should be \n{},\n but reveived {}'.format( + value_grad, value.grad.numpy() + ), + ) + + # case 5:a[0].shape==value.shape + def set_value5(t, value): + a = t * t + a[0] = value + y = a * a + return y.sum() + + array = np.arange(1, 1 + 2 * 3 * 4, dtype="float32").reshape( + [2, 3, 4] + ) + value = np.arange(100, 100 + 12, dtype="float32").reshape(3, 4) + + inps = paddle.to_tensor(array, stop_gradient=False) + value = paddle.to_tensor(value, stop_gradient=False) + + loss = set_value5(inps, value) + loss.backward() + + value_grad = np.array( + [ + [200.0, 202.0, 204.0, 206.0], + [208.0, 210.0, 212.0, 214.0], + [216.0, 218.0, 220.0, 222.0], + ] + ) + input_grad = np.array( + [ + [ + [0.0, 0.0, 0.0, 0.0], + [0.0, 0.0, 0.0, 0.0], + [0.0, 0.0, 0.0, 0.0], + ], + [ + [8788.0, 10976.0, 13500.0, 16384.0], + [19652.0, 23328.0, 27436.0, 32000.0], + [37044.0, 42592.0, 48668.0, 55296.0], + ], + ] + ) + np.testing.assert_array_equal( + inps.grad.numpy(), + input_grad, + err_msg='The gradient of value should be \n{},\n 
but reveived {}'.format( + input_grad, inps.grad.numpy() + ), + ) + np.testing.assert_array_equal( + value.grad.numpy(), + value_grad, + err_msg='The gradient of input should be \n{},\n but reveived {}'.format( + value_grad, value.grad.numpy() + ), + ) + + # case 6: pass stop_gradient from value to x + x = paddle.zeros([8, 8], dtype='float32') + value = paddle.to_tensor([10], dtype='float32', stop_gradient=False) + + self.assertTrue(x.stop_gradient) + self.assertTrue(x.is_leaf) + + x[0, :] = value + + self.assertTrue(not x.stop_gradient) + self.assertTrue(not x.is_leaf) + + def test_static_graph(self): + paddle.enable_static() + + to_string = lambda x, i: x + '_' + str(i) + numel = lambda input_shape: reduce(lambda x, y: x * y, input_shape) + + def op1(x): + value = paddle.fluid.layers.fill_constant([1], "float32", 1) + # test stop_gradient + value.stop_gradient = True + x.stop_gradient = False + start = paddle.fluid.layers.fill_constant( + [1], "int32", 5, force_cpu=True + ) + end = paddle.fluid.layers.fill_constant( + [1], "int32", 0, force_cpu=True + ) + step = paddle.fluid.layers.fill_constant( + [1], "int32", -2, force_cpu=True + ) + + inputs = { + 'Input': x, + 'ValueTensor': value, + 'StartsTensorList': [ + start, + ], + 'EndsTensorList': [ + end, + ], + 'StepsTensorList': [ + step, + ], + } + + helper = LayerHelper("set_value") + y = helper.create_variable_for_type_inference(dtype=x.dtype) + + helper.append_op( + type="set_value", + inputs=inputs, + outputs={'Out': y}, + attrs={'axes': [0]}, + ) + + return y, value + + def op2(x): + value = paddle.fluid.layers.fill_constant( + [1, 3, 2], "float32", 1 + ) + # test stop_gradient + value.stop_gradient = False + x.stop_gradient = False + attrs = { + 'axes': [0], + 'starts': [6], + 'ends': [0], + 'steps': [-4], + 'decrease_axes': [], + 'none_axes': [], + 'dtype': paddle.float32, + } + inputs = {'Input': x, 'ValueTensor': value} + + helper = LayerHelper("set_value") + y = helper.create_variable_for_type_inference(dtype=x.dtype) + + helper.append_op( + type="set_value", + inputs=inputs, + outputs={'Out': y}, + attrs=attrs, + ) + + return y, value + + def op3(x): + value = paddle.fluid.layers.fill_constant([1], "float32", 1) + x.stop_gradient = True + value.stop_gradient = False + start = paddle.fluid.layers.fill_constant( + [1], "int32", 0, force_cpu=True + ) + end = paddle.fluid.layers.fill_constant( + [1], "int32", 5, force_cpu=True + ) + step = paddle.fluid.layers.fill_constant( + [1], "int32", 3, force_cpu=True + ) + + inputs = { + 'Input': x, + 'ValueTensor': value, + 'StartsTensorList': [ + start, + ], + 'EndsTensorList': [ + end, + ], + 'StepsTensorList': [ + step, + ], + } + + helper = LayerHelper("set_value") + y = helper.create_variable_for_type_inference(dtype=x.dtype) + + helper.append_op( + type="set_value", + inputs=inputs, + outputs={'Out': y}, + attrs={'axes': [0]}, + ) + + return y, value + + def set_value(array, i, op): + name_x = to_string('x', i) + x = paddle.static.data( + name=name_x, shape=array.shape, dtype='float32' + ) + + # set_value_op in __get/setitem__ is an inplace operation. + # When `input.stop_gradient = True` and `value.stop_gradient = False`, + # set_value_grad_op will not be run during backward. 
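[Editorial illustration, not part of the patch] The gradient-truncation cases above all check the same chain-rule behaviour: writing a tensor into a slice of an intermediate result cuts the gradient of the overwritten elements and routes it to the written value instead. A minimal dygraph sketch of what these cases verify (run on the default place rather than an XPUPlace; shapes and constants are chosen only for illustration):

    import numpy as np
    import paddle

    t = paddle.to_tensor(
        np.arange(1.0, 7.0, dtype="float32").reshape([2, 3]), stop_gradient=False
    )
    v = paddle.to_tensor(np.full([3], 100.0, dtype="float32"), stop_gradient=False)

    a = t * t            # a = [[1, 4, 9], [16, 25, 36]]
    a[0] = v             # in-place set_value on the intermediate tensor
    loss = (a * a).sum()
    loss.backward()

    print(v.grad.numpy())     # [200. 200. 200.]  -> 2 * a[0] after the write
    print(t.grad.numpy()[0])  # [0. 0. 0.]        -> the overwritten row gets no gradient
    print(t.grad.numpy()[1])  # [256. 500. 864.]  -> 4 * t**3 for t = 4, 5, 6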
+ y, value = op(x) + y2 = y + 1 + loss = paddle.sum(y2) + sgd = paddle.optimizer.Adam() + sgd.minimize(loss) + place = self.place + + prog = paddle.static.default_main_program() + exe = paddle.static.Executor(place) + exe.run(paddle.static.default_startup_program()) + fetch_list = [] + if not x.stop_gradient: + fetch_list.append(x.grad_name) + if not value.stop_gradient: + fetch_list.append(value.grad_name) + out = exe.run(prog, feed={x.name: array}, fetch_list=fetch_list) + return out + + input_shape = [7, 6, 5, 4, 3, 2] + + array = np.arange(0, numel(input_shape), dtype="float32").reshape( + input_shape + ) + + for i in range(len(input_shape)): + program = paddle.static.Program() + with paddle.static.program_guard(program): + out1 = set_value(array, i, op1) + self.assertTrue((out1[0][5:0:-2] == 0).all()) + + if len(array.shape) > 2: + program2 = paddle.static.Program() + with paddle.static.program_guard(program2): + out2 = set_value(array, i, op2) + self.assertTrue((out2[0][6:0:-4] == 0).all()) + + program3 = paddle.static.Program() + with paddle.static.program_guard(program3): + out3 = set_value(array, i, op3) + self.assertTrue( + (numel(out1[0][0:5:3].shape) == out3[0]).all() + ) + + array = array[0] + paddle.disable_static() + + class XPUTestSetValueInplace(XPUOpTest): + def setUp(self): + self.__class__.op_type = "set_value" + self.__class__.no_need_check_grad = True + self.place = paddle.XPUPlace(0) + + def test_inplace(self): + paddle.disable_static() + with paddle.fluid.dygraph.guard(): + paddle.seed(100) + a = paddle.rand(shape=[1, 4]) + a.stop_gradient = False + b = a[:] + c = b + b[paddle.to_tensor(0)] = 1.0 + + self.assertTrue(id(b) == id(c)) + np.testing.assert_array_equal(b.numpy(), c.numpy()) + self.assertEqual(b.inplace_version, 0) + + paddle.enable_static() + + class XPUTestSetValueInplaceLeafVar(XPUOpTest): + def setUp(self): + self.__class__.op_type = "set_value" + self.__class__.no_need_check_grad = True + self.place = paddle.XPUPlace(0) + + def test_inplace_var_become_leaf_var(self): + paddle.disable_static() + + a_grad_1, b_grad_1, a_grad_2, b_grad_2 = 0, 1, 2, 3 + with paddle.fluid.dygraph.guard(): + paddle.seed(100) + a = paddle.rand(shape=[1, 4]) + b = paddle.rand(shape=[1, 4]) + a.stop_gradient = False + b.stop_gradient = False + c = a / b + c.sum().backward() + a_grad_1 = a.grad.numpy() + b_grad_1 = b.grad.numpy() + + with paddle.fluid.dygraph.guard(): + paddle.seed(100) + a = paddle.rand(shape=[1, 4]) + b = paddle.rand(shape=[1, 4]) + a.stop_gradient = False + b.stop_gradient = False + c = a / b + d = paddle.zeros((4, 4)) + self.assertTrue(d.stop_gradient) + d[0, :] = c + self.assertFalse(d.stop_gradient) + d[0, :].sum().backward() + a_grad_2 = a.grad.numpy() + b_grad_2 = b.grad.numpy() + + np.testing.assert_array_equal(a_grad_1, a_grad_2) + np.testing.assert_array_equal(b_grad_1, b_grad_2) + paddle.enable_static() + support_types = get_xpu_op_support_types('set_value') for stype in support_types: From 2e156ac8e9b7ab58580b2b60360bfc59f4ea2e39 Mon Sep 17 00:00:00 2001 From: xiaoting <31891223+tink2123@users.noreply.github.com> Date: Tue, 31 Jan 2023 13:15:28 +0800 Subject: [PATCH 24/89] support 0d tensor for interpolate (#49929) * support 0d tensor for interpolate * support 0d tensor for interpolate * add xpu unittest for interp * update unittest for interpolate * fix coverage * fix code style * fix for coverage * fix coverage --- paddle/phi/infermeta/multiary.cc | 46 ++++---- .../phi/kernels/funcs/interpolate_function.h | 14 +-- 
.../unittests/test_bilinear_interp_v2_op.py | 75 +++++++++++++ .../tests/unittests/test_zero_dim_tensor.py | 101 ++++++++++++++++++ .../unittests/xpu/test_zero_dim_tensor_xpu.py | 55 ++++++++++ python/paddle/nn/functional/common.py | 34 ++++-- 6 files changed, 291 insertions(+), 34 deletions(-) diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index 6b238209d4ac2..ef94266b4ebe1 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -1424,16 +1424,18 @@ static void Interpolate1DInferShapeCheck( if (scale_tensor) { auto scale_tensor_dim = scale_tensor.dims(); PADDLE_ENFORCE_EQ( - scale_tensor_dim.size(), - 1, + scale_tensor_dim.size() == 1 || scale_tensor_dim.size() == 0, + true, phi::errors::InvalidArgument( - "Scale's dimension size must be 1, but got dimension = %d .", + "Scale's dimension size must be 1 or 0, but got dimension = %d .", scale_tensor_dim.size())); - PADDLE_ENFORCE_EQ(scale_tensor_dim[0], - 1, - phi::errors::InvalidArgument( - "Scale's shape must be 1, but got shape = %d .", - scale_tensor_dim[0])); + if (scale_tensor_dim.size() == 1) { + PADDLE_ENFORCE_EQ(scale_tensor_dim[0], + 1, + phi::errors::InvalidArgument( + "Scale's shape must be 1, but got shape = %d .", + scale_tensor_dim[0])); + } out_w_tmp = -1; } else { if (scale.size() > 0) { @@ -1550,19 +1552,25 @@ static void Interpolate2DInferShapeCheck( } int out_h_tmp, out_w_tmp; + if (scale_tensor) { auto scale_tensor_dim = scale_tensor.dims(); PADDLE_ENFORCE_EQ( - scale_tensor_dim.size(), - 1, + scale_tensor_dim.size() == 1 || scale_tensor_dim.size() == 0, + true, phi::errors::InvalidArgument( - "Scale's dimension size must be 1, but got dimension = %d .", + "Scale's dimension size must be 1 or 0, but got dimension = %d .", scale_tensor_dim.size())); - PADDLE_ENFORCE_EQ(scale_tensor_dim[0] == 2 || scale_tensor_dim[0] == 1, - true, - phi::errors::InvalidArgument( - "Scale's shape must be 2 or 1, but got shape = %d .", - scale_tensor_dim[0])); + + if (scale_tensor_dim.size() == 1) { + PADDLE_ENFORCE_EQ( + scale_tensor_dim[0] == 2 || scale_tensor_dim[0] == 1, + true, + phi::errors::InvalidArgument( + "Scale's shape must be 2 or 1, but got shape = %d .", + scale_tensor_dim[0])); + } + out_h_tmp = -1; out_w_tmp = -1; } else { @@ -1695,10 +1703,10 @@ static void Interpolate3DInferShapeCheck( if (scale_tensor) { auto scale_tensor_dim = scale_tensor.dims(); PADDLE_ENFORCE_EQ( - scale_tensor_dim.size(), - 1, + scale_tensor_dim.size() == 1 || scale_tensor_dim.size() == 0, + true, phi::errors::InvalidArgument( - "Scale's dimension size must be 1, but got size = %d .", + "Scale's dimension size must be 1 or 0, but got size = %d .", scale_tensor_dim.size())); PADDLE_ENFORCE_EQ(scale_tensor_dim[0] == 3 || scale_tensor_dim[0] == 1, true, diff --git a/paddle/phi/kernels/funcs/interpolate_function.h b/paddle/phi/kernels/funcs/interpolate_function.h index 89b02317f3e95..53b0577fc29d7 100644 --- a/paddle/phi/kernels/funcs/interpolate_function.h +++ b/paddle/phi/kernels/funcs/interpolate_function.h @@ -85,12 +85,14 @@ inline std::vector get_new_shape( std::vector vec_new_shape; for (size_t i = 0; i < list_new_shape_tensor.size(); ++i) { auto tensor = list_new_shape_tensor[i]; - PADDLE_ENFORCE_EQ( - tensor->dims(), - phi::make_ddim({1}), - errors::InvalidArgument("The shape of dimension tensor should be [1]," - "but received d%.", - tensor->dims())); + PADDLE_ENFORCE_EQ(tensor->dims() == phi::make_ddim({1}) || + tensor->dims() == phi::make_ddim({}), + true, + 
errors::InvalidArgument( + "The shape of dimension tensor should be [1] or []," + "but received d%.", + tensor->dims())); + #ifdef PADDLE_WITH_XPU if (tensor->place().GetType() == phi::AllocationType::XPU) { DenseTensor temp; diff --git a/python/paddle/fluid/tests/unittests/test_bilinear_interp_v2_op.py b/python/paddle/fluid/tests/unittests/test_bilinear_interp_v2_op.py index ed7b1375e54aa..f274752c1c875 100755 --- a/python/paddle/fluid/tests/unittests/test_bilinear_interp_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_bilinear_interp_v2_op.py @@ -816,5 +816,80 @@ def test_main(self): np.testing.assert_allclose(x_g_np_1, x_g_np_2, atol=1e-2, rtol=1e-2) +class TestBilinearInterpOpAPI_0DTensorScale(unittest.TestCase): + def test_case(self): + import paddle + + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + else: + place = core.CPUPlace() + with fluid.dygraph.guard(place): + input_data = np.random.random((2, 3, 6, 6)).astype("float32") + input_x = paddle.to_tensor(input_data) + expect_res = bilinear_interp_np( + input_data, out_h=12, out_w=12, align_corners=False + ) + scale_0d = paddle.full([], 2) + out = interpolate( + x=input_x, + scale_factor=scale_0d, + mode="bilinear", + align_corners=False, + ) + np.testing.assert_allclose(out.numpy(), expect_res, rtol=1e-05) + + +class TestBilinearInterpOpAPI_0DTensorScale2(unittest.TestCase): + def test_case(self): + import paddle + + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + else: + place = core.CPUPlace() + with fluid.dygraph.guard(place): + input_data = np.random.random((2, 3, 6, 6)).astype("float32") + input_x = paddle.to_tensor(input_data) + expect_res = bilinear_interp_np( + input_data, out_h=12, out_w=12, align_corners=False + ) + scale_0d = [paddle.full([], 2), paddle.full([], 2)] + out = interpolate( + x=input_x, + scale_factor=scale_0d, + mode="bilinear", + align_corners=False, + ) + np.testing.assert_allclose(out.numpy(), expect_res, rtol=1e-05) + + +class TestBilinearInterpOpAPI_0DTensorOutSize(unittest.TestCase): + def test_case(self): + import paddle + + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + else: + place = core.CPUPlace() + with fluid.dygraph.guard(place): + input_data = np.random.random((2, 3, 6, 6)).astype("float32") + input_x = paddle.to_tensor(input_data) + expect_res = bilinear_interp_np( + input_data, out_h=12, out_w=12, align_corners=False + ) + output_size = [ + paddle.full([], 12, dtype="int32"), + paddle.full([], 12, dtype="int32"), + ] + out = interpolate( + x=input_x, + size=output_size, + mode="bilinear", + align_corners=False, + ) + np.testing.assert_allclose(out.numpy(), expect_res, rtol=1e-05) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_zero_dim_tensor.py b/python/paddle/fluid/tests/unittests/test_zero_dim_tensor.py index 11d85b52446b2..2d07ab31334df 100644 --- a/python/paddle/fluid/tests/unittests/test_zero_dim_tensor.py +++ b/python/paddle/fluid/tests/unittests/test_zero_dim_tensor.py @@ -1388,6 +1388,72 @@ def test_atan2(self): self.assertEqual(x1.grad.numpy(), 0.5) self.assertEqual(x2.grad.numpy(), 0) + def test_interpolate(self): + from paddle.nn.functional import interpolate + + input_x = paddle.rand([2, 3, 6, 6]) + input_x.stop_gradient = False + origin_result = interpolate( + x=input_x, size=[12, 12], mode="bilinear", align_corners=False + ) + + output_size = [ + paddle.full([], 12, dtype="int32"), + paddle.full([], 12, dtype="int32"), + ] + out1 = interpolate( + x=input_x, 
size=output_size, mode="bilinear", align_corners=False + ) + out1.backward() + + self.assertEqual(out1.shape, [2, 3, 12, 12]) + self.assertEqual(input_x.grad.shape, [2, 3, 6, 6]) + + scale_1 = [paddle.full([], 2), paddle.full([], 2)] + out2 = interpolate( + x=input_x, + scale_factor=scale_1, + mode="bilinear", + align_corners=False, + ) + out2.backward() + + self.assertEqual(out2.shape, [2, 3, 12, 12]) + self.assertEqual(input_x.grad.shape, [2, 3, 6, 6]) + + scale_2 = paddle.full([], 2) + out3 = interpolate( + x=input_x, + scale_factor=scale_2, + mode="bilinear", + align_corners=False, + ) + out3.backward() + + # for coverage + scale_3 = paddle.full([1], 2) + input_3d = paddle.rand([2, 3, 6]) + out4 = interpolate( + x=input_3d, + scale_factor=scale_3, + mode="LINEAR", + align_corners=False, + data_format="NCW", + ) + + self.assertEqual(out3.shape, [2, 3, 12, 12]) + self.assertEqual(input_x.grad.shape, [2, 3, 6, 6]) + + np.testing.assert_allclose( + origin_result.numpy(), out1.numpy(), rtol=1e-05 + ) + np.testing.assert_allclose( + origin_result.numpy(), out2.numpy(), rtol=1e-05 + ) + np.testing.assert_allclose( + origin_result.numpy(), out3.numpy(), rtol=1e-05 + ) + def test_maseked_select(self): x = paddle.rand([]) x.stop_gradient = False @@ -2223,6 +2289,41 @@ def test_atan2(self): self.assertEqual(res[0].shape, ()) + @prog_scope() + def test_interpolate(self): + from paddle.nn.functional import interpolate + + input_x = paddle.rand([2, 3, 6, 6]) + input_x.stop_gradient = False + + output_size = [ + paddle.full([], 12, dtype="int32"), + paddle.full([], 12, dtype="int32"), + ] + + out1 = interpolate( + x=input_x, size=output_size, mode="bilinear", align_corners=False + ) + paddle.static.append_backward(out1.sum()) + prog = paddle.static.default_main_program() + res1 = self.exe.run(prog, feed={}, fetch_list=[out1, input_x.grad_name]) + + scale_1 = paddle.full([], 2) + out2 = interpolate( + x=input_x, + scale_factor=scale_1, + mode="bilinear", + align_corners=False, + ) + paddle.static.append_backward(out2.sum()) + prog = paddle.static.default_main_program() + res2 = self.exe.run(prog, feed={}, fetch_list=[out2, input_x.grad_name]) + + self.assertEqual(res1[0].shape, (2, 3, 12, 12)) + self.assertEqual(res1[1].shape, (2, 3, 6, 6)) + self.assertEqual(res2[0].shape, (2, 3, 12, 12)) + self.assertEqual(res2[1].shape, (2, 3, 6, 6)) + @prog_scope() def test_maseked_select(self): x = paddle.rand([]) diff --git a/python/paddle/fluid/tests/unittests/xpu/test_zero_dim_tensor_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_zero_dim_tensor_xpu.py index c0597d0ad53ea..f6f64aefe9db7 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_zero_dim_tensor_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_zero_dim_tensor_xpu.py @@ -883,6 +883,61 @@ def test_allclose(self): y = paddle.full([], 0.6) self.assertFalse(paddle.allclose(x, y)) + def test_interpolate(self): + from paddle.nn.functional import interpolate + + input_x = paddle.rand([2, 3, 6, 6]) + input_x.stop_gradient = False + origin_result = interpolate( + x=input_x, size=[12, 12], mode="bilinear", align_corners=False + ) + + output_size = [ + paddle.full([], 12, dtype="int32"), + paddle.full([], 12, dtype="int32"), + ] + out1 = interpolate( + x=input_x, size=output_size, mode="bilinear", align_corners=False + ) + out1.backward() + + self.assertEqual(out1.shape, [2, 3, 12, 12]) + self.assertEqual(input_x.grad.shape, [2, 3, 6, 6]) + + scale_1 = [paddle.full([], 2), paddle.full([], 2)] + out2 = interpolate( + x=input_x, + 
scale_factor=scale_1, + mode="bilinear", + align_corners=False, + ) + out2.backward() + + self.assertEqual(out2.shape, [2, 3, 12, 12]) + self.assertEqual(input_x.grad.shape, [2, 3, 6, 6]) + + scale_2 = paddle.full([], 2) + out3 = interpolate( + x=input_x, + scale_factor=scale_2, + mode="bilinear", + align_corners=False, + ) + out3.backward() + + self.assertEqual(out3.shape, [2, 3, 12, 12]) + self.assertEqual(input_x.grad.shape, [2, 3, 6, 6]) + + np.testing.assert_allclose( + origin_result.numpy(), out1.numpy(), rtol=1e-05 + ) + np.testing.assert_allclose( + origin_result.numpy(), out2.numpy(), rtol=1e-05 + ) + np.testing.assert_allclose( + origin_result.numpy(), out3.numpy(), rtol=1e-05 + ) + def test_equalall(self): x = paddle.full([], 0.5) y = paddle.full([], 0.6) diff --git a/python/paddle/nn/functional/common.py b/python/paddle/nn/functional/common.py index d9f5b0b160dc0..57a1e0023d4fc 100644 --- a/python/paddle/nn/functional/common.py +++ b/python/paddle/nn/functional/common.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +import numpy + import paddle from paddle import _C_ops, _legacy_C_ops from paddle.fluid.layer_helper import LayerHelper @@ -102,6 +104,10 @@ def unfold(x, kernel_sizes, strides=1, paddings=0, dilations=1, name=None): y = F.unfold(x, [3, 3], 1, 1, 1) """ + helper = LayerHelper("unfold", **locals()) + + check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'unfold') + assert len(x.shape) == 4, "input should be the format of [N, C, H, W]" if isinstance(kernel_sizes, int): @@ -145,9 +151,6 @@ def unfold(x, kernel_sizes, strides=1, paddings=0, dilations=1, name=None): if in_dygraph_mode(): return _C_ops.unfold(x, kernel_sizes, strides, paddings, dilations) - helper = LayerHelper("unfold", **locals()) - - check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'unfold') out = helper.create_variable_for_type_inference(dtype=x.dtype) helper.append_op( type="unfold", @@ -432,9 +435,12 @@ def interpolate( ): if len(size) == 0: raise ValueError("output size can not be empty") + if size is None: + raise ValueError("output size can not be None in AREA mode") if len(x.shape) == 3: return paddle.nn.functional.adaptive_avg_pool1d(x, size) elif len(x.shape) == 4: + print("size :", size) return paddle.nn.functional.adaptive_avg_pool2d(x, size) elif len(x.shape) == 5: return paddle.nn.functional.adaptive_avg_pool3d(x, size) @@ -494,9 +500,10 @@ def _is_list_or_turple_(data): out_shape = list(out_shape.numpy()) else: out_shape = list(out_shape) + for i, dim in enumerate(out_shape): if isinstance(dim, Variable): - out_shape[i] = dim.numpy()[0] + out_shape[i] = dim.numpy().item() if not (_is_list_or_turple_(out_shape)): raise TypeError("size should be a list or tuple or Variable.") # Validate the shape @@ -568,11 +575,18 @@ def _is_list_or_turple_(data): else: if in_dynamic_mode() and isinstance(scale, Variable): - scale = list(scale.numpy()) + if scale.shape == []: + scale = float(scale) + else: + scale = list(scale.numpy()) if isinstance(scale, Variable): scale.stop_gradient = True inputs["Scale"] = scale - elif isinstance(scale, float) or isinstance(scale, int): + elif ( + isinstance(scale, float) + or isinstance(scale, int) + or isinstance(scale, numpy.ndarray) + ): if scale <= 0: raise ValueError("Attr(scale) should be greater than zero.") scale_list = [] @@ -2253,6 +2267,11 @@ def fold( # y.shape = [2,3,4,5] """ + + helper = LayerHelper("fold", **locals()) + + check_variable_and_dtype(x, 'x', 
['float32', 'float64'], 'fold') + assert len(x.shape) == 3, "input should be the format of [N, C, L]" def _is_list_or_turple_(data): @@ -2322,9 +2341,6 @@ def _is_list_or_turple_(data): dilations, ) else: - helper = LayerHelper("fold", **locals()) - - check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'fold') out = helper.create_variable_for_type_inference(dtype=x.dtype) helper.append_op( type="fold", From ce4637c1d9acba3356a3730d258aba121f3d79f8 Mon Sep 17 00:00:00 2001 From: 201716010711 <87008376+201716010711@users.noreply.github.com> Date: Mon, 30 Jan 2023 21:42:39 -0800 Subject: [PATCH 25/89] support fp16 squaredl2norm (#48315) --- .../gpu/squared_l2_norm_grad_kernel.cu | 38 +++++++++++++++++- .../phi/kernels/gpu/squared_l2_norm_kernel.cu | 29 ++++++++++++-- .../tests/unittests/test_gradient_clip.py | 6 +-- .../unittests/test_squared_l2_norm_op.py | 40 +++++++++++++++++++ python/paddle/nn/clip.py | 9 ++--- 5 files changed, 106 insertions(+), 16 deletions(-) mode change 100644 => 100755 python/paddle/fluid/tests/unittests/test_squared_l2_norm_op.py diff --git a/paddle/phi/kernels/gpu/squared_l2_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/squared_l2_norm_grad_kernel.cu index 908a7557d1b48..7fc355b51ac32 100644 --- a/paddle/phi/kernels/gpu/squared_l2_norm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/squared_l2_norm_grad_kernel.cu @@ -15,12 +15,46 @@ #include "paddle/phi/kernels/squared_l2_norm_grad_kernel.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/impl/squared_l2_norm_grad_kernel_impl.h" +#include "paddle/phi/kernels/funcs/broadcast_function.h" + +namespace phi { +/** + * x*y*2.0 + */ +template +struct DoubleMulFunctor { + __device__ __forceinline__ T operator()(const T a, const T b) const { + return b * a * static_cast(2.0f); + } +}; + +template +void SquaredL2NormGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& dout, + DenseTensor* dx) { + dev_ctx.template Alloc(dx); + + PADDLE_ENFORCE_EQ( + dout.numel(), + 1, + phi::errors::InvalidArgument( + "Input(GRAD@Out) of SquaredL2NormGradOP should be a scalar.")); + std::vector ins{&x, &dout}; + std::vector outs{dx}; + + funcs::BroadcastKernel( + dev_ctx, ins, &outs, -1, phi::DoubleMulFunctor()); +} +} // namespace phi PD_REGISTER_KERNEL(squared_l2_norm_grad, GPU, ALL_LAYOUT, phi::SquaredL2NormGradKernel, float, - double) {} + double, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/gpu/squared_l2_norm_kernel.cu b/paddle/phi/kernels/gpu/squared_l2_norm_kernel.cu index d585d209b42ca..81108145653e1 100644 --- a/paddle/phi/kernels/gpu/squared_l2_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/squared_l2_norm_kernel.cu @@ -15,9 +15,30 @@ #include "paddle/phi/kernels/squared_l2_norm_kernel.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/impl/squared_l2_norm_kernel_impl.h" - -PD_REGISTER_KERNEL( - squared_l2_norm, GPU, ALL_LAYOUT, phi::SquaredL2NormKernel, float, double) { +#include "paddle/phi/kernels/funcs/reduce_function.h" +namespace phi { +template +void SquaredL2NormKernel(const Context& dev_ctx, + const DenseTensor& x, + DenseTensor* out) { + dev_ctx.template Alloc(out); + std::vector origin_reduce_dims; + for (size_t i = 0; i < x.dims().size(); i++) { + 
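    // (editor's note, not a line of the patch) collecting every axis index here
    // makes the ReduceKernel call below reduce over all dimensions, so the kernel
    // writes sum(x * x) into a single-element output tensor.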
origin_reduce_dims.push_back(i); + } + phi::funcs::ReduceKernel>( + dev_ctx, x, out, kps::SquareFunctor(), origin_reduce_dims, false); } + +} // namespace phi + +PD_REGISTER_KERNEL(squared_l2_norm, + GPU, + ALL_LAYOUT, + phi::SquaredL2NormKernel, + float, + double, + phi::dtype::float16) {} diff --git a/python/paddle/fluid/tests/unittests/test_gradient_clip.py b/python/paddle/fluid/tests/unittests/test_gradient_clip.py index c74917c2a076a..66fe40bf8ab6d 100644 --- a/python/paddle/fluid/tests/unittests/test_gradient_clip.py +++ b/python/paddle/fluid/tests/unittests/test_gradient_clip.py @@ -254,10 +254,8 @@ def test_none_grad_fp16(self): self.assertListEqual( ops, [ - 'square', - 'reduce_sum', - 'square', - 'reduce_sum', + 'squared_l2_norm', + 'squared_l2_norm', 'sum', 'cast', 'sqrt', diff --git a/python/paddle/fluid/tests/unittests/test_squared_l2_norm_op.py b/python/paddle/fluid/tests/unittests/test_squared_l2_norm_op.py old mode 100644 new mode 100755 index 8124254e7b2cc..a7076e18a5821 --- a/python/paddle/fluid/tests/unittests/test_squared_l2_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_squared_l2_norm_op.py @@ -30,6 +30,46 @@ def test_squared_l2_norm(x): return _legacy_C_ops.squared_l2_norm(x) +class TestSquaredL2NormF16Op(unittest.TestCase): + def init_test_case(self): + X = np.random.uniform(-0.1, 0.1, (8, 5, 10)).astype('float32') + return X + + def check_main(self, x_np, dtype): + paddle.disable_static() + x = paddle.to_tensor(x_np) + + x.stop_gradient = False + y = test_squared_l2_norm(x) + x_g = paddle.grad(y, [x]) + + paddle.enable_static() + return y, x_g + + def test_main(self): + x_np = self.init_test_case() + y_np_1, x_g_np_1 = self.check_main(x_np, 'float32') + y_np_2, x_g_np_2 = self.check_main(x_np, 'float16') + + def assert_equal(x, y): + np.testing.assert_allclose(x, y, rtol=1e-05, atol=0.0) + + assert_equal(y_np_1, y_np_2) + assert_equal(x_g_np_1, x_g_np_2) + + +class TestSquaredL2NormF16Op1(TestSquaredL2NormF16Op): + def init_test_case(self): + X = np.random.uniform(-2.0, 2.0, (30, 10)).astype('float32') + return X + + +class TestSquaredL2NormF16Op2(TestSquaredL2NormF16Op): + def init_test_case(self): + X = np.random.uniform(-5.0, 5.0, (20, 10, 20)).astype('float32') + return X + + class TestL2LossOp(OpTest): """Test squared_l2_norm""" diff --git a/python/paddle/nn/clip.py b/python/paddle/nn/clip.py index 10eeb6319063c..53eed3cae5802 100644 --- a/python/paddle/nn/clip.py +++ b/python/paddle/nn/clip.py @@ -207,11 +207,8 @@ def _squared_l2_norm(x): """ x = _cast_to_mp_type_if_enabled(x) - if ( - core.is_compiled_with_xpu() - or x.dtype == core.VarDesc.VarType.FP16 - or x.dtype == core.VarDesc.VarType.BF16 - ): + + if core.is_compiled_with_xpu(): square = paddle.square(x) sum_square = paddle.sum(square) return sum_square @@ -220,7 +217,7 @@ def _squared_l2_norm(x): return _C_ops.squared_l2_norm(x) op_type = 'squared_l2_norm' - check_variable_and_dtype(x, 'x', ['float32', 'float64'], op_type) + check_variable_and_dtype(x, 'x', ['float32', 'float64', 'float16'], op_type) helper = LayerHelper(op_type, **locals()) out = helper.create_variable_for_type_inference(x.dtype) From 0f173d5a62b41355b457c6434f6c1c1bf1ae4a79 Mon Sep 17 00:00:00 2001 From: wangxiaoning <71813629+wangxn12138@users.noreply.github.com> Date: Tue, 31 Jan 2023 14:11:37 +0800 Subject: [PATCH 26/89] support fp16 index_select (#50101) --- python/paddle/tensor/search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/tensor/search.py b/python/paddle/tensor/search.py 
index 9b1dce1df867d..3ec79b55b0bbe 100755 --- a/python/paddle/tensor/search.py +++ b/python/paddle/tensor/search.py @@ -326,7 +326,7 @@ def index_select(x, index, axis=0, name=None): check_variable_and_dtype( x, 'x', - ['float32', 'float64', 'int32', 'int64'], + ['float16', 'float32', 'float64', 'int32', 'int64'], 'paddle.tensor.search.index_select', ) check_variable_and_dtype( From 3a7e470b3e1166a663e52081c797c2c12897723b Mon Sep 17 00:00:00 2001 From: zqw_1997 <118182234+zhengqiwen1997@users.noreply.github.com> Date: Tue, 31 Jan 2023 14:19:51 +0800 Subject: [PATCH 27/89] remove fluid.ir.RegisterPassHelper PassDesc and RegisterPass (#49578) * remove fluid.ir.RegisterPassHelper PassDesc and RegisterPass * proto import problems * change import way of pass_desc_pb2 * change sys.path * change the way of import framwork_pb2 * add fluid_path directory from path.dirname * fluid_path changed --- python/paddle/fluid/ir.py | 466 +---------------- .../unittests/ir/test_ir_generate_pass.py | 3 +- .../incubate/passes/fuse_resnet_unit_pass.py | 2 +- python/paddle/incubate/passes/ir.py | 483 ++++++++++++++++++ 4 files changed, 488 insertions(+), 466 deletions(-) create mode 100644 python/paddle/incubate/passes/ir.py diff --git a/python/paddle/fluid/ir.py b/python/paddle/fluid/ir.py index fb077ed8b5f0d..c444b4cedaafa 100644 --- a/python/paddle/fluid/ir.py +++ b/python/paddle/fluid/ir.py @@ -16,18 +16,8 @@ import inspect from os import path import paddle -from . import core, unique_name -from .framework import _apply_pass, OpProtoHolder - -from .proto import framework_pb2 - -try: - from .proto import pass_desc_pb2 -except ModuleNotFoundError: - import sys - - sys.path.append(path.join(path.dirname(__file__), 'proto')) - from .proto import pass_desc_pb2 +from . import core +from .framework import _apply_pass def get_data_vars(program): @@ -138,455 +128,3 @@ def apply_pass(name): build_strategy.enable_inplace = False build_strategy._clear_finalized() return build_strategy - - -class RegisterPassHelper: - _register_helpers = list() - - def __init__(self, pass_pairs, pass_type=str(), input_specs=dict()): - self._pass_type = pass_type - self._pass_pairs = pass_pairs - self._input_specs = input_specs - RegisterPassHelper._register_helpers.append(self) - - def _get_args_from_func(self, func): - args = list() - arg_specs = inspect.getfullargspec(func) - for arg_name in arg_specs.args: - input_spec = self._input_specs.get(arg_name) - if isinstance(input_spec, paddle.static.InputSpec): - args.append( - PassDesc.VarHelper( - arg_name, input_spec.shape, input_spec.dtype - ) - ) - elif isinstance(input_spec, paddle.ParamAttr): - args.append(paddle.ParamAttr(arg_name)) - else: - args.append(PassDesc.VarHelper(arg_name, [-1])) - return args - - def _prune_program_desc(self, ops): - for op_desc in ops: - default_attrs = core.get_op_attrs_default_value( - op_desc.type.encode() - ) - remove_attrs = list() - for attr in op_desc.attrs: - # attr must not in - if attr.name not in [ - "op_namescope", - "op_callstack", - "op_device", - ]: - attr_list_fields = attr.ListFields() - # attr format must be: name, type, value - if len(attr_list_fields) == 3: - attr_value = attr.ListFields()[-1][-1] - default_attr_value = default_attrs.get(attr.name) - # value must not default - if default_attr_value != attr_value: - continue - remove_attrs.append(attr) - for attr in remove_attrs: - op_desc.attrs.remove(attr) - - def _func_to_program_desc(self, func, ops): - vars = list() - program = paddle.static.Program() - startup_program = 
paddle.static.Program() - with paddle.static.program_guard(program, startup_program): - args = self._get_args_from_func(func) - vars.extend(args) - outs = func(*args) - if not isinstance(outs, (list, tuple)): - outs = [outs] - for out in outs: - if isinstance(out, PassDesc.OpHelper): - op_outs = out.Outputs() - if len(op_outs) != 1: - raise ValueError( - "Operator '{}' has multiple outputs, please specify one output variable.".format( - out._type - ) - ) - for op_out in op_outs.values(): - vars.extend(op_out) - else: - vars.append(out) - block_desc = program.current_block().desc - for i in range(block_desc.op_size()): - ops.add().ParseFromString(block_desc.op(i).serialize_to_string()) - self._prune_program_desc(ops) - return vars, program.current_block().ops - - def _convert_vars_to_pass_desc(self, patterns, replaces, desc): - def _add_element_conditions(conditions, elements): - for element in elements: - if element._condition: - conditions.append(element._condition) - _add_element_conditions(conditions, element._elements) - - for (pattern, replace) in zip(patterns, replaces): - # Convert maps of inputs and outputs. - var_map = desc.var_maps.add() - var_map.pattern_var = pattern.name - var_map.replace_var = replace.name - conditions = desc.var_attr_conditions - # Convert shape condition. - if pattern.name in self._input_specs: - condition = conditions.add() - pattern.Attr("shape")._to_pass_desc_attr(condition.attr) - condition.condition_value.name = "" - condition.condition_value.type = framework_pb2.AttrType.LONGS - condition.condition_value.longs.extend(pattern.shape) - condition.type = pass_desc_pb2.PassDesc.ConditionType.kEQ - # Convert attr conditions. - if PassDesc.VarHelper == pattern.__class__: - for attr in pattern._attrs.values(): - _add_element_conditions(conditions, [attr]) - - def _convert_ops_to_pass_desc(self, patterns, replaces, desc): - for replace in replaces: - if isinstance(replace, PassDesc.OpHelper): - for attr in replace._attrs.values(): - # Convert attr maps. - mapped = attr._mapped - if inspect.isfunction(mapped): - mapped = mapped(patterns) - attr_map = desc.op_attr_maps.add() - mapped._to_pass_desc_attr(attr_map.pattern_attr) - attr._to_pass_desc_attr(attr_map.replace_attr) - if mapped._operation is not None: - attr_map.operation.CopyFrom(mapped._operation) - - def SerializeMultiPassDesc(self): - switch_static_mode = paddle.in_dynamic_mode() - if switch_static_mode: - paddle.enable_static() - multi_pass_desc = pass_desc_pb2.MultiPassDesc() - multi_pass_desc.pass_type = self._pass_type - # Traverse all pass pairs and convert them to PassDesc data. - # Here need to add cache in the future. - for (pattern, replace) in self._pass_pairs: - pass_desc = multi_pass_desc.pass_descs.add() - # Convert ProgramDescs of pattern and replace subgraphs. 
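# (editor's note) The RegisterPassHelper / PassDesc / RegisterPass code being
# deleted from python/paddle/fluid/ir.py in this hunk is re-added essentially
# verbatim as the new module python/paddle/incubate/passes/ir.py further down
# in this patch; only the proto imports and module paths are adjusted.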
- pattern_vars, pattern_ops = self._func_to_program_desc( - pattern, pass_desc.pattern - ) - replace_vars, replace_ops = self._func_to_program_desc( - replace, pass_desc.replace - ) - self._convert_vars_to_pass_desc( - pattern_vars, replace_vars, pass_desc - ) - self._convert_ops_to_pass_desc(pattern_ops, replace_ops, pass_desc) - if switch_static_mode: - paddle.disable_static() - return multi_pass_desc.SerializeToString() - - -class PassDesc: - class AttrHelper: - def __init__(self, obj, name, element_index=None): - self._obj = obj - self._name = name - self._operation_type = None - self._element_index = element_index - self._elements = list() - self._operation = None - self._condition = None - self._mapped = None - - def __getitem__(self, index): - element = PassDesc.AttrHelper( - self._obj, self._name, element_index=index - ) - self._elements.append(element) - return element - - def _to_pass_desc_attr(self, pass_desc_attr): - if isinstance(self._obj, PassDesc.VarHelper): - pass_desc_attr.role = pass_desc_pb2.PassDesc.RoleType.kVariable - pass_desc_attr.var_name = self._obj.name - else: - pass_desc_attr.role = pass_desc_pb2.PassDesc.RoleType.kOperator - pass_desc_attr.op_index = self._obj._index - pass_desc_attr.name = self._name - if self._operation_type is not None: - pass_desc_attr.operation = self._operation_type - if self._element_index is not None: - pass_desc_attr.element_index = self._element_index - - def _to_op_desc_attr(self, value, op_desc_attr): - op_desc_attr.name = "" - if isinstance(value, int): - op_desc_attr.type = framework_pb2.AttrType.INT - op_desc_attr.i = value - else: - raise NotImplementedError("Unimplemented transform operation.") - - def _clone_with_operation(self, type, value=None): - attr = PassDesc.AttrHelper( - self._obj, self._name, self._element_index - ) - self._elements.append(attr) - if value is None: - attr._operation_type = type - return attr - operation = pass_desc_pb2.PassDesc.Operation() - operation.type = type - if isinstance(value, PassDesc.AttrHelper): - value._to_pass_desc_attr(operation.attr) - else: - self._to_op_desc_attr(value, operation.value) - attr._operation = operation - attr._operation_type = self._operation_type - return attr - - def __sub__(self, value): - return self._clone_with_operation( - pass_desc_pb2.PassDesc.OperationType.kSub, value - ) - - def __add__(self, value): - return self._clone_with_operation( - pass_desc_pb2.PassDesc.OperationType.kAdd, value - ) - - def Mod(self, value): - return self._clone_with_operation( - pass_desc_pb2.PassDesc.OperationType.kMod, value - ) - - def Size(self): - return self._clone_with_operation( - pass_desc_pb2.PassDesc.OperationType.kSize - ) - - def _set_with_condition(self, type, value): - condition = pass_desc_pb2.PassDesc.AttrCondition() - self._to_pass_desc_attr(condition.attr) - condition.type = type - if isinstance(value, PassDesc.AttrHelper): - value._to_pass_desc_attr(condition.condition_attr) - else: - self._to_op_desc_attr(value, condition.condition_value) - if self._operation: - condition.operation.CopyFrom(self._operation) - self._condition = condition - - def EQ(self, value): - self._set_with_condition( - pass_desc_pb2.PassDesc.ConditionType.kEQ, value - ) - - def MappedPattern( - self, var=None, op=None, index=0, name=None, element_index=None - ): - if all([var, op]): - raise ValueError("Only mapped one of which var or op.") - - def mapped_var(pattern_ops): - raise NotImplementedError( - "Mapping to variable is not implemented." 
- ) - - def mapped_op(pattern_ops): - ops = [o for o in pattern_ops if o._type == op] - if len(ops) <= index: - raise ValueError( - "Index '{}' of operator '{}' is incorrect.".format( - index, op - ) - ) - return PassDesc.AttrHelper( - ops[index], name, element_index=element_index - ) - - self._mapped = mapped_op if var is None else mapped_var - - class VarHelper(paddle.static.Variable): - def __init__(self, *args, **kwargs): - block = paddle.static.default_main_program().current_block() - self._var = paddle.static.data(*args, **kwargs) - self._attrs = dict() - - def __getattr__(self, name): - return getattr(self._var, name) - - def Attr(self, name): - attr = self._attrs.get(name) - if attr is None: - attr = PassDesc.AttrHelper(self, name) - self._attrs[name] = attr - return attr - - class OpHelper: - def __init__(self, type=None): - self._type = type - - def __getattr__(self, name): - op = PassDesc.OpHelper(name) - op.Init() - return op - - def __call__(self, *args, **kwargs): - if len(args) > 0: - raise ValueError( - "Each input argument needs to specify a parameter name." - ) - for (in_name, in_args) in kwargs.items(): - op_input = self._inputs.get(in_name) - if op_input is None: - raise ValueError( - "Operator '{}' does not have input named '{}'.".format( - self._type, in_name - ) - ) - if isinstance(in_args, (list, tuple)): - if len(in_args) == 0: - raise ValueError( - "Input '{}' of operator '{}' cannot be empty.".format( - in_name, self._type - ) - ) - else: - in_args = [in_args] - for in_arg in in_args: - if isinstance(in_arg, PassDesc.OpHelper): - op_outs = in_arg.Outputs() - if len(op_outs) != 1: - raise ValueError( - "The size of outputs of operator '{}' is not equal 1, please specify one output variable.".format( - in_arg._type - ) - ) - for op_out in op_outs.values(): - op_input.extend(op_out) - else: - op_input.append(in_arg) - self._desc.set_input(in_name, [i.name for i in op_input]) - block = paddle.static.default_main_program().current_block() - for out_name, op_output in self._outputs.items(): - op_output_name = unique_name.generate(self._type) - op_output.append(block.create_var(name=op_output_name)) - self._desc.set_output(out_name, [op_output_name]) - return self - - def Init(self): - block = paddle.static.default_main_program().current_block() - self._proto = OpProtoHolder.instance().op_proto_map.get(self._type) - if self._proto is None: - raise AttributeError( - "type object 'OpHelper' has no attribute '{}'".format( - self._type - ) - ) - self._index = len(block.ops) - self._desc = block.desc.append_op() - self._desc.set_type(self._type) - self._attrs = dict() - self._inputs = {i.name: list() for i in self._proto.inputs} - self._outputs = {o.name: list() for o in self._proto.outputs} - block.ops.append(self) - - def Attr(self, name): - attr = self._attrs.get(name) - if attr is None: - attr = PassDesc.AttrHelper(self, name) - self._attrs[name] = attr - return attr - - def SetAttr(self, name, value): - if isinstance(value, PassDesc.AttrHelper): - self.Attr(name)._mapped = value - else: - self._desc._set_attr(name, value) - - def Output(self, name): - output = self._outputs.get(name) - if output is None: - raise ValueError( - "Operator '{}' does not have output named '{}'.".format( - self._type, name - ) - ) - return output - - def Outputs(self): - return self._outputs - - def SetOutputs(self, **kwargs): - for param, arg in kwargs.items(): - if arg is None: - self._desc.remove_output(param) - else: - self._desc.set_output(param, [arg.name]) - - OP = OpHelper() - - -def 
RegisterPass(function=None, input_specs=dict()): - """ - The function decorator of Register Pass. Decorator @RegisterPass handles - the function and register it into a core.Pass instance. Use name of function - as Pass type. - - Args: - function (callable): The function with return of callable pair(s) that - represents the pattern subgraph and the replace subgraph. - input_specs (dict[str, InputSpec]): Dict of InputSpec to specific the shape/dtype - information of Tensor. Some operators limit the shape and dtype of datas when - create subgraph with Paddle APIs. So user need specify InputSpec of data to - ensure create a correctly subgraph. Of course, this argument is not limited to - matching subgraph. The default is dict(). - - Returns: - callables: Callable pair(s). - - Examples: - .. code-block:: python - - import paddle - from paddle.fluid.ir import RegisterPass - - @RegisterPass - def multi_add_to_addn(): - def pattern(x, y, z): - return paddle.add(paddle.add(x, y), z) - def replace(x, y, z): - return paddle.add_n([x, y, z]) - return pattern, replace - """ - - def _is_pass_pair(check_pair): - if isinstance(check_pair, (list, tuple)): - if len(check_pair) == 2: - if all(map(inspect.isfunction, check_pair)): - return True - return False - - def decorated(python_func): - pass_type = python_func.__name__ - signature = inspect.signature(python_func) - if len(signature.parameters) > 0: - raise NotImplementedError( - "Pass function with parameter is not supported now." - ) - elif len(signature.parameters) == 0: - pass_pairs = python_func() - if _is_pass_pair(pass_pairs): - pass_pairs = [pass_pairs] - elif not all(map(_is_pass_pair, pass_pairs)): - raise ValueError( - "Return value of Pass function must be (callable, callable)." - ) - helper = RegisterPassHelper(pass_pairs, pass_type, input_specs) - core.register_pass(pass_type, helper.SerializeMultiPassDesc) - return python_func - - if inspect.isfunction(function): - return decorated(function) - - return decorated diff --git a/python/paddle/fluid/tests/unittests/ir/test_ir_generate_pass.py b/python/paddle/fluid/tests/unittests/ir/test_ir_generate_pass.py index 2025f94ffd439..2f3a2f2d771c2 100644 --- a/python/paddle/fluid/tests/unittests/ir/test_ir_generate_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/test_ir_generate_pass.py @@ -17,7 +17,8 @@ import numpy as np import paddle -from paddle.fluid import core, ir +from paddle.fluid import core +from paddle.incubate.passes import ir from paddle.static import InputSpec diff --git a/python/paddle/incubate/passes/fuse_resnet_unit_pass.py b/python/paddle/incubate/passes/fuse_resnet_unit_pass.py index 6441427f469d6..7acf28eecb334 100644 --- a/python/paddle/incubate/passes/fuse_resnet_unit_pass.py +++ b/python/paddle/incubate/passes/fuse_resnet_unit_pass.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -import paddle.fluid.ir as ir +import paddle.incubate.passes.ir as ir def set_resnet_unit_attrs(resnet_unit, has_shortcut): diff --git a/python/paddle/incubate/passes/ir.py b/python/paddle/incubate/passes/ir.py new file mode 100644 index 0000000000000..cf6568a545f39 --- /dev/null +++ b/python/paddle/incubate/passes/ir.py @@ -0,0 +1,483 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +from os import path + +import paddle +from paddle.fluid.proto import framework_pb2 + +from ...fluid import core, unique_name +from ...fluid.framework import OpProtoHolder + +try: + from paddle.fluid.proto import pass_desc_pb2 +except ModuleNotFoundError: + import sys + + fluid_path = path.dirname(__file__) + '/../../fluid' + sys.path.append(path.join(fluid_path, 'proto')) + from paddle.fluid.proto import pass_desc_pb2 + + +class RegisterPassHelper: + _register_helpers = list() + + def __init__(self, pass_pairs, pass_type=str(), input_specs=dict()): + self._pass_type = pass_type + self._pass_pairs = pass_pairs + self._input_specs = input_specs + RegisterPassHelper._register_helpers.append(self) + + def _get_args_from_func(self, func): + args = list() + arg_specs = inspect.getfullargspec(func) + for arg_name in arg_specs.args: + input_spec = self._input_specs.get(arg_name) + if isinstance(input_spec, paddle.static.InputSpec): + args.append( + PassDesc.VarHelper( + arg_name, input_spec.shape, input_spec.dtype + ) + ) + elif isinstance(input_spec, paddle.ParamAttr): + args.append(paddle.ParamAttr(arg_name)) + else: + args.append(PassDesc.VarHelper(arg_name, [-1])) + return args + + def _prune_program_desc(self, ops): + for op_desc in ops: + default_attrs = core.get_op_attrs_default_value( + op_desc.type.encode() + ) + remove_attrs = list() + for attr in op_desc.attrs: + # attr must not in + if attr.name not in [ + "op_namescope", + "op_callstack", + "op_device", + ]: + attr_list_fields = attr.ListFields() + # attr format must be: name, type, value + if len(attr_list_fields) == 3: + attr_value = attr.ListFields()[-1][-1] + default_attr_value = default_attrs.get(attr.name) + # value must not default + if default_attr_value != attr_value: + continue + remove_attrs.append(attr) + for attr in remove_attrs: + op_desc.attrs.remove(attr) + + def _func_to_program_desc(self, func, ops): + vars = list() + program = paddle.static.Program() + startup_program = paddle.static.Program() + with paddle.static.program_guard(program, startup_program): + args = self._get_args_from_func(func) + vars.extend(args) + outs = func(*args) + if not isinstance(outs, (list, tuple)): + outs = [outs] + for out in outs: + if isinstance(out, PassDesc.OpHelper): + op_outs = out.Outputs() + if len(op_outs) != 1: + raise ValueError( + "Operator '{}' has multiple outputs, please specify one output variable.".format( + out._type + ) + ) + for op_out in op_outs.values(): + vars.extend(op_out) + else: + vars.append(out) + block_desc = program.current_block().desc + for i in range(block_desc.op_size()): + ops.add().ParseFromString(block_desc.op(i).serialize_to_string()) + self._prune_program_desc(ops) + return vars, program.current_block().ops + + def _convert_vars_to_pass_desc(self, patterns, replaces, desc): + def _add_element_conditions(conditions, elements): + for element in elements: + if element._condition: + conditions.append(element._condition) + _add_element_conditions(conditions, element._elements) + + for (pattern, replace) in zip(patterns, replaces): + # Convert maps of inputs and outputs. 
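[Editorial illustration, not part of the patch] With the helpers relocated here, existing passes only need the new import path; the diff above already switches the unit test to `from paddle.incubate.passes import ir`. A usage sketch adapted from the docstring removed from fluid/ir.py, assuming `RegisterPass` is re-exported unchanged by this new module, as the file listing indicates:

    import paddle
    from paddle.incubate.passes.ir import RegisterPass

    @RegisterPass
    def multi_add_to_addn():
        def pattern(x, y, z):
            return paddle.add(paddle.add(x, y), z)

        def replace(x, y, z):
            return paddle.add_n([x, y, z])

        return pattern, replace

As in the removed implementation, the decorator serializes the pattern/replace pair and registers it under the function name through `core.register_pass`.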
+ var_map = desc.var_maps.add() + var_map.pattern_var = pattern.name + var_map.replace_var = replace.name + conditions = desc.var_attr_conditions + # Convert shape condition. + if pattern.name in self._input_specs: + condition = conditions.add() + pattern.Attr("shape")._to_pass_desc_attr(condition.attr) + condition.condition_value.name = "" + condition.condition_value.type = framework_pb2.AttrType.LONGS + condition.condition_value.longs.extend(pattern.shape) + condition.type = pass_desc_pb2.PassDesc.ConditionType.kEQ + # Convert attr conditions. + if PassDesc.VarHelper == pattern.__class__: + for attr in pattern._attrs.values(): + _add_element_conditions(conditions, [attr]) + + def _convert_ops_to_pass_desc(self, patterns, replaces, desc): + for replace in replaces: + if isinstance(replace, PassDesc.OpHelper): + for attr in replace._attrs.values(): + # Convert attr maps. + mapped = attr._mapped + if inspect.isfunction(mapped): + mapped = mapped(patterns) + attr_map = desc.op_attr_maps.add() + mapped._to_pass_desc_attr(attr_map.pattern_attr) + attr._to_pass_desc_attr(attr_map.replace_attr) + if mapped._operation is not None: + attr_map.operation.CopyFrom(mapped._operation) + + def SerializeMultiPassDesc(self): + switch_static_mode = paddle.in_dynamic_mode() + if switch_static_mode: + paddle.enable_static() + multi_pass_desc = pass_desc_pb2.MultiPassDesc() + multi_pass_desc.pass_type = self._pass_type + # Traverse all pass pairs and convert them to PassDesc data. + # Here need to add cache in the future. + for (pattern, replace) in self._pass_pairs: + pass_desc = multi_pass_desc.pass_descs.add() + # Convert ProgramDescs of pattern and replace subgraphs. + pattern_vars, pattern_ops = self._func_to_program_desc( + pattern, pass_desc.pattern + ) + replace_vars, replace_ops = self._func_to_program_desc( + replace, pass_desc.replace + ) + self._convert_vars_to_pass_desc( + pattern_vars, replace_vars, pass_desc + ) + self._convert_ops_to_pass_desc(pattern_ops, replace_ops, pass_desc) + if switch_static_mode: + paddle.disable_static() + return multi_pass_desc.SerializeToString() + + +class PassDesc: + class AttrHelper: + def __init__(self, obj, name, element_index=None): + self._obj = obj + self._name = name + self._operation_type = None + self._element_index = element_index + self._elements = list() + self._operation = None + self._condition = None + self._mapped = None + + def __getitem__(self, index): + element = PassDesc.AttrHelper( + self._obj, self._name, element_index=index + ) + self._elements.append(element) + return element + + def _to_pass_desc_attr(self, pass_desc_attr): + if isinstance(self._obj, PassDesc.VarHelper): + pass_desc_attr.role = pass_desc_pb2.PassDesc.RoleType.kVariable + pass_desc_attr.var_name = self._obj.name + else: + pass_desc_attr.role = pass_desc_pb2.PassDesc.RoleType.kOperator + pass_desc_attr.op_index = self._obj._index + pass_desc_attr.name = self._name + if self._operation_type is not None: + pass_desc_attr.operation = self._operation_type + if self._element_index is not None: + pass_desc_attr.element_index = self._element_index + + def _to_op_desc_attr(self, value, op_desc_attr): + op_desc_attr.name = "" + if isinstance(value, int): + op_desc_attr.type = framework_pb2.AttrType.INT + op_desc_attr.i = value + else: + raise NotImplementedError("Unimplemented transform operation.") + + def _clone_with_operation(self, type, value=None): + attr = PassDesc.AttrHelper( + self._obj, self._name, self._element_index + ) + self._elements.append(attr) + if value is None: + 
attr._operation_type = type + return attr + operation = pass_desc_pb2.PassDesc.Operation() + operation.type = type + if isinstance(value, PassDesc.AttrHelper): + value._to_pass_desc_attr(operation.attr) + else: + self._to_op_desc_attr(value, operation.value) + attr._operation = operation + attr._operation_type = self._operation_type + return attr + + def __sub__(self, value): + return self._clone_with_operation( + pass_desc_pb2.PassDesc.OperationType.kSub, value + ) + + def __add__(self, value): + return self._clone_with_operation( + pass_desc_pb2.PassDesc.OperationType.kAdd, value + ) + + def Mod(self, value): + return self._clone_with_operation( + pass_desc_pb2.PassDesc.OperationType.kMod, value + ) + + def Size(self): + return self._clone_with_operation( + pass_desc_pb2.PassDesc.OperationType.kSize + ) + + def _set_with_condition(self, type, value): + condition = pass_desc_pb2.PassDesc.AttrCondition() + self._to_pass_desc_attr(condition.attr) + condition.type = type + if isinstance(value, PassDesc.AttrHelper): + value._to_pass_desc_attr(condition.condition_attr) + else: + self._to_op_desc_attr(value, condition.condition_value) + if self._operation: + condition.operation.CopyFrom(self._operation) + self._condition = condition + + def EQ(self, value): + self._set_with_condition( + pass_desc_pb2.PassDesc.ConditionType.kEQ, value + ) + + def MappedPattern( + self, var=None, op=None, index=0, name=None, element_index=None + ): + if all([var, op]): + raise ValueError("Only mapped one of which var or op.") + + def mapped_var(pattern_ops): + raise NotImplementedError( + "Mapping to variable is not implemented." + ) + + def mapped_op(pattern_ops): + ops = [o for o in pattern_ops if o._type == op] + if len(ops) <= index: + raise ValueError( + "Index '{}' of operator '{}' is incorrect.".format( + index, op + ) + ) + return PassDesc.AttrHelper( + ops[index], name, element_index=element_index + ) + + self._mapped = mapped_op if var is None else mapped_var + + class VarHelper(paddle.static.Variable): + def __init__(self, *args, **kwargs): + block = paddle.static.default_main_program().current_block() + self._var = paddle.static.data(*args, **kwargs) + self._attrs = dict() + + def __getattr__(self, name): + return getattr(self._var, name) + + def Attr(self, name): + attr = self._attrs.get(name) + if attr is None: + attr = PassDesc.AttrHelper(self, name) + self._attrs[name] = attr + return attr + + class OpHelper: + def __init__(self, type=None): + self._type = type + + def __getattr__(self, name): + op = PassDesc.OpHelper(name) + op.Init() + return op + + def __call__(self, *args, **kwargs): + if len(args) > 0: + raise ValueError( + "Each input argument needs to specify a parameter name." 
+ ) + for (in_name, in_args) in kwargs.items(): + op_input = self._inputs.get(in_name) + if op_input is None: + raise ValueError( + "Operator '{}' does not have input named '{}'.".format( + self._type, in_name + ) + ) + if isinstance(in_args, (list, tuple)): + if len(in_args) == 0: + raise ValueError( + "Input '{}' of operator '{}' cannot be empty.".format( + in_name, self._type + ) + ) + else: + in_args = [in_args] + for in_arg in in_args: + if isinstance(in_arg, PassDesc.OpHelper): + op_outs = in_arg.Outputs() + if len(op_outs) != 1: + raise ValueError( + "The size of outputs of operator '{}' is not equal 1, please specify one output variable.".format( + in_arg._type + ) + ) + for op_out in op_outs.values(): + op_input.extend(op_out) + else: + op_input.append(in_arg) + self._desc.set_input(in_name, [i.name for i in op_input]) + block = paddle.static.default_main_program().current_block() + for out_name, op_output in self._outputs.items(): + op_output_name = unique_name.generate(self._type) + op_output.append(block.create_var(name=op_output_name)) + self._desc.set_output(out_name, [op_output_name]) + return self + + def Init(self): + block = paddle.static.default_main_program().current_block() + self._proto = OpProtoHolder.instance().op_proto_map.get(self._type) + if self._proto is None: + raise AttributeError( + "type object 'OpHelper' has no attribute '{}'".format( + self._type + ) + ) + self._index = len(block.ops) + self._desc = block.desc.append_op() + self._desc.set_type(self._type) + self._attrs = dict() + self._inputs = {i.name: list() for i in self._proto.inputs} + self._outputs = {o.name: list() for o in self._proto.outputs} + block.ops.append(self) + + def Attr(self, name): + attr = self._attrs.get(name) + if attr is None: + attr = PassDesc.AttrHelper(self, name) + self._attrs[name] = attr + return attr + + def SetAttr(self, name, value): + if isinstance(value, PassDesc.AttrHelper): + self.Attr(name)._mapped = value + else: + self._desc._set_attr(name, value) + + def Output(self, name): + output = self._outputs.get(name) + if output is None: + raise ValueError( + "Operator '{}' does not have output named '{}'.".format( + self._type, name + ) + ) + return output + + def Outputs(self): + return self._outputs + + def SetOutputs(self, **kwargs): + for param, arg in kwargs.items(): + if arg is None: + self._desc.remove_output(param) + else: + self._desc.set_output(param, [arg.name]) + + OP = OpHelper() + + +def RegisterPass(function=None, input_specs=dict()): + """ + The function decorator of Register Pass. Decorator @RegisterPass handles + the function and register it into a core.Pass instance. Use name of function + as Pass type. + + Args: + function (callable): The function with return of callable pair(s) that + represents the pattern subgraph and the replace subgraph. + input_specs (dict[str, InputSpec]): Dict of InputSpec to specific the shape/dtype + information of Tensor. Some operators limit the shape and dtype of datas when + create subgraph with Paddle APIs. So user need specify InputSpec of data to + ensure create a correctly subgraph. Of course, this argument is not limited to + matching subgraph. The default is dict(). + + Returns: + callables: Callable pair(s). + + Examples: + .. 
code-block:: python + + import paddle + from paddle.fluid.ir import RegisterPass + + @RegisterPass + def multi_add_to_addn(): + def pattern(x, y, z): + return paddle.add(paddle.add(x, y), z) + def replace(x, y, z): + return paddle.add_n([x, y, z]) + return pattern, replace + """ + + def _is_pass_pair(check_pair): + if isinstance(check_pair, (list, tuple)): + if len(check_pair) == 2: + if all(map(inspect.isfunction, check_pair)): + return True + return False + + def decorated(python_func): + pass_type = python_func.__name__ + signature = inspect.signature(python_func) + if len(signature.parameters) > 0: + raise NotImplementedError( + "Pass function with parameter is not supported now." + ) + elif len(signature.parameters) == 0: + pass_pairs = python_func() + if _is_pass_pair(pass_pairs): + pass_pairs = [pass_pairs] + elif not all(map(_is_pass_pair, pass_pairs)): + raise ValueError( + "Return value of Pass function must be (callable, callable)." + ) + helper = RegisterPassHelper(pass_pairs, pass_type, input_specs) + core.register_pass(pass_type, helper.SerializeMultiPassDesc) + return python_func + + if inspect.isfunction(function): + return decorated(function) + + return decorated From a1f28a48951f6c6541cd107382ccf08317bb4e76 Mon Sep 17 00:00:00 2001 From: Yuanle Liu Date: Tue, 31 Jan 2023 14:29:11 +0800 Subject: [PATCH 28/89] [Paddle Inference] change the default values of some gflags (#50074) --- .../fluid/inference/api/analysis_predictor.cc | 177 ++++++++++-------- 1 file changed, 100 insertions(+), 77 deletions(-) diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 6ccad994b06a8..e89bcfa2c6a99 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -1384,13 +1385,6 @@ template <> std::unique_ptr CreatePaddlePredictor( const AnalysisConfig &config) { - // TODO(NHZlX): Should add the link to the doc of - // paddle_infer::CreatePredictor - if (config.glog_info_disabled()) { - FLAGS_logtostderr = 1; - FLAGS_minloglevel = 2; // GLOG_ERROR - } - VLOG(3) << "create AnalysisConfig"; PADDLE_ENFORCE_EQ( config.is_valid(), true, @@ -1403,83 +1397,112 @@ CreatePaddlePredictor( std::call_once(custom_operators_registered, []() { inference::RegisterAllCustomOperator(); }); - if (config.use_gpu()) { - static std::once_flag gflags_initialized; - static bool process_level_allocator_enabled; - - std::call_once(gflags_initialized, [&]() { - std::vector gflags; - PADDLE_ENFORCE_GE( - config.memory_pool_init_size_mb(), - 0.f, + auto SetGflags = [](const AnalysisConfig &config) { + auto SetGflag = [](const char *name, const char *value) { + std::string ret = ::GFLAGS_NAMESPACE::SetCommandLineOption(name, value); + PADDLE_ENFORCE_EQ( + ret.empty(), + false, platform::errors::InvalidArgument( - "The size of memory pool should be greater than 0.")); - PADDLE_ENFORCE_GE( - config.gpu_device_id(), - 0, - platform::errors::InvalidArgument( - "Invalid device id (%d). 
The device id should be greater than 0.", - config.gpu_device_id())); - gflags.push_back("dummy"); - - float fraction_of_gpu_memory = config.fraction_of_gpu_memory_for_pool(); - if (fraction_of_gpu_memory > 0.95f) { - LOG(ERROR) - << "Allocate too much memory for the GPU memory pool, assigned " - << config.memory_pool_init_size_mb() << " MB"; - LOG(ERROR) << "Try to shink the value by setting " - "AnalysisConfig::EnableGpu(...)"; - } + "Fail to set gflag: %s, please make sure the gflag exists.", + name)); + VLOG(3) << "set gflag: --" << name << "=" << value; + }; + // TODO(NHZlX): Should add the link to the doc of + // paddle_infer::CreatePredictor + if (config.glog_info_disabled()) { + FLAGS_logtostderr = 1; + FLAGS_minloglevel = 2; // GLOG_ERROR + } - if (fraction_of_gpu_memory >= 0.0f || fraction_of_gpu_memory <= 0.95f) { - std::string flag = "--fraction_of_gpu_memory_to_use=" + - std::to_string(fraction_of_gpu_memory); - VLOG(3) << "set flag: " << flag; - gflags.push_back(flag); - } + if (config.use_gpu()) { + static std::once_flag gflags_initialized; + static bool process_level_allocator_enabled; + + std::call_once(gflags_initialized, [&]() { + PADDLE_ENFORCE_GE( + config.memory_pool_init_size_mb(), + 0.f, + platform::errors::InvalidArgument( + "The size of memory pool should be greater than 0.")); + PADDLE_ENFORCE_GE(config.gpu_device_id(), + 0, + platform::errors::InvalidArgument( + "Invalid device id (%d). The device id should be " + "greater than 0.", + config.gpu_device_id())); + + float fraction_of_gpu_memory = config.fraction_of_gpu_memory_for_pool(); + if (fraction_of_gpu_memory > 0.95f) { + LOG(ERROR) + << "Allocate too much memory for the GPU memory pool, assigned " + << config.memory_pool_init_size_mb() << " MB"; + LOG(ERROR) << "Try to shink the value by setting " + "AnalysisConfig::EnableUseGpu(...)"; + } + if (fraction_of_gpu_memory >= 0.0f || fraction_of_gpu_memory <= 0.95f) { + std::string value = std::to_string(fraction_of_gpu_memory); + SetGflag("fraction_of_gpu_memory_to_use", value.data()); + } - // TODO(Shixiaowei02): Add a mandatory scheme to use the thread local - // allocator when multi-stream is enabled. - if (config.thread_local_stream_enabled()) { - gflags.push_back("--allocator_strategy=thread_local"); - process_level_allocator_enabled = false; - } else { - process_level_allocator_enabled = true; - } + // TODO(Shixiaowei02): Add a mandatory scheme to use the thread local + // allocator when multi-stream is enabled. + if (config.thread_local_stream_enabled()) { + SetGflag("allocator_strategy", "thread_local"); + process_level_allocator_enabled = false; + } else { + process_level_allocator_enabled = true; + } - // support set flags from enviorment. - const phi::ExportedFlagInfoMap &env_map = phi::GetExportedFlagInfoMap(); - std::ostringstream os; - os << "--tryfromenv="; - for (auto &pair : env_map) { - os << pair.second.name << ","; - } - auto tryfromenv_str = os.str(); - gflags.push_back(os.str().substr(0, tryfromenv_str.size() - 1)); - - if (framework::InitGflags(gflags)) { - VLOG(3) << "The following gpu analysis configurations only take effect " - "for the first predictor: "; - for (size_t i = 1; i < gflags.size(); ++i) { - VLOG(3) << gflags[i]; + // for inference, the following default values are better. 
+ if (std::getenv("FLAGS_conv_workspace_size_limit") == nullptr) { + SetGflag("conv_workspace_size_limit", "32"); } - } else { - LOG(WARNING) << "The one-time configuration of analysis predictor " - "failed, which may be due to native predictor called " - "first and its configurations taken effect."; - } - }); + if (std::getenv("FLAGS_initial_cpu_memory_in_mb") == nullptr) { + SetGflag("initial_cpu_memory_in_mb", "0"); + } + + // support set gflags from environment. + std::vector gflags; + const phi::ExportedFlagInfoMap &env_map = phi::GetExportedFlagInfoMap(); + std::ostringstream os; + for (auto &pair : env_map) { + os << pair.second.name << ","; + } + std::string tryfromenv_str = os.str(); + if (!tryfromenv_str.empty()) { + tryfromenv_str.pop_back(); + tryfromenv_str = "--tryfromenv=" + tryfromenv_str; + gflags.push_back(tryfromenv_str); + } + if (framework::InitGflags(gflags)) { + VLOG(3) + << "The following gpu analysis configurations only take effect " + "for the first predictor: "; + for (const auto &gflag : gflags) { + VLOG(3) << gflag; + } + } else { + LOG(WARNING) << "The one-time configuration of analysis predictor " + "failed, which may be due to native predictor called " + "first and its configurations taken effect."; + } + }); - if (config.thread_local_stream_enabled() && - process_level_allocator_enabled) { - PADDLE_THROW(platform::errors::Fatal( - "When binding threads and streams, the use of " - "process-level allocators will result in undefined result " - "errors due to memory asynchronous operations." - "The thread and stream binding configuration of all " - "predictors should be the same in a single process.")); + if (config.thread_local_stream_enabled() && + process_level_allocator_enabled) { + PADDLE_THROW(platform::errors::Fatal( + "When binding threads and streams, the use of " + "process-level allocators will result in undefined result " + "errors due to memory asynchronous operations." + "The thread and stream binding configuration of all " + "predictors should be the same in a single process.")); + } } - } + }; + SetGflags(config); + + VLOG(3) << "create AnalysisPredictor"; std::unique_ptr predictor(new AnalysisPredictor(config)); // Each config can only be used for one predictor. 
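Editor's note on the gflag change above: the refactored setup only falls back to the new inference defaults (a 32 MB conv workspace and 0 MB initial CPU memory) when the corresponding `FLAGS_*` variables are absent from the process environment, and it runs on the GPU path once, for the first predictor created in a process. A minimal sketch of overriding those defaults from user code follows; it assumes the standard `paddle.inference` Python API, and the model path and flag values are illustrative placeholders, not part of this patch.

    import os

    # Export the FLAGS_* variables before the first predictor is constructed;
    # the patch applies its defaults only when these variables are unset.
    os.environ["FLAGS_conv_workspace_size_limit"] = "512"  # placeholder value, MB
    os.environ["FLAGS_initial_cpu_memory_in_mb"] = "100"   # placeholder value, MB

    import paddle.inference as paddle_infer

    # "./inference_model" is a placeholder path to a saved inference model.
    config = paddle_infer.Config("./inference_model")
    config.enable_use_gpu(256, 0)  # the gflag setup above runs on the GPU path
    predictor = paddle_infer.create_predictor(config)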
From 26bdea0fd12ce2f3b2ed8c0f104c1d8621eeda4c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=98=A5=E4=B9=94?= <83450930+Liyulingyue@users.noreply.github.com> Date: Tue, 31 Jan 2023 14:40:01 +0800 Subject: [PATCH 29/89] fix div 0 error in floormod (#49997) * fix mod 0 error * fix div 0 error in floormod --- .../phi/kernels/funcs/elementwise_functor.h | 1 + .../unittests/test_elementwise_floormod_op.py | 35 +++++++++++++++++++ 2 files changed, 36 insertions(+) create mode 100644 python/paddle/fluid/tests/unittests/test_elementwise_floormod_op.py diff --git a/paddle/phi/kernels/funcs/elementwise_functor.h b/paddle/phi/kernels/funcs/elementwise_functor.h index b98247fdf0c67..2636e9814dd2f 100644 --- a/paddle/phi/kernels/funcs/elementwise_functor.h +++ b/paddle/phi/kernels/funcs/elementwise_functor.h @@ -501,6 +501,7 @@ struct MinGradXYFunctor { template struct RemainderFunctor { inline HOSTDEVICE T operator()(const T a, const T b) const { + PADDLE_ENFORCE(b != 0, DIV_ERROR_INFO); T res = a % b; // Accoding to #PR26732: in dividen % divsor diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_floormod_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_floormod_op.py new file mode 100644 index 0000000000000..33e6fc2c47d45 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_elementwise_floormod_op.py @@ -0,0 +1,35 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import unittest + +import paddle +import paddle.fluid as fluid + + +class TestFloorModOp(unittest.TestCase): + def test_dygraph(self): + with fluid.dygraph.guard(fluid.CPUPlace()): + # mod by zero + x = paddle.to_tensor([59], dtype='int32') + y = paddle.to_tensor([0], dtype='int32') + try: + paddle.floor_mod(x, y) + except Exception as e: + print("Error: Mod by zero encounter in floor_mod\n") + + +if __name__ == '__main__': + unittest.main() From c64296bf36d4b3f8902b2281969b9512fb1ff472 Mon Sep 17 00:00:00 2001 From: ZZK <359521840@qq.com> Date: Tue, 31 Jan 2023 14:44:30 +0800 Subject: [PATCH 30/89] Bump Cutlass version to 2.11.0 (#50073) --- cmake/external/cutlass.cmake | 2 +- .../cutlass/{ => moe}/default_moe_fc_traits.h | 0 .../{ => moe}/linear_combination_ft_gelu.h | 0 .../fusion/cutlass/{ => moe}/moe_cutlass_kernel.h | 15 +++++++++------ .../fusion/cutlass/{ => moe}/moe_kernel_impl.h | 0 paddle/phi/kernels/fusion/cutlass/moe_kernel.cu | 10 ++++++---- 6 files changed, 16 insertions(+), 11 deletions(-) rename paddle/phi/kernels/fusion/cutlass/{ => moe}/default_moe_fc_traits.h (100%) rename paddle/phi/kernels/fusion/cutlass/{ => moe}/linear_combination_ft_gelu.h (100%) rename paddle/phi/kernels/fusion/cutlass/{ => moe}/moe_cutlass_kernel.h (98%) rename paddle/phi/kernels/fusion/cutlass/{ => moe}/moe_kernel_impl.h (100%) diff --git a/cmake/external/cutlass.cmake b/cmake/external/cutlass.cmake index c96631206dfd7..eee868900b585 100644 --- a/cmake/external/cutlass.cmake +++ b/cmake/external/cutlass.cmake @@ -17,7 +17,7 @@ include(ExternalProject) set(CUTLASS_PREFIX_DIR ${THIRD_PARTY_PATH}/cutlass) set(CUTLASS_REPOSITORY https://github.com/NVIDIA/cutlass.git) -set(CUTLASS_TAG v2.10.0) +set(CUTLASS_TAG v2.11.0) include_directories("${THIRD_PARTY_PATH}/cutlass/src/extern_cutlass/") include_directories("${THIRD_PARTY_PATH}/cutlass/src/extern_cutlass/include/") diff --git a/paddle/phi/kernels/fusion/cutlass/default_moe_fc_traits.h b/paddle/phi/kernels/fusion/cutlass/moe/default_moe_fc_traits.h similarity index 100% rename from paddle/phi/kernels/fusion/cutlass/default_moe_fc_traits.h rename to paddle/phi/kernels/fusion/cutlass/moe/default_moe_fc_traits.h diff --git a/paddle/phi/kernels/fusion/cutlass/linear_combination_ft_gelu.h b/paddle/phi/kernels/fusion/cutlass/moe/linear_combination_ft_gelu.h similarity index 100% rename from paddle/phi/kernels/fusion/cutlass/linear_combination_ft_gelu.h rename to paddle/phi/kernels/fusion/cutlass/moe/linear_combination_ft_gelu.h diff --git a/paddle/phi/kernels/fusion/cutlass/moe_cutlass_kernel.h b/paddle/phi/kernels/fusion/cutlass/moe/moe_cutlass_kernel.h similarity index 98% rename from paddle/phi/kernels/fusion/cutlass/moe_cutlass_kernel.h rename to paddle/phi/kernels/fusion/cutlass/moe/moe_cutlass_kernel.h index f037f4e01b143..f0fcafba453c4 100644 --- a/paddle/phi/kernels/fusion/cutlass/moe_cutlass_kernel.h +++ b/paddle/phi/kernels/fusion/cutlass/moe/moe_cutlass_kernel.h @@ -42,6 +42,7 @@ #include "cutlass/gemm/kernel/grouped_problem_visitor.h" #include "cutlass/layout/matrix.h" #include "cutlass/trace.h" + ///////////////////////////////////////////////////////////////////////////////////////////////// namespace cutlass { @@ -350,14 +351,16 @@ template struct GemmMoeProblemVisitor - : public MoeProblemVisitor, - ThreadblockShape, - GroupScheduleMode_, - PrefetchTileCount, - ThreadCount> { + : public MoeProblemVisitor< + detail::GemmGroupedProblemSizeHelper, + ThreadblockShape, + GroupScheduleMode_, + PrefetchTileCount, + ThreadCount> { static bool 
const kTransposed = Transposed; - using ProblemSizeHelper = detail::GemmGroupedProblemSizeHelper; + using ProblemSizeHelper = + detail::GemmGroupedProblemSizeHelper; using Base = MoeProblemVisitor Date: Tue, 31 Jan 2023 14:51:46 +0800 Subject: [PATCH 31/89] not use shm cache default (#50089) --- paddle/phi/core/flags.cc | 4 ++-- .../fluid/dataloader/dataloader_iter.py | 21 +++++++++++++------ 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/paddle/phi/core/flags.cc b/paddle/phi/core/flags.cc index 526457499c884..680661c890519 100644 --- a/paddle/phi/core/flags.cc +++ b/paddle/phi/core/flags.cc @@ -1198,11 +1198,11 @@ PADDLE_DEFINE_EXPORTED_bool(trt_ibuilder_cache, * mmap_allocator related FLAG * Name: use_shm_cache * Since Version: 2.5.0 - * Value Range: bool, default=true + * Value Range: bool, default=false * Example: * Note: . If True, mmap_allocator will cache shm file to decrease munmap * operation. */ PADDLE_DEFINE_EXPORTED_bool(use_shm_cache, - true, + false, "Use shm cache in mmap_allocator."); diff --git a/python/paddle/fluid/dataloader/dataloader_iter.py b/python/paddle/fluid/dataloader/dataloader_iter.py index c7c49c794a101..66c6dff6c1913 100644 --- a/python/paddle/fluid/dataloader/dataloader_iter.py +++ b/python/paddle/fluid/dataloader/dataloader_iter.py @@ -410,13 +410,22 @@ def __init__(self, loader): # Note(zhangbo): shm_buffer_size is used for MemoryMapAllocationPool. # MemoryMapAllocationPool is used to cache and reuse shm, thus reducing munmap in dataloader. # For more details, please see: paddle/fluid/memory/allocation/mmap_allocator.h - try: - self._worker_shm_buffer_size = (2 + 1) * len(self._dataset[0]) - except: + if os.environ.get('FLAGS_use_shm_cache', False) in [ + 1, + '1', + True, + 'True', + 'true', + ]: + try: + self._worker_shm_buffer_size = (2 + 1) * len(self._dataset[0]) + except: + self._worker_shm_buffer_size = 0 + warnings.warn( + "Setting the shm cache buffer size to 0, equivalent to not using the shm cache policy." + ) + else: self._worker_shm_buffer_size = 0 - warnings.warn( - "Setting the shm cache buffer size to 0, equivalent to not using the shm cache policy." 
- ) self._main_thread_shm_buffer_size = ( (self._worker_shm_buffer_size) * 2 * self._num_workers ) From 9a4acfee2fb1e90ded399511cf0f8ee1def0229f Mon Sep 17 00:00:00 2001 From: zhangkaihuo Date: Tue, 31 Jan 2023 14:56:13 +0800 Subject: [PATCH 32/89] optimize 2D sync_batch_norm (#49663) --- paddle/phi/kernels/funcs/norm_utils.cu.h | 120 ++++++++- .../phi/kernels/gpu/batch_norm_grad_kernel.cu | 229 ++++-------------- paddle/phi/kernels/gpu/batch_norm_kernel.cu | 86 ++----- .../phi/kernels/gpu/sync_batch_norm_utils.h | 206 +++++++++++++++- 4 files changed, 388 insertions(+), 253 deletions(-) diff --git a/paddle/phi/kernels/funcs/norm_utils.cu.h b/paddle/phi/kernels/funcs/norm_utils.cu.h index 0971db10529a9..80f37750adcf9 100644 --- a/paddle/phi/kernels/funcs/norm_utils.cu.h +++ b/paddle/phi/kernels/funcs/norm_utils.cu.h @@ -26,6 +26,7 @@ namespace cub = hipcub; #endif #include "paddle/phi/common/layout.h" #include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/reduce_function.h" #ifdef __HIPCC__ #define LAUNCH_BOUNDS(BlockDim) __launch_bounds__(BlockDim) @@ -36,8 +37,6 @@ namespace cub = hipcub; namespace phi { namespace funcs { -using DataLayout = phi::DataLayout; - // math: dx = scale * ((x - mean) * inv_var / NxHxW * (np.mean(ddx, // axis=(n,h,w)) * // np.sum(dy, axis=(n,h,w)) - @@ -670,5 +669,122 @@ void NormDoubleGradFunctor(const DeviceContext &ctx, } } } + +template +__device__ __forceinline__ void BlockReduceByVetical(BnT x_sum, + BnT x_square_sum, + BnT *smem_sum, + BnT *smem_square_sum, + BnT *x_sum_out, + BnT *x_square_sum_out) { + int tid = threadIdx.x + threadIdx.y * blockDim.x; +#pragma unroll + for (int offset = blockDim.y / 2; offset > 0; offset >>= 1) { + if (threadIdx.y < offset * 2) { + smem_sum[tid] = x_sum; + smem_square_sum[tid] = x_square_sum; + } + __syncthreads(); + if (threadIdx.y < offset) { + int pair_tid = tid + offset * blockDim.x; + x_sum += smem_sum[pair_tid]; + x_square_sum += smem_square_sum[pair_tid]; + } + } + if (threadIdx.y == 0) { + *x_sum_out = x_sum; + *x_square_sum_out = x_square_sum; + } +} + +template +__device__ __forceinline__ void ReduceSumPost(const int C, // channels + const int c, // channel index + BnT *sum1, + BnT *sum2, + bool *is_last_block_done, + BnT *cache1, + BnT *cache2, + BnT *block_data_ptr, + int *flag_ptr) { + volatile BnT *staging_sum = block_data_ptr; + volatile BnT *staging_sum2 = &block_data_ptr[C * gridDim.y]; + // write block data to global memory + if (threadIdx.y == 0) { + staging_sum[c + blockIdx.y * C] = *sum1; + staging_sum2[c + blockIdx.y * C] = *sum2; + } + + // make sure write is visible to all blocks + __threadfence(); + __syncthreads(); + + // mark block done + if (threadIdx.x == 0 && threadIdx.y == 0) { + int old = atomicAdd(&flag_ptr[blockIdx.x], 1); + *is_last_block_done = (old == (gridDim.y - 1)); + } + + __syncthreads(); + + if (*is_last_block_done) { + *sum1 = static_cast(0); + *sum2 = static_cast(0); + // thread sum + for (int y = threadIdx.y; y < gridDim.y; y += blockDim.y) { + *sum1 += staging_sum[c + y * C]; + *sum2 += staging_sum2[c + y * C]; + } + + // vertical block sum + funcs::BlockReduceByVetical( + *sum1, *sum2, &cache1[0], &cache2[0], sum1, sum2); + } +} + +template +void SetLaunchConfigInfoForChannelLast(const Context &ctx, + DenseTensor *block_data_tensor, + DenseTensor *flag_tensor, + BnT **block_data_ptr, + int **flag_ptr, + const int N, + const int H, + const int W, + const int D, + const int C, + const int block_size, + dim3 *block, + dim3 *grid) { + const int 
MAX_GRID_SIZE = 128; + const int WARP_SIZE = 32; + + int block_x = std::min(phi::funcs::details::GetLastPow2(C), WARP_SIZE); + int block_y = std::min(phi::funcs::details::GetLastPow2(N * H * W * D / 16), + block_size / block_x); + if (block_x * block_y != block_size) { + block_x = + std::min(phi::funcs::details::GetLastPow2(C), block_size / block_y); + } + int grid_x = (C + block_x - 1) / block_x; + int grid_y = std::min((N * H * W * D + block_y * 16 - 1) / (block_y * 16), + MAX_GRID_SIZE); + + block->x = block_x; + block->y = block_y; + grid->x = grid_x; + grid->y = grid_y; + + if (grid->y > 1) { + *block_data_tensor = phi::Empty(ctx, {2 * C * grid->y}); + *flag_tensor = phi::Empty(ctx, {grid->x}); + + *block_data_ptr = block_data_tensor->data(); + *flag_ptr = flag_tensor->data(); + funcs::SetConstant set_zero; + set_zero(ctx, flag_tensor, static_cast(0)); + } +} + } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu index 01a7aa0162718..58d05d6075816 100644 --- a/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu @@ -245,34 +245,6 @@ static __global__ LAUNCH_BOUNDS(BlockDim) void BNBackward( } } -template -__device__ __forceinline__ void BlockReduceByVetical( - BatchNormParamType x_sum, - BatchNormParamType x_square_sum, - BatchNormParamType *smem_sum, - BatchNormParamType *smem_square_sum, - BatchNormParamType *x_sum_out, - BatchNormParamType *x_square_sum_out) { - int tid = threadIdx.x + threadIdx.y * blockDim.x; -#pragma unroll - for (int offset = blockDim.y / 2; offset > 0; offset >>= 1) { - if (threadIdx.y < offset * 2) { - smem_sum[tid] = x_sum; - smem_square_sum[tid] = x_square_sum; - } - __syncthreads(); - if (threadIdx.y < offset) { - int pair_tid = tid + offset * blockDim.x; - x_sum += smem_sum[pair_tid]; - x_square_sum += smem_square_sum[pair_tid]; - } - } - if (threadIdx.y == 0) { - *x_sum_out = x_sum; - *x_square_sum_out = x_square_sum; - } -} - template static __global__ void BNBackward2DChannelLastStage1( const T *x, @@ -309,53 +281,25 @@ static __global__ void BNBackward2DChannelLastStage1( } // vertical block sum - BlockReduceByVetical(x_sum, - x_square_sum, - &smem_sum[0], - &smem_square_sum[0], - &x_sum, - &x_square_sum); + funcs::BlockReduceByVetical>(x_sum, + x_square_sum, + &smem_sum[0], + &smem_square_sum[0], + &x_sum, + &x_square_sum); if (gridDim.y > 1) { - volatile BatchNormParamType *staging_sum = block_data_ptr; - volatile BatchNormParamType *staging_square_sum = - &block_data_ptr[C * gridDim.y]; - // write block data to global memory - if (threadIdx.y == 0) { - staging_sum[i + blockIdx.y * C] = x_sum; - staging_square_sum[i + blockIdx.y * C] = x_square_sum; - } - - // make sure write is visible to all blocks - __threadfence(); - __syncthreads(); - __shared__ bool is_last_block_done; - // mark block done - if (threadIdx.x == 0 && threadIdx.y == 0) { - int old = atomicAdd(&flag_ptr[blockIdx.x], 1); - is_last_block_done = (old == (gridDim.y - 1)); - } - - __syncthreads(); - + funcs::ReduceSumPost>(C, + i, + &x_sum, + &x_square_sum, + &is_last_block_done, + smem_sum, + smem_square_sum, + block_data_ptr, + flag_ptr); if (is_last_block_done) { - x_sum = static_cast>(0); - x_square_sum = static_cast>(0); - // thread sum - for (int y = threadIdx.y; y < gridDim.y; y += blockDim.y) { - x_sum += staging_sum[i + y * C]; - x_square_sum += staging_square_sum[i + y * C]; - } - - // vertical block sum - 
BlockReduceByVetical(x_sum, - x_square_sum, - &smem_sum[0], - &smem_square_sum[0], - &x_sum, - &x_square_sum); - // final compute if (threadIdx.y == 0) { BatchNormParamType compute_mean_val = x_sum / inner_size; @@ -417,45 +361,21 @@ static __global__ void BNBackward2DChannelLastStage2( } // vertical block sum - BlockReduceByVetical( + funcs::BlockReduceByVetical>( ds_sum, db_sum, &smem_ds_sum[0], &smem_db_sum[0], &ds_sum, &db_sum); if (gridDim.y > 1) { - volatile BatchNormParamType *staging_ds_sum = block_data_ptr; - volatile BatchNormParamType *staging_db_sum = - &block_data_ptr[C * gridDim.y]; - // write block data to global memory - if (threadIdx.y == 0) { - staging_ds_sum[i + blockIdx.y * C] = ds_sum; - staging_db_sum[i + blockIdx.y * C] = db_sum; - } - - // make sure write is visible to all blocks - __threadfence(); - __syncthreads(); - __shared__ bool is_last_block_done; - // mark block done - if (threadIdx.x == 0 && threadIdx.y == 0) { - int old = atomicAdd(&flag_ptr[blockIdx.x], 1); - is_last_block_done = (old == (gridDim.y - 1)); - } - - __syncthreads(); - + funcs::ReduceSumPost>(C, + i, + &ds_sum, + &db_sum, + &is_last_block_done, + smem_ds_sum, + smem_db_sum, + block_data_ptr, + flag_ptr); if (is_last_block_done) { - ds_sum = static_cast>(0); - db_sum = static_cast>(0); - // thread sum - for (int y = threadIdx.y; y < gridDim.y; y += blockDim.y) { - ds_sum += staging_ds_sum[i + y * C]; - db_sum += staging_db_sum[i + y * C]; - } - - // vertical block sum - BlockReduceByVetical( - ds_sum, db_sum, &smem_ds_sum[0], &smem_db_sum[0], &ds_sum, &db_sum); - // final compute if (threadIdx.y == 0) { dscale[i] = ds_sum * inv_var_val; @@ -563,51 +483,6 @@ static __global__ LAUNCH_BOUNDS(BlockDim) void BNBackwardData( } } -template -void SetLaunchConfigInfoForChannelLast(const Context &ctx, - DenseTensor *block_data_tensor, - DenseTensor *flag_tensor, - BatchNormParamType **block_data_ptr, - int **flag_ptr, - const int N, - const int H, - const int W, - const int D, - const int C, - const int block_size, - dim3 *block, - dim3 *grid) { - const int MAX_GRID_SIZE = 128; - const int WARP_SIZE = 32; - - int block_x = std::min(phi::funcs::details::GetLastPow2(C), WARP_SIZE); - int block_y = std::min(phi::funcs::details::GetLastPow2(N * H * W * D / 16), - block_size / block_x); - if (block_x * block_y != block_size) { - block_x = - std::min(phi::funcs::details::GetLastPow2(C), block_size / block_y); - } - int grid_x = (C + block_x - 1) / block_x; - int grid_y = std::min((N * H * W * D + block_y * 16 - 1) / (block_y * 16), - MAX_GRID_SIZE); - - block->x = block_x; - block->y = block_y; - grid->x = grid_x; - grid->y = grid_y; - - if (grid->y > 1) { - *block_data_tensor = - phi::Empty, Context>(ctx, {2 * C * grid->y}); - *flag_tensor = phi::Empty(ctx, {grid->x}); - - *block_data_ptr = block_data_tensor->data>(); - *flag_ptr = flag_tensor->data(); - funcs::SetConstant set_zero; - set_zero(ctx, flag_tensor, static_cast(0)); - } -} - template void BatchNormGradRawKernel(const Context &ctx, const DenseTensor &x, @@ -931,19 +806,20 @@ void BatchNormGradRawKernel(const Context &ctx, BatchNormParamType *block_data_ptr = nullptr; int *flag_ptr = nullptr; - SetLaunchConfigInfoForChannelLast(ctx, - &block_data_tensor, - &flag_tensor, - &block_data_ptr, - &flag_ptr, - N, - H, - W, - D, - C, - block_size, - &block, - &grid); + funcs::SetLaunchConfigInfoForChannelLast>( + ctx, + &block_data_tensor, + &flag_tensor, + &block_data_ptr, + &flag_ptr, + N, + H, + W, + D, + C, + block_size, + &block, + &grid); // 1. 
reduce_sum(x) => mean, inv_var auto *mean_ptr = @@ -1294,19 +1170,20 @@ void BatchNormGradRawKernel(const Context &ctx, BatchNormParamType *block_data_ptr = nullptr; int *flag_ptr = nullptr; - SetLaunchConfigInfoForChannelLast(ctx, - &block_data_tensor, - &flag_tensor, - &block_data_ptr, - &flag_ptr, - N, - H, - W, - D, - C, - block_size, - &block, - &grid); + funcs::SetLaunchConfigInfoForChannelLast>( + ctx, + &block_data_tensor, + &flag_tensor, + &block_data_ptr, + &flag_ptr, + N, + H, + W, + D, + C, + block_size, + &block, + &grid); BNBackward2DChannelLastStage2 <<>>( transformed_d_y.template data(), diff --git a/paddle/phi/kernels/gpu/batch_norm_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_kernel.cu index 60d0d1a01bb30..fc460574b74b7 100644 --- a/paddle/phi/kernels/gpu/batch_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_kernel.cu @@ -30,6 +30,7 @@ namespace cub = hipcub; #include "paddle/phi/kernels/batch_norm_kernel.h" #include "paddle/phi/kernels/funcs/batch_norm_utils.h" #include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/norm_utils.cu.h" #include "paddle/phi/kernels/funcs/norm_utils.h" #include "paddle/phi/kernels/funcs/reduce_function.h" @@ -171,34 +172,6 @@ static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTraining( } } -template -__device__ __forceinline__ void merge_block_vertical( - BatchNormParamType x_sum, - BatchNormParamType x_square_sum, - BatchNormParamType *smem_sum, - BatchNormParamType *smem_square_sum, - BatchNormParamType *x_sum_out, - BatchNormParamType *x_square_sum_out) { - int tid = threadIdx.x + threadIdx.y * blockDim.x; -#pragma unroll - for (int offset = blockDim.y / 2; offset > 0; offset >>= 1) { - if (threadIdx.y < offset * 2) { - smem_sum[tid] = x_sum; - smem_square_sum[tid] = x_square_sum; - } - __syncthreads(); - if (threadIdx.y < offset) { - int pair_tid = tid + offset * blockDim.x; - x_sum += smem_sum[pair_tid]; - x_square_sum += smem_square_sum[pair_tid]; - } - } - if (threadIdx.y == 0) { - *x_sum_out = x_sum; - *x_square_sum_out = x_square_sum; - } -} - template __device__ __forceinline__ void merge_block_horizonal( BatchNormParamType x_sum, @@ -269,53 +242,26 @@ static __global__ void BNForwardTraining2DChannelLastCompStat( } // vertical block sum - merge_block_vertical(x_sum, - x_square_sum, - &smem_sum[0], - &smem_square_sum[0], - &x_sum, - &x_square_sum); + funcs::BlockReduceByVetical>(x_sum, + x_square_sum, + &smem_sum[0], + &smem_square_sum[0], + &x_sum, + &x_square_sum); if (gridDim.y > 1) { - volatile BatchNormParamType *staging_sum = block_data_ptr; - volatile BatchNormParamType *staging_square_sum = - &block_data_ptr[C * gridDim.y]; - // write block data to global memory - if (threadIdx.y == 0) { - staging_sum[i + blockIdx.y * C] = x_sum; - staging_square_sum[i + blockIdx.y * C] = x_square_sum; - } - - // make sure write is visible to all blocks - __threadfence(); - __syncthreads(); - __shared__ bool is_last_block_done; - // mark block done - if (threadIdx.x == 0 && threadIdx.y == 0) { - int old = atomicAdd(&flag_ptr[blockIdx.x], 1); - is_last_block_done = (old == (gridDim.y - 1)); - } - - __syncthreads(); + funcs::ReduceSumPost>(C, + i, + &x_sum, + &x_square_sum, + &is_last_block_done, + smem_sum, + smem_square_sum, + block_data_ptr, + flag_ptr); if (is_last_block_done) { - x_sum = static_cast>(0); - x_square_sum = static_cast>(0); - // thread sum - for (int y = threadIdx.y; y < gridDim.y; y += blockDim.y) { - x_sum += staging_sum[i + y * C]; - x_square_sum += staging_square_sum[i + y * 
C]; - } - - // vertical block sum - merge_block_vertical(x_sum, - x_square_sum, - &smem_sum[0], - &smem_square_sum[0], - &x_sum, - &x_square_sum); - // final compute if (threadIdx.y == 0) { BatchNormParamType compute_mean_val = x_sum / inner_size; diff --git a/paddle/phi/kernels/gpu/sync_batch_norm_utils.h b/paddle/phi/kernels/gpu/sync_batch_norm_utils.h index 81717cd445bc0..71d0ccfa0eb4b 100644 --- a/paddle/phi/kernels/gpu/sync_batch_norm_utils.h +++ b/paddle/phi/kernels/gpu/sync_batch_norm_utils.h @@ -34,6 +34,7 @@ namespace cub = hipcub; #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #include "paddle/phi/backends/gpu/gpu_dnn.h" #include "paddle/phi/common/layout.h" +#include "paddle/phi/kernels/funcs/norm_utils.cu.h" #include "paddle/phi/kernels/funcs/norm_utils.h" namespace phi { @@ -168,6 +169,61 @@ __global__ void KeBackwardLocalStats(const T *dy, } } +template +__global__ void KeBackwardLocalStats2D(const T *dy, + const T *x, + const BatchNormParamType *means, + int N, + int M, + int C, + BatchNormParamType *block_data_ptr, + int *flag_ptr, + BatchNormParamType *sum_dy_prod) { + __shared__ BatchNormParamType smem_sum[BlockDim]; + __shared__ BatchNormParamType smem_square_sum[BlockDim]; + for (int k = blockIdx.x * blockDim.x + threadIdx.x; k < C; + k += gridDim.x * blockDim.x) { + BatchNormParamType sum1 = 0.; + BatchNormParamType sum2 = 0.; + auto mean = means[k]; + for (int i = blockIdx.y * blockDim.y + threadIdx.y; i < N * M; + i += gridDim.y * blockDim.y) { + int id = layout == DataLayout::kNCHW ? (i / M) * C * M + k * M + i % M + : i * C + k; + auto g = static_cast>(dy[id]); + sum1 += g; + auto x_i = static_cast>(x[id]); + sum2 += g * (x_i - mean); + } + funcs::BlockReduceByVetical>( + sum1, sum2, &smem_sum[0], &smem_square_sum[0], &sum1, &sum2); + + if (gridDim.y > 1) { + __shared__ bool is_last_block_done; + funcs::ReduceSumPost>(C, + k, + &sum1, + &sum2, + &is_last_block_done, + smem_sum, + smem_square_sum, + block_data_ptr, + flag_ptr); + if (is_last_block_done) { + // final compute + if (threadIdx.y == 0) { + sum_dy_prod[k] = sum1; + sum_dy_prod[k + C] = sum2; + } + } + } + } + if (blockIdx.y == 0 && blockIdx.x == 0 && threadIdx.y == 0 && + threadIdx.x == 0) { + sum_dy_prod[2 * C] = 1.0; + } +} + template static __global__ void KeBNBackwardScaleBias( const T *dy, @@ -213,6 +269,68 @@ static __global__ void KeBNBackwardScaleBias( } } +template +static __global__ void KeBNBackwardScaleBias2D( + const T *dy, + const T *x, + const BatchNormParamType *mean, + const BatchNormParamType *inv_variance, + const double epsilon, + const int N, + const int C, + const int HxW, + BatchNormParamType *block_data_ptr, + int *flag_ptr, + BatchNormParamType *dscale, + BatchNormParamType *dbias) { + const int outer_size = C; + const int inner_size = N * HxW; + __shared__ BatchNormParamType smem_sum[BlockDim]; + __shared__ BatchNormParamType smem_square_sum[BlockDim]; + + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < outer_size; + i += gridDim.x * blockDim.x) { + BatchNormParamType ds_sum = 0.; + BatchNormParamType db_sum = 0.; + + auto inv_var_i = inv_variance[i]; + auto mean_i = mean[i]; + for (int j = blockIdx.y * blockDim.y + threadIdx.y; j < inner_size; + j += gridDim.y * blockDim.y) { + const int id = layout == DataLayout::kNCHW + ? 
((j / HxW) * C + i) * HxW + (j % HxW) + : j * outer_size + i; + auto x_i = static_cast>(x[id]); + auto dy_i = static_cast>(dy[id]); + ds_sum += dy_i * (x_i - mean_i); + db_sum += dy_i; + } + + funcs::BlockReduceByVetical>( + ds_sum, db_sum, &smem_sum[0], &smem_square_sum[0], &ds_sum, &db_sum); + + if (gridDim.y > 1) { + __shared__ bool is_last_block_done; + funcs::ReduceSumPost>(C, + i, + &ds_sum, + &db_sum, + &is_last_block_done, + smem_sum, + smem_square_sum, + block_data_ptr, + flag_ptr); + if (is_last_block_done) { + // final compute + if (threadIdx.y == 0) { + dscale[i] = ds_sum * inv_var_i; + dbias[i] = db_sum; + } + } + } + } +} + template static __global__ void KeBNRestoreData(T *x, const BatchNormParamType *scale, @@ -410,9 +528,46 @@ void SyncBatchNormGradFunctor( <<>>( dy_d, x_d, saved_mean_ptr, N, fsize, C, stats); } else { - KeBackwardLocalStats - <<>>( - dy_d, x_d, saved_mean_ptr, N, fsize, C, stats); + if (x_dims.size() == 2 && N >= 65535) { + dim3 block; + dim3 grid; + const int block_size = 512; + + // init intermediate storage + DenseTensor block_data_tensor; + DenseTensor flag_tensor; + BatchNormParamType *block_data_ptr = nullptr; + int *flag_ptr = nullptr; + + funcs::SetLaunchConfigInfoForChannelLast>( + ctx, + &block_data_tensor, + &flag_tensor, + &block_data_ptr, + &flag_ptr, + N, + H, + W, + D, + C, + block_size, + &block, + &grid); + KeBackwardLocalStats2D + <<>>(dy_d, + x_d, + saved_mean_ptr, + N, + fsize, + C, + block_data_ptr, + flag_ptr, + stats); + } else { + KeBackwardLocalStats + <<>>( + dy_d, x_d, saved_mean_ptr, N, fsize, C, stats); + } } #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) @@ -476,8 +631,33 @@ void SyncBatchNormGradFunctor( } } else { if (d_scale && d_bias) { - KeBNBackwardScaleBias - <<>>(dy_d, + if (x_dims.size() == 2 && N >= 65535) { + dim3 block; + dim3 grid; + const int block_size = 512; + + // init intermediate storage + DenseTensor block_data_tensor; + DenseTensor flag_tensor; + BatchNormParamType *block_data_ptr = nullptr; + int *flag_ptr = nullptr; + + funcs::SetLaunchConfigInfoForChannelLast>( + ctx, + &block_data_tensor, + &flag_tensor, + &block_data_ptr, + &flag_ptr, + N, + H, + W, + D, + C, + block_size, + &block, + &grid); + KeBNBackwardScaleBias2D + <<>>(dy_d, x_d, saved_mean_ptr, saved_inv_var, @@ -485,8 +665,24 @@ void SyncBatchNormGradFunctor( N, C, fsize, + block_data_ptr, + flag_ptr, d_scale->data>(), d_bias->data>()); + } else { + KeBNBackwardScaleBias + <<>>( + dy_d, + x_d, + saved_mean_ptr, + saved_inv_var, + epsilon, + N, + C, + fsize, + d_scale->data>(), + d_bias->data>()); + } } if (d_x) { KeBNBackwardData<<>>( From 1048b166d9db56dafd93d714ad0bcf9fd92cfaeb Mon Sep 17 00:00:00 2001 From: Roc <30228238+sljlp@users.noreply.github.com> Date: Tue, 31 Jan 2023 15:01:47 +0800 Subject: [PATCH 33/89] fix send start msg (#50085) --- paddle/fluid/distributed/fleet_executor/carrier.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/paddle/fluid/distributed/fleet_executor/carrier.cc b/paddle/fluid/distributed/fleet_executor/carrier.cc index 2b75c3ba066ec..4a759646067df 100644 --- a/paddle/fluid/distributed/fleet_executor/carrier.cc +++ b/paddle/fluid/distributed/fleet_executor/carrier.cc @@ -179,6 +179,7 @@ void Carrier::Start() { "Using carrier before initialized.")); InterceptorMessage start_msg; start_msg.set_dst_id(SOURCE_ID); + start_msg.set_src_id(SOURCE_ID); start_msg.set_message_type(START); Send(start_msg); // TODO(wangxi): async step From 78ec942b7b4918ff710a13b28c79a91e7f2e5a03 Mon Sep 17 00:00:00 2001 
From: RedContritio Date: Tue, 31 Jan 2023 15:08:12 +0800 Subject: [PATCH 34/89] =?UTF-8?q?Fix=20=E7=A9=BA=E6=8C=87=E9=92=88=20(Null?= =?UTF-8?q?=20pointer)=20of=20case15:=20paddle.broadcast=5Ftensors=20(#499?= =?UTF-8?q?80)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix incorrect output shape of broadcast * add unittest --- paddle/phi/infermeta/multiary.cc | 2 +- .../tests/unittests/test_broadcast_tensors_op.py | 14 +++++++++----- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index ef94266b4ebe1..545b3c6f52354 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -791,7 +791,7 @@ void BroadcastTensorsInferMeta(const std::vector& x, // We performed bcast semantics check at python level // So input tensors should all have legal shape - target_dim_size = std::max(target_dim_size, dim_size); + target_dim_size = dim_size == 1 ? target_dim_size : dim_size; } target_dims[target_rank - index - 1] = target_dim_size; } diff --git a/python/paddle/fluid/tests/unittests/test_broadcast_tensors_op.py b/python/paddle/fluid/tests/unittests/test_broadcast_tensors_op.py index 6eec711c49e0a..9879aac254fb7 100644 --- a/python/paddle/fluid/tests/unittests/test_broadcast_tensors_op.py +++ b/python/paddle/fluid/tests/unittests/test_broadcast_tensors_op.py @@ -33,14 +33,12 @@ def find_output_shape(input_list): rank = len(x.shape) output_rank = max(output_rank, rank) - output_shape = [0 for i in range(output_rank)] + output_shape = [1 for i in range(output_rank)] for i in range(output_rank): for x in input_list: shape = list(reversed(x.shape)) - size = 1 - if i < len(shape): - size = shape[i] - output_shape[i] = max(output_shape[i], size) + if i < len(shape) and shape[i] != 1: + output_shape[i] = shape[i] return list(reversed(output_shape)) @@ -80,6 +78,11 @@ def gen_mixed_tensors_test(dtype): return make_inputs_outputs(input_shapes, dtype) +def gen_empty_tensors_test(dtype): + input_shapes = [(0), (0), (0)] + return make_inputs_outputs(input_shapes, dtype) + + class TestCPUBroadcastTensorsOp(OpTest): def set_place(self): self.place = core.CPUPlace() @@ -95,6 +98,7 @@ def setUp(self): gen_rank_diff_test, gen_no_broadcast_test, gen_mixed_tensors_test, + gen_empty_tensors_test, ] self.set_place() self.set_dtypes() From fce05d7d91300179320c5242d1b92ea9de418bf2 Mon Sep 17 00:00:00 2001 From: HongyuJia Date: Tue, 31 Jan 2023 15:17:52 +0800 Subject: [PATCH 35/89] update pybind11, 2.4.3->2.6.0 (#50068) --- cmake/external/pybind11.cmake | 2 +- python/paddle/fluid/dygraph/varbase_patch_methods.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cmake/external/pybind11.cmake b/cmake/external/pybind11.cmake index e236767cec156..49e3111bae9c5 100644 --- a/cmake/external/pybind11.cmake +++ b/cmake/external/pybind11.cmake @@ -16,7 +16,7 @@ include(ExternalProject) set(PYBIND_PREFIX_DIR ${THIRD_PARTY_PATH}/pybind) set(PYBIND_REPOSITORY ${GIT_URL}/pybind/pybind11.git) -set(PYBIND_TAG v2.4.3) +set(PYBIND_TAG v2.6.0) set(PYBIND_INCLUDE_DIR ${THIRD_PARTY_PATH}/pybind/src/extern_pybind/include) include_directories(${PYBIND_INCLUDE_DIR}) diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py index 9f0d8297f349b..d6d45f23146f8 100644 --- a/python/paddle/fluid/dygraph/varbase_patch_methods.py +++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py @@ -1079,7 +1079,7 @@ def 
__hash__(self): # NOTE(zhiqiu): pybind11 will set a default __str__ method of enum class. # So, we need to overwrite it to a more readable one. # See details in https://github.com/pybind/pybind11/issues/2537. - origin = getattr(core.VarDesc.VarType, "__repr__") + origin = getattr(core.VarDesc.VarType, "__str__") def dtype_str(dtype): if dtype in _PADDLE_DTYPE_2_NUMPY_DTYPE: @@ -1092,7 +1092,7 @@ def dtype_str(dtype): # for example, paddle.fluid.core.VarDesc.VarType.LOD_TENSOR return origin(dtype) - setattr(core.VarDesc.VarType, "__repr__", dtype_str) + setattr(core.VarDesc.VarType, "__str__", dtype_str) _already_patch_repr = True # patch math methods for varbase From 5d110365c2a05dcaf7fbe0b808e6d4c533d581e8 Mon Sep 17 00:00:00 2001 From: jameszhang Date: Tue, 31 Jan 2023 15:19:13 +0800 Subject: [PATCH 36/89] [KUNLUN] rename test_pool_max_op.py (#49945) * [KUNLUN] rename test_pool_max_op.py * update xpu toolchain --- cmake/external/xpu.cmake | 2 +- .../xpu/{test_pool_max_op.py => test_pool_max_op_xpu.py} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename python/paddle/fluid/tests/unittests/xpu/{test_pool_max_op.py => test_pool_max_op_xpu.py} (100%) diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index 4711ce83da7d5..7eee112036adc 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -7,7 +7,7 @@ set(XPU_PROJECT "extern_xpu") set(XPU_API_LIB_NAME "libxpuapi.so") set(XPU_RT_LIB_NAME "libxpurt.so") -set(XPU_BASE_DATE "20230114") +set(XPU_BASE_DATE "20230119") set(XPU_XCCL_BASE_VERSION "1.0.7") if(NOT DEFINED XPU_BASE_URL) diff --git a/python/paddle/fluid/tests/unittests/xpu/test_pool_max_op.py b/python/paddle/fluid/tests/unittests/xpu/test_pool_max_op_xpu.py similarity index 100% rename from python/paddle/fluid/tests/unittests/xpu/test_pool_max_op.py rename to python/paddle/fluid/tests/unittests/xpu/test_pool_max_op_xpu.py From 7aaaa1c61eaf2d699d5ef1e946de6566f1d92ce8 Mon Sep 17 00:00:00 2001 From: ronnywang Date: Tue, 31 Jan 2023 15:38:04 +0800 Subject: [PATCH 37/89] Add unified device management api (#48651) * [CustomDevice] add custom device api * update * update * test=document_fix * update * update * add examples --- .../platform/device_event_custom_device.cc | 2 +- paddle/fluid/pybind/CMakeLists.txt | 1 + paddle/fluid/pybind/cuda_streams_py.cc | 4 + paddle/fluid/pybind/custom_device_py.cc | 572 ++++++++++++++++++ paddle/fluid/pybind/custom_device_py.h | 28 + paddle/fluid/pybind/pybind.cc | 2 + paddle/phi/backends/CMakeLists.txt | 22 +- paddle/phi/backends/callback_manager.h | 10 - paddle/phi/backends/custom/custom_context.cc | 14 + paddle/phi/backends/custom/custom_context.h | 8 +- paddle/phi/backends/custom/custom_device.cc | 7 - paddle/phi/backends/device_base.h | 3 - paddle/phi/backends/device_guard.h | 4 - paddle/phi/backends/device_manager.cc | 5 +- paddle/phi/backends/device_manager.h | 5 +- paddle/phi/backends/event.cc | 2 +- paddle/phi/backends/event.h | 2 +- python/paddle/device/__init__.py | 480 +++++++++++++++ python/paddle/device/cuda/__init__.py | 19 + python/paddle/device/xpu/__init__.py | 7 + python/paddle/fluid/core.py | 7 + .../custom_runtime/test_custom_cpu_plugin.py | 29 + 22 files changed, 1189 insertions(+), 44 deletions(-) create mode 100644 paddle/fluid/pybind/custom_device_py.cc create mode 100644 paddle/fluid/pybind/custom_device_py.h diff --git a/paddle/fluid/platform/device_event_custom_device.cc b/paddle/fluid/platform/device_event_custom_device.cc index a45cb43baf2ec..6d284d657818a 100644 --- 
a/paddle/fluid/platform/device_event_custom_device.cc +++ b/paddle/fluid/platform/device_event_custom_device.cc @@ -76,7 +76,7 @@ bool DeviceEventQueryCustomDevice(const DeviceEvent* event) { void DeviceEventFinishCustomDevice(const DeviceEvent* event) { auto* wrapper = static_cast(event->GetEvent().get()); - wrapper->inner_event_->Synchonrize(); + wrapper->inner_event_->Synchronize(); } void DeviceEventCustomDeviceWaitCustomDevice(const DeviceEvent* event, diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index dba7f0d032b39..283b305d71806 100755 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -138,6 +138,7 @@ set(PYBIND_SRCS generator_py.cc communication.cc cuda_streams_py.cc + custom_device_py.cc xpu_streams_py.cc jit.cc auto_parallel_py.cc) diff --git a/paddle/fluid/pybind/cuda_streams_py.cc b/paddle/fluid/pybind/cuda_streams_py.cc index 8898088596e71..41202daa9c521 100644 --- a/paddle/fluid/pybind/cuda_streams_py.cc +++ b/paddle/fluid/pybind/cuda_streams_py.cc @@ -243,6 +243,10 @@ void BindCudaStream(py::module *m_ptr) { print(ptr) )DOC") + .def_property_readonly("place", + [](phi::CUDAStream &self) { + return platform::CUDAPlace(self.place()); + }) #endif .def( "__init__", diff --git a/paddle/fluid/pybind/custom_device_py.cc b/paddle/fluid/pybind/custom_device_py.cc new file mode 100644 index 0000000000000..d3b4183f2f4f0 --- /dev/null +++ b/paddle/fluid/pybind/custom_device_py.cc @@ -0,0 +1,572 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/pybind/custom_device_py.h" + +#include +#include + +#include "paddle/fluid/platform/device_context.h" +#include "paddle/phi/backends/device_manager.h" +#include "paddle/phi/backends/event.h" +#include "paddle/phi/backends/stream.h" + +namespace py = pybind11; + +namespace paddle { +namespace pybind { +void BindCustomDevicePy(py::module *m_ptr) { + auto &m = *m_ptr; + // Bind Methods + m.def( + "_get_current_custom_device_stream", + [](const std::string &device_type, int device_id) { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + auto place = paddle::platform::CustomPlace( + device_type, + device_id == -1 ? phi::DeviceManager::GetDevice(device_type) + : device_id); + + return static_cast( + paddle::platform::DeviceContextPool::Instance().Get(place)) + ->GetStream(); +#else + PADDLE_THROW(platform::errors::Unavailable( + "Paddle is not compiled with CustomDevice. " + "Cannot visit _get_current_custom_device_stream.")); +#endif + }, + py::return_value_policy::reference, + py::arg("device_type"), + py::arg("device_id") = -1); + m.def( + "_set_current_custom_device_stream", + [](const std::string &device_type, + int device_id, + std::shared_ptr stream) { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + auto place = paddle::platform::CustomPlace( + device_type, + device_id == -1 ? 
phi::DeviceManager::GetDevice(device_type) + : device_id); + static_cast( + paddle::platform::DeviceContextPool::Instance().Get(place)) + ->SetStream(stream); + return stream; +#else + PADDLE_THROW(platform::errors::Unavailable( + "Paddle is not compiled with CustomDevice. " + "Cannot visit _set_current_custom_device_stream.")); +#endif + }, + py::arg("device_type"), + py::arg("device_id") = -1, + py::arg("stream") = nullptr); + m.def("_synchronize_custom_device", + [](const std::string &device_type, int device_id) { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + auto place = paddle::platform::CustomPlace( + device_type, + device_id == -1 ? phi::DeviceManager::GetDevice(device_type) + : device_id); + phi::DeviceManager::SynchronizeDevice(place); +#else + PADDLE_THROW(platform::errors::Unavailable( + "Paddle is not compiled with CustomDevice. " + "Cannot visit _synchronize_custom_device.")); +#endif + }); + + py::class_>( + m, "CustomDeviceStream", R"DOC( + The handle of the custom device stream. + + Parameters: + device(paddle.CustomPlace()|str): The device which wanted to allocate the stream. + + device_id(int, optional): The id of the device which wanted to allocate the stream. + If device is None or negative integer, device will be the current device. + If device is positive integer, it must less than the device count. Default: None. + + priority(int|None, optional): The priority of stream. The priority can be 1(high) or 2(normal). + If priority is None, the priority is 2(normal). Default: None. + + blocking(int|None, optional): Whether the stream is executed synchronously. Default: False. + + Examples: + .. code-block:: python + + # required: custom_device + import paddle + s3 = paddle.device.custom.Stream('custom_cpu') + s2 = paddle.device.custom.Stream('custom_cpu', 0) + s1 = paddle.device.custom.Stream(paddle.CustomPlace('custom_cpu')) + s1 = paddle.device.custom.Stream(paddle.CustomPlace('custom_cpu'), 1) + s1 = paddle.device.custom.Stream(paddle.CustomPlace('custom_cpu'), 1, True) + + )DOC") + .def( + "__init__", + [](phi::stream::Stream &self, + const platform::CustomPlace &place, + int priority, + bool blocking) { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + new (&self) phi::stream::Stream(); + self.Init( + place, + static_cast(priority), + static_cast( + blocking ? phi::stream::Stream::Flag::kDefaultFlag + : phi::stream::Stream::Flag::kStreamNonBlocking)); +#else + PADDLE_THROW(platform::errors::Unavailable( + "Paddle is not compiled with CustomDevice. " + "Cannot visit CustomDeviceStream.")); +#endif + }, + py::arg("device"), + py::arg("priority") = 2, + py::arg("blocking") = false) + .def( + "__init__", + [](phi::stream::Stream &self, + const std::string &device_type, + int device_id, + int priority, + bool blocking) { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + new (&self) phi::stream::Stream(); + self.Init( + phi::CustomPlace( + device_type, + device_id == -1 ? phi::DeviceManager::GetDevice(device_type) + : device_id), + static_cast(priority), + static_cast( + blocking ? phi::stream::Stream::Flag::kDefaultFlag + : phi::stream::Stream::Flag::kStreamNonBlocking)); +#else + PADDLE_THROW(platform::errors::Unavailable( + "Paddle is not compiled with CustomDevice. 
" + "Cannot visit CustomDeviceStream.")); +#endif + }, + py::arg("device"), + py::arg("device_id") = -1, + py::arg("priority") = 2, + py::arg("blocking") = false) + .def( + "wait_event", + [](const phi::stream::Stream &self, phi::event::Event *event) { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + self.WaitEvent(event); +#else + PADDLE_THROW(platform::errors::Unavailable( + "Paddle is not compiled with CustomDevice. " + "Cannot visit CustomDeviceStream.")); +#endif + }, + R"DOC( + Makes all future work submitted to stream wait for all work captured in event. + + Parameters: + event(CustomDeviceEvent): The event to wait on. + + Examples: + .. code-block:: python + + # required: custom_device + import paddle + place = paddle.CustomPlace('custom_cpu', 0) + s = paddle.device.custom.Stream(place) + event = paddle.device.custom.Event(place) + s.wait_event(event) + + )DOC") + .def( + "wait_stream", + [](const phi::stream::Stream &self, phi::stream::Stream *other) { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + phi::event::Event event; + event.Init(self.GetPlace()); + event.Record(other); + self.WaitEvent(&event); +#else + PADDLE_THROW(platform::errors::Unavailable( + "Paddle is not compiled with CustomDevice. " + "Cannot visit CustomDeviceStream.")); +#endif + }, + R"DOC( + Synchronizes with the given stream. + + Parameters: + stream(CUDAStream): The stream to synchronize with. + + Examples: + .. code-block:: python + + # required: custom_device + import paddle + place = paddle.CustomPlace('custom_cpu', 0) + s1 = paddle.device.custom.Stream(place) + s2 = paddle.device.custom.Stream(place) + s1.wait_stream(s2) + + )DOC") + .def( + "query", + [](const phi::stream::Stream &self) { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + return self.Query(); +#else + PADDLE_THROW(platform::errors::Unavailable( + "Paddle is not compiled with CustomDevice. " + "Cannot visit CustomDeviceStream.")); +#endif + }, + R"DOC( + Return the status whether if all operations in stream have completed. + + Returns: A boolean value. + + Examples: + .. code-block:: python + + # required: custom_device + import paddle + place = paddle.CustomPlace('custom_cpu', 0) + s = paddle.device.custom.Stream(place) + is_done = s.query() + + )DOC") + .def( + "synchronize", + [](const phi::stream::Stream &self) { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + self.Synchronize(); +#else + PADDLE_THROW(platform::errors::Unavailable( + "Paddle is not compiled with CustomDevice. " + "Cannot visit CustomDeviceStream.")); +#endif + }, + R"DOC( + Waits for stream tasks to complete. + + Examples: + .. code-block:: python + + # required: custom_device + import paddle + place = paddle.CustomPlace('custom_cpu', 0) + s = paddle.device.custom.Stream(place) + s.synchronize() + + )DOC") + .def( + "record_event", + [](const phi::stream::Stream &self, phi::event::Event *event) { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + if (event == nullptr) { + event = new phi::event::Event; + event->Init(self.GetPlace()); + } + event->Record(&self); + return event; +#else + PADDLE_THROW(platform::errors::Unavailable( + "Paddle is not compiled with CustomDevice. " + "Cannot visit CustomDeviceStream.")); +#endif + }, + R"DOC( + Record an event in the stream. + + Parameters: + event(CustomDeviceEvent, optional): The event to be record. If event is None, a new event is created. + Default: None. + + Returns: + The recored event. + + Examples: + .. 
code-block:: python + + # required: custom_device + import paddle + place = paddle.CustomPlace('custom_cpu', 0) + s = paddle.device.custom.Stream(place) + event = s.record_event() + + )DOC", + py::arg("event") = nullptr) + .def_property_readonly( + "raw_stream", + [](const phi::stream::Stream &self) { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + VLOG(10) << self.raw_stream(); + return reinterpret_cast(self.raw_stream()); +#else + PADDLE_THROW(platform::errors::Unavailable( + "Paddle is not compiled with CustomDevice. " + "Cannot visit CustomDeviceStream.")); +#endif + }, + R"DOC( + return the raw stream of type CustomDeviceStream as type int. + + Examples: + .. code-block:: python + + # required: custom_device + import paddle + import ctypes + stream = paddle.device.custom.current_stream().raw_stream + print(stream) + + ptr = ctypes.c_void_p(stream) # convert back to void* + print(ptr) + + )DOC") + .def_property_readonly("place", [](const phi::stream::Stream &self) { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + return reinterpret_cast(self.GetPlace()); +#else + PADDLE_THROW(platform::errors::Unavailable( + "Paddle is not compiled with CustomDevice. " + "Cannot visit CustomDeviceStream.")); +#endif + }); + + py::class_>( + m, "CustomDeviceEvent", R"DOC( + The handle of the custom device event. + + Parameters: + device(paddle.CustomPlace()|str): The device which wanted to allocate the stream. + + device_id(int, optional): The id of the device which wanted to allocate the stream. + If device is None or negative integer, device will be the current device. + If device is positive integer, it must less than the device count. Default: None. + + enable_timing(bool, optional): Whether the event will measure time. Default: False. + + blocking(bool, optional): Whether the wait() func will be blocking. Default: False; + + interprocess(bool, optional): Whether the event can be shared between processes. Default: False. + + Examples: + .. code-block:: python + + # required: custom_device + import paddle + place = paddle.CustomPlace('custom_cpu', 0) + event = paddle.device.custom.Event(place) + + )DOC") + .def( + "__init__", + [](phi::event::Event &self, + const platform::CustomPlace &place, + bool enable_timing, + bool blocking, + bool interprocess) { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + auto flag = static_cast( + static_cast( + enable_timing ? 0 + : phi::event::Event::Flag::DisableTiming) | + static_cast( + !blocking ? 0 : phi::event::Event::Flag::BlockingSync) | + static_cast( + !interprocess ? 0 : phi::event::Event::Flag::Interprocess) + + ); + new (&self) phi::event::Event(); + self.Init(place, flag); +#else + PADDLE_THROW(platform::errors::Unavailable( + "Paddle is not compiled with CustomDevice. " + "Cannot visit CustomDeviceEvent.")); +#endif + }, + py::arg("device"), + py::arg("enable_timing") = false, + py::arg("blocking") = false, + py::arg("interprocess") = false) + .def( + "__init__", + [](phi::event::Event &self, + const std::string &device_type, + int device_id, + bool enable_timing, + bool blocking, + bool interprocess) { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + auto flag = static_cast( + static_cast( + enable_timing ? 0 + : phi::event::Event::Flag::DisableTiming) | + static_cast( + !blocking ? 0 : phi::event::Event::Flag::BlockingSync) | + static_cast( + !interprocess ? 0 : phi::event::Event::Flag::Interprocess) + + ); + new (&self) phi::event::Event(); + self.Init( + phi::CustomPlace( + device_type, + device_id == -1 ? 
phi::DeviceManager::GetDevice(device_type) + : device_id), + flag); +#else + PADDLE_THROW(platform::errors::Unavailable( + "Paddle is not compiled with CustomDevice. " + "Cannot visit CustomDeviceEvent.")); +#endif + }, + py::arg("device"), + py::arg("device_id") = -1, + py::arg("enable_timing") = false, + py::arg("blocking") = false, + py::arg("interprocess") = false) + .def( + "record", + [](phi::event::Event &self, phi::stream::Stream *stream) { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + if (stream == nullptr) { + stream = static_cast( + paddle::platform::DeviceContextPool::Instance().Get( + self.GetPlace())) + ->GetStream() + .get(); + } + self.Record(stream); +#else + PADDLE_THROW(platform::errors::Unavailable( + "Paddle is not compiled with CustomDevice. " + "Cannot visit CustomDeviceEvent.")); +#endif + }, + R"DOC( + Records the event in the given stream. + + Parameters: + stream(CustomDeviceStream, optional): The handle of custom device stream. If None, the stream is the current stream. Default: None. + + Examples: + .. code-block:: python + + # required: custom_device + import paddle + place = paddle.CustomPlace('custom_cpu', 0) + event = paddle.device.custom.Event(place) + event.record() + + )DOC") + .def( + "query", + [](const phi::event::Event &self) { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + return self.Query(); +#else + PADDLE_THROW(platform::errors::Unavailable( + "Paddle is not compiled with CustomDevice. " + "Cannot visit CustomDeviceEvent.")); +#endif + }, + R"DOC( + Queries the event's status. + + Returns: A boolean which indicates all work currently captured by the event has been completed. + + Examples: + .. code-block:: python + + # required: custom_device + import paddle + place = paddle.CustomPlace('custom_cpu', 0) + event = paddle.device.cuda.Event(place) + is_done = event.query() + + )DOC") + .def( + "synchronize", + [](const phi::event::Event &self) { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + self.Synchronize(); +#else + PADDLE_THROW(platform::errors::Unavailable( + "Paddle is not compiled with CustomDevice. " + "Cannot visit CustomDeviceEvent.")); +#endif + }, + R"DOC( + Waits for an event to complete. + + Examples: + .. code-block:: python + + # required: custom_device + import paddle + place = paddle.CustomPlace('custom_cpu', 0) + event = paddle.device.custom.Event(place) + event.synchronize() + + )DOC") + .def_property_readonly( + "raw_event", + [](const phi::event::Event &self) { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + VLOG(10) << self.raw_event(); + return reinterpret_cast(self.raw_event()); +#else + PADDLE_THROW(platform::errors::Unavailable( + "Paddle is not compiled with CustomDevice. " + "Cannot visit CustomDeviceEvent.")); +#endif + }, + R"DOC( + return the raw event of type CustomDeviceEvent as type int. + + Examples: + .. code-block:: python + + # required: custom_device + import paddle + import ctypes + place = paddle.CustomPlace('custom_cpu', 0) + event = paddle.device.custom.Event(place) + raw_event = event.raw_event + print(raw_event) + + ptr = ctypes.c_void_p(raw_event) # convert back to void* + print(ptr) + + )DOC") + .def_property_readonly("place", [](const phi::event::Event &self) { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + return reinterpret_cast(self.GetPlace()); +#else + PADDLE_THROW(platform::errors::Unavailable( + "Paddle is not compiled with CustomDevice. 
" + "Cannot visit CustomDeviceEvent.")); +#endif + }); +} +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/custom_device_py.h b/paddle/fluid/pybind/custom_device_py.h new file mode 100644 index 0000000000000..26aed199bc729 --- /dev/null +++ b/paddle/fluid/pybind/custom_device_py.h @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "pybind11/pybind11.h" +#include "pybind11/stl.h" + +namespace py = pybind11; + +namespace paddle { +namespace pybind { + +void BindCustomDevicePy(py::module* m); + +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index d2f622537216b..36e2436406812 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -88,6 +88,7 @@ limitations under the License. */ #include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/fluid/platform/profiler/profiler.h" #include "paddle/fluid/pybind/cuda_streams_py.h" +#include "paddle/fluid/pybind/custom_device_py.h" #include "paddle/fluid/pybind/distributed_py.h" #include "paddle/fluid/pybind/eager.h" #include "paddle/fluid/pybind/imperative.h" @@ -629,6 +630,7 @@ PYBIND11_MODULE(libpaddle, m) { BindCudaStream(&m); BindXpuStream(&m); BindJit(&m); + BindCustomDevicePy(&m); // Not used, just make sure cpu_info.cc is linked. phi::backends::cpu::CpuTotalPhysicalMemory(); diff --git a/paddle/phi/backends/CMakeLists.txt b/paddle/phi/backends/CMakeLists.txt index cfb55565a2eb3..3b2314b0963cf 100644 --- a/paddle/phi/backends/CMakeLists.txt +++ b/paddle/phi/backends/CMakeLists.txt @@ -36,18 +36,18 @@ if(WITH_MKLDNN) list(APPEND BACKENDS_DEPS mkldnn) endif() +list( + APPEND + BACKENDS_SRCS + callback_manager.cc + device_guard.cc + stream.cc + event.cc + device_base.cc + device_manager.cc) + if(WITH_CUSTOM_DEVICE) - list( - APPEND - BACKENDS_SRCS - callback_manager.cc - device_guard.cc - stream.cc - event.cc - device_base.cc - device_manager.cc - custom/custom_context.cc - custom/custom_device.cc) + list(APPEND BACKENDS_SRCS custom/custom_context.cc custom/custom_device.cc) endif() add_library(phi_backends "${BACKENDS_SRCS}") diff --git a/paddle/phi/backends/callback_manager.h b/paddle/phi/backends/callback_manager.h index 2bb26745288df..1a5c201620a49 100644 --- a/paddle/phi/backends/callback_manager.h +++ b/paddle/phi/backends/callback_manager.h @@ -13,16 +13,6 @@ // limitations under the License. 
#pragma once - -#ifdef PADDLE_WITH_CUDA -#include -#include -#endif - -#ifdef PADDLE_WITH_HIP -#include -#endif - #include #include // NOLINT #include diff --git a/paddle/phi/backends/custom/custom_context.cc b/paddle/phi/backends/custom/custom_context.cc index e34e0f94b7067..14c2afe3950ba 100644 --- a/paddle/phi/backends/custom/custom_context.cc +++ b/paddle/phi/backends/custom/custom_context.cc @@ -36,6 +36,12 @@ struct CustomContext::Impl { return reinterpret_cast(stream_->raw_stream()); } + std::shared_ptr GetStream() const { return stream_; } + + void SetStream(std::shared_ptr stream) { + stream_ = stream; + } + void Wait() const { stream_->Wait(); } Place place_; @@ -49,6 +55,14 @@ const Place& CustomContext::GetPlace() const { return impl_->GetPlace(); } void* CustomContext::stream() const { return impl_->stream(); } +std::shared_ptr CustomContext::GetStream() const { + return impl_->GetStream(); +} + +void CustomContext::SetStream(std::shared_ptr stream) { + impl_->SetStream(stream); +} + void CustomContext::Wait() const { return impl_->Wait(); } CustomContext::CustomContext(const CustomPlace& place) diff --git a/paddle/phi/backends/custom/custom_context.h b/paddle/phi/backends/custom/custom_context.h index d007cb62cd4f9..18d0dfedb2188 100644 --- a/paddle/phi/backends/custom/custom_context.h +++ b/paddle/phi/backends/custom/custom_context.h @@ -16,6 +16,7 @@ limitations under the License. */ #include +#include "paddle/phi/backends/stream.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/device_context.h" @@ -30,9 +31,14 @@ class CustomContext : public DeviceContext, const Place& GetPlace() const override; - /*! \brief Return stream in the device context. */ + /*! \brief Return raw stream in the device context. */ void* stream() const; + /*! \brief Return stream in the device context. */ + std::shared_ptr GetStream() const; + + void SetStream(std::shared_ptr stream); + // Wait for all operations completion in the stream. void Wait() const override; diff --git a/paddle/phi/backends/custom/custom_device.cc b/paddle/phi/backends/custom/custom_device.cc index 2c986df278173..c0e28a90e9ff3 100644 --- a/paddle/phi/backends/custom/custom_device.cc +++ b/paddle/phi/backends/custom/custom_device.cc @@ -146,13 +146,6 @@ class CustomDevice : public DeviceInterface { stream::Stream::Priority::kNormal, const stream::Stream::Flag& flag = stream::Stream::Flag::kDefaultFlag) override { - if (priority != stream::Stream::Priority::kNormal || - flag != stream::Stream::Flag::kDefaultFlag) { - PADDLE_THROW(phi::errors::Unavailable( - "priority != stream::Stream::Priority::kNormal || flag != " - "stream::Stream::Flag::kDefaultFlag is not allowed on " - "CustomDevice.")); - } const auto device = &devices_pool[dev_id]; C_Stream c_stream; PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( diff --git a/paddle/phi/backends/device_base.h b/paddle/phi/backends/device_base.h index 7030777474d5a..893aa39d8a51b 100644 --- a/paddle/phi/backends/device_base.h +++ b/paddle/phi/backends/device_base.h @@ -13,7 +13,6 @@ // limitations under the License. #pragma once -#ifdef PADDLE_WITH_CUSTOM_DEVICE #include #include "paddle/phi/backends/c_comm_lib.h" @@ -275,5 +274,3 @@ class DeviceInterface { // Driver / Runtime }; } // namespace phi - -#endif diff --git a/paddle/phi/backends/device_guard.h b/paddle/phi/backends/device_guard.h index 668951f8a1c98..eb14236d251b3 100644 --- a/paddle/phi/backends/device_guard.h +++ b/paddle/phi/backends/device_guard.h @@ -13,8 +13,6 @@ // limitations under the License. 
#pragma once -#ifdef PADDLE_WITH_CUSTOM_DEVICE - #include "paddle/phi/backends/device_manager.h" namespace phi { @@ -46,5 +44,3 @@ class DeviceGuard { }; } // namespace phi - -#endif diff --git a/paddle/phi/backends/device_manager.cc b/paddle/phi/backends/device_manager.cc index 2bb57ab8fe6ea..69c2d9d088cfe 100644 --- a/paddle/phi/backends/device_manager.cc +++ b/paddle/phi/backends/device_manager.cc @@ -12,7 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifdef PADDLE_WITH_CUSTOM_DEVICE #include "paddle/phi/backends/device_manager.h" #include "paddle/phi/common/complex.h" @@ -663,6 +662,8 @@ std::vector ListAllLibraries(const std::string& library_dir) { std::vector libraries; std::regex express(".*\\.so"); std::match_results results; + +#if !defined(_WIN32) DIR* dir = nullptr; dirent* ptr = nullptr; @@ -680,9 +681,9 @@ std::vector ListAllLibraries(const std::string& library_dir) { } closedir(dir); } +#endif return libraries; } } // namespace phi -#endif diff --git a/paddle/phi/backends/device_manager.h b/paddle/phi/backends/device_manager.h index 130f8fab449ac..990157da0f462 100644 --- a/paddle/phi/backends/device_manager.h +++ b/paddle/phi/backends/device_manager.h @@ -13,7 +13,6 @@ // limitations under the License. #pragma once -#ifdef PADDLE_WITH_CUSTOM_DEVICE #include @@ -285,12 +284,14 @@ class DeviceManager { std::vector ListAllLibraries(const std::string& library_dir); +#ifdef PADDLE_WITH_CUSTOM_DEVICE void LoadCustomRuntimeLib(const std::string& dso_lib_path, void* dso_handle); void LoadCustomRuntimeLib(const CustomRuntimeParams& runtime_params, std::unique_ptr device_interface, const std::string& dso_lib_path, void* dso_handle); +#endif class Registrar { public: @@ -303,5 +304,3 @@ class Registrar { }; } // namespace phi - -#endif diff --git a/paddle/phi/backends/event.cc b/paddle/phi/backends/event.cc index b594d919abc18..7d87318cfec55 100644 --- a/paddle/phi/backends/event.cc +++ b/paddle/phi/backends/event.cc @@ -59,7 +59,7 @@ void Event::Record(const stream::Stream* stream) { stream->RecordEvent(this); } bool Event::Query() const { return device_->QueryEvent(this); } -void Event::Synchonrize() const { device_->SynchronizeEvent(this); } +void Event::Synchronize() const { device_->SynchronizeEvent(this); } const Place& Event::GetPlace() const { return place_; } diff --git a/paddle/phi/backends/event.h b/paddle/phi/backends/event.h index 8de223528f8fd..a58083ff2898f 100644 --- a/paddle/phi/backends/event.h +++ b/paddle/phi/backends/event.h @@ -46,7 +46,7 @@ class Event { void Destroy(); void Record(const stream::Stream* stream); bool Query() const; - void Synchonrize() const; + void Synchronize() const; const Place& GetPlace() const; private: diff --git a/python/paddle/device/__init__.py b/python/paddle/device/__init__.py index 2751009dd3090..defb6321847c2 100644 --- a/python/paddle/device/__init__.py +++ b/python/paddle/device/__init__.py @@ -15,6 +15,8 @@ # TODO: define the functions to manipulate devices import re import os +import ctypes +import paddle from paddle.fluid import core from paddle.fluid import framework from paddle.fluid.dygraph.parallel import ParallelEnv @@ -43,6 +45,12 @@ 'get_all_custom_device_type', 'get_available_device', 'get_available_custom_device', + 'Stream', + 'Event', + 'current_stream', + 'set_stream', + 'stream_guard', + 'synchronize', ] _cudnn_version = None @@ -514,3 +522,475 @@ def get_available_custom_device(): # Output: ['CustomCPU', 'CustomGPU:0', 'CustomGPU:1'] """ return 
core.get_available_custom_device() + + +class Event(object): + ''' + A device event wrapper around StreamBase. + Parameters: + device(str|paddle.CUDAPlace(n)|paddle.CustomPlace(n)): Which device the stream runn on. If device is None, the device is the current device. Default: None. + It can be ``gpu``, ``gpu:x``,``custom_device``, ``custom_device:x``, where ``custom_device`` is the name of CustomDevicec, + where ``x`` is the index of the GPUs, XPUs, NPUs or MLUs. And it can be paddle.CUDAPlace(n) or paddle.CustomPlace(n). + enable_timing (bool, optional): indicates if the event should measure time, default is False + blocking (bool, optional): if True, ``wait`` will be blocking, default is False + interprocess (bool): if True, the event can be shared between processes, default is False + Returns: + Event: The event. + Examples: + .. code-block:: python + # required: custom_device + import paddle + e1 = paddle.device.Event() + e2 = paddle.device.Event('custom_cpu') + e3 = paddle.device.Event('custom_cpu:0') + e4 = paddle.device.Event(paddle.CustomPlace('custom_cpu', 0)) + ''' + + def __init__( + self, + device=None, + enable_timing=False, + blocking=False, + interprocess=False, + ): + if device is None: + self.device = paddle.framework._current_expected_place() + elif isinstance(device, str): + self.device = paddle.device._convert_to_place(device) + else: + self.device = device + + if paddle.is_compiled_with_cuda() and isinstance( + self.device, paddle.CUDAPlace + ): + self.event_base = core.CUDAEvent( + enable_timing, blocking, interprocess + ) + elif isinstance(self.device, paddle.CustomPlace): + self.event_base = core.CustomDeviceEvent( + self.device.get_device_type(), + self.device.get_device_id(), + enable_timing, + blocking, + interprocess, + ) + else: + raise TypeError( + "device should be gpu, xpu, {}".format( + ",".join(paddle.device.get_all_custom_device_type()) + ) + ) + + def record(self, stream=None): + ''' + Records the event in a given stream. + Parameters: + stream(Stream, optional): The given stream. By default, stream is None, + event will be recorded in current_stream. + Returns: + None. + Examples: + .. code-block:: python + # required: custom_device + import paddle + e = paddle.device.Event() + e.record() + + s = paddle.device.Stream() + e.record(s) + ''' + if stream is None: + stream = current_stream(self.device) + + self.event_base.record(stream.stream_base) + + def query(self): + ''' + Checks if all work currently captured by event has completed. + Returns: + bool: Whether all work currently captured by event has completed. + Examples: + .. code-block:: python + # required: custom_device + import paddle + e = paddle.device.Event() + e.query() + ''' + return self.event_base.query() + + def elapsed_time(self, end_event): + ''' + Returns the time elapsed in milliseconds after the event was + recorded and before the end_event was recorded. + Returns: + int: The time. + Examples: + .. code-block:: python + # required: custom_device + import paddle + e1 = paddle.device.Event() + e2 = paddle.device.Event() + e1.elapsed_time(e2) + ''' + return 0 + + def synchronize(self): + ''' + Waits for the event to complete. + Waits until the completion of all work currently captured in this event. + This prevents the CPU thread from proceeding until the event completes. + Returns: + None. + Examples: + .. 
code-block:: python + # required: custom_device + import paddle + e = paddle.device.Event() + e.synchronize() + ''' + self.event_base.synchronize() + + def __repr__(self): + return self.event_base + + +class Stream(object): + ''' + A device stream wrapper around StreamBase. + Parameters: + device(str|paddle.CUDAPlace(n)|paddle.CustomPlace(n)): Which device the stream runn on. If device is None, the device is the current device. Default: None. + It can be ``gpu``, ``gpu:x``,``custom_device``, ``custom_device:x``, where ``custom_device`` is the name of CustomDevicec, + where ``x`` is the index of the GPUs, XPUs, NPUs or MLUs. And it can be paddle.CUDAPlace(n) or paddle.CustomPlace(n). + priority(int, optional): priority of the CUDA stream. Can be either + 1 (high priority) or 2 (low priority). By default, streams have + priority 2. + Returns: + Stream: The stream. + Examples: + .. code-block:: python + # required: custom_device + import paddle + s1 = paddle.device.Stream() + s2 = paddle.device.Stream('custom_cpu') + s3 = paddle.device.Stream('custom_cpu:0') + s4 = paddle.device.Stream(paddle.CustomPlace('custom_cpu', 0)) + ''' + + def __init__(self, device=None, priority=2, stream_base=None): + if stream_base is not None: + if isinstance( + stream_base, (core.CUDAStream, core.CustomDeviceStream) + ): + self.stream_base = stream_base + self.device = stream_base.place + else: + raise TypeError( + "stream_base should be CUDAStream, CustomDeviceStream" + ) + return + + if device is None: + self.device = paddle.framework._current_expected_place() + elif isinstance(device, str): + self.device = paddle.device._convert_to_place(device) + else: + self.device = device + + if paddle.is_compiled_with_cuda() and isinstance( + self.device, paddle.CUDAPlace + ): + self.stream_base = core.CUDAStream( + self.device.get_device_id(), priority + ) + elif isinstance(self.device, paddle.CustomPlace): + self.stream_base = core.CustomDeviceStream( + self.device.get_device_type(), + self.device.get_device_id(), + priority, + blocking=False, + ) + else: + raise TypeError( + "device should be gpu, xpu, {}".format( + ",".join(paddle.device.get_all_custom_device_type()) + ) + ) + + def wait_event(self, event): + ''' + Makes all future work submitted to the stream wait for an event. + Parameters: + event (Event): an event to wait for. + Returns: + None. + Examples: + .. code-block:: python + # required: custom_device + import paddle + s = paddle.device.Stream() + e = paddle.device.Event() + s.wait_event(e) + ''' + self.stream_base.wait_event(event.event_base) + + def wait_stream(self, stream): + ''' + Synchronizes with another stream. + All future work submitted to this stream will wait until all kernels + submitted to a given stream at the time of call complete. + Parameters: + stream (Stream): a stream to synchronize. + Returns: + None. + Examples: + .. code-block:: python + # required: custom_device + import paddle + s1 = paddle.device.Stream() + s2 = paddle.device.Stream() + s1.wait_stream(s2) + ''' + self.stream_base.wait_stream(stream.stream_base) + + def record_event(self, event=None): + ''' + Records an event. + Parameters: + event (Event, optional): event to record. If not given, a new one + will be allocated. + Returns: + Event: Recorded event. + Examples: + .. 
code-block:: python + # required: custom_device + import paddle + s = paddle.device.Stream() + e1 = s.record_event() + + e2 = paddle.device.Event() + s.record_event(e2) + ''' + if event is None: + event = Event(self.device) + event.record(self) + return event + + def query(self): + ''' + Checks if all the work submitted has been completed. + Returns: + bool: Whether all kernels in this stream are completed. + Examples: + .. code-block:: python + # required: custom_device + import paddle + s = paddle.device.Stream() + s.query() + ''' + return self.stream_base.query() + + def synchronize(self): + ''' + Wait for all the kernels in this stream to complete. + Returns: + None. + Examples: + .. code-block:: python + # required: custom_device + import paddle + s = paddle.device.Stream() + s.synchronize() + ''' + self.stream_base.synchronize() + + @property + def _as_parameter_(self): + if isinstance(self.stream_base, core.CUDAStream): + return ctypes.c_void_p(self.stream_base.cuda_stream) + else: + return ctypes.c_void_p(self.stream_base.raw_stream) + + def __eq__(self, o): + if isinstance(o, Stream): + return super(Stream, self).__eq__(o) + return False + + def __hash__(self): + return hash((self.stream_base, self.device)) + + def __repr__(self): + return ''.format( + self.device, self._as_parameter_.value + ) + + +def current_stream(device=None): + ''' + Return the current stream by the device. + Parameters: + device(str|paddle.CUDAPlace(n)|paddle.CustomPlace(n)): The device which want to get stream from. If device is None, the device is the current device. Default: None. + It can be ``gpu``, ``gpu:x``,``custom_device``, ``custom_device:x``, where ``custom_device`` is the name of CustomDevicec, + where ``x`` is the index of the GPUs, CustomDevicecs. And it can be paddle.CUDAPlace(n) or paddle.CustomPlace(n). + Returns: + Stream: The stream to the device. + Examples: + .. code-block:: python + # required: custom_device + import paddle + s1 = paddle.device.current_stream() + s2 = paddle.device.current_stream("gpu:0") + place = paddle.CustomPlace('custom_cpu', 0) + s3 = paddle.device.current_stream(place) + ''' + if device is None: + place = paddle.framework._current_expected_place() + elif isinstance(device, str): + place = paddle.device._convert_to_place(device) + else: + place = device + + if paddle.is_compiled_with_cuda() and isinstance(place, paddle.CUDAPlace): + return Stream( + stream_base=core._get_current_stream(place.get_device_id()) + ) + elif isinstance(place, paddle.CustomPlace): + return Stream( + stream_base=core._get_current_custom_device_stream( + place.get_device_type(), place.get_device_id() + ) + ) + else: + raise TypeError( + "device should be gpu, xpu, {}".format( + ",".join(paddle.device.get_all_custom_device_type()) + ) + ) + + +def set_stream(stream): + ''' + Set the current stream. + Parameters: + stream(Stream): The selected stream. + Returns: + Stream: The previous stream. + Examples: + .. 
code-block:: python + # required: custom_device + import paddle + s = paddle.device.Stream() + paddle.device.set_stream(s) + ''' + + prev_stream = current_stream(stream.stream_base.place) + + if paddle.is_compiled_with_cuda() and isinstance( + stream.stream_base.place, paddle.CUDAPlace + ): + core._set_current_stream(stream.stream_base) + elif isinstance(stream.stream_base.place, paddle.CustomPlace): + core._set_current_custom_device_stream( + stream.stream_base.place.get_device_type(), + stream.stream_base.place.get_device_id(), + stream.stream_base, + ) + else: + raise TypeError( + "device should be gpu, xpu, {}".format( + ",".join(paddle.device.get_all_custom_device_type()) + ) + ) + + return prev_stream + + +class stream_guard(object): + ''' + Notes: + This API only supports dynamic graph mode currently. + A context manager that specifies the current stream context by the given stream. + Parameters: + stream(Stream, optional): the selected stream. If stream is None, just yield. + Returns: + None. + Examples: + .. code-block:: python + # required: custom_device + import paddle + s = paddle.device.Stream() + data1 = paddle.ones(shape=[20]) + data2 = paddle.ones(shape=[20]) + data3 = data1 + data2 + with paddle.device.stream_guard(s): + s.wait_stream(paddle.device.default_stream()) + data4 = data1 + data3 + ''' + + def __init__(self, stream=None): + self.stream = stream + + def __enter__(self): + cur_stream = self.stream + if cur_stream is None: + return + + self.src_prev_stream = current_stream(cur_stream.device) + if self.src_prev_stream.device != cur_stream.device: + self.tmp_place = paddle.fluid.framework._current_expected_place() + paddle.fluid.framework._set_expected_place(cur_stream.device) + self.dst_prev_stream = current_stream(cur_stream.device) + set_stream(cur_stream) + else: + set_stream(cur_stream) + + def __exit__(self, *args): + cur_stream = self.stream + if cur_stream is None: + return + + if self.src_prev_stream.device != cur_stream.device: + set_stream(self.dst_prev_stream) + paddle.fluid.framework._set_expected_place(self.tmp_place) + set_stream(self.src_prev_stream) + else: + set_stream(self.src_prev_stream) + + +def synchronize(device=None): + ''' + Wait for the compute on the given device to finish. + Parameters: + device(str|paddle.CUDAPlace(n)|paddle.XPUPlace(n)|paddle.CustomPlace(n)): The device which want to wait for. If device is None, the device is the current device. Default: None. + It can be ``gpu``, ``gpu:x``, ``xpu``, ``xpu:x``, ``custom_device``, ``custom_device:x``, where ``custom_device`` is the name of CustomDevicec, + where ``x`` is the index of the GPUs, XPUs, NPUs or MLUs. And it can be paddle.CUDAPlace(n) or paddle.XPUPlace(n) or paddle.CustomPlace(n). + Examples: + .. 
code-block:: python + # required: custom_device + import paddle + paddle.device.synchronize() + paddle.device.synchronize("gpu:0") + place = paddle.CustomPlace('custom_cpu', 0) + paddle.device.synchronize(place) + ''' + + if device is None: + place = paddle.framework._current_expected_place() + elif isinstance(device, str): + place = paddle.device._convert_to_place(device) + else: + place = device + + if paddle.is_compiled_with_cuda() and isinstance(place, paddle.CUDAPlace): + core._device_synchronize(place.get_device_id()) + elif paddle.is_compiled_with_xpu() and isinstance(place, paddle.XPUPlace): + core._xpu_device_synchronize(place.get_device_id()) + elif isinstance(place, paddle.CustomPlace): + core._synchronize_custom_device( + place.get_device_type(), place.get_device_id() + ) + else: + raise TypeError( + "device should be gpu, xpu, {}".format( + ",".join(paddle.device.get_all_custom_device_type()) + ) + ) diff --git a/python/paddle/device/cuda/__init__.py b/python/paddle/device/cuda/__init__.py index 188e334c2f1ba..fed666e2e8e08 100644 --- a/python/paddle/device/cuda/__init__.py +++ b/python/paddle/device/cuda/__init__.py @@ -15,6 +15,7 @@ import paddle from paddle.fluid import core from paddle.fluid.wrapped_decorator import signature_safe_contextmanager +from paddle.utils import deprecated from .streams import Stream # noqa: F401 from .streams import Event # noqa: F401 @@ -37,6 +38,12 @@ ] +@deprecated( + since="2.5.0", + update_to="paddle.device.current_stream", + level=1, + reason="current_stream in paddle.device.cuda will be removed in future", +) def current_stream(device=None): ''' Return the current CUDA stream by the device. @@ -75,6 +82,12 @@ def current_stream(device=None): return core._get_current_stream(device_id) +@deprecated( + since="2.5.0", + update_to="paddle.device.synchronize", + level=1, + reason="synchronize in paddle.device.cuda will be removed in future", +) def synchronize(device=None): ''' Wait for the compute on the given CUDA device to finish. @@ -352,6 +365,12 @@ def _set_current_stream(stream): return core._set_current_stream(stream) +@deprecated( + since="2.5.0", + update_to="paddle.device.stream_guard", + level=1, + reason="stream_guard in paddle.device.cuda will be removed in future", +) @signature_safe_contextmanager def stream_guard(stream): ''' diff --git a/python/paddle/device/xpu/__init__.py b/python/paddle/device/xpu/__init__.py index a928a0f7c0405..832c1baa63153 100644 --- a/python/paddle/device/xpu/__init__.py +++ b/python/paddle/device/xpu/__init__.py @@ -14,12 +14,19 @@ import paddle from paddle.fluid import core +from paddle.utils import deprecated __all__ = [ 'synchronize', ] +@deprecated( + since="2.5.0", + update_to="paddle.device.synchronize", + level=1, + reason="synchronize in paddle.device.xpu will be removed in future", +) def synchronize(device=None): ''' Wait for the compute on the given XPU device to finish. 
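A minimal usage sketch of the unified device API introduced by this patch (not part of the diff itself); it assumes Paddle is built with a CustomDevice plugin registered as 'custom_cpu', the same assumption made by test_custom_cpu_plugin.py below, and only uses names added here (paddle.device.Stream, Event, current_stream, set_stream, stream_guard, synchronize):

    import paddle

    # Assumption: a CustomDevice plugin registered as 'custom_cpu' is available.
    paddle.set_device('custom_cpu')
    paddle.device.synchronize('custom_cpu')      # wait for all outstanding work on the device

    s = paddle.device.Stream()                   # stream on the current (custom_cpu) device
    e = paddle.device.Event()                    # event on the current device

    e.record(s)                                  # capture the stream's pending work in the event
    s.wait_event(e)                              # future work on s waits for e to complete
    print(s.query(), e.query())                  # poll completion without blocking

    prev = paddle.device.set_stream(s)           # make s current; keep the previous stream
    with paddle.device.stream_guard(prev):       # temporarily switch back inside the guard
        paddle.device.current_stream().synchronize()
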
diff --git a/python/paddle/fluid/core.py b/python/paddle/fluid/core.py index b17c29a97868a..9aaf0f684f1e7 100644 --- a/python/paddle/fluid/core.py +++ b/python/paddle/fluid/core.py @@ -314,6 +314,13 @@ def to_list(s): from .libpaddle import _is_fwd_prim_enabled from .libpaddle import __set_all_prim_enabled + # custom devivce + from .libpaddle import _get_current_custom_device_stream + from .libpaddle import _set_current_custom_device_stream + from .libpaddle import _synchronize_custom_device + from .libpaddle import CustomDeviceStream + from .libpaddle import CustomDeviceEvent + if sys.platform != 'win32': from .libpaddle import _set_process_pids from .libpaddle import _erase_process_pids diff --git a/python/paddle/fluid/tests/custom_runtime/test_custom_cpu_plugin.py b/python/paddle/fluid/tests/custom_runtime/test_custom_cpu_plugin.py index 3139f13127d6a..80b91ca8f0133 100755 --- a/python/paddle/fluid/tests/custom_runtime/test_custom_cpu_plugin.py +++ b/python/paddle/fluid/tests/custom_runtime/test_custom_cpu_plugin.py @@ -56,6 +56,7 @@ def test_custom_device(self): self._test_eager_copy_to() self._test_fallback_kernel() self._test_scalar() + self._test_custom_device_py_api() def _test_custom_device_dataloader(self): import paddle @@ -257,6 +258,34 @@ def forward(self, inputs, label=None): avg_loss.backward() sgd.step() + def _test_custom_device_py_api(self): + import paddle + + p = paddle.set_device('custom_cpu') + paddle.device.synchronize('custom_cpu') + + s1 = paddle.device.Stream() + s2 = paddle.device.Stream(p) + + s1 = paddle.device.current_stream() + s2 = paddle.device.current_stream(p) + + e1 = paddle.device.Event() + e2 = paddle.device.Event(p) + + s = paddle.device.Stream() + e = paddle.device.Event() + s.query() + s.synchronize() + s.wait_event(e) + s.record_event(e) + s.wait_stream(s) + paddle.device.set_stream(s) + + e.query() + e.synchronize() + e.record(s) + if __name__ == '__main__': if os.name == 'nt' or sys.platform.startswith('darwin'): From 96a0ce60d215b8525f63a87594ea1080bc27f174 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=98=A5=E4=B9=94?= <83450930+Liyulingyue@users.noreply.github.com> Date: Tue, 31 Jan 2023 15:42:18 +0800 Subject: [PATCH 38/89] fix div 0 error of NoamDecay (#49953) * fix div 0 error of NoamDecay * add unittest * Update lr.py --- .../tests/unittests/test_noamdecay_op.py | 34 +++++++++++++++++++ python/paddle/optimizer/lr.py | 3 ++ 2 files changed, 37 insertions(+) create mode 100644 python/paddle/fluid/tests/unittests/test_noamdecay_op.py diff --git a/python/paddle/fluid/tests/unittests/test_noamdecay_op.py b/python/paddle/fluid/tests/unittests/test_noamdecay_op.py new file mode 100644 index 0000000000000..62312c7a8b9f0 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_noamdecay_op.py @@ -0,0 +1,34 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import paddle + + +class TestSparseEmbeddingAPIError(unittest.TestCase): + def test_errors(self): + with paddle.fluid.dygraph.guard(): + # The size of input in sparse_embedding should not be 0. + def test_0_d_model(): + schedular = paddle.optimizer.lr.NoamDecay( + d_model=0, warmup_steps=0 + ) + + self.assertRaises(ValueError, test_0_d_model) + + +if __name__ == '__main__': + paddle.enable_static() + unittest.main() diff --git a/python/paddle/optimizer/lr.py b/python/paddle/optimizer/lr.py index 07420be8915d3..bc5f9020b7f30 100644 --- a/python/paddle/optimizer/lr.py +++ b/python/paddle/optimizer/lr.py @@ -296,6 +296,9 @@ def __init__( last_epoch=-1, verbose=False, ): + if d_model <= 0: + raise ValueError("d_model should be grater than 0") + self.d_model = d_model self.warmup_steps = warmup_steps super().__init__(learning_rate, last_epoch, verbose) From 86a238186d39286a9ee5af6d2defb0ab80f324c1 Mon Sep 17 00:00:00 2001 From: PuQing Date: Tue, 31 Jan 2023 15:46:18 +0800 Subject: [PATCH 39/89] [Numpy] Add FP16 dtype for CastNumpy2Scalar (#50002) * add FP16 dtype for CastNumpy2Scalar * fix throw message * add test * fix SyntaxWarning * test skip for float16 * fix dtype mistakes --- paddle/fluid/pybind/eager_utils.cc | 5 ++++- paddle/fluid/pybind/op_function_common.cc | 6 ++++++ paddle/fluid/pybind/op_function_common.h | 3 +++ .../tests/unittests/test_elementwise_add_op.py | 18 ++++++++++++++++++ python/paddle/fluid/transpiler/collective.py | 4 ++-- 5 files changed, 33 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc index 371ba65a46d15..b3ed1df95709f 100644 --- a/paddle/fluid/pybind/eager_utils.cc +++ b/paddle/fluid/pybind/eager_utils.cc @@ -1343,6 +1343,9 @@ paddle::experimental::Scalar CastNumpy2Scalar(PyObject* obj, } else if (type_name == "numpy.float32") { float value = CastPyArg2Float(obj, op_type, arg_pos); return paddle::experimental::Scalar(value); + } else if (type_name == "numpy.float16") { + float16 value = CastPyArg2Float16(obj, op_type, arg_pos); + return paddle::experimental::Scalar(value); } else if (type_name == "numpy.int64") { int64_t value = CastPyArg2Long(obj, op_type, arg_pos); return paddle::experimental::Scalar(value); @@ -1352,7 +1355,7 @@ paddle::experimental::Scalar CastNumpy2Scalar(PyObject* obj, } else { PADDLE_THROW(platform::errors::InvalidArgument( "%s(): argument (position %d) must be " - "numpy.float32/float64, numpy.int32/int64, but got %s", + "numpy.float16/float32/float64, numpy.int32/int64, but got %s", op_type, arg_pos + 1, type_name)); // NOLINT diff --git a/paddle/fluid/pybind/op_function_common.cc b/paddle/fluid/pybind/op_function_common.cc index 5cdd9a0fa0668..edab97c8b5e69 100644 --- a/paddle/fluid/pybind/op_function_common.cc +++ b/paddle/fluid/pybind/op_function_common.cc @@ -184,6 +184,12 @@ void CastPyArg2AttrLong(PyObject* obj, attrs[key] = CastPyArg2Long(obj, op_type, arg_pos); } +float16 CastPyArg2Float16(PyObject* obj, + const std::string& op_type, + ssize_t arg_pos) { + return static_cast(CastPyArg2Double(obj, op_type, arg_pos)); +} + float CastPyArg2Float(PyObject* obj, const std::string& op_type, ssize_t arg_pos) { diff --git a/paddle/fluid/pybind/op_function_common.h b/paddle/fluid/pybind/op_function_common.h index 686694631cc66..57423bb3b74a3 100644 --- a/paddle/fluid/pybind/op_function_common.h +++ b/paddle/fluid/pybind/op_function_common.h @@ -55,6 +55,9 @@ int CastPyArg2Int(PyObject* obj, const std::string& op_type, ssize_t arg_pos); int64_t 
CastPyArg2Long(PyObject* obj, const std::string& op_type, ssize_t arg_pos); +float16 CastPyArg2Float16(PyObject* obj, + const std::string& op_type, + ssize_t arg_pos); float CastPyArg2Float(PyObject* obj, const std::string& op_type, ssize_t arg_pos); diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py index 1cb57e6d72fd5..3bf2b7cdcd703 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py @@ -737,6 +737,24 @@ def test_dygraph_add(self): paddle.enable_static() +class TestTensorAddNumpyScalar(unittest.TestCase): + def test_float32_add(self): + paddle.disable_static() + a = paddle.full([4, 5, 6], 1.5, dtype='float32') + b = np.array([1.5], dtype='float32')[0] + c = a + b + self.assertTrue(c.dtype == core.VarDesc.VarType.FP32) + + def test_float16_add(self): + if not core.is_compiled_with_cuda(): + return + paddle.disable_static() + a = paddle.full([4, 5, 6], 1.5, dtype='float16') + b = np.array([1.5], dtype='float16')[0] + c = a + b + self.assertTrue(c.dtype == core.VarDesc.VarType.FP16) + + if __name__ == '__main__': paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/transpiler/collective.py b/python/paddle/fluid/transpiler/collective.py index 870efa0968d72..04bd68d257163 100644 --- a/python/paddle/fluid/transpiler/collective.py +++ b/python/paddle/fluid/transpiler/collective.py @@ -516,12 +516,12 @@ def _transpile_startup_program(self): def _transpile_main_program(self): # not need loss scale and no dense param param_cnt = self._get_update_param_count() - if self.loss_scale is 0 and param_cnt is 0: + if self.loss_scale == 0 and param_cnt == 0: return # scale loss self._insert_scale_loss_grad_ops() # no param - if param_cnt is 0: + if param_cnt == 0: return # fuse allreduce if self.fuse_allreduce > 0: From 5822e15ce4ffd9c4561a6403ca352a65dcb5cd76 Mon Sep 17 00:00:00 2001 From: Zhang Jun Date: Tue, 31 Jan 2023 16:19:27 +0800 Subject: [PATCH 40/89] [inference][trt] add elementwise input data type check (#49675) --- paddle/fluid/inference/tensorrt/op_teller.cc | 45 ++++- .../test_trt_convert_compare_and_logical.py | 137 +++++++++++++++ .../inference/test_trt_convert_elementwise.py | 166 +++++++++++++++++- .../ir/inference/test_trt_convert_equal.py | 2 +- 4 files changed, 335 insertions(+), 15 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index fbbd77a4c9825..0075c64759333 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -1365,16 +1365,26 @@ struct SimpleOpTypeSetTeller : public Teller { VLOG(3) << "Ops(" << op_type << ") do not support static shape yet."; return false; } + auto* block = desc.Block(); + auto* x_var_desc = block->FindVar(desc.Input("X")[0]); + auto* y_var_desc = block->FindVar(desc.Input("Y")[0]); + auto x_dtype = x_var_desc->GetDataType(); + auto y_dtype = y_var_desc->GetDataType(); if (op_type == "logical_or" || op_type == "logical_xor" || op_type == "logical_and") { - auto* block = desc.Block(); - auto* x_var_desc = block->FindVar(desc.Input("X")[0]); - auto* y_var_desc = block->FindVar(desc.Input("Y")[0]); - auto x_dtype = x_var_desc->GetDataType(); - auto y_dtype = y_var_desc->GetDataType(); if (x_dtype != framework::proto::VarType::BOOL || y_dtype != framework::proto::VarType::BOOL) { - VLOG(3) << "the op only support input of BOOL."; + VLOG(3) 
<< "the op (" << op_type << ") only support input of BOOL."; + return false; + } + } + if (op_type == "less_than" || op_type == "greater_than" || + op_type == "less_equal") { + if (x_dtype == framework::proto::VarType::BOOL || + y_dtype == framework::proto::VarType::BOOL) { + VLOG(3) + << "ElementWiseOperation::kLESS/ElementWiseOperation::kGREATER " + "do not support boolean datatype."; return false; } } @@ -1417,6 +1427,29 @@ struct SimpleOpTypeSetTeller : public Teller { const auto x_shape = x_var_desc->GetShape(); const auto y_shape = y_var_desc->GetShape(); + // These operations do not support boolean datatype. + if (op_type == "elementwise_add" || op_type == "elementwise_mul" || + op_type == "elementwise_sub" || op_type == "elementwise_div" || + op_type == "elementwise_pow" || op_type == "elementwise_min" || + op_type == "elementwise_max" || op_type == "elementwise_floordiv") { + if (x_var_desc->GetDataType() == + paddle::framework::proto::VarType_Type::VarType_Type_BOOL) { + VLOG(3) << "These operations " + "(elementwise_add/mul/sub/div/pow/min/max/floordiv) do " + "not support boolean datatype."; + return false; + } + } + // These operations input do not support int32 datatype. + if (op_type == "elementwise_pow") { + if (x_var_desc->GetDataType() == + paddle::framework::proto::VarType_Type::VarType_Type_INT32) { + VLOG(3) << "These operations (elementwise_pow) do not support int32 " + "datatype."; + return false; + } + } + // The case when x_shape.size() == 1 is dealt with in common case if (!with_dynamic_shape && (!y_var_desc->Persistable()) && y_shape.size() == 1) { diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_compare_and_logical.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_compare_and_logical.py index e59b9a0cd416c..50159c222cc8a 100755 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_compare_and_logical.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_compare_and_logical.py @@ -481,5 +481,142 @@ def test(self): self.run_test() +class TrtConvertCompareSkipTest(TrtLayerAutoScanTest): + def is_program_valid(self, program_config: ProgramConfig) -> bool: + return True + + def sample_program_configs(self): + def generate_input(shape): + return np.random.random(shape).astype(np.int32) + + for shape in [[2, 16], [2, 16, 32], [1, 32, 16, 32]]: + for op_type in ["less_than", "greater_than"]: + for axis in [-1]: + self.dims = len(shape) + dics = [ + {"axis": axis}, + {"in_dtype": 2, "out_dtype": 0}, + {"in_dtype": 0, "out_dtype": 2}, + ] + ops_config = [ + { + "op_type": "cast", + "op_inputs": {"X": ["input_data1"]}, + "op_outputs": {"Out": ["cast_output_data1"]}, + "op_attrs": dics[1], + "outputs_dtype": {"cast_output_data1": np.bool_}, + }, + { + "op_type": "cast", + "op_inputs": {"X": ["input_data2"]}, + "op_outputs": {"Out": ["cast_output_data2"]}, + "op_attrs": dics[1], + "outputs_dtype": {"cast_output_data2": np.bool_}, + }, + { + "op_type": op_type, + "op_inputs": { + "X": ["cast_output_data1"], + "Y": ["cast_output_data2"], + }, + "op_outputs": {"Out": ["cast_output_data0"]}, + "op_attrs": dics[0], + "outputs_dtype": {"cast_output_data0": np.bool_}, + }, + { + "op_type": "cast", + "op_inputs": {"X": ["cast_output_data0"]}, + "op_outputs": {"Out": ["output_data"]}, + "op_attrs": dics[2], + "outputs_dtype": {"output_data": np.int32}, + }, + ] + ops = self.generate_op_config(ops_config) + + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={ + "input_data1": 
TensorConfig( + data_gen=partial(generate_input, shape) + ), + "input_data2": TensorConfig( + data_gen=partial(generate_input, shape) + ), + }, + outputs=["output_data"], + ) + + yield program_config + + def sample_predictor_configs( + self, program_config + ) -> (paddle_infer.Config, List[int], float): + def generate_dynamic_shape(attrs): + if self.dims == 2: + shape_data = [2, 16] + if self.dims == 3: + shape_data = [2, 16, 32] + if self.dims == 4: + shape_data = [1, 32, 16, 32] + + shape_info = { + "input_data1": shape_data, + "input_data2": shape_data, + "cast_output_data0": shape_data, + "cast_output_data1": shape_data, + "cast_output_data2": shape_data, + } + self.dynamic_shape.min_input_shape = shape_info + self.dynamic_shape.max_input_shape = shape_info + self.dynamic_shape.opt_input_shape = shape_info + + def clear_dynamic_shape(): + self.dynamic_shape.max_input_shape = {} + self.dynamic_shape.min_input_shape = {} + self.dynamic_shape.opt_input_shape = {} + + def generate_trt_nodes_num(attrs, dynamic_shape): + ver = paddle_infer.get_trt_compile_version() + if ver[0] * 1000 + ver[1] * 100 + ver[2] * 10 < 8400: + return 0, 7 + if not dynamic_shape: + return 0, 7 + return 3, 4 + + attrs = [ + program_config.ops[i].attrs for i in range(len(program_config.ops)) + ] + + # for static_shape + clear_dynamic_shape() + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False + ), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False + ), (1e-3, 1e-3) + + # for dynamic_shape + generate_dynamic_shape(attrs) + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True + ), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True + ), (1e-3, 1e-3) + + def add_skip_trt_case(self): + pass + + def test(self): + self.add_skip_trt_case() + self.run_test() + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_elementwise.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_elementwise.py index 89debb2a27dcd..4d4df30acb031 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_elementwise.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_elementwise.py @@ -25,7 +25,7 @@ # This is the special test case with weight including batch dimension # I don't want to mess up the code written by others, so I wrote a class specifically -class TrtConvertElementwiseTest_one_input_special_case0(TrtLayerAutoScanTest): +class TrtConvertElementwiseTestOneInputSpecialCase0(TrtLayerAutoScanTest): def is_program_valid(self, program_config: ProgramConfig) -> bool: return True @@ -158,7 +158,7 @@ def test(self): # This is the special test case -class TrtConvertElementwiseTest_one_input_special_case1(TrtLayerAutoScanTest): +class TrtConvertElementwiseTestOneInputSpecialCase1(TrtLayerAutoScanTest): def is_program_valid(self, program_config: ProgramConfig) -> bool: return True @@ -279,7 +279,7 @@ def test(self): self.run_test() -class TrtConvertElementwiseTest_one_input(TrtLayerAutoScanTest): +class TrtConvertElementwiseTestOneInput(TrtLayerAutoScanTest): def is_program_valid(self, program_config: ProgramConfig) -> bool: return True @@ -431,9 +431,7 @@ def 
test(self): self.run_test() -class TrtConvertElementwiseTest_two_input_without_broadcast( - TrtLayerAutoScanTest -): +class TrtConvertElementwiseTestTwoInputWithoutBroadcast(TrtLayerAutoScanTest): def is_program_valid(self, program_config: ProgramConfig) -> bool: return True @@ -592,7 +590,7 @@ def test(self): self.run_test() -class TrtConvertElementwiseTest_two_input_with_broadcast(TrtLayerAutoScanTest): +class TrtConvertElementwiseTestTwoInputWithBroadcast(TrtLayerAutoScanTest): def is_program_valid(self, program_config: ProgramConfig) -> bool: inputs = program_config.inputs if len(inputs['input_data1'].shape) != len(inputs['input_data2'].shape): @@ -754,7 +752,7 @@ def test(self): self.run_test() -class TrtConvertElementwiseTest_one_input_corner_case(TrtLayerAutoScanTest): +class TrtConvertElementwiseTestOneInputCornerCase(TrtLayerAutoScanTest): def is_program_valid(self, program_config: ProgramConfig) -> bool: return True @@ -896,5 +894,157 @@ def test(self): self.run_test() +class TrtConvertElementwiseTestTwoInputSkipCase(TrtLayerAutoScanTest): + def is_program_valid(self, program_config: ProgramConfig) -> bool: + # if program_config.ops[0].type in "round": + return True + + def sample_program_configs(self): + def generate_input(shape, op_type): + if op_type == "elementwise_pow": + return np.random.randint( + low=1, high=10000, size=shape, dtype=np.int32 + ) + # Paddle mul support bool and TensorRT not + if op_type == "elementwise_mul": + return np.random.random(shape).astype(np.bool) + + for shape in [[4], [4, 32], [2, 32, 16], [1, 8, 16, 32]]: + for op_type in [ + "elementwise_pow", + "elementwise_mul", + ]: + for axis in [0, -1]: + self.dims = len(shape) + dics = [{"axis": axis}] + ops_config = [ + { + "op_type": op_type, + "op_inputs": { + "X": ["input_data1"], + "Y": ["input_data2"], + }, + "op_outputs": {"Out": ["output_data"]}, + "op_attrs": dics[0], + "outputs_dtype": { + "output_data": np.int32 + if op_type == "elementwise_pow" + else np.bool_ + }, + } + ] + ops = self.generate_op_config(ops_config) + + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={ + "input_data1": TensorConfig( + data_gen=partial(generate_input, shape, op_type) + ), + "input_data2": TensorConfig( + data_gen=partial(generate_input, shape, op_type) + ), + }, + outputs=["output_data"], + ) + + yield program_config + + def sample_predictor_configs( + self, program_config + ) -> (paddle_infer.Config, List[int], float): + def generate_dynamic_shape(attrs): + if self.dims == 1: + self.dynamic_shape.min_input_shape = { + "input_data1": [1], + "input_data2": [1], + } + self.dynamic_shape.max_input_shape = { + "input_data1": [128], + "input_data2": [128], + } + self.dynamic_shape.opt_input_shape = { + "input_data1": [32], + "input_data2": [32], + } + elif self.dims == 2: + self.dynamic_shape.min_input_shape = { + "input_data1": [1, 4], + "input_data2": [1, 4], + } + self.dynamic_shape.max_input_shape = { + "input_data1": [128, 256], + "input_data2": [128, 256], + } + self.dynamic_shape.opt_input_shape = { + "input_data1": [32, 64], + "input_data2": [32, 64], + } + elif self.dims == 3: + self.dynamic_shape.min_input_shape = { + "input_data1": [1, 4, 4], + "input_data2": [1, 4, 4], + } + self.dynamic_shape.max_input_shape = { + "input_data1": [128, 128, 256], + "input_data2": [128, 128, 256], + } + self.dynamic_shape.opt_input_shape = { + "input_data1": [2, 32, 16], + "input_data2": [2, 32, 16], + } + elif self.dims == 4: + self.dynamic_shape.min_input_shape = { + "input_data1": [1, 4, 4, 4], + 
"input_data2": [1, 4, 4, 4], + } + self.dynamic_shape.max_input_shape = { + "input_data1": [8, 128, 64, 128], + "input_data2": [8, 128, 64, 128], + } + self.dynamic_shape.opt_input_shape = { + "input_data1": [2, 64, 32, 32], + "input_data2": [2, 64, 32, 32], + } + + def clear_dynamic_shape(): + self.dynamic_shape.max_input_shape = {} + self.dynamic_shape.min_input_shape = {} + self.dynamic_shape.opt_input_shape = {} + + def generate_trt_nodes_num(attrs, dynamic_shape): + return 0, 4 + + attrs = [ + program_config.ops[i].attrs for i in range(len(program_config.ops)) + ] + + # for static_shape + clear_dynamic_shape() + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False + ), (1e-5, 1e-5) + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False + ), (1e-3, 1e-3) + + # for dynamic_shape + generate_dynamic_shape(attrs) + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), (0, 4), (1e-5, 1e-5) + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), (0, 4), (1e-3, 1e-3) + + def add_skip_trt_case(self): + pass + + def test(self): + self.add_skip_trt_case() + self.run_test() + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_equal.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_equal.py index 7be685d2894b1..4993e830f190b 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_equal.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_equal.py @@ -23,7 +23,7 @@ import paddle.inference as paddle_infer -class TrtConvertElementwiseTest_one_input_corner_case(TrtLayerAutoScanTest): +class TrtConvertEqualOneInputCornerCase(TrtLayerAutoScanTest): def is_program_valid(self, program_config: ProgramConfig) -> bool: attrs = [ program_config.ops[i].attrs for i in range(len(program_config.ops)) From 0e51f3988bf92c3a13f3a1e54c0ade4d98c7edeb Mon Sep 17 00:00:00 2001 From: Charles-hit <56987902+Charles-hit@users.noreply.github.com> Date: Tue, 31 Jan 2023 16:20:00 +0800 Subject: [PATCH 41/89] Integrate static code gen info (#49858) * polish static grad op maker gen * fix some bugs * fix static code gen * solve conflict * modify composite grad maker name * integrate phi and fluid info in static code gen * rename some composite maker * modify static code gen format --- paddle/fluid/framework/details/op_registry.h | 4 +- paddle/fluid/framework/op_info.h | 4 +- paddle/fluid/framework/type_defs.h | 2 +- .../elementwise/elementwise_add_op.cc | 8 +- .../elementwise/elementwise_div_op.cc | 8 +- .../elementwise/elementwise_mul_op.cc | 8 +- .../elementwise/elementwise_sub_op.cc | 8 +- paddle/fluid/operators/expand_v2_op.cc | 6 +- paddle/fluid/operators/generator/filters.py | 23 +- .../fluid/operators/generator/generate_op.py | 193 ++++++++--------- .../operators/generator/generate_sparse_op.py | 16 +- .../operators/generator/generate_static_op.py | 12 +- .../fluid/operators/generator/parse_utils.py | 15 +- .../operators/generator/templates/op.c.j2 | 8 +- .../generator/templates/operator_utils.c.j2 | 201 +++++++++--------- .../operators/reduce_ops/reduce_sum_op.cc | 4 +- paddle/fluid/prim/tests/test_static_prim.cc | 26 +-- .../utils/static/composite_grad_desc_maker.h | 6 +- paddle/fluid/pybind/pybind.cc | 8 +- 19 files 
changed, 286 insertions(+), 274 deletions(-) diff --git a/paddle/fluid/framework/details/op_registry.h b/paddle/fluid/framework/details/op_registry.h index 1edc84aba07d9..4b109ba0dcff2 100644 --- a/paddle/fluid/framework/details/op_registry.h +++ b/paddle/fluid/framework/details/op_registry.h @@ -63,7 +63,7 @@ using OpRegistryClasses = std::tuple< // NOLINT TypePair, // NOLINT TypePair, // NOLINT TypePair, // NOLINT - TypePair, // NOLINT + TypePair, // NOLINT TypePair, // NOLINT TypePair, // NOLINT TypePair, // NOLINT @@ -262,7 +262,7 @@ struct OpInfoFiller { info->grad_comp_op_maker_, nullptr, platform::errors::AlreadyExists( - "GradCompositeOpMakerBase of %s has been registered", op_type)); + "CompositeGradOpMakerBase of %s has been registered", op_type)); info->grad_comp_op_maker_ = [](const OpDesc& fwd_op, diff --git a/paddle/fluid/framework/op_info.h b/paddle/fluid/framework/op_info.h index 61a2373eb3479..bd4405f722844 100644 --- a/paddle/fluid/framework/op_info.h +++ b/paddle/fluid/framework/op_info.h @@ -43,7 +43,7 @@ class OpInfo { public: OpCreator creator_; GradOpMakerFN grad_op_maker_; - GradCompositeOpMakerFN grad_comp_op_maker_; + CompositeGradOpMakerFN grad_comp_op_maker_; proto::OpProto* proto_{nullptr}; OpAttrChecker* checker_{nullptr}; InferVarTypeFN infer_var_type_; @@ -84,7 +84,7 @@ class OpInfo { const GradOpMakerFN& GradOpMaker() const { return grad_op_maker_; } - const GradCompositeOpMakerFN& GradCompOpMaker() const { + const CompositeGradOpMakerFN& CompGradOpMaker() const { return grad_comp_op_maker_; } diff --git a/paddle/fluid/framework/type_defs.h b/paddle/fluid/framework/type_defs.h index 7c90925da4a88..13bd782ce4033 100644 --- a/paddle/fluid/framework/type_defs.h +++ b/paddle/fluid/framework/type_defs.h @@ -96,7 +96,7 @@ using GradOpMakerFN = std::function>( std::unordered_map* /*grad_to_var*/, const std::vector& grad_block)>; -using GradCompositeOpMakerFN = +using CompositeGradOpMakerFN = std::function>( const OpDesc&, const std::unordered_set& /*no_grad_set*/, diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cc b/paddle/fluid/operators/elementwise/elementwise_add_op.cc index 11e0fa7dd1f97..48a5d2e433a10 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cc @@ -51,9 +51,9 @@ class ElementwiseAddOpMaker : public ElementwiseOpMaker { } }; -class ElementwiseAddGradCompositeOpMaker - : public prim::GradCompositeOpMakerBase { - using prim::GradCompositeOpMakerBase::GradCompositeOpMakerBase; +class ElementwiseAddCompositeGradOpMaker + : public prim::CompositeGradOpMakerBase { + using prim::CompositeGradOpMakerBase::CompositeGradOpMakerBase; public: void Apply() override { @@ -122,7 +122,7 @@ REGISTER_OPERATOR(elementwise_add, ::paddle::operators::ElementwiseOpInferVarType, elementwise_addGradMaker<::paddle::framework::OpDesc>, elementwise_addGradMaker<::paddle::imperative::OpBase>, - ::paddle::operators::ElementwiseAddGradCompositeOpMaker, + ::paddle::operators::ElementwiseAddCompositeGradOpMaker, ::paddle::operators::ElementwiseOpInplaceInferer); namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.cc b/paddle/fluid/operators/elementwise/elementwise_div_op.cc index 3d62792d8513e..41549ede1ebc6 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_div_op.cc @@ -67,9 +67,9 @@ class ElementwiseDivGradOpMaker : public framework::SingleGradOpMaker { } }; 
-class ElementwiseDivGradCompositeOpMaker - : public prim::GradCompositeOpMakerBase { - using prim::GradCompositeOpMakerBase::GradCompositeOpMakerBase; +class ElementwiseDivCompositeGradOpMaker + : public prim::CompositeGradOpMakerBase { + using prim::CompositeGradOpMakerBase::CompositeGradOpMakerBase; public: void Apply() override { @@ -123,7 +123,7 @@ REGISTER_OPERATOR(elementwise_div, ops::ElementwiseOp, ops::ElementwiseDivOpMaker, ops::ElementwiseOpInferVarType, - ops::ElementwiseDivGradCompositeOpMaker, + ops::ElementwiseDivCompositeGradOpMaker, ops::ElementwiseDivGradOpMaker, ops::ElementwiseDivGradOpMaker); diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.cc b/paddle/fluid/operators/elementwise/elementwise_mul_op.cc index 4052f3e09e0cc..740c9381d92e2 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.cc @@ -66,9 +66,9 @@ class ElementwiseMulOpGradMaker : public framework::SingleGradOpMaker { } }; -class ElementwiseMulGradCompositeOpMaker - : public prim::GradCompositeOpMakerBase { - using prim::GradCompositeOpMakerBase::GradCompositeOpMakerBase; +class ElementwiseMulCompositeGradOpMaker + : public prim::CompositeGradOpMakerBase { + using prim::CompositeGradOpMakerBase::CompositeGradOpMakerBase; public: void Apply() override { @@ -155,7 +155,7 @@ REGISTER_OPERATOR(elementwise_mul, ops::ElementwiseOpInferVarType, ops::ElementwiseMulOpGradMaker, ops::ElementwiseMulOpGradMaker, - ops::ElementwiseMulGradCompositeOpMaker); + ops::ElementwiseMulCompositeGradOpMaker); REGISTER_OPERATOR( elementwise_mul_grad, ops::ElementwiseOpGrad, diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.cc b/paddle/fluid/operators/elementwise/elementwise_sub_op.cc index be839f123a1e9..2a9e14867acf1 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.cc @@ -54,9 +54,9 @@ class ElementwiseSubOpMaker : public ElementwiseOpMaker { } }; -class ElementwiseSubGradCompositeOpMaker - : public prim::GradCompositeOpMakerBase { - using prim::GradCompositeOpMakerBase::GradCompositeOpMakerBase; +class ElementwiseSubCompositeGradOpMaker + : public prim::CompositeGradOpMakerBase { + using prim::CompositeGradOpMakerBase::CompositeGradOpMakerBase; public: void Apply() override { @@ -109,7 +109,7 @@ REGISTER_OPERATOR(elementwise_sub, ::paddle::operators::ElementwiseOpInferVarType, elementwise_subGradMaker<::paddle::framework::OpDesc>, elementwise_subGradMaker<::paddle::imperative::OpBase>, - ::paddle::operators::ElementwiseSubGradCompositeOpMaker, + ::paddle::operators::ElementwiseSubCompositeGradOpMaker, ::paddle::operators::ElementwiseOpInplaceInferer); REGISTER_OPERATOR( diff --git a/paddle/fluid/operators/expand_v2_op.cc b/paddle/fluid/operators/expand_v2_op.cc index 9a867c040fcb8..3c05ab9295c67 100644 --- a/paddle/fluid/operators/expand_v2_op.cc +++ b/paddle/fluid/operators/expand_v2_op.cc @@ -193,8 +193,8 @@ class ExpandV2GradOpMaker : public framework::SingleGradOpMaker { } }; -class ExpandV2GradCompositeOpMaker : public prim::GradCompositeOpMakerBase { - using prim::GradCompositeOpMakerBase::GradCompositeOpMakerBase; +class ExpandV2CompositeGradOpMaker : public prim::CompositeGradOpMakerBase { + using prim::CompositeGradOpMakerBase::CompositeGradOpMakerBase; public: void Apply() override { @@ -244,7 +244,7 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(expand_v2, ops::ExpandV2Op, ops::ExpandV2OpMaker, - 
ops::ExpandV2GradCompositeOpMaker, + ops::ExpandV2CompositeGradOpMaker, ops::ExpandV2GradOpMaker, ops::ExpandV2GradOpMaker, ExpandInferShapeFunctor); diff --git a/paddle/fluid/operators/generator/filters.py b/paddle/fluid/operators/generator/filters.py index 8efbac1f7e92c..50bc1f7bca884 100644 --- a/paddle/fluid/operators/generator/filters.py +++ b/paddle/fluid/operators/generator/filters.py @@ -14,6 +14,7 @@ import itertools import re +from typing import Dict, List from type_mapping import ( attr_types_map, @@ -137,17 +138,23 @@ def to_composite_grad_opmaker_name(backward_op_name): for i in range(len(words)): words[i] = words[i].strip() words[i] = words[i].capitalize() - composite_grad_opmaker_name = words[0] + "Composite" - composite_grad_opmaker_name += "".join(word for word in words[1:]) - composite_grad_opmaker_name += "OpMaker" + composite_grad_opmaker_name = "".join(word for word in words[:-1]) + composite_grad_opmaker_name += "CompositeGradOpMaker" return composite_grad_opmaker_name +def to_variable_names(dict_list: List[Dict], key: str) -> List[str]: + names = [] + for var in dict_list: + names.append(var[key]) + return names + + def cartesian_prod_attrs(attrs): items = [] for attr in attrs: type_name = attr["typename"] - name = attr["name"] + name = attr["fluid_name"] if type_name == "Scalar": items.append((name, to_scalar_tensor_name(attr))) elif type_name == "IntArray": @@ -176,11 +183,15 @@ def cartesian_prod_attrs(attrs): def cartesian_prod_mapping(op): kernels = op["kernel"]["func"] inputs = [ - x["name"] for x in op["inputs"] if x["name"] in op["kernel"]["param"] + x["fluid_name"] + for x in op["inputs"] + if x["fluid_name"] in op["kernel"]["param"] ] inputs = [to_opmaker_name_cstr(input) for input in inputs] attrs = cartesian_prod_attrs(op["attrs"]) - outputs = [to_opmaker_name_cstr(output["name"]) for output in op["outputs"]] + outputs = [ + to_opmaker_name_cstr(output["fluid_name"]) for output in op["outputs"] + ] def vec(items): return "{" + ', '.join(items) + "}" diff --git a/paddle/fluid/operators/generator/generate_op.py b/paddle/fluid/operators/generator/generate_op.py index e4bb7041016d2..2da40b1edd114 100644 --- a/paddle/fluid/operators/generator/generate_op.py +++ b/paddle/fluid/operators/generator/generate_op.py @@ -28,6 +28,7 @@ to_opmaker_name_cstr, to_pascal_case, to_scalar_tensor_name, + to_variable_names, ) from jinja2 import Environment, FileSystemLoader, StrictUndefined from parse_utils import to_named_dict @@ -60,6 +61,7 @@ env.filters["to_opmaker_name_cstr"] = to_opmaker_name_cstr env.filters["cartesian_prod_mapping"] = cartesian_prod_mapping env.filters["to_composite_grad_opmaker_name"] = to_composite_grad_opmaker_name +env.filters["to_variable_names"] = to_variable_names env.tests["base_op"] = is_base_op env.tests["composite_op"] = is_composite_op env.tests["vec"] = is_vec @@ -157,29 +159,26 @@ def process_int_array(op_item, int_array_configs): ] -def parse_composite_info(ops, backward_ops, backward_op_dict): - for op in ops: - if "backward" in op: - op["phi_backward"] = op["backward"] - for backward_op in backward_ops: - if "backward" in backward_op: - backward_op["phi_backward"] = backward_op["backward"] - for backward_op_name, op_dict in backward_op_dict.items(): - if "composite" not in op_dict: - continue - op_dict["composite"]["phi_inputs"] = [] - op_dict["composite"]["phi_attrs"] = [] - op_dict["composite"]["phi_outputs"] = [] - for input in op_dict["inputs"]: - op_dict["composite"]["phi_inputs"].append(input['name']) - for attr in 
op_dict["attrs"]: - op_dict["composite"]["phi_attrs"].append(attr['name']) - for output in op_dict["outputs"]: - op_dict["composite"]["phi_outputs"].append(output['name']) - - -# replace name of op and params for OpMaker -def replace_compat_name(op_fluid_map_list, forward_op_dict, backward_op_dict): +def add_composite_info(ops, backward_ops, backward_op_dict): + # add backward composite name in forward + for op in ops + backward_ops: + if ( + op["backward"] in backward_op_dict + and "composite" in backward_op_dict[op["backward"]] + ): + op["backward_composite"] = op["backward"] + else: + op["backward_composite"] = None + + +# add fluid name in ops and backward ops info +def add_fluid_name(dict_list): + for item in dict_list: + item["fluid_name"] = item["name"] + + +# add fluid name of op and params for OpMaker +def add_compat_name(op_fluid_map_list, forward_op_dict, backward_op_dict): def get_phi_and_fluid_op_name(op_item): names = op_item.split('(') if len(names) == 1: @@ -187,12 +186,14 @@ def get_phi_and_fluid_op_name(op_item): else: return names[0].strip(), names[1].split(')')[0].strip() - def update_op_param_name(op_args, args_alias_map): + def add_op_param_name(op_args, args_alias_map): for item in op_args: if item['name'] in args_alias_map: - item['name'] = args_alias_map[item['name']] + item['fluid_name'] = args_alias_map[item['name']] + else: + item['fluid_name'] = item['name'] - def update_grad_args_name(op_args, args_alias_map): + def add_grad_args_name(op_args, args_alias_map): for item in op_args: if ( item['name'].endswith('_grad') @@ -201,38 +202,12 @@ def update_grad_args_name(op_args, args_alias_map): args_alias_map[item['name']] = ( args_alias_map[item['name'][:-5]] + '_grad' ) - item['name'] = args_alias_map[item['name'][:-5]] + '_grad' - - def add_fluid_info_in_composite(composite_map, args_alias_map): - fluid_input_list = [] - fluid_attr_list = [] - fluid_output_list = [] - # add fluid op inputs - for input in composite_map["phi_inputs"]: - if input in args_alias_map: - fluid_input_list.append(args_alias_map[input]) - else: - fluid_input_list.append(input) - # add fluid op attrs - for attr in composite_map["phi_attrs"]: - if attr in args_alias_map: - fluid_attr_list.append(args_alias_map[attr]) - else: - fluid_attr_list.append(attr) - # add fluid op outputs - for output in composite_map["phi_outputs"]: - if output in args_alias_map: - fluid_output_list.append(args_alias_map[output]) - else: - fluid_output_list.append(output) - - composite_map.update( - { - "fluid_inputs": fluid_input_list, - "fluid_attrs": fluid_attr_list, - "fluid_outputs": fluid_output_list, - } - ) + item['fluid_name'] = args_alias_map[item['name'][:-5]] + '_grad' + elif ( + item['name'].endswith('_grad') + and item['name'][:-5] not in args_alias_map + ): + item['fluid_name'] = item['name'] def get_param_list_alias(param_list, args_map): return [ @@ -297,15 +272,15 @@ def update_common_params_name( op_item['kernel']['layout']['candidates'], args_name_map ) - def update_grad_op_compat_name(grad_op_item, args_name_map): - update_op_param_name(grad_op_item['inputs'], args_name_map) - update_op_param_name(grad_op_item['outputs'], args_name_map) - update_op_param_name(grad_op_item['attrs'], args_name_map) - update_op_param_name(grad_op_item['forward']['inputs'], args_name_map) - update_op_param_name(grad_op_item['forward']['outputs'], args_name_map) - update_op_param_name(grad_op_item['forward']['attrs'], args_name_map) - update_grad_args_name(grad_op_item['inputs'], args_map) - 
update_grad_args_name(grad_op_item['outputs'], args_map) + def add_grad_op_compat_name(grad_op_item, args_name_map): + add_op_param_name(grad_op_item['inputs'], args_name_map) + add_op_param_name(grad_op_item['outputs'], args_name_map) + add_op_param_name(grad_op_item['attrs'], args_name_map) + add_op_param_name(grad_op_item['forward']['inputs'], args_name_map) + add_op_param_name(grad_op_item['forward']['outputs'], args_name_map) + add_op_param_name(grad_op_item['forward']['attrs'], args_name_map) + add_grad_args_name(grad_op_item['inputs'], args_map) + add_grad_args_name(grad_op_item['outputs'], args_map) for op_args in op_fluid_map_list: new_op_name, op_name = get_phi_and_fluid_op_name(op_args['op']) @@ -350,39 +325,32 @@ def update_grad_op_compat_name(grad_op_item, args_name_map): int_array_configs[ op_args[key][args_item['name']] ] = int_array_configs[args_item['name']] - args_item['name'] = op_args[key][args_item['name']] - if has_backward: - for args_item in backward_op_item['forward'][key]: - if args_item['name'] in op_args[key]: - args_item['name'] = op_args[key][args_item['name']] - forward_op_item["attr_dict"] = to_named_dict(forward_op_item["attrs"]) + args_item['fluid_name'] = op_args[key][ + args_item['name'] + ] update_common_params_name( forward_op_item, args_map, scalar_configs, int_array_configs ) if has_backward: - update_grad_op_compat_name(backward_op_item, args_map) + # update fluid info in backward + add_grad_op_compat_name(backward_op_item, args_map) update_common_params_name( backward_op_item, args_map, scalar_configs, int_array_configs ) - backward_op_item["attr_dict"] = to_named_dict( - backward_op_item["attrs"] - ) if 'backward' not in op_args: continue backward_op_list = op_args['backward'].split(',') - # add fluid args name in composite map - for backward_op in backward_op_list: - if ( - "composite" - in backward_op_dict[backward_op.split('(')[0].strip()] - ): - add_fluid_info_in_composite( - backward_op_dict[backward_op]["composite"], args_map - ) - _, bw_op_name = get_phi_and_fluid_op_name(backward_op_list[0]) + phi_bw_op_name, bw_op_name = get_phi_and_fluid_op_name( + backward_op_list[0] + ) + if ( + forward_op_item["backward_composite"] is not None + and phi_bw_op_name != bw_op_name + ): + forward_op_item["backward_composite"] = bw_op_name forward_op_item['backward'] = bw_op_name backward_op_item['op_name'] = bw_op_name @@ -393,18 +361,20 @@ def update_grad_op_compat_name(grad_op_item, args_name_map): double_grad_op_name, ) = get_phi_and_fluid_op_name(backward_op_list[1]) double_grad_item = backward_op_dict[phi_double_grad_op_name] + if ( + backward_op_item["backward_composite"] is not None + and phi_double_grad_op_name != double_grad_op_name + ): + backward_op_item["backward_composite"] = double_grad_op_name backward_op_item['backward'] = double_grad_op_name double_grad_item['op_name'] = double_grad_op_name - update_grad_op_compat_name(double_grad_item, args_map) + add_grad_op_compat_name(double_grad_item, args_map) update_common_params_name( double_grad_item, args_map, scalar_configs, int_array_configs, ) - double_grad_item["attr_dict"] = to_named_dict( - double_grad_item["attrs"] - ) # for triple grad if len(backward_op_list) > 2: @@ -413,18 +383,22 @@ def update_grad_op_compat_name(grad_op_item, args_name_map): triple_grad_op_name, ) = get_phi_and_fluid_op_name(backward_op_list[2]) triple_grad_item = backward_op_dict[phi_triple_grad_op_name] + if ( + double_grad_item["backward_composite"] is not None + and phi_triple_grad_op_name != triple_grad_op_name 
+ ): + double_grad_item[ + "backward_composite" + ] = triple_grad_op_name double_grad_item['backward'] = triple_grad_op_name triple_grad_item['op_name'] = triple_grad_op_name - update_grad_op_compat_name(triple_grad_item, args_map) + add_grad_op_compat_name(triple_grad_item, args_map) update_common_params_name( triple_grad_item, args_map, scalar_configs, int_array_configs, ) - triple_grad_item["attr_dict"] = to_named_dict( - triple_grad_item["attrs"] - ) def process_invoke_op(forward_op_dict, backward_op_dict): @@ -442,20 +416,28 @@ def process_invoke_op(forward_op_dict, backward_op_dict): for input_item in reuse_op['inputs']: bw_op['invoke']['inputs'].append( { + 'fluid_name': input_item['fluid_name'], 'name': input_item['name'], 'value': args_list[args_index], } ) args_index = args_index + 1 + bw_fluid_attrs_set = [ + item['fluid_name'] for item in bw_op['attrs'] + ] for attr in reuse_op['attrs']: if args_index < len(args_list): attr_value = ( f"this->GetAttr(\"{args_list[args_index]}\")" - if args_list[args_index] in bw_op['attr_dict'] + if args_list[args_index] in bw_fluid_attrs_set else args_list[args_index] ) bw_op['invoke']['attrs'].append( - {'name': attr['name'], 'value': attr_value} + { + 'name': attr['name'], + 'fluid_name': attr['fluid_name'], + 'value': attr_value, + } ) args_index = args_index + 1 else: @@ -464,7 +446,8 @@ def process_invoke_op(forward_op_dict, backward_op_dict): bw_op['invoke']['outputs'].append( { 'name': output_item['name'], - 'value': bw_op['outputs'][idx]['name'], + 'fluid_name': output_item['fluid_name'], + 'value': bw_op['outputs'][idx]['fluid_name'], } ) @@ -517,17 +500,26 @@ def main( for op in ops: op['op_name'] = op['name'] + add_fluid_name(op['inputs']) + add_fluid_name(op['attrs']) + add_fluid_name(op['outputs']) for bw_op in backward_ops: bw_op['op_name'] = bw_op['name'] + add_fluid_name(bw_op['inputs']) + add_fluid_name(bw_op['attrs']) + add_fluid_name(bw_op['outputs']) + add_fluid_name(bw_op['forward']['inputs']) + add_fluid_name(bw_op['forward']['attrs']) + add_fluid_name(bw_op['forward']['outputs']) for bw_output in bw_op['outputs']: bw_output['drop_empty_grad'] = True # deal the drop_empty_grad of bw_op by op_compat.yaml parse_drop_empty_grad(op_fluid_map_list, backward_op_dict) - parse_composite_info(ops, backward_ops, backward_op_dict) + add_composite_info(ops, backward_ops, backward_op_dict) - replace_compat_name(op_fluid_map_list, forward_op_dict, backward_op_dict) + add_compat_name(op_fluid_map_list, forward_op_dict, backward_op_dict) # prepare for invoke case process_invoke_op(forward_op_dict, backward_op_dict) @@ -555,7 +547,6 @@ def main( ops=ops, backward_ops=backward_ops, op_dict=op_dict, - composite_gen_flag=True, ) f.write(msg) ks_template = env.get_template('ks.c.j2') diff --git a/paddle/fluid/operators/generator/generate_sparse_op.py b/paddle/fluid/operators/generator/generate_sparse_op.py index 1da91e3f60005..3eea32091dc80 100644 --- a/paddle/fluid/operators/generator/generate_sparse_op.py +++ b/paddle/fluid/operators/generator/generate_sparse_op.py @@ -28,12 +28,14 @@ to_opmaker_name_cstr, to_pascal_case, to_scalar_tensor_name, + to_variable_names, ) -from generate_op import process_invoke_op +from generate_op import add_fluid_name, process_invoke_op from jinja2 import Environment, FileSystemLoader, StrictUndefined from parse_utils import to_named_dict from tests import ( is_base_op, + is_composite_op, is_initializer_list, is_scalar, is_vec, @@ -60,7 +62,9 @@ env.filters["to_opmaker_name_cstr"] = to_opmaker_name_cstr 
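# [Editor's note -- illustrative aside, not part of the patch] Two of the
# filters registered in this hunk, to_composite_grad_opmaker_name and the
# newly added to_variable_names, are defined in filters.py earlier in this
# patch. A minimal doctest-style sketch of what they return; the sample
# backward-op name and the name/fluid_name dicts are illustrative only:
#
#     >>> to_composite_grad_opmaker_name("elementwise_add_grad")
#     'ElementwiseAddCompositeGradOpMaker'
#     >>> to_variable_names(
#     ...     [{"name": "x", "fluid_name": "X"},
#     ...      {"name": "y", "fluid_name": "Y"}], "fluid_name")
#     ['X', 'Y']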
env.filters["cartesian_prod_mapping"] = cartesian_prod_mapping env.filters["to_composite_grad_opmaker_name"] = to_composite_grad_opmaker_name +env.filters["to_variable_names"] = to_variable_names env.tests["base_op"] = is_base_op +env.tests["composite_op"] = is_composite_op env.tests["vec"] = is_vec env.tests["scalar"] = is_scalar env.tests["initializer_list"] = is_initializer_list @@ -96,9 +100,18 @@ def main(op_yaml_path, backward_yaml_path, output_op_path, output_arg_map_path): op['name'] = op['op_name'] if op["backward"] is not None: op["backward"] = SPARSE_OP_PREFIX + op["backward"] + add_fluid_name(op["inputs"]) + add_fluid_name(op["attrs"]) + add_fluid_name(op["outputs"]) for bw_op in backward_ops: bw_op['op_name'] = SPARSE_OP_PREFIX + bw_op['name'] bw_op['name'] = bw_op['op_name'] + add_fluid_name(bw_op["inputs"]) + add_fluid_name(bw_op["attrs"]) + add_fluid_name(bw_op["outputs"]) + add_fluid_name(bw_op["forward"]["inputs"]) + add_fluid_name(bw_op["forward"]["attrs"]) + add_fluid_name(bw_op["forward"]["outputs"]) if 'invoke' in bw_op: bw_op['invoke']['args'] = [ param.strip() for param in bw_op['invoke']['args'].split(',') @@ -139,7 +152,6 @@ def main(op_yaml_path, backward_yaml_path, output_op_path, output_arg_map_path): ops=ops, backward_ops=backward_ops, op_dict=op_dict, - composite_gen_flag=False, ) f.write(msg) diff --git a/paddle/fluid/operators/generator/generate_static_op.py b/paddle/fluid/operators/generator/generate_static_op.py index 7701f76734a0c..3a825bafb127c 100644 --- a/paddle/fluid/operators/generator/generate_static_op.py +++ b/paddle/fluid/operators/generator/generate_static_op.py @@ -28,12 +28,14 @@ to_opmaker_name_cstr, to_pascal_case, to_scalar_tensor_name, + to_variable_names, ) -from generate_op import replace_compat_name +from generate_op import add_compat_name, add_fluid_name from jinja2 import Environment, FileSystemLoader, StrictUndefined from parse_utils import to_named_dict from tests import ( is_base_op, + is_composite_op, is_initializer_list, is_scalar, is_vec, @@ -60,7 +62,9 @@ env.filters["to_opmaker_name_cstr"] = to_opmaker_name_cstr env.filters["cartesian_prod_mapping"] = cartesian_prod_mapping env.filters["to_composite_grad_opmaker_name"] = to_composite_grad_opmaker_name +env.filters["to_variable_names"] = to_variable_names env.tests["base_op"] = is_base_op +env.tests["composite_op"] = is_composite_op env.tests["vec"] = is_vec env.tests["scalar"] = is_scalar env.tests["initializer_list"] = is_initializer_list @@ -100,8 +104,11 @@ def main( for op in ops: op['op_name'] = op['name'] + add_fluid_name(op["inputs"]) + add_fluid_name(op["attrs"]) + add_fluid_name(op["outputs"]) - replace_compat_name(op_op_map, forward_op_dict, {}) + add_compat_name(op_op_map, forward_op_dict, {}) if len(ops) == 0: if os.path.isfile(output_op_path): @@ -116,7 +123,6 @@ def main( ops=ops, backward_ops=[], op_dict=forward_op_dict, - composite_gen_flag=False, ) f.write(msg) diff --git a/paddle/fluid/operators/generator/parse_utils.py b/paddle/fluid/operators/generator/parse_utils.py index d5a58a2a94a0e..31441aadbf8e3 100644 --- a/paddle/fluid/operators/generator/parse_utils.py +++ b/paddle/fluid/operators/generator/parse_utils.py @@ -294,14 +294,13 @@ def parse_composite( composite_config: str, ) -> Dict[str, Any]: # composite_config: func(args1, args2,.....) 
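# [Editor's note -- illustrative aside, not part of the patch] The change
# below rewrites parse_composite() to use a single named-group regex. The
# <func_name>/<func_args> group names appear to have been stripped by this
# diff's rendering, but they are implied by the result.group("func_name")
# calls that follow. A minimal sketch, assuming
# "tanh_grad(out, out_grad, x_grad)" as a sample composite entry:
#
#     >>> import re
#     >>> pat = r"(?P<func_name>[a-z][a-z0-9_]+)\s*\((?P<func_args>[^\)]+)\)"
#     >>> m = re.search(pat, "tanh_grad(out, out_grad, x_grad)")
#     >>> m.group("func_name"), m.group("func_args")
#     ('tanh_grad', 'out, out_grad, x_grad')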
- fname = r'(.*?)' - wspace = r'\s*' - fargs = r'(.*?)' - pattern = fr'{fname}{wspace}\({wspace}{fargs}{wspace}\)' - - m = re.search(pattern, composite_config) - func_name = m.group(1) - func_args = m.group(2) + result = re.search( + r"(?P[a-z][a-z0-9_]+)\s*\((?P[^\)]+)\)", + composite_config, + ) + + func_name = result.group("func_name") + func_args = result.group("func_args") composite_dict = {} composite_dict["func_name"] = func_name diff --git a/paddle/fluid/operators/generator/templates/op.c.j2 b/paddle/fluid/operators/generator/templates/op.c.j2 index 23641dad90f1b..2339822af280f 100644 --- a/paddle/fluid/operators/generator/templates/op.c.j2 +++ b/paddle/fluid/operators/generator/templates/op.c.j2 @@ -39,11 +39,9 @@ using paddle::framework::GradVarName; {% else %} {{backward_op_reused_maker(op, op_dict[op["forward"]["name"]], op["invoke"])}} {% endif %} - {% if composite_gen_flag == True %} - {% if op is composite_op %} + {% if op is composite_op %} {{composite_grad_op_maker(op_dict[op["name"]])}} - {% endif %} - {% endif %} + {% endif %} {% endfor %} } // namespace operators } // namespace paddle @@ -51,7 +49,7 @@ using paddle::framework::GradVarName; namespace ops = paddle::operators; {% for op in ops + backward_ops %} {% if op is base_op %} -{{register_op_with_components(op, op_dict)}} +{{register_op_with_components(op)}} {{register_op_version(op)}} {% endif %} {% endfor %} diff --git a/paddle/fluid/operators/generator/templates/operator_utils.c.j2 b/paddle/fluid/operators/generator/templates/operator_utils.c.j2 index 000e56453d934..a471efaa562b4 100644 --- a/paddle/fluid/operators/generator/templates/operator_utils.c.j2 +++ b/paddle/fluid/operators/generator/templates/operator_utils.c.j2 @@ -12,7 +12,7 @@ class {{op_name | to_pascal_case}}OpMaker : public framework::OpProtoAndCheckerM {{add_output(loop.index0, output, op_name)}}; {% endfor %} {% for attr in op["attrs"] %} - {% if attr["name"] in op["kernel"]["param"] %} + {% if attr["fluid_name"] in op["kernel"]["param"] %} {{add_attr(loop.index0, attr, op_name)}}; {% endif %} {% endfor %} @@ -27,7 +27,7 @@ TODO: Documentation of {{op_name}} op. 
{# add input, it could be duplicable or dispensable #} {% macro add_input(i, input, op_name) %}{# inline #} - {% set name = input["name"] %} + {% set name = input["fluid_name"] %} {% set typename = input["typename"] %} AddInput({{name| to_opmaker_name}}, "({{typename}}), input {{i}} of {{op_name}} op.") {%- if typename is vec %} @@ -42,7 +42,7 @@ AddInput({{name| to_opmaker_name}}, "({{typename}}), input {{i}} of {{op_name}} {# add output, it could be duplicable or intermediate, however, optional output is not supported #} {% macro add_output(i, output, op_name) %}{# inline #} - {% set name = output["name"] %} + {% set name = output["fluid_name"] %} {% set typename = output["typename"] %} {% set is_intermediate = output["intermediate"] %} AddOutput({{name | to_opmaker_name}}, "({{typename}}), output {{i}} of {{op_name}} op.") @@ -66,7 +66,7 @@ AddOutput({{name | to_opmaker_name}}, "({{typename}}), output {{i}} of {{op_name {# add attribute, and process default value if needed #} {% macro add_attr(i, attr, op_name) %}{# inline #} - {% set name = attr["name"] %} + {% set name = attr["fluid_name"] %} {% set typename = attr["typename"] %} {% if typename is scalar %} AddInput("{{attr | to_scalar_tensor_name}}", "attribute {{i}} for {{op_name}} op from 0D Tensor.") @@ -153,15 +153,15 @@ KernelSignature {{op["op_name"] | to_pascal_case }}OpArgumentMapping(const Argum {% set kernel_in_type_list = kernel_config["dispatch"][kernel_func][0] %} if ( {%- for input in inputs %} - {%- if input["name"] in kernel_config["param"] %} + {%- if input["fluid_name"] in kernel_config["param"] %} {%- if kernel_in_type_list[input_idx.idx] == "dense" %} -ctx.IsDenseTensorInput("{{input["name"]}}"){{" && " if not loop.last}} +ctx.IsDenseTensorInput("{{input["fluid_name"]}}"){{" && " if not loop.last}} {%- elif kernel_in_type_list[input_idx.idx] == "selected_rows" %} -ctx.IsSelectedRowsInput("{{input["name"]}}"){{" && " if not loop.last}} +ctx.IsSelectedRowsInput("{{input["fluid_name"]}}"){{" && " if not loop.last}} {%- elif kernel_in_type_list[input_idx.idx] == "sparse_coo" %} -ctx.IsSparseCooTensorInput("{{input["name"]}}"){{" && " if not loop.last}} +ctx.IsSparseCooTensorInput("{{input["fluid_name"]}}"){{" && " if not loop.last}} {%- elif kernel_in_type_list[input_idx.idx] == "sparse_csr" %} -ctx.IsSparseCsrTensorInput("{{input["name"]}}"){{" && " if not loop.last}} +ctx.IsSparseCsrTensorInput("{{input["fluid_name"]}}"){{" && " if not loop.last}} {%- endif %} {% set input_idx.idx = input_idx.idx + 1 %} {%- endif %} @@ -210,8 +210,8 @@ PD_REGISTER_ARG_MAPPING_FN({{op["op_name"]}}, phi::{{op["op_name"] | to_pascal_c {% macro get_input_list(inputs, kernel_args) %}{# inline #} paddle::small_vector inputs { {%- for input in inputs %} -{%- if input["name"] in kernel_args %} -{{input["name"] | to_opmaker_name_cstr}}{{", " if not loop.last}} +{%- if input["fluid_name"] in kernel_args %} +{{input["fluid_name"] | to_opmaker_name_cstr}}{{", " if not loop.last}} {%- endif %} {%- endfor %} } @@ -219,8 +219,8 @@ paddle::small_vector inputs { {% macro get_an_attr(attr, kernel_args) %}{# inline #} {% set typename = attr["typename"] %} -{%- if attr["name"] in kernel_args %} -{% set name = attr["name"] %} +{%- if attr["fluid_name"] in kernel_args %} +{% set name = attr["fluid_name"] %} {% if typename is scalar %}{# scalar correspond to a dispensable input and an attr in opmaker #} attrs.emplace_back(ctx.HasInput("{{attr | to_scalar_tensor_name}}") ? 
"{{attr | to_scalar_tensor_name}}" : "{{name}}"); {%- elif typename == "IntArray" %} @@ -251,7 +251,7 @@ attrs.emplace_back("{{name}}"); {% macro get_output_list(outputs, kernel_args) %}{# inline #} paddle::small_vector outputs { {%- for output in outputs %} -{{output["name"] | to_opmaker_name_cstr}}{{", " if not loop.last}} +{{output["fluid_name"] | to_opmaker_name_cstr}}{{", " if not loop.last}} {%- endfor %} } {%- endmacro %} @@ -263,7 +263,7 @@ phi::KernelKey GetExpectedKernelType( {%if kernel["data_type"] is not none %}{# data type ---------------------------------#} {% if kernel["data_type"]["candidates"] | length == 1 %} {% set data_type_arg = kernel["data_type"]["candidates"][0] %} - {% set inputs = op["inputs"] | map(attribute="name") | list %} + {% set inputs = op["inputs"] | map(attribute="fluid_name") | list %} {% if data_type_arg in inputs %} auto data_type = framework::OperatorWithKernel::IndicateVarDataType(ctx, {{data_type_arg | to_opmaker_name}}); {% if kernel["data_type"]["to_complex_flag"][0] %} @@ -353,9 +353,8 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER({{op["op_name"] | to_pascal_case}}NoNeedBuff {% endif %} {% endmacro%} -{% macro register_op_with_components(op, op_dict) %} +{% macro register_op_with_components(op) %} {% set name = op["op_name"] %} -{% set phi_name = op["name"] %} REGISTER_OPERATOR({{name}}, ops::{{name | to_pascal_case}}Op, {% if not "forward" in op %}{# it is a forward op #} ops::{{name | to_pascal_case}}OpMaker, @@ -371,8 +370,8 @@ REGISTER_OPERATOR({{name}}, ops::{{name | to_pascal_case}}Op, {% if op is supports_inplace %}{# inplace#} ops::{{name | to_pascal_case}}InplaceInferer, {% endif %} -{% if "phi_backward" in op and op["phi_backward"] is not none and "composite" in op_dict[op["phi_backward"]] %} - ops::{{op["phi_backward"] | to_composite_grad_opmaker_name}}, +{% if "backward_composite" in op and op["backward_composite"] is not none %} + ops::{{op["backward_composite"] | to_composite_grad_opmaker_name}}, {% endif %} {% if op is supports_no_need_buffer %}{# no_need_buffer #} ops::{{name | to_pascal_case}}NoNeedBufferVarInferer, @@ -425,12 +424,12 @@ REGISTER_OP_VERSION({{name}}) {# --------------------------------------- backward op maker ---------------------------------------------- #} {% macro backward_op_maker(op, forward_op ) %} {% set name = op["op_name"] %} - {% set forward_input_names = op["forward"]["inputs"] | map(attribute="name") | list %} - {% set forward_output_names = op["forward"]["outputs"] | map(attribute="name") | list %} - {% set forward_attr_names = op["forward"]["attrs"] | map(attribute="name") | list %} - {% set forward_input_orig_names = forward_op["inputs"] | map(attribute="name") | list %} - {% set forward_output_orig_names = forward_op["outputs"] | map(attribute="name") | list %} - {% set forward_attr_orig_names = forward_op["attrs"] | map(attribute="name") | list %} + {% set forward_input_names = op["forward"]["inputs"] | map(attribute="fluid_name") | list %} + {% set forward_output_names = op["forward"]["outputs"] | map(attribute="fluid_name") | list %} + {% set forward_attr_names = op["forward"]["attrs"] | map(attribute="fluid_name") | list %} + {% set forward_input_orig_names = forward_op["inputs"] | map(attribute="fluid_name") | list %} + {% set forward_output_orig_names = forward_op["outputs"] | map(attribute="fluid_name") | list %} + {% set forward_attr_orig_names = forward_op["attrs"] | map(attribute="fluid_name") | list %} template class {{name | to_pascal_case}}OpMaker : public framework::SingleGradOpMaker { 
public: @@ -441,8 +440,8 @@ class {{name | to_pascal_case}}OpMaker : public framework::SingleGradOpMaker grad_op->SetType("{{name}}"); {% for input in op["inputs"] %} - grad_op->SetInput({{input["name"] | to_opmaker_name}}, this->{{extract_input_from_forward( - input["name"], + grad_op->SetInput({{input["fluid_name"] | to_opmaker_name}}, this->{{extract_input_from_forward( + input["fluid_name"], forward_input_names, forward_output_names, forward_input_orig_names, @@ -450,8 +449,8 @@ class {{name | to_pascal_case}}OpMaker : public framework::SingleGradOpMaker {% endfor %} {% for output in op["outputs"] %} - grad_op->SetOutput({{output["name"] | to_opmaker_name}}, this->{{extract_output_from_forward( - output["name"], + grad_op->SetOutput({{output["fluid_name"] | to_opmaker_name}}, this->{{extract_output_from_forward( + output["fluid_name"], forward_input_names, forward_output_names, forward_input_orig_names, @@ -461,7 +460,7 @@ class {{name | to_pascal_case}}OpMaker : public framework::SingleGradOpMaker grad_op->SetAttrMap(this->Attrs()); {% for attr in op["attrs"] %} - {% set attr_name = attr["name"] %} + {% set attr_name = attr["fluid_name"] %} {% if attr_name in forward_attr_names %} {% if attr["typename"] == "IntArray" %} {% if 'tensor_name' in attr or 'manual_flag' not in attr %} @@ -489,12 +488,12 @@ class {{name | to_pascal_case}}OpMaker : public framework::SingleGradOpMaker {% macro backward_op_reused_maker(bw_op, forward_op, invoke_op) %} {% set name = bw_op["op_name"] %} - {% set forward_input_names = bw_op["forward"]["inputs"] | map(attribute="name") | list %} - {% set forward_output_names = bw_op["forward"]["outputs"] | map(attribute="name") | list %} - {% set forward_attr_names = bw_op["forward"]["attrs"] | map(attribute="name") | list %} - {% set forward_input_orig_names = forward_op["inputs"] | map(attribute="name") | list %} - {% set forward_output_orig_names = forward_op["outputs"] | map(attribute="name") | list %} - {% set forward_attr_orig_names = forward_op["attrs"] | map(attribute="name") | list %} + {% set forward_input_names = bw_op["forward"]["inputs"] | map(attribute="fluid_name") | list %} + {% set forward_output_names = bw_op["forward"]["outputs"] | map(attribute="fluid_name") | list %} + {% set forward_attr_names = bw_op["forward"]["attrs"] | map(attribute="fluid_name") | list %} + {% set forward_input_orig_names = forward_op["inputs"] | map(attribute="fluid_name") | list %} + {% set forward_output_orig_names = forward_op["outputs"] | map(attribute="fluid_name") | list %} + {% set forward_attr_orig_names = forward_op["attrs"] | map(attribute="fluid_name") | list %} template class {{name | to_pascal_case}}OpMaker : public framework::SingleGradOpMaker { public: @@ -505,7 +504,7 @@ class {{name | to_pascal_case}}OpMaker : public framework::SingleGradOpMaker grad_op->SetType("{{invoke_op["func"]}}"); {% for input in invoke_op["inputs"] %} - grad_op->SetInput({{input["name"] | to_opmaker_name}}, this->{{extract_input_from_forward( + grad_op->SetInput({{input["fluid_name"] | to_opmaker_name}}, this->{{extract_input_from_forward( input["value"], forward_input_names, forward_output_names, @@ -514,7 +513,7 @@ class {{name | to_pascal_case}}OpMaker : public framework::SingleGradOpMaker {% endfor %} {% for output in invoke_op["outputs"] %} - grad_op->SetOutput({{output["name"] | to_opmaker_name}}, this->{{extract_output_from_forward( + grad_op->SetOutput({{output["fluid_name"] | to_opmaker_name}}, this->{{extract_output_from_forward( output["value"], forward_input_names, 
forward_output_names, @@ -524,42 +523,49 @@ class {{name | to_pascal_case}}OpMaker : public framework::SingleGradOpMaker {% endfor %} {% for attr in invoke_op["attrs"] %} - grad_op->SetAttr("{{attr["name"]}}", {{attr["value"]}}); + grad_op->SetAttr("{{attr["fluid_name"]}}", {{attr["value"]}}); {% endfor %} } }; {% endmacro %} -{% macro composite_grad_op_maker(composite_op_dict) %} - {% set op_name = composite_op_dict["name"] %} -class {{op_name | to_composite_grad_opmaker_name}} : public prim::GradCompositeOpMakerBase { +{% macro composite_grad_op_maker(backward_op) %} + {% set op_name = backward_op["op_name"] %} + {% set inputs = backward_op["inputs"] | to_variable_names("name")%} + {% set input_dict = backward_op["input_dict"] %} + {% set fluid_inputs = backward_op["inputs"] | to_variable_names("fluid_name")%} + {% set forward_fluid_inputs = backward_op["forward"]["inputs"] | to_variable_names("fluid_name")%} + {% set forward_fluid_outputs = backward_op["forward"]["outputs"] | to_variable_names("fluid_name")%} + {% set attrs = backward_op["attrs"] | to_variable_names("name") %} + {% set fluid_attrs = backward_op["attrs"] | to_variable_names("fluid_name") %} + {% set attr_dict = backward_op["attr_dict"] %} + {% set outputs = backward_op["outputs"] | to_variable_names("name")%} + {% set output_dict = backward_op["output_dict"] %} + {% set fluid_outputs = backward_op["outputs"] | to_variable_names("fluid_name")%} + {% set composite_func_info = backward_op["composite"] %} +class {{op_name | to_composite_grad_opmaker_name}} : public prim::CompositeGradOpMakerBase { public: - using prim::GradCompositeOpMakerBase::GradCompositeOpMakerBase; + using prim::CompositeGradOpMakerBase::CompositeGradOpMakerBase; void Apply() override { //get inputs -{{construct_composite_input(composite_op_dict)}} +{{construct_composite_input(inputs, fluid_inputs, forward_fluid_inputs, forward_fluid_outputs, input_dict)}} //get attr -{{construct_composite_attr(composite_op_dict)}} +{{construct_composite_attr(attrs, fluid_attrs, attr_dict)}} //get output -{{construct_composite_output(composite_op_dict)}} +{{construct_composite_output(outputs, fluid_outputs, output_dict)}} //get output ptr -{{construct_composite_output_ptr(composite_op_dict)}} +{{construct_composite_output_ptr(outputs, output_dict)}} //get output orginal name -{{get_composite_output_orginal_name(composite_op_dict)}} +{{get_composite_output_orginal_name(outputs, output_dict)}} //call composite backward func -{{call_composite_backward_api(composite_op_dict)}} +{{call_composite_backward_api(composite_func_info)}} //recover output name -{{recover_composite_output_name(composite_op_dict)}} +{{recover_composite_output_name(outputs)}} } }; {%- endmacro %} -{% macro construct_composite_input(composite_op_dict) %} - {% set inputs = composite_op_dict["composite"]["phi_inputs"] %} - {% set input_dict = composite_op_dict["input_dict"] %} - {% set fluid_inputs = composite_op_dict["composite"]["fluid_inputs"] %} - {% set forward_fluid_inputs = composite_op_dict["forward"]["inputs"] | map(attribute="name") | list %} - {% set forward_fluid_outputs = composite_op_dict["forward"]["outputs"] | map(attribute="name") | list %} +{% macro construct_composite_input(inputs, fluid_inputs, forward_fluid_inputs, forward_fluid_outputs, input_dict) %} {% set inputs_length = inputs | length %} {% for i in range(inputs_length) %} {% set input_typename = input_dict[inputs[i]]["typename"] %} @@ -567,83 +573,75 @@ class {{op_name | to_composite_grad_opmaker_name}} : public 
prim::GradCompositeO {% if fluid_inputs[i] in forward_fluid_inputs %} {% if input_typename == "Tensor" %} {% if input_optional_flag == True %} - paddle::optional {{inputs[i]}} = this->GetOptionalSingleForwardInput("{{fluid_inputs[i]}}"); - {% elif input_optional_flag == False %} - paddle::experimental::Tensor {{inputs[i]}} = this->GetSingleForwardInput("{{fluid_inputs[i]}}"); + auto {{inputs[i]}} = this->GetOptionalSingleForwardInput("{{fluid_inputs[i]}}"); + {% else %} + auto {{inputs[i]}} = this->GetSingleForwardInput("{{fluid_inputs[i]}}"); {% endif %} {% elif input_typename == "Tensor[]" %} {% if input_optional_flag == True %} - std::vector> {{inputs[i]}} = this->GetOptionalMultiForwardInput("{{fluid_inputs[i]}}"); - {% elif input_optional_flag == False %} - std::vector {{inputs[i]}} = this->GetMultiForwardInput("{{fluid_inputs[i]}}"); + auto {{inputs[i]}} = this->GetOptionalMultiForwardInput("{{fluid_inputs[i]}}"); + {% else %} + auto {{inputs[i]}} = this->GetMultiForwardInput("{{fluid_inputs[i]}}"); {% endif %} {% endif %} {% elif fluid_inputs[i] in forward_fluid_outputs %} {% if input_typename == "Tensor" %} {% if input_optional_flag == True %} - paddle::optional {{inputs[i]}} = this->GetOptionalSingleForwardOutput("{{fluid_inputs[i]}}"); - {% elif input_optional_flag == False %} - paddle::experimental::Tensor {{inputs[i]}} = this->GetSingleForwardOutput("{{fluid_inputs[i]}}"); + auto {{inputs[i]}} = this->GetOptionalSingleForwardOutput("{{fluid_inputs[i]}}"); + {% else %} + auto {{inputs[i]}} = this->GetSingleForwardOutput("{{fluid_inputs[i]}}"); {% endif %} {% elif input_typename == "Tensor[]" %} {% if input_optional_flag == True %} - std::vector> {{inputs[i]}} = this->GetOptionalMultiForwardOutput("{{fluid_inputs[i]}}"); - {% elif input_optional_flag == False %} - std::vector {{inputs[i]}} = this->GetMultiForwardOutput("{{fluid_inputs[i]}}"); + auto {{inputs[i]}} = this->GetOptionalMultiForwardOutput("{{fluid_inputs[i]}}"); + {% else %} + auto {{inputs[i]}} = this->GetMultiForwardOutput("{{fluid_inputs[i]}}"); {% endif %} {% endif %} {% elif fluid_inputs[i][:-5] in forward_fluid_outputs %} {% if input_typename == "Tensor" %} {% if input_optional_flag == True %} - paddle::optional {{inputs[i]}} = this->GetOptionalSingleOutputGrad("{{fluid_inputs[i][:-5]}}"); - {% elif input_optional_flag == False %} - paddle::experimental::Tensor {{inputs[i]}} = this->GetSingleOutputGrad("{{fluid_inputs[i][:-5]}}"); + auto {{inputs[i]}} = this->GetOptionalSingleOutputGrad("{{fluid_inputs[i][:-5]}}"); + {% else %} + auto {{inputs[i]}} = this->GetSingleOutputGrad("{{fluid_inputs[i][:-5]}}"); {% endif %} {% elif input_typename == "Tensor[]" %} {% if input_optional_flag == True %} - std::vector> {{inputs[i]}} = this->GetOptionalMultiOutputGrad("{{fluid_inputs[i][:-5]}}"); - {% elif input_optional_flag == False %} - std::vector {{inputs[i]}} = this->GetMultiOutputGrad("{{fluid_inputs[i][:-5]}}"); + auto {{inputs[i]}} = this->GetOptionalMultiOutputGrad("{{fluid_inputs[i][:-5]}}"); + {% else %} + auto {{inputs[i]}} = this->GetMultiOutputGrad("{{fluid_inputs[i][:-5]}}"); {%- endif %} {%- endif %} {%- endif %} {%- endfor %} {%- endmacro %} -{% macro construct_composite_attr(composite_op_dict) %} - {% set attrs = composite_op_dict["composite"]["phi_attrs"] %} - {% set fluid_attrs = composite_op_dict["composite"]["fluid_attrs"] %} - {% set fluid_attrs_dict = composite_op_dict["attr_dict"] %} +{% macro construct_composite_attr(attrs, fluid_attrs, attr_dict) %} {% set attrs_length = attrs | length %} {% for i 
in range(attrs_length) %} - {% set attrs_data_type = fluid_attrs_dict[fluid_attrs[i]]["typename"] | to_op_attr_type %} - {{attrs_data_type}} {{attrs[i]}} = this->Attr<{{attrs_data_type}}>("{{fluid_attrs[i]}}"); + {% set attrs_data_type = attr_dict[attrs[i]]["typename"] | to_op_attr_type %} + const {{attrs_data_type}} {{attrs[i]}} = this->Attr<{{attrs_data_type}}>("{{fluid_attrs[i]}}"); {% endfor %} {%- endmacro %} -{% macro construct_composite_output(composite_op_dict) %} - {% set outputs = composite_op_dict["composite"]["phi_outputs"] %} - {% set fluid_outputs = composite_op_dict["composite"]["fluid_outputs"] %} - {% set outputs_dict = composite_op_dict["output_dict"] %} +{% macro construct_composite_output(outputs, fluid_outputs, output_dict) %} {% set outputs_length = outputs | length %} {% for i in range(outputs_length) %} - {% set output_typename = outputs_dict[outputs[i]]["typename"] %} + {% set output_typename = output_dict[outputs[i]]["typename"] %} {% if output_typename == "Tensor" %} - paddle::experimental::Tensor {{outputs[i] + "_t"}} = this->GetSingleInputGrad("{{fluid_outputs[i][:-5]}}"); + auto {{outputs[i] + "_t"}} = this->GetSingleInputGrad("{{fluid_outputs[i][:-5]}}"); {% elif output_typename == "Tensor[]" %} - std::vector {{outputs[i] + "_t"}} = this->GetMultiInputGrad("{{fluid_outputs[i][:-5]}}"); + auto {{outputs[i] + "_t"}} = this->GetMultiInputGrad("{{fluid_outputs[i][:-5]}}"); {%- endif %} {%- endfor %} {%- endmacro %} -{% macro construct_composite_output_ptr(composite_op_dict) %} - {% set outputs = composite_op_dict["composite"]["phi_outputs"] %} - {% set outputs_dict = composite_op_dict["output_dict"] %} +{% macro construct_composite_output_ptr(outputs, output_dict) %} {% set outputs_length = outputs | length %} {% for i in range(outputs_length) %} - {% set output_typename = outputs_dict[outputs[i]]["typename"] %} + {% set output_typename = output_dict[outputs[i]]["typename"] %} {% if output_typename == "Tensor" %} - paddle::experimental::Tensor* {{outputs[i]}} = this->GetOutputPtr(&{{outputs[i]+ "_t"}}); + auto {{outputs[i]}} = this->GetOutputPtr(&{{outputs[i]+ "_t"}}); {% elif output_typename == "Tensor[]" %} std::vector {{outputs[i]}}({{outputs[i] + "_t"}}.size()); for(size_t i = 0; i < {{outputs[i]}}.size(); ++i){ @@ -654,27 +652,24 @@ class {{op_name | to_composite_grad_opmaker_name}} : public prim::GradCompositeO {%- endfor %} {%- endmacro %} -{% macro get_composite_output_orginal_name(composite_op_dict) %} - {% set outputs = composite_op_dict["composite"]["phi_outputs"] %} - {% set outputs_dict = composite_op_dict["output_dict"] %} +{% macro get_composite_output_orginal_name(outputs, output_dict) %} {% set outputs_length = outputs | length %} {% for i in range(outputs_length) %} - {% set output_typename = outputs_dict[outputs[i]]["typename"] %} + {% set output_typename = output_dict[outputs[i]]["typename"] %} {% if output_typename == "Tensor" %} - std::string {{outputs[i] + "_name"}} = this->GetOutputName({{outputs[i] + "_t"}}); + auto {{outputs[i] + "_name"}} = this->GetOutputName({{outputs[i] + "_t"}}); {% elif output_typename == "Tensor[]" %} - std::vector {{outputs[i] + "_name"}} = this->GetOutputName({{outputs[i] + "_t"}}); + auto {{outputs[i] + "_name"}} = this->GetOutputName({{outputs[i] + "_t"}}); {%- endif %} {%- endfor %} {%- endmacro %} -{% macro call_composite_backward_api(composite_op_dict) %} - VLOG(3) << "Runing {{composite_op_dict["composite"]["func_name"]}} composite func"; - 
prim::{{composite_op_dict["composite"]["func_name"]}}({{composite_op_dict["composite"]["func_args"]}}); +{% macro call_composite_backward_api(composite_func_info) %} + VLOG(3) << "Runing {{composite_func_info["func_name"]}} composite func"; + prim::{{composite_func_info["func_name"]}}({{composite_func_info["func_args"]}}); {%- endmacro %} -{% macro recover_composite_output_name(composite_op_dict) %} - {% set outputs = composite_op_dict["composite"]["phi_outputs"] %} +{% macro recover_composite_output_name(outputs) %} {% set outputs_length = outputs | length %} {% for i in range(outputs_length) %} this->RecoverOutputName({{outputs[i] + "_t"}}, {{outputs[i] + "_name"}}); diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc index 2b337887faa3f..25e6ad9b65cc0 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc @@ -64,9 +64,9 @@ class ReduceSumOpGradMaker : public framework::SingleGradOpMaker { } }; -class ReduceSumCompositeGradOpMaker : public prim::GradCompositeOpMakerBase { +class ReduceSumCompositeGradOpMaker : public prim::CompositeGradOpMakerBase { public: - using prim::GradCompositeOpMakerBase::GradCompositeOpMakerBase; + using prim::CompositeGradOpMakerBase::CompositeGradOpMakerBase; void Apply() override { // get inputs paddle::experimental::Tensor x = this->GetSingleForwardInput("X"); diff --git a/paddle/fluid/prim/tests/test_static_prim.cc b/paddle/fluid/prim/tests/test_static_prim.cc index fe7a6ca404044..313a3ccc99b74 100644 --- a/paddle/fluid/prim/tests/test_static_prim.cc +++ b/paddle/fluid/prim/tests/test_static_prim.cc @@ -135,9 +135,9 @@ struct TestBaseProgram { int idx_{0}; }; -class TestGradCompositeGradMaker : public GradCompositeOpMakerBase { +class TestCompositeGradMaker : public CompositeGradOpMakerBase { public: - using prim::GradCompositeOpMakerBase::GradCompositeOpMakerBase; + using prim::CompositeGradOpMakerBase::CompositeGradOpMakerBase; void Apply() override {} }; @@ -177,7 +177,7 @@ TEST(StaticPrim, TanhBackwardComposite) { std::vector> grad_ops = std::move(framework::OpInfoMap::Instance() .Get(forward_opdesc->Type()) - .GradCompOpMaker()(*forward_opdesc, + .CompGradOpMaker()(*forward_opdesc, std::unordered_set(), &grad_to_var, target_block, @@ -250,11 +250,11 @@ TEST(StaticCompositeGradMaker, TestMutiInputMethod) { auto* forward_opdesc = target_block->AllOps()[0]; std::unordered_map grad_to_var; std::vector grad_sub_block; - auto test = TestGradCompositeGradMaker(*forward_opdesc, - std::unordered_set(), - &grad_to_var, - target_block, - grad_sub_block); + auto test = TestCompositeGradMaker(*forward_opdesc, + std::unordered_set(), + &grad_to_var, + target_block, + grad_sub_block); test(); std::vector muti_fw_input = test.GetMultiForwardInput("X"); @@ -312,11 +312,11 @@ TEST(StaticCompositeGradMaker, TestMutiOutputMethod) { auto* forward_opdesc = target_block->AllOps()[0]; std::unordered_map grad_to_var; std::vector grad_sub_block; - auto test = TestGradCompositeGradMaker(*forward_opdesc, - std::unordered_set(), - &grad_to_var, - target_block, - grad_sub_block); + auto test = TestCompositeGradMaker(*forward_opdesc, + std::unordered_set(), + &grad_to_var, + target_block, + grad_sub_block); test(); paddle::experimental::Tensor fw_input = test.GetSingleForwardInput("X"); paddle::optional opt_fw_input = diff --git a/paddle/fluid/prim/utils/static/composite_grad_desc_maker.h b/paddle/fluid/prim/utils/static/composite_grad_desc_maker.h index 
c2e7ca4ec57e2..e391d8ac5300b 100644 --- a/paddle/fluid/prim/utils/static/composite_grad_desc_maker.h +++ b/paddle/fluid/prim/utils/static/composite_grad_desc_maker.h @@ -41,9 +41,9 @@ namespace prim { argument DropEmptyIG in the derived classes. */ -class GradCompositeOpMakerBase { +class CompositeGradOpMakerBase { public: - explicit GradCompositeOpMakerBase( + explicit CompositeGradOpMakerBase( const framework::OpDesc& fwd_op, const std::unordered_set& no_grad_set, std::unordered_map* grad_to_var, @@ -61,7 +61,7 @@ class GradCompositeOpMakerBase { acting_program_.MutableBlock(0)); } - virtual ~GradCompositeOpMakerBase() = default; + virtual ~CompositeGradOpMakerBase() = default; virtual std::vector> operator()() { this->Apply(); diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 36e2436406812..5569657707389 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -1251,7 +1251,7 @@ All parameter, weight, gradient are variables in Paddle. auto op_info = framework::OpInfoMap::Instance().Get(op_desc.Type()); auto grad_op_maker = op_info.GradOpMaker(); - auto grad_comp_op_maker = op_info.GradCompOpMaker(); + auto grad_comp_op_maker = op_info.CompGradOpMaker(); if ((grad_op_maker == nullptr) && (grad_comp_op_maker == nullptr)) { // Normally, proto_ should not be null, except some special @@ -1259,7 +1259,7 @@ All parameter, weight, gradient are variables in Paddle. std::string type = op_info.proto_ ? op_info.proto_->type() : "unknown"; PADDLE_THROW(platform::errors::NotFound( - "Neither operator %s's GradOpMaker nor GradCompOpMaker has " + "Neither operator %s's GradOpMaker nor CompGradOpMaker has " "been registered.\nPlease check whether (%s) operator has " "gradient operator.\nIf not, please set stop_gradient to be " "True for its input and output variables using " @@ -1268,10 +1268,10 @@ All parameter, weight, gradient are variables in Paddle. type.c_str())); } - // In PrimEnabled mode, the priority of GradCompOpMaker is greater + // In PrimEnabled mode, the priority of CompGradOpMaker is greater // than GradCompMaker as we need split first-order grad operator into // primitive operators for compiler. In PrimDisabled mode, the - // priority of GradCompOpMaker is less than GradCompMaker for better + // priority of CompGradOpMaker is less than GradCompMaker for better // performance. 
std::vector> grad_op_descs; if (paddle::prim::PrimCommonUtils::IsBwdPrimEnabled()) { From c18fddd348391b3008807d68061ed7e92bd4b87a Mon Sep 17 00:00:00 2001 From: niuliling123 <51102941+niuliling123@users.noreply.github.com> Date: Tue, 31 Jan 2023 16:49:03 +0800 Subject: [PATCH 42/89] Save nan log to file when output_dir is setted (#49200) --- .../framework/details/nan_inf_utils_detail.cc | 123 ++---------- .../framework/details/nan_inf_utils_detail.cu | 34 +++- .../framework/details/nan_inf_utils_detail.h | 187 +++++++++++++++++- paddle/fluid/pybind/pybind.cc | 4 + .../fluid/tests/unittests/test_nan_inf_dir.py | 108 ++++++++++ 5 files changed, 344 insertions(+), 112 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/test_nan_inf_dir.py diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cc b/paddle/fluid/framework/details/nan_inf_utils_detail.cc index 30046b2d1d44e..dd7791af85447 100644 --- a/paddle/fluid/framework/details/nan_inf_utils_detail.cc +++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cc @@ -30,6 +30,23 @@ DECLARE_int32(check_nan_inf_level); namespace paddle { namespace framework { namespace details { +struct DebugTools { + DebugTools() {} + std::string path = ""; +}; +static DebugTools debug_nan_inf; + +void SetNanInfDebugPath(const std::string& nan_inf_path) { + debug_nan_inf.path = nan_inf_path; + VLOG(4) << "Set the log's path of debug tools : " << nan_inf_path; +} + +std::string GetNanPath() { + if (debug_nan_inf.path.empty()) { + return ""; + } + return debug_nan_inf.path + "/"; +} static std::once_flag white_list_init_flag; @@ -134,112 +151,6 @@ static void InitWhiteListFormEnv() { } } -template < - typename T, - std::enable_if_t>::value && - !std::is_same>::value, - bool> = true> -static void CheckNanInfCpuImpl(const T* value_ptr, - const int64_t numel, - const std::string& cpu_hint_str) { - using MT = typename phi::dtype::template MPTypeTrait::Type; - -#ifdef _OPENMP - // Use maximum 4 threads to collect the nan and inf information. - int num_threads = std::max(omp_get_num_threads(), 1); - num_threads = std::min(num_threads, 4); -#else - int num_threads = 1; -#endif - - std::vector thread_num_nan(num_threads, 0); - std::vector thread_num_inf(num_threads, 0); - std::vector thread_min_value(num_threads, static_cast(value_ptr[0])); - std::vector thread_max_value(num_threads, static_cast(value_ptr[0])); - std::vector thread_mean_value(num_threads, static_cast(0)); - -#ifdef _OPENMP -#pragma omp parallel num_threads(num_threads) -#endif - { -#ifdef _OPENMP - int64_t tid = omp_get_thread_num(); - int64_t chunk_size = (numel + num_threads - 1) / num_threads; - int64_t begin = tid * chunk_size; - int64_t end = chunk_size + begin > numel ? 
numel : chunk_size + begin; -#else - int64_t tid = 0; - int64_t begin = 0; - int64_t end = numel; -#endif - for (int64_t i = begin; i < end; ++i) { - MT value = static_cast(value_ptr[i]); - - thread_min_value[tid] = std::min(thread_min_value[tid], value); - thread_max_value[tid] = std::max(thread_max_value[tid], value); - thread_mean_value[tid] += value / static_cast(numel); - - if (std::isnan(value)) { - thread_num_nan[tid] += 1; - } else if (std::isinf(value)) { - thread_num_inf[tid] += 1; - } - } - } - - int64_t num_nan = 0; - int64_t num_inf = 0; - MT min_value = thread_min_value[0]; - MT max_value = thread_max_value[0]; - MT mean_value = static_cast(0); - for (int i = 0; i < num_threads; ++i) { - num_nan += thread_num_nan[i]; - num_inf += thread_num_inf[i]; - min_value = std::min(thread_min_value[i], min_value); - max_value = std::max(thread_max_value[i], max_value); - mean_value += thread_mean_value[i]; - } - - PrintForDifferentLevel(cpu_hint_str.c_str(), - numel, - num_nan, - num_inf, - max_value, - min_value, - mean_value, - FLAGS_check_nan_inf_level); -} - -template < - typename T, - std::enable_if_t>::value || - std::is_same>::value, - bool> = true> -void CheckNanInfCpuImpl(const T* value_ptr, - const int64_t numel, - const std::string& cpu_hint_str) { - using RealType = typename T::value_type; - - RealType real_sum = 0.0f, imag_sum = 0.0f; - -#ifdef _OPENMP -#pragma omp parallel for reduction(+ : real_sum) reduction(+ : imag_sum) -#endif - for (int64_t i = 0; i < numel; ++i) { - T value = value_ptr[i]; - real_sum += (value.real - value.real); - imag_sum += (value.imag - value.imag); - } - - if (std::isnan(real_sum) || std::isinf(real_sum) || std::isnan(imag_sum) || - std::isinf(imag_sum)) { - // hot fix for compile failed in gcc4.8 - // here also need print detail info of nan or inf later - PADDLE_THROW(platform::errors::PreconditionNotMet( - "There are NAN or INF in %s.", cpu_hint_str)); - } -} - template <> template void TensorCheckerVisitor::apply( diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cu b/paddle/fluid/framework/details/nan_inf_utils_detail.cu index 8754a33b663db..514e5aa3e5d56 100644 --- a/paddle/fluid/framework/details/nan_inf_utils_detail.cu +++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cu @@ -322,18 +322,26 @@ __global__ void FindGlobalMaxMinAndPrint(const int64_t* block_num_nan_ptr, } template -static char* GetGpuHintStringPtr(const phi::GPUContext& ctx, - const std::string& op_type, +inline std::string GetHintString(const std::string& op_type, const std::string& var_name, - int dev_id) { + const phi::Place& place, + int dev_id = -1) { + std::string op_var = GetCpuHintString(op_type, var_name, place, dev_id); PADDLE_ENFORCE_EQ( (dev_id >= 0 && dev_id < multi_op_var2gpu_str_mutex().size()), true, platform::errors::OutOfRange("GPU dev_id must >=0 and < dev_count=%d", multi_op_var2gpu_str_mutex().size())); + return op_var; +} +template +static char* GetGpuHintStringPtr(const phi::GPUContext& ctx, + const std::string& op_type, + const std::string& var_name, + int dev_id) { std::string op_var = - GetCpuHintString(op_type, var_name, ctx.GetPlace(), dev_id); + GetHintString(op_type, var_name, ctx.GetPlace(), dev_id); char* gpu_str_ptr = nullptr; { @@ -396,6 +404,24 @@ void TensorCheckerVisitor::apply( auto* dev_ctx = reinterpret_cast( platform::DeviceContextPool::Instance().Get(tensor.place())); int dev_id = tensor.place().device; + // Write log to file + auto file_path = GetNanPath(); + if (file_path.size() > 0) { + phi::DenseTensor 
cpu_tensor; + platform::CPUPlace cpu_place; + cpu_tensor.Resize(tensor.dims()); + // 1. copy from gpu to cpu + paddle::framework::TensorCopySync(tensor, cpu_place, &cpu_tensor); + auto* dev_ctx = reinterpret_cast( + platform::DeviceContextPool::Instance().Get(tensor.place())); + const std::string debug_info = + GetHintString(op_type, var_name, place, dev_id); + // 2. write log to file + CheckNanInfCpuImpl(cpu_tensor.data(), tensor.numel(), debug_info, "gpu"); + return; + } + + // Write log to window char* gpu_str_ptr = GetGpuHintStringPtr(*dev_ctx, op_type, var_name, dev_id); diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.h b/paddle/fluid/framework/details/nan_inf_utils_detail.h index 0adf23fd02921..fee2a52b428b2 100644 --- a/paddle/fluid/framework/details/nan_inf_utils_detail.h +++ b/paddle/fluid/framework/details/nan_inf_utils_detail.h @@ -13,17 +13,33 @@ // limitations under the License. #pragma once - +#include +#include #include - #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/place.h" +#include "paddle/phi/common/amp_type_traits.h" +#include "paddle/phi/kernels/funcs/eigen/extensions.h" + +#ifdef _WIN32 +#include +#include +#define MKDIR(path) _mkdir(path) +#else +#include +#define MKDIR(path) mkdir(path, S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH) +#endif +DECLARE_int32(check_nan_inf_level); namespace paddle { namespace framework { namespace details { +void SetNanInfDebugPath(const std::string& nan_inf_path); + +std::string GetNanPath(); + template ::value, bool> = true> @@ -93,6 +109,49 @@ HOSTDEVICE void PrintForDifferentLevel(const char* debug_info, } } +template +void PrintForDifferentLevelFile(const char* debug_info, + int64_t numel, + int64_t num_nan, + int64_t num_inf, + MT max_value, + MT min_value, + MT mean_value, + int check_nan_inf_level, + const std::string& log_name) { + int dev_id = 0; +#ifdef PADDLE_WITH_HIP + hipGetDevice(&dev_id); +#elif PADDLE_WITH_CUDA + cudaGetDevice(&dev_id); +#endif + auto file_path = GetNanPath(); + MKDIR(file_path.c_str()); + std::string file_name = "worker_" + log_name + "." 
+ std::to_string(dev_id); + std::string path = file_path + file_name; + std::ofstream outfile(path, std::ios::app); + if (!outfile.is_open()) { + return; + } + + if (num_nan > 0 || num_inf > 0) { + outfile << "[PRECISION] [ERROR] in " << debug_info + << ", numel=" << static_cast(numel) // NOLINT + << ", num_nan=" << static_cast(num_nan) // NOLINT + << ", num_inf=" << static_cast(num_inf) // NOLINT + << ", max=" << static_cast(max_value) + << ", min=" << static_cast(min_value) + << ", mean=" << static_cast(mean_value) << std::endl; + } else if (NeedPrint(max_value, min_value, check_nan_inf_level)) { + outfile << "[PRECISION] in " << debug_info + << ", numel=" << static_cast(numel) // NOLINT + << ", max=" << static_cast(max_value) + << ", min=" << static_cast(min_value) + << ", mean=" << static_cast(mean_value) << std::endl; + } + outfile.close(); +} + template inline std::string GetCpuHintString(const std::string& op_type, const std::string& var_name, @@ -120,6 +179,130 @@ inline std::string GetCpuHintString(const std::string& op_type, return ss.str(); } +template < + typename T, + std::enable_if_t>::value && + !std::is_same>::value, + bool> = true> +static void CheckNanInfCpuImpl(const T* value_ptr, + const int64_t numel, + const std::string& cpu_hint_str, + const std::string log_name = "cpu") { + using MT = typename phi::dtype::template MPTypeTrait::Type; + +#ifdef _OPENMP + // Use maximum 4 threads to collect the nan and inf information. + int num_threads = std::max(omp_get_num_threads(), 1); + num_threads = std::min(num_threads, 4); +#else + int num_threads = 1; +#endif + + std::vector thread_num_nan(num_threads, 0); + std::vector thread_num_inf(num_threads, 0); + std::vector thread_min_value(num_threads, static_cast(value_ptr[0])); + std::vector thread_max_value(num_threads, static_cast(value_ptr[0])); + std::vector thread_mean_value(num_threads, static_cast(0)); + +#ifdef _OPENMP +#pragma omp parallel num_threads(num_threads) +#endif + { +#ifdef _OPENMP + int64_t tid = omp_get_thread_num(); + int64_t chunk_size = (numel + num_threads - 1) / num_threads; + int64_t begin = tid * chunk_size; + int64_t end = chunk_size + begin > numel ? numel : chunk_size + begin; +#else + int64_t tid = 0; + int64_t begin = 0; + int64_t end = numel; +#endif + for (int64_t i = begin; i < end; ++i) { + MT value = static_cast(value_ptr[i]); + + thread_min_value[tid] = std::min(thread_min_value[tid], value); + thread_max_value[tid] = std::max(thread_max_value[tid], value); + thread_mean_value[tid] += value / static_cast(numel); + + if (std::isnan(value)) { + thread_num_nan[tid] += 1; + } else if (std::isinf(value)) { + thread_num_inf[tid] += 1; + } + } + } + + int64_t num_nan = 0; + int64_t num_inf = 0; + MT min_value = thread_min_value[0]; + MT max_value = thread_max_value[0]; + MT mean_value = static_cast(0); + for (int i = 0; i < num_threads; ++i) { + num_nan += thread_num_nan[i]; + num_inf += thread_num_inf[i]; + min_value = std::min(thread_min_value[i], min_value); + max_value = std::max(thread_max_value[i], max_value); + mean_value += thread_mean_value[i]; + } + auto file_path = GetNanPath(); + // Write log to file + if (file_path.size() > 0) { + VLOG(4) << "[FLAGS_check_nan_inf_level=" << FLAGS_check_nan_inf_level + << "]. 
Write log to " << file_path; + PrintForDifferentLevelFile(cpu_hint_str.c_str(), + numel, + num_nan, + num_inf, + max_value, + min_value, + mean_value, + FLAGS_check_nan_inf_level, + log_name); + return; + } + + PrintForDifferentLevel(cpu_hint_str.c_str(), + numel, + num_nan, + num_inf, + max_value, + min_value, + mean_value, + FLAGS_check_nan_inf_level); +} + +template < + typename T, + std::enable_if_t>::value || + std::is_same>::value, + bool> = true> +void CheckNanInfCpuImpl(const T* value_ptr, + const int64_t numel, + const std::string& cpu_hint_str, + const std::string log_name = "cpu") { + using RealType = typename T::value_type; + + RealType real_sum = 0.0f, imag_sum = 0.0f; + +#ifdef _OPENMP +#pragma omp parallel for reduction(+ : real_sum) reduction(+ : imag_sum) +#endif + for (int64_t i = 0; i < numel; ++i) { + T value = value_ptr[i]; + real_sum += (value.real - value.real); + imag_sum += (value.imag - value.imag); + } + + if (std::isnan(real_sum) || std::isinf(real_sum) || std::isnan(imag_sum) || + std::isinf(imag_sum)) { + // hot fix for compile failed in gcc4.8 + // here also need print detail info of nan or inf later + PADDLE_THROW(platform::errors::PreconditionNotMet( + "There are NAN or INF in %s.", cpu_hint_str)); + } +} + template struct TensorCheckerVisitor { TensorCheckerVisitor(const std::string& o, diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 5569657707389..020a926b4739e 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -34,6 +34,7 @@ limitations under the License. */ #include "paddle/fluid/framework/custom_operator.h" #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/framework/data_type_transform.h" +#include "paddle/fluid/framework/details/nan_inf_utils_detail.h" #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/executor_cache.h" #include "paddle/fluid/framework/executor_gc_helper.h" @@ -2671,6 +2672,9 @@ All parameter, weight, gradient are variables in Paddle. m.def("use_layout_autotune", [] { return egr::Controller::Instance().UseLayoutAutoTune(); }); + // Add the api for nan op debug + m.def("set_nan_inf_debug_path", + &paddle::framework::details::SetNanInfDebugPath); BindFleetWrapper(&m); BindIO(&m); diff --git a/python/paddle/fluid/tests/unittests/test_nan_inf_dir.py b/python/paddle/fluid/tests/unittests/test_nan_inf_dir.py new file mode 100644 index 0000000000000..49882d192f9f6 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_nan_inf_dir.py @@ -0,0 +1,108 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
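Putting the new set_nan_inf_debug_path binding together with the existing check_nan_inf flags gives the following minimal usage sketch (distilled from the test that follows; PrintForDifferentLevelFile above names the log files worker_<log_name>.<dev_id>, e.g. worker_cpu.0):

import numpy as np
import paddle

# Route nan/inf reports into files under this directory instead of the console.
paddle.fluid.core.set_nan_inf_debug_path("nan_inf_log_dir")
paddle.set_flags({"FLAGS_check_nan_inf": 1, "FLAGS_check_nan_inf_level": 3})

paddle.device.set_device("cpu")
x = paddle.to_tensor(np.array([2.0, 0.0, -1.0], dtype="float32"))
out = paddle.log(x)  # log(0) -> -inf, log(-1) -> nan; reported under nan_inf_log_dir/worker_cpu.*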
+ +import os +import sys +import unittest + +import numpy as np + +import paddle + + +class TestNanInfDirCheckResult(unittest.TestCase): + def generate_inputs(self, shape, dtype="float32"): + data = np.random.random(size=shape).astype(dtype) + # [-10, 10) + x = (data * 20 - 10) * np.random.randint( + low=0, high=2, size=shape + ).astype(dtype) + y = np.random.randint(low=0, high=2, size=shape).astype(dtype) + return x, y + + def get_reference_num_nan_inf(self, x): + out = np.log(x) + num_nan = np.sum(np.isnan(out)) + num_inf = np.sum(np.isinf(out)) + print("[reference] num_nan={}, num_inf={}".format(num_nan, num_inf)) + return num_nan, num_inf + + def get_num_nan_inf( + self, x_np, use_cuda=True, add_assert=False, pt="nan_inf_log_dir" + ): + num_nan = 0 + num_inf = 0 + if add_assert: + if use_cuda: + paddle.device.set_device("gpu:0") + else: + paddle.device.set_device("cpu") + x = paddle.to_tensor(x_np) + out = paddle.log(x) + sys.stdout.flush() + if not use_cuda: + os.path.exists(pt) + num_nan = 0 + num_inf = 0 + for root, dirs, files in os.walk(pt): + for file_name in files: + if file_name.startswith('worker_cpu'): + file_path = os.path.join(root, file_name) + with open(file_path, "rb") as fp: + for e in fp: + err_str_list = ( + str(e) + .replace("(", " ") + .replace(")", " ") + .replace(",", " ") + .split(" ") + ) + for err_str in err_str_list: + if "num_nan" in err_str: + num_nan = int(err_str.split("=")[1]) + elif "num_inf" in err_str: + num_inf = int(err_str.split("=")[1]) + print( + "[paddle] num_nan={}, num_inf={}".format(num_nan, num_inf) + ) + return num_nan, num_inf + + def test_num_nan_inf(self): + path = "nan_inf_log_dir" + paddle.fluid.core.set_nan_inf_debug_path(path) + + def _check_num_nan_inf(use_cuda): + shape = [32, 32] + x_np, _ = self.generate_inputs(shape) + num_nan_np, num_inf_np = self.get_reference_num_nan_inf(x_np) + add_assert = (num_nan_np + num_inf_np) > 0 + num_nan, num_inf = self.get_num_nan_inf( + x_np, use_cuda, add_assert, path + ) + if not use_cuda: + assert num_nan == num_nan_np and num_inf == num_inf_np + + paddle.set_flags( + {"FLAGS_check_nan_inf": 1, "FLAGS_check_nan_inf_level": 3} + ) + _check_num_nan_inf(use_cuda=False) + if paddle.fluid.core.is_compiled_with_cuda(): + _check_num_nan_inf(use_cuda=True) + x = paddle.to_tensor([2, 3, 4], 'float32') + y = paddle.to_tensor([1, 5, 2], 'float32') + z = paddle.add(x, y) + + +if __name__ == '__main__': + unittest.main() From c3cd8502bcd50a40834622b21ec3d1fb9549798e Mon Sep 17 00:00:00 2001 From: Zhang Jun Date: Tue, 31 Jan 2023 17:03:16 +0800 Subject: [PATCH 43/89] [pass] Upgrade Constant Folding Pass (#49908) --- paddle/fluid/framework/ir/constant_folding_pass.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/paddle/fluid/framework/ir/constant_folding_pass.cc b/paddle/fluid/framework/ir/constant_folding_pass.cc index cd069e474e648..0bcd7a733dde7 100644 --- a/paddle/fluid/framework/ir/constant_folding_pass.cc +++ b/paddle/fluid/framework/ir/constant_folding_pass.cc @@ -143,6 +143,10 @@ void ConstantFoldingPass::ApplyImpl(ir::Graph *graph) const { } out_desc->SetShape(out_shape); out_desc->SetPersistable(true); + auto *var_desc_out = op_node->Op()->Block()->Var(out_name); + var_desc_out->SetShape(out_shape); + var_desc_out->SetPersistable(true); + var_desc_out->Flush(); auto *global_out_tensor = scope->Var(out_name)->GetMutable(); *global_out_tensor = *local_out_tensor; From 0d9185b99151d851b9d597dd860e47b28d59cfc1 Mon Sep 17 00:00:00 2001 From: wenbin Date: Tue, 31 Jan 2023 17:21:08 +0800 Subject: 
[PATCH 44/89] Unary (#49914) * disable integer * disable integer * add cast layer --- .../inference/tensorrt/convert/unary_op.cc | 26 ++++++++++-- .../ir/inference/test_trt_convert_unary.py | 41 +++++++++++++++++++ 2 files changed, 64 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/convert/unary_op.cc b/paddle/fluid/inference/tensorrt/convert/unary_op.cc index 9279e25a1836c..3186bf5fd33d0 100644 --- a/paddle/fluid/inference/tensorrt/convert/unary_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/unary_op.cc @@ -52,13 +52,33 @@ class UnaryOpConverter : public OpConverter { nvinfer1::ITensor* input_tensor = engine_->GetITensor(op_desc.Input("X")[0]); auto op_pair = ops.find(op_type_); - - nvinfer1::IUnaryLayer* layer = nullptr; + nvinfer1::ILayer* layer; +#if !IS_TRT_VERSION_GE(8500) + nvinfer1::DataType org_type = input_tensor->getType(); + bool cast = org_type == nvinfer1::DataType::kINT8 || + org_type == nvinfer1::DataType::kINT32; + if (cast) { + layer = TRT_ENGINE_ADD_LAYER(engine_, Identity, *input_tensor); + if (engine_->precision() == AnalysisConfig::Precision::kFloat32) { + layer->setOutputType(0, nvinfer1::DataType::kFLOAT); + } else { + layer->setOutputType(0, nvinfer1::DataType::kHALF); + } + input_tensor = layer->getOutput(0); + } +#endif for (auto trt_op : op_pair->second) { layer = TRT_ENGINE_ADD_LAYER(engine_, Unary, *input_tensor, trt_op); input_tensor = layer->getOutput(0); } - +#if !IS_TRT_VERSION_GE(8500) + // type restore + if (cast) { + layer = TRT_ENGINE_ADD_LAYER(engine_, Identity, *input_tensor); + layer->setOutputType(0, org_type); + input_tensor = layer->getOutput(0); + } +#endif auto output_name = op_desc.Output("Out")[0]; RreplenishLayerAndOutput(layer, op_type_, {output_name}, test_mode); } diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_unary.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_unary.py index 673ab597659fe..97e83e79714a5 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_unary.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_unary.py @@ -42,6 +42,14 @@ def generate_input1(dims, batch, attrs: List[Dict[str, Any]]): else: return np.random.random([batch, 3, 32, 32]).astype(np.float32) + def generate_int_input(dims, batch, attrs: List[Dict[str, Any]]): + if dims == 2: + return np.random.random([3, 32]).astype(np.int32) + elif dims == 3: + return np.random.random([3, 32, 32]).astype(np.int32) + else: + return np.random.random([batch, 3, 32, 32]).astype(np.int32) + for dims in [2, 3, 4]: for batch in [1, 4]: for op_type in [ @@ -96,6 +104,39 @@ def generate_input1(dims, batch, attrs: List[Dict[str, Any]]): yield program_config + for op_type in [ + "exp", + "abs", + ]: + self.dims = dims + self.op_type = op_type + dics = [{}] + + ops_config = [ + { + "op_type": op_type, + "op_inputs": {"X": ["input_data"]}, + "op_outputs": {"Out": ["output_data"]}, + "op_attrs": dics[0], + } + ] + ops = self.generate_op_config(ops_config) + + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={ + "input_data": TensorConfig( + data_gen=partial( + generate_int_input, dims, batch, dics + ) + ) + }, + outputs=["output_data"], + ) + + yield program_config + def sample_predictor_configs( self, program_config ) -> (paddle_infer.Config, List[int], float): From a5f2e1f7dbfab674b28fcb3fe26242677514c48b Mon Sep 17 00:00:00 2001 From: wangshengxiang <121413869+shengxiangwang@users.noreply.github.com> Date: Tue, 31 Jan 2023 18:31:37 
+0800 Subject: [PATCH 45/89] bind pixel_shuffle & pixel_shuffle_grad op for xpu (#50090) --- paddle/phi/backends/xpu/xpu2_op_list.cc | 2 + .../kernels/xpu/pixel_shuffle_grad_kernel.cc | 54 +++++++ .../phi/kernels/xpu/pixel_shuffle_kernel.cc | 54 +++++++ .../xpu/test_pixel_shuffle_op_xpu.py | 143 ++++++++++++++++++ 4 files changed, 253 insertions(+) create mode 100644 paddle/phi/kernels/xpu/pixel_shuffle_grad_kernel.cc create mode 100644 paddle/phi/kernels/xpu/pixel_shuffle_kernel.cc create mode 100644 python/paddle/fluid/tests/unittests/xpu/test_pixel_shuffle_op_xpu.py diff --git a/paddle/phi/backends/xpu/xpu2_op_list.cc b/paddle/phi/backends/xpu/xpu2_op_list.cc index 99cb79035b4b2..67ac2b17a7094 100644 --- a/paddle/phi/backends/xpu/xpu2_op_list.cc +++ b/paddle/phi/backends/xpu/xpu2_op_list.cc @@ -415,6 +415,8 @@ XPUOpMap& get_kl2_ops() { {"p_norm_grad", XPUKernelSet({phi::DataType::FLOAT32})}, {"pad3d_grad", XPUKernelSet({phi::DataType::FLOAT32})}, {"pad3d", XPUKernelSet({phi::DataType::FLOAT32})}, + {"pixel_shuffle", XPUKernelSet({phi::DataType::FLOAT32})}, + {"pixel_shuffle_grad", XPUKernelSet({phi::DataType::FLOAT32})}, {"pool2d_grad", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, {"pool2d", diff --git a/paddle/phi/kernels/xpu/pixel_shuffle_grad_kernel.cc b/paddle/phi/kernels/xpu/pixel_shuffle_grad_kernel.cc new file mode 100644 index 0000000000000..0b1879faff2bf --- /dev/null +++ b/paddle/phi/kernels/xpu/pixel_shuffle_grad_kernel.cc @@ -0,0 +1,54 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/pixel_shuffle_grad_kernel.h" + +#include "paddle/phi/backends/xpu/enforce_xpu.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { +template +void PixelShuffleGradKernel(const Context& ctx, + const DenseTensor& out_grad, + int upscale_factor, + const std::string& data_format, + DenseTensor* x_grad) { + using XPUType = typename XPUTypeTrait::Type; + + const T* x_ptr = out_grad.data(); + T* y_ptr = ctx.template Alloc(x_grad); + + bool is_nchw = data_format == "NCHW"; + + int64_t n = out_grad.dims()[0]; + int64_t xc = out_grad.dims()[is_nchw ? 1 : 3]; + int64_t xh = out_grad.dims()[is_nchw ? 2 : 1]; + int64_t xw = out_grad.dims()[is_nchw ? 3 : 2]; + + int r = pixel_unshuffle(ctx.x_context(), + reinterpret_cast(x_ptr), + reinterpret_cast(y_ptr), + n, + xc, + xh, + xw, + upscale_factor, + is_nchw); + + PADDLE_ENFORCE_XDNN_SUCCESS(r, "pixel_unshuffle"); +} +} // namespace phi + +PD_REGISTER_KERNEL( + pixel_shuffle_grad, XPU, ALL_LAYOUT, phi::PixelShuffleGradKernel, float) {} diff --git a/paddle/phi/kernels/xpu/pixel_shuffle_kernel.cc b/paddle/phi/kernels/xpu/pixel_shuffle_kernel.cc new file mode 100644 index 0000000000000..d35775a5c15c9 --- /dev/null +++ b/paddle/phi/kernels/xpu/pixel_shuffle_kernel.cc @@ -0,0 +1,54 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/pixel_shuffle_kernel.h" + +#include "paddle/phi/backends/xpu/enforce_xpu.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { +template +void PixelShuffleKernel(const Context& ctx, + const DenseTensor& x, + int upscale_factor, + const std::string& data_format, + DenseTensor* out) { + using XPUType = typename XPUTypeTrait::Type; + + const T* x_ptr = x.data(); + T* y_ptr = ctx.template Alloc(out); + + bool is_nchw = data_format == "NCHW"; + + int64_t n = x.dims()[0]; + int64_t xc = x.dims()[is_nchw ? 1 : 3]; + int64_t xh = x.dims()[is_nchw ? 2 : 1]; + int64_t xw = x.dims()[is_nchw ? 3 : 2]; + + int r = pixel_shuffle(ctx.x_context(), + reinterpret_cast(x_ptr), + reinterpret_cast(y_ptr), + n, + xc, + xh, + xw, + upscale_factor, + is_nchw); + + PADDLE_ENFORCE_XDNN_SUCCESS(r, "pixel_shuffle"); +} +} // namespace phi + +PD_REGISTER_KERNEL( + pixel_shuffle, XPU, ALL_LAYOUT, phi::PixelShuffleKernel, float) {} diff --git a/python/paddle/fluid/tests/unittests/xpu/test_pixel_shuffle_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_pixel_shuffle_op_xpu.py new file mode 100644 index 0000000000000..299fe38018c9b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_pixel_shuffle_op_xpu.py @@ -0,0 +1,143 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
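The XPU kernels above implement the standard pixel_shuffle contract; a quick shape check with the public API (a sketch run on the default device — an XPU build plus paddle.device.set_device("xpu") would be needed to exercise the new kernels):

import numpy as np
import paddle
import paddle.nn.functional as F

# pixel_shuffle rearranges [N, C*r*r, H, W] into [N, C, H*r, W*r] for NCHW input.
x = paddle.to_tensor(np.arange(2 * 8 * 3 * 3, dtype="float32").reshape([2, 8, 3, 3]))
out = F.pixel_shuffle(x, upscale_factor=2)
print(out.shape)  # [2, 2, 6, 6]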
+ +import sys +import unittest + +import numpy as np + +sys.path.append("..") + +from op_test_xpu import XPUOpTest +from xpu.get_test_cover_info import ( + XPUOpTestWrapper, + create_test_class, + get_xpu_op_support_types, +) + +import paddle + +paddle.enable_static() + + +def pixel_shuffle_np(x, up_factor, data_format="NCHW"): + if data_format == "NCHW": + n, c, h, w = x.shape + new_shape = ( + n, + c // (up_factor * up_factor), + up_factor, + up_factor, + h, + w, + ) + # reshape to (num,output_channel,upscale_factor,upscale_factor,h,w) + npresult = np.reshape(x, new_shape) + # transpose to (num,output_channel,h,upscale_factor,w,upscale_factor) + npresult = npresult.transpose(0, 1, 4, 2, 5, 3) + oshape = [n, c // (up_factor * up_factor), h * up_factor, w * up_factor] + npresult = np.reshape(npresult, oshape) + return npresult + else: + n, h, w, c = x.shape + new_shape = ( + n, + h, + w, + c // (up_factor * up_factor), + up_factor, + up_factor, + ) + # reshape to (num,h,w,output_channel,upscale_factor,upscale_factor) + npresult = np.reshape(x, new_shape) + # transpose to (num,h,upscale_factor,w,upscale_factor,output_channel) + npresult = npresult.transpose(0, 1, 4, 2, 5, 3) + oshape = [n, h * up_factor, w * up_factor, c // (up_factor * up_factor)] + npresult = np.reshape(npresult, oshape) + return npresult + + +class XPUTestPixelShuffleOp(XPUOpTestWrapper): + def __init__(self): + self.op_name = "pixel_shuffle" + self.use_dynamic_create_class = False + + class TestPixelShuffleOp(XPUOpTest): + def setUp(self): + self.set_xpu() + self.op_type = "pixel_shuffle" + self.init_dtype() + self.eager_mode = True + + # override + self.init_input_shape() + self.init_attr() + + self.x = np.random.random(self.x_shape).astype(self.dtype) + self.y = pixel_shuffle_np( + self.x, self.attrs["upscale_factor"], self.attrs["data_format"] + ) + + self.inputs = {'X': self.x} + self.outputs = {'Out': self.y} + + def init_input_shape(self): + self.x_shape = [2, 64, 26, 26] + + def init_attr(self): + self.attrs = {'upscale_factor': 2, 'data_format': "NCHW"} + + def set_xpu(self): + self.__class__.no_need_check_grad = False + self.place = paddle.XPUPlace(0) + + def init_dtype(self): + self.dtype = self.in_type + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place( + self.place, ['X'], 'Out', check_eager=self.eager_mode + ) + + class TestNHWC(TestPixelShuffleOp): + def init_input_shape(self): + self.x_shape = [2, 64, 26, 24] + + def init_attr(self): + self.attrs = {'upscale_factor': 2, 'data_format': "NHWC"} + + class TestUpFactor3(TestPixelShuffleOp): + def init_input_shape(self): + self.x_shape = [2, 27, 5, 5] + + def init_attr(self): + self.attrs = {'upscale_factor': 3, 'data_format': "NCHW"} + + class TestUpFactor3NHWC(TestPixelShuffleOp): + def init_input_shape(self): + self.x_shape = [2, 27, 5, 9] + + def init_attr(self): + self.attrs = {'upscale_factor': 3, 'data_format': "NHWC"} + + +support_types = get_xpu_op_support_types("pixel_shuffle") +for stype in support_types: + create_test_class(globals(), XPUTestPixelShuffleOp, stype) + +if __name__ == "__main__": + unittest.main() From 86a22ad491e760129c9181d1135ca04f5673c907 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=A7=9C=E6=B0=B8=E4=B9=85?= <34344716+yjjiang11@users.noreply.github.com> Date: Tue, 31 Jan 2023 18:55:54 +0800 Subject: [PATCH 46/89] imigrating from old dynamic graph to new dynamic graph for argmin/argmax/adalta test (#50093) * more ops * revert some ops * reset some 
ops --- .../fluid/tests/unittests/test_adagrad_op.py | 10 +++++++++- .../fluid/tests/unittests/test_addmm_op.py | 14 ++++++++------ .../tests/unittests/test_arg_min_max_op.py | 16 +++++++++++++++- .../tests/unittests/test_arg_min_max_v2_op.py | 18 +++++++++++++++++- 4 files changed, 49 insertions(+), 9 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_adagrad_op.py b/python/paddle/fluid/tests/unittests/test_adagrad_op.py index 94a754ca3cecf..8eed1871a8759 100644 --- a/python/paddle/fluid/tests/unittests/test_adagrad_op.py +++ b/python/paddle/fluid/tests/unittests/test_adagrad_op.py @@ -16,18 +16,24 @@ import unittest import numpy as np -from op_test import OpTest +from eager_op_test import OpTest import paddle import paddle.fluid.core as core from paddle.fluid.op import Operator +def adamgrad_wrapper(param, grad, moment, learning_rate, epsilon): + paddle._C_ops.adagrad_(param, grad, moment, learning_rate, epsilon) + + class TestAdagradOp1(OpTest): '''Test Adagrad operator with explicit attributes''' def setUp(self): self.op_type = "adagrad" + self.python_api = adamgrad_wrapper + self.python_out_sig = ['out'] param = np.random.random((123, 321)).astype("float32") grad = np.random.random((123, 321)).astype("float32") moment = np.zeros((123, 321)).astype("float32") @@ -57,6 +63,8 @@ class TestAdagradOp2(OpTest): def setUp(self): self.op_type = "adagrad" + self.python_api = adamgrad_wrapper + self.python_out_sig = ['out'] param = np.random.random((123, 321)).astype("float32") grad = np.random.random((123, 321)).astype("float32") diff --git a/python/paddle/fluid/tests/unittests/test_addmm_op.py b/python/paddle/fluid/tests/unittests/test_addmm_op.py index 2e4a9515b6aef..7691cf0c7010b 100644 --- a/python/paddle/fluid/tests/unittests/test_addmm_op.py +++ b/python/paddle/fluid/tests/unittests/test_addmm_op.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from op_test import OpTest +from eager_op_test import OpTest import paddle import paddle.fluid as fluid @@ -43,19 +43,19 @@ def init_dtype_type(self): pass def test_check_output(self): - self.check_output(check_eager=False) + self.check_output() def test_check_grad_normal(self): - self.check_grad(['Input', 'X', 'Y'], 'Out', check_eager=False) + self.check_grad(['Input', 'X', 'Y'], 'Out') def test_check_grad_x(self): - self.check_grad(['X'], 'Out', no_grad_set=None, check_eager=False) + self.check_grad(['X'], 'Out', no_grad_set=None) def test_check_grad_y(self): - self.check_grad(['Y'], 'Out', no_grad_set=None, check_eager=False) + self.check_grad(['Y'], 'Out', no_grad_set=None) def test_check_grad_input(self): - self.check_grad(['Input'], 'Out', no_grad_set=None, check_eager=False) + self.check_grad(['Input'], 'Out', no_grad_set=None) class TestAddMMOpError(unittest.TestCase): @@ -186,6 +186,7 @@ class TestAddMMOp3(OpTest): # test broadcast def setUp(self): self.op_type = "addmm" + self.python_api = paddle.addmm self.dtype = np.float64 self.init_dtype_type() self.inputs = { @@ -225,6 +226,7 @@ class TestAddMMOp4(OpTest): # test broadcast def setUp(self): self.op_type = "addmm" + self.python_api = paddle.addmm self.dtype = np.float64 self.init_dtype_type() self.inputs = { diff --git a/python/paddle/fluid/tests/unittests/test_arg_min_max_op.py b/python/paddle/fluid/tests/unittests/test_arg_min_max_op.py index 603ea0d6b7a03..56ac56181f07c 100644 --- a/python/paddle/fluid/tests/unittests/test_arg_min_max_op.py +++ b/python/paddle/fluid/tests/unittests/test_arg_min_max_op.py @@ -16,7 +16,7 @@ import unittest import numpy as np 
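The migration pattern in these test diffs is uniform: point self.python_api at the public Python call (or at a thin wrapper such as adamgrad_wrapper above) so that eager_op_test can replay the operator under the new dynamic graph. A condensed sketch of the arg_max case, assuming it is placed inside the unittests directory where eager_op_test lives:

import numpy as np
import paddle
from eager_op_test import OpTest  # test-suite helper, available next to these files


class TestArgMaxPythonAPISketch(OpTest):
    def setUp(self):
        self.op_type = "arg_max"
        # The public API stands in for the C++ op when checking in dygraph mode.
        self.python_api = paddle.tensor.argmax
        x = np.random.random((3, 4, 5)).astype("float32")
        self.inputs = {"X": x}
        self.attrs = {"axis": 0}
        self.outputs = {"Out": np.argmax(x, axis=0)}

    def test_check_output(self):
        self.check_output()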
-from op_test import OpTest +from eager_op_test import OpTest from test_attribute_var import UnittestBase import paddle @@ -27,6 +27,7 @@ class BaseTestCase(OpTest): def initTestCase(self): self.op_type = 'arg_min' + self.python_api = paddle.tensor.argmin self.dims = (3, 4, 5) self.dtype = 'float32' self.axis = 0 @@ -48,6 +49,7 @@ def test_check_output(self): class TestCase0(BaseTestCase): def initTestCase(self): self.op_type = 'arg_max' + self.python_api = paddle.tensor.argmax self.dims = (3, 4, 5) self.dtype = 'float32' self.axis = 0 @@ -56,6 +58,7 @@ def initTestCase(self): class TestCase1(BaseTestCase): def initTestCase(self): self.op_type = 'arg_min' + self.python_api = paddle.tensor.argmin self.dims = (3, 4) self.dtype = 'float64' self.axis = 1 @@ -64,6 +67,7 @@ def initTestCase(self): class TestCase2(BaseTestCase): def initTestCase(self): self.op_type = 'arg_max' + self.python_api = paddle.tensor.argmax self.dims = (3, 4) self.dtype = 'int64' self.axis = 0 @@ -75,6 +79,7 @@ def initTestCase(self): class TestCase0FP16(BaseTestCase): def initTestCase(self): self.op_type = 'arg_max' + self.python_api = paddle.tensor.argmax self.dims = (3, 4, 5) self.dtype = np.float16 self.axis = 0 @@ -86,6 +91,7 @@ def initTestCase(self): class TestCase1FP16(BaseTestCase): def initTestCase(self): self.op_type = 'arg_min' + self.python_api = paddle.tensor.argmin self.dims = (3, 4) self.dtype = np.float16 self.axis = 1 @@ -94,6 +100,7 @@ def initTestCase(self): class TestCase2_1(BaseTestCase): def initTestCase(self): self.op_type = 'arg_max' + self.python_api = paddle.tensor.argmax self.dims = (3, 4) self.dtype = 'int64' self.axis = -1 @@ -102,6 +109,7 @@ def initTestCase(self): class TestCase3(BaseTestCase): def initTestCase(self): self.op_type = 'arg_max' + self.python_api = paddle.tensor.argmax self.dims = (3,) self.dtype = 'int64' self.axis = 0 @@ -110,6 +118,7 @@ def initTestCase(self): class TestCase4(BaseTestCase): def initTestCase(self): self.op_type = 'arg_min' + self.python_api = paddle.tensor.argmin self.dims = (1,) self.dtype = 'int32' self.axis = 0 @@ -118,6 +127,7 @@ def initTestCase(self): class TestCase3_(BaseTestCase): def initTestCase(self): self.op_type = 'arg_max' + self.python_api = paddle.tensor.argmax self.dims = (3,) self.axis = 0 @@ -125,6 +135,7 @@ def initTestCase(self): class BaseTestComplex1_1(OpTest): def initTestCase(self): self.op_type = 'arg_max' + self.python_api = paddle.tensor.argmax self.dims = (4, 5, 6) self.dtype = 'int32' self.axis = 2 @@ -148,6 +159,7 @@ def setUp(self): class BaseTestComplex1_2(OpTest): def initTestCase(self): self.op_type = 'arg_min' + self.python_api = paddle.tensor.argmin self.dims = (4, 5, 6) self.dtype = 'int32' self.axis = 2 @@ -171,6 +183,7 @@ def setUp(self): class BaseTestComplex2_1(OpTest): def initTestCase(self): self.op_type = 'arg_max' + self.python_api = paddle.tensor.argmax self.dims = (4, 5, 6) self.dtype = 'int32' self.axis = 2 @@ -199,6 +212,7 @@ def setUp(self): class BaseTestComplex2_2(OpTest): def initTestCase(self): self.op_type = 'arg_min' + self.python_api = paddle.tensor.argmin self.dims = (4, 5, 6) self.dtype = 'int32' self.axis = 2 diff --git a/python/paddle/fluid/tests/unittests/test_arg_min_max_v2_op.py b/python/paddle/fluid/tests/unittests/test_arg_min_max_v2_op.py index 99dcff5db7b0b..f5cb975019c98 100644 --- a/python/paddle/fluid/tests/unittests/test_arg_min_max_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_arg_min_max_v2_op.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from op_test import 
OpTest +from eager_op_test import OpTest import paddle import paddle.fluid as fluid @@ -33,6 +33,10 @@ def initTestCase(self): def setUp(self): np.random.seed(123) self.initTestCase() + if op_type == 'arg_min': + self.python_api = paddle.tensor.argmin + else: + self.python_api = paddle.tensor.argmax self.dims = (4, 5, 6) self.dtype = "float64" self.x = 1000 * np.random.random(self.dims).astype(self.dtype) @@ -72,6 +76,10 @@ def initTestCase(self): class ArgMinMaxKernelCase4(ArgMinMaxKernelBaseCase): def setUp(self): self.initTestCase() + if op_type == 'arg_min': + self.python_api = paddle.tensor.argmin + else: + self.python_api = paddle.tensor.argmax self.dims = (4, 5, 6) self.dtype = "float64" self.x = 1000 * np.random.random(self.dims).astype(self.dtype) @@ -85,6 +93,10 @@ def setUp(self): class ArgMinMaxKernelCase5(ArgMinMaxKernelBaseCase): def setUp(self): self.initTestCase() + if op_type == 'arg_min': + self.python_api = paddle.tensor.argmin + else: + self.python_api = paddle.tensor.argmax self.dims = 4 self.dtype = "float64" self.x = 1000 * np.random.random(self.dims).astype(self.dtype) @@ -98,6 +110,10 @@ def setUp(self): class ArgMinMaxKernelCase6(ArgMinMaxKernelBaseCase): def setUp(self): self.initTestCase() + if op_type == 'arg_min': + self.python_api = paddle.tensor.argmin + else: + self.python_api = paddle.tensor.argmax self.dims = 4 self.dtype = "float64" self.x = 1000 * np.random.random(self.dims).astype(self.dtype) From 47ddd36ec43b51205ca39ad0d73b7932312f9b81 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=A7=9C=E6=B0=B8=E4=B9=85?= <34344716+yjjiang11@users.noreply.github.com> Date: Tue, 31 Jan 2023 19:48:11 +0800 Subject: [PATCH 47/89] update ops for new dynamic graph tests (#50061) * update elementwise ops tests * add more ops * modify sum&split * lint * rm check_dygraph * revert pow * modify add for cpu test * revert reshape * modify min --- .../unittests/test_elementwise_add_op.py | 56 +++++++++++-------- .../unittests/test_elementwise_div_op.py | 15 ++++- .../unittests/test_elementwise_min_op.py | 27 ++++----- .../unittests/test_elementwise_mul_op.py | 34 +++++++++-- .../unittests/test_elementwise_sub_op.py | 32 ++++++++++- .../tests/unittests/test_pixel_shuffle.py | 9 ++- .../fluid/tests/unittests/test_poisson_op.py | 3 +- .../tests/unittests/test_put_along_axis_op.py | 3 +- .../fluid/tests/unittests/test_size_op.py | 7 ++- .../fluid/tests/unittests/test_softmax_op.py | 12 +++- .../tests/unittests/test_spectral_norm_op.py | 8 ++- .../fluid/tests/unittests/test_split_op.py | 12 +++- .../fluid/tests/unittests/test_sum_op.py | 26 ++++++--- .../unittests/test_take_along_axis_op.py | 3 +- 14 files changed, 185 insertions(+), 62 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py index 3bf2b7cdcd703..502ca504c1b8e 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py @@ -15,15 +15,18 @@ import unittest import numpy as np +from eager_op_test import OpTest, convert_float_to_uint16, skip_check_grad_ci import paddle import paddle.fluid as fluid import paddle.fluid.core as core -from paddle.fluid.tests.unittests.op_test import ( - OpTest, - convert_float_to_uint16, - skip_check_grad_ci, -) + + +def broadcast_wrapper(shape=[1, 10, 12, 1]): + def add_wrapper(x, y, axis=-1): + return x + y.reshape(shape) + + return add_wrapper class TestElementwiseAddOp(OpTest): @@ -45,14 +48,13 @@ def 
setUp(self): self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_mkldnn} self.outputs = {'Out': self.out} - def check_eager(self): + def check_dygraph(self): return not self.use_mkldnn and self.axis == -1 def test_check_output(self): # TODO(wangzhongpu): support mkldnn op in dygraph mode self.check_output( - check_dygraph=(not self.use_mkldnn), - check_eager=self.check_eager(), + check_dygraph=self.check_dygraph(), ) def test_check_grad_normal(self): @@ -62,8 +64,7 @@ def test_check_grad_normal(self): self.check_grad( ['X', 'Y'], 'Out', - check_dygraph=(not self.use_mkldnn), - check_eager=self.check_eager(), + check_dygraph=self.check_dygraph(), ) def test_check_grad_ingore_x(self): @@ -74,8 +75,7 @@ def test_check_grad_ingore_x(self): ['Y'], 'Out', no_grad_set=set("X"), - check_dygraph=(not self.use_mkldnn), - check_eager=self.check_eager(), + check_dygraph=self.check_dygraph(), ) def test_check_grad_ingore_y(self): @@ -86,8 +86,7 @@ def test_check_grad_ingore_y(self): ['X'], 'Out', no_grad_set=set('Y'), - check_dygraph=(not self.use_mkldnn), - check_eager=self.check_eager(), + check_dygraph=self.check_dygraph(), ) def init_input_output(self): @@ -136,7 +135,8 @@ def test_check_output(self): place = core.CUDAPlace(0) if core.is_float16_supported(place): self.check_output_with_place( - place, atol=1e-3, check_dygraph=(not self.use_mkldnn) + place, + atol=1e-3, ) @@ -149,6 +149,7 @@ def test_check_output(self): class TestBF16ElementwiseAddOp(OpTest): def setUp(self): self.op_type = "elementwise_add" + self.python_api = paddle.add self.dtype = np.uint16 self.x = np.random.uniform(0.1, 1, [13, 17]).astype(np.float32) @@ -170,23 +171,19 @@ def setUp(self): def test_check_output(self): place = core.CUDAPlace(0) - self.check_output_with_place(place, check_eager=False) + self.check_output_with_place(place) def test_check_grad_normal(self): place = core.CUDAPlace(0) - self.check_grad_with_place(place, ['X', 'Y'], 'Out', check_eager=False) + self.check_grad_with_place(place, ['X', 'Y'], 'Out') def test_check_grad_ingore_x(self): place = core.CUDAPlace(0) - self.check_grad_with_place( - place, ['Y'], 'Out', no_grad_set=set("X"), check_eager=False - ) + self.check_grad_with_place(place, ['Y'], 'Out', no_grad_set=set("X")) def test_check_grad_ingore_y(self): place = core.CUDAPlace(0) - self.check_grad_with_place( - place, ['X'], 'Out', no_grad_set=set('Y'), check_eager=False - ) + self.check_grad_with_place(place, ['X'], 'Out', no_grad_set=set('Y')) @skip_check_grad_ci( @@ -248,6 +245,7 @@ def init_input_output(self): self.x = np.random.rand(100, 2, 3).astype(self.dtype) self.y = np.random.rand(100).astype(self.dtype) self.out = self.x + self.y.reshape(100, 1, 1) + self.python_api = broadcast_wrapper(shape=[100, 1, 1]) def init_axis(self): self.axis = 0 @@ -258,6 +256,7 @@ def init_input_output(self): self.x = np.random.rand(100, 2, 3).astype(self.dtype) self.y = np.random.rand(100).astype(self.dtype) self.out = self.x + self.y.reshape(100, 1, 1) + self.python_api = broadcast_wrapper(shape=[100, 1, 1]) def init_axis(self): self.axis = 0 @@ -268,6 +267,7 @@ def init_input_output(self): self.x = np.random.rand(2, 100, 3).astype(self.dtype) self.y = np.random.rand(100).astype(self.dtype) self.out = self.x + self.y.reshape(1, 100, 1) + self.python_api = broadcast_wrapper(shape=[1, 100, 1]) def init_axis(self): self.axis = 1 @@ -278,6 +278,7 @@ def init_input_output(self): self.x = np.random.rand(2, 100, 3).astype(self.dtype) self.y = np.random.rand(100).astype(self.dtype) self.out = self.x + 
self.y.reshape(1, 100, 1) + self.python_api = broadcast_wrapper(shape=[1, 100, 1]) def init_axis(self): self.axis = 1 @@ -288,6 +289,7 @@ def init_input_output(self): self.x = np.random.rand(2, 3, 100).astype(self.dtype) self.y = np.random.rand(100).astype(self.dtype) self.out = self.x + self.y.reshape(1, 1, 100) + self.python_api = broadcast_wrapper(shape=[1, 1, 100]) class TestFP16ElementwiseAddOp_broadcast_2(TestFP16ElementwiseAddOp): @@ -295,6 +297,7 @@ def init_input_output(self): self.x = np.random.rand(2, 3, 100).astype(self.dtype) self.y = np.random.rand(100).astype(self.dtype) self.out = self.x + self.y.reshape(1, 1, 100) + self.python_api = broadcast_wrapper(shape=[1, 1, 100]) class TestElementwiseAddOp_broadcast_3(TestElementwiseAddOp): @@ -302,6 +305,7 @@ def init_input_output(self): self.x = np.random.rand(2, 10, 12, 1).astype(self.dtype) self.y = np.random.rand(10, 12).astype(self.dtype) self.out = self.x + self.y.reshape(1, 10, 12, 1) + self.python_api = broadcast_wrapper(shape=[1, 10, 12, 1]) def init_axis(self): self.axis = 1 @@ -312,6 +316,7 @@ def init_input_output(self): self.x = np.random.rand(2, 10, 12, 3).astype(self.dtype) self.y = np.random.rand(10, 12).astype(self.dtype) self.out = self.x + self.y.reshape(1, 10, 12, 1) + self.python_api = broadcast_wrapper(shape=[1, 10, 12, 1]) def init_axis(self): self.axis = 1 @@ -322,6 +327,7 @@ def init_input_output(self): self.x = np.random.rand(100, 2, 1, 2).astype(self.dtype) self.y = np.random.rand(100, 1).astype(self.dtype) self.out = self.x + self.y.reshape(100, 1, 1, 1) + self.python_api = broadcast_wrapper(shape=[100, 1, 1, 1]) def init_axis(self): self.axis = 0 @@ -332,6 +338,7 @@ def init_input_output(self): self.x = np.random.rand(100, 2, 1, 2).astype(self.dtype) self.y = np.random.rand(100, 1).astype(self.dtype) self.out = self.x + self.y.reshape(100, 1, 1, 1) + self.python_api = broadcast_wrapper(shape=[100, 1, 1, 1]) def init_axis(self): self.axis = 0 @@ -597,6 +604,7 @@ def init_data(self): class TestComplexElementwiseAddOp(OpTest): def setUp(self): self.op_type = "elementwise_add" + self.python_api = paddle.add self.dtype = np.float64 self.shape = (2, 3, 4, 5) self.init_input_output() @@ -629,7 +637,7 @@ def init_grad_input_output(self): self.grad_y = self.grad_out def test_check_output(self): - self.check_output(check_eager=False) + self.check_output() def test_check_grad_normal(self): self.check_grad( diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py index 943486827237d..c17a41b0bfad5 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py @@ -15,13 +15,20 @@ import unittest import numpy as np -from op_test import OpTest, convert_float_to_uint16, skip_check_grad_ci +from eager_op_test import OpTest, convert_float_to_uint16, skip_check_grad_ci import paddle from paddle import fluid from paddle.fluid import core +def broadcast_wrapper(shape=[1, 10, 12, 1]): + def div_wrapper(x, y, axis=-1): + return paddle.divide(x, y.reshape(shape)) + + return div_wrapper + + class ElementwiseDivOp(OpTest): def setUp(self): self.op_type = "elementwise_div" @@ -193,6 +200,7 @@ def init_shape(self): self.x_shape = [100, 3, 4] self.y_shape = [100] self.attrs = {'axis': 0} + self.python_api = broadcast_wrapper(shape=[100, 1, 1]) def compute_output(self, x, y): return x / y.reshape(100, 1, 1) @@ -209,6 +217,7 @@ def init_shape(self): self.x_shape = [2, 
100, 4] self.y_shape = [100] self.attrs = {'axis': 1} + self.python_api = broadcast_wrapper(shape=[1, 100, 1]) def compute_output(self, x, y): return x / y.reshape(1, 100, 1) @@ -224,6 +233,7 @@ class TestElementwiseDivOpBroadcast2(ElementwiseDivOp): def init_shape(self): self.x_shape = [2, 3, 100] self.y_shape = [100] + self.python_api = broadcast_wrapper(shape=[1, 1, 100]) def compute_output(self, x, y): return x / y.reshape(1, 1, 100) @@ -240,6 +250,7 @@ def init_shape(self): self.x_shape = [2, 10, 12, 5] self.y_shape = [10, 12] self.attrs = {'axis': 1} + self.python_api = broadcast_wrapper(shape=[1, 10, 12, 1]) def compute_output(self, x, y): return x / y.reshape(1, 10, 12, 1) @@ -393,7 +404,7 @@ def init_grad_input_output(self): self.grad_y = -self.grad_out * np.conj(self.x / self.y / self.y) def test_check_output(self): - self.check_output(check_eager=False) + self.check_output() def test_check_grad_normal(self): self.check_grad( diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_min_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_min_op.py index c9835b5cb1566..02f1d1dd6d275 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_min_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_min_op.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from op_test import OpTest, skip_check_grad_ci +from eager_op_test import OpTest, skip_check_grad_ci import paddle import paddle.fluid as fluid @@ -25,6 +25,13 @@ paddle.enable_static() +def broadcast_wrapper(shape=[1, 10, 12, 1]): + def min_wrapper(x, y, axis=-1): + return paddle.minimum(x, y.reshape(shape)) + + return min_wrapper + + class TestElementwiseOp(OpTest): def setUp(self): self.op_type = "elementwise_min" @@ -39,16 +46,10 @@ def setUp(self): self.outputs = {'Out': np.minimum(self.inputs['X'], self.inputs['Y'])} def test_check_output(self): - if hasattr(self, 'attrs'): - self.check_output(check_eager=False) - else: - self.check_output(check_eager=True) + self.check_output() def test_check_grad_normal(self): - if hasattr(self, 'attrs'): - self.check_grad(['X', 'Y'], 'Out', check_eager=False) - else: - self.check_grad(['X', 'Y'], 'Out', check_eager=True) + self.check_grad(['X', 'Y'], 'Out') def test_check_grad_ingore_x(self): self.check_grad( @@ -118,7 +119,7 @@ def setUp(self): class TestElementwiseMinOp_broadcast_0(TestElementwiseOp): def setUp(self): self.op_type = "elementwise_min" - self.python_api = paddle.minimum + self.python_api = broadcast_wrapper(shape=[100, 1, 1]) x = np.random.uniform(0.5, 1, (100, 3, 2)).astype(np.float64) sgn = np.random.choice([-1, 1], (100,)).astype(np.float64) y = x[:, 0, 0] + sgn * np.random.uniform(1, 2, (100,)).astype( @@ -137,7 +138,7 @@ def setUp(self): class TestElementwiseMinOp_broadcast_1(TestElementwiseOp): def setUp(self): self.op_type = "elementwise_min" - self.python_api = paddle.minimum + self.python_api = broadcast_wrapper(shape=[1, 100, 1]) x = np.random.uniform(0.5, 1, (2, 100, 3)).astype(np.float64) sgn = np.random.choice([-1, 1], (100,)).astype(np.float64) y = x[0, :, 0] + sgn * np.random.uniform(1, 2, (100,)).astype( @@ -156,7 +157,7 @@ def setUp(self): class TestElementwiseMinOp_broadcast_2(TestElementwiseOp): def setUp(self): self.op_type = "elementwise_min" - self.python_api = paddle.minimum + self.python_api = broadcast_wrapper(shape=[1, 1, 100]) x = np.random.uniform(0.5, 1, (2, 3, 100)).astype(np.float64) sgn = np.random.choice([-1, 1], (100,)).astype(np.float64) y = x[0, 0, :] + sgn * np.random.uniform(1, 2, 
(100,)).astype( @@ -174,7 +175,7 @@ def setUp(self): class TestElementwiseMinOp_broadcast_3(TestElementwiseOp): def setUp(self): self.op_type = "elementwise_min" - self.python_api = paddle.minimum + self.python_api = broadcast_wrapper(shape=[1, 25, 4, 1]) x = np.random.uniform(0.5, 1, (2, 25, 4, 1)).astype(np.float64) sgn = np.random.choice([-1, 1], (25, 4)).astype(np.float64) y = x[0, :, :, 0] + sgn * np.random.uniform(1, 2, (25, 4)).astype( diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py index e34d9d0dfd32b..4fe6a15ef8efc 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py @@ -15,14 +15,24 @@ import unittest import numpy as np +from eager_op_test import OpTest, convert_float_to_uint16, skip_check_grad_ci import paddle import paddle.fluid.core as core -from paddle.fluid.tests.unittests.op_test import ( - OpTest, - convert_float_to_uint16, - skip_check_grad_ci, -) + + +def mul(x, y, axis=-1, use_mkldnn=False): + return x * y + + +setattr(paddle, "mul", mul) + + +def broadcast_wrapper(shape=[1, 10, 12, 1]): + def mul_wrapper(x, y, axis=-1): + return x * y.reshape(shape) + + return mul_wrapper class ElementwiseMulOp(OpTest): @@ -31,6 +41,7 @@ def init_kernel_type(self): def setUp(self): self.op_type = "elementwise_mul" + self.python_api = paddle.mul self.dtype = np.float64 self.axis = -1 self.init_dtype() @@ -107,6 +118,7 @@ def init_input_output(self): class TestBF16ElementwiseMulOp(OpTest): def setUp(self): self.op_type = "elementwise_mul" + self.python_api = paddle.mul self.dtype = np.uint16 self.x = np.random.uniform(0.1, 1, [13, 17]).astype(np.float32) @@ -145,6 +157,7 @@ def test_check_grad_ingore_y(self): class TestElementwiseMulOp_scalar(ElementwiseMulOp): def setUp(self): self.op_type = "elementwise_mul" + self.python_api = paddle.mul self.inputs = { 'X': np.random.rand(10, 3, 4).astype(np.float64), 'Y': np.random.rand(1).astype(np.float64), @@ -156,6 +169,7 @@ def setUp(self): class TestElementwiseMulOp_Vector(ElementwiseMulOp): def setUp(self): self.op_type = "elementwise_mul" + self.python_api = paddle.mul self.inputs = { 'X': np.random.random((100,)).astype("float64"), 'Y': np.random.random((100,)).astype("float64"), @@ -168,6 +182,7 @@ class TestElementwiseMulOp_broadcast_0(ElementwiseMulOp): def init_input_output(self): self.x = np.random.rand(100, 2, 3).astype(self.dtype) self.y = np.random.rand(100).astype(self.dtype) + self.python_api = broadcast_wrapper(shape=[100, 1, 1]) self.out = self.x * self.y.reshape(100, 1, 1) def init_axis(self): @@ -177,6 +192,7 @@ def init_axis(self): class TestElementwiseMulOp_broadcast_1(ElementwiseMulOp): def setUp(self): self.op_type = "elementwise_mul" + self.python_api = broadcast_wrapper(shape=[1, 100, 1]) self.inputs = { 'X': np.random.rand(2, 100, 3).astype(np.float64), 'Y': np.random.rand(100).astype(np.float64), @@ -192,6 +208,7 @@ def setUp(self): class TestElementwiseMulOp_broadcast_2(ElementwiseMulOp): def setUp(self): self.op_type = "elementwise_mul" + self.python_api = broadcast_wrapper(shape=[1, 1, 100]) self.inputs = { 'X': np.random.rand(2, 3, 100).astype(np.float64), 'Y': np.random.rand(100).astype(np.float64), @@ -206,6 +223,7 @@ def setUp(self): class TestElementwiseMulOp_broadcast_3(ElementwiseMulOp): def setUp(self): self.op_type = "elementwise_mul" + self.python_api = broadcast_wrapper(shape=[1, 10, 12, 1]) self.inputs = { 'X': 
np.random.rand(2, 10, 12, 3).astype(np.float64), 'Y': np.random.rand(10, 12).astype(np.float64), @@ -221,6 +239,7 @@ def setUp(self): class TestElementwiseMulOp_broadcast_4(ElementwiseMulOp): def setUp(self): self.op_type = "elementwise_mul" + self.python_api = paddle.mul self.inputs = { 'X': np.random.rand(10, 2, 11).astype(np.float64), 'Y': np.random.rand(10, 1, 11).astype(np.float64), @@ -232,6 +251,7 @@ def setUp(self): class TestElementwiseMulOp_broadcast_5(ElementwiseMulOp): def setUp(self): self.op_type = "elementwise_mul" + self.python_api = paddle.mul self.inputs = { 'X': np.random.rand(10, 4, 2, 3).astype(np.float64), 'Y': np.random.rand(10, 4, 1, 3).astype(np.float64), @@ -251,6 +271,7 @@ def init_dtype(self): class TestElementwiseMulOp_commonuse_1(ElementwiseMulOp): def setUp(self): self.op_type = "elementwise_mul" + self.python_api = paddle.mul self.inputs = { 'X': np.random.rand(2, 3, 100).astype(np.float64), 'Y': np.random.rand(1, 1, 100).astype(np.float64), @@ -262,6 +283,7 @@ def setUp(self): class TestElementwiseMulOp_commonuse_2(ElementwiseMulOp): def setUp(self): self.op_type = "elementwise_mul" + self.python_api = paddle.mul self.inputs = { 'X': np.random.rand(30, 3, 1, 5).astype(np.float64), 'Y': np.random.rand(30, 1, 4, 1).astype(np.float64), @@ -273,6 +295,7 @@ def setUp(self): class TestElementwiseMulOp_xsize_lessthan_ysize(ElementwiseMulOp): def setUp(self): self.op_type = "elementwise_mul" + self.python_api = paddle.mul self.inputs = { 'X': np.random.rand(10, 10).astype(np.float64), 'Y': np.random.rand(2, 2, 10, 10).astype(np.float64), @@ -289,6 +312,7 @@ def setUp(self): class TestComplexElementwiseMulOp(OpTest): def setUp(self): self.op_type = "elementwise_mul" + self.python_api = paddle.mul self.init_base_dtype() self.init_input_output() self.init_grad_input_output() diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py index 398ef711e2fea..1391dd2e9da5e 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py @@ -15,15 +15,26 @@ import unittest import numpy as np -from op_test import OpTest, convert_float_to_uint16, skip_check_grad_ci +from eager_op_test import OpTest, convert_float_to_uint16, skip_check_grad_ci import paddle import paddle.fluid as fluid +def sub_wrapper(shape=None): + def inner_wrapper(x, y, axis=-1): + if shape is None: + return x - y + else: + return x - y.reshape(shape) + + return inner_wrapper + + class TestElementwiseOp(OpTest): def setUp(self): self.op_type = "elementwise_sub" + self.python_api = sub_wrapper() self.inputs = { 'X': np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype("float64"), 'Y': np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype("float64"), @@ -50,6 +61,7 @@ def test_check_grad_ingore_y(self): class TestElementwiseSubOp_ZeroDim1(TestElementwiseOp): def setUp(self): self.op_type = "elementwise_sub" + self.python_api = sub_wrapper() self.inputs = { 'X': np.random.uniform(0.1, 1, []).astype("float64"), 'Y': np.random.uniform(0.1, 1, []).astype("float64"), @@ -60,6 +72,7 @@ def setUp(self): class TestElementwiseSubOp_ZeroDim2(TestElementwiseOp): def setUp(self): self.op_type = "elementwise_sub" + self.python_api = sub_wrapper() self.inputs = { 'X': np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype("float64"), 'Y': np.random.uniform(0.1, 1, []).astype("float64"), @@ -70,6 +83,7 @@ def setUp(self): class TestElementwiseSubOp_ZeroDim3(TestElementwiseOp): def 
setUp(self): self.op_type = "elementwise_sub" + self.python_api = sub_wrapper() self.inputs = { 'X': np.random.uniform(0.1, 1, []).astype("float64"), 'Y': np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype("float64"), @@ -80,6 +94,7 @@ def setUp(self): class TestBF16ElementwiseOp(OpTest): def setUp(self): self.op_type = "elementwise_sub" + self.python_api = sub_wrapper() self.dtype = np.uint16 x = np.random.uniform(0.1, 1, [13, 17]).astype(np.float32) y = np.random.uniform(0.1, 1, [13, 17]).astype(np.float32) @@ -110,6 +125,7 @@ def test_check_grad_ingore_y(self): class TestElementwiseSubOp_scalar(TestElementwiseOp): def setUp(self): self.op_type = "elementwise_sub" + self.python_api = sub_wrapper() self.inputs = { 'X': np.random.rand(10, 3, 4).astype(np.float64), 'Y': np.random.rand(1).astype(np.float64), @@ -120,6 +136,7 @@ def setUp(self): class TestElementwiseSubOp_Vector(TestElementwiseOp): def setUp(self): self.op_type = "elementwise_sub" + self.python_api = sub_wrapper() self.inputs = { 'X': np.random.random((100,)).astype("float64"), 'Y': np.random.random((100,)).astype("float64"), @@ -130,6 +147,7 @@ def setUp(self): class TestElementwiseSubOp_broadcast_0(TestElementwiseOp): def setUp(self): self.op_type = "elementwise_sub" + self.python_api = sub_wrapper(shape=[100, 1, 1]) self.inputs = { 'X': np.random.rand(100, 3, 2).astype(np.float64), 'Y': np.random.rand(100).astype(np.float64), @@ -144,6 +162,7 @@ def setUp(self): class TestElementwiseSubOp_broadcast_1(TestElementwiseOp): def setUp(self): self.op_type = "elementwise_sub" + self.python_api = sub_wrapper(shape=[1, 100, 1]) self.inputs = { 'X': np.random.rand(2, 100, 3).astype(np.float64), 'Y': np.random.rand(100).astype(np.float64), @@ -158,6 +177,7 @@ def setUp(self): class TestElementwiseSubOp_broadcast_2(TestElementwiseOp): def setUp(self): self.op_type = "elementwise_sub" + self.python_api = sub_wrapper(shape=[1, 1, 100]) self.inputs = { 'X': np.random.rand(2, 3, 100).astype(np.float64), 'Y': np.random.rand(100).astype(np.float64), @@ -171,6 +191,7 @@ def setUp(self): class TestElementwiseSubOp_broadcast_3(TestElementwiseOp): def setUp(self): self.op_type = "elementwise_sub" + self.python_api = sub_wrapper(shape=[1, 10, 12, 1]) self.inputs = { 'X': np.random.rand(2, 10, 12, 3).astype(np.float64), 'Y': np.random.rand(10, 12).astype(np.float64), @@ -185,6 +206,7 @@ def setUp(self): class TestElementwiseSubOp_broadcast_4(TestElementwiseOp): def setUp(self): self.op_type = "elementwise_sub" + self.python_api = sub_wrapper() self.inputs = { 'X': np.random.rand(2, 5, 3, 12).astype(np.float64), 'Y': np.random.rand(2, 5, 1, 12).astype(np.float64), @@ -195,6 +217,7 @@ def setUp(self): class TestElementwiseSubOp_commonuse_1(TestElementwiseOp): def setUp(self): self.op_type = "elementwise_sub" + self.python_api = sub_wrapper() self.inputs = { 'X': np.random.rand(2, 3, 100).astype(np.float64), 'Y': np.random.rand(1, 1, 100).astype(np.float64), @@ -205,6 +228,7 @@ def setUp(self): class TestElementwiseSubOp_commonuse_2(TestElementwiseOp): def setUp(self): self.op_type = "elementwise_sub" + self.python_api = sub_wrapper() self.inputs = { 'X': np.random.rand(10, 3, 1, 4).astype(np.float64), 'Y': np.random.rand(10, 1, 12, 1).astype(np.float64), @@ -215,6 +239,11 @@ def setUp(self): class TestElementwiseSubOp_xsize_lessthan_ysize(TestElementwiseOp): def setUp(self): self.op_type = "elementwise_sub" + + def sub_func(x, y, axis=2): + return x.reshape([1, 1, 10, 12]) - y + + self.python_api = sub_func self.inputs = { 'X': np.random.rand(10, 
12).astype(np.float64), 'Y': np.random.rand(2, 3, 10, 12).astype(np.float64), @@ -230,6 +259,7 @@ def setUp(self): class TestComplexElementwiseSubOp(OpTest): def setUp(self): self.op_type = "elementwise_sub" + self.python_api = sub_wrapper() self.dtype = np.float64 self.shape = (2, 3, 4, 5) self.init_input_output() diff --git a/python/paddle/fluid/tests/unittests/test_pixel_shuffle.py b/python/paddle/fluid/tests/unittests/test_pixel_shuffle.py index 9600f5a872c56..0ef6b3e77824b 100644 --- a/python/paddle/fluid/tests/unittests/test_pixel_shuffle.py +++ b/python/paddle/fluid/tests/unittests/test_pixel_shuffle.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from op_test import OpTest +from eager_op_test import OpTest import paddle import paddle.fluid as fluid @@ -85,10 +85,13 @@ def init_data_format(self): self.format = "NCHW" def test_check_output(self): - self.check_output(check_eager=True) + self.check_output() def test_check_grad(self): - self.check_grad(['X'], 'Out', check_eager=True) + self.check_grad( + ['X'], + 'Out', + ) class TestChannelLast(TestPixelShuffleOp): diff --git a/python/paddle/fluid/tests/unittests/test_poisson_op.py b/python/paddle/fluid/tests/unittests/test_poisson_op.py index e2720edb01313..ee66d578014c7 100644 --- a/python/paddle/fluid/tests/unittests/test_poisson_op.py +++ b/python/paddle/fluid/tests/unittests/test_poisson_op.py @@ -16,7 +16,7 @@ import unittest import numpy as np -from op_test import OpTest +from eager_op_test import OpTest import paddle @@ -41,6 +41,7 @@ def output_hist(out, lam, a, b): class TestPoissonOp1(OpTest): def setUp(self): self.op_type = "poisson" + self.python_api = paddle.tensor.poisson self.config() self.attrs = {} diff --git a/python/paddle/fluid/tests/unittests/test_put_along_axis_op.py b/python/paddle/fluid/tests/unittests/test_put_along_axis_op.py index 3b2cf82fbfd39..7470dae1846ab 100644 --- a/python/paddle/fluid/tests/unittests/test_put_along_axis_op.py +++ b/python/paddle/fluid/tests/unittests/test_put_along_axis_op.py @@ -16,7 +16,7 @@ import unittest import numpy as np -from op_test import OpTest +from eager_op_test import OpTest import paddle from paddle.framework import core @@ -30,6 +30,7 @@ def setUp(self): self.reduce_op = "assign" self.dtype = 'float64' self.op_type = "put_along_axis" + self.python_api = paddle.tensor.put_along_axis self.xnp = np.random.random(self.x_shape).astype(self.x_type) # numpy put_along_axis is an inplace opearion. 
self.xnp_result = copy.deepcopy(self.xnp) diff --git a/python/paddle/fluid/tests/unittests/test_size_op.py b/python/paddle/fluid/tests/unittests/test_size_op.py index b3ae19b8ef20e..edea44abf0890 100644 --- a/python/paddle/fluid/tests/unittests/test_size_op.py +++ b/python/paddle/fluid/tests/unittests/test_size_op.py @@ -15,15 +15,20 @@ import unittest import numpy as np -from op_test import OpTest +from eager_op_test import OpTest import paddle import paddle.fluid as fluid +def size_wrapper(input): + return paddle.numel(paddle.to_tensor(input)) + + class TestSizeOp(OpTest): def setUp(self): self.op_type = "size" + self.python_api = size_wrapper self.shape = [] self.config() input = np.zeros(self.shape, dtype='bool') diff --git a/python/paddle/fluid/tests/unittests/test_softmax_op.py b/python/paddle/fluid/tests/unittests/test_softmax_op.py index 290d72b2485b2..8696cc532820f 100644 --- a/python/paddle/fluid/tests/unittests/test_softmax_op.py +++ b/python/paddle/fluid/tests/unittests/test_softmax_op.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from op_test import OpTest, convert_float_to_uint16 +from eager_op_test import OpTest, convert_float_to_uint16 import paddle import paddle.fluid as fluid @@ -43,6 +43,12 @@ def ref_softmax(x, axis=None, dtype=None): return np.apply_along_axis(stable_softmax, axis, x_t) +def softmax_wrapper( + x, axis=-1, dtype=None, name=None, use_cudnn=False, use_mkldnn=False +): + return paddle.nn.functional.softmax(x, axis=axis, dtype=dtype) + + class TestSoftmaxOp(OpTest): def get_x_shape(self): return [10, 10] @@ -52,6 +58,7 @@ def get_axis(self): def setUp(self): self.op_type = "softmax" + self.python_api = softmax_wrapper self.use_cudnn = False self.use_mkldnn = False # explicilty use float32 for ROCm, as MIOpen does not yet support float64 @@ -109,6 +116,7 @@ def test_check_grad(self): class TestSoftmaxOp_ZeroDim1(TestSoftmaxOp): def setUp(self): self.op_type = "softmax" + self.python_api = softmax_wrapper self.use_cudnn = False self.use_mkldnn = False # explicilty use float32 for ROCm, as MIOpen does not yet support float64 @@ -133,6 +141,7 @@ def setUp(self): class TestSoftmaxOp_ZeroDim2(TestSoftmaxOp): def setUp(self): self.op_type = "softmax" + self.python_api = softmax_wrapper self.use_cudnn = True self.use_mkldnn = False # explicilty use float32 for ROCm, as MIOpen does not yet support float64 @@ -366,6 +375,7 @@ def get_x_shape(self): class TestSoftmaxBF16Op(OpTest): def setUp(self): self.op_type = "softmax" + self.python_api = softmax_wrapper self.use_cudnn = self.init_cudnn() self.use_mkldnn = False self.dtype = np.uint16 diff --git a/python/paddle/fluid/tests/unittests/test_spectral_norm_op.py b/python/paddle/fluid/tests/unittests/test_spectral_norm_op.py index 033ee7908866d..c60780f90c49b 100644 --- a/python/paddle/fluid/tests/unittests/test_spectral_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_spectral_norm_op.py @@ -15,9 +15,10 @@ import unittest import numpy as np -from op_test import OpTest, skip_check_grad_ci +from eager_op_test import OpTest, skip_check_grad_ci import paddle +from paddle import _C_ops from paddle.fluid.framework import Program, program_guard paddle.enable_static() @@ -47,6 +48,10 @@ def spectral_norm(weight, u, v, dim, power_iters, eps): return weight / sigma +def spectral_norm_wrapper(weight, u, v, dim, power_iters, eps): + return _C_ops.spectral_norm(weight, u, v, dim, power_iters, eps) + + @skip_check_grad_ci( reason="Spectral norm do not check grad when power_iters > 0 " "because grad is not 
calculated in power iterations, " @@ -56,6 +61,7 @@ class TestSpectralNormOpNoGrad(OpTest): def setUp(self): self.initTestCase() self.op_type = 'spectral_norm' + self.python_api = spectral_norm_wrapper weight = np.random.random(self.weight_shape).astype('float64') u = np.random.normal(0.0, 1.0, self.u_shape).astype('float64') v = np.random.normal(0.0, 1.0, self.v_shape).astype('float64') diff --git a/python/paddle/fluid/tests/unittests/test_split_op.py b/python/paddle/fluid/tests/unittests/test_split_op.py index 40e7bff55e0bc..d250302165bcb 100644 --- a/python/paddle/fluid/tests/unittests/test_split_op.py +++ b/python/paddle/fluid/tests/unittests/test_split_op.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from op_test import OpTest, convert_float_to_uint16 +from eager_op_test import OpTest, convert_float_to_uint16 import paddle import paddle.fluid as fluid @@ -24,6 +24,8 @@ class TestSplitOp(OpTest): def setUp(self): + self.python_api = paddle.split + self.python_out_sig = ['out0', 'out1', 'out2'] self._set_op_type() self.dtype = self.get_dtype() axis = 1 @@ -62,6 +64,8 @@ def test_check_grad(self): # test with attr(num) class TestSplitOp_2(OpTest): def setUp(self): + self.python_api = paddle.split + self.python_out_sig = ['out0', 'out1', 'out2'] self._set_op_type() self.dtype = self.get_dtype() self.init_data() @@ -98,6 +102,8 @@ def test_check_grad(self): # attr(axis) is Tensor class TestSplitOp_AxisTensor(OpTest): def setUp(self): + self.python_api = paddle.split + self.python_out_sig = ['out0', 'out1', 'out2'] self._set_op_type() self.dtype = self.get_dtype() self.init_data() @@ -133,6 +139,8 @@ def test_check_grad(self): # attr(sections) is list containing Tensor class TestSplitOp_SectionsTensor(OpTest): def setUp(self): + self.python_api = paddle.split + self.python_out_sig = ['out0', 'out1', 'out2'] self._set_op_type() self.dtype = self.get_dtype() self.init_data() @@ -178,6 +186,8 @@ def test_check_grad(self): class TestSplitOp_unk_section(OpTest): def setUp(self): + self.python_api = paddle.split + self.python_out_sig = ['out0', 'out1', 'out2'] self._set_op_type() self.dtype = self.get_dtype() self.init_data() diff --git a/python/paddle/fluid/tests/unittests/test_sum_op.py b/python/paddle/fluid/tests/unittests/test_sum_op.py index 6e9ff86cb8b7f..b712b0bb161f6 100644 --- a/python/paddle/fluid/tests/unittests/test_sum_op.py +++ b/python/paddle/fluid/tests/unittests/test_sum_op.py @@ -19,6 +19,11 @@ import gradient_checker import numpy as np from decorator_helper import prog_scope +from eager_op_test import ( + OpTest, + convert_float_to_uint16, + convert_uint16_to_float, +) import paddle import paddle.fluid as fluid @@ -26,16 +31,19 @@ import paddle.inference as paddle_infer from paddle import enable_static from paddle.fluid.op import Operator -from paddle.fluid.tests.unittests.op_test import ( - OpTest, - convert_float_to_uint16, - convert_uint16_to_float, -) + + +def sum_wrapper(X, use_mkldnn=False): + res = 0 + for x in X: + res += x + return res class TestSumOp(OpTest): def setUp(self): self.op_type = "sum" + self.python_api = sum_wrapper self.init_kernel_type() self.use_mkldnn = False self.init_kernel_type() @@ -341,10 +349,14 @@ def init_kernel_type(self): self.dtype = np.uint16 def test_check_output(self): - self.check_output() + # new dynamic graph mode does not support unit16 type + self.check_output(check_dygraph=False) def test_check_grad(self): - self.check_grad(['x0'], 'Out', numeric_grad_delta=0.5) + # new dynamic graph mode does not support unit16 type + 
self.check_grad( + ['x0'], 'Out', numeric_grad_delta=0.5, check_dygraph=False + ) class API_Test_Add_n(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_take_along_axis_op.py b/python/paddle/fluid/tests/unittests/test_take_along_axis_op.py index da3fa64417fe6..7abd86d19f676 100644 --- a/python/paddle/fluid/tests/unittests/test_take_along_axis_op.py +++ b/python/paddle/fluid/tests/unittests/test_take_along_axis_op.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from op_test import OpTest +from eager_op_test import OpTest import paddle from paddle.framework import core @@ -27,6 +27,7 @@ class TestTakeAlongAxisOp(OpTest): def setUp(self): self.init_data() self.op_type = "take_along_axis" + self.python_api = paddle.tensor.take_along_axis self.xnp = np.random.random(self.x_shape).astype(self.x_type) self.target = np.take_along_axis(self.xnp, self.index, self.axis) broadcast_shape_list = list(self.x_shape) From b0ee022b02aea2c6580a93127cf84da0598c0080 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=A7=9C=E6=B0=B8=E4=B9=85?= <34344716+yjjiang11@users.noreply.github.com> Date: Tue, 31 Jan 2023 19:48:35 +0800 Subject: [PATCH 48/89] migrating dot/sign/fill/norm from old dynamic graph to new dynamic graph (#49895) * check dygraph on for op tests * reset eigh and modify prelu&sign * update eager_op_test * lint * add more ops * fix reduce * modify reduce test * reset reduce_op * modify matmul test * revert prelu --- .../fluid/tests/unittests/test_dot_op.py | 26 ++++++++-------- .../tests/unittests/test_fill_any_like_op.py | 10 ++++++- .../tests/unittests/test_fill_constant_op.py | 21 +++++++++++-- .../fluid/tests/unittests/test_log_loss_op.py | 5 +++- .../fluid/tests/unittests/test_logspace.py | 7 ++++- .../unittests/test_lookup_table_v2_op.py | 7 +++-- .../tests/unittests/test_matmul_v2_op.py | 30 +++++++------------ .../tests/unittests/test_matrix_power_op.py | 3 +- .../fluid/tests/unittests/test_norm_op.py | 16 +++++++--- .../fluid/tests/unittests/test_numel_op.py | 3 +- .../tests/unittests/test_one_hot_v2_op.py | 11 ++++++- .../tests/unittests/test_shard_index_op.py | 5 +++- .../fluid/tests/unittests/test_sign_op.py | 1 + 13 files changed, 98 insertions(+), 47 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_dot_op.py b/python/paddle/fluid/tests/unittests/test_dot_op.py index d32057bfb0d2d..aa61c1e177869 100644 --- a/python/paddle/fluid/tests/unittests/test_dot_op.py +++ b/python/paddle/fluid/tests/unittests/test_dot_op.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from op_test import OpTest +from eager_op_test import OpTest import paddle import paddle.fluid as fluid @@ -38,7 +38,7 @@ def setUp(self): self.attrs = {} def test_check_output(self): - self.check_output(check_eager=True) + self.check_output() def test_check_grad_normal(self): if core.is_compiled_with_rocm(): @@ -46,10 +46,12 @@ def test_check_grad_normal(self): ['X', 'Y'], 'Out', user_defined_grads=[self.inputs['Y'], self.inputs['X']], - check_eager=True, ) else: - self.check_grad(['X', 'Y'], 'Out', check_eager=True) + self.check_grad( + ['X', 'Y'], + 'Out', + ) def test_check_grad_ingore_x(self): if core.is_compiled_with_rocm(): @@ -58,11 +60,12 @@ def test_check_grad_ingore_x(self): 'Out', no_grad_set=set("X"), user_defined_grads=[self.inputs['X']], - check_eager=True, ) else: self.check_grad( - ['Y'], 'Out', no_grad_set=set("X"), check_eager=True + ['Y'], + 'Out', + no_grad_set=set("X"), ) def test_check_grad_ingore_y(self): @@ -72,11 +75,12 @@ def 
test_check_grad_ingore_y(self): 'Out', no_grad_set=set('Y'), user_defined_grads=[self.inputs['Y']], - check_eager=True, ) else: self.check_grad( - ['X'], 'Out', no_grad_set=set('Y'), check_eager=True + ['X'], + 'Out', + no_grad_set=set('Y'), ) def init_input_output(self): @@ -187,7 +191,7 @@ def init_grad_input_output(self): self.grad_y = self.grad_out * np.conj(self.x) def test_check_output(self): - self.check_output(check_eager=True) + self.check_output() def test_check_grad_normal(self): self.check_grad( @@ -195,7 +199,6 @@ def test_check_grad_normal(self): 'Out', user_defined_grads=[self.grad_x, self.grad_y], user_defined_grad_outputs=[self.grad_out], - check_eager=True, ) def test_check_grad_ingore_x(self): @@ -205,7 +208,6 @@ def test_check_grad_ingore_x(self): no_grad_set=set("X"), user_defined_grads=[self.grad_y], user_defined_grad_outputs=[self.grad_out], - check_eager=True, ) def test_check_grad_ingore_y(self): @@ -215,13 +217,13 @@ def test_check_grad_ingore_y(self): no_grad_set=set('Y'), user_defined_grads=[self.grad_x], user_defined_grad_outputs=[self.grad_out], - check_eager=True, ) class TestComplexDotOp2D(OpTest): def setUp(self): self.op_type = "dot" + self.python_api = paddle.dot self.init_base_dtype() self.init_input_output() self.init_grad_input_output() diff --git a/python/paddle/fluid/tests/unittests/test_fill_any_like_op.py b/python/paddle/fluid/tests/unittests/test_fill_any_like_op.py index 9fa333d623bf9..d87b47270d4fa 100644 --- a/python/paddle/fluid/tests/unittests/test_fill_any_like_op.py +++ b/python/paddle/fluid/tests/unittests/test_fill_any_like_op.py @@ -15,15 +15,21 @@ import unittest import numpy as np -from op_test import OpTest, convert_float_to_uint16 +from eager_op_test import OpTest, convert_float_to_uint16 import paddle import paddle.fluid.core as core +def fill_any_like_wrapper(x, value): + x.fill_(value) + return x + + class TestFillAnyLikeOp(OpTest): def setUp(self): self.op_type = "fill_any_like" + self.python_api = fill_any_like_wrapper self.dtype = np.int32 self.value = 0.0 self.init() @@ -50,6 +56,7 @@ def init(self): class TestFillAnyLikeOpBfloat16(OpTest): def setUp(self): self.op_type = "fill_any_like" + self.python_api = fill_any_like_wrapper self.dtype = np.uint16 self.value = 0.0 self.inputs = {'X': np.random.random((219, 232)).astype(np.float32)} @@ -83,6 +90,7 @@ def init(self): class TestFillAnyLikeOpType(TestFillAnyLikeOp): def setUp(self): self.op_type = "fill_any_like" + self.python_api = fill_any_like_wrapper self.dtype = np.int32 self.value = 0.0 self.init() diff --git a/python/paddle/fluid/tests/unittests/test_fill_constant_op.py b/python/paddle/fluid/tests/unittests/test_fill_constant_op.py index 38ef0379747db..3151744aa4cc1 100644 --- a/python/paddle/fluid/tests/unittests/test_fill_constant_op.py +++ b/python/paddle/fluid/tests/unittests/test_fill_constant_op.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from op_test import OpTest, convert_float_to_uint16 +from eager_op_test import OpTest, convert_float_to_uint16 import paddle import paddle.fluid as fluid @@ -24,11 +24,17 @@ from paddle.fluid.op import Operator +def fill_wrapper(shape, value=0.0): + out = paddle.full(shape=shape, fill_value=value) + return out + + # Situation 1: Attr(shape) is a list(without tensor) class TestFillConstantOp1(OpTest): def setUp(self): '''Test fill_constant op with specified value''' self.op_type = "fill_constant" + self.python_api = fill_wrapper self.inputs = {} self.attrs = {'shape': [123, 92], 'value': 3.8} @@ -42,6 +48,7 @@ class 
TestFillConstantOp2(OpTest): def setUp(self): '''Test fill_constant op with default value''' self.op_type = "fill_constant" + self.python_api = fill_wrapper self.inputs = {} self.attrs = {'shape': [123, 92]} @@ -55,6 +62,7 @@ class TestFillConstantOp3(OpTest): def setUp(self): '''Test fill_constant op with specified int64 value''' self.op_type = "fill_constant" + self.python_api = fill_wrapper self.inputs = {} self.attrs = {'shape': [123, 92], 'value': 10000000000} @@ -68,6 +76,7 @@ class TestFillConstantOp4(OpTest): def setUp(self): '''Test fill_constant op with specified int value''' self.op_type = "fill_constant" + self.python_api = fill_wrapper self.inputs = {} self.attrs = {'shape': [123, 92], 'value': 3} @@ -84,6 +93,7 @@ class TestFillConstantBF16Op(OpTest): def setUp(self): '''Test fill_constant op with specified value''' self.op_type = "fill_constant" + self.python_api = fill_wrapper self.dtype = np.uint16 self.inputs = {} self.attrs = { @@ -130,6 +140,7 @@ class TestFillConstantOp1_ShapeTensorList(OpTest): def setUp(self): '''Test fill_constant op with specified value''' self.op_type = "fill_constant" + self.python_api = fill_wrapper self.init_data() shape_tensor_list = [] for index, ele in enumerate(self.shape): @@ -154,6 +165,7 @@ class TestFillConstantOp2_ShapeTensorList(OpTest): def setUp(self): '''Test fill_constant op with default value''' self.op_type = "fill_constant" + self.python_api = fill_wrapper self.init_data() shape_tensor_list = [] for index, ele in enumerate(self.shape): @@ -192,6 +204,7 @@ class TestFillConstantOp1_ShapeTensor(OpTest): def setUp(self): '''Test fill_constant op with specified value''' self.op_type = "fill_constant" + self.python_api = fill_wrapper self.init_data() self.inputs = {"ShapeTensor": np.array(self.shape).astype("int32")} @@ -211,6 +224,7 @@ class TestFillConstantOp1_ValueTensor(OpTest): def setUp(self): '''Test fill_constant op with specified value''' self.op_type = "fill_constant" + self.python_api = fill_wrapper self.init_data() self.inputs = { @@ -234,6 +248,7 @@ class TestFillConstantOp2_ValueTensor(OpTest): def setUp(self): '''Test fill_constant op with specified value''' self.op_type = "fill_constant" + self.python_api = fill_wrapper self.init_data() self.inputs = { @@ -452,6 +467,7 @@ class TestFillConstantOp_ValueTensorBf16(OpTest): def setUp(self): '''Test fill_constant op with specified value''' self.op_type = "fill_constant" + self.python_api = fill_wrapper self.init_data() self.inputs = { @@ -470,7 +486,8 @@ def init_data(self): self.mkldnn_data_type = "bfloat16" def test_check_output(self): - self.check_output_with_place(core.CPUPlace()) + # no dynamic graph test for mkldnn + self.check_output_with_place(core.CPUPlace(), check_dygraph=False) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_log_loss_op.py b/python/paddle/fluid/tests/unittests/test_log_loss_op.py index 25bede0af214b..cb1b50b49a853 100644 --- a/python/paddle/fluid/tests/unittests/test_log_loss_op.py +++ b/python/paddle/fluid/tests/unittests/test_log_loss_op.py @@ -15,7 +15,9 @@ import unittest import numpy as np -from op_test import OpTest +from eager_op_test import OpTest + +from paddle.nn import functional as F def sigmoid_array(x): @@ -25,6 +27,7 @@ def sigmoid_array(x): class TestLogLossOp(OpTest): def setUp(self): self.op_type = 'log_loss' + self.python_api = F.log_loss samples_num = 100 x = np.random.random((samples_num, 1)).astype("float32") diff --git a/python/paddle/fluid/tests/unittests/test_logspace.py 
b/python/paddle/fluid/tests/unittests/test_logspace.py index 2a0d466a600d8..dee098dd5f34d 100644 --- a/python/paddle/fluid/tests/unittests/test_logspace.py +++ b/python/paddle/fluid/tests/unittests/test_logspace.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from op_test import OpTest +from eager_op_test import OpTest import paddle @@ -23,6 +23,7 @@ class TestLogspaceOpCommonCase(OpTest): def setUp(self): self.op_type = "logspace" + self.python_api = paddle.logspace dtype = 'float32' self.inputs = { 'Start': np.array([0]).astype(dtype), @@ -41,6 +42,7 @@ def test_check_output(self): class TestLogspaceOpReverseCase(OpTest): def setUp(self): self.op_type = "logspace" + self.python_api = paddle.logspace dtype = 'float32' self.inputs = { 'Start': np.array([10]).astype(dtype), @@ -59,6 +61,7 @@ def test_check_output(self): class TestLogspaceOpNumOneCase(OpTest): def setUp(self): self.op_type = "logspace" + self.python_api = paddle.logspace dtype = 'float32' self.inputs = { 'Start': np.array([10]).astype(dtype), @@ -77,6 +80,7 @@ def test_check_output(self): class TestLogspaceOpMinusBaseCase(OpTest): def setUp(self): self.op_type = "logspace" + self.python_api = paddle.logspace dtype = 'float32' self.inputs = { 'Start': np.array([0]).astype(dtype), @@ -95,6 +99,7 @@ def test_check_output(self): class TestLogspaceOpZeroBaseCase(OpTest): def setUp(self): self.op_type = "logspace" + self.python_api = paddle.logspace dtype = 'float32' self.inputs = { 'Start': np.array([0]).astype(dtype), diff --git a/python/paddle/fluid/tests/unittests/test_lookup_table_v2_op.py b/python/paddle/fluid/tests/unittests/test_lookup_table_v2_op.py index eb9c4c60893e0..74b6eec7198c6 100644 --- a/python/paddle/fluid/tests/unittests/test_lookup_table_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_lookup_table_v2_op.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from op_test import OpTest, skip_check_grad_ci +from eager_op_test import OpTest, skip_check_grad_ci import paddle import paddle.fluid as fluid @@ -57,10 +57,10 @@ def id_dtype(self): return "int64" def test_check_output(self): - self.check_output(check_eager=True) + self.check_output() def test_check_grad(self): - self.check_grad(['W'], 'Out', no_grad_set=set('Ids'), check_eager=True) + self.check_grad(['W'], 'Out', no_grad_set=set('Ids')) class TestLookupTableOpInt16(OpTest): @@ -81,6 +81,7 @@ def id_dtype(self): class TestLookupTableOpWithTensorIds(OpTest): def setUp(self): self.op_type = "lookup_table_v2" + self.python_api = paddle.nn.functional.embedding table = np.random.random((17, 31)).astype("float64") ids = np.random.randint(low=0, high=17, size=(2, 4, 5)).astype("int32") self.inputs = {'W': table, 'Ids': ids} diff --git a/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py b/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py index 8136425595a1c..e78ea74260d1e 100644 --- a/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py @@ -15,12 +15,12 @@ import unittest import numpy as np -from op_test import OpTest, convert_float_to_uint16, get_numeric_gradient +from eager_op_test import OpTest, convert_float_to_uint16, get_numeric_gradient +from testsuite import create_op import paddle import paddle.fluid as fluid import paddle.fluid.core as core -from paddle.fluid.tests.unittests.testsuite import create_op def reference_matmul(X, Y, transpose_X=False, transpose_Y=False): @@ -72,6 +72,7 @@ def setUp(self): self.init_kernel_type() self.config() self.op_type = 
"matmul_v2" + self.python_api = paddle.tensor.matmul if self.is_bfloat16_op(): x = np.random.random(self.x_shape).astype(np.float32) y = np.random.random(self.y_shape).astype(np.float32) @@ -102,15 +103,13 @@ def setUp(self): self.outputs = {'Out': result} def test_check_output(self): - self.check_output(check_eager=False) + self.check_output() def test_check_grad(self): if core.is_compiled_with_rocm(): - self.check_grad( - ['X', 'Y'], 'Out', max_relative_error=1e-2, check_eager=False - ) + self.check_grad(['X', 'Y'], 'Out', max_relative_error=1e-2) else: - self.check_grad(['X', 'Y'], 'Out', check_eager=False) + self.check_grad(['X', 'Y'], 'Out') class TestMatMulOp2(TestMatMulV2Op): @@ -344,9 +343,7 @@ def test_check_output(self): if core.is_compiled_with_cuda(): place = core.CUDAPlace(0) if core.is_float16_supported(place): - self.check_output_with_place( - place, atol=atol, check_eager=False - ) + self.check_output_with_place(place, atol=atol) def test_check_grad(self): place = core.CUDAPlace(0) @@ -356,7 +353,6 @@ def test_check_grad(self): ['X', 'Y'], 'Out', max_relative_error=max_relative_error, - check_eager=False, ) cls_name = "{0}_{1}".format(parent.__name__, "Fp16") @@ -562,6 +558,7 @@ def test_compute_type_fp16_nan(self): class TestComplexMatMulOp(OpTest): def setUp(self): self.op_type = "matmul_v2" + self.python_api = paddle.tensor.matmul self.init_base_dtype() self.init_input_output() self.init_grad_input_output() @@ -593,7 +590,7 @@ def init_grad_input_output(self): self.grad_y = np.matmul(np.conj(self.x).T, self.grad_out) def test_check_output(self): - self.check_output(check_eager=False) + self.check_output() def test_check_grad_normal(self): self.check_grad( @@ -601,7 +598,6 @@ def test_check_grad_normal(self): 'Out', user_defined_grads=[self.grad_x, self.grad_y], user_defined_grad_outputs=[self.grad_out], - check_eager=False, ) def test_check_grad_ingore_x(self): @@ -611,7 +607,6 @@ def test_check_grad_ingore_x(self): no_grad_set=set("X"), user_defined_grads=[self.grad_y], user_defined_grad_outputs=[self.grad_out], - check_eager=False, ) def test_check_grad_ingore_y(self): @@ -621,13 +616,13 @@ def test_check_grad_ingore_y(self): no_grad_set=set('Y'), user_defined_grads=[self.grad_x], user_defined_grad_outputs=[self.grad_out], - check_eager=False, ) class TestComplexMatMulOpBroadcast(OpTest): def setUp(self): self.op_type = "matmul_v2" + self.python_api = paddle.tensor.matmul self.init_base_dtype() self.init_input_output() self.init_grad_input_output() @@ -661,7 +656,7 @@ def init_grad_input_output(self): ) def test_check_output(self): - self.check_output(check_eager=False) + self.check_output() def test_check_grad_normal(self): self.check_grad( @@ -669,7 +664,6 @@ def test_check_grad_normal(self): 'Out', user_defined_grads=[self.grad_x, self.grad_y], user_defined_grad_outputs=[self.grad_out], - check_eager=False, ) def test_check_grad_ingore_x(self): @@ -679,7 +673,6 @@ def test_check_grad_ingore_x(self): no_grad_set=set("X"), user_defined_grads=[self.grad_y], user_defined_grad_outputs=[self.grad_out], - check_eager=False, ) def test_check_grad_ingore_y(self): @@ -689,7 +682,6 @@ def test_check_grad_ingore_y(self): no_grad_set=set('Y'), user_defined_grads=[self.grad_x], user_defined_grad_outputs=[self.grad_out], - check_eager=False, ) diff --git a/python/paddle/fluid/tests/unittests/test_matrix_power_op.py b/python/paddle/fluid/tests/unittests/test_matrix_power_op.py index 7f26a7170191f..6381aeeca9868 100644 --- 
a/python/paddle/fluid/tests/unittests/test_matrix_power_op.py +++ b/python/paddle/fluid/tests/unittests/test_matrix_power_op.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from op_test import OpTest +from eager_op_test import OpTest import paddle import paddle.fluid as fluid @@ -32,6 +32,7 @@ def config(self): def setUp(self): self.op_type = "matrix_power" + self.python_api = paddle.tensor.matrix_power self.config() np.random.seed(123) diff --git a/python/paddle/fluid/tests/unittests/test_norm_op.py b/python/paddle/fluid/tests/unittests/test_norm_op.py index 73938f2d1b1c0..7b4b8dc60a02a 100644 --- a/python/paddle/fluid/tests/unittests/test_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_norm_op.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from op_test import OpTest, skip_check_grad_ci +from eager_op_test import OpTest, skip_check_grad_ci import paddle import paddle.fluid as fluid @@ -29,10 +29,14 @@ def l2_norm(x, axis, epsilon): return y, r +def norm_wrapper(x, axis=1, epsilon=1e-12, is_test=False): + return paddle.nn.functional.normalize(x, axis=axis, epsilon=epsilon) + + class TestNormOp(OpTest): def setUp(self): self.op_type = "norm" - self.python_api = paddle.nn.functional.normalize + self.python_api = norm_wrapper self.init_test_case() self.init_dtype() x = np.random.random(self.shape).astype(self.dtype) @@ -40,6 +44,7 @@ def setUp(self): self.inputs = {'X': x} self.attrs = {'epsilon': self.epsilon, 'axis': self.axis} self.outputs = {'Out': y, 'Norm': norm} + self.python_out_sig = ['Out'] def test_check_output(self): self.check_output() @@ -126,19 +131,22 @@ def test_check_grad(self): class TestNormTestOp(OpTest): def setUp(self): self.op_type = "norm" + self.python_api = norm_wrapper self.init_test_case() x = np.random.random(self.shape).astype("float64") y, norm = l2_norm(x, self.axis, self.epsilon) self.inputs = {'X': x} self.attrs = { 'epsilon': self.epsilon, - 'axis': self.axis, + 'axis': int(self.axis), 'is_test': True, } self.outputs = {'Out': y} + self.python_out_sig = ["out"] def test_check_output(self): - self.check_output() + # dynamic graph just supports float tensor + self.check_output(check_dygraph=True) def test_check_grad(self): pass diff --git a/python/paddle/fluid/tests/unittests/test_numel_op.py b/python/paddle/fluid/tests/unittests/test_numel_op.py index 1878c8409f5a3..a2414ed369b9b 100644 --- a/python/paddle/fluid/tests/unittests/test_numel_op.py +++ b/python/paddle/fluid/tests/unittests/test_numel_op.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from op_test import OpTest +from eager_op_test import OpTest import paddle import paddle.fluid as fluid @@ -24,6 +24,7 @@ class TestNumelOp(OpTest): def setUp(self): self.op_type = "size" + self.python_api = paddle.numel self.init() x = np.random.random((self.shape)).astype("float64") self.inputs = { diff --git a/python/paddle/fluid/tests/unittests/test_one_hot_v2_op.py b/python/paddle/fluid/tests/unittests/test_one_hot_v2_op.py index 30bb75e0fa783..5d78b371b5fe9 100644 --- a/python/paddle/fluid/tests/unittests/test_one_hot_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_one_hot_v2_op.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from op_test import OpTest +from eager_op_test import OpTest import paddle import paddle.fluid as fluid @@ -23,9 +23,15 @@ from paddle.fluid.framework import Program, program_guard +def one_hot_wrapper(x, depth_tensor, **keargs): + return paddle.nn.functional.one_hot(x, depth_tensor) + + class TestOneHotOp(OpTest): def setUp(self): 
self.op_type = 'one_hot_v2' + self.python_api = one_hot_wrapper + self.python_out_sig = ['Out'] depth = 10 depth_np = np.array(10).astype('int32') dimension = 12 @@ -49,6 +55,7 @@ def test_check_output(self): class TestOneHotOp_attr(OpTest): def setUp(self): self.op_type = 'one_hot_v2' + self.python_api = one_hot_wrapper depth = 10 dimension = 12 x_lod = [[4, 1, 3, 3]] @@ -73,6 +80,7 @@ def test_check_output(self): class TestOneHotOp_default_dtype(OpTest): def setUp(self): self.op_type = 'one_hot_v2' + self.python_api = one_hot_wrapper depth = 10 depth_np = np.array(10).astype('int32') dimension = 12 @@ -96,6 +104,7 @@ def test_check_output(self): class TestOneHotOp_default_dtype_attr(OpTest): def setUp(self): self.op_type = 'one_hot_v2' + self.python_api = one_hot_wrapper depth = 10 dimension = 12 x_lod = [[4, 1, 3, 3]] diff --git a/python/paddle/fluid/tests/unittests/test_shard_index_op.py b/python/paddle/fluid/tests/unittests/test_shard_index_op.py index dfbb98a791372..77cbecd641c14 100644 --- a/python/paddle/fluid/tests/unittests/test_shard_index_op.py +++ b/python/paddle/fluid/tests/unittests/test_shard_index_op.py @@ -15,11 +15,14 @@ import unittest import numpy as np -from op_test import OpTest +from eager_op_test import OpTest + +import paddle def common_setup(self, index_num, nshards, shard_id, ignore_value): self.op_type = 'shard_index' + self.python_api = paddle.tensor.shard_index x_lod = [[i for i in range(10)]] N = sum(x_lod[0]) x = [np.random.randint(0, index_num - 1) for i in range(N)] diff --git a/python/paddle/fluid/tests/unittests/test_sign_op.py b/python/paddle/fluid/tests/unittests/test_sign_op.py index 7834736260d9d..1f7d3b8228a77 100644 --- a/python/paddle/fluid/tests/unittests/test_sign_op.py +++ b/python/paddle/fluid/tests/unittests/test_sign_op.py @@ -28,6 +28,7 @@ class TestSignOp(OpTest): def setUp(self): self.op_type = "sign" + self.python_api = paddle.sign self.inputs = { 'X': np.random.uniform(-10, 10, (10, 10)).astype("float64") } From 111075a349054acb67d272450da4dc5f81ad61c8 Mon Sep 17 00:00:00 2001 From: wenbin Date: Tue, 31 Jan 2023 20:07:54 +0800 Subject: [PATCH 49/89] gn_silu (#49928) * gn_silu * add ut * set TIMEOUT * correct comments * comments * disable windows ut * rename parameter --- paddle/fluid/framework/ir/CMakeLists.txt | 1 + .../fluid/framework/ir/groupnorm_act_pass.cc | 167 ++++++++++++++++++ .../fluid/framework/ir/groupnorm_act_pass.h | 81 +++++++++ .../inference/api/paddle_pass_builder.cc | 1 + .../tensorrt/convert/group_norm_op.cc | 6 + .../plugin/common/groupNormPluginCommon.h | 4 +- .../tensorrt/plugin/group_norm_op_plugin.cu | 8 +- .../tensorrt/plugin/group_norm_op_plugin.h | 8 +- .../plugin/preln_groupnorm_act_op_plugin.cu | 6 +- .../plugin/skip_groupnorm_act_op_plugin.cu | 6 +- .../unittests/ir/inference/CMakeLists.txt | 3 + .../test_groupnorm_act_pass_fuse_pass.py | 150 ++++++++++++++++ 12 files changed, 428 insertions(+), 13 deletions(-) create mode 100644 paddle/fluid/framework/ir/groupnorm_act_pass.cc create mode 100644 paddle/fluid/framework/ir/groupnorm_act_pass.h create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_groupnorm_act_pass_fuse_pass.py diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index b387dc1d6cc26..23d5b0de24722 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -144,6 +144,7 @@ if(WITH_TENSORRT) pass_library(trt_support_nhwc_pass inference) pass_library(elementwise_groupnorm_act_pass 
inference) pass_library(preln_elementwise_groupnorm_act_pass inference) + pass_library(groupnorm_act_pass inference) pass_library(trt_embedding_eltwise_layernorm_fuse_pass inference) pass_library(preln_embedding_eltwise_layernorm_fuse_pass inference) endif() diff --git a/paddle/fluid/framework/ir/groupnorm_act_pass.cc b/paddle/fluid/framework/ir/groupnorm_act_pass.cc new file mode 100644 index 0000000000000..397a7437757cc --- /dev/null +++ b/paddle/fluid/framework/ir/groupnorm_act_pass.cc @@ -0,0 +1,167 @@ +/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/ir/groupnorm_act_pass.h" + +#include + +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/op_version_registry.h" + +namespace paddle { +namespace framework { +namespace ir { +class Node; +} // namespace ir +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace framework { +namespace ir { +namespace patterns { + +struct GroupNormAct : public PatternBase { + GroupNormAct(PDPattern *pattern, const std::string &name_scope) + : PatternBase(pattern, name_scope, "groupnorm_act") {} + + void operator()(PDNode *x); + // declare operator node's name + PATTERN_DECL_NODE(group_norm); + // declare variable node's name + PATTERN_DECL_NODE(elementwise_out); + + PATTERN_DECL_NODE(group_norm_bias); + PATTERN_DECL_NODE(group_norm_scale); + PATTERN_DECL_NODE(group_norm_out); + PATTERN_DECL_NODE(act); + PATTERN_DECL_NODE(act_out); +}; + +void GroupNormAct::operator()(PDNode *x) { + // Create nodes for group_norm op. + auto *group_norm = + pattern->NewNode(group_norm_repr())->assert_is_op("group_norm"); + auto *group_norm_bias_var = pattern->NewNode(group_norm_bias_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("group_norm", "Bias"); + + auto *group_norm_scale_var = pattern->NewNode(group_norm_scale_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("group_norm", "Scale"); + + auto *group_norm_out_var = pattern->NewNode(group_norm_out_repr()) + ->AsOutput() + ->assert_is_op_output("group_norm", "Y") + ->assert_is_op_input("silu", "X"); + + // Add links for group_norm op. 
+ group_norm->LinksFrom({x, group_norm_bias_var, group_norm_scale_var}) + .LinksTo({group_norm_out_var}); + + auto *act = pattern->NewNode(act_repr())->assert_is_op("silu"); + auto *act_out = pattern->NewNode(act_out_repr()) + ->AsOutput() + ->assert_is_op_output("silu", "Out"); + + act->LinksFrom({group_norm_out_var}).LinksTo({act_out}); +} + +} // namespace patterns + +int GroupNormActFusePass::ApplyGNSiluPattern(ir::Graph *graph) const { + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::PreconditionNotMet("graph should not be null.")); + FusePassBase::Init("groupnorm_silu_fuse", graph); + + int found_subgraph_count = 0; + + GraphPatternDetector gpd; + PDNode *x = nullptr; + + x = gpd.mutable_pattern() + ->NewNode("groupnorm_act_fuse/x") + ->AsInput() + ->assert_var_not_persistable() + ->assert_is_op_input("group_norm", "X"); + + patterns::GroupNormAct fused_pattern(gpd.mutable_pattern(), + "groupnorm_act_fuse"); + fused_pattern(x); + + auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, + Graph *graph) { + if (subgraph.count(x) <= 0) { + LOG(WARNING) << "The subgraph is empty."; + return; + } + + VLOG(4) << "handle groupnorm act fuse"; + + GET_IR_NODE_FROM_SUBGRAPH(group_norm, group_norm, fused_pattern); + GET_IR_NODE_FROM_SUBGRAPH(group_norm_bias, group_norm_bias, fused_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + group_norm_scale, group_norm_scale, fused_pattern); + GET_IR_NODE_FROM_SUBGRAPH(group_norm_out, group_norm_out, fused_pattern); + GET_IR_NODE_FROM_SUBGRAPH(act, act, fused_pattern); + GET_IR_NODE_FROM_SUBGRAPH(act_out, act_out, fused_pattern); + + if (!IsCompat(subgraph, graph)) { + LOG(WARNING) << "groupnorm act pass in op compat failed."; + return; + } + + std::unordered_set del_node_set; + // Create an skip_groupnorm_act op node + OpDesc new_desc(*group_norm->Op()); + new_desc.SetAttr("with_silu", true); + new_desc.SetOutput("Y", {act_out->Name()}); + new_desc.Flush(); + + auto fused_node = graph->CreateOpNode(&new_desc); // OpDesc will be copied. + + del_node_set.insert(group_norm); + del_node_set.insert(group_norm_out); + del_node_set.insert(act); + GraphSafeRemoveNodes(graph, del_node_set); + + IR_NODE_LINK_TO(subgraph.at(x), fused_node); + IR_NODE_LINK_TO(group_norm_scale, fused_node); + IR_NODE_LINK_TO(group_norm_bias, fused_node); + IR_NODE_LINK_TO(fused_node, act_out); + found_subgraph_count++; + }; + + gpd(graph, handler); + return found_subgraph_count; +} + +void GroupNormActFusePass::ApplyImpl(ir::Graph *graph) const { + FusePassBase::Init("groupnorm_act_fuse_pass", graph); + int found_subgraph_count = ApplyGNSiluPattern(graph); + AddStatis(found_subgraph_count); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(groupnorm_act_pass, paddle::framework::ir::GroupNormActFusePass); +REGISTER_PASS_CAPABILITY(groupnorm_act_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .EQ("silu", 0) + .EQ("group_norm", 0)); diff --git a/paddle/fluid/framework/ir/groupnorm_act_pass.h b/paddle/fluid/framework/ir/groupnorm_act_pass.h new file mode 100644 index 0000000000000..16e4d332d29f0 --- /dev/null +++ b/paddle/fluid/framework/ir/groupnorm_act_pass.h @@ -0,0 +1,81 @@ +/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { +// +// | | +// group_norm group_norm +// | -> | +// silu +// | + +class Graph; + +class GroupNormActFusePass : public FusePassBase { + public: + GroupNormActFusePass() { + AddOpCompat(OpCompat("group_norm")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Scale") + .IsTensor() + .End() + .AddInput("Bias") + .IsTensor() + .End() + .AddOutput("Y") + .IsTensor() + .End() + .AddOutput("Mean") + .IsTensor() + .End() + .AddOutput("Variance") + .IsTensor() + .End() + .AddAttr("epsilon") + .IsNumGE(0.0f) + .IsNumLE(1.0f) + .End() + .AddAttr("groups") + .IsNumGE(1) + .End() + .AddAttr("data_layout") + .IsStringIn({"NCHW"}) + .End(); + AddOpCompat(OpCompat("silu")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End(); + } + + virtual ~GroupNormActFusePass() {} + + protected: + void ApplyImpl(ir::Graph* graph) const override; + int ApplyGNSiluPattern(ir::Graph* graph) const; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index 9f28343525c12..b5582518eacd2 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -136,6 +136,7 @@ const std::vector kTRTSubgraphPasses({ #else "elementwise_groupnorm_act_pass", // "preln_elementwise_groupnorm_act_pass", // + "groupnorm_act_pass", // #endif "tensorrt_subgraph_pass", // "conv_bn_fuse_pass", // diff --git a/paddle/fluid/inference/tensorrt/convert/group_norm_op.cc b/paddle/fluid/inference/tensorrt/convert/group_norm_op.cc index 2afc86dfc815d..4384f7d2b3cb9 100644 --- a/paddle/fluid/inference/tensorrt/convert/group_norm_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/group_norm_op.cc @@ -46,6 +46,11 @@ class GroupNormOpConverter : public OpConverter { std::string scale_name = op_desc.Input("Scale").front(); std::string bias_name = op_desc.Input("Bias").front(); + bool with_silu = false; + if (op_desc.HasAttr("with_silu")) { + with_silu = PADDLE_GET_CONST(bool, op_desc.GetAttr("with_silu")); + } + // get the presistable var's data auto GetWeight = [&](const std::string& var_name, framework::DDim* dims) -> TensorRTEngine::Weight { @@ -77,6 +82,7 @@ class GroupNormOpConverter : public OpConverter { groups, mean_shape, variance_shape, + with_silu, with_fp16); nvinfer1::ILayer* groupnorm_layer = engine_->AddDynamicPlugin(&input_itensor, 1, plugin); diff --git a/paddle/fluid/inference/tensorrt/plugin/common/groupNormPluginCommon.h b/paddle/fluid/inference/tensorrt/plugin/common/groupNormPluginCommon.h index 81d507e866a1c..915ee1b5e23ac 100644 --- a/paddle/fluid/inference/tensorrt/plugin/common/groupNormPluginCommon.h +++ b/paddle/fluid/inference/tensorrt/plugin/common/groupNormPluginCommon.h @@ -49,8 +49,8 @@ struct GroupNormNHWCParams { int32_t c; // The number of groups. int32_t groups; - // Do we apply the Swish activation function? 
- bool withSwish; + // Do we apply the Silu activation function? + bool withSilu; // Precomputed values and parameters to control the execution of the kernels. diff --git a/paddle/fluid/inference/tensorrt/plugin/group_norm_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/group_norm_op_plugin.cu index 77c00d47d4cea..fc139a9734b30 100644 --- a/paddle/fluid/inference/tensorrt/plugin/group_norm_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/group_norm_op_plugin.cu @@ -247,8 +247,8 @@ __global__ void groupNormNHWCScaleKernel(const GroupNormNHWCParams params) { f2.x = gammaF2.x * f2.x + betaF2.x; f2.y = gammaF2.y * f2.y + betaF2.y; - // Apply Swish if needed. - if (params.withSwish) { + // Apply Silu if needed. + if (params.withSilu) { f2.x = f2.x * sigmoid(f2.x); f2.y = f2.y * sigmoid(f2.y); } @@ -457,7 +457,7 @@ bool GroupNormPluginDynamic::supportsFormatCombination( if (pos == 0) { if (with_fp16_) { return ((in.type == nvinfer1::DataType::kHALF) && - (in.format == nvinfer1::PluginFormat::kLINEAR || + ((!with_silu_ && in.format == nvinfer1::PluginFormat::kLINEAR) || in.format == nvinfer1::PluginFormat::kHWC8)); } else { return (in.type == nvinfer1::DataType::kFLOAT) && @@ -624,7 +624,7 @@ int GroupNormPluginDynamic::enqueue( cPerBlock = 8; } - params_.withSwish = false; + params_.withSilu = with_silu_; params_.dst = static_cast(outputs[0]); params_.srcX = static_cast(inputs[0]); params_.gamma = scale_gpu_; diff --git a/paddle/fluid/inference/tensorrt/plugin/group_norm_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/group_norm_op_plugin.h index 1fa505c077ea8..3feb35e0708bc 100644 --- a/paddle/fluid/inference/tensorrt/plugin/group_norm_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/group_norm_op_plugin.h @@ -164,11 +164,13 @@ class GroupNormPluginDynamic : public DynamicPluginTensorRT { int groups, std::vector mean_shape, std::vector variance_shape, + bool with_silu, bool with_fp16) : groups_(groups), eps_(eps), mean_shape_(mean_shape), variance_shape_(variance_shape), + with_silu_(with_silu), with_fp16_(with_fp16) { scale_.resize(scale_num); bias_.resize(bias_num); @@ -183,6 +185,7 @@ class GroupNormPluginDynamic : public DynamicPluginTensorRT { DeserializeValue(&serialData, &serialLength, &groups_); DeserializeValue(&serialData, &serialLength, &mean_shape_); DeserializeValue(&serialData, &serialLength, &variance_shape_); + DeserializeValue(&serialData, &serialLength, &with_silu_); DeserializeValue(&serialData, &serialLength, &with_fp16_); } nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override { @@ -194,6 +197,7 @@ class GroupNormPluginDynamic : public DynamicPluginTensorRT { groups_, mean_shape_, variance_shape_, + with_silu_, with_fp16_); ptr->scale_gpu_ = scale_gpu_; ptr->bias_gpu_ = bias_gpu_; @@ -210,7 +214,7 @@ class GroupNormPluginDynamic : public DynamicPluginTensorRT { return SerializedSize(scale_) + SerializedSize(bias_) + SerializedSize(eps_) + SerializedSize(groups_) + SerializedSize(mean_shape_) + SerializedSize(variance_shape_) + - SerializedSize(with_fp16_); + SerializedSize(with_silu_) + SerializedSize(with_fp16_); } void serialize(void* buffer) const TRT_NOEXCEPT override { SerializeValue(&buffer, scale_); @@ -219,6 +223,7 @@ class GroupNormPluginDynamic : public DynamicPluginTensorRT { SerializeValue(&buffer, groups_); SerializeValue(&buffer, mean_shape_); SerializeValue(&buffer, variance_shape_); + SerializeValue(&buffer, with_silu_); SerializeValue(&buffer, with_fp16_); } nvinfer1::DimsExprs getOutputDimensions( @@ -277,6 +282,7 
@@ class GroupNormPluginDynamic : public DynamicPluginTensorRT { std::vector mean_shape_; std::vector variance_shape_; GroupNormNHWCParams params_; + bool with_silu_; bool with_fp16_; }; class GroupNormPluginDynamicCreator : public TensorRTPluginCreator { diff --git a/paddle/fluid/inference/tensorrt/plugin/preln_groupnorm_act_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/preln_groupnorm_act_op_plugin.cu index a756a826bfb15..d3ca36770a4d2 100644 --- a/paddle/fluid/inference/tensorrt/plugin/preln_groupnorm_act_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/preln_groupnorm_act_op_plugin.cu @@ -330,8 +330,8 @@ __global__ void prelnGroupNormNHWCScaleKernel(GroupNormNHWCParams params) { f2.x = gammaF2.x * f2.x + betaF2.x; f2.y = gammaF2.y * f2.y + betaF2.y; - // Apply Swish if needed. - if (params.withSwish) { + // Apply Silu if needed. + if (params.withSilu) { f2.x = f2.x * sigmoid(f2.x); f2.y = f2.y * sigmoid(f2.y); } @@ -431,7 +431,7 @@ int PrelnGroupnormActPluginDynamic::enqueue( if (cPerBlock > input_desc[0].dims.d[1]) { cPerBlock = 8; } - params_.withSwish = with_silu_; + params_.withSilu = with_silu_; params_.dst = static_cast(outputs[1]); params_.eleOut = static_cast(outputs[0]); params_.srcX = static_cast(inputs[0]); diff --git a/paddle/fluid/inference/tensorrt/plugin/skip_groupnorm_act_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/skip_groupnorm_act_op_plugin.cu index adba9324472a2..997205e918936 100644 --- a/paddle/fluid/inference/tensorrt/plugin/skip_groupnorm_act_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/skip_groupnorm_act_op_plugin.cu @@ -340,8 +340,8 @@ __global__ void skipGroupNormNHWCScaleKernel(GroupNormNHWCParams params) { f2.x = gammaF2.x * f2.x + betaF2.x; f2.y = gammaF2.y * f2.y + betaF2.y; - // Apply Swish if needed. - if (params.withSwish) { + // Apply Silu if needed. 
+ if (params.withSilu) { f2.x = f2.x * sigmoid(f2.x); f2.y = f2.y * sigmoid(f2.y); } @@ -439,7 +439,7 @@ int SkipGroupnormActPluginDynamic::enqueue( if (cPerBlock > input_desc[0].dims.d[1]) { cPerBlock = 8; } - params_.withSwish = true; + params_.withSilu = true; params_.dst = static_cast(outputs[0]); params_.srcX = static_cast(inputs[0]); params_.srcY = static_cast(inputs[1]); diff --git a/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt b/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt index d456a86aa9d28..bdcf6ab951022 100755 --- a/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt @@ -39,6 +39,7 @@ if(WIN32) "test_preln_groupnorm_act_fuse_pass") list(REMOVE_ITEM TEST_INFERENCE_IR_PASSES "test_element_groupnorm_act_fuse_pass") + list(REMOVE_ITEM TEST_INFERENCE_IR_PASSES "test_groupnorm_act_pass_fuse_pass") list(REMOVE_ITEM TEST_TRT_IR_PASSES "test_trt_convert_fused_token_prune") list(REMOVE_ITEM TEST_TRT_CONVERTER "test_trt_convert_fused_token_prune") endif() @@ -225,6 +226,8 @@ if(WITH_GPU AND TENSORRT_FOUND) PROPERTIES TIMEOUT 120) set_tests_properties(test_preln_groupnorm_act_fuse_pass PROPERTIES TIMEOUT 120) + set_tests_properties(test_groupnorm_act_pass_fuse_pass PROPERTIES TIMEOUT + 120) endif() endif() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_groupnorm_act_pass_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_groupnorm_act_pass_fuse_pass.py new file mode 100644 index 0000000000000..c9f821b21d4e9 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_groupnorm_act_pass_fuse_pass.py @@ -0,0 +1,150 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +from functools import partial + +import hypothesis.strategies as st +import numpy as np +from auto_scan_test import PassAutoScanTest +from program_config import OpConfig, ProgramConfig, TensorConfig + +import paddle.inference as paddle_infer + + +class TestElementGNActPass(PassAutoScanTest): + # + # | fuse | + # groupnorm -> groupnorm(with_silu) + # | | + # silu + # | + # + # + + def sample_predictor_configs(self, program_config): + # trt dynamic_shape + config = self.create_trt_inference_config() + config.enable_tensorrt_engine( + max_batch_size=1, + workspace_size=102400, + min_subgraph_size=0, + precision_mode=paddle_infer.PrecisionType.Half, + use_static=False, + use_calib_mode=False, + ) + config.set_trt_dynamic_shape_info( + { + "input_data": [1, 160, 1, 1], + }, + { + "input_data": [4, 1280, 64, 64], + }, + { + "input_data": [1, 320, 32, 32], + }, + ) + yield config, ['group_norm'], (3e-3, 1e-3) + + def sample_program_config(self, draw): + axis = draw(st.sampled_from([0, -1])) + epsilon = draw(st.floats(min_value=0.0000001, max_value=0.001)) + batch_size = draw(st.integers(min_value=1, max_value=4)) + + groups = draw(st.sampled_from([4, 8, 16, 32])) + hw = draw(st.sampled_from([1, 8, 16, 32])) + channel = draw(st.sampled_from([320, 1280])) + + def generate_input(attrs): + return np.random.random( + [attrs[1]["batch_size"], *attrs[1]["input_dim"]] + ).astype(np.float32) + + def generate_weight(attrs): + return np.random.random(attrs[1]['input_dim'][0]).astype(np.float32) + + attrs = [ + { + 'epsilon': epsilon, + 'groups': groups, + }, + { + 'batch_size': batch_size, + 'input_dim': [channel, hw, hw], + }, + ] + + group_norm_op = OpConfig( + type="group_norm", + inputs={ + "X": ["input_data"], + "Bias": ["group_norm_bias"], + "Scale": ["group_norm_scale"], + }, + outputs={ + "Y": ["group_norm_output1"], + "Mean": ["group_norm_output2"], + "Variance": ["group_norm_output3"], + }, + attrs={ + "data_layout": "NCHW", + "groups": attrs[0]["groups"], + "epsilon": attrs[0]["epsilon"], + }, + ) + silu_op = OpConfig( + type="silu", + inputs={ + "X": ["group_norm_output1"], + }, + outputs={ + "Out": ["silu_output"], + }, + ) + + program_config = ProgramConfig( + ops=[ + group_norm_op, + silu_op, + ], + weights={ + "group_norm_bias": TensorConfig( + data_gen=partial(generate_weight, attrs) + ), + "group_norm_scale": TensorConfig( + data_gen=partial(generate_weight, attrs) + ), + }, + inputs={ + "input_data": TensorConfig( + data_gen=partial(generate_input, attrs) + ), + }, + outputs=["silu_output"], + ) + + return program_config + + def test(self): + self.run_and_statis( + quant=False, + max_examples=50, + passes=["groupnorm_act_pass"], + max_duration=250, + min_success_num=50, + ) + + +if __name__ == "__main__": + unittest.main() From dc1b6511dd6673f82adc77d13b9526ac60523d3b Mon Sep 17 00:00:00 2001 From: RedContritio Date: Tue, 31 Jan 2023 20:18:33 +0800 Subject: [PATCH 50/89] support empty input for unique_consecutive (#49978) --- .../kernels/cpu/unique_consecutive_functor.h | 6 ++- .../unittests/test_unique_consecutive_op.py | 38 ++++++++++++++++++- 2 files changed, 41 insertions(+), 3 deletions(-) diff --git a/paddle/phi/kernels/cpu/unique_consecutive_functor.h b/paddle/phi/kernels/cpu/unique_consecutive_functor.h index 314c371bf7a64..73d196bbb98d9 100644 --- a/paddle/phi/kernels/cpu/unique_consecutive_functor.h +++ b/paddle/phi/kernels/cpu/unique_consecutive_functor.h @@ -51,9 +51,11 @@ static void UniqueConsecutiveFlattenedTensor(const Context& context, } } - int64_t 
output_size = p - out_vec.data() + 1; + bool is_empty = in.numel() == 0; + int64_t output_size = is_empty ? 0 : (p - out_vec.data() + 1); + if (return_counts) { - *q = in.numel() - last; + if (!is_empty) *q = in.numel() - last; counts_vec.resize(output_size); } out_vec.resize(output_size); diff --git a/python/paddle/fluid/tests/unittests/test_unique_consecutive_op.py b/python/paddle/fluid/tests/unittests/test_unique_consecutive_op.py index 86872aff9c7da..2c60fce518d37 100644 --- a/python/paddle/fluid/tests/unittests/test_unique_consecutive_op.py +++ b/python/paddle/fluid/tests/unittests/test_unique_consecutive_op.py @@ -32,12 +32,14 @@ def reference_unique_consecutive(X, return_inverse=False, return_counts=False): return_counts(bool, optional): If True, also return the counts for each unique consecutive element. """ X = list(X) + is_empty = len(X) == 0 counts_vec = [1] * len(X) i = 0 counts = 1 last = 0 inverse_vec = [0] * len(X) - inverse_vec[last] = i + if not is_empty: + inverse_vec[last] = i cnt = 0 while i < len(X) - 1: if X[i] == X[i + 1]: @@ -271,6 +273,40 @@ def test_dygraph(self): ) +class TestUniqueConsecutiveEmptyInput(OpTest): + """empty input""" + + def config(self): + self.return_inverse = True + self.return_counts = True + self.python_api = paddle.unique_consecutive + + def init_kernel_type(self): + self.dtype = "float32" if core.is_compiled_with_rocm() else "float64" + + def setUp(self): + self.init_kernel_type() + self.config() + self.op_type = "unique_consecutive" + x = np.array([]).astype(self.dtype) + result = reference_unique_consecutive( + x, self.return_inverse, self.return_counts + ) + out = reference_unique_consecutive(x) + out = np.array(out).astype(self.dtype) + self.inputs = { + 'X': x, + } + self.python_out_sig = ["Out"] + self.attrs = {'dtype': int(core.VarDesc.VarType.INT32)} + self.outputs = { + 'Out': out, + } + + def test_check_output(self): + self.check_output(check_eager=True) + + if __name__ == "__main__": paddle.enable_static() unittest.main() From 057ba778fa19c1b9670150d5ea5e83d6c8d64d04 Mon Sep 17 00:00:00 2001 From: limingshu <61349199+JamesLim-sy@users.noreply.github.com> Date: Wed, 1 Feb 2023 00:38:49 +0800 Subject: [PATCH 51/89] H2D data transfer optimization for split kernel (#49086) * profile reduce kernel for fp16 and reduceHigherdim * use reinterpret_cast * fix for CI on ROCm * add Macro for ROCm * ROCm CI config * ROCm CI config * unit test repair * pull * add common_funcs.h * reduceType * Update reduce_function.h * not higher * rename * implement of matmul using cublasLt instead of cublas * cublasLt bugfix * Update matmul_kernel_impl.h * Update matmul_kernel_impl_via_blasLt.h * for-loop-algo * PR comments changes * add macro * ci unused variable isCublasLt * ci unused variable isCublasLt macro * split matmul to autotune * rewrite the split kernel with segmented_array * rewrite the split kernel with segmented_array * rewrite the split kernel with segmented_array * add some method for cuda_graph * fix bugs for rocm * change for ci-error * i dont know why ci-model-benchmark gives a shit error, so i recover codes with original one to see if original codes work. 
* add some changes for passing mode_benchmark and coverage ci * fix ci error * fix ci-rocm error * add some changes for header --------- Co-authored-by: zhangbopd <1299246947@qq.com> Co-authored-by: Bo Zhang <105368690+zhangbopd@users.noreply.github.com> --- .../kernels/funcs/concat_and_split_functor.cu | 494 +++++++++--------- paddle/phi/kernels/funcs/segmented_array.h | 70 ++- paddle/phi/kernels/funcs/stack_and_unstack.h | 2 +- 3 files changed, 305 insertions(+), 261 deletions(-) diff --git a/paddle/phi/kernels/funcs/concat_and_split_functor.cu b/paddle/phi/kernels/funcs/concat_and_split_functor.cu index fa663528eb015..dc9150e4f2c56 100644 --- a/paddle/phi/kernels/funcs/concat_and_split_functor.cu +++ b/paddle/phi/kernels/funcs/concat_and_split_functor.cu @@ -14,8 +14,8 @@ limitations under the License. */ #include "paddle/phi/kernels/funcs/concat_and_split_functor.h" #include "paddle/fluid/memory/malloc.h" -#include "paddle/phi/backends/gpu/cuda/cuda_graph_with_memory_pool.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/kernels/funcs/segmented_array.h" namespace phi { namespace funcs { @@ -45,6 +45,12 @@ static inline void GetBlockDims(const phi::GPUContext& context, *grid_dims = dim3(grid_cols, grid_rows, 1); } +#if !defined(_WIN32) +#define PADDLE_ALIGN(x) __attribute__((aligned(x))) +#else +#define PADDLE_ALIGN(x) +#endif + template struct PointerWrapper { public: @@ -55,12 +61,29 @@ struct PointerWrapper { PointerWrapper(const phi::GPUContext& ctx, const std::vector& ins, const T** pre_alloced_host_ptr) { + SetInputAddr(ins); + } + + protected: + void SetInputAddr(const std::vector& ins) { for (auto i = 0; i < ins.size(); ++i) { ins_addr[i] = ins[i].data(); } } }; +template +struct PADDLE_ALIGN(256) AlignedPointerWrapper + : public PointerWrapper { + public: + AlignedPointerWrapper() {} + AlignedPointerWrapper(const phi::GPUContext& ctx, + const std::vector& ins, + const T** pre_alloced_host_ptr) { + this->SetInputAddr(ins); + } +}; + template struct PointerToPointer { public: @@ -93,7 +116,7 @@ struct PointerToPointer { }; template -struct PointerAndColWrapper { +struct PADDLE_ALIGN(256) PointerAndColWrapper { public: IndexT col_length[Size]; PointerAndColWrapper(const phi::GPUContext& ctx, @@ -151,6 +174,8 @@ struct PointerToPointerAndCol { PointerToPointer ins_ptr_wrapper; }; +#undef PADDLE_ALIGN + template struct alignas(MovSize) Packed { __device__ Packed() { @@ -358,10 +383,10 @@ void DispatchConcatWithSameShapeKernelLimitNum( dim3 grid_dims; GetBlockDims(ctx, out_row, out_col, &block_dims, &grid_dims); -#define IMPL_CONCAT_CUDA_KERNEL_CASE(size_, ...) \ - case size_: { \ - PointerWrapper ptr_array(ctx, ins, inputs_data); \ - __VA_ARGS__; \ +#define IMPL_CONCAT_CUDA_KERNEL_CASE(size_, ...) 
\ + case size_: { \ + AlignedPointerWrapper ptr_array(ctx, ins, inputs_data); \ + __VA_ARGS__; \ } break; switch (phi::backends::gpu::RoundToNextHighPowOfTwo(limit_num, 4)) { @@ -519,108 +544,6 @@ void DispatchConcatKernel(const phi::GPUContext& ctx, } } -template -__global__ void SplitKernel_(const T* input_data, - const int64_t in_row, - const int64_t in_col, - const int64_t* out_cols, - int out_cols_size, - T** outputs_data) { - int64_t curr_segment = 0; - int64_t curr_offset = out_cols[0]; - CUDA_KERNEL_LOOP_TYPE(tid_x, in_col, int64_t) { - int64_t curr_col_offset = out_cols[curr_segment + 1]; - while (curr_col_offset <= tid_x) { - curr_offset = curr_col_offset; - ++curr_segment; - curr_col_offset = out_cols[curr_segment + 1]; - } - - int64_t local_col = tid_x - curr_offset; - int64_t segment_width = curr_col_offset - curr_offset; - T* output_ptr = outputs_data[curr_segment]; - if (output_ptr != nullptr) { - int64_t tid_y = blockIdx.y * blockDim.y + threadIdx.y; - for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y) - output_ptr[tid_y * segment_width + local_col] = - input_data[tid_y * in_col + tid_x]; - } - } -} - -template -__device__ void SplitKernelDetail(const T* input_data, - const int64_t in_row, - const int64_t in_col, - const int64_t fixed_out_col, - T** outputs_data) { - CUDA_KERNEL_LOOP_TYPE(tid_x, in_col, int64_t) { - int64_t split = tid_x / fixed_out_col; - int64_t in_offset = tid_x - split * fixed_out_col; - T* output_ptr = outputs_data[split]; - if (output_ptr != nullptr) { - int64_t tid_y = blockIdx.y * blockDim.y + threadIdx.y; - for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y) - output_ptr[tid_y * fixed_out_col + in_offset] = - input_data[tid_y * in_col + tid_x]; - } - } -} - -template -__global__ void SplitKernel_(const T* input_data, - const int64_t in_row, - const int64_t in_col, - const int64_t fixed_out_col, - T** outputs_data) { - SplitKernelDetail(input_data, in_row, in_col, fixed_out_col, outputs_data); -} - -template -__global__ void SplitKernel_(const T* input_data, - const int64_t in_row, - const int64_t in_col, - const int64_t fixed_out_col, - T* outputs_addr0, - T* outputs_addr1) { - T* outputs_data[2]; - outputs_data[0] = outputs_addr0; - outputs_data[1] = outputs_addr1; - SplitKernelDetail(input_data, in_row, in_col, fixed_out_col, outputs_data); -} - -template -__global__ void SplitKernel_(const T* input_data, - const int64_t in_row, - const int64_t in_col, - const int64_t fixed_out_col, - T* outputs_addr0, - T* outputs_addr1, - T* outputs_addr2) { - T* outputs_data[3]; - outputs_data[0] = outputs_addr0; - outputs_data[1] = outputs_addr1; - outputs_data[2] = outputs_addr2; - SplitKernelDetail(input_data, in_row, in_col, fixed_out_col, outputs_data); -} - -template -__global__ void SplitKernel_(const T* input_data, - const int64_t in_row, - const int64_t in_col, - const int64_t fixed_out_col, - T* outputs_addr0, - T* outputs_addr1, - T* outputs_addr2, - T* outputs_addr3) { - T* outputs_data[4]; - outputs_data[0] = outputs_addr0; - outputs_data[1] = outputs_addr1; - outputs_data[2] = outputs_addr2; - outputs_data[3] = outputs_addr3; - SplitKernelDetail(input_data, in_row, in_col, fixed_out_col, outputs_data); -} - /* * All tensors' dimension should be the same and the values of * each dimension must be the same, except the axis dimension. 
@@ -708,37 +631,152 @@ struct ConcatFunctor { } }; -template -class SplitFunctor { +template +struct PointerAndColArray + : public funcs::PointerArraySetter { public: - void operator()(const phi::GPUContext& context, - const phi::DenseTensor& input, - const std::vector& ref_inputs, - int axis, - std::vector* outputs) { - // NOTE(zhiqiu): split a tensor of shape [0,3,4] at axis=1, result in 3 - // tensors of shape [0,1,4] - if (input.numel() == 0) { - return; + funcs::ValueArray val_array; + + PointerAndColArray() {} + PointerAndColArray(const phi::GPUContext& ctx, + const int out_col_num, + IndexT* out_cols, + std::vector* t, + T** pre_alloc_host_buf = nullptr) + : funcs::PointerArraySetter( + ctx, + t, + /*need_alloc=*/false, + /*use_cuda_graph=*/true, + pre_alloc_host_buf) { + IndexT* dev_ptr = nullptr; + if (Size == SegmentedArraySize::kVariableLength) { + size_t num_bytes = out_col_num * sizeof(IndexT); + dev_ptr = reinterpret_cast(this->AllocAndCopy( + ctx, reinterpret_cast(out_cols), num_bytes, true)); + val_array.Set(dev_ptr, out_col_num); + } else { + val_array.Set(out_cols, out_col_num); + } + } +}; + +template +__global__ void SplitTensorWithSameShape(const T* input_data, + const IndexT out_row, + const IndexT cumulative_col, + const IndexT fixed_out_col, + DataArrayT data_array) { + CUDA_KERNEL_LOOP_TYPE(tid_x, cumulative_col, IndexT) { + IndexT split = tid_x / fixed_out_col; + IndexT in_offset = tid_x - split * fixed_out_col; + T* output_ptr = data_array.data[split]; + if (output_ptr != nullptr) { + IndexT tid_y = blockIdx.y * blockDim.y + threadIdx.y; + for (; tid_y < out_row; tid_y += blockDim.y * gridDim.y) + output_ptr[tid_y * fixed_out_col + in_offset] = + input_data[tid_y * cumulative_col + tid_x]; + } + } +} + +template +__global__ void SplitTensorWithDifferentShape(const T* input_data, + const IndexT out_row, + const IndexT cumulative_col, + DataArrayT data_array, + ValArrayT col_array) { + IndexT curr_segment = 0; + IndexT curr_offset = col_array.data[0]; + CUDA_KERNEL_LOOP_TYPE(tid_x, cumulative_col, IndexT) { + IndexT curr_col_offset = col_array.data[curr_segment + 1]; + while (curr_col_offset <= tid_x) { + curr_offset = curr_col_offset; + ++curr_segment; + curr_col_offset = col_array.data[curr_segment + 1]; } - // TODO(zcd): Add input data validity checking - int o_num = outputs->size(); - int64_t out_row = 1; - auto dim_0 = ref_inputs[0]->dims(); - for (int i = 0; i < axis; ++i) { - out_row *= dim_0[i]; + IndexT local_col = tid_x - curr_offset; + IndexT segment_width = curr_col_offset - curr_offset; + T* output_ptr = data_array.data[curr_segment]; + if (output_ptr != nullptr) { + IndexT tid_y = blockIdx.y * blockDim.y + threadIdx.y; + for (; tid_y < out_row; tid_y += blockDim.y * gridDim.y) + output_ptr[tid_y * segment_width + local_col] = + input_data[tid_y * cumulative_col + tid_x]; } + } +} + +template +void SplitFunctionDispatchWithSameShape(const phi::GPUContext& ctx, + const IndexT out_col, + const IndexT out_row, + const IndexT cumulative_col, + const T* input_data, + std::vector* outs, + T** pre_alloc_host_buf) { + dim3 grid_dims; + dim3 block_dims; + GetBlockDims(ctx, out_row, cumulative_col, &block_dims, &grid_dims); + + funcs::PointerArraySetter setter( + ctx, + outs, + /*need_alloc=*/false, + /*use_cuda_graph=*/true, + pre_alloc_host_buf); + SplitTensorWithSameShape + <<>>( + input_data, out_row, cumulative_col, out_col, setter.array); +} + +template +void SplitFunctionDispatchWithDifferentShape( + const phi::GPUContext& ctx, + const int out_col_num, + 
const IndexT out_row, + const IndexT cumulative_col, + const T* input_data, + std::vector* outs, + IndexT* output_cols, + T** pre_alloc_host_buf) { + dim3 grid_dims; + dim3 block_dims; + GetBlockDims(ctx, out_row, cumulative_col, &block_dims, &grid_dims); + PointerAndColArray setter( + ctx, out_col_num, output_cols, outs, pre_alloc_host_buf); + + SplitTensorWithDifferentShape + <<>>( + input_data, out_row, cumulative_col, setter.array, setter.val_array); +} - int64_t out0_col = ref_inputs[0]->numel() / out_row; - int64_t in_col = 0, in_row = out_row; - bool has_same_shape = true; +template +void SplitFunctorDispatchWithIndexType( + const phi::GPUContext& ctx, + int axis, + const phi::DenseTensor& input, + const std::vector& ref_ins, + std::vector* outs) { + // TODO(zcd): Add input data validity checking + int out_num = outs->size(); + IndexT out_row = 1; + auto ref_dim = ref_ins[0]->dims(); + for (int i = 0; i < axis; ++i) { + out_row *= ref_dim[i]; + } + IndexT out_col = ref_ins[0]->numel() / out_row; + IndexT cumulative_col = 0; + bool has_same_shape = true; - int outputs_cols_num = o_num + 1; - std::vector outputs_data_vec(o_num); - std::vector outputs_cols_vec(outputs_cols_num); - T** outputs_data = outputs_data_vec.data(); - int64_t* outputs_cols = outputs_cols_vec.data(); + int out_cols_num = out_num + 1; + std::vector outputs_cols_vec(out_cols_num, 0); + IndexT* outs_cols = outputs_cols_vec.data(); + T** outs_data = nullptr; // There are some differences between hip runtime and NV runtime. // In NV, when the pageable memory data less than 64K is transferred from @@ -748,128 +786,90 @@ class SplitFunctor { // 3.2.6.1. Concurrent Execution between Host and Device // Memory copies from host to device of a memory block of 64 KB or less #ifdef PADDLE_WITH_HIP - paddle::memory::AllocationPtr data_alloc, cols_alloc; - // TODO(chentianyu03): try to find a method to remove the Alloc function - data_alloc = paddle::memory::Alloc(paddle::platform::CUDAPinnedPlace(), - o_num * sizeof(T*)); - outputs_data = reinterpret_cast(data_alloc->ptr()); - // TODO(chentianyu03): try to find a method to remove the Alloc function - cols_alloc = paddle::memory::Alloc(paddle::platform::CUDAPinnedPlace(), - (outputs_cols_num) * sizeof(int64_t)); - outputs_cols = reinterpret_cast(cols_alloc->ptr()); + paddle::memory::AllocationPtr data_alloc, cols_alloc; + // TODO(chentianyu03): try to find a method to remove the Alloc function + data_alloc = paddle::memory::Alloc(paddle::platform::CUDAPinnedPlace(), + out_num * sizeof(T*)); + outs_data = reinterpret_cast(data_alloc->ptr()); + // TODO(chentianyu03): try to find a method to remove the Alloc function + cols_alloc = paddle::memory::Alloc(paddle::platform::CUDAPinnedPlace(), + (out_cols_num) * sizeof(IndexT)); + outs_cols = reinterpret_cast(cols_alloc->ptr()); #endif - outputs_cols[0] = 0; - for (int i = 0; i < o_num; ++i) { - int64_t t_col = ref_inputs.at(i)->numel() / out_row; - if (has_same_shape) { - if (t_col != out0_col) has_same_shape = false; - } - in_col += t_col; - outputs_cols[i + 1] = in_col; - if (outputs->at(i) != nullptr) { - outputs_data[i] = outputs->at(i)->data(); - } else { - outputs_data[i] = nullptr; - } + outs_cols[0] = 0; + for (int i = 0; i < out_num; ++i) { + IndexT t_col = ref_ins.at(i)->numel() / out_row; + if (has_same_shape) { + has_same_shape &= (t_col == cumulative_col); } - - dim3 block_dims; - dim3 grid_dims; - GetBlockDims(context, out_row, in_col, &block_dims, &grid_dims); - - paddle::memory::allocation::AllocationPtr 
tmp_dev_outs_data; - T** dev_out_gpu_data = nullptr; - if (!has_same_shape || o_num < 2 || o_num > 4) { - // TODO(chentianyu03): try to find a method to remove the Alloc function - tmp_dev_outs_data = paddle::memory::Alloc( - context.GetPlace(), - o_num * sizeof(T*), - phi::Stream(reinterpret_cast(context.stream()))); - auto* restored = phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph( - outputs_data, o_num); - paddle::memory::Copy(context.GetPlace(), - tmp_dev_outs_data->ptr(), - phi::CPUPlace(), - restored, - o_num * sizeof(T*), - context.stream()); - dev_out_gpu_data = reinterpret_cast(tmp_dev_outs_data->ptr()); + cumulative_col += t_col; + outs_cols[i + 1] = cumulative_col; + } + int limit_num = has_same_shape ? out_num : out_cols_num; + if (has_same_shape) { + switch (funcs::CalcArraySize(limit_num)) { + SEGMENTED_ARRAY_KERNEL_HELPER( + SplitFunctionDispatchWithSameShape( + ctx, + out_col, + out_row, + cumulative_col, + input.data(), + outs, + outs_data)); } - - if (has_same_shape) { - if (o_num == 2) { - SplitKernel_<<>>( - input.data(), - in_row, - in_col, - out0_col, - outputs_data[0], - outputs_data[1]); - } else if (o_num == 3) { - SplitKernel_<<>>( - input.data(), - in_row, - in_col, - out0_col, - outputs_data[0], - outputs_data[1], - outputs_data[2]); - } else if (o_num == 4) { - SplitKernel_<<>>( - input.data(), - in_row, - in_col, - out0_col, - outputs_data[0], - outputs_data[1], - outputs_data[2], - outputs_data[3]); - } else { - SplitKernel_<<>>( - input.data(), in_row, in_col, out0_col, dev_out_gpu_data); - } - } else { - auto tmp_dev_ins_col_data = - // TODO(chentianyu03): try to find a method to remove the Alloc - // function - paddle::memory::Alloc( - context.GetPlace(), - outputs_cols_num * sizeof(int64_t), - phi::Stream(reinterpret_cast(context.stream()))); - auto* restored = phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph( - outputs_cols, outputs_cols_num); - paddle::memory::Copy(context.GetPlace(), - tmp_dev_ins_col_data->ptr(), - phi::CPUPlace(), - restored, - outputs_cols_num * sizeof(int64_t), - context.stream()); - int64_t* dev_outs_col_data = - reinterpret_cast(tmp_dev_ins_col_data->ptr()); - - SplitKernel_<<>>( - input.data(), - in_row, - in_col, - dev_outs_col_data, - static_cast(outputs_cols_num), - dev_out_gpu_data); + } else { + switch (funcs::CalcArraySize(limit_num)) { + SEGMENTED_ARRAY_KERNEL_HELPER( + SplitFunctionDispatchWithDifferentShape( + ctx, + out_cols_num, + out_row, + cumulative_col, + input.data(), + outs, + outs_cols, + outs_data)); } + } #ifdef PADDLE_WITH_HIP - // Prevent the pinned memory value from being covered and release the memory - // after the launch kernel of the stream is executed (reapply pinned memory - // next time) - auto* data_alloc_released = data_alloc.release(); - auto* cols_alloc_released = cols_alloc.release(); - context.AddStreamCallback([data_alloc_released, cols_alloc_released] { - paddle::memory::allocation::Allocator::AllocationDeleter( - data_alloc_released); - paddle::memory::allocation::Allocator::AllocationDeleter( - cols_alloc_released); - }); + // Prevent pinned memory from being covered and release the memory after + // kernel launch of the stream is executed (reapply pinned memory next time) + auto* data_alloc_released = data_alloc.release(); + auto* cols_alloc_released = cols_alloc.release(); + ctx.AddStreamCallback([data_alloc_released, cols_alloc_released] { + paddle::memory::allocation::Allocator::AllocationDeleter( + data_alloc_released); + 
paddle::memory::allocation::Allocator::AllocationDeleter( + cols_alloc_released); + }); #endif +} + +template +class SplitFunctor { + public: + void operator()(const phi::GPUContext& context, + const phi::DenseTensor& input, + const std::vector& ref_inputs, + int axis, + std::vector* outputs) { + int64_t numel = input.numel(); + // NOTE(zhiqiu): split a tensor of shape [0,3,4] at axis=1, result in + // 3 tensors of shape [0,1,4] + if (input.numel() == 0) { + return; + } + + if (numel < std::numeric_limits::max()) { + SplitFunctorDispatchWithIndexType( + context, axis, input, ref_inputs, outputs); + } else { + SplitFunctorDispatchWithIndexType( + context, axis, input, ref_inputs, outputs); + } } }; diff --git a/paddle/phi/kernels/funcs/segmented_array.h b/paddle/phi/kernels/funcs/segmented_array.h index aa03eb4e9fcd2..cacaa8f81fe86 100644 --- a/paddle/phi/kernels/funcs/segmented_array.h +++ b/paddle/phi/kernels/funcs/segmented_array.h @@ -14,6 +14,7 @@ #pragma once +#include "paddle/phi/backends/gpu/cuda/cuda_graph_with_memory_pool.h" #include "paddle/phi/core/dense_tensor.h" namespace phi { @@ -34,6 +35,26 @@ enum class SegmentedArraySize { kFixed64 = 64, }; +template (Size)> +struct PADDLE_ALIGN(256) ValueArray { + public: + T data[Num]; + + void Set(T* ptr, const int num) { + for (auto i = 0; i < num; ++i) { + data[i] = ptr[i]; + } + } +}; + +template +struct PADDLE_ALIGN(256) ValueArray { + public: + T* data{nullptr}; + + void Set(T* ptr, const int num) { data = ptr; } +}; + template struct PADDLE_ALIGN(256) ConstPointerArray { public: @@ -62,8 +83,8 @@ struct PADDLE_ALIGN(256) PointerArray { public: T* data[static_cast(Size)]; - void Set(const std::vector& ptrs, T** dev_ptr = nullptr) { - for (auto i = 0; i < ptrs.size(); ++i) { + void Set(T** ptrs, const int num, T** dev_ptr = nullptr) { + for (auto i = 0; i < num; ++i) { data[i] = ptrs[i]; } } @@ -74,9 +95,7 @@ struct PADDLE_ALIGN(256) PointerArray { public: T** data{nullptr}; - void Set(const std::vector& ptrs, T** dev_ptr = nullptr) { - data = dev_ptr; - } + void Set(T** ptrs, const int num, T** dev_ptr = nullptr) { data = dev_ptr; } }; #undef PADDLE_ALIGN @@ -84,13 +103,24 @@ struct PADDLE_ALIGN(256) PointerArray { template struct ArraySetterBase { protected: - void* AllocAndCopy(const Context& ctx, void* src, size_t num_bytes) { + void* AllocAndCopy(const Context& ctx, + void* src, + size_t num_bytes, + bool use_cuda_graph = false) { allocation = paddle::memory::Alloc( ctx.GetPlace(), num_bytes, phi::Stream(reinterpret_cast(ctx.stream()))); + + int8_t* restored = reinterpret_cast(src); +#ifdef PADDLE_WITH_CUDA + if (use_cuda_graph) { + restored = phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph( + restored, num_bytes); + } +#endif phi::backends::gpu::GpuMemcpyAsync(allocation->ptr(), - src, + restored, num_bytes, phi::gpuMemcpyHostToDevice, ctx.stream()); @@ -131,13 +161,28 @@ struct PointerArraySetter : public ArraySetterBase { public: PointerArray array; - PointerArraySetter(const Context& ctx, std::vector* t) { + // need_alloc : tensor data needs extra buffer or not. + // use_cuda_graph: tensor data shall be captured by cuda_graph or not. + // pre_alloc_host_buf: tensor data is temporaily stored by pinned memory or + // not. 
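+  // Summary of the constructor below: output pointers are first gathered on
+  // the host into `ptrs` (or into `pre_alloc_host_buf`, the pinned-memory
+  // staging buffer used under PADDLE_WITH_HIP), and for kVariableLength
+  // arrays they are copied to device via AllocAndCopy, optionally restoring
+  // host memory when a CUDA graph is being captured.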
+ PointerArraySetter(const Context& ctx, + std::vector* t, + bool need_alloc = false, + bool use_cuda_graph = false, + T** pre_alloc_host_buf = nullptr) { ptrs.resize(t->size()); + T** data_ptr = ptrs.data(); +#ifdef PADDLE_WITH_HIP + if (pre_alloc_host_buf) { + data_ptr = pre_alloc_host_buf; + } +#endif for (int i = 0; i < t->size(); ++i) { if (t->at(i) && (t->at(i)->numel() > 0)) { - ptrs[i] = ctx.template Alloc(t->at(i)); + data_ptr[i] = + need_alloc ? ctx.template Alloc(t->at(i)) : t->at(i)->data(); } else { - ptrs[i] = nullptr; + data_ptr[i] = nullptr; } } @@ -145,10 +190,9 @@ struct PointerArraySetter : public ArraySetterBase { if (Size == SegmentedArraySize::kVariableLength) { size_t num_bytes = t->size() * sizeof(T*); dev_ptr = reinterpret_cast(this->AllocAndCopy( - ctx, reinterpret_cast(ptrs.data()), num_bytes)); + ctx, reinterpret_cast(data_ptr), num_bytes, use_cuda_graph)); } - - array.Set(ptrs, dev_ptr); + array.Set(data_ptr, t->size(), dev_ptr); } private: diff --git a/paddle/phi/kernels/funcs/stack_and_unstack.h b/paddle/phi/kernels/funcs/stack_and_unstack.h index c516d4892bf62..0b2b5443383a9 100644 --- a/paddle/phi/kernels/funcs/stack_and_unstack.h +++ b/paddle/phi/kernels/funcs/stack_and_unstack.h @@ -192,7 +192,7 @@ void LaunchUnStackKernel(const Context& ctx, << ", out_col=" << out_col << ", num_splits=" << num_splits; auto x_ptr = x.data(); - PointerArraySetter setter(ctx, outs); + PointerArraySetter setter(ctx, outs, /*need_alloc=*/true); if (out_col == 1) { // For the case axis == (x.dims().size() - 1) From 9f23114793dfb44445fb39df63f1dc92bdff9c53 Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Wed, 1 Feb 2023 09:22:54 +0800 Subject: [PATCH 52/89] [PrimCinn]Fix some vars are wrongly gc in CINN+InterpreterCore (#50116) * [PrimCinn]Fix some vars are wrongly gc in CINN+InterpreterCore * fix baseline unittest config * fix code style --- paddle/fluid/operators/cinn/cinn_launch_context.cc | 8 ++++++-- .../unittests/prim/prim/vjp/static/test_comp_add_grad.py | 2 +- .../prim/prim/vjp/static/test_comp_add_tanh_grad.py | 2 +- .../unittests/prim/prim/vjp/static/test_comp_div_grad.py | 2 +- .../unittests/prim/prim/vjp/static/test_comp_sqrt_grad.py | 2 +- .../unittests/prim/prim/vjp/static/test_comp_sub_grad.py | 2 +- .../unittests/prim/prim/vjp/static/test_comp_tanh_grad.py | 2 +- 7 files changed, 12 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/operators/cinn/cinn_launch_context.cc b/paddle/fluid/operators/cinn/cinn_launch_context.cc index af429e0f01e33..0b999ccab016f 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_context.cc +++ b/paddle/fluid/operators/cinn/cinn_launch_context.cc @@ -119,12 +119,16 @@ CinnLaunchContext::CinnLaunchContext(const framework::ir::Graph& graph, // collect variables name list to be skipped in GC skip_eager_vars_.reserve(input_var_names.size() + output_var_names.size()); auto add_skip_var_fn = [&outer_varinfo, this](const std::string& var_name) { + // Always consider Input/Output of Graph as skip_gc_vars, because + // InterpreterCore has no eager_deletion_op to deal with it. 
+ + VLOG(4) << "Append a skip_gc_var for InterpreterCore:" << var_name; + skip_gc_vars_.insert(var_name); // if a var exists at the outer_varinfo map, that means it will be // erased by the following eager_deletion_op of current cinn_launch op if (!outer_varinfo.count(var_name)) { skip_eager_vars_.emplace_back(var_name); - skip_gc_vars_.insert(var_name); - VLOG(4) << "Append a skip_gc_var:" << var_name; + VLOG(4) << "Append a skip_gc_var for PE:" << var_name; } }; std::for_each( diff --git a/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_add_grad.py b/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_add_grad.py index 1673ff083e7cf..50ef9f6f13036 100644 --- a/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_add_grad.py +++ b/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_add_grad.py @@ -91,7 +91,7 @@ def train(self, use_prim, use_cinn): def test_cinn(self): paddle.disable_static() dy_res = self.train(use_prim=False, use_cinn=False) - comp_st_cinn_res = self.train(use_prim=True, use_cinn=False) + comp_st_cinn_res = self.train(use_prim=True, use_cinn=True) for i in range(len(dy_res)): np.testing.assert_allclose( diff --git a/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_add_tanh_grad.py b/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_add_tanh_grad.py index 5dd7417130bc1..b037cc73bfd54 100644 --- a/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_add_tanh_grad.py +++ b/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_add_tanh_grad.py @@ -92,7 +92,7 @@ def train(self, use_prim, use_cinn): def test_cinn(self): paddle.disable_static() dy_res = self.train(use_prim=False, use_cinn=False) - comp_st_cinn_res = self.train(use_prim=True, use_cinn=False) + comp_st_cinn_res = self.train(use_prim=True, use_cinn=True) for i in range(len(dy_res)): np.testing.assert_allclose( diff --git a/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_div_grad.py b/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_div_grad.py index 95d3c3027fd9d..606b55b5a95c0 100644 --- a/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_div_grad.py +++ b/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_div_grad.py @@ -91,7 +91,7 @@ def train(self, use_prim, use_cinn): def test_cinn(self): paddle.disable_static() dy_res = self.train(use_prim=False, use_cinn=False) - comp_st_cinn_res = self.train(use_prim=True, use_cinn=False) + comp_st_cinn_res = self.train(use_prim=True, use_cinn=True) for i in range(len(dy_res)): np.testing.assert_allclose( diff --git a/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_sqrt_grad.py b/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_sqrt_grad.py index 8df50c768c2b7..8e623100dd09c 100644 --- a/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_sqrt_grad.py +++ b/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_sqrt_grad.py @@ -70,7 +70,7 @@ def train(self, use_prim, use_cinn): def test_cinn(self): paddle.disable_static() dy_res = self.train(use_prim=False, use_cinn=False) - comp_st_cinn_res = self.train(use_prim=True, use_cinn=False) + comp_st_cinn_res = self.train(use_prim=True, use_cinn=True) for i in range(len(dy_res)): np.testing.assert_allclose( diff --git a/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_sub_grad.py 
b/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_sub_grad.py index 693bf8b942bab..3245d118760b2 100644 --- a/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_sub_grad.py +++ b/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_sub_grad.py @@ -92,7 +92,7 @@ def train(self, use_prim, use_cinn): def test_cinn(self): paddle.disable_static() dy_res = self.train(use_prim=False, use_cinn=False) - comp_st_cinn_res = self.train(use_prim=True, use_cinn=False) + comp_st_cinn_res = self.train(use_prim=True, use_cinn=True) for i in range(len(dy_res)): np.testing.assert_allclose( diff --git a/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_tanh_grad.py b/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_tanh_grad.py index e643cf620a811..d28f84a685b0d 100644 --- a/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_tanh_grad.py +++ b/python/paddle/fluid/tests/unittests/prim/prim/vjp/static/test_comp_tanh_grad.py @@ -70,7 +70,7 @@ def train(self, use_prim, use_cinn): def test_cinn(self): paddle.disable_static() dy_res = self.train(use_prim=False, use_cinn=False) - comp_st_cinn_res = self.train(use_prim=True, use_cinn=False) + comp_st_cinn_res = self.train(use_prim=True, use_cinn=True) for i in range(len(dy_res)): np.testing.assert_allclose( From 3e9d854842aba82b900b7c578c2d125c3a3d18cf Mon Sep 17 00:00:00 2001 From: LiYuRio <63526175+LiYuRio@users.noreply.github.com> Date: Wed, 1 Feb 2023 10:32:13 +0800 Subject: [PATCH 53/89] fix gc and infinite buffer size (#50122) --- .../fleet_executor/compute_interceptor.cc | 42 +++++++++++-------- .../fleet_executor/compute_interceptor.h | 2 + .../fleet_executor/fleet_executor.cc | 27 ++++++------ python/paddle/fluid/executor.py | 7 ++-- .../test_fleet_executor_cond_interceptor.py | 17 +++++--- 5 files changed, 56 insertions(+), 39 deletions(-) diff --git a/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc b/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc index 9aedaa131400f..a03ac900e9f66 100644 --- a/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc +++ b/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc @@ -50,14 +50,17 @@ void ComputeInterceptor::IncreaseReady(int64_t up_id) { auto max_ready_size = it->second.first; auto ready_size = it->second.second; ready_size += 1; - PADDLE_ENFORCE_LE(ready_size, - max_ready_size, - platform::errors::OutOfRange( - "upstream=%lld ready_size must <= max_ready_size, but " - "now ready_size=%lld, max_ready_size=%lld", - up_id, - ready_size, - max_ready_size)); + if (max_ready_size != INFINITE_BUFFER_SIZE) { + PADDLE_ENFORCE_LE( + ready_size, + max_ready_size, + platform::errors::OutOfRange( + "upstream=%lld ready_size must <= max_ready_size, but " + "now ready_size=%lld, max_ready_size=%lld", + up_id, + ready_size, + max_ready_size)); + } it->second.second = ready_size; } @@ -96,6 +99,9 @@ bool ComputeInterceptor::CanWriteOutput() { for (auto& outs : out_buffs_) { auto max_buffer_size = outs.second.first; auto used_size = outs.second.second; + if (max_buffer_size == INFINITE_BUFFER_SIZE) { + continue; + } // full, return false if (used_size == max_buffer_size) { VLOG(3) << "Interceptor " << GetInterceptorId() @@ -112,15 +118,17 @@ void ComputeInterceptor::SendDataReadyToDownStream() { auto max_buff_size = outs.second.first; auto used_size = outs.second.second; used_size += 1; - PADDLE_ENFORCE_LE( - used_size, - max_buff_size, - platform::errors::OutOfRange("downstream=%lld 
used buff size must <= " - "max_buff_size, but now used_size=%lld, " - "max_buff_size=%lld", - down_id, - used_size, - max_buff_size)); + if (max_buff_size != INFINITE_BUFFER_SIZE) { + PADDLE_ENFORCE_LE( + used_size, + max_buff_size, + platform::errors::OutOfRange("downstream=%lld used buff size must <= " + "max_buff_size, but now used_size=%lld, " + "max_buff_size=%lld", + down_id, + used_size, + max_buff_size)); + } outs.second.second = used_size; InterceptorMessage ready_msg; diff --git a/paddle/fluid/distributed/fleet_executor/compute_interceptor.h b/paddle/fluid/distributed/fleet_executor/compute_interceptor.h index 9709cd4437f10..eade47fd8787e 100644 --- a/paddle/fluid/distributed/fleet_executor/compute_interceptor.h +++ b/paddle/fluid/distributed/fleet_executor/compute_interceptor.h @@ -22,6 +22,8 @@ namespace paddle { namespace distributed { +const int64_t INFINITE_BUFFER_SIZE = -1; + class ComputeInterceptor : public Interceptor { public: ComputeInterceptor(int64_t interceptor_id, TaskNode* node); diff --git a/paddle/fluid/distributed/fleet_executor/fleet_executor.cc b/paddle/fluid/distributed/fleet_executor/fleet_executor.cc index 88363696ede25..ae3776d2c5bea 100644 --- a/paddle/fluid/distributed/fleet_executor/fleet_executor.cc +++ b/paddle/fluid/distributed/fleet_executor/fleet_executor.cc @@ -111,21 +111,22 @@ void FleetExecutor::Init( task_node->SetUnusedVars(unused_vars); if (task_node->type() == "Cond") { std::vector while_block_vars; - std::vector vars_in_parent; - std::vector vars_in_sub; - for (auto& var : program_desc.Block(0).AllVars()) { - vars_in_parent.emplace_back(var->Name()); - } + VLOG(3) << "Vars in while sub block:"; for (auto& var : program_desc.Block(1).AllVars()) { - vars_in_sub.emplace_back(var->Name()); + VLOG(3) << var->Name(); + while_block_vars.emplace_back(var->Name()); + } + for (const auto& pair : unused_vars) { + if (pair.first->Type() == "while") { + for (const auto& var_name : pair.second) { + while_block_vars.emplace_back(var_name); + } + } + } + VLOG(3) << "Vars below will be removed after while:"; + for (const auto& name : while_block_vars) { + VLOG(3) << name; } - std::sort(vars_in_parent.begin(), vars_in_parent.end()); - std::sort(vars_in_sub.begin(), vars_in_sub.end()); - std::set_difference(vars_in_sub.begin(), - vars_in_sub.end(), - vars_in_parent.begin(), - vars_in_parent.end(), - std::back_inserter(while_block_vars)); task_node->SetWhileBlockVars(while_block_vars); } int64_t interceptor_id = task_node->task_id(); diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index da9d12802434f..6e094588e686a 100755 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -2534,8 +2534,9 @@ def _prepare_fleet_executor_carrier( place = core.Place() place.set_place(self.place) - # NOTE: the last argument is used to force create some vars in root scope, - # won't be used during train. 
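+        # Forward the fetch targets (when fleet_opt provides a "fetch_var"
+        # entry) as the carrier's root-scope vars; otherwise fall back to an
+        # empty list as before. Illustrative example (names are made up):
+        #   fleet_opt = {"fetch_var": ["loss_0.tmp_0"], ...}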
+ inference_root_scope_vars = ( + fleet_opt["fetch_var"] if "fetch_var" in fleet_opt else [] + ) self._fleet_executor.init( carrier_id, program.desc, @@ -2544,7 +2545,7 @@ def _prepare_fleet_executor_carrier( num_micro_batches, tasks, task_id_to_rank, - [], + inference_root_scope_vars, micro_scope_list, ) diff --git a/python/paddle/fluid/tests/unittests/test_fleet_executor_cond_interceptor.py b/python/paddle/fluid/tests/unittests/test_fleet_executor_cond_interceptor.py index 1ca8c869a96bd..f6418cdee2cce 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_executor_cond_interceptor.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_executor_cond_interceptor.py @@ -165,19 +165,24 @@ def test_cond_interceptor(self): lazy_initialize=True, ) + infinite_buff_size = -1 task_a.add_downstream_task(task_b.task_id(), 2) task_b.add_upstream_task(task_a.task_id(), 2) - task_b.add_downstream_task(task_c.task_id(), 100) - task_c.add_upstream_task(task_b.task_id(), 100) + task_b.add_downstream_task(task_c.task_id(), infinite_buff_size) + task_c.add_upstream_task(task_b.task_id(), infinite_buff_size) task_c.add_downstream_task(task_d.task_id(), 2) task_d.add_upstream_task(task_c.task_id(), 2) - task_d.add_downstream_task(task_b.task_id(), 100, core.DependType.LOOP) - task_b.add_upstream_task(task_d.task_id(), 100, core.DependType.LOOP) + task_d.add_downstream_task( + task_b.task_id(), infinite_buff_size, core.DependType.LOOP + ) + task_b.add_upstream_task( + task_d.task_id(), infinite_buff_size, core.DependType.LOOP + ) task_b.add_downstream_task( - task_e.task_id(), 100, core.DependType.STOP_LOOP + task_e.task_id(), infinite_buff_size, core.DependType.STOP_LOOP ) task_e.add_upstream_task( - task_b.task_id(), 100, core.DependType.STOP_LOOP + task_b.task_id(), infinite_buff_size, core.DependType.STOP_LOOP ) main_program._pipeline_opt = { From 7f1a1570c68985b8649edbe484b812ab82df26bb Mon Sep 17 00:00:00 2001 From: RedContritio Date: Wed, 1 Feb 2023 10:37:18 +0800 Subject: [PATCH 54/89] Fix Python IndexError of case1: paddle.linalg.lstsq (#49985) --- .../tests/unittests/test_linalg_lstsq_op.py | 33 +++++++++++++++++++ python/paddle/tensor/linalg.py | 19 +++++++++-- 2 files changed, 49 insertions(+), 3 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_linalg_lstsq_op.py b/python/paddle/fluid/tests/unittests/test_linalg_lstsq_op.py index 82576ab1bd1bf..94dc901a56d0c 100644 --- a/python/paddle/fluid/tests/unittests/test_linalg_lstsq_op.py +++ b/python/paddle/fluid/tests/unittests/test_linalg_lstsq_op.py @@ -278,5 +278,38 @@ def init_config(self): self._input_shape_2 = (50, 300) +class TestLinalgLstsqAPIError(unittest.TestCase): + def setUp(self): + pass + + def test_api_errors(self): + def test_x_bad_shape(): + x = paddle.to_tensor(np.random.random(size=(5)), dtype=np.float32) + y = paddle.to_tensor( + np.random.random(size=(5, 15)), dtype=np.float32 + ) + out = paddle.linalg.lstsq(x, y, driver='gelsy') + + def test_y_bad_shape(): + x = paddle.to_tensor( + np.random.random(size=(5, 10)), dtype=np.float32 + ) + y = paddle.to_tensor(np.random.random(size=(5)), dtype=np.float32) + out = paddle.linalg.lstsq(x, y, driver='gelsy') + + def test_shape_dismatch(): + x = paddle.to_tensor( + np.random.random(size=(5, 10)), dtype=np.float32 + ) + y = paddle.to_tensor( + np.random.random(size=(4, 15)), dtype=np.float32 + ) + out = paddle.linalg.lstsq(x, y, driver='gelsy') + + self.assertRaises(ValueError, test_x_bad_shape) + self.assertRaises(ValueError, test_y_bad_shape) + 
self.assertRaises(ValueError, test_shape_dismatch) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py index 4cce1b01968a1..46f11130c0354 100644 --- a/python/paddle/tensor/linalg.py +++ b/python/paddle/tensor/linalg.py @@ -3171,13 +3171,26 @@ def lstsq(x, y, rcond=None, driver=None, name=None): else: raise RuntimeError("Only support lstsq api for CPU or CUDA device.") - if x.dtype == y.dtype and x.dtype in (paddle.float32, paddle.float64): - pass - else: + if not (x.dtype == y.dtype and x.dtype in (paddle.float32, paddle.float64)): raise ValueError( "Only support x and y have the same dtype such as 'float32' and 'float64'." ) + if x.ndim < 2: + raise ValueError( + f"The shape of x should be (*, M, N), but received ndim is [{x.ndim} < 2]" + ) + + if y.ndim < 2: + raise ValueError( + f"The shape of y should be (*, M, K), but received ndim is [{y.ndim} < 2]" + ) + + if x.shape[-2] != y.shape[-2]: + raise ValueError( + f"x with shape (*, M = {x.shape[-2]}, N) and y with shape (*, M = {y.shape[-2]}, K) should have same M." + ) + if rcond is None: if x.dtype == paddle.float32: rcond = 1e-7 * max(x.shape[-2], x.shape[-1]) From 9ce8cfcf04fd53f1aa57d8e08d82b39eed3aaf3f Mon Sep 17 00:00:00 2001 From: RedContritio Date: Wed, 1 Feb 2023 10:44:04 +0800 Subject: [PATCH 55/89] =?UTF-8?q?Fix=20UFA=E9=9D=9E=E6=B3=95=E5=9C=B0?= =?UTF-8?q?=E5=9D=80=E8=AE=BF=E9=97=AE(UFA=20illegal=20address=20access)?= =?UTF-8?q?=20of=20case4:=20paddle.unbind=20(#49995)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add axis check for unbind * add axis range check for unbind * update unittest and axis validation for unbind * add unittest invalid axis for unbind * restore axis extract for unbind --- paddle/phi/infermeta/unary.cc | 13 +++++++++++++ .../fluid/tests/unittests/test_unbind_op.py | 7 +++++++ python/paddle/tensor/manipulation.py | 15 ++++++++++----- 3 files changed, 30 insertions(+), 5 deletions(-) diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 3b3202c291725..eb05437ada8a5 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -4253,7 +4253,20 @@ void UnbindInferMeta(const MetaTensor& x, std::vector outs) { auto in_dims = x.dims(); std::vector out_dim; + + PADDLE_ENFORCE_GE( + axis, + -in_dims.size(), + phi::errors::InvalidArgument( + "axis must be in range(%d, %d).", -in_dims.size(), in_dims.size())); + PADDLE_ENFORCE_LT( + axis, + in_dims.size(), + phi::errors::InvalidArgument( + "axis must be in range(%d, %d).", -in_dims.size(), in_dims.size())); + axis = axis < 0 ? 
in_dims.size() + axis : axis; + for (int i = 0; i < in_dims.size(); ++i) { if (i != axis) out_dim.push_back(in_dims[i]); } diff --git a/python/paddle/fluid/tests/unittests/test_unbind_op.py b/python/paddle/fluid/tests/unittests/test_unbind_op.py index 6ec82a96bc165..8cafc1b5a8e1b 100644 --- a/python/paddle/fluid/tests/unittests/test_unbind_op.py +++ b/python/paddle/fluid/tests/unittests/test_unbind_op.py @@ -25,6 +25,7 @@ class TestUnbind(unittest.TestCase): def test_unbind(self): + paddle.enable_static() x_1 = fluid.data(shape=[2, 3], dtype='float32', name='x_1') [out_0, out_1] = tensor.unbind(input=x_1, axis=0) @@ -59,6 +60,7 @@ def test_unbind_dygraph(self): class TestLayersUnbind(unittest.TestCase): def test_layers_unbind(self): + paddle.enable_static() x_1 = fluid.data(shape=[2, 3], dtype='float32', name='x_1') [out_0, out_1] = paddle.unbind(input=x_1, axis=0) @@ -214,6 +216,11 @@ def test_table_Variable(): self.assertRaises(TypeError, test_table_Variable) + def test_invalid_axis(): + tensor.unbind(input=x, axis=2) + + self.assertRaises(ValueError, test_invalid_axis) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py index 923e6923d6d63..b5308e6cee63d 100644 --- a/python/paddle/tensor/manipulation.py +++ b/python/paddle/tensor/manipulation.py @@ -2755,14 +2755,19 @@ def unbind(input, axis=0): # x2.shape [3, 5] # x3.shape [3, 5] """ + if not isinstance(axis, (int)): + raise TypeError( + "The type of 'axis' must be int, but received %s." % (type(axis)) + ) + + if axis not in range(-input.ndim, input.ndim): + raise ValueError( + f'The axis must in range({-input.ndim}, {input.ndim}).' + ) + if in_dygraph_mode(): return _C_ops.unbind(input, axis) else: - if not isinstance(axis, (int)): - raise TypeError( - "The type of 'axis' must be int, but received %s." - % (type(axis)) - ) if isinstance(axis, np.generic): axis = np.asscalar(axis) input_shape = input.shape From 0855d9828c62af0b60e4d625e4ac973d1309147c Mon Sep 17 00:00:00 2001 From: zxcd <228587199@qq.com> Date: Wed, 1 Feb 2023 10:47:05 +0800 Subject: [PATCH 56/89] add clip_grad_norm_ API (#49935) * add clip_grad_norm_ api. * fix docs and some details according to the comments. * fix code style. * fix no_grad problem, and fix doc. * fix code style. * fix doc and remove type information --- .../tests/unittests/test_clip_grad_norm_.py | 121 ++++++++++++++++++ python/paddle/nn/utils/__init__.py | 4 +- python/paddle/nn/utils/clip_grad_norm_.py | 107 ++++++++++++++++ 3 files changed, 231 insertions(+), 1 deletion(-) create mode 100644 python/paddle/fluid/tests/unittests/test_clip_grad_norm_.py create mode 100644 python/paddle/nn/utils/clip_grad_norm_.py diff --git a/python/paddle/fluid/tests/unittests/test_clip_grad_norm_.py b/python/paddle/fluid/tests/unittests/test_clip_grad_norm_.py new file mode 100644 index 0000000000000..308c59d094ec5 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_clip_grad_norm_.py @@ -0,0 +1,121 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np + +import paddle +from paddle.nn.utils.clip_grad_norm_ import clip_grad_norm_ + + +class TestClipGradNorm(unittest.TestCase): + def test_basic(self): + run_test_equal( + self, + shape=[16, 16], + dtype=np.float32, + max_norm=5, + norm_type=2, + ) + run_test_equal( + self, + shape=(100,), + dtype=np.float32, + max_norm=1e20, + norm_type=2, + ) + run_test_equal( + self, + shape=[4, 8, 16], + dtype=np.float32, + max_norm=1.0, + norm_type=float("inf"), + ) + + def test_errors(self): + def TestValueError(): + input_pd = paddle.to_tensor( + np.random.random([1, 2]).astype(np.float32) + ) + input_pd.grad = paddle.to_tensor( + np.random.random([1, 2]).astype(np.float32) + ) + clip_grad_norm_(input_pd, max_norm=2, norm_type=float("-inf")) + + self.assertRaises(ValueError, TestValueError) + + def TestRuntimeError(): + input_pd = paddle.to_tensor( + np.random.random([1, 2]).astype(np.float32) + ) + input_pd.grad = paddle.full([1, 2], float("inf")) + clip_grad_norm_( + input_pd, max_norm=2, norm_type=2, error_if_nonfinite=True + ) + + self.assertRaises(RuntimeError, TestRuntimeError) + + def TestRuntimeErrorStaticMode(): + paddle.enable_static() + input_pd = paddle.to_tensor( + np.random.random([1, 2]).astype(np.float32) + ) + input_pd.grad = paddle.to_tensor( + np.random.random([1, 2]).astype(np.float32) + ) + clip_grad_norm_(input_pd, max_norm=2, norm_type=float("inf")) + paddle.disable_static() + + self.assertRaises(RuntimeError, TestRuntimeErrorStaticMode) + + +def run_test_equal( + self, + shape, + dtype, + max_norm, + norm_type: float = 2.0, + error_if_nonfinite: bool = False, +): + input = np.random.random(shape).astype(dtype) + grad = np.random.random(shape).astype(dtype) + input_pd = paddle.to_tensor(input) + input_pd.grad = paddle.to_tensor(grad) + + if norm_type == 2: + grad = grad.reshape(1, grad.size) + output = np.linalg.norm(grad, 'fro') + elif norm_type == np.inf: + output = np.amax(np.abs(grad)) + else: + output = np.linalg.norm(grad, norm_type) + clip_grad_norm_result = clip_grad_norm_( + input_pd, + max_norm=max_norm, + norm_type=norm_type, + error_if_nonfinite=error_if_nonfinite, + ) + + np.testing.assert_allclose( + clip_grad_norm_result.numpy(), + output, + rtol=1e-05, + atol=1e-05, + equal_nan=False, + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/nn/utils/__init__.py b/python/paddle/nn/utils/__init__.py index 23e1e233cc0dc..82b17c8c05d24 100644 --- a/python/paddle/nn/utils/__init__.py +++ b/python/paddle/nn/utils/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -19,6 +19,7 @@ vector_to_parameters, _stride_column, ) # noqa: F401 +from .clip_grad_norm_ import clip_grad_norm_ # noqa: F401 __all__ = [ # noqa 'weight_norm', @@ -26,4 +27,5 @@ 'spectral_norm', 'parameters_to_vector', 'vector_to_parameters', + 'clip_grad_norm_', ] diff --git a/python/paddle/nn/utils/clip_grad_norm_.py b/python/paddle/nn/utils/clip_grad_norm_.py new file mode 100644 index 0000000000000..3a3ecb38b4428 --- /dev/null +++ b/python/paddle/nn/utils/clip_grad_norm_.py @@ -0,0 +1,107 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle + +__all__ = ['clip_grad_norm_'] + + +def clip_grad_norm_( + parameters, + max_norm, + norm_type=2.0, + error_if_nonfinite=False, +): + r"""Clips gradient norm of the iteratable parameters. + + Norms are calculated together on all gradients, just as they are + connected into one vector. The gradient will be modified in place. + + This API can only run in dynamic graph mode, not static graph mode. + + Args: + parameters (Iterable[paddle.Tensor] or paddle.Tensor): Tensors or a single Tensor + that will be normalized gradients + max_norm (float or int): max norm of the gradients + norm_type (float or int): type of the used p-norm. Can be `inf` for + infinity norm. + error_if_nonfinite (bool): if True, throw an error if the total + norm of the gradients from :attr:`parameters` is `nan`, + `inf`, or `-inf`. + + Returns: + Total norm of the parameter gradients (treated as a single vector). + Example: + .. code-block:: python + import paddle + + x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32') + max_norm = float(5.0) + linear = paddle.nn.Linear(in_features=10, out_features=10) + out = linear(x) + loss = paddle.mean(out) + loss.backward() + + paddle.nn.utils.clip_grad_norm_(linear.parameters(), max_norm) + + sdg = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters()) + sdg.step() + """ + if not paddle.in_dynamic_mode(): + raise RuntimeError('this API can only run in dynamic mode.') + + if isinstance(parameters, paddle.Tensor): + parameters = [parameters] + + support_norm_type = [float("inf"), 0, 1, 2] + if norm_type not in support_norm_type: + raise ValueError(f'norm_type only support {support_norm_type}') + + grads = [p.grad for p in parameters if p.grad is not None] + max_norm = float(max_norm) + norm_type = float(norm_type) + if len(grads) == 0: + return paddle.to_tensor(0.0) + if norm_type == float("inf"): + norms = [g.detach().abs().max() for g in grads] + total_norm = ( + norms[0] if len(norms) == 1 else paddle.max(paddle.stack(norms)) + ) + else: + total_norm = paddle.linalg.norm( + paddle.stack( + [paddle.linalg.norm(g.detach(), norm_type) for g in grads] + ), + norm_type, + ) + + if error_if_nonfinite and paddle.logical_or( + total_norm.isnan(), total_norm.isinf() + ): + raise RuntimeError( + f'The total norm of {norm_type} order of the gradients from ' + '`parameters` is non-finite, so it cannot be clipped. In any case, ' + 'disable this error and scale the gradient by non-finite norm, ' + 'set `error_if_nonfinite=False`' + ) + clip_coef = max_norm / (total_norm + 1e-6) + # Note: when the coef is clamped to 1, it is redundant to multiply the clamped coef, but this + # avoids the `if clip_coef < 1:` condition. 
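+    # In effect each gradient is scaled by min(1, max_norm / (total_norm + 1e-6)),
+    # so gradients whose total norm is already within max_norm are left
+    # (almost) unchanged.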
+ clip_coef_clamped = paddle.clip(clip_coef, max=1.0) + with paddle.no_grad(): + for _, p in enumerate(parameters): + g = p.grad + if g is not None: + p.grad = paddle.multiply(x=g, y=clip_coef_clamped) + return total_norm From 2b636166d24dcf695117ba576efa532ff24fc73b Mon Sep 17 00:00:00 2001 From: zqw_1997 <118182234+zhengqiwen1997@users.noreply.github.com> Date: Wed, 1 Feb 2023 10:49:05 +0800 Subject: [PATCH 57/89] nccl 2.7.8 to 2.10.3 (#50121) --- .../dockerfile/build_scripts/install_nccl2.sh | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tools/dockerfile/build_scripts/install_nccl2.sh b/tools/dockerfile/build_scripts/install_nccl2.sh index 6d44dbb90542f..d39e74f3cf537 100644 --- a/tools/dockerfile/build_scripts/install_nccl2.sh +++ b/tools/dockerfile/build_scripts/install_nccl2.sh @@ -17,18 +17,18 @@ VERSION=$(nvcc --version | grep release | grep -oEi "release ([0-9]+)\.([0-9])"| sed "s/release //") if [ "$VERSION" == "10.0" ]; then DEB="nccl-repo-ubuntu1604-2.4.7-ga-cuda10.0_1-1_amd64.deb" -elif [ "$VERSION" == "10.2" ] || [ "$VERSION" == "10.1" ] || [ "$VERSION" == "11.0" ] || [ "$VERSION" == "11.2" ] || [ "$VERSION" == "11.3" ] || [ "$VERSION" == "11.4" ] || [ "$VERSION" == "11.5" ] || [ "$VERSION" == "11.6" ]; then +elif [ "$VERSION" == "10.2" ] || [ "$VERSION" == "10.1" ] || [ "$VERSION" == "11.0" ] || [ "$VERSION" == "11.2" ] || [ "$VERSION" == "11.3" ] || [ "$VERSION" == "11.4" ] || [ "$VERSION" == "11.5" ] || [ "$VERSION" == "11.6" ] || [ "$VERSION" == "11.8" ]; then if [ -f "/etc/redhat-release" ];then - rm -f /usr/local/lib/libnccl.so - wget --no-check-certificate -q https://nccl2-deb.cdn.bcebos.com/libnccl-2.7.8-1+cuda10.2.x86_64.rpm - wget --no-check-certificate -q https://nccl2-deb.cdn.bcebos.com/libnccl-devel-2.7.8-1+cuda10.2.x86_64.rpm - wget --no-check-certificate -q https://nccl2-deb.cdn.bcebos.com/libnccl-static-2.7.8-1+cuda10.2.x86_64.rpm - rpm -ivh libnccl-2.7.8-1+cuda10.2.x86_64.rpm - rpm -ivh libnccl-devel-2.7.8-1+cuda10.2.x86_64.rpm - rpm -ivh libnccl-static-2.7.8-1+cuda10.2.x86_64.rpm && rm -f libnccl-* + rm -f /usr/local/lib/libnccl.so + wget --no-check-certificate -q https://nccl2-deb.cdn.bcebos.com/libnccl-2.10.3-1+cuda11.4.x86_64.rpm + wget --no-check-certificate -q https://nccl2-deb.cdn.bcebos.com/libnccl-devel-2.10.3-1+cuda11.4.x86_64.rpm + wget --no-check-certificate -q https://nccl2-deb.cdn.bcebos.com/libnccl-static-2.10.3-1+cuda11.4.x86_64.rpm + rpm -Fivh libnccl-2.10.3-1+cuda11.4.x86_64.rpm + rpm -Fivh libnccl-devel-2.10.3-1+cuda11.4.x86_64.rpm + rpm -Fivh libnccl-static-2.10.3-1+cuda11.4.x86_64.rpm && rm -f libnccl-* exit 0 fi - DEB="nccl-repo-ubuntu1604-2.7.8-ga-cuda10.2_1-1_amd64.deb" + DEB="nccl-repo-ubuntu1804-2.10.3-cuda11.4_1.0-1_amd64.deb" elif [ "$VERSION" == "9.0" ]; then DEB="nccl-repo-ubuntu1604-2.3.7-ga-cuda9.0_1-1_amd64.deb" else From 73f3e67673ecf63e8899b595b6d28d9b2ba8fe0a Mon Sep 17 00:00:00 2001 From: wangxiaoning <71813629+wangxn12138@users.noreply.github.com> Date: Wed, 1 Feb 2023 10:56:22 +0800 Subject: [PATCH 58/89] clean ps_trainer_pass (#50117) --- python/paddle/distributed/passes/ps_trainer_pass.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/paddle/distributed/passes/ps_trainer_pass.py b/python/paddle/distributed/passes/ps_trainer_pass.py index 2a60b0df5f5eb..f25ede7f05ee5 100755 --- a/python/paddle/distributed/passes/ps_trainer_pass.py +++ b/python/paddle/distributed/passes/ps_trainer_pass.py @@ -464,7 +464,7 @@ def dag_check_up_and_reorder(program, 
inputs, outputs): "is_sparse": True, }, ) - PSGPU = paddle.fluid.core.PSGPU() + PSGPU = core.PSGPU() try: gpu_slot = [int(var.name) for var in gpups_inputs] except (ValueError): @@ -1052,7 +1052,7 @@ def _apply_single_impl(self, main_program, startup_program, pass_ctx): block_vars_detail = find_block_joints( program, program_block_ops, heter_ops ) - heter_program = framework.Program() + heter_program = paddle.framework.Program() self._create_heter_program( program, attrs, @@ -1628,13 +1628,13 @@ def _apply_single_impl(self, main_program, startup_program, pass_ctx): debug_program(_main_file, prog_b) if not self.is_part_b: - self.partA_program = framework.Program() + self.partA_program = paddle.framework.Program() self._get_partA_program(prog_a.global_block()) pass_ctx._attrs['part_a_main_program'] = self.partA_program self._clear_op_device_flag(self.partA_program) check_program(self.partA_program) else: - self.partB_program = framework.Program() + self.partB_program = paddle.framework.Program() self._get_partB_program(prog_b.global_block()) pass_ctx._attrs['part_b_main_program'] = self.partB_program self._clear_op_device_flag(self.partB_program) From af6730909071ff7b4de73ba6744ad4641a016d04 Mon Sep 17 00:00:00 2001 From: zhoutianzi666 <39978853+zhoutianzi666@users.noreply.github.com> Date: Wed, 1 Feb 2023 14:03:24 +0800 Subject: [PATCH 59/89] add dynamic shape support for running paddle-trt in calib_mode (#50033) --- .../operators/tensorrt/tensorrt_engine_op.h | 23 ++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h index 579549a4c3ec4..2f5da3c44b97f 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h @@ -438,11 +438,32 @@ class TensorRTEngineOp : public framework::OperatorBase { calib_res->calib_.reset(new TRTInt8Calibrator( calib_buffers, runtime_batch, calibration_engine_key_, dev_place)); calib_res->thr_.reset(new std::thread([&]() { + std::map> min_input_shape; + std::map> max_input_shape; + std::map> opt_input_shape; + std::map> min_shape_tensor; + std::map> max_shape_tensor; + std::map> opt_shape_tensor; + if (shape_range_info_path_.size()) + inference::DeserializeShapeRangeInfo(shape_range_info_path_, + &min_input_shape, + &max_input_shape, + &opt_input_shape, + &min_shape_tensor, + &max_shape_tensor, + &opt_shape_tensor); + calib_res->engine_.reset(new TensorRTEngine(max_batch_size_, workspace_size_, precision_mode_, calib_res->calib_.get(), - dev_place.device)); + dev_place.device, + min_input_shape, + max_input_shape, + opt_input_shape, + min_shape_tensor, + max_shape_tensor, + opt_shape_tensor)); VLOG(3) << "start the calib trt engine thread"; PrepareTRTEngine(scope, calib_res->engine_.get()); })); From bdae548148b32b92a5ca922ea28abd7ed0c28517 Mon Sep 17 00:00:00 2001 From: limingshu <61349199+JamesLim-sy@users.noreply.github.com> Date: Wed, 1 Feb 2023 14:09:23 +0800 Subject: [PATCH 60/89] Combination of multiple paddle::memory::allocate operation into one for ops (#49126) * A leap of try for cudaLaunchCooperativeKernel * fix bugs * Totally replace the lar cuda kernel * Fix bugs * fix code according to comments * fix codes according to review comments * adding some function overload * relocate the power operation. * add bf16 support for index select relevant ops * revert bf16 type change. 
* add changes for more op * fix code writting bugs --- .../phi/kernels/funcs/elementwise_grad_base.h | 63 ++++++++----------- paddle/phi/kernels/funcs/matrix_inverse.cu.cc | 26 +++----- .../kernels/funcs/values_vectors_functor.h | 10 +-- 3 files changed, 39 insertions(+), 60 deletions(-) diff --git a/paddle/phi/kernels/funcs/elementwise_grad_base.h b/paddle/phi/kernels/funcs/elementwise_grad_base.h index b9ffb4e3f1237..f577f1781ff09 100644 --- a/paddle/phi/kernels/funcs/elementwise_grad_base.h +++ b/paddle/phi/kernels/funcs/elementwise_grad_base.h @@ -1530,37 +1530,31 @@ void CommonGradBroadcastCUDA(const DenseTensor &x, ComputeBroadcastKernelSize( y_dims_array, out_dims_array, &y_blocks, &y_threads, max_dim); - auto x_strides_array_tmp = paddle::memory::Alloc( + // One part buffer for x_strides_array, rest for y_strides_array and + // out_dims_array. + size_t tmp_total_bytes = bytes * 3; + auto tmp_buffer = paddle::memory::Alloc( ctx.GetPlace(), - bytes, + tmp_total_bytes, phi::Stream(reinterpret_cast(ctx.stream()))); - int *x_strides_array_gpu = - reinterpret_cast(x_strides_array_tmp->ptr()); + int *x_strides_array_gpu = reinterpret_cast(tmp_buffer->ptr()); + int *y_strides_array_gpu = + reinterpret_cast(x_strides_array_gpu + max_dim); + int *out_dims_array_gpu = + reinterpret_cast(y_strides_array_gpu + max_dim); + paddle::memory::Copy(gplace, x_strides_array_gpu, cplace, x_strides_array.data(), bytes, ctx.stream()); - - auto y_strides_array_tmp = paddle::memory::Alloc( - ctx.GetPlace(), - bytes, - phi::Stream(reinterpret_cast(ctx.stream()))); - int *y_strides_array_gpu = - reinterpret_cast(y_strides_array_tmp->ptr()); paddle::memory::Copy(gplace, y_strides_array_gpu, cplace, y_strides_array.data(), bytes, ctx.stream()); - - auto out_dims_array_tmp = paddle::memory::Alloc( - ctx.GetPlace(), - bytes, - phi::Stream(reinterpret_cast(ctx.stream()))); - int *out_dims_array_gpu = reinterpret_cast(out_dims_array_tmp->ptr()); paddle::memory::Copy( gplace, out_dims_array_gpu, cplace, out_dims_array, bytes, ctx.stream()); @@ -1569,24 +1563,21 @@ void CommonGradBroadcastCUDA(const DenseTensor &x, int x_block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, x_threads); int y_block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, y_threads); if (dx) { - auto x_strides_order_tmp = paddle::memory::Alloc( + size_t dx_total_bytes = bytes * 2; + auto dx_tmp_buffer = paddle::memory::Alloc( ctx.GetPlace(), - bytes, + dx_total_bytes, phi::Stream(reinterpret_cast(ctx.stream()))); - int *x_strides_order_gpu = - reinterpret_cast(x_strides_order_tmp->ptr()); + int *x_strides_order_gpu = reinterpret_cast(dx_tmp_buffer->ptr()); + int *x_dims_order_gpu = + reinterpret_cast(x_strides_order_gpu + max_dim); + paddle::memory::Copy(gplace, x_strides_order_gpu, cplace, x_strides_order.data(), bytes, ctx.stream()); - - auto x_dims_order_tmp = paddle::memory::Alloc( - ctx.GetPlace(), - bytes, - phi::Stream(reinterpret_cast(ctx.stream()))); - int *x_dims_order_gpu = reinterpret_cast(x_dims_order_tmp->ptr()); paddle::memory::Copy(gplace, x_dims_order_gpu, cplace, @@ -1610,24 +1601,22 @@ void CommonGradBroadcastCUDA(const DenseTensor &x, dx_op); } if (dy) { - auto y_strides_order_tmp = paddle::memory::Alloc( + // One part buffer for y_strides_order_gpu, the other for y_dims_order_gpu + size_t dy_total_bytes = bytes * 2; + auto dy_tmp_buffer = paddle::memory::Alloc( ctx.GetPlace(), - bytes, + dy_total_bytes, phi::Stream(reinterpret_cast(ctx.stream()))); - int *y_strides_order_gpu = - reinterpret_cast(y_strides_order_tmp->ptr()); + int 
*y_strides_order_gpu = reinterpret_cast(dy_tmp_buffer->ptr()); + int *y_dims_order_gpu = + reinterpret_cast(y_strides_order_gpu + max_dim); + paddle::memory::Copy(gplace, y_strides_order_gpu, cplace, y_strides_order.data(), bytes, ctx.stream()); - - auto y_dims_order_tmp = paddle::memory::Alloc( - ctx.GetPlace(), - bytes, - phi::Stream(reinterpret_cast(ctx.stream()))); - int *y_dims_order_gpu = reinterpret_cast(y_dims_order_tmp->ptr()); paddle::memory::Copy(gplace, y_dims_order_gpu, cplace, diff --git a/paddle/phi/kernels/funcs/matrix_inverse.cu.cc b/paddle/phi/kernels/funcs/matrix_inverse.cu.cc index c43c3c04755f3..3961f82c8fd0f 100644 --- a/paddle/phi/kernels/funcs/matrix_inverse.cu.cc +++ b/paddle/phi/kernels/funcs/matrix_inverse.cu.cc @@ -55,11 +55,14 @@ void MatrixInverseFunctor::operator()(const Context& dev_ctx, cpu_ptrs[i + batch_size] = a_inv->data() + i * n * n; } - // Copy the addresses of A and A_inv from host to device. + // Copy the addresses of A and A_inv from host to device, + // and allocate device memory for info and pivots. + int num_ints = n < 32 ? batch_size : batch_size * (n + 1); + size_t total_bytes = cpu_ptrs.size() * sizeof(T*) + num_ints * sizeof(int); paddle::memory::allocation::AllocationPtr tmp_gpu_ptrs_data = paddle::memory::Alloc( dev_ctx.GetPlace(), - cpu_ptrs.size() * sizeof(T*), + total_bytes, phi::Stream(reinterpret_cast(dev_ctx.stream()))); paddle::memory::Copy(dev_ctx.GetPlace(), tmp_gpu_ptrs_data->ptr(), @@ -67,20 +70,12 @@ void MatrixInverseFunctor::operator()(const Context& dev_ctx, static_cast(cpu_ptrs.data()), cpu_ptrs.size() * sizeof(T*), dev_ctx.stream()); - T** gpu_inv_ptrs = - reinterpret_cast(tmp_gpu_ptrs_data->ptr()) + batch_size; - - // Allocate device memory for info and pivots. - int num_ints = n < 32 ? batch_size : batch_size * (n + 1); - paddle::memory::allocation::AllocationPtr tmp_gpu_info_data = - paddle::memory::Alloc( - dev_ctx.GetPlace(), - num_ints * sizeof(int), - phi::Stream(reinterpret_cast(dev_ctx.stream()))); - int* gpu_info_ptr = reinterpret_cast(tmp_gpu_info_data->ptr()); + T** gpu_inv_pivot_info = reinterpret_cast(tmp_gpu_ptrs_data->ptr()); + T** gpu_inv_ptrs = gpu_inv_pivot_info + batch_size; + int* gpu_info_ptr = + reinterpret_cast(gpu_inv_pivot_info + cpu_ptrs.size()); auto blas = phi::funcs::GetBlas(dev_ctx); - std::vector info; // only for singular checking info.resize(batch_size); // This functions in cuBLAS is intended to be used for matrices of small @@ -100,8 +95,7 @@ void MatrixInverseFunctor::operator()(const Context& dev_ctx, // This function performs the LU factorization of each matrix A by the // equation P * A = L * U. L and U are written back to original matrix A, // and diagonal elements of L are discarded. - int* gpu_pivot_ptr = - reinterpret_cast(tmp_gpu_info_data->ptr()) + batch_size; + int* gpu_pivot_ptr = gpu_info_ptr + batch_size; blas.BatchedGETRF(n, reinterpret_cast(tmp_gpu_ptrs_data->ptr()), gpu_pivot_ptr, diff --git a/paddle/phi/kernels/funcs/values_vectors_functor.h b/paddle/phi/kernels/funcs/values_vectors_functor.h index 63202ca4a484d..d4314307873f4 100644 --- a/paddle/phi/kernels/funcs/values_vectors_functor.h +++ b/paddle/phi/kernels/funcs/values_vectors_functor.h @@ -354,12 +354,6 @@ struct MatrixEighFunctor { has_vectors ? 
CUSOLVER_EIG_MODE_VECTOR : CUSOLVER_EIG_MODE_NOVECTOR; ValueType *out_value = dev_ctx.template Alloc(eigen_values); - auto info = paddle::memory::Alloc( - dev_ctx.GetPlace(), - sizeof(int) * batch_size, - phi::Stream(reinterpret_cast(dev_ctx.stream()))); - auto *info_ptr = reinterpret_cast(info->ptr()); - DenseTensor input_trans = phi::TransposeLast2Dim(dev_ctx, input); T *input_vector = input_trans.data(); @@ -410,11 +404,13 @@ struct MatrixEighFunctor { out_value, &workspace_size); } + size_t total_bytes = sizeof(T) * workspace_size + sizeof(int) * batch_size; auto work = paddle::memory::Alloc( dev_ctx.GetPlace(), - sizeof(T) * workspace_size, + total_bytes, phi::Stream(reinterpret_cast(dev_ctx.stream()))); auto *work_ptr = reinterpret_cast(work->ptr()); + auto *info_ptr = reinterpret_cast(work_ptr + workspace_size); for (auto i = 0; i < batch_size; ++i) { auto *input_data = input_vector + i * vector_stride; From 5dfddaea8834b610641007139878fc3fbdde869d Mon Sep 17 00:00:00 2001 From: gouzil <66515297+gouzil@users.noreply.github.com> Date: Wed, 1 Feb 2023 14:40:43 +0800 Subject: [PATCH 61/89] [Divide by 0 Error] add norm check (#49966) * [Divide by 0 Error] add norm check * [Divide by 0 Error] fix x AttributeError * [Divide by 0 Error] norm check migrate to c++ --- paddle/phi/kernels/cpu/p_norm_kernel.cc | 7 +++++++ paddle/phi/kernels/gpu/p_norm_kernel.cu | 7 +++++++ paddle/phi/kernels/xpu/p_norm_kernel.cc | 8 ++++++++ python/paddle/fluid/tests/unittests/test_norm_all.py | 9 +++++++++ 4 files changed, 31 insertions(+) diff --git a/paddle/phi/kernels/cpu/p_norm_kernel.cc b/paddle/phi/kernels/cpu/p_norm_kernel.cc index 597939953b277..bb33b8a397e02 100644 --- a/paddle/phi/kernels/cpu/p_norm_kernel.cc +++ b/paddle/phi/kernels/cpu/p_norm_kernel.cc @@ -61,6 +61,13 @@ void PNormKernel(const Context& dev_ctx, int pre, n, post; GetDims(xdim, axis, &pre, &n, &post, asvector); + for (int i = 0; i < xdim.size(); i++) { + PADDLE_ENFORCE_LT(0, + xdim[i], + errors::InvalidArgument( + "The dims of Input(X) should be greater than 0.")); + } + auto* place = dev_ctx.eigen_device(); Eigen::DSizes shape(pre, n, post); diff --git a/paddle/phi/kernels/gpu/p_norm_kernel.cu b/paddle/phi/kernels/gpu/p_norm_kernel.cu index c7a6261ce381e..fb869a00d9c50 100644 --- a/paddle/phi/kernels/gpu/p_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/p_norm_kernel.cu @@ -105,6 +105,13 @@ void PNormKernel(const Context& dev_ctx, std::vector reduce_axis = funcs::details::GetReduceDim(axis_dims, xdim.size(), asvector); + for (int i = 0; i < xdim.size(); i++) { + PADDLE_ENFORCE_LT(0, + xdim[i], + errors::InvalidArgument( + "The dims of Input(X) should be greater than 0.")); + } + using MT = typename dtype::MPTypeTrait::Type; if (porder == 0) { phi::funcs::ReduceKernel>( diff --git a/paddle/phi/kernels/xpu/p_norm_kernel.cc b/paddle/phi/kernels/xpu/p_norm_kernel.cc index 7ef72c61ad3aa..60abc59517b78 100644 --- a/paddle/phi/kernels/xpu/p_norm_kernel.cc +++ b/paddle/phi/kernels/xpu/p_norm_kernel.cc @@ -55,6 +55,14 @@ void PNormKernel(const Context& dev_ctx, int n = 1; int t = 1; GetDims(xdim, axis, &m, &t, &n, asvector); + + for (int i = 0; i < xdim.size(); i++) { + PADDLE_ENFORCE_LT(0, + xdim[i], + errors::InvalidArgument( + "The dims of Input(X) should be greater than 0.")); + } + x_dim.push_back(m); x_dim.push_back(t); x_dim.push_back(n); diff --git a/python/paddle/fluid/tests/unittests/test_norm_all.py b/python/paddle/fluid/tests/unittests/test_norm_all.py index d70d0dd9f065d..beff458bd1b70 100644 --- 
a/python/paddle/fluid/tests/unittests/test_norm_all.py +++ b/python/paddle/fluid/tests/unittests/test_norm_all.py @@ -655,6 +655,15 @@ def err_dtype(p, shape_x, xdtype, out=None): ValueError, paddle.norm, data, p='unspport', axis=[-3, -2, -1] ) + with fluid.dygraph.guard(): + # The size of input in Norm should not be 0. + def test_0_size(): + array = np.array([], dtype=np.float32) + x = paddle.to_tensor(np.reshape(array, [0, 0]), dtype='float32') + paddle.linalg.norm(x, axis=0) + + self.assertRaises(ValueError, test_0_size) + if __name__ == '__main__': paddle.enable_static() From 226a6567f5f66ec6946b1cd4e62af3664e4b0caf Mon Sep 17 00:00:00 2001 From: gouzil <66515297+gouzil@users.noreply.github.com> Date: Wed, 1 Feb 2023 14:46:34 +0800 Subject: [PATCH 62/89] [Divide by 0 Error] add eig check (#49971) * [Divide by 0 Error] add eig check * [Divide by 0 Error] eig check migrate to c++ * [Divide by 0 Error] Fix class name error --- paddle/phi/kernels/cpu/eig_kernel.cc | 5 +++ .../tests/unittests/test_linalg_eig_op.py | 34 +++++++++++++++++++ python/paddle/tensor/linalg.py | 1 + 3 files changed, 40 insertions(+) create mode 100644 python/paddle/fluid/tests/unittests/test_linalg_eig_op.py diff --git a/paddle/phi/kernels/cpu/eig_kernel.cc b/paddle/phi/kernels/cpu/eig_kernel.cc index 42a843391872f..c9bdf8af11682 100644 --- a/paddle/phi/kernels/cpu/eig_kernel.cc +++ b/paddle/phi/kernels/cpu/eig_kernel.cc @@ -31,6 +31,11 @@ void EigKernel(const Context& dev_ctx, int batch_count = BatchCount(x); int order = x.dims()[x.dims().size() - 1]; + PADDLE_ENFORCE_LT(0, + order, + errors::InvalidArgument( + "The order of Input(X) should be greater than 0.")); + DenseTensor real_w; DenseTensor real_v; diff --git a/python/paddle/fluid/tests/unittests/test_linalg_eig_op.py b/python/paddle/fluid/tests/unittests/test_linalg_eig_op.py new file mode 100644 index 0000000000000..18d95a4f383d9 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_linalg_eig_op.py @@ -0,0 +1,34 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np + +import paddle + + +class TestEigAPIError(unittest.TestCase): + def test_errors(self): + # The size of input in Eig should not be 0. 
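Taken together with the p_norm and lu guards in the neighbouring patches, the user-visible effect of these zero-size checks can be sketched as follows (an illustrative dygraph snippet, not part of the diff; it only restates what the unit tests in these patches assert):

import numpy as np
import paddle

empty = np.array([], dtype=np.float32)
x2d = paddle.to_tensor(np.reshape(empty, [0, 0]), dtype='float32')
x3d = paddle.to_tensor(np.reshape(empty, [0, 0, 0]), dtype='float32')

for zero_size_call in (
    lambda: paddle.linalg.norm(x2d, axis=0),        # p_norm kernel guard
    lambda: paddle.linalg.eig(x2d),                 # eig kernel guard
    lambda: paddle.linalg.lu(x3d, get_infos=True),  # lu kernel guard
):
    try:
        zero_size_call()
    except ValueError as err:  # InvalidArgument surfaces as ValueError in Python
        print('zero-size input rejected:', err)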
+ def test_0_size(): + array = np.array([], dtype=np.float32) + x = paddle.to_tensor(np.reshape(array, [0, 0]), dtype='float32') + paddle.linalg.eig(x) + + self.assertRaises(ValueError, test_0_size) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py index 46f11130c0354..10c8c24a78724 100644 --- a/python/paddle/tensor/linalg.py +++ b/python/paddle/tensor/linalg.py @@ -2323,6 +2323,7 @@ def eig(x, name=None): # [ (16.50471283351188+0j) , (-5.5034820550763515+0j) , # (-0.21026087843552282+0j)]) """ + if in_dygraph_mode(): return _C_ops.eig(x) else: From f0811bb7dbcbda67793b5a6c1b520ca0adb8c2ac Mon Sep 17 00:00:00 2001 From: Ruibiao Chen Date: Wed, 1 Feb 2023 14:48:02 +0800 Subject: [PATCH 63/89] Fix errors for test_standalone_custom_stream (#50103) --- .../standalone_executor/test_standalone_custom_stream.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/standalone_executor/test_standalone_custom_stream.py b/python/paddle/fluid/tests/unittests/standalone_executor/test_standalone_custom_stream.py index 116aa60d05202..4126f84ed1e8c 100644 --- a/python/paddle/fluid/tests/unittests/standalone_executor/test_standalone_custom_stream.py +++ b/python/paddle/fluid/tests/unittests/standalone_executor/test_standalone_custom_stream.py @@ -50,10 +50,10 @@ def set_custom_stream(self, prog): ops = prog.global_block().ops for op_index in op_index_for_stream1: ops[op_index].dist_attr.execution_stream = "s1" - ops[op_index].dist_attr.stream_priority = -1 + ops[op_index].dist_attr.stream_priority = 0 for op_index in op_index_for_stream2: ops[op_index].dist_attr.execution_stream = "s2" - ops[op_index].dist_attr.stream_priority = -2 + ops[op_index].dist_attr.stream_priority = -1 def run_program(self, apply_custom_stream=False): paddle.seed(2022) From f71796b6a58f5c71505821cae5b83fdd7851ca1a Mon Sep 17 00:00:00 2001 From: gouzil <66515297+gouzil@users.noreply.github.com> Date: Wed, 1 Feb 2023 14:48:24 +0800 Subject: [PATCH 64/89] [Divide by 0 Error] add lu check (#49974) * [Divide by 0 Error] add lu check * [Divide by 0 Error] lu check migrate to c++ --- paddle/phi/kernels/impl/lu_kernel_impl.h | 8 ++++++++ python/paddle/fluid/tests/unittests/test_lu_op.py | 14 ++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/paddle/phi/kernels/impl/lu_kernel_impl.h b/paddle/phi/kernels/impl/lu_kernel_impl.h index 31a83ea540176..5315e36b47172 100644 --- a/paddle/phi/kernels/impl/lu_kernel_impl.h +++ b/paddle/phi/kernels/impl/lu_kernel_impl.h @@ -520,6 +520,14 @@ DenseTensor Transpose2DTo6D(const Context& dev_ctx, const DenseTensor& x) { auto x_dim = x.dims(); auto x_vec = phi::vectorize(x_dim); int rank = x_vec.size(); + + for (int i = 0; i < x_dim.size(); i++) { + PADDLE_ENFORCE_LT(0, + x_dim[i], + errors::InvalidArgument( + "The dims of Input(X) should be greater than 0.")); + } + std::swap(x_vec[rank - 1], x_vec[rank - 2]); std::vector out_shape = x_vec; std::vector axis(rank); diff --git a/python/paddle/fluid/tests/unittests/test_lu_op.py b/python/paddle/fluid/tests/unittests/test_lu_op.py index 790ebb36f6d7c..3e083c76b71df 100644 --- a/python/paddle/fluid/tests/unittests/test_lu_op.py +++ b/python/paddle/fluid/tests/unittests/test_lu_op.py @@ -303,6 +303,20 @@ def run_lu_static(shape, dtype): run_lu_static(tensor_shape, dtype) +class TestLUAPIError(unittest.TestCase): + def test_errors(self): + with paddle.fluid.dygraph.guard(): + # The size of input in lu should not be 0. 
+ def test_0_size(): + array = np.array([], dtype=np.float32) + x = paddle.to_tensor( + np.reshape(array, [0, 0, 0]), dtype='float32' + ) + paddle.linalg.lu(x, get_infos=True) + + self.assertRaises(ValueError, test_0_size) + + if __name__ == "__main__": paddle.enable_static() unittest.main() From 520f48d6367b519aa5c3ec0d4073fdc380c458b1 Mon Sep 17 00:00:00 2001 From: zhangyikun02 <48021248+zhangyk0314@users.noreply.github.com> Date: Wed, 1 Feb 2023 14:49:29 +0800 Subject: [PATCH 65/89] support grid_sampler_grad op for XPU (#49857) --- paddle/phi/backends/xpu/xpu2_op_list.cc | 1 + .../kernels/xpu/grid_sample_grad_kernel.cc | 87 +++++++++++++++++++ .../fluid/tests/unittests/op_test_xpu.py | 7 +- .../unittests/xpu/test_grid_sampler_op_xpu.py | 1 + 4 files changed, 95 insertions(+), 1 deletion(-) create mode 100644 paddle/phi/kernels/xpu/grid_sample_grad_kernel.cc diff --git a/paddle/phi/backends/xpu/xpu2_op_list.cc b/paddle/phi/backends/xpu/xpu2_op_list.cc index 67ac2b17a7094..b3635652ffd10 100644 --- a/paddle/phi/backends/xpu/xpu2_op_list.cc +++ b/paddle/phi/backends/xpu/xpu2_op_list.cc @@ -313,6 +313,7 @@ XPUOpMap& get_kl2_ops() { XPUKernelSet({phi::DataType::INT64, phi::DataType::INT32, phi::DataType::FLOAT32})}, + {"grid_sampler_grad", XPUKernelSet({phi::DataType::FLOAT32})}, {"grid_sampler", XPUKernelSet({phi::DataType::FLOAT32})}, {"hard_sigmoid_grad", XPUKernelSet({phi::DataType::FLOAT32})}, {"hard_sigmoid", XPUKernelSet({phi::DataType::FLOAT32})}, diff --git a/paddle/phi/kernels/xpu/grid_sample_grad_kernel.cc b/paddle/phi/kernels/xpu/grid_sample_grad_kernel.cc new file mode 100644 index 0000000000000..86e78b4b15cf9 --- /dev/null +++ b/paddle/phi/kernels/xpu/grid_sample_grad_kernel.cc @@ -0,0 +1,87 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
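The kernel added below registers the float32 gradient of grid_sample for XPU. As a rough end-to-end sketch of the path it enables (illustrative only, not part of the patch; it assumes an XPU build where the 'xpu' device string is available):

import paddle
import paddle.nn.functional as F

paddle.set_device('xpu')                   # assumption: XPU build/device present
x = paddle.rand([2, 3, 8, 8])              # NCHW input; the kernel only accepts 4-D
grid = paddle.rand([2, 5, 5, 2]) * 2 - 1   # sampling locations in [-1, 1]
x.stop_gradient = False
grid.stop_gradient = False

out = F.grid_sample(
    x, grid, mode='bilinear', padding_mode='zeros', align_corners=True
)
out.sum().backward()                       # now dispatches to the XPU grad kernel
print(x.grad.shape, grid.grad.shape)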
+ +#include "paddle/phi/kernels/grid_sample_grad_kernel.h" + +#include "paddle/phi/backends/xpu/enforce_xpu.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void GridSampleGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& grid, + const DenseTensor& out_grid, + const std::string& mode, + const std::string& padding_mode, + bool align_corners, + DenseTensor* x_grad, + DenseTensor* grid_grad) { + PADDLE_ENFORCE_EQ( + x.dims().size(), + 4, + phi::errors::InvalidArgument( + ("XPU is only support input_dims == 4 in grid_sample_grad op."))); + + const int64_t n = grid.dims()[0]; + const int64_t out_h = grid.dims()[1]; + const int64_t out_w = grid.dims()[2]; + const int64_t c = x.dims()[1]; + const int64_t in_h = x.dims()[2]; + const int64_t in_w = x.dims()[3]; + + x_grad->Resize({n, c, in_h, in_w}); + T* x_grad_ptr = dev_ctx.template Alloc(x_grad); + + T* grid_grad_ptr = nullptr; + if (grid_grad != nullptr) { + grid_grad->Resize({n, out_h, out_w, 2}); + grid_grad_ptr = dev_ctx.template Alloc(grid_grad); + } + + bool is_nearest = false; + if (mode == "nearest") { + is_nearest = true; + } + int64_t padding_mode_type = 0; + if (padding_mode == "border") { + padding_mode_type = 1; + } else if (padding_mode == "reflection") { + padding_mode_type = 2; + } + + int r = xpu::grid_sample_grad(dev_ctx.x_context(), + x.data(), + grid.data(), + out_grid.data(), + x_grad_ptr, + grid_grad_ptr, + n, + c, + in_h, + in_w, + out_h, + out_w, + is_nearest, + align_corners, + padding_mode_type, + true); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "grid_sample_grad"); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + grid_sample_grad, XPU, ALL_LAYOUT, phi::GridSampleGradKernel, float) {} diff --git a/python/paddle/fluid/tests/unittests/op_test_xpu.py b/python/paddle/fluid/tests/unittests/op_test_xpu.py index a7cb066db7a1d..c6a76c55635dd 100644 --- a/python/paddle/fluid/tests/unittests/op_test_xpu.py +++ b/python/paddle/fluid/tests/unittests/op_test_xpu.py @@ -35,6 +35,7 @@ def setUpClass(cls): '''Fix random seeds to remove randomness from tests''' cls.use_xpu = True cls.use_mkldnn = False + cls.epsilon_xpu2xpu = 0.00000001 super().setUpClass() @classmethod @@ -212,7 +213,11 @@ def check_grad_with_place( user_defined_grad_outputs=user_defined_grad_outputs, ) self._assert_is_close( - a1, a2, inputs_to_check, 0.00000001, "Gradient Check On two xpu" + a1, + a2, + inputs_to_check, + self.epsilon_xpu2xpu, + "Gradient Check On two xpu", ) self._assert_is_close( a1, diff --git a/python/paddle/fluid/tests/unittests/xpu/test_grid_sampler_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_grid_sampler_op_xpu.py index 3b2deaf4396bb..c92ddc9531b21 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_grid_sampler_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_grid_sampler_op_xpu.py @@ -170,6 +170,7 @@ def setUp(self): self.place = paddle.XPUPlace(0) self.init_dtype() self.op_type = 'grid_sampler' + self.epsilon_xpu2xpu = 0.000001 self.use_cudnn = False self.align_corners = True From 5349b9b9b3fce86204686dcf9c6bb327d56ba4a5 Mon Sep 17 00:00:00 2001 From: ZZK <359521840@qq.com> Date: Wed, 1 Feb 2023 15:11:31 +0800 Subject: [PATCH 66/89] bump isort version to 5.11.5 (#50126) --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index bd09715e0a707..8168824189643 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -61,7 +61,7 @@ repos: - id: black files: 
(.*\.(py|pyi|bzl)|BUILD|.*\.BUILD|WORKSPACE)$ - repo: https://github.com/pycqa/isort - rev: 5.10.1 + rev: 5.11.5 hooks: - id: isort - repo: https://github.com/PyCQA/flake8 From c62657b3eccf2fa41e12c94b5a34d82a6f54890f Mon Sep 17 00:00:00 2001 From: RedContritio Date: Wed, 1 Feb 2023 15:15:12 +0800 Subject: [PATCH 67/89] Fix Python IndexError of case9: paddle.static.nn.deform_conv2d (#49990) * add dimension check for deformable_conv * add unittest --- .../tests/unittests/test_deform_conv2d.py | 18 ++++++++++++++++++ python/paddle/static/nn/common.py | 5 +++++ 2 files changed, 23 insertions(+) diff --git a/python/paddle/fluid/tests/unittests/test_deform_conv2d.py b/python/paddle/fluid/tests/unittests/test_deform_conv2d.py index 7a5093f872ced..d484e140b6e1d 100644 --- a/python/paddle/fluid/tests/unittests/test_deform_conv2d.py +++ b/python/paddle/fluid/tests/unittests/test_deform_conv2d.py @@ -726,5 +726,23 @@ def setUp(self): self.no_bias = False +class TestDeformConv2DError(unittest.TestCase): + def test_input_error(self): + def test_input_rank_error(): + paddle.enable_static() + x = paddle.static.data(name='error_x_1', shape=[0], dtype='float32') + offset = paddle.static.data( + name='error_offset_1', shape=[0], dtype='float32' + ) + mask = paddle.static.data( + name='error_mask_1', shape=[0, 0, 0], dtype='float32' + ) + out = paddle.static.nn.deform_conv2d( + x, offset, mask, 0, 0, deformable_groups=0 + ) + + self.assertRaises(ValueError, test_input_rank_error) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/static/nn/common.py b/python/paddle/static/nn/common.py index c43385a8e9140..1581f299214df 100644 --- a/python/paddle/static/nn/common.py +++ b/python/paddle/static/nn/common.py @@ -2244,6 +2244,11 @@ def deformable_conv( mask, 'mask', (paddle.static.Variable, type(None)), 'deformable_conv' ) + if input.ndim != 4: + raise ValueError( + f'The input should be of [N, C, H, W] format, but received {input.shape}' + ) + num_channels = input.shape[1] assert param_attr is not False, "param_attr should not be False here." From 9fa2eb387429c25f2ccbdf6969ab8886ed5ef6dc Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Wed, 1 Feb 2023 15:20:42 +0800 Subject: [PATCH 68/89] jit layer support multi thread and fix predictor clone (#50095) * jit layer support multi thread * fix bug * clone prediector not do graph optimizer * format * fix comment and format * fix override and fromat * fix * fix --- .../fluid/inference/api/analysis_predictor.cc | 3 + paddle/fluid/jit/compilation_unit.cc | 8 +++ paddle/fluid/jit/compilation_unit.h | 2 + paddle/fluid/jit/engine/base_engine.h | 2 + paddle/fluid/jit/engine/interpreter_engine.cc | 9 ++- paddle/fluid/jit/engine/interpreter_engine.h | 8 ++- paddle/fluid/jit/engine/predictor_engine.cc | 17 ++++++ paddle/fluid/jit/engine/predictor_engine.h | 13 ++++- paddle/fluid/jit/layer.cc | 12 +++- paddle/fluid/jit/layer.h | 3 + paddle/fluid/jit/layer_test.cc | 56 +++++++++++++++++++ paddle/fluid/jit/serializer.cc | 2 + 12 files changed, 128 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index e89bcfa2c6a99..49e18f95000ff 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -1086,6 +1086,7 @@ bool AnalysisPredictor::GetFetch(std::vector *outputs, } void AnalysisPredictor::PrepareArgument() { + VLOG(3) << "AnalysisPredictor::PrepareArgument"; // Init std::unique_ptr argument_. 
argument_.reset(new Argument); argument_->SetUseGPU(config_.use_gpu()); @@ -2246,10 +2247,12 @@ AnalysisPredictor::~AnalysisPredictor() { } std::unique_ptr AnalysisPredictor::Clone(void *stream) { + VLOG(3) << "AnalysisPredictor::Clone"; std::lock_guard lk(clone_mutex_); auto *x = new AnalysisPredictor(config_); x->status_is_cloned_ = true; x->root_predictor_id_ = this->root_predictor_id_; + x->config_.apply_optim_ = false; if (config_.use_external_stream_ && stream == nullptr) { PADDLE_THROW(platform::errors::InvalidArgument( "config has been configured to use external stream, but the Clone " diff --git a/paddle/fluid/jit/compilation_unit.cc b/paddle/fluid/jit/compilation_unit.cc index 0f241d864fe07..1a2351048f90a 100644 --- a/paddle/fluid/jit/compilation_unit.cc +++ b/paddle/fluid/jit/compilation_unit.cc @@ -38,5 +38,13 @@ void CompilationUnit::SetEngine(const std::string &name, const jit::EngineMap &CompilationUnit::EngineMap() const { return engine_map_; } +std::shared_ptr CompilationUnit::Clone(void *stream) { + auto x = std::make_shared(); + for (auto &it : engine_map_) { + x->SetEngine(it.first, std::move(it.second->Clone(stream))); + } + return x; +} + } // namespace jit } // namespace paddle diff --git a/paddle/fluid/jit/compilation_unit.h b/paddle/fluid/jit/compilation_unit.h index b862faa23f978..25e725fe57b9e 100644 --- a/paddle/fluid/jit/compilation_unit.h +++ b/paddle/fluid/jit/compilation_unit.h @@ -36,6 +36,8 @@ class CompilationUnit { const jit::EngineMap &EngineMap() const; + std::shared_ptr Clone(void *stream = nullptr); + private: jit::EngineMap engine_map_; }; diff --git a/paddle/fluid/jit/engine/base_engine.h b/paddle/fluid/jit/engine/base_engine.h index eaf3c1221c8a2..b6571d7ebdd41 100644 --- a/paddle/fluid/jit/engine/base_engine.h +++ b/paddle/fluid/jit/engine/base_engine.h @@ -29,6 +29,8 @@ class BaseEngine { virtual std::vector operator()(const std::vector &inputs) = 0; + virtual std::unique_ptr Clone(void *stream = nullptr) = 0; + virtual ~BaseEngine() {} }; diff --git a/paddle/fluid/jit/engine/interpreter_engine.cc b/paddle/fluid/jit/engine/interpreter_engine.cc index 410fd4dc01bed..36f8a2271d1ef 100644 --- a/paddle/fluid/jit/engine/interpreter_engine.cc +++ b/paddle/fluid/jit/engine/interpreter_engine.cc @@ -28,14 +28,14 @@ namespace jit { InterpreterEngine::InterpreterEngine(const std::shared_ptr &info, const VariableMap ¶ms_dict, const phi::Place &place) - : info_(info), place_(place) { + : info_(info), params_dict_(params_dict), place_(place) { info_->RemoveDescFeedFetch(); PADDLE_ENFORCE_GT( static_cast(info_->ProgramDesc().Block(0).OpSize()), 0, platform::errors::PreconditionNotMet( "There is no operator in ProgramDesc.")); - utils::ShareParamsIntoScope(info_->ParamNames(), params_dict, &scope_); + utils::ShareParamsIntoScope(info_->ParamNames(), params_dict_, &scope_); VLOG(6) << framework::GenScopeTreeDebugInfo(&scope_); CreateInterpreterCore(); } @@ -98,5 +98,10 @@ const std::shared_ptr &InterpreterEngine::Info() const { return info_; } +std::unique_ptr InterpreterEngine::Clone(void *stream) { + auto *x = new InterpreterEngine(info_, params_dict_, place_); + return std::unique_ptr(x); +} + } // namespace jit } // namespace paddle diff --git a/paddle/fluid/jit/engine/interpreter_engine.h b/paddle/fluid/jit/engine/interpreter_engine.h index 8c7f43f297d22..d7aa5d610a50e 100644 --- a/paddle/fluid/jit/engine/interpreter_engine.h +++ b/paddle/fluid/jit/engine/interpreter_engine.h @@ -43,14 +43,18 @@ class InterpreterEngine : public BaseEngine { void 
CreateInterpreterCore(); - std::vector operator()(const std::vector &inputs); + std::vector operator()(const std::vector &inputs) override; - std::vector operator()(const std::vector &inputs); + std::vector operator()( + const std::vector &inputs) override; const std::shared_ptr &Info() const; + std::unique_ptr Clone(void *stream = nullptr) override; + private: std::shared_ptr info_; + VariableMap params_dict_; framework::Scope scope_; phi::Place place_; std::shared_ptr inner_interpreter_; diff --git a/paddle/fluid/jit/engine/predictor_engine.cc b/paddle/fluid/jit/engine/predictor_engine.cc index 6a44c192c16f7..bac6f993b04f6 100644 --- a/paddle/fluid/jit/engine/predictor_engine.cc +++ b/paddle/fluid/jit/engine/predictor_engine.cc @@ -55,6 +55,17 @@ PredictorEngine::PredictorEngine(const std::shared_ptr &info, scope_, std::make_shared(info_->ProgramDesc())); } +PredictorEngine::PredictorEngine( + const std::shared_ptr &info, + const std::shared_ptr &scope, + const phi::Place &place, + const std::shared_ptr &predictor) + : info_(info), + scope_(scope), + place_(place), + predictor_(std::dynamic_pointer_cast( + predictor)) {} + std::vector PredictorEngine::operator()( const std::vector &inputs) { auto dense_tensors = utils::ToDenseTensors(inputs); @@ -188,5 +199,11 @@ static bool PaddleTensorToDenseTensor(const PaddleTensor &pt, return true; } +std::unique_ptr PredictorEngine::Clone(void *stream) { + auto *x = new PredictorEngine( + info_, scope_, place_, std::move(predictor_->Clone(stream))); + return std::unique_ptr(x); +} + } // namespace jit } // namespace paddle diff --git a/paddle/fluid/jit/engine/predictor_engine.h b/paddle/fluid/jit/engine/predictor_engine.h index 026b012cbfb02..ad07a7a7ffbf5 100644 --- a/paddle/fluid/jit/engine/predictor_engine.h +++ b/paddle/fluid/jit/engine/predictor_engine.h @@ -20,6 +20,7 @@ namespace paddle { class AnalysisPredictor; +class PaddlePredictor; namespace framework { class Scope; @@ -33,11 +34,19 @@ class PredictorEngine : public BaseEngine { const VariableMap ¶ms_dict, const phi::Place &place); + PredictorEngine(const std::shared_ptr &info, + const std::shared_ptr &scope, + const phi::Place &place, + const std::shared_ptr &predictor); + ~PredictorEngine() noexcept {} - std::vector operator()(const std::vector &inputs); + std::vector operator()(const std::vector &inputs) override; + + std::vector operator()( + const std::vector &inputs) override; - std::vector operator()(const std::vector &inputs); + std::unique_ptr Clone(void *stream = nullptr) override; private: std::shared_ptr info_; diff --git a/paddle/fluid/jit/layer.cc b/paddle/fluid/jit/layer.cc index 75a7e282e6be8..332c53a8e3649 100644 --- a/paddle/fluid/jit/layer.cc +++ b/paddle/fluid/jit/layer.cc @@ -30,7 +30,10 @@ Layer::Layer(const VariableMap& params_map, const VariableMap& attrs_map, const FunctionInfoMap& info_map, const phi::Place& place) - : params_map_(params_map), attrs_map_(attrs_map), info_map_(info_map) { + : params_map_(params_map), + attrs_map_(attrs_map), + info_map_(info_map), + place_(place) { unit_.reset(new CompilationUnit()); } @@ -94,5 +97,12 @@ PD_SPECIALZE_ATTRIBUTE_TYPE(std::vector) PD_SPECIALZE_ATTRIBUTE_TYPE(std::vector) PD_SPECIALZE_ATTRIBUTE_TYPE(std::vector) +std::shared_ptr Layer::Clone(void* stream) { + std::shared_ptr x = + std::make_shared(params_map_, attrs_map_, info_map_, place_); + x->unit_ = unit_->Clone(stream); + return x; +} + } // namespace jit } // namespace paddle diff --git a/paddle/fluid/jit/layer.h b/paddle/fluid/jit/layer.h index 
dd5ff5d9f91cd..ed8b739a0b72f 100644 --- a/paddle/fluid/jit/layer.h +++ b/paddle/fluid/jit/layer.h @@ -67,10 +67,13 @@ class Layer { std::vector FunctionNames() const; + std::shared_ptr Clone(void* stream = nullptr); + private: VariableMap params_map_; VariableMap attrs_map_; FunctionInfoMap info_map_; + phi::Place place_; std::shared_ptr unit_; }; diff --git a/paddle/fluid/jit/layer_test.cc b/paddle/fluid/jit/layer_test.cc index 4e367d8cc1b51..c163f3c50d9dd 100644 --- a/paddle/fluid/jit/layer_test.cc +++ b/paddle/fluid/jit/layer_test.cc @@ -20,6 +20,7 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/platform/timer.h" #include "paddle/phi/api/include/api.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/kernel_registry.h" @@ -78,7 +79,11 @@ TEST(CpuLayerTest, Function) { TEST(CpuLayerTest, Construct) { auto place = phi::CPUPlace(); std::string path = "./multi_program_load/export"; + paddle::platform::Timer timer; + timer.Start(); auto layer = jit::Load(path, place); + timer.Pause(); + std::cout << "jit::Load coast" << timer.ElapsedMS() << std::endl; float fbias = layer.Attribute("fbias"); EXPECT_FLOAT_EQ(fbias, 1.4); @@ -119,6 +124,41 @@ TEST(CpuLayerTest, Construct) { EXPECT_NEAR(out_data[0], pow(1.41562390, 2.0), 1e-6); } +TEST(CpuLayerTest, Clone) { + auto place = phi::CPUPlace(); + std::string path = "./multi_program_load/export"; + + paddle::platform::Timer timer; + timer.Start(); + auto layer = jit::Load(path, place); + timer.Pause(); + std::cout << "jit::Load cost " << timer.ElapsedMS() << " ms" << std::endl; + + timer.Start(); + auto layer2 = layer.Clone(); + timer.Pause(); + std::cout << "jit::Layer::Clone cost " << timer.ElapsedMS() << " ms" + << std::endl; + + float fbias = layer2->Attribute("fbias"); + EXPECT_FLOAT_EQ(fbias, 1.4); + + auto inputs = PrepareInputs(place); + auto outs = layer2->forward(inputs); + auto out_data = outs[0].data(); + EXPECT_NEAR(out_data[0], 0.02194316, 1e-6); + + auto func = layer2->Function("infer"); + EXPECT_TRUE(func.IsValid()); + outs = func(inputs); + out_data = outs[0].data(); + EXPECT_NEAR(out_data[0], 1.41562390, 1e-6); + auto pow_out = + paddle::experimental::pow(outs[0], paddle::experimental::Scalar(2)); + out_data = pow_out.data(); + EXPECT_NEAR(out_data[0], pow(1.41562390, 2.0), 1e-6); +} + #if defined(PADDLE_WITH_CUDA) TEST(GpuLayerTest, Construct) { auto place = phi::GPUPlace(); @@ -147,6 +187,22 @@ TEST(GpuLayerTest, Construct) { out_data = cpu_tensor.data(); EXPECT_NEAR(out_data[0], sqrt(1.41562390), 1e-6); } + +TEST(GpuLayerTest, Clone) { + auto place = phi::GPUPlace(); + + std::string path = "./multi_program_load/export"; + auto layer = jit::Load(path, place); + auto inputs = PrepareInputs(place); + + auto layer2 = layer.Clone(); + auto outs = layer2->forward(inputs); + auto gpu_tensor = outs[0]; + auto cpu_tensor = + paddle::experimental::copy_to(gpu_tensor, phi::CPUPlace(), true); + auto out_data = cpu_tensor.data(); + EXPECT_NEAR(out_data[0], 0.02194316, 1e-6); +} #endif } // namespace jit diff --git a/paddle/fluid/jit/serializer.cc b/paddle/fluid/jit/serializer.cc index 0a7fdc0e3525a..436717a8dc389 100644 --- a/paddle/fluid/jit/serializer.cc +++ b/paddle/fluid/jit/serializer.cc @@ -30,8 +30,10 @@ DECLARE_string(jit_engine_type); namespace paddle { namespace jit { + using FunctionInfoMap = std::unordered_map>; + Layer Deserializer::operator()(const std::string& path, const phi::Place& place) { const auto& pdmodel_paths = 
utils::PdmodelFilePaths(path); From e03718f5b0049c7cddc4729d3aa786398c6873ce Mon Sep 17 00:00:00 2001 From: Wang Bojun <105858416+wwbitejotunn@users.noreply.github.com> Date: Wed, 1 Feb 2023 15:22:31 +0800 Subject: [PATCH 69/89] Preln fix (#49802) * preln_residual 2 fused_bias_residual * skip layernorm fix and ut * code refine * code style refine * fix ut * fix output * add trt layer fall back info * refine op teller and ut * DropoutMaskOut output fix --- .../ir/preln_residual_bias_fuse_pass.cc | 51 +++++++++++++++---- .../ir/trt_skip_layernorm_fuse_pass.cc | 2 +- .../fluid/inference/api/analysis_predictor.cc | 2 +- .../tensorrt/convert/preln_residual_bias.cc | 27 +++++----- paddle/fluid/inference/tensorrt/op_teller.cc | 20 ++++++-- ...sed_bias_dropout_residual_layer_norm_op.cc | 9 ++-- ...sed_bias_dropout_residual_layer_norm_op.cu | 8 ++- .../fused_layernorm_residual_dropout_bias.h | 7 +-- .../unittests/ir/inference/CMakeLists.txt | 9 ---- .../test_trt_convert_preln_residual_bias.py | 15 +++++- ...test_trt_convert_preln_residual_no_bias.py | 16 +++++- .../test_ir_preln_residual_bias_fuse_pass.py | 4 +- 12 files changed, 117 insertions(+), 53 deletions(-) diff --git a/paddle/fluid/framework/ir/preln_residual_bias_fuse_pass.cc b/paddle/fluid/framework/ir/preln_residual_bias_fuse_pass.cc index 13b7b4ac72f96..48baf1f4b102f 100644 --- a/paddle/fluid/framework/ir/preln_residual_bias_fuse_pass.cc +++ b/paddle/fluid/framework/ir/preln_residual_bias_fuse_pass.cc @@ -129,6 +129,24 @@ void PrelnResidualBias::operator()(PDNode *x, PDNode *y) { } // namespace patterns +void setIntermediateOut(OpDesc *desc, + const std::string &out_name, + const std::string &scope_name) { + std::string new_name = scope_name + "/at." + out_name + ".new"; + desc->SetOutput(out_name, {new_name}); +} + +void addIntermediateOut(Node *op_node, + const std::string &out_name, + const std::string &scope_name, + Graph *graph) { + std::string new_name = scope_name + "/at." + out_name + ".new"; + VarDesc out_var(new_name); + out_var.SetPersistable(false); + auto *node_var = graph->CreateVarNode(&out_var); + IR_NODE_LINK_TO(op_node, node_var); +} + int PrelnResidualBiasFusePass::ApplyPattern(ir::Graph *graph, bool with_bias) const { PADDLE_ENFORCE_NOT_NULL( @@ -207,7 +225,7 @@ int PrelnResidualBiasFusePass::ApplyPattern(ir::Graph *graph, // on each other, so we make below check to ensure only one // PrelnResidualBias pattern is delalted with. 
for (auto op : elementwise1_out->inputs) { - if (op->Name() == "preln_residual_bias") return; + if (op->Name() == "fused_bias_dropout_residual_layer_norm") return; } if (!IsCompat(subgraph, graph)) { @@ -218,31 +236,37 @@ int PrelnResidualBiasFusePass::ApplyPattern(ir::Graph *graph, std::unordered_set del_node_set; // Create an PrelnResidualBias op node OpDesc new_desc; - new_desc.SetType("preln_residual_bias"); + new_desc.SetType("fused_bias_dropout_residual_layer_norm"); // inputs new_desc.SetInput("X", {subgraph.at(x)->Name()}); - new_desc.SetInput("Y", {subgraph.at(y)->Name()}); - new_desc.SetInput("Scale", {layer_norm_scale->Name()}); - new_desc.SetInput("Bias", {layer_norm_bias->Name()}); + new_desc.SetInput("Residual", {subgraph.at(y)->Name()}); + new_desc.SetInput("LnScale", {layer_norm_scale->Name()}); + new_desc.SetInput("LnBias", {layer_norm_bias->Name()}); if (with_bias) { - new_desc.SetInput("EleBias", {elementwise_bias->Name()}); + new_desc.SetInput("Bias", {elementwise_bias->Name()}); } // outputs - new_desc.SetOutput("Out_0", {layer_norm_out->Name()}); - new_desc.SetOutput("Out_1", {elementwise1_out->Name()}); + new_desc.SetOutput("Y", {layer_norm_out->Name()}); + new_desc.SetOutput("BiasDropoutResidualOut", {elementwise1_out->Name()}); + new_desc.SetOutput("LnMean", {layer_norm_mean->Name()}); + new_desc.SetOutput("LnVariance", {layer_norm_variance->Name()}); + setIntermediateOut(&new_desc, "DropoutMaskOut", "preln_residual_bias_fuse"); // attrs - new_desc.SetAttr("epsilon", layer_norm->Op()->GetAttr("epsilon")); + new_desc.SetAttr("ln_epsilon", layer_norm->Op()->GetAttr("epsilon")); + new_desc.SetAttr("dropout_rate", 0.0f); + new_desc.SetAttr("is_test", true); new_desc.SetAttr("begin_norm_axis", layer_norm->Op()->GetAttr("begin_norm_axis")); auto fused_node = graph->CreateOpNode(&new_desc); // OpDesc will be copied. 
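To make the rewiring above easier to follow, the matched subgraph corresponds roughly to the eager-mode computation sketched below (illustrative names, not part of the pass); after this pass it is served by a single fused_bias_dropout_residual_layer_norm op whose dropout_rate is fixed to 0:

import paddle
import paddle.nn.functional as F

hidden = 768
x = paddle.rand([4, 128, hidden])          # e.g. a projection output
residual = paddle.rand([4, 128, hidden])
ele_bias = paddle.zeros([hidden])
ln_scale = paddle.ones([hidden])
ln_bias = paddle.zeros([hidden])

pre_ln = x + ele_bias + residual           # elementwise0 + elementwise1 in the pattern
y = F.layer_norm(pre_ln, hidden, ln_scale, ln_bias, epsilon=1e-5)
# The fused op produces both `y` (output "Y") and `pre_ln`
# (output "BiasDropoutResidualOut"), matching the two outputs linked below.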
+ addIntermediateOut( + fused_node, "DropoutMaskOut", "preln_residual_bias_fuse", graph); + if (with_bias) { del_node_set.insert(elementwise0); del_node_set.insert(elementwise0_out); } del_node_set.insert(elementwise1); del_node_set.insert(layer_norm); - del_node_set.insert(layer_norm_mean); - del_node_set.insert(layer_norm_variance); GraphSafeRemoveNodes(graph, del_node_set); IR_NODE_LINK_TO(subgraph.at(x), fused_node); IR_NODE_LINK_TO(subgraph.at(y), fused_node); @@ -253,6 +277,9 @@ int PrelnResidualBiasFusePass::ApplyPattern(ir::Graph *graph, IR_NODE_LINK_TO(layer_norm_bias, fused_node); IR_NODE_LINK_TO(fused_node, layer_norm_out); IR_NODE_LINK_TO(fused_node, elementwise1_out); + IR_NODE_LINK_TO(fused_node, layer_norm_mean); + IR_NODE_LINK_TO(fused_node, layer_norm_variance); + found_subgraph_count++; }; @@ -261,6 +288,8 @@ int PrelnResidualBiasFusePass::ApplyPattern(ir::Graph *graph, } void PrelnResidualBiasFusePass::ApplyImpl(ir::Graph *graph) const { + VLOG(1) << "Fuse PrelnResidualBias into " + "fused_bias_dropout_residual_layer_norm op with dropout rate = 0"; PADDLE_ENFORCE_NOT_NULL( graph, platform::errors::PreconditionNotMet("graph should not be null.")); FusePassBase::Init("preln_residual_bias_fuse", graph); diff --git a/paddle/fluid/framework/ir/trt_skip_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/trt_skip_layernorm_fuse_pass.cc index db023746ac4c7..18ea8850dc5bf 100644 --- a/paddle/fluid/framework/ir/trt_skip_layernorm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/trt_skip_layernorm_fuse_pass.cc @@ -170,7 +170,7 @@ void TrtSkipLayerNormFusePass::ApplyImpl(ir::Graph *graph) const { // attrs new_desc.SetAttr("epsilon", layer_norm->Op()->GetAttr("epsilon")); - if (new_desc.HasAttr("begin_norm_axis")) { + if (layer_norm->Op()->HasAttr("begin_norm_axis")) { int32_t begin_norm_axis = PADDLE_GET_CONST( int32_t, layer_norm->Op()->GetAttr("begin_norm_axis")); int32_t input_rank = diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 49e18f95000ff..bd49153f6b85e 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -2464,7 +2464,7 @@ USE_TRT_CONVERTER(rsqrt); USE_TRT_CONVERTER(fused_preln_embedding_eltwise_layernorm) USE_TRT_CONVERTER(fused_embedding_eltwise_layernorm); USE_TRT_CONVERTER(preln_skip_layernorm) -USE_TRT_CONVERTER(preln_residual_bias) +USE_TRT_CONVERTER(fused_bias_dropout_residual_layer_norm) USE_TRT_CONVERTER(c_allreduce_sum) USE_TRT_CONVERTER(roll) USE_TRT_CONVERTER(strided_slice) diff --git a/paddle/fluid/inference/tensorrt/convert/preln_residual_bias.cc b/paddle/fluid/inference/tensorrt/convert/preln_residual_bias.cc index 28847aa5b7a30..85f9106b01148 100644 --- a/paddle/fluid/inference/tensorrt/convert/preln_residual_bias.cc +++ b/paddle/fluid/inference/tensorrt/convert/preln_residual_bias.cc @@ -26,16 +26,12 @@ class PrelnResidualBiasOpConverter : public OpConverter { void operator()(const framework::proto::OpDesc& op, const framework::Scope& scope, bool test_mode) override { - VLOG(4) << "convert fused preln_residual_bias op to tensorrt layer"; - if (!engine_->with_dynamic_shape()) { - PADDLE_THROW( - platform::errors::Fatal("Unsupported static graph mode. 
Please set " - "dynamic shape of inputs.")); - } + VLOG(4) << "convert fused_bias_dropout_residual_layer_norm op with " + "drop_rate = 0 to preln_residual_bias tensorrt layer"; framework::OpDesc op_desc(op, nullptr); // Declare inputs auto* input1 = engine_->GetITensor(op_desc.Input("X")[0]); - auto* input2 = engine_->GetITensor(op_desc.Input("Y")[0]); + auto* input2 = engine_->GetITensor(op_desc.Input("Residual")[0]); std::vector inputs; inputs.push_back(input1); inputs.push_back(input2); @@ -50,18 +46,18 @@ class PrelnResidualBiasOpConverter : public OpConverter { return temp_data; }; framework::DDim bias_dims, scale_dims, ele_bias_dims; - auto* bias = get_persistable_data("Bias", &bias_dims); - auto* scale = get_persistable_data("Scale", &scale_dims); + auto* bias = get_persistable_data("LnBias", &bias_dims); + auto* scale = get_persistable_data("LnScale", &scale_dims); auto const& vars = op_desc.Inputs(false); - bool has_bias = vars.find("EleBias") != vars.end(); + bool has_bias = vars.find("Bias") != vars.end(); float* ele_bias = - has_bias ? get_persistable_data("EleBias", &ele_bias_dims) : nullptr; + has_bias ? get_persistable_data("Bias", &ele_bias_dims) : nullptr; int bias_size = phi::product(bias_dims); int scale_size = phi::product(scale_dims); int ele_bias_size = has_bias ? phi::product(ele_bias_dims) : 0; - float epsilon = PADDLE_GET_CONST(float, op_desc.GetAttr("epsilon")); + float epsilon = PADDLE_GET_CONST(float, op_desc.GetAttr("ln_epsilon")); bool with_fp16 = engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); if (engine_->precision() == AnalysisConfig::Precision::kInt8) { with_fp16 = true; @@ -102,8 +98,8 @@ class PrelnResidualBiasOpConverter : public OpConverter { plugin_inputs.emplace_back(input2); layer = engine_->AddDynamicPlugin(plugin_inputs.data(), 2, plugin); std::vector output_names; - output_names.push_back(op_desc.Output("Out_0")[0]); - output_names.push_back(op_desc.Output("Out_1")[0]); + output_names.push_back(op_desc.Output("Y")[0]); + output_names.push_back(op_desc.Output("BiasDropoutResidualOut")[0]); RreplenishLayerAndOutput( layer, "preln_residual_bias", output_names, test_mode); } @@ -113,4 +109,5 @@ class PrelnResidualBiasOpConverter : public OpConverter { } // namespace inference } // namespace paddle -REGISTER_TRT_OP_CONVERTER(preln_residual_bias, PrelnResidualBiasOpConverter); +REGISTER_TRT_OP_CONVERTER(fused_bias_dropout_residual_layer_norm, + PrelnResidualBiasOpConverter); diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 0075c64759333..e9c34408bb6bf 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -1495,7 +1495,21 @@ struct SimpleOpTypeSetTeller : public Teller { return false; } } - + if (op_type == "fused_bias_dropout_residual_layer_norm") { + if (!with_dynamic_shape) { + VLOG(3) << "fused_bias_dropout_residual_layer_norm should run on " + "dynamic shape mode."; + return false; + } + float dropout_rate = + PADDLE_GET_CONST(float, desc.GetAttr("dropout_rate")); + if (dropout_rate != 0.0f) { + VLOG(4) << "preln_residual_bias trt layer can not work with " + "fused_bias_dropout_residual_layer_norm op in which the " + "dropout_rate != 0, stop convert"; + return false; + } + } if (op_type == "fused_preln_embedding_eltwise_layernorm") { if (!with_dynamic_shape) { VLOG(3) << "fused_preln_embedding_eltwise_layernorm should run on " @@ -2594,7 +2608,7 @@ struct SimpleOpTypeSetTeller : public Teller { "slice", 
"strided_slice", "fused_preln_embedding_eltwise_layernorm", - "preln_residual_bias", + "fused_bias_dropout_residual_layer_norm", "c_allreduce_sum", "c_allreduce_min", "c_allreduce_max", @@ -2744,7 +2758,7 @@ struct SimpleOpTypeSetTeller : public Teller { "strided_slice", "fused_preln_embedding_eltwise_layernorm", "preln_skip_layernorm", - "preln_residual_bias", + "fused_bias_dropout_residual_layer_norm", "c_allreduce_sum", "c_allreduce_min", "c_allreduce_max", diff --git a/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cc b/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cc index a6fa80a493972..7f877867050ed 100644 --- a/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cc +++ b/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cc @@ -35,16 +35,17 @@ class FusedBiasDropoutResidualLnOp : public framework::OperatorWithKernel { "Output", "LnVariance", "FusedBiasDropoutResidualLnOp"); - OP_INOUT_CHECK(ctx->HasOutput("BiasDropoutResidualOut"), - "Output", - "BiasDropoutResidualOut", - "FusedBiasDropoutResidualLnOp"); OP_INOUT_CHECK(ctx->HasOutput("DropoutMaskOut"), "Output", "DropoutMaskOut", "FusedBiasDropoutResidualLnOp"); + OP_INOUT_CHECK(ctx->HasOutput("BiasDropoutResidualOut"), + "Output", + "BiasDropoutResidualOut", + "FusedBiasDropoutResidualLnOp"); OP_INOUT_CHECK( ctx->HasOutput("Y"), "Output", "Y", "FusedBiasDropoutResidualLnOp"); + auto x_dim = ctx->GetInputDim("X"); int left = 1; for (int i = 0; i < x_dim.size() - 1; i++) { diff --git a/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cu b/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cu index 2562c2cc22575..01a233950b279 100644 --- a/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cu +++ b/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cu @@ -54,8 +54,12 @@ class FusedBiasDropoutResidualLnOpKernel : public framework::OpKernel { auto *ln_mean_data = dev_ctx.Alloc(ln_mean, ln_mean->numel() * sizeof(U)); auto *ln_var_data = dev_ctx.Alloc(ln_var, ln_var->numel() * sizeof(U)); - auto *dropout_mask_out_data = dev_ctx.Alloc( - dropout_mask_out, dropout_mask_out->numel() * sizeof(uint8_t)); + auto *dropout_mask_out_data = + (dropout_mask_out == nullptr) + ? 
nullptr + : dev_ctx.Alloc( + dropout_mask_out, + dropout_mask_out->numel() * sizeof(uint8_t)); auto *y_data = dev_ctx.Alloc(y, y->numel() * sizeof(T)); const auto input_x_dims = input_x->dims(); diff --git a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h index c65364d2818d1..0c4e10fa156f9 100644 --- a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h +++ b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h @@ -854,9 +854,10 @@ void LaunchLayernormResidualDropoutBias( residual, rows * cols * sizeof(T), ctx.stream()); - PADDLE_ENFORCE_GPU_SUCCESS(cudaMemsetAsync( - mask_data, 0, rows * cols * sizeof(MaskType), ctx.stream())); - + if (mask_data != nullptr) { + PADDLE_ENFORCE_GPU_SUCCESS(cudaMemsetAsync( + mask_data, 0, rows * cols * sizeof(MaskType), ctx.stream())); + } // call layernorm forward switch (GetDesiredBlockDim(cols)) { FIXED_BLOCK_DIM_CASE( diff --git a/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt b/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt index bdcf6ab951022..2dd35d10d5ff0 100755 --- a/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt @@ -18,15 +18,6 @@ string(REPLACE ".py" "" TEST_TRT_CONVERTER "${TEST_TRT_CONVERTER}") if(NOT WITH_DISTRIBUTE) list(REMOVE_ITEM TEST_INFERENCE_IR_PASSES "test_delete_c_identity_op_pass") - list(REMOVE_ITEM TEST_INFERENCE_IR_PASSES - "test_trt_convert_preln_residual_bias") - list(REMOVE_ITEM TEST_TRT_IR_PASSES "test_trt_convert_preln_residual_bias") - list(REMOVE_ITEM TEST_TRT_CONVERTER "test_trt_convert_preln_residual_bias") - list(REMOVE_ITEM TEST_INFERENCE_IR_PASSES - "test_trt_convert_preln_residual_no_bias") - list(REMOVE_ITEM TEST_TRT_IR_PASSES "test_trt_convert_preln_residual_no_bias") - list(REMOVE_ITEM TEST_TRT_CONVERTER "test_trt_convert_preln_residual_no_bias") - list(REMOVE_ITEM TEST_INFERENCE_IR_PASSES "test_trt_convert_c_allreduce") list(REMOVE_ITEM TEST_TRT_IR_PASSES "test_trt_convert_c_allreduce") list(REMOVE_ITEM TEST_TRT_CONVERTER "test_trt_convert_c_allreduce") diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_preln_residual_bias.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_preln_residual_bias.py index 9e17b83ab9c1e..a45ddfcae189e 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_preln_residual_bias.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_preln_residual_bias.py @@ -158,11 +158,24 @@ def clear_dynamic_shape(): self.dynamic_shape.opt_input_shape = {} def generate_trt_nodes_num(attrs, dynamic_shape): - return 1, 4 + if dynamic_shape: + return 1, 4 + else: + return 0, 5 attrs = [ program_config.ops[i].attrs for i in range(len(program_config.ops)) ] + # for static_shape, fall back to fluid fused op + clear_dynamic_shape() + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False + ), 1e-2 # atol=1e-2 while rtol is 1e-8 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False + ), 1e-2 # atol=1e-2 while rtol is 1e-8 # just support dynamic_shape generate_dynamic_shape(attrs) diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_preln_residual_no_bias.py 
b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_preln_residual_no_bias.py index aef2142bf3e8e..fd3bdb64c7ede 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_preln_residual_no_bias.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_preln_residual_no_bias.py @@ -146,12 +146,26 @@ def clear_dynamic_shape(): self.dynamic_shape.opt_input_shape = {} def generate_trt_nodes_num(attrs, dynamic_shape): - return 1, 4 + if dynamic_shape: + return 1, 4 + else: + return 0, 5 attrs = [ program_config.ops[i].attrs for i in range(len(program_config.ops)) ] + # for static_shape, fall back to fluid fused op + clear_dynamic_shape() + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False + ), 1e-2 # atol=1e-2 while rtol is 1e-8 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False + ), 1e-2 # atol=1e-2 while rtol is 1e-8 + # just support dynamic_shape generate_dynamic_shape(attrs) self.trt_param.precision = paddle_infer.PrecisionType.Float32 diff --git a/python/paddle/fluid/tests/unittests/ir/test_ir_preln_residual_bias_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/test_ir_preln_residual_bias_fuse_pass.py index 8f74ceebb6586..c66ee86453288 100644 --- a/python/paddle/fluid/tests/unittests/ir/test_ir_preln_residual_bias_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/test_ir_preln_residual_bias_fuse_pass.py @@ -38,7 +38,7 @@ def setUp(self): self.fetch_list = [out, elementwise_out] self.pass_names = "preln_residual_bias_fuse_pass" - self.fused_op_type = "preln_residual_bias" + self.fused_op_type = "fused_bias_dropout_residual_layer_norm" self.num_fused_ops = 1 # self.graph_attrs = { # "embedding_eltwise_layernorm_fuse_pass_flag": True, @@ -72,7 +72,7 @@ def setUp(self): self.fetch_list = [out, elementwise_out] self.pass_names = "preln_residual_bias_fuse_pass" - self.fused_op_type = "preln_residual_bias" + self.fused_op_type = "fused_bias_dropout_residual_layer_norm" self.num_fused_ops = 1 def test_check_program(self): From 776021c10278d5e3e714237943022e8118fad526 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=98=A5=E4=B9=94?= <83450930+Liyulingyue@users.noreply.github.com> Date: Wed, 1 Feb 2023 15:34:42 +0800 Subject: [PATCH 70/89] fix the NullPointerError of matrix_power (#50015) --- python/paddle/fluid/tests/unittests/test_matrix_power_op.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/python/paddle/fluid/tests/unittests/test_matrix_power_op.py b/python/paddle/fluid/tests/unittests/test_matrix_power_op.py index 6381aeeca9868..8296aa320f59b 100644 --- a/python/paddle/fluid/tests/unittests/test_matrix_power_op.py +++ b/python/paddle/fluid/tests/unittests/test_matrix_power_op.py @@ -317,6 +317,12 @@ def test_errors(self): input = fluid.data(name="input_4", shape=[1, 1, 0, 0], dtype="float32") self.assertRaises(ValueError, paddle.linalg.matrix_power, input, 2) + # The size of input should not be 0 + input = fluid.data(name="input_5", shape=[0, 0], dtype="float32") + self.assertRaises( + ValueError, paddle.linalg.matrix_power, input, -956301312 + ) + class TestMatrixPowerSingularAPI(unittest.TestCase): def setUp(self): From 3a73d3488410ffac3cc049fdce7d9fbf7a3a62f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=98=A5=E4=B9=94?= <83450930+Liyulingyue@users.noreply.github.com> Date: Wed, 1 Feb 2023 15:36:01 +0800 Subject: [PATCH 
71/89] fix the div 0 error of sparse_embedding (#49948) * fix the div 0 error of sparse_embedding * add unittest --- python/paddle/fluid/contrib/layers/nn.py | 3 ++ .../unittests/test_sparse_embedding_op.py | 37 +++++++++++++++++++ 2 files changed, 40 insertions(+) create mode 100644 python/paddle/fluid/tests/unittests/test_sparse_embedding_op.py diff --git a/python/paddle/fluid/contrib/layers/nn.py b/python/paddle/fluid/contrib/layers/nn.py index b836dfa451c33..d2aff8bfcf659 100644 --- a/python/paddle/fluid/contrib/layers/nn.py +++ b/python/paddle/fluid/contrib/layers/nn.py @@ -793,6 +793,9 @@ def sparse_embedding( 'paddle.static.nn.sparse_embedding', ) + if input.size == 0: + raise ValueError("input size should not be 0") + w = helper.create_parameter( attr=helper.param_attr, shape=size, diff --git a/python/paddle/fluid/tests/unittests/test_sparse_embedding_op.py b/python/paddle/fluid/tests/unittests/test_sparse_embedding_op.py new file mode 100644 index 0000000000000..0e0beda67971e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_sparse_embedding_op.py @@ -0,0 +1,37 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import paddle + + +class TestSparseEmbeddingAPIError(unittest.TestCase): + def test_errors(self): + with paddle.fluid.dygraph.guard(): + # The size of input in sparse_embedding should not be 0. + def test_0_size(): + input = paddle.to_tensor([], dtype='int64') + paddle.static.nn.sparse_embedding( + input, + [2097152, 2097152, 2097152, 2097152], + padding_idx=2097152, + ) + + self.assertRaises(ValueError, test_0_size) + + +if __name__ == '__main__': + paddle.enable_static() + unittest.main() From 0361903789ea754079a95fe7df5876196fdf9ed7 Mon Sep 17 00:00:00 2001 From: Guanghua Yu <742925032@qq.com> Date: Wed, 1 Feb 2023 16:33:04 +0800 Subject: [PATCH 72/89] Skip the int input operator when inserting a quant node & fix some bug (#49926) --- .../static/quantization/quantization_pass.py | 86 ++++++++++++------- 1 file changed, 53 insertions(+), 33 deletions(-) diff --git a/python/paddle/static/quantization/quantization_pass.py b/python/paddle/static/quantization/quantization_pass.py index 83587563c4930..c9094998dfe24 100644 --- a/python/paddle/static/quantization/quantization_pass.py +++ b/python/paddle/static/quantization/quantization_pass.py @@ -2890,6 +2890,19 @@ def apply(self, graph): ) if in_node.persistable(): continue + + if in_node.dtype() not in [ + paddle.float64, + paddle.float32, + paddle.float16, + ]: + _logger.warning( + "Since the {} contains an input of type INT, the quantization of this layer is skipped.".format( + op_node.name() + ) + ) + break + if arg_name in dequantized_vars_map: dequant_var_node = dequantized_vars_map[arg_name] else: @@ -3137,7 +3150,7 @@ def __init__( self._save_int_weight = save_int_weight assert self._scope is not None, "scope must not be None." assert self._place is not None, "place must not be None." 
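As an illustrative aside (not part of this patch): the change just below swaps the pass's _quantized_ops set for a dict, so a persistable weight that feeds several ops is quantized once and every later consumer is re-linked to the node cached the first time it was seen. A minimal sketch of that name-to-node cache, where quantized_ops, get_or_quantize and quantize_fn are hypothetical stand-ins for the real IR graph calls:

    # Hypothetical stand-in for the pass's weight cache (not the real IR API).
    quantized_ops = {}  # weight name -> quantized weight node

    def get_or_quantize(name, quantize_fn):
        # Quantize a shared weight only on first sight; later consumers
        # reuse the cached node instead of quantizing the weight again.
        if name not in quantized_ops:
            quantized_ops[name] = quantize_fn(name)
        return quantized_ops[name]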
- self._quantized_ops = set() + self._quantized_ops = {} def apply(self, graph): assert isinstance( @@ -3176,7 +3189,6 @@ def apply(self, graph): quant_axis = _op.op().attr("quant_axis") bits_length = _op.op().attr("bit_length") if x_node.name() not in self._quantized_ops: - self._quantized_ops.add(x_node.name()) quantized_param_v = utils.quant_tensor( param_v.copy(), scale_v, @@ -3211,10 +3223,13 @@ def apply(self, graph): self._scope, self._place, ) + self._quantized_ops[x_node.name()] = quant_weight_node for next_op_node in out_node.outputs: graph.update_input_link( - out_node, quant_weight_node, next_op_node + out_node, + self._quantized_ops[x_node.name()], + next_op_node, ) graph.safe_remove_nodes(_op) self._remove_unused_var_nodes(graph) @@ -3298,9 +3313,9 @@ def apply(self, graph): op_node.outputs, var_name ) if out_node.dtype() not in [ - core.VarDesc.VarType.FP64, - core.VarDesc.VarType.FP32, - core.VarDesc.VarType.FP16, + paddle.float64, + paddle.float32, + paddle.float16, ]: continue if var_name in dequantized_vars_map: @@ -3319,7 +3334,10 @@ def apply(self, graph): else: var_names = utils._get_op_input_var_names(op_node) for var_name in var_names: - if var_name in dequant_node_map: + if ( + var_name in dequant_node_map + and dequant_node_map[var_name] + ): in_node = graph._find_node_by_name( op_node.inputs, var_name ) @@ -3345,39 +3363,41 @@ def _insert_quant_dequant_op(self, graph, var_node): shape=var_node.shape(), var_dtype=var_node.dtype(), ) - if not self._calibration_range_dict: - try: - scale_var_node = graph._find_node_by_name( - graph.all_persistable_nodes(), self._scale_name(var_name) + + try: + scale_var_node = graph._find_node_by_name( + graph.all_persistable_nodes(), self._scale_name(var_name) + ) + except: + if ( + self._calibration_range_dict + and var_name in self._calibration_range_dict + ): + scale_value = self._calibration_range_dict[var_name] + scale_var_node = graph.create_persistable_node( + name=self._scale_name(var_name), + var_type=var_node.type(), + shape=[1], + var_dtype=var_node.dtype(), ) - except: + data_type = ( + 'float64' + if var_node.dtype() == core.VarDesc.VarType.FP64 + else 'float32' + ) + _init_var_node( + scale_var_node, + np.array(scale_value, dtype=data_type), + self._scope, + self._place, + ) + else: _logger.warning( "Cannot find the target node {} in scope, so skip adding quant node.".format( var_name ) ) return None - elif var_name in self._calibration_range_dict: - scale_value = self._calibration_range_dict[var_name] - scale_var_node = graph.create_persistable_node( - name=self._scale_name(var_name), - var_type=var_node.type(), - shape=[1], - var_dtype=var_node.dtype(), - ) - data_type = ( - 'float64' - if var_node.dtype() == core.VarDesc.VarType.FP64 - else 'float32' - ) - _init_var_node( - scale_var_node, - np.array(scale_value, dtype=data_type), - self._scope, - self._place, - ) - else: - return None try: zero_point_node = graph._find_node_by_name( graph.all_persistable_nodes(), From 6f0ae156ec3f342a0fd841fad9fedb3c9c8049be Mon Sep 17 00:00:00 2001 From: PuQing Date: Wed, 1 Feb 2023 16:52:53 +0800 Subject: [PATCH 73/89] [Numpy]Fix NumpyScaler2Tensor dtype error (#50018) * fix numpyScaler2Tensor type error * fix to_tensor docs, test=document_fix --- .../unittests/test_npscaler_to_tensor.py | 95 +++++++++++++++++++ python/paddle/tensor/creation.py | 17 ++++ 2 files changed, 112 insertions(+) create mode 100644 python/paddle/fluid/tests/unittests/test_npscaler_to_tensor.py diff --git 
a/python/paddle/fluid/tests/unittests/test_npscaler_to_tensor.py b/python/paddle/fluid/tests/unittests/test_npscaler_to_tensor.py new file mode 100644 index 0000000000000..da6569d7d2973 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_npscaler_to_tensor.py @@ -0,0 +1,95 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np + +import paddle + +DTYPE_MAP = { + paddle.bool: np.bool_, + paddle.int32: np.int32, + paddle.int64: np.int64, + paddle.float16: np.float16, + paddle.float32: np.float32, + paddle.float64: np.float64, + paddle.complex64: np.complex64, +} + + +class NumpyScaler2Tensor(unittest.TestCase): + def setUp(self): + self.dtype = np.float32 + self.x_np = np.array([1], dtype=self.dtype)[0] + + def test_dynamic_scaler2tensor(self): + paddle.disable_static() + x = paddle.to_tensor(self.x_np) + self.assertEqual(DTYPE_MAP[x.dtype], self.dtype) + self.assertEqual(x.numpy(), self.x_np) + if self.dtype in [ + np.bool_ + ]: # bool is not supported convert to 0D-Tensor + return + self.assertEqual(len(x.shape), 0) + + def test_static_scaler2tensor(self): + if self.dtype in [np.float16, np.complex64]: + return + paddle.enable_static() + x = paddle.to_tensor(self.x_np) + self.assertEqual(DTYPE_MAP[x.dtype], self.dtype) + if self.dtype in [ + np.bool_, + np.float64, + ]: # bool is not supported convert to 0D-Tensor and float64 not supported in static mode + return + self.assertEqual(len(x.shape), 0) + + +class NumpyScaler2TensorBool(NumpyScaler2Tensor): + def setUp(self): + self.dtype = np.bool_ + self.x_np = np.array([1], dtype=self.dtype)[0] + + +class NumpyScaler2TensorFloat16(NumpyScaler2Tensor): + def setUp(self): + self.dtype = np.float16 + self.x_np = np.array([1], dtype=self.dtype)[0] + + +class NumpyScaler2TensorFloat64(NumpyScaler2Tensor): + def setUp(self): + self.dtype = np.float64 + self.x_np = np.array([1], dtype=self.dtype)[0] + + +class NumpyScaler2TensorInt32(NumpyScaler2Tensor): + def setUp(self): + self.dtype = np.int32 + self.x_np = np.array([1], dtype=self.dtype)[0] + + +class NumpyScaler2TensorInt64(NumpyScaler2Tensor): + def setUp(self): + self.dtype = np.int64 + self.x_np = np.array([1], dtype=self.dtype)[0] + + +class NumpyScaler2TensorComplex64(NumpyScaler2Tensor): + def setUp(self): + self.dtype = np.complex64 + self.x_np = np.array([1], dtype=self.dtype)[0] diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py index 9ad83ba74b7f5..7523845c2b8b2 100644 --- a/python/paddle/tensor/creation.py +++ b/python/paddle/tensor/creation.py @@ -533,6 +533,9 @@ def logspace(start, stop, num, base=10.0, dtype=None, name=None): def _to_tensor_non_static(data, dtype=None, place=None, stop_gradient=True): + if isinstance(data, np.number): # Special case for numpy scalars + data = np.array(data) + if not isinstance(data, np.ndarray): def _handle_dtype(data, dtype): @@ -627,6 +630,8 @@ def _to_tensor_static(data, dtype=None, stop_gradient=None): if 
isinstance(data, Variable) and (dtype is None or dtype == data.dtype): output = data else: + if isinstance(data, np.number): # Special case for numpy scalars + data = np.array(data) if not isinstance(data, np.ndarray): if np.isscalar(data) and not isinstance(data, str): @@ -690,6 +695,18 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True): If the ``data`` is already a Tensor, copy will be performed and return a new tensor. If you only want to change stop_gradient property, please call ``Tensor.stop_gradient = stop_gradient`` directly. + .. code-block:: text + + We use the dtype conversion rules following this: + Keep dtype + np.number ───────────► paddle.Tensor + (0D-Tensor) + default_dtype + Python Number ───────────────► paddle.Tensor + (1D-Tensor) + Keep dtype + np.ndarray ───────────► paddle.Tensor + Args: data(scalar|tuple|list|ndarray|Tensor): Initial data for the tensor. Can be a scalar, list, tuple, numpy\.ndarray, paddle\.Tensor. From 71f247b14a1d07eddb2c1a661c5973bee7f8835c Mon Sep 17 00:00:00 2001 From: YUNSHEN XIE <1084314248@qq.com> Date: Wed, 1 Feb 2023 16:56:41 +0800 Subject: [PATCH 74/89] run infer ut in A10 (#48535) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * run infer ut in A10 * 增加cuda11.2-cudnn8-trt8.4镜像 * add paddle_coverage_new.sh --- paddle/scripts/paddle_build.sh | 29 +- tools/coverage/paddle_coverage_new.sh | 287 ++++++++++++++++++ tools/dockerfile/build_scripts/install_trt.sh | 5 + tools/dockerfile/ci_dockerfile.sh | 28 ++ 4 files changed, 340 insertions(+), 9 deletions(-) create mode 100644 tools/coverage/paddle_coverage_new.sh diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 4c48154b80a4b..2c83897b16678 100644 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -1326,6 +1326,17 @@ function card_test() { cardnumber=$2 parallel_level_base=${CTEST_PARALLEL_LEVEL:-1} + # run ut based on the label + if [[ "${UT_RUN_TYPE_SETTING}" == "INFER" ]];then + run_label_mode="-L (RUN_TYPE=INFER)" + elif [[ "${UT_RUN_TYPE_SETTING}" == "DIST" ]];then + run_label_mode="-L (RUN_TYPE=DIST|RUN_TYPE=EXCLUSIVE)" + elif [[ "${UT_RUN_TYPE_SETTING}" == "WITHOUT_INFER" ]];then + run_label_mode="-LE (RUN_TYPE=INFER)" + elif [[ "${UT_RUN_TYPE_SETTING}" == "OTHER" ]];then + run_label_mode="-LE (RUN_TYPE=INFER|RUN_TYPE=DIST|RUN_TYPE=EXCLUSIVE)" + fi + # get the CUDA device count, XPU device count is one if [ "${WITH_XPU}" == "ON" ];then CUDA_DEVICE_COUNT=1 @@ -1375,15 +1386,15 @@ function card_test() { tmpfile=$tmp_dir/$tmpfile_rand"_"$i if [ ${TESTING_DEBUG_MODE:-OFF} == "ON" ] ; then if [[ $cardnumber == $CUDA_DEVICE_COUNT ]]; then - (ctest -I $i,,$NUM_PROC -R "($testcases)" -E "($disable_ut_quickly)" -V --timeout 120 -j $parallel_job | tee $tmpfile; test ${PIPESTATUS[0]} -eq 0) & + (ctest -I $i,,$NUM_PROC -R "($testcases)" -E "($disable_ut_quickly)" ${run_label_mode} -V --timeout 120 -j $parallel_job | tee $tmpfile; test ${PIPESTATUS[0]} -eq 0) & else - (env CUDA_VISIBLE_DEVICES=$cuda_list ctest -I $i,,$NUM_PROC -R "($testcases)" -E "($disable_ut_quickly)" --timeout 120 -V -j $parallel_job | tee $tmpfile; test ${PIPESTATUS[0]} -eq 0) & + (env CUDA_VISIBLE_DEVICES=$cuda_list ctest -I $i,,$NUM_PROC -R "($testcases)" -E "($disable_ut_quickly)" ${run_label_mode} --timeout 120 -V -j $parallel_job | tee $tmpfile; test ${PIPESTATUS[0]} -eq 0) & fi else if [[ $cardnumber == $CUDA_DEVICE_COUNT ]]; then - (ctest -I $i,,$NUM_PROC -R "($testcases)" -E "($disable_ut_quickly)" 
--timeout 120 --output-on-failure -j $parallel_job | tee $tmpfile; test ${PIPESTATUS[0]} -eq 0) & + (ctest -I $i,,$NUM_PROC -R "($testcases)" -E "($disable_ut_quickly)" ${run_label_mode} --timeout 120 --output-on-failure -j $parallel_job | tee $tmpfile; test ${PIPESTATUS[0]} -eq 0) & else - (env CUDA_VISIBLE_DEVICES=$cuda_list ctest -I $i,,$NUM_PROC -R "($testcases)" -E "($disable_ut_quickly)" --timeout 120 --output-on-failure -j $parallel_job | tee $tmpfile; test ${PIPESTATUS[0]} -eq 0) & + (env CUDA_VISIBLE_DEVICES=$cuda_list ctest -I $i,,$NUM_PROC -R "($testcases)" -E "($disable_ut_quickly)" ${run_label_mode} --timeout 120 --output-on-failure -j $parallel_job | tee $tmpfile; test ${PIPESTATUS[0]} -eq 0) & fi fi done @@ -2364,7 +2375,7 @@ set +x if [[ "${failed_test_lists}" == "" ]];then break else - retry_unittests=$(echo "$failed_test_lists" | grep -oEi "\-.+\(.+\)" | sed 's/(.\+)//' | sed 's/- //' ) + retry_unittests=$( echo "$failed_test_lists" | grep -oEi "\-.+\(.+\)" | sed 's/(.\+)//' | sed 's/- //' ) fi fi echo "=========================================" @@ -2687,10 +2698,10 @@ set +x if [ ${TIMEOUT_DEBUG_HELP:-OFF} == "ON" ];then bash $PADDLE_ROOT/tools/timeout_debug_help.sh "$failed_test_lists" # cat logs for tiemout uts which killed by ctest fi - read need_retry_ut_str <<< $(echo "$failed_test_lists" | grep -oEi "\-.+\(.+\)" | sed 's/(.\+)//' | sed 's/- //' ) + need_retry_ut_str=$(echo "$failed_test_lists" | grep -oEi "\-.+\(.+\)" | sed 's/(.\+)//' | sed 's/- //' ) need_retry_ut_arr=(${need_retry_ut_str}) need_retry_ut_count=${#need_retry_ut_arr[@]} - read retry_unittests <<< $(echo "$failed_test_lists" | grep -oEi "\-.+\(.+\)" | sed 's/(.\+)//' | sed 's/- //' ) + retry_unittests=$(echo "$failed_test_lists" | grep -oEi "\-.+\(.+\)" | sed 's/(.\+)//' | sed 's/- //' ) while ( [ $exec_times -lt $retry_time ] ) do if [[ "${exec_times}" == "0" ]] ;then @@ -2700,7 +2711,7 @@ set +x is_retry_execuate=1 fi elif [[ "${exec_times}" == "1" ]] ;then - read need_retry_ut_str <<< $(echo "$failed_test_lists" | grep -oEi "\-.+\(.+\)" | sed 's/(.\+)//' | sed 's/- //' ) + need_retry_ut_str=$(echo "$failed_test_lists" | grep -oEi "\-.+\(.+\)" | sed 's/(.\+)//' | sed 's/- //' ) need_retry_ut_arr=(${need_retry_ut_str}) need_retry_ut_count=${#need_retry_ut_arr[@]} if [ $need_retry_ut_count -lt $exec_retry_threshold ];then @@ -2718,7 +2729,7 @@ set +x if [[ "${failed_test_lists}" == "" ]];then break else - read retry_unittests <<< $(echo "$failed_test_lists" | grep -oEi "\-.+\(.+\)" | sed 's/(.\+)//' | sed 's/- //' ) + retry_unittests=$(echo "$failed_test_lists" | grep -oEi "\-.+\(.+\)" | sed 's/(.\+)//' | sed 's/- //' ) fi fi echo "=========================================" diff --git a/tools/coverage/paddle_coverage_new.sh b/tools/coverage/paddle_coverage_new.sh new file mode 100644 index 0000000000000..98de591fd154b --- /dev/null +++ b/tools/coverage/paddle_coverage_new.sh @@ -0,0 +1,287 @@ +#!/usr/bin/env bash + +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +set -xe + +PADDLE_ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}")/../../" && pwd )" + +function lcov_init(){ + # install lcov + if [ ! -f "/root/.cache/lcov-1.14.tar.gz" ];then + wget -P /home https://paddle-ci.gz.bcebos.com/coverage/lcov-1.14.tar.gz --no-proxy --no-check-certificate || exit 101 + cp /home/lcov-1.14.tar.gz /root/.cache/lcov-1.14.tar.gz + else + cp /root/.cache/lcov-1.14.tar.gz /home/lcov-1.14.tar.gz + fi + tar -xf /home/lcov-1.14.tar.gz -C / + cd /lcov-1.14 + make install +} + +function gen_cpp_covinfo(){ + # run paddle coverage + cd /paddle/build + python3.7 ${PADDLE_ROOT}/tools/coverage/gcda_clean.py ${GIT_PR_ID} || exit 101 + lcov --capture -d ./ -o coverage.info --rc lcov_branch_coverage=0 +} + + +# full html report + +function gen_full_html_report() { + lcov --extract coverage.info \ + '/paddle/paddle/fluid/framework/*' \ + '/paddle/paddle/fluid/imperative/*' \ + '/paddle/paddle/fluid/inference/*' \ + '/paddle/paddle/fluid/memory/*' \ + '/paddle/paddle/fluid/operators/*' \ + '/paddle/paddle/fluid/recordio/*' \ + '/paddle/paddle/fluid/string/*' \ + '/paddle/paddle/fluid/eager/*' \ + '/paddle/paddle/phi/*' \ + '/paddle/paddle/utils/*' \ + -o coverage-full.tmp \ + --rc lcov_branch_coverage=0 + + mv -f coverage-full.tmp coverage-full.info + + lcov --remove coverage-full.info \ + '/paddle/paddle/fluid/framework/*_test*' \ + '/paddle/paddle/fluid/*/*test*' \ + '/paddle/paddle/fluid/*/*/*test*' \ + '/paddle/paddle/fluid/inference/tests/*' \ + '/paddle/paddle/fluid/inference/api/demo_ci/*' \ + '/paddle/paddle/fluid/eager/tests/*' \ + '/paddle/paddle/phi/tests/*' \ + -o coverage-full.tmp \ + --rc lcov_branch_coverage=0 + + mv -f coverage-full.tmp coverage-full.info +} + +function gen_full_html_report_xpu() { + lcov --extract coverage.info \ + '/paddle/paddle/fluid/operators/*xpu*' \ + '/paddle/paddle/phi/kernels/xpu/*' \ + -o coverage-full.tmp \ + --rc lcov_branch_coverage=0 + + mv -f coverage-full.tmp coverage-full.info + + lcov --remove coverage-full.info \ + '/paddle/paddle/fluid/framework/*_test*' \ + '/paddle/paddle/fluid/*/*test*' \ + '/paddle/paddle/fluid/*/*/*test*' \ + '/paddle/paddle/fluid/inference/tests/*' \ + '/paddle/paddle/fluid/inference/api/demo_ci/*' \ + -o coverage-full.tmp \ + --rc lcov_branch_coverage=0 + + mv -f coverage-full.tmp coverage-full.info +} + +function gen_full_html_report_npu() { + lcov --extract coverage.info \ + '/paddle/paddle/fluid/operators/*npu*' \ + -o coverage-full.tmp \ + --rc lcov_branch_coverage=0 + + mv -f coverage-full.tmp coverage-full.info + + lcov --remove coverage-full.info \ + '/paddle/paddle/fluid/framework/*_test*' \ + '/paddle/paddle/fluid/*/*test*' \ + '/paddle/paddle/fluid/*/*/*test*' \ + '/paddle/paddle/fluid/inference/tests/*' \ + '/paddle/paddle/fluid/inference/api/demo_ci/*' \ + -o coverage-full.tmp \ + --rc lcov_branch_coverage=0 + + mv -f coverage-full.tmp coverage-full.info +} + +# if [ ${WITH_XPU:-OFF} == "ON" ]; then +# gen_full_html_report_xpu || true +# elif [ ${WITH_ASCEND_CL:-OFF} == "ON" ]; then +# gen_full_html_report_npu || true +# else +# gen_full_html_report || true +# fi + +# diff html report + +function gen_diff_html_report() { + if [ "${GIT_PR_ID}" != "" ]; then + + COVERAGE_DIFF_PATTERN="`python3.7 ${PADDLE_ROOT}/tools/coverage/pull_request.py files ${GIT_PR_ID}`" + + python3.7 ${PADDLE_ROOT}/tools/coverage/pull_request.py diff ${GIT_PR_ID} > git-diff.out + fi + + lcov --extract 
coverage-full.info \ + ${COVERAGE_DIFF_PATTERN} \ + -o coverage-diff.info \ + --rc lcov_branch_coverage=0 + + python3.7 ${PADDLE_ROOT}/tools/coverage/coverage_diff.py coverage-diff.info git-diff.out > coverage-diff.tmp + + mv -f coverage-diff.tmp coverage-diff.info + + genhtml -o coverage-diff -t 'Diff Coverage' --no-function-coverage --no-branch-coverage coverage-diff.info +} + +# gen_diff_html_report || true + +function gen_py_covinfo(){ + # python coverage + + export COVERAGE_FILE=/paddle/build/python-coverage.data + coverage combine `$(ls python-coverage.data.*)` || NO_PYTHON_COVERAGE_DATA=1 + `$(coverage xml -i -o python-coverage.xml)` || [[ "${NO_PYTHON_COVERAGE_DATA}" == "1" ]] + sed -i 's/mnt\/paddle/paddle/g' python-coverage.xml + `$(python ${PADDLE_ROOT}/tools/coverage/python_coverage.py > python-coverage.info)` || [[ "${NO_PYTHON_COVERAGE_DATA}" == "1" ]] +} + + +# python full html report +# +function gen_python_full_html_report() { + lcov --extract python-coverage.info \ + '/paddle/python/*' \ + -o python-coverage-full.tmp \ + --rc lcov_branch_coverage=0 + + mv -f python-coverage-full.tmp python-coverage-full.info + + lcov --remove python-coverage-full.info \ + '/*/tests/*' \ + -o python-coverage-full.tmp \ + --rc lcov_branch_coverage=0 + + mv -f python-coverage-full.tmp python-coverage-full.info +} + +# gen_python_full_html_report || true + +# python diff html report + +function gen_python_diff_html_report() { + if [ "${GIT_PR_ID}" != "" ]; then + COVERAGE_DIFF_PATTERN="`python3.7 ${PADDLE_ROOT}/tools/coverage/pull_request.py files ${GIT_PR_ID}`" + + python3.7 ${PADDLE_ROOT}/tools/coverage/pull_request.py diff ${GIT_PR_ID} > python-git-diff.out + fi + + lcov --extract python-coverage-full.info \ + ${COVERAGE_DIFF_PATTERN} \ + -o python-coverage-diff.info \ + --rc lcov_branch_coverage=0 + + python3.7 ${PADDLE_ROOT}/tools/coverage/coverage_diff.py python-coverage-diff.info python-git-diff.out > python-coverage-diff.tmp + + mv -f python-coverage-diff.tmp python-coverage-diff.info + + genhtml -o python-coverage-diff \ + -t 'Python Diff Coverage' \ + --no-function-coverage \ + --no-branch-coverage \ + --ignore-errors source \ + python-coverage-diff.info +} + +# gen_python_diff_html_report || true + +# assert coverage lines + +function covinfo_combine_full(){ + if [ -f "other-coverage.info" ];then + if [ -f "infer-coverage.info" ];then + lcov -a other-coverage.info -a infer-coverage.info -o coverage.info + else + mv other-coverage.info coverage.info + fi + elif [ -f "infer-coverage.info" ];then + mv infer-coverage.info coverage.info + else + echo "Cannot found coverage.info" + fi + + if [ -f "other-python-coverage-full.info" ];then + if [ -f "infer-python-coverage-full.info" ];then + lcov -a other-python-coverage-full.info -a infer-python-coverage-full.info -o python-coverage-full.info + else + mv other-python-coverage-full.info python-coverage-full.info + fi + elif [ -f "infer-coverage.info" ];then + mv infer-python-coverage-full.info python-coverage-full.info + else + echo "Cannot found python coverage.info" + fi +} + +function cov_rate_judge(){ + echo "Assert CPP Diff Coverage" + python3.7 ${PADDLE_ROOT}/tools/coverage/coverage_lines.py coverage-diff.info 0.9 || COVERAGE_LINES_ASSERT=1 + + echo "Assert Python Diff Coverage" + + if [ ${WITH_XPU:-OFF} == "ON" ]; then + echo "XPU has no python coverage!" + elif [ ${WITH_ASCEND_CL:-OFF} == "ON" ]; then + echo "NPU has no python coverage!" 
+ else + if [[ python-coverage-diff.info ]];then + python3.7 ${PADDLE_ROOT}/tools/coverage/coverage_lines.py python-coverage-diff.info 0.9 || PYTHON_COVERAGE_LINES_ASSERT=1 + fi + fi + if [ "$COVERAGE_LINES_ASSERT" = "1" ] || [ "$PYTHON_COVERAGE_LINES_ASSERT" = "1" ]; then + echo "exit 9" > /tmp/paddle_coverage.result + exit 9 + fi +} + +function print_usage() { + echo -e "\n${RED}Usage${NONE}: + ${BOLD}${SCRIPT_NAME}${NONE} [OPTION]" + + echo -e "\n${RED}Options${NONE}: + ${BLUE}gen_cov_info${NONE}: generate coverage info + ${BLUE}test${NONE}: coverage info combine + " +} + +function main () { + local CMD=$1 + lcov_init + case $CMD in + gen_cov_info) + gen_cpp_covinfo + gen_py_covinfo + ;; + combine_cov_info) + covinfo_combine_full + gen_diff_html_report + gen_python_diff_html_report + cov_rate_judge + ;; + *) + print_usage + exit 1 + ;; + esac +} + +main $@ diff --git a/tools/dockerfile/build_scripts/install_trt.sh b/tools/dockerfile/build_scripts/install_trt.sh index 490b0af5289c5..2129f92adbac3 100644 --- a/tools/dockerfile/build_scripts/install_trt.sh +++ b/tools/dockerfile/build_scripts/install_trt.sh @@ -36,6 +36,11 @@ elif [[ "$1" == "trt8406" ]];then tar -zxf TensorRT-8.4.0.6.Linux.x86_64-gnu.cuda-11.6.cudnn8.3.tar.gz -C /usr/local cp -rf /usr/local/TensorRT-8.4.0.6/include/* /usr/include/ && cp -rf /usr/local/TensorRT-8.4.0.6/lib/* /usr/lib/ rm -f TensorRT-8.4.0.6.Linux.x86_64-gnu.cuda-11.6.cudnn8.3.tar.gz +elif [[ "$1" == "trt8431" ]];then + wget -q https://paddle-ci.gz.bcebos.com/TRT/TensorRT-8.4.3.1.Linux.x86_64-gnu.cuda-11.6.cudnn8.4.tar.gz --no-check-certificate --no-proxy + tar -zxf TensorRT-8.4.3.1.Linux.x86_64-gnu.cuda-11.6.cudnn8.4.tar.gz -C /usr/local + cp -rf /usr/local/TensorRT-8.4.3.1/include/* /usr/include/ && cp -rf /usr/local/TensorRT-8.4.3.1/lib/* /usr/lib/ + rm -f TensorRT-8.4.3.1.Linux.x86_64-gnu.cuda-11.6.cudnn8.4.tar.gz elif [[ "$VERSION" == "11.2" ]];then wget -q --no-proxy https://paddle-ci.gz.bcebos.com/TRT/TensorRT7-cuda11.1-cudnn8.1.tar.gz --no-check-certificate tar -zxf TensorRT7-cuda11.1-cudnn8.1.tar.gz -C /usr/local diff --git a/tools/dockerfile/ci_dockerfile.sh b/tools/dockerfile/ci_dockerfile.sh index 17ba5b3ee4c32..16471f2edd8f2 100644 --- a/tools/dockerfile/ci_dockerfile.sh +++ b/tools/dockerfile/ci_dockerfile.sh @@ -166,6 +166,33 @@ function make_unbuntu18_cu117_dockerfile(){ sed -i 's# && rm /etc/apt/sources.list.d/nvidia-ml.list##g' ${dockerfile_name} } +function make_ubuntu18_cu112_dockerfile(){ + dockerfile_name="Dockerfile.cuda11.2_cudnn8.1_trt8.4_gcc8.2_ubuntu18" + sed "s##nvidia/cuda:11.2.0-cudnn8-devel-ubuntu18.04#g" ./Dockerfile.ubuntu18 >${dockerfile_name} + sed -i "s##ENV LD_LIBRARY_PATH=/usr/local/cuda-11.2/targets/x86_64-linux/lib:\$LD_LIBRARY_PATH #g" ${dockerfile_name} + sed -i "s#liblzma-dev#liblzma-dev openmpi-bin openmpi-doc libopenmpi-dev#g" ${dockerfile_name} + dockerfile_line=$(wc -l ${dockerfile_name}|awk '{print $1}') + sed -i 's#RUN bash /build_scripts/install_trt.sh#RUN bash /build_scripts/install_trt.sh trt8431#g' ${dockerfile_name} + sed -i "${dockerfile_line}i RUN wget --no-check-certificate -q https://paddle-edl.bj.bcebos.com/hadoop-2.7.7.tar.gz \&\& \ + tar -xzf hadoop-2.7.7.tar.gz && mv hadoop-2.7.7 /usr/local/" ${dockerfile_name} + sed -i "${dockerfile_line}i RUN apt remove git -y \&\& apt install -y libsndfile1 zstd pigz libcurl4-openssl-dev gettext zstd ninja-build \&\& wget -q https://paddle-ci.gz.bcebos.com/git-2.17.1.tar.gz \&\& \ + tar -xvf git-2.17.1.tar.gz \&\& \ + cd git-2.17.1 \&\& \ + ./configure 
--with-openssl --with-curl --prefix=/usr/local \&\& \ + make -j8 \&\& make install " ${dockerfile_name} + sed -i "${dockerfile_line}i RUN pip install wheel \&\& pip3 install PyGithub wheel \&\& pip3.7 install PyGithub distro \&\& pip3.8 install PyGithub distro" ${dockerfile_name} + sed -i 's###g' ${dockerfile_name} + sed -i "s##WORKDIR /usr/bin \\ + COPY tools/dockerfile/build_scripts /build_scripts \\ + RUN bash /build_scripts/install_gcc.sh gcc82 \&\& rm -rf /build_scripts \\ + RUN cp gcc gcc.bak \&\& cp g++ g++.bak \&\& rm gcc \&\& rm g++ \\ + RUN ln -s /usr/local/gcc-8.2/bin/gcc /usr/local/bin/gcc \\ + RUN ln -s /usr/local/gcc-8.2/bin/g++ /usr/local/bin/g++ \\ + RUN ln -s /usr/local/gcc-8.2/bin/gcc /usr/bin/gcc \\ + RUN ln -s /usr/local/gcc-8.2/bin/g++ /usr/bin/g++ \\ + ENV PATH=/usr/local/gcc-8.2/bin:\$PATH #g" ${dockerfile_name} +} + function main() { make_ubuntu_dockerfile make_ubuntu_trt7_dockerfile @@ -173,6 +200,7 @@ function main() { make_cinn_dockerfile make_ce_framework_dockcerfile make_unbuntu18_cu117_dockerfile + make_ubuntu18_cu112_dockerfile } main "$@" From e4e94a889a7e172ca92b9d0c4aca8c3c08a39fea Mon Sep 17 00:00:00 2001 From: Zhong Hui Date: Wed, 1 Feb 2023 17:01:31 +0800 Subject: [PATCH 75/89] [Zero-Dim] Fix 0-dim tensor for arg_min_max op. (#49570) * fix 0-d tensor for arg_min_max op. * fix xpu. * fix zero dims * fix * Update arg_min_max_kernel.cc * Update arg_min_max_kernel.cc * Update arg_min_max_kernel.cc * Update test_zero_dim_tensor.py * Update test_zero_dim_tensor_xpu.py * Update test_zero_dim_tensor.py * Update arg_min_max_kernel.cc * Update arg_min_max_kernel.cc * Update arg_min_max_kernel.cc --- paddle/phi/infermeta/unary.cc | 47 +++++++++++++------ paddle/phi/kernels/cpu/arg_min_max_kernel.cc | 6 +++ paddle/phi/kernels/gpu/arg_min_max_kernel.cu | 7 +++ paddle/phi/kernels/xpu/arg_min_max_kernel.cc | 9 ++++ .../tests/unittests/test_zero_dim_tensor.py | 15 ++++-- .../unittests/xpu/test_zero_dim_tensor_xpu.py | 5 +- 6 files changed, 68 insertions(+), 21 deletions(-) diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index eb05437ada8a5..2b35545db1cd8 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -160,22 +160,34 @@ void ArgMinMaxInferMeta(const MetaTensor& x, auto int_axis = axis.to(); const auto& x_dims = x.dims(); - PADDLE_ENFORCE_GE( - int_axis, - -x_dims.size(), - phi::errors::InvalidArgument("'axis'(%d) must be greater than or equal to" - " -Rank(X)(%d).", - int_axis, - -x_dims.size())); - PADDLE_ENFORCE_LT(int_axis, - x_dims.size(), - phi::errors::InvalidArgument( - "'axis'(%d) must be less than Rank(X)(%d) of Input(X).", - int_axis, - x_dims.size())); + auto x_rank = x.dims().size(); + if (x_rank > 0) { + PADDLE_ENFORCE_GE(int_axis, + -x_rank, + phi::errors::InvalidArgument( + "'axis'(%d) must be greater than or equal to" + " -Rank(X)(%d).", + int_axis, + -x_rank)); + PADDLE_ENFORCE_LT( + int_axis, + x_rank, + phi::errors::InvalidArgument( + "'axis'(%d) must be less than Rank(X)(%d) of Input(X).", + int_axis, + x_rank)); + } else { + // 0-dim tensor + PADDLE_ENFORCE_EQ((int_axis == 0 || int_axis == -1) && flatten, + true, + phi::errors::InvalidArgument( + "'axis'(%d) must be 0 or -1 if input tensor is " + "0-dim. 
and flatten should be true.", + int_axis)); + } - auto x_rank = x_dims.size(); if (int_axis < 0) int_axis += x_rank; + if (config.is_runtime) { if (dtype == phi::TransToProtoVarType(DataType::INT32)) { int64_t all_element_num = 0; @@ -195,8 +207,12 @@ void ArgMinMaxInferMeta(const MetaTensor& x, INT_MAX)); } } + std::vector vec; - if (flatten) { + + if (x_rank == 0) { + // vec is set to empty + } else if (flatten) { vec.emplace_back(static_cast(1)); } else { for (int64_t i = 0; i < int_axis; i++) vec.emplace_back(x_dims[i]); @@ -205,6 +221,7 @@ void ArgMinMaxInferMeta(const MetaTensor& x, } for (int64_t i = int_axis + 1; i < x_rank; i++) vec.emplace_back(x_dims[i]); } + out->set_dims(phi::make_ddim(vec)); if (dtype == 2) { out->set_dtype(DataType::INT32); diff --git a/paddle/phi/kernels/cpu/arg_min_max_kernel.cc b/paddle/phi/kernels/cpu/arg_min_max_kernel.cc index 61d20ac32f15a..694698050a0c0 100644 --- a/paddle/phi/kernels/cpu/arg_min_max_kernel.cc +++ b/paddle/phi/kernels/cpu/arg_min_max_kernel.cc @@ -96,6 +96,12 @@ struct VisitDataArgMinMaxFunctor { if (axis < 0) new_axis = axis + x_dims.size(); } + // For 0D Tensor + if (x.dims().size() == 0) { + phi::funcs::set_constant(dev_ctx, out, 0); + return; + } + #define CALL_ARG_MINMAX_FUNCTOR(rank) \ ArgMinMaxFunctor functor##rank; \ functor##rank(dev_ctx, x, out, x_dims, new_axis, new_keepdims) diff --git a/paddle/phi/kernels/gpu/arg_min_max_kernel.cu b/paddle/phi/kernels/gpu/arg_min_max_kernel.cu index affd36a95ef8b..199ecc8e5b989 100644 --- a/paddle/phi/kernels/gpu/arg_min_max_kernel.cu +++ b/paddle/phi/kernels/gpu/arg_min_max_kernel.cu @@ -30,6 +30,7 @@ namespace cub = hipcub; #include "paddle/phi/core/ddim.h" #include "paddle/phi/core/utils/data_type.h" +#include "paddle/phi/kernels/funcs/math_function.h" namespace phi { namespace { // NOLINT @@ -180,6 +181,12 @@ struct VisitDataCudaArgMinMaxFunctor { x_dims = x.dims(); if (axis < 0) new_axis = axis + x.dims().size(); } + // For 0D Tensor + if (x.dims().size() == 0) { + dev_ctx.template Alloc(out); + phi::funcs::set_constant(dev_ctx, out, 0); + return; + } int64_t numel = x.numel(); int64_t groups = numel / x_dims[new_axis]; diff --git a/paddle/phi/kernels/xpu/arg_min_max_kernel.cc b/paddle/phi/kernels/xpu/arg_min_max_kernel.cc index 3513b64bc600e..ebf13142345ce 100644 --- a/paddle/phi/kernels/xpu/arg_min_max_kernel.cc +++ b/paddle/phi/kernels/xpu/arg_min_max_kernel.cc @@ -18,6 +18,7 @@ #include "paddle/phi/core/ddim.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/utils/data_type.h" +#include "paddle/phi/kernels/funcs/math_function.h" namespace phi { @@ -39,7 +40,15 @@ void ArgMaxKernel(const Context& dev_ctx, DataType::INT64, DataType::INT32, dtype)); + // TODO(ZHUI): fix dtype of out dev_ctx.template Alloc(out); + if (x.dims().size() == 0) { + xpu::constant(dev_ctx.x_context(), + out->data(), + x.numel(), + static_cast(0)); + return; + } DDim x_dims; int axis_val = axis.to(); diff --git a/python/paddle/fluid/tests/unittests/test_zero_dim_tensor.py b/python/paddle/fluid/tests/unittests/test_zero_dim_tensor.py index 2d07ab31334df..fcc171674deab 100644 --- a/python/paddle/fluid/tests/unittests/test_zero_dim_tensor.py +++ b/python/paddle/fluid/tests/unittests/test_zero_dim_tensor.py @@ -189,6 +189,8 @@ def test_static_unary(self): paddle.logsumexp, paddle.all, paddle.any, + paddle.argmax, + paddle.argmin, ] @@ -208,12 +210,13 @@ def test_dygraph_reduce(self): out.retain_grads() out.backward() - out_empty_list = api(x, []) - self.assertEqual(out_empty_list, out) - 
self.assertEqual(x.shape, []) self.assertEqual(out.shape, []) - np.testing.assert_allclose(out.numpy(), x.numpy()) + if api not in [paddle.argmax, paddle.argmin]: + np.testing.assert_allclose(out.numpy(), x.numpy()) + out_empty_list = api(x, []) + self.assertEqual(out_empty_list, out) + if x.grad is not None: self.assertEqual(x.grad.shape, []) self.assertEqual(out.grad.shape, []) @@ -250,7 +253,9 @@ def test_static_reduce(self): res = exe.run(main_prog, fetch_list=fetch_list) self.assertEqual(res[0].shape, ()) self.assertEqual(res[1].shape, ()) - np.testing.assert_allclose(res[0], res[1]) + if api not in [paddle.argmax, paddle.argmin]: + np.testing.assert_allclose(res[0], res[1]) + if len(res) > 2: self.assertEqual(res[2].shape, ()) self.assertEqual(res[3].shape, ()) diff --git a/python/paddle/fluid/tests/unittests/xpu/test_zero_dim_tensor_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_zero_dim_tensor_xpu.py index f6f64aefe9db7..35e98e3cdaa75 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_zero_dim_tensor_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_zero_dim_tensor_xpu.py @@ -132,6 +132,8 @@ def test_dygraph_unary(self): paddle.logsumexp, paddle.all, paddle.any, + paddle.argmax, + paddle.argmin, ] @@ -153,7 +155,8 @@ def test_dygraph_reduce(self): self.assertEqual(x.shape, []) self.assertEqual(out.shape, []) - np.testing.assert_allclose(out.numpy(), x.numpy()) + if api not in [paddle.argmax, paddle.argmin]: + np.testing.assert_allclose(out.numpy(), x.numpy()) if x.grad is not None: self.assertEqual(x.grad.shape, []) self.assertEqual(out.grad.shape, []) From 3ab6faa8b311fdfcd9458641cd30eb8faf8379d8 Mon Sep 17 00:00:00 2001 From: RedContritio Date: Wed, 1 Feb 2023 17:08:09 +0800 Subject: [PATCH 76/89] Fix div 0 error of case11: paddle.nn.functional.max_pool1d/max_pool2d/max_pool3d (#50010) * add stride check for MaxPool * add unittests --- paddle/fluid/operators/pool_with_index_op.cc | 5 +++++ paddle/phi/kernels/funcs/pooling.h | 5 +++++ .../fluid/tests/unittests/test_pool1d_api.py | 14 +++++++++++++- .../fluid/tests/unittests/test_pool2d_api.py | 12 ++++++++++++ .../fluid/tests/unittests/test_pool3d_api.py | 12 ++++++++++++ 5 files changed, 47 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/operators/pool_with_index_op.cc b/paddle/fluid/operators/pool_with_index_op.cc index 74b98069bf647..79262db30fafb 100644 --- a/paddle/fluid/operators/pool_with_index_op.cc +++ b/paddle/fluid/operators/pool_with_index_op.cc @@ -27,6 +27,11 @@ inline int MaxPoolOutputSize(int input_size, int filter_size, int padding, int stride) { + PADDLE_ENFORCE_NE( + stride, + 0, + phi::errors::InvalidArgument( + "The stride of MaxPool shall not be 0, but received %d.", stride)); int output_size = (input_size - filter_size + 2 * padding) / stride + 1; return output_size; } diff --git a/paddle/phi/kernels/funcs/pooling.h b/paddle/phi/kernels/funcs/pooling.h index 17b87a0e17d51..c0741672a458e 100644 --- a/paddle/phi/kernels/funcs/pooling.h +++ b/paddle/phi/kernels/funcs/pooling.h @@ -402,6 +402,11 @@ inline int MaxPoolOutputSize(int input_size, int filter_size, int padding, int stride) { + PADDLE_ENFORCE_NE( + stride, + 0, + phi::errors::InvalidArgument( + "The stride of MaxPool shall not be 0, but received %d.", stride)); int output_size = (input_size - filter_size + 2 * padding) / stride + 1; return output_size; } diff --git a/python/paddle/fluid/tests/unittests/test_pool1d_api.py b/python/paddle/fluid/tests/unittests/test_pool1d_api.py index 2c191bf4892b7..3816822e8f3ec 100644 --- 
a/python/paddle/fluid/tests/unittests/test_pool1d_api.py +++ b/python/paddle/fluid/tests/unittests/test_pool1d_api.py @@ -274,7 +274,7 @@ def test_pool1d(self): self.check_max_dygraph_return_index_results(place) -class TestPool2DError_API(unittest.TestCase): +class TestPool1DError_API(unittest.TestCase): def test_error_api(self): def run1(): with fluid.dygraph.guard(): @@ -417,6 +417,18 @@ def run_stride_out_of_range(): self.assertRaises(ValueError, run_stride_out_of_range) + def run_zero_stride(): + with fluid.dygraph.guard(): + array = np.array([1], dtype=np.float32) + x = paddle.to_tensor( + np.reshape(array, [1, 1, 1]), dtype='float32' + ) + out = F.max_pool1d( + x, 1, stride=0, padding=1, return_mask=True, ceil_mode=True + ) + + self.assertRaises(ValueError, run_zero_stride) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_pool2d_api.py b/python/paddle/fluid/tests/unittests/test_pool2d_api.py index 44ef18605ed2a..c55ea337c41b7 100644 --- a/python/paddle/fluid/tests/unittests/test_pool2d_api.py +++ b/python/paddle/fluid/tests/unittests/test_pool2d_api.py @@ -597,6 +597,18 @@ def run_stride_out_of_range(): self.assertRaises(ValueError, run_stride_out_of_range) + def run_zero_stride(): + with fluid.dygraph.guard(): + array = np.array([1], dtype=np.float32) + x = paddle.to_tensor( + np.reshape(array, [1, 1, 1, 1]), dtype='float32' + ) + out = max_pool2d( + x, 1, stride=0, padding=1, return_mask=True, ceil_mode=True + ) + + self.assertRaises(ValueError, run_zero_stride) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_pool3d_api.py b/python/paddle/fluid/tests/unittests/test_pool3d_api.py index 961f0b5c569f0..30a03ab220bcc 100644 --- a/python/paddle/fluid/tests/unittests/test_pool3d_api.py +++ b/python/paddle/fluid/tests/unittests/test_pool3d_api.py @@ -563,6 +563,18 @@ def run_size_out_of_range(): self.assertRaises(ValueError, run_size_out_of_range) + def run_zero_stride(): + with fluid.dygraph.guard(): + array = np.array([1], dtype=np.float32) + x = paddle.to_tensor( + np.reshape(array, [1, 1, 1, 1, 1]), dtype='float32' + ) + out = max_pool3d( + x, 1, stride=0, padding=1, return_mask=True, ceil_mode=True + ) + + self.assertRaises(ValueError, run_zero_stride) + if __name__ == '__main__': unittest.main() From ccf8d96c5da5d3c6df8d1c15cbb812d15dfc81f9 Mon Sep 17 00:00:00 2001 From: pangyoki Date: Wed, 1 Feb 2023 17:08:34 +0800 Subject: [PATCH 77/89] support cuda11.7 manylinux (#44217) --- tools/dockerfile/build_scripts/install_cudnn.sh | 8 ++++++++ tools/dockerfile/build_scripts/install_nccl2.sh | 2 +- tools/dockerfile/centos7_manylinux.sh | 9 +++++++++ 3 files changed, 18 insertions(+), 1 deletion(-) diff --git a/tools/dockerfile/build_scripts/install_cudnn.sh b/tools/dockerfile/build_scripts/install_cudnn.sh index 0817634fa91af..2310370f223c8 100644 --- a/tools/dockerfile/build_scripts/install_cudnn.sh +++ b/tools/dockerfile/build_scripts/install_cudnn.sh @@ -45,4 +45,12 @@ elif [[ "$1" == "cudnn821" && "$VERSION" == "11.2" ]]; then cp -r lib64 /usr && cd ../ && \ rm -f cudnn-11.3-linux-x64-v8.2.1.32.tgz && \ rm -rf cuda +elif [[ "$1" == "cudnn841" && "$VERSION" == "11.7" ]]; then + wget -q https://paddle-ci.gz.bcebos.com/cudnn/cudnn-linux-x86_64-8.4.1.50_cuda11.6-archive.tar.xz --no-check-certificate + tar xJvf cudnn-linux-x86_64-8.4.1.50_cuda11.6-archive.tar.xz && \ + cd cudnn-linux-x86_64-8.4.1.50_cuda11.6-archive && \ + cp -r include /usr && \ + cp -r lib /usr && cd ../ && \ + rm -f 
cudnn-linux-x86_64-8.4.1.50_cuda11.6-archive.tar.xz && \ + rm -rf cudnn-linux-x86_64-8.4.1.50_cuda11.6-archive fi diff --git a/tools/dockerfile/build_scripts/install_nccl2.sh b/tools/dockerfile/build_scripts/install_nccl2.sh index d39e74f3cf537..c21267807976d 100644 --- a/tools/dockerfile/build_scripts/install_nccl2.sh +++ b/tools/dockerfile/build_scripts/install_nccl2.sh @@ -17,7 +17,7 @@ VERSION=$(nvcc --version | grep release | grep -oEi "release ([0-9]+)\.([0-9])"| sed "s/release //") if [ "$VERSION" == "10.0" ]; then DEB="nccl-repo-ubuntu1604-2.4.7-ga-cuda10.0_1-1_amd64.deb" -elif [ "$VERSION" == "10.2" ] || [ "$VERSION" == "10.1" ] || [ "$VERSION" == "11.0" ] || [ "$VERSION" == "11.2" ] || [ "$VERSION" == "11.3" ] || [ "$VERSION" == "11.4" ] || [ "$VERSION" == "11.5" ] || [ "$VERSION" == "11.6" ] || [ "$VERSION" == "11.8" ]; then +elif [ "$VERSION" == "10.2" ] || [ "$VERSION" == "10.1" ] || [ "$VERSION" == "11.0" ] || [ "$VERSION" == "11.2" ] || [ "$VERSION" == "11.3" ] || [ "$VERSION" == "11.4" ] || [ "$VERSION" == "11.5" ] || [ "$VERSION" == "11.6" ] || [ "$VERSION" == "11.7" ] || [ "$VERSION" == "11.8" ]; then if [ -f "/etc/redhat-release" ];then rm -f /usr/local/lib/libnccl.so wget --no-check-certificate -q https://nccl2-deb.cdn.bcebos.com/libnccl-2.10.3-1+cuda11.4.x86_64.rpm diff --git a/tools/dockerfile/centos7_manylinux.sh b/tools/dockerfile/centos7_manylinux.sh index 7b21a51045618..4beb8b3a592ad 100755 --- a/tools/dockerfile/centos7_manylinux.sh +++ b/tools/dockerfile/centos7_manylinux.sh @@ -128,6 +128,12 @@ function make_cuda116cudnn840trt8406gcc82() { sed -i '/CMD/iRUN ldconfig' Dockerfile.tmp } +function make_cuda117cudnn841() { + sed 's//11.7.0-devel-centos7/g' Dockerfile.centos >Dockerfile.tmp + sed -i "s#RUN bash build_scripts/build.sh#RUN bash build_scripts/install_gcc.sh gcc82 \nRUN mv /usr/bin/cc /usr/bin/cc.bak \&\& ln -s /usr/local/gcc-8.2/bin/gcc /usr/bin/cc \nENV PATH=/usr/local/gcc-8.2/bin:\$PATH \nRun bash build_scripts/install_cudnn.sh cudnn841 \nENV CUDNN_VERSION=8.4.1 \nRUN bash build_scripts/build.sh#g" Dockerfile.tmp + sed -i '/CMD/iRUN ldconfig' Dockerfile.tmp +} + function main() { local CMD=$1 case $CMD in @@ -188,6 +194,9 @@ function main() { cuda116cudnn840trt8406gcc82) make_cuda116cudnn840trt8406gcc82 ;; + cuda117cudnn841) + make_cuda117cudnn841 + ;; *) echo "Make dockerfile error, Without this paramet." 
exit 1 From 838dc660e51f6b1479274d706434d992ba4a44a6 Mon Sep 17 00:00:00 2001 From: pangyoki Date: Wed, 1 Feb 2023 17:09:03 +0800 Subject: [PATCH 78/89] [dockerfile] support python3.10 in manylinux dockerfile (#42126) * support python3.10 in manylinux dockerfile * fix python3.10 ssl * do not check for python3.10 --- tools/dockerfile/Dockerfile.centos | 7 +++++-- tools/dockerfile/build_scripts/build.sh | 17 ++++++++++------- tools/dockerfile/build_scripts/build_utils.sh | 7 +++++++ 3 files changed, 22 insertions(+), 9 deletions(-) diff --git a/tools/dockerfile/Dockerfile.centos b/tools/dockerfile/Dockerfile.centos index 1bc7727f304af..3452db468b311 100644 --- a/tools/dockerfile/Dockerfile.centos +++ b/tools/dockerfile/Dockerfile.centos @@ -50,17 +50,20 @@ RUN wget https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/python/re RUN LD_LIBRARY_PATH=/opt/_internal/cpython-3.7.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.7.0/bin/pip3 install setuptools -U && \ LD_LIBRARY_PATH=/opt/_internal/cpython-3.8.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.8.0/bin/pip3 install setuptools -U && \ - LD_LIBRARY_PATH=/opt/_internal/cpython-3.9.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.9.0/bin/pip3 install setuptools -U + LD_LIBRARY_PATH=/opt/_internal/cpython-3.9.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.9.0/bin/pip3 install setuptools -U && \ + LD_LIBRARY_PATH=/opt/_internal/cpython-3.10.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.10.0/bin/pip3 install setuptools -U RUN LD_LIBRARY_PATH=/opt/_internal/cpython-3.7.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.7.0/bin/pip3 install -r /root/requirements.txt && \ LD_LIBRARY_PATH=/opt/_internal/cpython-3.8.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.8.0/bin/pip3 install -r /root/requirements.txt && \ LD_LIBRARY_PATH=/opt/_internal/cpython-3.9.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.9.0/bin/pip3 install -r /root/requirements.txt && \ + LD_LIBRARY_PATH=/opt/_internal/cpython-3.10.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.10.0/bin/pip3 install -r /root/requirements.txt && \ go get github.com/Masterminds/glide && \ rm -rf /root/requirements.txt RUN LD_LIBRARY_PATH=/opt/_internal/cpython-3.7.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.7.0/bin/pip3 install pre-commit 'ipython==5.3.0' && \ LD_LIBRARY_PATH=/opt/_internal/cpython-3.8.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.8.0/bin/pip3 install pre-commit 'ipython==5.3.0' && \ - LD_LIBRARY_PATH=/opt/_internal/cpython-3.9.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.9.0/bin/pip3 install pre-commit 'ipython==5.3.0' + LD_LIBRARY_PATH=/opt/_internal/cpython-3.9.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.9.0/bin/pip3 install pre-commit 'ipython==5.3.0' && \ + LD_LIBRARY_PATH=/opt/_internal/cpython-3.10.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.10.0/bin/pip3 install pre-commit 'ipython==5.3.0' RUN wget -O /opt/swig-2.0.12.tar.gz --no-check-certificate https://sourceforge.net/projects/swig/files/swig/swig-2.0.12/swig-2.0.12.tar.gz/download && \ cd /opt && tar xzf swig-2.0.12.tar.gz && cd /opt/swig-2.0.12 && ./configure && make && make install && cd /opt && rm swig-2.0.12.tar.gz diff --git a/tools/dockerfile/build_scripts/build.sh b/tools/dockerfile/build_scripts/build.sh index 4bbe8198556e3..5822fa10160b7 100644 --- a/tools/dockerfile/build_scripts/build.sh +++ b/tools/dockerfile/build_scripts/build.sh @@ -24,7 +24,7 @@ set -ex # remove others to expedite build and reduce docker image size. 
The original # manylinux docker image project builds many python versions. # NOTE We added back 3.5.1, since auditwheel requires python 3.3+ -CPYTHON_VERSIONS="3.9.0 3.8.0 3.7.0" +CPYTHON_VERSIONS="3.10.0 3.9.0 3.8.0 3.7.0" # openssl version to build, with expected sha256 hash of .tar.gz # archive @@ -80,11 +80,12 @@ build_cpythons $CPYTHON_VERSIONS PY37_BIN=/opt/python/cp37-cp37m/bin PY38_BIN=/opt/python/cp38-cp38m/bin PY39_BIN=/opt/python/cp39-cp39m/bin +PY310_BIN=/opt/python/cp310-cp310m/bin # NOTE Since our custom manylinux image builds pythons with shared # libpython, we need to add libpython's dir to LD_LIBRARY_PATH before running # python. ORIGINAL_LD_LIBRARY_PATH="${LD_LIBRARY_PATH}" -LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}:$(dirname ${PY37_BIN})/lib:$(dirname ${PY38_BIN})/lib:$(dirname ${PY39_BIN})/lib" +LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}:$(dirname ${PY37_BIN})/lib:$(dirname ${PY38_BIN})/lib:$(dirname ${PY39_BIN})/lib:$(dirname ${PY310_BIN})/lib" # Our openssl doesn't know how to find the system CA trust store # (https://github.com/pypa/manylinux/issues/53) @@ -136,11 +137,13 @@ for PYTHON in /opt/python/*/bin/python; do # Add matching directory of libpython shared library to library lookup path LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}:$(dirname $(dirname ${PYTHON}))/lib" - # Smoke test to make sure that our Pythons work, and do indeed detect as - # being manylinux compatible: - LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}:$(dirname $(dirname ${PYTHON}))/lib" $PYTHON $MY_DIR/manylinux1-check.py - # Make sure that SSL cert checking works - LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}:$(dirname $(dirname ${PYTHON}))/lib" $PYTHON $MY_DIR/ssl-check.py + if [ "$(dirname $(dirname ${PYTHON}))" != "/opt/python/cp310-cp310" ]; then + # Smoke test to make sure that our Pythons work, and do indeed detect as + # being manylinux compatible: + LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}:$(dirname $(dirname ${PYTHON}))/lib" $PYTHON $MY_DIR/manylinux1-check.py + # Make sure that SSL cert checking works + LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}:$(dirname $(dirname ${PYTHON}))/lib" $PYTHON $MY_DIR/ssl-check.py + fi done # Restore LD_LIBRARY_PATH diff --git a/tools/dockerfile/build_scripts/build_utils.sh b/tools/dockerfile/build_scripts/build_utils.sh index 70071a9ccb07b..76ad518ae24cc 100755 --- a/tools/dockerfile/build_scripts/build_utils.sh +++ b/tools/dockerfile/build_scripts/build_utils.sh @@ -92,7 +92,14 @@ function do_cpython_build { if [ -e ${prefix}/bin/python3.9 ]; then ln -s python3.9 ${prefix}/bin/python fi + if [ -e ${prefix}/bin/python3.10 ]; then + ln -s python3.10 ${prefix}/bin/python + fi # NOTE Make libpython shared library visible to python calls below + if [ -e ${prefix}/bin/python3.10 ]; then + LD_LIBRARY_PATH="/usr/local/ssl/lib:${prefix}/lib" ${prefix}/bin/python -m pip config set global.trusted-host mirrors.aliyun.com + LD_LIBRARY_PATH="/usr/local/ssl/lib:${prefix}/lib" ${prefix}/bin/python -m pip config set global.index-url http://mirrors.aliyun.com/pypi/simple/ + fi LD_LIBRARY_PATH="/usr/local/ssl/lib:${prefix}/lib" ${prefix}/bin/python get-pip.py LD_LIBRARY_PATH="/usr/local/ssl/lib:${prefix}/lib" ${prefix}/bin/pip install wheel==0.32.2 cd / From 1346cd3516b0965590691449d587c306bba10285 Mon Sep 17 00:00:00 2001 From: pangyoki Date: Wed, 1 Feb 2023 17:09:43 +0800 Subject: [PATCH 79/89] [dockerfile] fix python3.7 setuptools bug in release18 dockerfile (#42575) * fix release dockerfile * fix GPG error in ubuntu18 * fix cpu * fix --- 
tools/dockerfile/Dockerfile.release18 | 6 ++++-- tools/dockerfile/ubuntu18_release.sh | 6 +++++- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/tools/dockerfile/Dockerfile.release18 b/tools/dockerfile/Dockerfile.release18 index cf343873d943a..42b24030c00a8 100644 --- a/tools/dockerfile/Dockerfile.release18 +++ b/tools/dockerfile/Dockerfile.release18 @@ -17,7 +17,9 @@ ENV HOME /root # Add bash enhancements COPY paddle/scripts/docker/root/ /root/ -RUN apt-get update && \ +RUN chmod 777 /tmp + +RUN apt-get update --allow-unauthenticated && \ apt-get install -y software-properties-common && add-apt-repository ppa:deadsnakes/ppa && \ apt-get update && \ apt-get install -y curl wget vim git unzip unrar tar xz-utils libssl-dev bzip2 gzip \ @@ -48,7 +50,7 @@ ENV PATH=/home/cmake-3.16.0-Linux-x86_64/bin:$PATH RUN apt-get update && \ - apt-get install -y python3.7 python3.7-dev && \ + apt-get install -y python3.7 python3.7-dev python3.7-distutils && \ mv /usr/bin/python /usr/bin/python.bak && ln -s /usr/bin/python3.7 /usr/bin/python && \ mv /usr/bin/python3 /usr/bin/python3.bak && ln -s /usr/bin/python3.7 /usr/bin/python3 diff --git a/tools/dockerfile/ubuntu18_release.sh b/tools/dockerfile/ubuntu18_release.sh index 2c12d4b74c073..4c4cc780ce8be 100755 --- a/tools/dockerfile/ubuntu18_release.sh +++ b/tools/dockerfile/ubuntu18_release.sh @@ -80,7 +80,11 @@ function install_whl(){ function set_cuda_env(){ if [[ ${WITH_GPU} == "ON" ]]; then - sed -i "s##ENV LD_LIBRARY_PATH=/usr/local/cuda-${ref_CUDA_MAJOR}/targets/x86_64-linux/lib:\$LD_LIBRARY_PATH #g" Dockerfile.tmp + sed -i "s##ENV LD_LIBRARY_PATH=/usr/local/cuda-${ref_CUDA_MAJOR}/targets/x86_64-linux/lib:\$LD_LIBRARY_PATH \\ +\\ +RUN apt-key del 7fa2af80 \\ +RUN rm /etc/apt/sources.list.d/cuda.list \&\& rm /etc/apt/sources.list.d/nvidia-ml.list \\ +RUN apt-key adv --fetch-keys https://developer.download.nvidia.cn/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub #g" Dockerfile.tmp else sed -i 's###g' Dockerfile.tmp fi From 3cf50f91472544b7560241b1a2ee9d9155175997 Mon Sep 17 00:00:00 2001 From: RedContritio Date: Wed, 1 Feb 2023 17:11:01 +0800 Subject: [PATCH 80/89] =?UTF-8?q?Fix=20=E7=A9=BA=E6=8C=87=E9=92=88=20(Null?= =?UTF-8?q?=20pointer)=20of=20case8:=20paddle.slice=20(#49979)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add check for input of slice * add unittest --- paddle/phi/infermeta/unary.cc | 15 +++++++++++++ .../fluid/tests/unittests/test_slice_op.py | 21 +++++++++++++++++++ 2 files changed, 36 insertions(+) diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 2b35545db1cd8..f2fcb3162081f 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -3395,6 +3395,21 @@ void SliceRawInferMeta(const MetaTensor& input, } } + PADDLE_ENFORCE_EQ( + axes.size(), + starts_arr.size(), + phi::errors::InvalidArgument( + "The length of axes (%d) and length of starts (%d) should be same.", + axes.size(), + starts_arr.size())); + PADDLE_ENFORCE_EQ( + axes.size(), + ends_arr.size(), + phi::errors::InvalidArgument( + "The length of axes (%d) and length of ends (%d) should be same.", + axes.size(), + ends_arr.size())); + // 2.1 Check attrs. 
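As an illustrative aside (not part of this patch): the two checks added above require axes, starts and ends to have the same length, so a mismatch is reported as a ValueError up front rather than as the null-pointer failure this patch fixes. A minimal sketch of the behaviour, mirroring the new unit test below (0-size float32 input, real paddle APIs):

    import numpy as np
    import paddle

    x = paddle.to_tensor(np.zeros([0], dtype=np.float32))
    out = paddle.slice(x, axes=[0], starts=[0], ends=[0])  # lengths match: returns an empty tensor
    # paddle.slice(x, axes=[0], starts=[0], ends=[])       # length mismatch: now raises ValueError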
std::vector starts = starts_arr.GetData(); std::vector ends = ends_arr.GetData(); diff --git a/python/paddle/fluid/tests/unittests/test_slice_op.py b/python/paddle/fluid/tests/unittests/test_slice_op.py index 19aa669badf5c..157818e794301 100644 --- a/python/paddle/fluid/tests/unittests/test_slice_op.py +++ b/python/paddle/fluid/tests/unittests/test_slice_op.py @@ -852,6 +852,27 @@ def test_axis_less_than_zero(self): paddle.slice(x, 0, starts, ends) +class TestSliceOpError(unittest.TestCase): + def test_dismatch_shape(self): + with fluid.dygraph.guard(): + with self.assertRaises(ValueError): + array = np.array([], dtype=np.float32) + x = paddle.to_tensor(np.reshape(array, [0]), dtype='float32') + paddle.slice(x, axes=[0], starts=[], ends=[]) + + with self.assertRaises(ValueError): + array = np.array([], dtype=np.float32) + x = paddle.to_tensor(np.reshape(array, [0]), dtype='float32') + paddle.slice(x, axes=[0], starts=[0], ends=[]) + + # if shape match, pass + array = np.array([], dtype=np.float32) + x = paddle.to_tensor(np.reshape(array, [0]), dtype='float32') + out = paddle.slice(x, axes=[0], starts=[0], ends=[0]) + self.assertEqual(out.numel(), 0) + # self.assertEqual(out.shape) + + @unittest.skipIf( not core.is_compiled_with_cuda(), "core is not compiled with CUDA" ) From fd5b8eea463ba14d86c12c327deae0475aa10f0f Mon Sep 17 00:00:00 2001 From: RedContritio Date: Wed, 1 Feb 2023 17:15:42 +0800 Subject: [PATCH 81/89] Fix Python IndexError of case2-3 (#49986) * add shape check for fused_multi_head_attention * use raise for coverage test * add unittest * remove unnecessary pass * add unittest --- .../test_fused_attention_no_dropout.py | 12 +++++++++ .../test_fused_multi_transformer_op.py | 26 +++++++++++++++++++ .../nn/functional/fused_transformer.py | 5 ++++ 3 files changed, 43 insertions(+) diff --git a/python/paddle/fluid/tests/unittests/test_fused_attention_no_dropout.py b/python/paddle/fluid/tests/unittests/test_fused_attention_no_dropout.py index c459f2dbb22e6..4f18abd79e0fe 100644 --- a/python/paddle/fluid/tests/unittests/test_fused_attention_no_dropout.py +++ b/python/paddle/fluid/tests/unittests/test_fused_attention_no_dropout.py @@ -192,5 +192,17 @@ def set_configs(self): self.normalize_before = True +class TestFusedAttentionAPIError(unittest.TestCase): + def test_invalid_x_rank(self): + def test_x_rank_1(): + with paddle.fluid.dygraph.guard(): + layer = FusedMultiHeadAttention(embed_dim=1, num_heads=1) + array = np.array([1.9], dtype=np.float32) + x = paddle.to_tensor(np.reshape(array, [1]), dtype='float32') + out = layer(x) + + self.assertRaises(ValueError, test_x_rank_1) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fused_multi_transformer_op.py b/python/paddle/fluid/tests/unittests/test_fused_multi_transformer_op.py index e3da925a01e42..8068387cfdcba 100644 --- a/python/paddle/fluid/tests/unittests/test_fused_multi_transformer_op.py +++ b/python/paddle/fluid/tests/unittests/test_fused_multi_transformer_op.py @@ -1051,5 +1051,31 @@ def test_fused_multi_transformer_op(self): ) +class TestFusedMultiAttentionAPIError(unittest.TestCase): + def test_errors(self): + def test_invalid_input_dim(): + array = np.array([1.9], dtype=np.float32) + x = paddle.to_tensor(np.reshape(array, [1]), dtype='float32') + layer = paddle.incubate.nn.FusedMultiHeadAttention( + embed_dim=1, num_heads=1 + ) + out = layer(x) + + self.assertRaises(ValueError, test_invalid_input_dim) + + +class TestFusedMultiTransformerAPIError(unittest.TestCase): + def 
test_errors(self): + def test_invalid_input_dim(): + array = np.array([], dtype=np.float32) + x = paddle.to_tensor(np.reshape(array, [0]), dtype='int32') + layer = paddle.incubate.nn.FusedTransformerEncoderLayer( + 108, 108, 108, 0.0, 'relu' + ) + out = layer(x) + + self.assertRaises(ValueError, test_invalid_input_dim) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/incubate/nn/functional/fused_transformer.py b/python/paddle/incubate/nn/functional/fused_transformer.py index 01d2161b22342..19ec0ad2458d7 100644 --- a/python/paddle/incubate/nn/functional/fused_transformer.py +++ b/python/paddle/incubate/nn/functional/fused_transformer.py @@ -615,6 +615,11 @@ def fused_multi_head_attention( 'downgrade_in_infer' if mode == 'downscale_in_infer' else mode ) # semantic transfer + if x.ndim != 3: + raise ValueError( + f"The rank of the x should be 3, but received {x.ndim}." + ) + if _non_static_mode(): if default_main_program().random_seed != 0: seed = default_main_program().random_seed From 34bf3d09b125cc6590edfd98bcb5bdeaf98f7ad3 Mon Sep 17 00:00:00 2001 From: RedContritio Date: Wed, 1 Feb 2023 18:00:53 +0800 Subject: [PATCH 82/89] =?UTF-8?q?Fix=20UFA=E9=9D=9E=E6=B3=95=E5=9C=B0?= =?UTF-8?q?=E5=9D=80=E8=AE=BF=E9=97=AE(UFA=20illegal=20address=20access)?= =?UTF-8?q?=20of=20case3:=20paddle.crop=20(#49994)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add range check for crop_kernel * remove shape negative check * add unittest --- paddle/phi/kernels/impl/crop_kernel_impl.h | 10 ++++++++++ python/paddle/fluid/tests/unittests/test_crop_op.py | 7 +++++++ 2 files changed, 17 insertions(+) diff --git a/paddle/phi/kernels/impl/crop_kernel_impl.h b/paddle/phi/kernels/impl/crop_kernel_impl.h index d3cb672104d67..5aa951d4da09d 100644 --- a/paddle/phi/kernels/impl/crop_kernel_impl.h +++ b/paddle/phi/kernels/impl/crop_kernel_impl.h @@ -100,6 +100,16 @@ void CropTensorFunction(const Context& dev_ctx, out->Resize(out_dims); dev_ctx.template Alloc(out); for (size_t i = 0; i < offsets_vec.size(); ++i) { + PADDLE_ENFORCE_GE( + offsets_vec[i], + 0, + errors::InvalidArgument("The offsets (%d) of the %uth elements of" + " Op(crop_tensor) " + "should be greater than or " + "equal to 0.", + offsets_vec[i], + i)); + PADDLE_ENFORCE_LE(offsets_vec[i] + shape_vec[i], x_dims[i], errors::InvalidArgument( diff --git a/python/paddle/fluid/tests/unittests/test_crop_op.py b/python/paddle/fluid/tests/unittests/test_crop_op.py index 1050fb0ad5c57..f5886edc3350c 100644 --- a/python/paddle/fluid/tests/unittests/test_crop_op.py +++ b/python/paddle/fluid/tests/unittests/test_crop_op.py @@ -149,6 +149,13 @@ def test_crop_none_shape(self): self.assertEqual(crop.shape, (3, 6, 6)) +class TestCropError(unittest.TestCase): + def test_neg_offset_error(self): + with self.assertRaises(ValueError): + x = fluid.data(name='input2', shape=[1], dtype="float32") + out = paddle.crop(x, offsets=[-1]) + + if __name__ == '__main__': paddle.enable_static() unittest.main() From e6d29e0020ac7057b2bed5fef9d689d5f18642be Mon Sep 17 00:00:00 2001 From: risemeup1 <62429225+risemeup1@users.noreply.github.com> Date: Wed, 1 Feb 2023 19:58:46 +0800 Subject: [PATCH 83/89] add information of build_size (#49397) --- paddle/scripts/paddle_build.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 2c83897b16678..d389b76cfe2c9 100644 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ 
-459,7 +459,7 @@ EOF if ls ${PADDLE_ROOT}/build/python/dist/*whl >/dev/null 2>&1; then PR_whlSize=$($com ${PADDLE_ROOT}/build/python/dist |awk '{print $1}') elif ls ${PADDLE_ROOT}/dist/*whl >/dev/null 2>&1; then - PR_whlSize=$($com ${PADDLE_ROOT}/build/python/dist |awk '{print $1}') + PR_whlSize=$($com ${PADDLE_ROOT}/dist |awk '{print $1}') fi echo "PR whl Size: $PR_whlSize" echo "ipipe_log_param_PR_whl_Size: $PR_whlSize" >> ${PADDLE_ROOT}/build/build_summary.txt @@ -3763,6 +3763,8 @@ function run_setup(){ exit 7; fi + build_size + endTime_s=`date +%s` [ -n "$startTime_firstBuild" ] && startTime_s=$startTime_firstBuild echo "Build Time: $[ $endTime_s - $startTime_s ]s" From 2b848aef55e47c514cef216af51ba7bcad1e43b7 Mon Sep 17 00:00:00 2001 From: Yuang Liu Date: Wed, 1 Feb 2023 20:10:35 +0800 Subject: [PATCH 84/89] Fused attention pass fwd, create the fused_attention op. (#50125) --- .../framework/ir/fused_attention_pass.cc | 246 +++++++++++++++--- .../fluid/framework/ir/fused_attention_pass.h | 24 +- .../unittests/test_fused_attention_pass.py | 14 +- 3 files changed, 231 insertions(+), 53 deletions(-) diff --git a/paddle/fluid/framework/ir/fused_attention_pass.cc b/paddle/fluid/framework/ir/fused_attention_pass.cc index 72fa90db9b15a..7b0f469ff87cd 100644 --- a/paddle/fluid/framework/ir/fused_attention_pass.cc +++ b/paddle/fluid/framework/ir/fused_attention_pass.cc @@ -22,7 +22,6 @@ namespace patterns { PDNode* FusedAttentionPattern::operator()(PDNode* x, bool pre_layer_norm, - bool post_layer_norm, bool has_attn_mask, bool do_dropout, bool add_residual) { @@ -259,7 +258,7 @@ PDNode* FusedAttentionPattern::operator()(PDNode* x, out_linear_dropout_node->LinksFrom({out_linear_ele_add_out_node}) .LinksTo({out_linear_dropout_mask_node, out_linear_dropout_out_node}); - if (!add_residual && !post_layer_norm) { + if (!add_residual && pre_layer_norm) { return out_linear_dropout_out_node; } @@ -276,7 +275,7 @@ PDNode* FusedAttentionPattern::operator()(PDNode* x, residual_ele_add_node->LinksFrom({x, out_linear_dropout_out_node}) .LinksTo({residual_ele_add_out_node}); - if (!post_layer_norm) { + if (pre_layer_norm) { return residual_ele_add_out_node; } } @@ -323,13 +322,12 @@ PDNode* FusedAttentionPattern::operator()(PDNode* x, PDNode* FusedAttentionGradPattern::operator()(PDNode* x, bool pre_layer_norm, - bool post_layer_norm, bool has_attn_mask, bool do_dropout, bool add_residual) { // post layer norm PDNode* post_layer_norm_grad_out_node{nullptr}; - if (post_layer_norm) { + if (!pre_layer_norm) { auto* post_layer_norm_grad_node = pattern->NewNode(post_layer_norm_grad_op_repr()) ->assert_is_op("layer_norm_grad"); @@ -375,7 +373,7 @@ PDNode* FusedAttentionGradPattern::operator()(PDNode* x, PDNode* residual_ele_add_grad_x_grad_node{nullptr}; if (add_residual) { PDNode* ele_add_grad_input = x; - if (post_layer_norm) { + if (!pre_layer_norm) { ele_add_grad_input = post_layer_norm_grad_out_node; } auto* residual_ele_add_grad_node = @@ -404,7 +402,7 @@ PDNode* FusedAttentionGradPattern::operator()(PDNode* x, // get the real input x for dropout grad PDNode* out_linear_grad_input_node = x; - if (post_layer_norm && !add_residual) { + if (!pre_layer_norm && !add_residual) { out_linear_grad_input_node = post_layer_norm_grad_out_node; } else if (add_residual) { out_linear_grad_input_node = residual_ele_add_grad_out_node; @@ -769,11 +767,11 @@ PDNode* FusedAttentionGradPattern::operator()(PDNode* x, void FusedAttentionsPass::ApplyImpl(Graph* graph) const { FusePassBase::Init(name_scope_, graph); - graph = 
PreMaskDropResPostFwd(graph); - graph = PreMaskDropResPostBwd(graph); + graph = PreMaskDropResFwd(graph); + graph = PreMaskDropResBwd(graph); } -ir::Graph* FusedAttentionsPass::PreMaskDropResPostFwd(Graph* graph) const { +ir::Graph* FusedAttentionsPass::PreMaskDropResFwd(Graph* graph) const { GraphPatternDetector gpd; auto* x = gpd.mutable_pattern() ->NewNode(patterns::PDNodeName(name_scope_, "x")) @@ -784,7 +782,6 @@ ir::Graph* FusedAttentionsPass::PreMaskDropResPostFwd(Graph* graph) const { fused_attention_pattern(x, /* pre_layer_norm */ true, - /* post_layer_norm */ true, /* has_attn_mask */ true, /* do_dropout */ true, /* add_residual */ true); @@ -835,10 +832,191 @@ ir::Graph* FusedAttentionsPass::PreMaskDropResPostFwd(Graph* graph) const { fused_attention_pattern); GET_IR_NODE_FROM_SUBGRAPH( residual_ele_add_op_node, residual_ele_add_op, fused_attention_pattern); + + OpDesc fused_attention_op_desc(pre_layer_norm_op_node->Op()->Block()); + fused_attention_op_desc.SetType("fused_attention"); + fused_attention_op_desc.SetInput("X", {subgraph.at(x)->Name()}); + + fused_attention_op_desc.SetAttr("pre_layer_norm", true); + GET_IR_NODE_FROM_SUBGRAPH(pre_layer_norm_scale_node, + pre_layer_norm_scale, + fused_attention_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + pre_layer_norm_bias_node, pre_layer_norm_bias, fused_attention_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + pre_layer_norm_out_node, pre_layer_norm_out, fused_attention_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + pre_layer_norm_mean_node, pre_layer_norm_mean, fused_attention_pattern); + GET_IR_NODE_FROM_SUBGRAPH(pre_layer_norm_variance_node, + pre_layer_norm_variance, + fused_attention_pattern); + fused_attention_op_desc.SetInput("LnScale", + {pre_layer_norm_scale_node->Name()}); + fused_attention_op_desc.SetInput("LnBias", + {pre_layer_norm_bias_node->Name()}); + fused_attention_op_desc.SetOutput("LnOut", + {pre_layer_norm_out_node->Name()}); + fused_attention_op_desc.SetOutput("LnMean", + {pre_layer_norm_mean_node->Name()}); + fused_attention_op_desc.SetOutput("LnVariance", + {pre_layer_norm_variance_node->Name()}); + fused_attention_op_desc.SetAttr( + "epsilon", + PADDLE_GET_CONST(float, + pre_layer_norm_op_node->Op()->GetAttr("epsilon"))); + + fused_attention_op_desc.SetAttr("transpose_qkv_wb", true); + std::vector shape = PADDLE_GET_CONST( + std::vector, fuse_qkv_reshape_op_node->Op()->GetAttr("shape")); + fused_attention_op_desc.SetAttr("num_heads", shape[2]); GET_IR_NODE_FROM_SUBGRAPH( - post_layer_norm_op_node, post_layer_norm_op, fused_attention_pattern); + fuse_qkv_matmul_w_node, fuse_qkv_matmul_w, fused_attention_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + fuse_qkv_matmul_out_node, fuse_qkv_matmul_out, fused_attention_pattern); + GET_IR_NODE_FROM_SUBGRAPH(fuse_qkv_ele_add_bias_node, + fuse_qkv_ele_add_bias, + fused_attention_pattern); + GET_IR_NODE_FROM_SUBGRAPH(fuse_qkv_ele_add_out_node, + fuse_qkv_ele_add_out, + fused_attention_pattern); + GET_IR_NODE_FROM_SUBGRAPH(fuse_qkv_transpose_out_node, + fuse_qkv_transpose_out, + fused_attention_pattern); + fused_attention_op_desc.SetInput("QKVW", {fuse_qkv_matmul_w_node->Name()}); + fused_attention_op_desc.SetInput("QKVBias", + {fuse_qkv_ele_add_bias_node->Name()}); + fused_attention_op_desc.SetOutput("QKVOut", + {fuse_qkv_matmul_out_node->Name()}); + fused_attention_op_desc.SetOutput("QKVBiasOut", + {fuse_qkv_ele_add_out_node->Name()}); + fused_attention_op_desc.SetOutput("TransposeOut2", + {fuse_qkv_transpose_out_node->Name()}); - // TODO(Yuang Liu): finish the handler + 
GET_IR_NODE_FROM_SUBGRAPH( + qk_matmul_out_node, qk_matmul_out, fused_attention_pattern); + GET_IR_NODE_FROM_SUBGRAPH(add_mask_ele_add_mask_node, + add_mask_ele_add_mask, + fused_attention_pattern); + GET_IR_NODE_FROM_SUBGRAPH(add_mask_ele_add_out_node, + add_mask_ele_add_out, + fused_attention_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + qk_softmax_out_node, qk_softmax_out, fused_attention_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + attn_dropout_out_node, attn_dropout_out, fused_attention_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + attn_dropout_mask_node, attn_dropout_mask, fused_attention_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + qkv_matmul_out_node, qkv_matmul_out, fused_attention_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + qkv_reshape_out_node, qkv_reshape_out, fused_attention_pattern); + fused_attention_op_desc.SetOutput("QKOut", {qk_matmul_out_node->Name()}); + fused_attention_op_desc.SetInput("SrcMask", + {add_mask_ele_add_mask_node->Name()}); + fused_attention_op_desc.SetOutput("SrcMaskOut", + {add_mask_ele_add_out_node->Name()}); + fused_attention_op_desc.SetOutput("SoftmaxOut", + {qk_softmax_out_node->Name()}); + fused_attention_op_desc.SetAttr( + "attn_dropout_rate", + PADDLE_GET_CONST(float, + attn_dropout_op_node->Op()->GetAttr("dropout_prob"))); + fused_attention_op_desc.SetAttr( + "is_test", + PADDLE_GET_CONST(bool, attn_dropout_op_node->Op()->GetAttr("is_test"))); + fused_attention_op_desc.SetAttr( + "attn_dropout_fix_seed", + PADDLE_GET_CONST(bool, + attn_dropout_op_node->Op()->GetAttr("fix_seed"))); + fused_attention_op_desc.SetAttr( + "attn_dropout_seed", + PADDLE_GET_CONST(int, attn_dropout_op_node->Op()->GetAttr("seed"))); + fused_attention_op_desc.SetAttr( + "attn_dropout_implementation", + PADDLE_GET_CONST( + std::string, + attn_dropout_op_node->Op()->GetAttr("dropout_implementation"))); + fused_attention_op_desc.SetOutput("AttnDropoutMaskOut", + {attn_dropout_mask_node->Name()}); + fused_attention_op_desc.SetOutput("AttnDropoutOut", + {attn_dropout_out_node->Name()}); + fused_attention_op_desc.SetOutput("QKTVOut", {qkv_matmul_out_node->Name()}); + fused_attention_op_desc.SetOutput("FMHAOut", + {qkv_reshape_out_node->Name()}); + + GET_IR_NODE_FROM_SUBGRAPH( + out_linear_matmul_w_node, out_linear_matmul_w, fused_attention_pattern); + GET_IR_NODE_FROM_SUBGRAPH(out_linear_matmul_out_node, + out_linear_matmul_out, + fused_attention_pattern); + GET_IR_NODE_FROM_SUBGRAPH(out_linear_ele_add_bias_node, + out_linear_ele_add_bias, + fused_attention_pattern); + GET_IR_NODE_FROM_SUBGRAPH(out_linear_ele_add_out_node, + out_linear_ele_add_out, + fused_attention_pattern); + fused_attention_op_desc.SetInput("OutLinearW", + {out_linear_matmul_w_node->Name()}); + fused_attention_op_desc.SetInput("OutLinearBias", + {out_linear_ele_add_bias_node->Name()}); + fused_attention_op_desc.SetOutput("OutLinearOut", + {out_linear_matmul_out_node->Name()}); + GET_IR_NODE_FROM_SUBGRAPH(out_linear_dropout_mask_node, + out_linear_dropout_mask, + fused_attention_pattern); + fused_attention_op_desc.SetAttr( + "dropout_rate", + PADDLE_GET_CONST( + float, out_linear_dropout_op_node->Op()->GetAttr("dropout_prob"))); + fused_attention_op_desc.SetAttr( + "dropout_fix_seed", + PADDLE_GET_CONST( + bool, out_linear_dropout_op_node->Op()->GetAttr("fix_seed"))); + fused_attention_op_desc.SetAttr( + "dropout_seed", + PADDLE_GET_CONST(int, + out_linear_dropout_op_node->Op()->GetAttr("seed"))); + fused_attention_op_desc.SetAttr( + "dropout_implementation", + PADDLE_GET_CONST(std::string, + 
out_linear_dropout_op_node->Op()->GetAttr( + "dropout_implementation"))); + fused_attention_op_desc.SetOutput("DropoutMaskOut", + {out_linear_dropout_mask_node->Name()}); + + GET_IR_NODE_FROM_SUBGRAPH(residual_ele_add_out_node, + residual_ele_add_out, + fused_attention_pattern); + fused_attention_op_desc.SetAttr("add_residual", true); + fused_attention_op_desc.SetOutput("Y", {residual_ele_add_out_node->Name()}); + + auto fused_attention_node = g->CreateOpNode(&fused_attention_op_desc); + + IR_NODE_LINK_TO(subgraph.at(x), fused_attention_node); + IR_NODE_LINK_TO(pre_layer_norm_scale_node, fused_attention_node); + IR_NODE_LINK_TO(pre_layer_norm_bias_node, fused_attention_node); + IR_NODE_LINK_TO(fuse_qkv_matmul_w_node, fused_attention_node); + IR_NODE_LINK_TO(fuse_qkv_ele_add_bias_node, fused_attention_node); + IR_NODE_LINK_TO(add_mask_ele_add_mask_node, fused_attention_node); + IR_NODE_LINK_TO(out_linear_matmul_w_node, fused_attention_node); + IR_NODE_LINK_TO(out_linear_ele_add_bias_node, fused_attention_node); + + IR_NODE_LINK_TO(fused_attention_node, pre_layer_norm_out_node); + IR_NODE_LINK_TO(fused_attention_node, pre_layer_norm_mean_node); + IR_NODE_LINK_TO(fused_attention_node, pre_layer_norm_variance_node); + IR_NODE_LINK_TO(fused_attention_node, fuse_qkv_matmul_out_node); + IR_NODE_LINK_TO(fused_attention_node, fuse_qkv_ele_add_out_node); + IR_NODE_LINK_TO(fused_attention_node, fuse_qkv_transpose_out_node); + IR_NODE_LINK_TO(fused_attention_node, qk_matmul_out_node); + IR_NODE_LINK_TO(fused_attention_node, add_mask_ele_add_out_node); + IR_NODE_LINK_TO(fused_attention_node, qk_softmax_out_node); + IR_NODE_LINK_TO(fused_attention_node, attn_dropout_mask_node); + IR_NODE_LINK_TO(fused_attention_node, attn_dropout_out_node); + IR_NODE_LINK_TO(fused_attention_node, qkv_matmul_out_node); + IR_NODE_LINK_TO(fused_attention_node, qkv_reshape_out_node); + IR_NODE_LINK_TO(fused_attention_node, out_linear_matmul_out_node); + IR_NODE_LINK_TO(fused_attention_node, out_linear_dropout_mask_node); + IR_NODE_LINK_TO(fused_attention_node, residual_ele_add_out_node); GraphSafeRemoveNodes(g, {pre_layer_norm_op_node, @@ -858,8 +1036,7 @@ ir::Graph* FusedAttentionsPass::PreMaskDropResPostFwd(Graph* graph) const { out_linear_matmul_op_node, out_linear_ele_add_op_node, out_linear_dropout_op_node, - residual_ele_add_op_node, - post_layer_norm_op_node}); + residual_ele_add_op_node}); found_fused_attention++; }; @@ -869,18 +1046,17 @@ ir::Graph* FusedAttentionsPass::PreMaskDropResPostFwd(Graph* graph) const { return graph; } -ir::Graph* FusedAttentionsPass::PreMaskDropResPostBwd(Graph* graph) const { +ir::Graph* FusedAttentionsPass::PreMaskDropResBwd(Graph* graph) const { GraphPatternDetector gpd; auto* x = gpd.mutable_pattern() ->NewNode(patterns::PDNodeName(name_scope_, "x")) ->AsInput() - ->assert_is_op_input("layer_norm_grad", "Y@GRAD"); + ->assert_is_op_input("elementwise_add_grad", "Out@GRAD"); patterns::FusedAttentionGradPattern fused_attention_grad_pattern( gpd.mutable_pattern(), "fused_attention_grad_pattern"); fused_attention_grad_pattern(x, /* pre_layer_norm */ true, - /* post_layer_norm */ true, /* has_attn_mask */ true, /* do_dropout */ true, /* add_residual */ true); @@ -891,9 +1067,6 @@ ir::Graph* FusedAttentionsPass::PreMaskDropResPostBwd(Graph* graph) const { Graph* g) { VLOG(3) << "handle FusedMultiHeadAttention backward pass's fusion"; - GET_IR_NODE_FROM_SUBGRAPH(post_layer_norm_grad_op_node, - post_layer_norm_grad_op, - fused_attention_grad_pattern); 
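    // With the pattern now fixed to pre layer norm (the post_layer_norm flag
    // was removed), the backward handler no longer fetches a
    // post_layer_norm_grad node; matching starts from the residual
    // elementwise_add_grad fetched below.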
GET_IR_NODE_FROM_SUBGRAPH(residual_ele_add_grad_op_node, residual_ele_add_grad_op, fused_attention_grad_pattern); @@ -953,17 +1126,26 @@ ir::Graph* FusedAttentionsPass::PreMaskDropResPostBwd(Graph* graph) const { // TODO(Yuang Liu): finish the handler - GraphSafeRemoveNodes( - g, {post_layer_norm_grad_op_node, residual_ele_add_grad_op_node, - out_linear_dropout_grad_op_node, out_linear_ele_add_grad_op_node, - out_linear_matmul_grad_op_node, qkv_reshape_grad_op_node, - qkv_transpose_grad_op_node, qkv_matmul_grad_op_node, - attn_dropout_grad_op_node, qk_softmax_grad_op_node, - add_mask_ele_add_grad_op_node, qk_scale_grad_op_node, - qk_matmul_grad_op_node, fuse_qkv_split_grad_op_node, - fuse_qkv_transpose_grad_op_node, fuse_qkv_reshape_grad_op_node, - fuse_qkv_ele_add_grad_op_node, fuse_qkv_matmul_grad_op_node, - pre_layer_norm_grad_op_node, grad_accumulation_sum_op_node}); + GraphSafeRemoveNodes(g, + {residual_ele_add_grad_op_node, + out_linear_dropout_grad_op_node, + out_linear_ele_add_grad_op_node, + out_linear_matmul_grad_op_node, + qkv_reshape_grad_op_node, + qkv_transpose_grad_op_node, + qkv_matmul_grad_op_node, + attn_dropout_grad_op_node, + qk_softmax_grad_op_node, + add_mask_ele_add_grad_op_node, + qk_scale_grad_op_node, + qk_matmul_grad_op_node, + fuse_qkv_split_grad_op_node, + fuse_qkv_transpose_grad_op_node, + fuse_qkv_reshape_grad_op_node, + fuse_qkv_ele_add_grad_op_node, + fuse_qkv_matmul_grad_op_node, + pre_layer_norm_grad_op_node, + grad_accumulation_sum_op_node}); found_fused_attention++; }; diff --git a/paddle/fluid/framework/ir/fused_attention_pass.h b/paddle/fluid/framework/ir/fused_attention_pass.h index d360f7f6520d1..41a90bd59960f 100644 --- a/paddle/fluid/framework/ir/fused_attention_pass.h +++ b/paddle/fluid/framework/ir/fused_attention_pass.h @@ -28,7 +28,7 @@ namespace patterns { // Declare patterns for multi head attention. // Can detect: -// 1. Pre layer norm, post layer norm or sandwich layer norm. +// 1. Pre layer norm or post layer norm. // 2. Add attn mask for qk product before the softmax or not. // 3. Do attn dropout or not. // 4. Add residual to the out linear result or not. 
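// For reference, the forward pattern is now built with four flags (the
// separate post_layer_norm flag is gone; post layer norm is simply
// pre_layer_norm == false), e.g.:
//   fused_attention_pattern(x,
//                           /* pre_layer_norm */ true,
//                           /* has_attn_mask */ true,
//                           /* do_dropout */ true,
//                           /* add_residual */ true);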
@@ -37,11 +37,10 @@ struct FusedAttentionPattern : public PatternBase { : PatternBase(pattern, name_scope, "fused_attention_pattern") {} PDNode* operator()(PDNode* x, - bool pre_layer_norm, // do pre ln or not - bool post_layer_norm, // do post ln or not - bool has_attn_mask, // add attn mask to qk or not - bool do_dropout, // dropout the softmax(qk) or not - bool add_residual); // add residual to out linear or not + bool pre_layer_norm, // do pre ln or not + bool has_attn_mask, // add attn mask to qk or not + bool do_dropout, // dropout the softmax(qk) or not + bool add_residual); // add residual to out linear or not // pre layer norm PATTERN_DECL_NODE(pre_layer_norm_op); @@ -134,11 +133,10 @@ struct FusedAttentionGradPattern : public PatternBase { : PatternBase(pattern, name_scope, "fused_attention_pattern") {} PDNode* operator()(PDNode* x, - bool pre_layer_norm, // pre ln - bool post_layer_norm, // post ln - bool has_attn_mask, // add attn mask to qk or not - bool do_dropout, // dropout the softmax(qk) or not - bool add_residual); // add residual to out linear or not + bool pre_layer_norm, // pre ln + bool has_attn_mask, // add attn mask to qk or not + bool do_dropout, // dropout the softmax(qk) or not + bool add_residual); // add residual to out linear or not // post layer norm grad PATTERN_DECL_NODE(post_layer_norm_grad_op); @@ -275,9 +273,9 @@ class FusedAttentionsPass : public FusePassBase { // If true, the function name will have an abbreviation part. // If false, the function name won't contain an abbreviation for it. - ir::Graph* PreMaskDropResPostFwd(Graph* graph) const; + ir::Graph* PreMaskDropResFwd(Graph* graph) const; - ir::Graph* PreMaskDropResPostBwd(Graph* graph) const; + ir::Graph* PreMaskDropResBwd(Graph* graph) const; }; } // namespace ir diff --git a/python/paddle/fluid/tests/unittests/test_fused_attention_pass.py b/python/paddle/fluid/tests/unittests/test_fused_attention_pass.py index cce05d8747cdf..12366a574db21 100644 --- a/python/paddle/fluid/tests/unittests/test_fused_attention_pass.py +++ b/python/paddle/fluid/tests/unittests/test_fused_attention_pass.py @@ -31,7 +31,6 @@ def __init__( num_heads, add_residual=True, pre_ln=True, - post_ln=False, attn_dropout=True, ): super(MultiHeadAttention, self).__init__() @@ -42,7 +41,6 @@ def __init__( self.add_residual = add_residual self.pre_ln = pre_ln - self.post_ln = post_ln self.attn_dropout = attn_dropout self.head_dim = embed_dim // num_heads @@ -90,7 +88,7 @@ def forward(self, x, attn_mask=None): if self.add_residual: out = residual + out - if self.post_ln: + if not self.pre_ln: # post layer norm out = self.norm2(out) @@ -104,7 +102,6 @@ class TestFusedAttentionPass(unittest.TestCase): def setUp(self): self.add_residual = True self.pre_ln = True - self.post_ln = True self.attn_dropout = True self.add_mask = True @@ -120,6 +117,7 @@ def test_pass(self): ).astype('float32') main_prog = paddle.static.Program() + main_prog.random_seed = 1234 startup_prog = paddle.static.Program() with paddle.static.program_guard(main_prog, startup_prog): @@ -142,7 +140,6 @@ def test_pass(self): num_heads, add_residual=self.add_residual, pre_ln=self.pre_ln, - post_ln=self.post_ln, attn_dropout=self.attn_dropout, ) @@ -157,13 +154,14 @@ def test_pass(self): pass_manager.apply([main_prog], [startup_prog]) ops = main_prog.global_block().ops - assert ops[2].type == 'reduce_mean' - assert ops[4].type == 'reduce_mean_grad' + assert ops[2].type == 'fused_attention' + assert ops[3].type == 'reduce_mean' + assert ops[5].type == 'reduce_mean_grad' 
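        # After the pass runs, the new fused_attention op replaces the matched
        # attention subgraph at index 2, so the indices of the remaining ops
        # (reduce_mean, reduce_mean_grad, sgd) shift by one compared with the
        # unfused program.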
# two ops for linear, one op for reduce mean # one fill constant # one op for reduce mean grad, two ops for linear bwd # the eighth op should be the optimizer - assert ops[7].type == 'sgd' + assert ops[8].type == 'sgd' if __name__ == "__main__": From 6edc7bba6ab192595ec860c3b5034e6bed92110a Mon Sep 17 00:00:00 2001 From: zqw_1997 <118182234+zhengqiwen1997@users.noreply.github.com> Date: Wed, 1 Feb 2023 21:38:27 +0800 Subject: [PATCH 85/89] remove fluid.initializer.UniformInitializer, ConstantInitializer, NormalInitializer, TruncatedNormalInitializer, XavierInitializer, BilinearInitializer, MSRAInitializer, NumpyArrayInitializer and calculate_gain.. (#49498) * move UniformInitializer and ConstantInitializer * more modify * circular import resolved * another circular import resolved? * more circular import 2 * circular import 3 * change import paddle in metric.py * BuildStrategy import from fluid * modify the framework import path in common.py * change rnn.py import, from static to original framework * change import static in the nn folder * default_main_program should import from common_ops_import * add import paddle in param_attr.py * use core not paddle module for using VarDesc * another old uniform * mistake that use Uniform instead of UniformInitializer * modify UniformInitializer doc * move fluid.NormalInitializer to nn.initializer.NormalInitializer * remove import of Normal in fluid.layers.nn.py * remove more import of old Normal * remove more import of old Normal * sample code modify and tests modify import * is_listen_failed passing arg should be log file * problem solved * a mistake solved * comments resoleved and remove paddle.fluid.initializer.TruncatedNormalInitializer * remove paddle.fluid.initializer.XavierInitializer and paddle.fluid.initializer.MSRAInitializer * remove paddle.fluid.initializer.BilinearInitializer NumpyArrayInitializer and set_global_initializer * change fluid to static * change static to fluid to avoid circular import in distributed_strategy.py * fix example code and test_initializer * ValueType * sample code fix * change set_global_initializer back to fluid * put paddle.static.BuildStrategy.ReduceStrategy into the fuction to avoid circular import * remove calculate_gain, delete BilinearInitializer and revert set_global_initializer * change the time of using UniformInitializer, ConstantInitializer, NormalInitializer, TruncatedNormalInitializer, XavierInitializer, MSRAInitializer, NumpyArrayInitializer as few as possible * fix argument incampatible * fix more arg incompatible * fix test_prelu_op_xpu.py Constant * fix inaccurate doc * more doc fix: default value --- python/paddle/common_ops_import.py | 1 - .../fleet/base/distributed_strategy.py | 3 +- .../distributed/fleet/layers/mpu/random.py | 2 +- .../fleet/meta_optimizers/dgc_optimizer.py | 4 +- .../distributed/fleet/metrics/metric.py | 2 +- python/paddle/fluid/compiler.py | 6 +- .../paddle/fluid/contrib/layers/metric_op.py | 6 +- python/paddle/fluid/contrib/layers/nn.py | 32 +- python/paddle/fluid/evaluator.py | 1 - .../incubate/fleet/tests/fleet_deep_ctr.py | 6 +- python/paddle/fluid/initializer.py | 1218 +---------------- python/paddle/fluid/install_check.py | 5 +- python/paddle/fluid/layer_helper.py | 7 +- python/paddle/fluid/layers/io.py | 2 +- python/paddle/fluid/layers/nn.py | 8 +- python/paddle/fluid/metrics.py | 1 - python/paddle/fluid/optimizer.py | 20 +- python/paddle/fluid/param_attr.py | 13 +- .../unittests/auto_parallel_autoconvert.py | 9 +- .../unittests/auto_parallel_save_load.py | 5 +- 
.../collective/column_parallel_linear_api.py | 8 +- .../fleet/parallel_dygraph_transformer.py | 10 +- .../collective/fleet/pipeline_mnist.py | 8 +- .../fleet/pipeline_mnist_multi_device.py | 8 +- .../fleet/pipeline_mnist_one_device.py | 6 +- .../fleet/static_model_parallel_by_col.py | 6 +- .../fleet/static_model_parallel_by_row.py | 8 +- .../fleet/static_model_parallel_embedding.py | 6 +- .../multinode/dygraph_hybrid_dpppmp.py | 2 +- .../multinode/dygraph_hybrid_fp16.py | 2 +- .../multinode/dygraph_hybrid_recompute.py | 2 +- .../collective/parallel_embedding_api.py | 4 +- .../collective/row_parallel_linear_api.py | 4 +- .../tests/unittests/dist_allreduce_op.py | 6 +- .../paddle/fluid/tests/unittests/dist_ctr.py | 6 +- .../fluid/tests/unittests/dist_fleet_ctr.py | 6 +- .../dist_fleet_heter_pipeline_ctr.py | 6 +- .../dist_fleet_raw_program_optimizer.py | 6 +- ...et_raw_program_optimizer_fuse_allreduce.py | 6 +- .../tests/unittests/dist_fleet_simnet_bow.py | 16 +- .../dist_fleet_sparse_embedding_ctr.py | 10 +- .../fluid/tests/unittests/dist_mnist.py | 6 +- .../fluid/tests/unittests/dist_se_resnext.py | 8 +- .../unittests/dist_text_classification.py | 8 +- .../fluid/tests/unittests/dist_transformer.py | 16 +- .../fluid/tests/unittests/dist_word2vec.py | 12 +- .../test_auto_parallel_gradient_merge_pass.py | 9 +- .../dygraph_to_static/bert_dygraph_model.py | 10 +- .../unittests/dygraph_to_static/darknet.py | 6 +- .../seq2seq_dygraph_model.py | 6 +- .../dygraph_to_static/simnet_dygraph_model.py | 5 +- .../simnet_dygraph_model_v2.py | 2 +- .../test_basic_api_transformation.py | 14 +- .../unittests/dygraph_to_static/test_bmn.py | 4 +- .../dygraph_to_static/test_convert_call.py | 4 +- .../dygraph_to_static/test_cycle_gan.py | 20 +- .../unittests/dygraph_to_static/test_lac.py | 12 +- .../dygraph_to_static/test_mobile_net.py | 7 +- .../dygraph_to_static/test_ptb_lm.py | 14 +- .../dygraph_to_static/test_resnet.py | 2 +- .../dygraph_to_static/test_se_resnet.py | 6 +- .../dygraph_to_static/test_word2vec.py | 4 +- .../transformer_dygraph_model.py | 10 +- .../unittests/dygraph_to_static/yolov3.py | 4 +- .../unittests/fleet_heter_ps_training.py | 6 +- .../unittests/ir/inference/program_config.py | 3 +- .../test_mkldnn_conv_bias_fuse_pass.py | 12 +- .../ir/inference/test_trt_subgraph_pass.py | 4 +- .../mlu/test_batch_norm_op_mlu_v2.py | 6 +- .../tests/unittests/npu/test_adam_op_npu.py | 4 +- .../unittests/npu/test_run_program_op_npu.py | 4 +- .../parallel_dygraph_sparse_embedding.py | 6 +- .../fluid/tests/unittests/simple_nets.py | 4 +- .../static_model_parallel_fused_attention.py | 6 +- ...static_model_parallel_fused_feedforward.py | 6 +- ..._model_parallel_fused_multi_transformer.py | 6 +- .../fluid/tests/unittests/test_adam_op.py | 6 +- .../unittests/test_auto_parallel_mapper.py | 17 +- .../test_avoid_twice_initialization.py | 5 +- .../fluid/tests/unittests/test_base_layer.py | 2 +- .../tests/unittests/test_batch_norm_op_v2.py | 6 +- .../tests/unittests/test_calc_gradient.py | 4 +- .../tests/unittests/test_communicator_geo.py | 2 +- .../tests/unittests/test_conv2d_layer.py | 5 +- .../unittests/test_conv2d_transpose_layer.py | 5 +- .../tests/unittests/test_conv3d_layer.py | 5 +- .../unittests/test_conv3d_transpose_layer.py | 5 +- .../tests/unittests/test_cuda_random_seed.py | 8 +- .../unittests/test_decoupled_py_reader.py | 2 +- .../fluid/tests/unittests/test_desc_clone.py | 4 +- .../fluid/tests/unittests/test_detach.py | 4 +- ..._dist_fleet_a_sync_optimizer_auto_async.py | 2 +- 
.../test_dist_fleet_heter_program.py | 18 +- .../unittests/test_dist_fleet_minimize.py | 12 +- .../tests/unittests/test_dist_fleet_ps.py | 12 +- .../tests/unittests/test_dist_fleet_ps11.py | 12 +- .../tests/unittests/test_dist_fleet_ps12.py | 12 +- .../tests/unittests/test_dist_fleet_ps13.py | 12 +- .../tests/unittests/test_dist_fleet_ps2.py | 12 +- .../tests/unittests/test_dist_fleet_ps3.py | 12 +- .../tests/unittests/test_dist_fleet_ps4.py | 12 +- .../tests/unittests/test_dist_fleet_ps5.py | 12 +- .../tests/unittests/test_dist_fleet_ps6.py | 12 +- .../test_dist_fleet_sparse_embedding_ctr.py | 6 +- .../tests/unittests/test_dist_fleet_spmt.py | 12 +- .../unittests/test_dist_sparse_load_ps0.py | 8 +- .../fluid/tests/unittests/test_dist_train.py | 12 +- .../tests/unittests/test_dist_transpiler.py | 20 +- .../test_eager_deletion_delete_vars.py | 2 +- .../test_eager_deletion_padding_rnn.py | 14 +- .../test_eager_deletion_recurrent_op.py | 4 +- .../tests/unittests/test_egr_python_api.py | 6 +- .../tests/unittests/test_functional_conv2d.py | 9 +- .../test_functional_conv2d_transpose.py | 9 +- .../tests/unittests/test_functional_conv3d.py | 9 +- .../test_functional_conv3d_transpose.py | 9 +- .../tests/unittests/test_fuse_bn_act_pass.py | 4 +- .../unittests/test_fuse_bn_add_act_pass.py | 14 +- .../test_fused_multi_transformer_op.py | 10 +- .../unittests/test_generator_dataloader.py | 2 +- .../fluid/tests/unittests/test_hsigmoid_op.py | 15 +- .../tests/unittests/test_imperative_deepcf.py | 2 +- ..._imperative_lod_tensor_to_selected_rows.py | 4 +- .../test_imperative_ocr_attention_model.py | 9 +- .../unittests/test_imperative_ptb_rnn.py | 14 +- .../tests/unittests/test_imperative_resnet.py | 2 +- .../unittests/test_imperative_save_load_v2.py | 14 +- ..._imperative_selected_rows_to_lod_tensor.py | 6 +- ..._imperative_transformer_sorted_gradient.py | 10 +- .../fluid/tests/unittests/test_initializer.py | 73 +- .../tests/unittests/test_ir_inplace_pass.py | 2 +- .../fluid/tests/unittests/test_layers.py | 66 +- .../fluid/tests/unittests/test_linear.py | 4 +- .../unittests/test_lookup_table_bf16_op.py | 2 +- .../unittests/test_lookup_table_v2_bf16_op.py | 2 +- .../unittests/test_lookup_table_v2_op.py | 4 +- ...cess_dataloader_iterable_dataset_static.py | 4 +- .../test_multiprocess_dataloader_static.py | 4 +- .../paddle/fluid/tests/unittests/test_nce.py | 5 +- .../test_nn_functional_embedding_static.py | 4 +- .../tests/unittests/test_optimizer_grad.py | 6 +- .../test_optimizer_in_control_flow.py | 8 +- .../unittests/test_parallel_executor_mnist.py | 4 +- .../fluid/tests/unittests/test_parameter.py | 3 +- .../fluid/tests/unittests/test_prelu_op.py | 4 +- .../unittests/test_program_prune_backward.py | 2 +- .../fluid/tests/unittests/test_prune.py | 6 +- .../fluid/tests/unittests/test_py_func_op.py | 2 +- .../fluid/tests/unittests/test_random_seed.py | 8 +- .../tests/unittests/test_recurrent_op.py | 8 +- .../fluid/tests/unittests/test_row_conv_op.py | 2 +- .../tests/unittests/test_run_program_op.py | 6 +- .../tests/unittests/test_set_bool_attr.py | 4 +- .../fluid/tests/unittests/test_sgd_op_bf16.py | 2 +- .../tests/unittests/test_static_save_load.py | 14 +- .../tests/unittests/test_tdm_child_op.py | 4 +- .../tests/unittests/test_tdm_sampler_op.py | 6 +- .../tests/unittests/test_uniform_random_op.py | 2 +- .../unittests/test_weight_normalization.py | 3 +- .../tests/unittests/transformer_model.py | 25 +- .../unittests/xpu/test_batch_norm_op_xpu.py | 2 +- .../test_fused_resnet_basic_block_op_xpu.py | 36 +- 
.../tests/unittests/xpu/test_prelu_op_xpu.py | 2 +- .../fluid/transpiler/distribute_transpiler.py | 4 +- python/paddle/incubate/asp/asp.py | 5 +- python/paddle/nn/decode.py | 2 +- python/paddle/nn/functional/common.py | 2 +- python/paddle/nn/functional/conv.py | 2 +- python/paddle/nn/functional/extension.py | 2 +- python/paddle/nn/functional/input.py | 2 +- python/paddle/nn/functional/loss.py | 2 +- python/paddle/nn/functional/vision.py | 2 +- python/paddle/nn/initializer/Bilinear.py | 182 +++ python/paddle/nn/initializer/__init__.py | 13 +- python/paddle/nn/initializer/assign.py | 120 +- python/paddle/nn/initializer/constant.py | 65 +- python/paddle/nn/initializer/dirac.py | 2 +- python/paddle/nn/initializer/initializer.py | 159 +++ python/paddle/nn/initializer/kaiming.py | 188 ++- python/paddle/nn/initializer/normal.py | 193 ++- python/paddle/nn/initializer/orthogonal.py | 2 +- python/paddle/nn/initializer/uniform.py | 133 +- python/paddle/nn/initializer/xavier.py | 182 ++- python/paddle/nn/layer/rnn.py | 9 +- python/paddle/optimizer/optimizer.py | 9 +- python/paddle/static/nn/common.py | 14 +- python/paddle/static/nn/loss.py | 4 +- python/paddle/static/nn/metric.py | 5 +- python/paddle/tensor/array.py | 2 +- python/paddle/tensor/attribute.py | 2 +- python/paddle/tensor/creation.py | 8 +- .../paddle/tensor/layer_function_generator.py | 2 +- python/paddle/tensor/linalg.py | 2 +- python/paddle/tensor/logic.py | 2 +- python/paddle/tensor/manipulation.py | 3 +- python/paddle/tensor/math.py | 2 +- python/paddle/tensor/random.py | 2 +- python/paddle/tensor/stat.py | 2 +- python/paddle/vision/ops.py | 4 +- 199 files changed, 1927 insertions(+), 1925 deletions(-) create mode 100644 python/paddle/nn/initializer/Bilinear.py create mode 100644 python/paddle/nn/initializer/initializer.py mode change 100755 => 100644 python/paddle/tensor/logic.py diff --git a/python/paddle/common_ops_import.py b/python/paddle/common_ops_import.py index 91a3f49cdbba2..1ec54064eb64e 100644 --- a/python/paddle/common_ops_import.py +++ b/python/paddle/common_ops_import.py @@ -32,7 +32,6 @@ dygraph_only, in_dygraph_mode, ) -from paddle.fluid.initializer import Constant # noqa: F401 from paddle.fluid.layer_helper import LayerHelper # noqa: F401 from paddle.fluid.layers import fill_constant, utils # noqa: F401 from paddle.fluid.layers.layer_function_generator import ( # noqa: F401 diff --git a/python/paddle/distributed/fleet/base/distributed_strategy.py b/python/paddle/distributed/fleet/base/distributed_strategy.py index efa94862b5246..fbe391b45f055 100755 --- a/python/paddle/distributed/fleet/base/distributed_strategy.py +++ b/python/paddle/distributed/fleet/base/distributed_strategy.py @@ -104,7 +104,6 @@ def _set_distributed_strategy(self, dist_strategy): self.job_info.strategy = dist_strategy -ReduceStrategyFluid = paddle.static.BuildStrategy.ReduceStrategy ReduceStrategyFleet = int @@ -261,7 +260,7 @@ def build_strategy(self): for f in fields: value = getattr(self.strategy.build_strategy, f.name) if f.name == 'reduce_strategy': - value = ReduceStrategyFluid(value) + value = paddle.static.BuildStrategy.ReduceStrategy(value) setattr(build_strategy, f.name, value) return build_strategy diff --git a/python/paddle/distributed/fleet/layers/mpu/random.py b/python/paddle/distributed/fleet/layers/mpu/random.py index 7b89330d951c8..718c85e855734 100644 --- a/python/paddle/distributed/fleet/layers/mpu/random.py +++ b/python/paddle/distributed/fleet/layers/mpu/random.py @@ -18,11 +18,11 @@ import paddle from paddle import 
_legacy_C_ops +from paddle.common_ops_import import Variable from paddle.fluid import core from paddle.fluid.data_feeder import check_variable_and_dtype from paddle.fluid.framework import in_dygraph_mode from paddle.framework import LayerHelper -from paddle.static import Variable __all__ = [] diff --git a/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py index 9dce0d540a16f..98d131822fe36 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py @@ -171,7 +171,7 @@ def _add_auto_increment_var(self, counter_name, begin, step=1): if is_new_var: helper.set_variable_initializer( counter, - initializer=paddle.fluid.initializer.Constant( + initializer=paddle.nn.initializer.ConstantInitializer( value=float(begin - 1), force_cpu=True ), ) @@ -194,7 +194,7 @@ def _add_nranks_var(self, name, value=-1): if is_new_var: helper.set_variable_initializer( counter, - initializer=paddle.fluid.initializer.Constant( + initializer=paddle.nn.initializer.ConstantInitializer( value=float(value), force_cpu=True ), ) diff --git a/python/paddle/distributed/fleet/metrics/metric.py b/python/paddle/distributed/fleet/metrics/metric.py index 999ab6f0af126..d2f72b0c7d047 100644 --- a/python/paddle/distributed/fleet/metrics/metric.py +++ b/python/paddle/distributed/fleet/metrics/metric.py @@ -18,7 +18,7 @@ import numpy as np import paddle -from paddle.static import Variable +from paddle.common_ops_import import Variable __all__ = [] diff --git a/python/paddle/fluid/compiler.py b/python/paddle/fluid/compiler.py index e8393c63b1053..609bfa3d93e53 100644 --- a/python/paddle/fluid/compiler.py +++ b/python/paddle/fluid/compiler.py @@ -586,7 +586,6 @@ def convert_concrete_program( """ from ..fluid.dygraph.base import switch_to_static_graph from ..fluid import backward - from ..fluid.initializer import Constant from ..fluid.framework import device_guard import paddle @@ -645,7 +644,10 @@ def append_backward_desc(): device = optimizer._get_device_for_param(param_name) with device_guard(device): optimizer.helper.set_variable_initializer( - var, initializer=Constant(value=0.0) + var, + initializer=paddle.nn.initializer.Constant( + value=0.0 + ), ) param_or_lr_tensor = scope.find_var( var_tmp.name diff --git a/python/paddle/fluid/contrib/layers/metric_op.py b/python/paddle/fluid/contrib/layers/metric_op.py index 414fcf5b6cd51..07d6b464ddb11 100755 --- a/python/paddle/fluid/contrib/layers/metric_op.py +++ b/python/paddle/fluid/contrib/layers/metric_op.py @@ -17,7 +17,6 @@ import warnings from paddle.fluid.layer_helper import LayerHelper -from paddle.fluid.initializer import Normal, Constant from paddle.fluid.framework import Variable from paddle.fluid.param_attr import ParamAttr from paddle.fluid.layers import tensor @@ -147,7 +146,10 @@ def ctr_metric_bundle(input, label, ins_tag_weight=None): local_ins_num, ]: helper.set_variable_initializer( - var, Constant(value=0.0, force_cpu=True) + var, + paddle.nn.initializer.ConstantInitializer( + value=0.0, force_cpu=True + ), ) helper.append_op( diff --git a/python/paddle/fluid/contrib/layers/nn.py b/python/paddle/fluid/contrib/layers/nn.py index d2aff8bfcf659..9064e4f9f09dd 100644 --- a/python/paddle/fluid/contrib/layers/nn.py +++ b/python/paddle/fluid/contrib/layers/nn.py @@ -24,7 +24,6 @@ from paddle.fluid.layer_helper import LayerHelper from paddle.fluid.layers import utils from ... 
import unique_name -from paddle.fluid.initializer import Normal, Constant, NumpyArrayInitializer from paddle.fluid.data_feeder import ( check_variable_and_dtype, check_type, @@ -896,8 +895,10 @@ def tdm_child(x, node_nums, child_nums, param_attr=None, dtype='int32'): Examples: .. code-block:: python + import paddle import paddle.fluid as fluid import numpy as np + paddle.enable_static() x = fluid.data(name="x", shape=[None, 1], dtype="int32", lod_level=1) tree_info = [[0,0,0,1,2], [0,1,0,3,4],[0,1,0,5,6], @@ -908,7 +909,7 @@ def tdm_child(x, node_nums, child_nums, param_attr=None, dtype='int32'): child_nums = 2 child, leaf_mask = fluid.contrib.layers.tdm_child(x, node_nums, child_nums, param_attr=fluid.ParamAttr( - initializer=fluid.initializer.NumpyArrayInitializer( + initializer=paddle.nn.initializer.Assign( tree_info_np))) place = fluid.CPUPlace() exe = fluid.Executor(place) @@ -925,7 +926,7 @@ def tdm_child(x, node_nums, child_nums, param_attr=None, dtype='int32'): attr=helper.param_attr, shape=[node_nums, 3 + child_nums], dtype=dtype, - default_initializer=Constant(0), + default_initializer=paddle.nn.initializer.Constant(0), ) tree_info.stop_gradient = True @@ -1003,8 +1004,10 @@ def tdm_sampler( Examples: .. code-block:: python + import paddle import paddle.fluid as fluid import numpy as np + paddle.enable_static() x = fluid.data(name="x", shape=[None, 1], dtype="int32", lod_level=1) travel_list = [[1, 3], [1, 4], [2, 5], [2, 6]] # leaf node's travel path, shape(leaf_node_num, layer_num) layer_list_flat = [[1], [2], [3], [4], [5], [6]] # shape(node_nums, 1) @@ -1022,10 +1025,10 @@ def tdm_sampler( layer_node_num_list, leaf_node_num, tree_travel_attr=fluid.ParamAttr( - initializer=fluid.initializer.NumpyArrayInitializer( + initializer=paddle.nn.initializer.Assign( travel_array)), tree_layer_attr=fluid.ParamAttr( - initializer=fluid.initializer.NumpyArrayInitializer( + initializer=paddle.nn.initializer.Assign( layer_array)), output_positive=True, output_list=True, @@ -1089,7 +1092,7 @@ def tdm_sampler( attr=tree_travel_attr, shape=travel_shape, dtype=tree_dtype, - default_initializer=Constant(0), + default_initializer=paddle.nn.initializer.Constant(0), ) layer_shape = [node_nums, 1] @@ -1097,7 +1100,7 @@ def tdm_sampler( attr=tree_layer_attr, shape=layer_shape, dtype=tree_dtype, - default_initializer=Constant(0), + default_initializer=paddle.nn.initializer.Constant(0), ) out = helper.create_variable_for_type_inference(dtype=dtype) @@ -1640,7 +1643,7 @@ def build_program(main_program, startup_program): attr=helper.param_attr, shape=param_shape, dtype=bn_param_dtype, - default_initializer=Constant(1.0), + default_initializer=paddle.nn.initializer.Constant(1.0), ) bias = helper.create_parameter( attr=helper.bias_attr, @@ -1650,7 +1653,9 @@ def build_program(main_program, startup_program): ) mean = helper.create_parameter( attr=ParamAttr( - name=moving_mean_name, initializer=Constant(0.0), trainable=False + name=moving_mean_name, + initializer=paddle.nn.initializer.Constant(0.0), + trainable=False, ), shape=param_shape, dtype=bn_param_dtype, @@ -1659,7 +1664,7 @@ def build_program(main_program, startup_program): variance = helper.create_parameter( attr=ParamAttr( name=moving_variance_name, - initializer=Constant(1.0), + initializer=paddle.nn.initializer.Constant(1.0), trainable=False, ), shape=param_shape, @@ -1723,13 +1728,16 @@ def pow2_decay_with_linear_warmup( helper = LayerHelper("pow2_decay_with_linear_warmup", **locals()) lr = helper.create_global_variable(persistable=True, 
dtype=dtype, shape=[1]) helper.set_variable_initializer( - lr, Constant(value=float(base_lr) / warmup_steps) + lr, + paddle.nn.initializer.Constant(value=float(base_lr) / warmup_steps), ) step = helper.create_global_variable( persistable=True, dtype='int64', shape=[1] ) - helper.set_variable_initializer(step, Constant(value=0)) + helper.set_variable_initializer( + step, paddle.nn.initializer.Constant(value=0) + ) assert ( warmup_steps <= total_steps ), "warmup_steps cannot be larger than total_steps" diff --git a/python/paddle/fluid/evaluator.py b/python/paddle/fluid/evaluator.py index 472bcbd3cac4b..a4d80ecbfed25 100644 --- a/python/paddle/fluid/evaluator.py +++ b/python/paddle/fluid/evaluator.py @@ -20,7 +20,6 @@ from .framework import Program, Variable, program_guard from . import unique_name from .layer_helper import LayerHelper -from .initializer import Constant def _clone_var_(block, var): diff --git a/python/paddle/fluid/incubate/fleet/tests/fleet_deep_ctr.py b/python/paddle/fluid/incubate/fleet/tests/fleet_deep_ctr.py index 23f5a44fe139e..9fc9182017ec4 100644 --- a/python/paddle/fluid/incubate/fleet/tests/fleet_deep_ctr.py +++ b/python/paddle/fluid/incubate/fleet/tests/fleet_deep_ctr.py @@ -109,7 +109,7 @@ def model(): size=[dnn_input_dim, dnn_layer_dims[0]], param_attr=fluid.ParamAttr( name="deep_embedding", - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), ), is_sparse=True, ) @@ -121,7 +121,7 @@ def model(): size=dim, activation="relu", weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01) + initializer=paddle.nn.initializer.Constant(value=0.01) ), name='dnn-fc-%d' % i, ) @@ -134,7 +134,7 @@ def model(): size=[lr_input_dim, 1], param_attr=fluid.ParamAttr( name="wide_embedding", - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), ), is_sparse=True, ) diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py index 38650856b0720..6eb88d8f8ef3d 100644 --- a/python/paddle/fluid/initializer.py +++ b/python/paddle/fluid/initializer.py @@ -30,1139 +30,24 @@ from paddle import _C_ops, _legacy_C_ops import paddle -__all__ = [ - 'Constant', - 'Uniform', - 'Normal', - 'TruncatedNormal', - 'Xavier', - 'Bilinear', - 'MSRA', - 'ConstantInitializer', - 'UniformInitializer', - 'NormalInitializer', - 'TruncatedNormalInitializer', - 'XavierInitializer', - 'BilinearInitializer', - 'MSRAInitializer', - 'NumpyArrayInitializer', - 'set_global_initializer', -] +__all__ = ['set_global_initializer'] _global_weight_initializer_ = None _global_bias_initializer_ = None -class Initializer: - """Base class for variable initializers - - Defines the common interface of variable initializers. - They add operations to the init program that are used - to initialize variables. Users should not use this class - directly, but need to use one of its implementations. 
- """ - - def __init__(self): - pass - - def __call__(self, param, block=None): - if not lazy_init_helper().state: - return self.forward(param, block) - - return self._lazy_init(param, block) - - def forward(self, param, block=None): - """Add corresponding initialization operations to the network""" - raise NotImplementedError() - - def _lazy_init(self, param, block=None): - """ - Apply lazy initialization - """ - assert in_dygraph_mode() - - def init_op_creator(forward, param, block): - new_var = param._to_static_var(True, block=block) - # Record initializer operator - with lazy_init_helper(): - forward(new_var, block) - - # Add hook function for initializing param in dygraph mode - param.set_init_func(functools.partial(self.forward, param, block)) - param._init_op_creator = functools.partial( - init_op_creator, self.forward, param - ) - - return param - - def _check_block(self, block): - if block is None: - block = default_main_program().global_block() - - return block - - def _compute_fans(self, var): - """Compute the fan_in and the fan_out for layers - - This method computes the fan_in and the fan_out - for neural network layers, if not specified. It is - not possible to perfectly estimate fan_in and fan_out. - This method will estimate it correctly for matrix multiply and - convolutions. - - Args: - var: variable for which fan_in and fan_out have to be computed - - Returns: - tuple of two integers (fan_in, fan_out) - """ - shape = var.shape - if not shape or len(shape) == 0: - fan_in = fan_out = 1 - elif len(shape) == 1: - fan_in = fan_out = shape[0] - elif len(shape) == 2: - # This is the case for simple matrix multiply - fan_in = shape[0] - fan_out = shape[1] - else: - # Assume this to be a convolutional kernel - # In PaddlePaddle, the shape of the kernel is like: - # [num_filters, num_filter_channels, ...] where the remaining - # dimensions are the filter_size - receptive_field_size = np.prod(shape[2:]) - fan_in = shape[1] * receptive_field_size - fan_out = shape[0] * receptive_field_size - - return (fan_in, fan_out) - - -class ConstantInitializer(Initializer): - """Implements the constant initializer - - Args: - value (float32): constant value to initialize the variable - - Examples: - .. code-block:: python - - import paddle - import paddle.fluid as fluid - paddle.enable_static() - x = fluid.data(name="data", shape=[8, 32, 32], dtype="float32") - fc = paddle.static.nn.fc( - x, - size=10, - weight_attr=fluid.initializer.Constant(value=2.0)) - - """ - - def __init__(self, value=0.0, force_cpu=False): - assert value is not None - super().__init__() - self._value = value - self._force_cpu = force_cpu - - def forward(self, var, block=None): - """Initialize the input tensor with constant. - - Args: - var(Tensor): Tensor that needs to be initialized. - block(Block, optional): The block in which initialization ops - should be added. Used in static graph only, default None. 
- - Returns: - The initialization op - """ - block = self._check_block(block) - - assert isinstance(var, framework.Variable) or isinstance( - var, framework.EagerParamBase - ) - assert isinstance(block, framework.Block) - - if in_dygraph_mode(): - place = _current_expected_place() - if self._force_cpu: - place = core.CPUPlace() - _C_ops.full_( - var, var.shape, str(float(self._value)), var.dtype, place - ) - return None - else: - op = block.append_op( - type="fill_constant", - outputs={"Out": var}, - attrs={ - "shape": var.shape, - "dtype": int(var.dtype), - "value": float(self._value), - 'str_value': str(float(self._value)), - 'force_cpu': self._force_cpu, - }, - stop_gradient=True, - ) - - var.op = op - return op - - -class UniformInitializer(Initializer): - """Implements the random uniform distribution initializer - - Args: - low (float): lower boundary of the uniform distribution - high (float): upper boundary of the uniform distribution - seed (int): random seed - diag_num (int): the number of diagonal elements to initialize. - If set to 0, diagonal initialization will be not performed. - diag_step (int): Step size between two diagonal elements, - which is generally the width of the square matrix. - diag_val (float): the value of the diagonal element to be initialized, - default 1.0. It takes effect only if the diag_num is greater than 0. - - Examples: - .. code-block:: python - - import paddle - import paddle.fluid as fluid - paddle.enable_static() - x = fluid.data(name='x', shape=[None, 1], dtype='float32') - fc = paddle.static.nn.fc(x, size=10, - weight_attr=fluid.initializer.Uniform(low=-0.5, high=0.5)) - """ - - def __init__( - self, low=-1.0, high=1.0, seed=0, diag_num=0, diag_step=0, diag_val=1.0 - ): - assert low is not None - assert high is not None - assert high >= low - assert seed is not None - assert diag_num is not None - assert diag_step is not None - assert diag_val is not None - if diag_num > 0 or diag_step > 0: - assert diag_num > 0 and diag_step > 0 - super().__init__() - self._low = low - self._high = high - self._seed = seed - self._diag_num = diag_num - self._diag_step = diag_step - self._diag_val = diag_val - - def forward(self, var, block=None): - """Initialize the input tensor with Uniform distribution. - - Args: - var(Tensor): Tensor that needs to be initialized. - block(Block, optional): The block in which initialization ops - should be added. Used in static graph only, default None. 
- - Returns: - The initialization op - """ - block = self._check_block(block) - - assert isinstance(block, framework.Block) - if not in_dygraph_mode(): - check_variable_and_dtype( - var, - "Out", - ["uint16", "float16", "float32", "float64"], - "uniform_random", - ) - - if self._seed == 0: - self._seed = block.program.random_seed - - # to be compatible of fp16 initializers - if var.dtype == VarDesc.VarType.FP16: - out_dtype = VarDesc.VarType.FP32 - out_var = block.create_var( - name=unique_name.generate( - ".".join(['uniform_random', var.name, 'tmp']) - ), - shape=var.shape, - dtype=out_dtype, - type=VarDesc.VarType.LOD_TENSOR, - persistable=False, - ) - else: - out_dtype = var.dtype - out_var = var - - if in_dygraph_mode(): - out_var = _C_ops.uniform( - var.shape, - out_dtype, - self._low, - self._high, - self._seed, - _current_expected_place(), - ) - if var.dtype == VarDesc.VarType.FP16: - var_tmp = _C_ops.cast(out_var, var.dtype) - var_tmp._share_underline_tensor_to(var) - else: - out_var._share_underline_tensor_to(var) - return None - else: - op = block.append_op( - type="uniform_random", - inputs={}, - outputs={"Out": out_var}, - attrs={ - "shape": var.shape, - "dtype": out_dtype, - "min": self._low, - "max": self._high, - "seed": self._seed, - "diag_num": self._diag_num, - "diag_step": self._diag_step, - "diag_val": self._diag_val, - }, - stop_gradient=True, - ) - - if var.dtype == VarDesc.VarType.FP16: - block.append_op( - type="cast", - inputs={"X": out_var}, - outputs={"Out": var}, - attrs={"in_dtype": out_var.dtype, "out_dtype": var.dtype}, - ) - - var.op = op - return op - - -class NormalInitializer(Initializer): - """Implements the Random Normal(Gaussian) distribution initializer - - Args: - loc (float): mean of the normal distribution - scale (float): standard deviation of the normal distribution - seed (int): random seed - - Examples: - .. code-block:: python - - import paddle - import paddle.fluid as fluid - paddle.enable_static() - x = fluid.data(name="data", shape=[None, 32, 32], dtype="float32") - fc = paddle.static.nn.fc(x, size=10, - weight_attr=fluid.initializer.Normal(loc=0.0, scale=2.0)) - - """ - - def __init__(self, loc=0.0, scale=1.0, seed=0): - assert loc is not None - assert scale is not None - assert seed is not None - super().__init__() - self._mean = loc - self._std_dev = scale - self._seed = seed - - def forward(self, var, block=None): - """Initialize the input tensor with Normal distribution. - - Args: - var(Tensor): Tensor that needs to be initialized. - block(Block, optional): The block in which initialization ops - should be added. Used in static graph only, default None. 
- - Returns: - The initialization op - """ - block = self._check_block(block) - - assert isinstance(block, framework.Block) - - if self._seed == 0: - self._seed = block.program.random_seed - - if in_dygraph_mode(): - place = _current_expected_place() - out_var = _C_ops.gaussian( - var.shape, - self._mean, - self._std_dev, - self._seed, - var.dtype, - place, - ) - out_var._share_underline_tensor_to(var) - return None - - else: - check_variable_and_dtype( - var, - "Out", - ["uint16", "float16", "float32", "float64"], - "guassian_random", - ) - op = block.append_op( - type="gaussian_random", - outputs={"Out": var}, - attrs={ - "shape": var.shape, - "dtype": var.dtype, - "mean": self._mean, - "std": self._std_dev, - "seed": self._seed, - "use_mkldnn": False, - }, - stop_gradient=True, - ) - var.op = op - return op - - -class TruncatedNormalInitializer(Initializer): - """Implements the Random TruncatedNormal(Gaussian) distribution initializer - - Args: - loc (float): mean of the normal distribution - scale (float): standard deviation of the normal distribution - seed (int): random seed - - Examples: - .. code-block:: python - - import paddle - import paddle.fluid as fluid - paddle.enable_static() - x = fluid.data(name='x', shape=[None, 1], dtype='float32') - fc = paddle.static.nn.fc(x, size=10, - weight_attr=fluid.initializer.TruncatedNormal(loc=0.0, scale=2.0)) - """ - - def __init__(self, loc=0.0, scale=1.0, seed=0): - assert loc is not None - assert scale is not None - assert seed is not None - super().__init__() - self._mean = loc - self._std_dev = scale - self._seed = seed - - def forward(self, var, block=None): - """Initialize the input tensor with TruncatedNormal distribution. - - Args: - var(Tensor): Tensor that needs to be initialized. - block(Block, optional): The block in which initialization ops - should be added. Used in static graph only, default None. 
- - Returns: - The initialization op - """ - block = self._check_block(block) - - assert isinstance(var, framework.Variable) - assert isinstance(block, framework.Block) - - if self._seed == 0: - self._seed = block.program.random_seed - - # to be compatible of fp16 initalizers - if var.dtype in [VarDesc.VarType.FP16, VarDesc.VarType.BF16]: - out_dtype = VarDesc.VarType.FP32 - out_var = block.create_var( - name=unique_name.generate( - ".".join(['truncated_gaussian_random', var.name, 'tmp']) - ), - shape=var.shape, - dtype=out_dtype, - type=VarDesc.VarType.LOD_TENSOR, - persistable=False, - ) - else: - out_dtype = var.dtype - out_var = var - - if in_dygraph_mode(): - out_var = _C_ops.truncated_gaussian_random( - var.shape, - self._mean, - self._std_dev, - self._seed, - out_dtype, - _current_expected_place(), - ) - if var.dtype in [VarDesc.VarType.FP16, VarDesc.VarType.BF16]: - var_tmp = _C_ops.cast(out_var, var.dtype) - var_tmp._share_underline_tensor_to(var) - else: - out_var._share_underline_tensor_to(var) - return None - - else: - op = block.append_op( - type="truncated_gaussian_random", - outputs={"Out": out_var}, - attrs={ - "shape": var.shape, - "dtype": out_dtype, - "mean": self._mean, - "std": self._std_dev, - "seed": self._seed, - }, - stop_gradient=True, - ) - - if var.dtype in [VarDesc.VarType.FP16, VarDesc.VarType.BF16]: - block.append_op( - type="cast", - inputs={"X": out_var}, - outputs={"Out": var}, - attrs={"in_dtype": out_var.dtype, "out_dtype": var.dtype}, - ) - var.op = op - return op - - -class XavierInitializer(Initializer): - r""" - This class implements the Xavier weight initializer from the paper - `Understanding the difficulty of training deep feedforward neural - networks `_ - by Xavier Glorot and Yoshua Bengio. - - This initializer is designed to keep the scale of the gradients - approximately same in all the layers. In case of Uniform distribution, - the range is [-x, x], where - - .. math:: - - x = \sqrt{\\frac{6.0}{fan\_in + fan\_out}} - - In case of Normal distribution, the mean is 0 and the standard deviation - is - - .. math:: - - \sqrt{\\frac{2.0}{fan\_in + fan\_out}} - - - Args: - uniform (bool,default True): whether to use uniform ,if False use normal distribution - fan_in (float,default None): fan_in for Xavier initialization. If None, it is - inferred from the variable. - fan_out (float,default None): fan_out for Xavier initialization. If None, it is - inferred from the variable. - seed (int): random seed - - Note: - It is recommended to set fan_in and fan_out to None for most cases. - - Examples: - .. code-block:: python - - import paddle - import paddle.fluid as fluid - paddle.enable_static() - queries = fluid.data(name='x', shape=[None,1], dtype='float32') - fc = paddle.static.nn.fc( - x=queries, size=10, - weight_attr=fluid.initializer.Xavier(uniform=False)) - - """ - - def __init__(self, uniform=True, fan_in=None, fan_out=None, seed=0): - assert uniform is not None - assert seed is not None - super().__init__() - self._uniform = uniform - self._fan_in = fan_in - self._fan_out = fan_out - self._seed = seed - - def forward(self, var, block=None): - """Initialize the input tensor with Xavier initialization. - - Args: - var(Tensor): Tensor that needs to be initialized. - block(Block, optional): The block in which initialization ops - should be added. Used in static graph only, default None. 
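The Xavier formulas quoted above are what the forward pass below computes before sampling; a standalone plain-Python sketch of the two bounds (the fan values are illustrative):

    import math

    def xavier_bounds(fan_in, fan_out):
        # Uniform branch samples from [-limit, limit]; normal branch uses std.
        limit = math.sqrt(6.0 / float(fan_in + fan_out))
        std = math.sqrt(2.0 / float(fan_in + fan_out))
        return limit, std

    print(xavier_bounds(32, 10))  # approx (0.378, 0.218)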
- - Returns: - The initialization op - """ - block = self._check_block(block) - - assert isinstance(block, framework.Block) - if not in_dygraph_mode(): - check_variable_and_dtype( - var, - "Out", - ["uint16", "float16", "float32", "float64"], - "xavier_init", - ) - - f_in, f_out = self._compute_fans(var) - - # If fan_in and fan_out are passed, use them - fan_in = f_in if self._fan_in is None else self._fan_in - fan_out = f_out if self._fan_out is None else self._fan_out - - if self._seed == 0: - self._seed = block.program.random_seed - - # to be compatible of fp16 initalizers - if var.dtype == VarDesc.VarType.FP16 or ( - var.dtype == VarDesc.VarType.BF16 and not self._uniform - ): - out_dtype = VarDesc.VarType.FP32 - out_var = block.create_var( - name=unique_name.generate( - ".".join(['xavier_init', var.name, 'tmp']) - ), - shape=var.shape, - dtype=out_dtype, - type=VarDesc.VarType.LOD_TENSOR, - persistable=False, - ) - else: - out_dtype = var.dtype - out_var = var - - if in_dygraph_mode(): - if self._uniform: - limit = math.sqrt(6.0 / float(fan_in + fan_out)) - out_var = _C_ops.uniform( - out_var.shape, - out_dtype, - -limit, - limit, - self._seed, - _current_expected_place(), - ) - else: - std = math.sqrt(2.0 / float(fan_in + fan_out)) - - place = _current_expected_place() - out_var = _C_ops.gaussian( - out_var.shape, 0.0, std, self._seed, out_dtype, place - ) - - if var.dtype == VarDesc.VarType.FP16 or ( - var.dtype == VarDesc.VarType.BF16 and not self._uniform - ): - var_tmp = _C_ops.cast(out_var, var.dtype) - var_tmp._share_underline_tensor_to(var) - else: - out_var._share_underline_tensor_to(var) - return None - else: - if self._uniform: - limit = math.sqrt(6.0 / float(fan_in + fan_out)) - op = block.append_op( - type="uniform_random", - inputs={}, - outputs={"Out": out_var}, - attrs={ - "shape": out_var.shape, - "dtype": out_dtype, - "min": -limit, - "max": limit, - "seed": self._seed, - }, - stop_gradient=True, - ) - else: - std = math.sqrt(2.0 / float(fan_in + fan_out)) - op = block.append_op( - type="gaussian_random", - outputs={"Out": out_var}, - attrs={ - "shape": out_var.shape, - "dtype": out_var.dtype, - "mean": 0.0, - "std": std, - "seed": self._seed, - }, - stop_gradient=True, - ) - - if var.dtype == VarDesc.VarType.FP16 or ( - var.dtype == VarDesc.VarType.BF16 and not self._uniform - ): - block.append_op( - type="cast", - inputs={"X": out_var}, - outputs={"Out": var}, - attrs={"in_dtype": out_var.dtype, "out_dtype": var.dtype}, - ) - - var.op = op - return op - - -class MSRAInitializer(Initializer): - r"""Implements the MSRA initializer a.k.a. Kaiming Initializer - - This class implements the weight initialization from the paper - `Delving Deep into Rectifiers: Surpassing Human-Level Performance on - ImageNet Classification `_ - by Kaiming He, Xiangyu Zhang, Shaoqing Ren and Jian Sun. This is a - robust initialization method that particularly considers the rectifier - nonlinearities. In case of Uniform distribution, the range is [-x, x], where - - .. math:: - - x = gain \times \sqrt{\frac{3}{fan\_in}} - - In case of Normal distribution, the mean is 0 and the standard deviation - is - - .. math:: - - \frac{gain}{\sqrt{{fan\_in}}} - - Args: - uniform (bool, optional): whether to use uniform or normal distribution - fan_in (float32|None, optional): fan_in (in_features) of trainable Tensor, If None, it will be infered automaticly. If you don't want to use in_features of the Tensor, you can set the value of 'fan_in' smartly by yourself. default is None. 
- seed (int32, optional): random seed. - negative_slope (float, optional): negative_slope (only used with leaky_relu). default is 0.0. - nonlinearity(str, optional): the non-linear function. default is relu. - - Note: - It is recommended to set fan_in to None for most cases. - - Examples: - .. code-block:: python - - import paddle - import paddle.fluid as fluid - paddle.enable_static() - x = fluid.data(name="data", shape=[8, 32, 32], dtype="float32") - fc = paddle.static.nn.fc(x, size=10, - weight_attr=fluid.initializer.MSRA(uniform=False)) - +def _global_weight_initializer(): """ - - def __init__( - self, - uniform=True, - fan_in=None, - seed=0, - negative_slope=0, - nonlinearity='relu', - ): - """Constructor for MSRAInitializer""" - assert uniform is not None - assert seed is not None - super().__init__() - self._uniform = uniform - self._fan_in = fan_in - self._seed = seed - self._negative_slope = negative_slope - self._nonlinearity = nonlinearity - - def forward(self, var, block=None): - """Initialize the input tensor with MSRA initialization. - - Args: - var(Tensor): Tensor that needs to be initialized. - block(Block, optional): The block in which initialization ops - should be added. Used in static graph only, default None. - - Returns: - The initialization op - """ - block = self._check_block(block) - - assert isinstance(var, framework.Variable) - assert isinstance(block, framework.Block) - f_in, f_out = self._compute_fans(var) - - # If fan_in is passed, use it - fan_in = f_in if self._fan_in is None else self._fan_in - - if fan_in == 0: - if self._fan_in is None: - raise ValueError( - "The in_features of the Tensor contain zero, can not initialize the Tensor." - ) - else: - raise ValueError( - "fan_in should not be zero, can not initialize the Tensor." 
- ) - - if self._seed == 0: - self._seed = block.program.random_seed - - # to be compatible of fp16 initalizers - if var.dtype == VarDesc.VarType.FP16 or ( - var.dtype == VarDesc.VarType.BF16 and not self._uniform - ): - out_dtype = VarDesc.VarType.FP32 - out_var = block.create_var( - name=unique_name.generate( - ".".join(['masra_init', var.name, 'tmp']) - ), - shape=var.shape, - dtype=out_dtype, - type=VarDesc.VarType.LOD_TENSOR, - persistable=False, - ) - else: - out_dtype = var.dtype - out_var = var - - if in_dygraph_mode(): - if self._uniform: - gain = calculate_gain(self._nonlinearity, self._negative_slope) - limit = gain * math.sqrt(3.0 / float(fan_in)) - out_var = _C_ops.uniform( - var.shape, - out_dtype, - -limit, - limit, - self._seed, - _current_expected_place(), - ) - else: - gain = calculate_gain(self._nonlinearity, self._negative_slope) - std = gain / math.sqrt(float(fan_in)) - place = _current_expected_place() - out_var = _C_ops.gaussian( - out_var.shape, 0.0, std, self._seed, out_dtype, place - ) - - if var.dtype == VarDesc.VarType.FP16 or ( - var.dtype == VarDesc.VarType.BF16 and not self._uniform - ): - var_tmp = _C_ops.cast(out_var, var.dtype) - var_tmp._share_underline_tensor_to(var) - else: - out_var._share_underline_tensor_to(var) - return None - else: - if self._uniform: - gain = calculate_gain(self._nonlinearity, self._negative_slope) - limit = gain * math.sqrt(3.0 / float(fan_in)) - op = block.append_op( - type="uniform_random", - inputs={}, - outputs={"Out": out_var}, - attrs={ - "shape": out_var.shape, - "dtype": int(out_dtype), - "min": -limit, - "max": limit, - "seed": self._seed, - }, - stop_gradient=True, - ) - - else: - gain = calculate_gain(self._nonlinearity, self._negative_slope) - std = gain / math.sqrt(float(fan_in)) - op = block.append_op( - type="gaussian_random", - outputs={"Out": out_var}, - attrs={ - "shape": out_var.shape, - "dtype": int(out_dtype), - "mean": 0.0, - "std": std, - "seed": self._seed, - }, - stop_gradient=True, - ) - - if var.dtype == VarDesc.VarType.FP16 or ( - var.dtype == VarDesc.VarType.BF16 and not self._uniform - ): - block.append_op( - type="cast", - inputs={"X": out_var}, - outputs={"Out": var}, - attrs={"in_dtype": out_var.dtype, "out_dtype": var.dtype}, - ) - - var.op = op - return op - - -class BilinearInitializer(Initializer): + Return the global weight initializer, The user doesn't need to use it. """ - This initializer can be used in transposed convolution operator to - act as upsampling. Users can upsample a feature map with shape of - (B, C, H, W) by any integer factor. The usage is: - - Examples: - - .. code-block:: python - - import math - - import paddle - import paddle.nn as nn - from paddle.regularizer import L2Decay - - factor = 2 - C = 2 - B = 8 - H = W = 32 - w_attr = paddle.ParamAttr(learning_rate=0., - regularizer=L2Decay(0.), - initializer=nn.initializer.Bilinear()) - data = paddle.rand([B, 3, H, W], dtype='float32') - conv_up = nn.Conv2DTranspose(3, - out_channels=C, - kernel_size=2 * factor - factor % 2, - padding=int( - math.ceil((factor - 1) / 2.)), - stride=factor, - weight_attr=w_attr, - bias_attr=False) - x = conv_up(data) + return _global_weight_initializer_ - Where, `out_channels=C` and `groups=C` means this is channel-wise transposed - convolution. The filter shape will be (C, 1, K, K) where K is `kernel_size`, - This initializer will set a (K, K) interpolation kernel for every channel - of the filter identically. 
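Stepping back to the MSRA (Kaiming) branches removed just above: both reduce to closed-form bounds driven by calculate_gain. A standalone plain-Python sketch (the fan_in value is illustrative):

    import math

    def kaiming_bounds(fan_in, gain):
        # Uniform branch samples from [-limit, limit]; normal branch uses std.
        limit = gain * math.sqrt(3.0 / float(fan_in))
        std = gain / math.sqrt(float(fan_in))
        return limit, std

    relu_gain = math.sqrt(2.0)            # calculate_gain('relu')
    print(kaiming_bounds(64, relu_gain))  # approx (0.306, 0.177)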
The resulting shape of the output feature map - will be (B, C, factor * H, factor * W). Note that the learning rate and the - weight decay are set to 0 in order to keep coefficient values of bilinear - interpolation unchanged during training. +def _global_bias_initializer(): """ - - def __init__(self): - """Constructor for BilinearInitializer.""" - super().__init__() - - def forward(self, var, block=None): - """Initialize the input tensor with Bilinear initialization. - - Args: - var(Tensor): Tensor that needs to be initialized. - block(Block, optional): The block in which initialization ops - should be added. Used in static graph only, default None. - - Returns: - The initialization op - """ - block = self._check_block(block) - - if not isinstance(var, framework.Variable): - raise ValueError("var must be framework.Variable.") - - if not isinstance(block, framework.Block): - raise ValueError("block must be framework.Block.") - - shape = var.shape - if len(shape) != 4: - raise ValueError("the length of shape must be 4.") - if shape[2] != shape[3]: - raise ValueError("shape[2] must be equal to shape[3].") - - weight = np.zeros(np.prod(var.shape), dtype='float32') - size = shape[3] - # factor - f = np.ceil(size / 2.0) - # center - c = (2 * f - 1 - f % 2) / (2.0 * f) - for i in range(np.prod(shape)): - x = i % size - y = (i / size) % size - weight[i] = (1 - abs(x / f - c)) * (1 - abs(y / f - c)) - weight = np.reshape(weight, shape) - - # to be compatible of fp16 initalizers - if var.dtype in [ - VarDesc.VarType.FP16, - VarDesc.VarType.BF16, - VarDesc.VarType.FP64, - ]: - out_dtype = VarDesc.VarType.FP32 - out_var = block.create_var( - name=unique_name.generate( - ".".join(['bilinear_init', var.name, 'tmp']) - ), - shape=var.shape, - dtype=out_dtype, - type=VarDesc.VarType.LOD_TENSOR, - persistable=False, - ) - else: - out_dtype = var.dtype - out_var = var - - if out_dtype == VarDesc.VarType.FP32: - value_name = "fp32_values" - values = [float(v) for v in weight.flat] - else: - raise TypeError("Unsupported dtype %s", var.dtype) - - if np.prod(shape) > 1024 * 1024: - raise ValueError("The size of input is too big. ") - - if in_dygraph_mode(): - _C_ops.assign_value_( - out_var, - list(shape), - out_dtype, - values, - _current_expected_place(), - ) - if var.dtype in [ - VarDesc.VarType.FP16, - VarDesc.VarType.BF16, - VarDesc.VarType.FP64, - ]: - var_tmp = _C_ops.cast(out_var, var.dtype) - var_tmp._share_underline_tensor_to(var) - else: - out_var._share_underline_tensor_to(var) - return None - else: - op = block.append_op( - type='assign_value', - outputs={'Out': [out_var]}, - attrs={ - 'dtype': out_dtype, - 'shape': list(shape), - value_name: values, - }, - ) - - if var.dtype in [ - VarDesc.VarType.FP16, - VarDesc.VarType.BF16, - VarDesc.VarType.FP64, - ]: - block.append_op( - type="cast", - inputs={"X": out_var}, - outputs={"Out": var}, - attrs={"in_dtype": out_var.dtype, "out_dtype": var.dtype}, - ) - - var.op = op - return op - - -class NumpyArrayInitializer(Initializer): - """Init an parameter with an numpy array - This op initialize the variable by numpy array. - - Args: - value (numpy): numpy array to initialize the variable - - Returns: - A Tensor variable initialized by numpy. - - Examples: - .. 
code-block:: python - - import paddle - import paddle.fluid as fluid - import numpy - paddle.enable_static() - x = fluid.data(name="x", shape=[2, 1], dtype='float32') - fc = paddle.static.nn.fc(x, size=10, - weight_attr=fluid.initializer.NumpyArrayInitializer(numpy.array([1,2]))) + Return the global weight initializer, The user doesn't need to use it. """ - - def __init__(self, value): - import numpy - - assert isinstance(value, numpy.ndarray) - super().__init__() - self._value = value - - def forward(self, var, block=None): - """Initialize the input tensor with Numpy array. - - Args: - var(Tensor): Tensor that needs to be initialized. - block(Block, optional): The block in which initialization ops - should be added. Used in static graph only, default None. - - Returns: - The initialization op - """ - block = self._check_block(block) - - assert isinstance(var, framework.Variable) - assert isinstance(block, framework.Block) - - # to be compatible of fp16 initalizers - if var.dtype in [VarDesc.VarType.FP16, VarDesc.VarType.BF16]: - out_dtype = VarDesc.VarType.FP32 - np_value = self._value.astype("float32") - out_var = block.create_var( - name=unique_name.generate( - ".".join(['numpy_array_init', var.name, 'tmp']) - ), - shape=var.shape, - dtype=out_dtype, - type=VarDesc.VarType.LOD_TENSOR, - persistable=False, - ) - else: - out_var = var - out_dtype = var.dtype - np_value = self._value - - if out_dtype == VarDesc.VarType.FP32: - value_name = "fp32_values" - values = [float(v) for v in np_value.flat] - elif out_dtype == VarDesc.VarType.INT32: - value_name = "int32_values" - values = [int(v) for v in np_value.flat] - else: - raise ValueError("Unsupported dtype %s", self._value.dtype) - if self._value.size > 1024 * 1024 * 1024: - raise ValueError( - "The size of input is too big. Please consider " - "saving it to file and 'load_op' to load it" - ) - - if in_dygraph_mode(): - _C_ops.assign_value_( - out_var, - list(self._value.shape), - out_dtype, - values, - _current_expected_place(), - ) - if var.dtype in [VarDesc.VarType.FP16, VarDesc.VarType.BF16]: - var_tmp = _C_ops.cast(out_var, var.dtype) - var_tmp._share_underline_tensor_to(var) - else: - out_var._share_underline_tensor_to(var) - return None - else: - op = block.append_op( - type='assign_value', - outputs={'Out': out_var}, - attrs={ - 'dtype': out_dtype, - 'shape': list(self._value.shape), - value_name: values, - }, - stop_gradient=True, - ) - - if var.dtype in [VarDesc.VarType.FP16, VarDesc.VarType.BF16]: - block.append_op( - type="cast", - inputs={"X": out_var}, - outputs={"Out": var}, - attrs={"in_dtype": out_var.dtype, "out_dtype": var.dtype}, - ) - - var.op = op - return op + return _global_bias_initializer_ def set_global_initializer(weight_init, bias_init=None): @@ -1218,7 +103,7 @@ def set_global_initializer(weight_init, bias_init=None): check_type( weight_init, 'weight_init', - (Initializer, type(None)), + (paddle.nn.initializer.Initializer, type(None)), 'set_global_initializer', ) global _global_weight_initializer_ @@ -1227,93 +112,8 @@ def set_global_initializer(weight_init, bias_init=None): check_type( bias_init, 'bias_init', - (Initializer, type(None)), + (paddle.nn.initializer.Initializer, type(None)), 'set_global_initializer', ) global _global_bias_initializer_ _global_bias_initializer_ = bias_init - - -def _global_weight_initializer(): - """ - Return the global weight initializer, The user doesn't need to use it. 
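The numpy-array initializer removed here is replaced throughout the test updates below by paddle.nn.initializer.Assign, which takes the array directly. A short usage sketch (the array shape and layer sizes are illustrative):

    import numpy as np
    import paddle

    # Assign plays the role of the removed NumpyArrayInitializer.
    arr = np.random.normal(0, 0.02, size=(8, 4)).astype('float32')
    w_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Assign(arr))
    linear = paddle.nn.Linear(8, 4, weight_attr=w_attr)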
- """ - return _global_weight_initializer_ - - -def _global_bias_initializer(): - """ - Return the global weight initializer, The user doesn't need to use it. - """ - return _global_bias_initializer_ - - -def calculate_gain(nonlinearity, param=None): - """ - Get the recommended ``gain`` value of some nonlinearity function. ``gain`` value can be used in some - ``paddle.nn.initializer`` api to adjust the initialization value. - - Args: - nonlinearity(str): name of nonlinearity activation function. If it is a linear function, such as: - `linear/conv1d/conv2d/conv3d/conv1d_transpose/conv2d_transpose/conv3d_transpose` , 1.0 will be returned. - param(bool|int|float, optional): optional parameter for somme nonlinearity function. Now, it only applies to - 'leaky_relu'. Default: None, it will be calculated as 0.01 in the formula. - - Returns: - A float value, which is the recommended gain for this nonlinearity function. - - Examples: - .. code-block:: python - - import paddle - gain = paddle.nn.initializer.calculate_gain('tanh') # 5.0 / 3 - gain = paddle.nn.initializer.calculate_gain('leaky_relu', param=1.0) # 1.0 = math.sqrt(2.0 / (1+param^2)) - initializer = paddle.nn.initializer.Orthogonal(gain) - - """ - if param is None: - param = 0.01 - else: - assert isinstance(param, (bool, int, float)) - param = float(param) - recommended_gain = { - 'sigmoid': 1, - 'linear': 1, - 'conv1d': 1, - 'conv2d': 1, - 'conv3d': 1, - 'conv1d_transpose': 1, - 'conv2d_transpose': 1, - 'conv3d_transpose': 1, - 'tanh': 5.0 / 3, - 'relu': math.sqrt(2.0), - 'leaky_relu': math.sqrt(2.0 / (1 + param**2)), - 'selu': 3.0 / 4, - } - if nonlinearity in recommended_gain.keys(): - return recommended_gain[nonlinearity] - else: - raise ValueError( - "nonlinearity function {} is not suppported now.".format( - nonlinearity - ) - ) - - -# We short the class name, since users will use the initializer with the package -# name. The sample code: -# -# import paddle -# import paddle.fluid as fluid -# -# hidden = paddle.static.nn.fc(..., -# weight_attr=ParamAttr(fluid.initializer.Xavier())) -# -# It is no need to add an `Initializer` as the class suffix -Constant = ConstantInitializer -Uniform = UniformInitializer -Normal = NormalInitializer -TruncatedNormal = TruncatedNormalInitializer -Xavier = XavierInitializer -MSRA = MSRAInitializer -Bilinear = BilinearInitializer diff --git a/python/paddle/fluid/install_check.py b/python/paddle/fluid/install_check.py index 241dd71e200ab..ce93a25ccef9a 100644 --- a/python/paddle/fluid/install_check.py +++ b/python/paddle/fluid/install_check.py @@ -22,7 +22,6 @@ cpu_places, ) from .param_attr import ParamAttr -from .initializer import Constant from . import layers from . import backward from .dygraph import Layer @@ -42,7 +41,9 @@ def __init__(self, input_size): self._linear1 = paddle.nn.Linear( input_size, 3, - weight_attr=ParamAttr(initializer=Constant(value=0.1)), + weight_attr=ParamAttr( + initializer=paddle.nn.initializer.Constant(value=0.1) + ), ) def forward(self, inputs): diff --git a/python/paddle/fluid/layer_helper.py b/python/paddle/fluid/layer_helper.py index 9c3de1ba49862..0342017822cfd 100644 --- a/python/paddle/fluid/layer_helper.py +++ b/python/paddle/fluid/layer_helper.py @@ -13,7 +13,7 @@ # limitations under the License. import copy - +import paddle from .framework import ( Parameter, dtype_is_floating, @@ -22,7 +22,6 @@ _global_flags, ) from . import unique_name -from paddle.fluid.initializer import Constant, Xavier from .param_attr import ParamAttr from . 
import core @@ -178,10 +177,10 @@ def append_activation(self, input_var): # TODO (jiabin): should we remove this since it has never be used def _get_default_initializer(self, dtype): if dtype is None or dtype_is_floating(dtype) is True: - return Xavier() + return paddle.nn.initializer.XavierUniform() else: # For integer and boolean types, initialize with all zeros - return Constant() + return paddle.nn.initializer.Constant() # TODO (jiabin): reconstruct this in LayerObjHelper and avoid dependency of kwargs def is_instance(self, param_name, cls): diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index 994fc98038086..eb4d227f914ff 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -92,7 +92,7 @@ class ListenAndServ: shape=[32, 32], dtype='float32', name="X") - fluid.initializer.Constant(value=1.0)(x, main.global_block()) + paddle.nn.initializer.Constant(value=1.0)(x, main.global_block()) paddle.scale(x=x, scale=10.0, out=out_var) exe = fluid.Executor(place) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index fa0f49d01b997..1dd819df41168 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -22,7 +22,6 @@ import paddle from ..layer_helper import LayerHelper -from ..initializer import Normal, Constant from ..framework import ( Variable, OpProtoHolder, @@ -240,7 +239,7 @@ def embedding( w_param_attrs = fluid.ParamAttr( name="emb_weight", learning_rate=0.5, - initializer=fluid.initializer.NumpyArrayInitializer(weight_data), + initializer=paddle.nn.initializer.Assign(weight_data), trainable=True) emb_2 = fluid.layers.embedding(input=data, size=(128, 100), param_attr=w_param_attrs, dtype='float32') """ @@ -673,7 +672,10 @@ def autoincreased_step_counter(counter_name=None, begin=1, step=1): ) if is_new_var: helper.set_variable_initializer( - counter, initializer=Constant(value=begin - 1, force_cpu=True) + counter, + initializer=paddle.nn.initializer.ConstantInitializer( + value=begin - 1, force_cpu=True + ), ) helper.main_program.global_block()._prepend_op( type='increment', diff --git a/python/paddle/fluid/metrics.py b/python/paddle/fluid/metrics.py index 5d702b8e521bf..b04611db66866 100644 --- a/python/paddle/fluid/metrics.py +++ b/python/paddle/fluid/metrics.py @@ -19,7 +19,6 @@ import copy from .layer_helper import LayerHelper -from .initializer import Constant from . import unique_name from .framework import Program, Variable, program_guard from . 
import layers diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index c5aa80c749027..d7ab914f80ffc 100755 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -39,7 +39,6 @@ _get_no_grad_set_name, ) from .framework import program_guard -from .initializer import Constant from .layer_helper import LayerHelper from .dygraph import base as imperative_base from .dygraph import no_grad @@ -397,7 +396,8 @@ def _create_global_learning_rate(self): lr_value = float(self._learning_rate()) self.helper.set_variable_initializer( - lr_var, initializer=Constant(value=lr_value) + lr_var, + initializer=paddle.nn.initializer.Constant(value=lr_value), ) return @@ -713,7 +713,10 @@ def _add_accumulator( device = self._get_device_for_param(param.name) with device_guard(device): self.helper.set_variable_initializer( - var, initializer=Constant(value=float(fill_value)) + var, + initializer=paddle.nn.initializer.Constant( + value=float(fill_value) + ), ) if in_dygraph_mode(): @@ -774,7 +777,10 @@ def _add_global_accumulator( device = 'cpu' with device_guard(device): self.helper.set_variable_initializer( - var, initializer=Constant(value=float(fill_value)) + var, + initializer=paddle.nn.initializer.Constant( + value=float(fill_value) + ), ) if in_dygraph_mode(): @@ -1225,10 +1231,12 @@ def flatten_param_grads(self, params_grads): # NOTE(zhiqiu): the initializer should be set after coalesce_tensor op, # so the shape of flatten_param and flatten_grad will be inferred. self.helper.set_variable_initializer( - flatten_param, initializer=Constant(0.0) + flatten_param, + initializer=paddle.nn.initializer.Constant(0.0), ) self.helper.set_variable_initializer( - flatten_grad, initializer=Constant(0.0) + flatten_grad, + initializer=paddle.nn.initializer.Constant(0.0), ) return [(flatten_param, flatten_grad)] diff --git a/python/paddle/fluid/param_attr.py b/python/paddle/fluid/param_attr.py index f251a654a992b..6fdadd7904bd4 100644 --- a/python/paddle/fluid/param_attr.py +++ b/python/paddle/fluid/param_attr.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .initializer import Initializer, Xavier, Constant +import paddle from .regularizer import WeightDecayRegularizer from paddle.fluid.data_feeder import check_type @@ -88,7 +88,10 @@ def __init__( check_type(do_model_average, "do_model_average", (bool), "ParamAttr") check_type(need_clip, "need_clip", (bool), "ParamAttr") check_type( - initializer, "initializer", (Initializer, type(None)), "ParamAttr" + initializer, + "initializer", + (paddle.nn.initializer.Initializer, type(None)), + "ParamAttr", ) check_type( regularizer, @@ -139,7 +142,7 @@ def _set_default_param_initializer(self): Returns: None. """ - self._set_default_initializer(Xavier()) + self._set_default_initializer(paddle.nn.initializer.XavierUniform()) def _set_default_bias_initializer(self): """ @@ -151,7 +154,7 @@ def _set_default_bias_initializer(self): Returns: None. 
""" - self._set_default_initializer(Constant(0.0)) + self._set_default_initializer(paddle.nn.initializer.Constant(0.0)) @staticmethod def _to_attr(arg): @@ -177,7 +180,7 @@ def _to_attr(arg): return arg elif isinstance(arg, str): return ParamAttr(name=arg) - elif isinstance(arg, Initializer): + elif isinstance(arg, paddle.nn.initializer.Initializer): return ParamAttr(initializer=arg) elif isinstance(arg, WeightDecayRegularizer): return ParamAttr(regularizer=arg) diff --git a/python/paddle/fluid/tests/unittests/auto_parallel_autoconvert.py b/python/paddle/fluid/tests/unittests/auto_parallel_autoconvert.py index 593d79998a2d1..80ebe78963287 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel_autoconvert.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel_autoconvert.py @@ -36,7 +36,6 @@ save_distributed_checkpoint, ) from paddle.distributed.fleet import auto -from paddle.fluid.initializer import NumpyArrayInitializer paddle.enable_static() _global_parallel_strategy = None @@ -55,8 +54,12 @@ def __init__( np.random.seed(2021) arr0 = np.random.normal(0, 0.02, size=(d_model, dim_feedforward)) arr1 = np.random.normal(0, 0.02, size=(d_model, dim_feedforward)) - weight_attr0 = paddle.ParamAttr(initializer=NumpyArrayInitializer(arr0)) - weight_attr1 = paddle.ParamAttr(initializer=NumpyArrayInitializer(arr1)) + weight_attr0 = paddle.ParamAttr( + initializer=paddle.nn.initializer.Assign(arr0) + ) + weight_attr1 = paddle.ParamAttr( + initializer=paddle.nn.initializer.Assign(arr1) + ) bias_attr = None self.linear0 = nn.Linear( d_model, dim_feedforward, weight_attr0, bias_attr=bias_attr diff --git a/python/paddle/fluid/tests/unittests/auto_parallel_save_load.py b/python/paddle/fluid/tests/unittests/auto_parallel_save_load.py index 875536af57a35..1cb2a3e9bf1fe 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel_save_load.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel_save_load.py @@ -30,7 +30,6 @@ save_distributed_checkpoint, ) from paddle.distributed.fleet import auto -from paddle.fluid.initializer import NumpyArrayInitializer paddle.enable_static() _global_parallel_strategy = None @@ -48,7 +47,9 @@ def __init__( dim_feedforward = intermediate_size np.random.seed(2021) arr = np.random.normal(0, 0.02, size=(d_model, dim_feedforward)) - weight_attr = paddle.ParamAttr(initializer=NumpyArrayInitializer(arr)) + weight_attr = paddle.ParamAttr( + initializer=paddle.nn.initializer.Assign(arr) + ) bias_attr = None self.linear0 = nn.Linear( diff --git a/python/paddle/fluid/tests/unittests/collective/column_parallel_linear_api.py b/python/paddle/fluid/tests/unittests/collective/column_parallel_linear_api.py index 59eee4cfeee2f..c1ed3175e100e 100644 --- a/python/paddle/fluid/tests/unittests/collective/column_parallel_linear_api.py +++ b/python/paddle/fluid/tests/unittests/collective/column_parallel_linear_api.py @@ -38,15 +38,11 @@ def get_model(self, main_prog, startup_program, rank): paddle.distributed.broadcast(data, src=0) if rank == 0: param_attr = paddle.fluid.ParamAttr( - initializer=paddle.fluid.initializer.NumpyArrayInitializer( - np_array[:, 0:8] - ), + initializer=paddle.nn.initializer.Assign(np_array[:, 0:8]), ) else: param_attr = paddle.fluid.ParamAttr( - initializer=paddle.fluid.initializer.NumpyArrayInitializer( - np_array[:, 8:16] - ), + initializer=paddle.nn.initializer.Assign(np_array[:, 8:16]), ) linear_out = paddle.distributed.split( diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/parallel_dygraph_transformer.py 
b/python/paddle/fluid/tests/unittests/collective/fleet/parallel_dygraph_transformer.py index a4d20264e7301..ca4ad63066ee8 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/parallel_dygraph_transformer.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/parallel_dygraph_transformer.py @@ -242,10 +242,10 @@ def __init__(self, d_model, process_cmd, shape_len=None): self._layer_norm = paddle.nn.LayerNorm( normalized_shape=d_model, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(1.0) + initializer=paddle.nn.initializer.Constant(1.0) ), bias_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(0.0) + initializer=paddle.nn.initializer.Constant(0.0) ), ) @@ -513,7 +513,9 @@ def __init__( sparse=is_sparse, weight_attr=fluid.ParamAttr( name=word_emb_param_name, - initializer=fluid.initializer.Normal(0.0, src_emb_dim**-0.5), + initializer=paddle.nn.initializer.Normal( + 0.0, src_emb_dim**-0.5 + ), ), ) @@ -527,7 +529,7 @@ def __init__( sparse=is_sparse, weight_attr=fluid.ParamAttr( name=pos_enc_param_name, - initializer=fluid.initializer.NumpyArrayInitializer(pos_inp), + initializer=paddle.nn.initializer.Assign(pos_inp), trainable=False, ), ) diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/pipeline_mnist.py b/python/paddle/fluid/tests/unittests/collective/fleet/pipeline_mnist.py index 2eb0951756a59..1fff26b20b191 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/pipeline_mnist.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/pipeline_mnist.py @@ -39,7 +39,7 @@ def cnn_model(data): pool_stride=2, act="relu", param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01) + initializer=paddle.nn.initializer.Constant(value=0.01) ), ) conv_pool_2 = fluid.nets.simple_img_conv_pool( @@ -50,7 +50,7 @@ def cnn_model(data): pool_stride=2, act="relu", param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01) + initializer=paddle.nn.initializer.Constant(value=0.01) ), ) @@ -65,7 +65,7 @@ def cnn_model(data): size=SIZE, activation="softmax", weight_attr=fluid.param_attr.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01) + initializer=paddle.nn.initializer.Constant(value=0.01) ), ) # To cover @RENAMED@GRADIENT @@ -74,7 +74,7 @@ def cnn_model(data): size=SIZE, activation="softmax", weight_attr=fluid.param_attr.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01) + initializer=paddle.nn.initializer.Constant(value=0.01) ), ) predict += predict2 diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/pipeline_mnist_multi_device.py b/python/paddle/fluid/tests/unittests/collective/fleet/pipeline_mnist_multi_device.py index e094d932d33e4..a1d8688fd41c3 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/pipeline_mnist_multi_device.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/pipeline_mnist_multi_device.py @@ -39,7 +39,7 @@ def cnn_model(data): pool_stride=2, act="relu", param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01) + initializer=paddle.nn.initializer.Constant(value=0.01) ), ) conv_pool_2 = fluid.nets.simple_img_conv_pool( @@ -50,7 +50,7 @@ def cnn_model(data): pool_stride=2, act="relu", param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01) + initializer=paddle.nn.initializer.Constant(value=0.01) ), ) @@ -65,7 +65,7 @@ def cnn_model(data): size=SIZE, activation="softmax", weight_attr=fluid.param_attr.ParamAttr( - 
initializer=fluid.initializer.Constant(value=0.01) + initializer=paddle.nn.initializer.Constant(value=0.01) ), ) # To cover @RENAMED@GRADIENT @@ -74,7 +74,7 @@ def cnn_model(data): size=SIZE, activation="softmax", weight_attr=fluid.param_attr.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01) + initializer=paddle.nn.initializer.Constant(value=0.01) ), ) predict += predict2 diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/pipeline_mnist_one_device.py b/python/paddle/fluid/tests/unittests/collective/fleet/pipeline_mnist_one_device.py index 7e442f1914b2c..74c3c1a7269e4 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/pipeline_mnist_one_device.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/pipeline_mnist_one_device.py @@ -39,7 +39,7 @@ def cnn_model(data): pool_stride=2, act="relu", param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01) + initializer=paddle.nn.initializer.Constant(value=0.01) ), ) conv_pool_2 = fluid.nets.simple_img_conv_pool( @@ -50,7 +50,7 @@ def cnn_model(data): pool_stride=2, act="relu", param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01) + initializer=paddle.nn.initializer.Constant(value=0.01) ), ) @@ -64,7 +64,7 @@ def cnn_model(data): size=SIZE, activation="softmax", weight_attr=fluid.param_attr.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01) + initializer=paddle.nn.initializer.Constant(value=0.01) ), ) return predict diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/static_model_parallel_by_col.py b/python/paddle/fluid/tests/unittests/collective/fleet/static_model_parallel_by_col.py index 7f247abc6d9cd..035a174775bd5 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/static_model_parallel_by_col.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/static_model_parallel_by_col.py @@ -33,11 +33,9 @@ def get_param_attr(weight, bias): weight_attr = paddle.ParamAttr( - initializer=fluid.initializer.NumpyArrayInitializer(weight) - ) - bias_attr = paddle.ParamAttr( - initializer=fluid.initializer.NumpyArrayInitializer(bias) + initializer=paddle.nn.initializer.Assign(weight) ) + bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Assign(bias)) return weight_attr, bias_attr diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/static_model_parallel_by_row.py b/python/paddle/fluid/tests/unittests/collective/fleet/static_model_parallel_by_row.py index b63e2065f431b..a480993e8ec50 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/static_model_parallel_by_row.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/static_model_parallel_by_row.py @@ -33,11 +33,9 @@ def get_param_attr(weight, bias): weight_attr = paddle.ParamAttr( - initializer=fluid.initializer.NumpyArrayInitializer(weight) - ) - bias_attr = paddle.ParamAttr( - initializer=fluid.initializer.NumpyArrayInitializer(bias) + initializer=paddle.nn.initializer.Assign(weight) ) + bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Assign(bias)) return weight_attr, bias_attr @@ -65,7 +63,7 @@ def create_model(data, rank): data, size=OUT_SIZE, weight_attr=paddle.ParamAttr( - initializer=fluid.initializer.NumpyArrayInitializer(np_weight) + initializer=paddle.nn.initializer.Assign(np_weight) ), bias_attr=bias_attr, ) diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/static_model_parallel_embedding.py 
b/python/paddle/fluid/tests/unittests/collective/fleet/static_model_parallel_embedding.py index 914ee0852a043..689b068f025f2 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/static_model_parallel_embedding.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/static_model_parallel_embedding.py @@ -44,9 +44,7 @@ def create_model(data, rank): axis=0, num_partitions=MODEL_PARALLEL_SIZE, weight_attr=paddle.ParamAttr( - initializer=fluid.initializer.NumpyArrayInitializer( - np_weight_part - ) + initializer=paddle.nn.initializer.Assign(np_weight_part) ), bias_attr=False, ) @@ -55,7 +53,7 @@ def create_model(data, rank): data, size=OUT_SIZE, weight_attr=paddle.ParamAttr( - initializer=fluid.initializer.NumpyArrayInitializer(np_weight) + initializer=paddle.nn.initializer.Assign(np_weight) ), bias_attr=False, ) diff --git a/python/paddle/fluid/tests/unittests/collective/multinode/dygraph_hybrid_dpppmp.py b/python/paddle/fluid/tests/unittests/collective/multinode/dygraph_hybrid_dpppmp.py index de839e2c5eea4..ec864a1e40f9e 100644 --- a/python/paddle/fluid/tests/unittests/collective/multinode/dygraph_hybrid_dpppmp.py +++ b/python/paddle/fluid/tests/unittests/collective/multinode/dygraph_hybrid_dpppmp.py @@ -35,7 +35,7 @@ def weight_init(mp, shape, col=True, seed=1024): else: step = shape[0] // mp.nranks _w = w[mp.rank * step : mp.rank * step + step, :] - return paddle.fluid.initializer.NumpyArrayInitializer(_w) + return paddle.nn.initializer.Assign(_w) class Criterion(nn.Layer): diff --git a/python/paddle/fluid/tests/unittests/collective/multinode/dygraph_hybrid_fp16.py b/python/paddle/fluid/tests/unittests/collective/multinode/dygraph_hybrid_fp16.py index 160ed85cc9424..3f9527209134f 100644 --- a/python/paddle/fluid/tests/unittests/collective/multinode/dygraph_hybrid_fp16.py +++ b/python/paddle/fluid/tests/unittests/collective/multinode/dygraph_hybrid_fp16.py @@ -35,7 +35,7 @@ def weight_init(mp, shape, col=True, seed=1024): else: step = shape[0] // mp.nranks _w = w[mp.rank * step : mp.rank * step + step, :] - return paddle.fluid.initializer.NumpyArrayInitializer(_w) + return paddle.nn.initializer.Assign(_w) class Criterion(nn.Layer): diff --git a/python/paddle/fluid/tests/unittests/collective/multinode/dygraph_hybrid_recompute.py b/python/paddle/fluid/tests/unittests/collective/multinode/dygraph_hybrid_recompute.py index 31daee3262291..af2b1b616d132 100644 --- a/python/paddle/fluid/tests/unittests/collective/multinode/dygraph_hybrid_recompute.py +++ b/python/paddle/fluid/tests/unittests/collective/multinode/dygraph_hybrid_recompute.py @@ -36,7 +36,7 @@ def weight_init(mp, shape, col=True, seed=1024): else: step = shape[0] // mp.nranks _w = w[mp.rank * step : mp.rank * step + step, :] - return paddle.fluid.initializer.NumpyArrayInitializer(_w) + return paddle.nn.initializer.Assign(_w) class Criterion(nn.Layer): diff --git a/python/paddle/fluid/tests/unittests/collective/parallel_embedding_api.py b/python/paddle/fluid/tests/unittests/collective/parallel_embedding_api.py index 9dd3bade93aee..f89643e7bff5e 100644 --- a/python/paddle/fluid/tests/unittests/collective/parallel_embedding_api.py +++ b/python/paddle/fluid/tests/unittests/collective/parallel_embedding_api.py @@ -42,13 +42,13 @@ def get_model(self, main_prog, startup_program, rank): per_part_size = size[0] // 2 if rank == 0: param_attr = paddle.fluid.ParamAttr( - initializer=paddle.fluid.initializer.NumpyArrayInitializer( + initializer=paddle.nn.initializer.Assign( np_array[0:per_part_size, :] ), ) else: param_attr = 
paddle.fluid.ParamAttr( - initializer=paddle.fluid.initializer.NumpyArrayInitializer( + initializer=paddle.nn.initializer.Assign( np_array[per_part_size : size[0], :] ), ) diff --git a/python/paddle/fluid/tests/unittests/collective/row_parallel_linear_api.py b/python/paddle/fluid/tests/unittests/collective/row_parallel_linear_api.py index afbb3f3334364..6c3817da5ae90 100644 --- a/python/paddle/fluid/tests/unittests/collective/row_parallel_linear_api.py +++ b/python/paddle/fluid/tests/unittests/collective/row_parallel_linear_api.py @@ -39,13 +39,13 @@ def get_model(self, main_prog, startup_program, rank): data = paddle.split(data, 2, axis=1)[rank] if rank == 0: param_attr = paddle.fluid.ParamAttr( - initializer=paddle.fluid.initializer.NumpyArrayInitializer( + initializer=paddle.nn.initializer.Assign( np_array[0:500, :] ), ) else: param_attr = paddle.fluid.ParamAttr( - initializer=paddle.fluid.initializer.NumpyArrayInitializer( + initializer=paddle.nn.initializer.Assign( np_array[500:1000, :] ), ) diff --git a/python/paddle/fluid/tests/unittests/dist_allreduce_op.py b/python/paddle/fluid/tests/unittests/dist_allreduce_op.py index 30bcea4cb5cb2..044c6d78cac10 100644 --- a/python/paddle/fluid/tests/unittests/dist_allreduce_op.py +++ b/python/paddle/fluid/tests/unittests/dist_allreduce_op.py @@ -38,7 +38,7 @@ def cnn_model(data): pool_stride=2, act="relu", param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01) + initializer=paddle.nn.initializer.Constant(value=0.01) ), ) conv_pool_2 = fluid.nets.simple_img_conv_pool( @@ -49,7 +49,7 @@ def cnn_model(data): pool_stride=2, act="relu", param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01) + initializer=paddle.nn.initializer.Constant(value=0.01) ), ) @@ -63,7 +63,7 @@ def cnn_model(data): size=SIZE, activation="softmax", weight_attr=fluid.param_attr.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01) + initializer=paddle.nn.initializer.Constant(value=0.01) ), ) return predict diff --git a/python/paddle/fluid/tests/unittests/dist_ctr.py b/python/paddle/fluid/tests/unittests/dist_ctr.py index deb4cb921c1f3..dc9bd59df52fd 100644 --- a/python/paddle/fluid/tests/unittests/dist_ctr.py +++ b/python/paddle/fluid/tests/unittests/dist_ctr.py @@ -60,7 +60,7 @@ def get_model(self, batch_size=2): size=[dnn_input_dim, dnn_layer_dims[0]], param_attr=fluid.ParamAttr( name="deep_embedding", - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), ), is_sparse=IS_SPARSE, ) @@ -74,7 +74,7 @@ def get_model(self, batch_size=2): size=dim, activation="relu", weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01) + initializer=paddle.nn.initializer.Constant(value=0.01) ), name='dnn-fc-%d' % i, ) @@ -87,7 +87,7 @@ def get_model(self, batch_size=2): size=[lr_input_dim, 1], param_attr=fluid.ParamAttr( name="wide_embedding", - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), ), is_sparse=IS_SPARSE, ) diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py b/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py index 8e9341f9c5b1a..527ba34bae614 100644 --- a/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py +++ b/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py @@ -107,7 +107,7 @@ def net(self, args, is_train=True, batch_size=4, lr=0.01): size=[dnn_input_dim, dnn_layer_dims[0]], param_attr=fluid.ParamAttr( name="deep_embedding", - 
initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), ), is_sparse=True, padding_idx=0, @@ -122,7 +122,7 @@ def net(self, args, is_train=True, batch_size=4, lr=0.01): size=dim, activation="relu", weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01) + initializer=paddle.nn.initializer.Constant(value=0.01) ), name='dnn-fc-%d' % i, ) @@ -135,7 +135,7 @@ def net(self, args, is_train=True, batch_size=4, lr=0.01): size=[lr_input_dim, 1], param_attr=fluid.ParamAttr( name="wide_embedding", - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), ), is_sparse=True, padding_idx=0, diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_heter_pipeline_ctr.py b/python/paddle/fluid/tests/unittests/dist_fleet_heter_pipeline_ctr.py index 3e71a1cb6054d..de0f32e3110a5 100644 --- a/python/paddle/fluid/tests/unittests/dist_fleet_heter_pipeline_ctr.py +++ b/python/paddle/fluid/tests/unittests/dist_fleet_heter_pipeline_ctr.py @@ -78,7 +78,7 @@ def net(self, args, batch_size=4, lr=0.01): size=[dnn_input_dim, dnn_layer_dims[0]], param_attr=fluid.ParamAttr( name="deep_embedding", - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), ), is_sparse=True, ) @@ -94,7 +94,7 @@ def net(self, args, batch_size=4, lr=0.01): size=[lr_input_dim, 1], param_attr=fluid.ParamAttr( name="wide_embedding", - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), ), is_sparse=True, ) @@ -109,7 +109,7 @@ def net(self, args, batch_size=4, lr=0.01): size=dim, activation="relu", weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01) + initializer=paddle.nn.initializer.Constant(value=0.01) ), name='dnn-fc-%d' % i, ) diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_raw_program_optimizer.py b/python/paddle/fluid/tests/unittests/dist_fleet_raw_program_optimizer.py index dc0a7022b3434..453b715b50394 100644 --- a/python/paddle/fluid/tests/unittests/dist_fleet_raw_program_optimizer.py +++ b/python/paddle/fluid/tests/unittests/dist_fleet_raw_program_optimizer.py @@ -40,7 +40,7 @@ def cnn_model(data): pool_stride=2, act="relu", param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01) + initializer=paddle.nn.initializer.Constant(value=0.01) ), ) conv_pool_2 = fluid.nets.simple_img_conv_pool( @@ -51,7 +51,7 @@ def cnn_model(data): pool_stride=2, act="relu", param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01) + initializer=paddle.nn.initializer.Constant(value=0.01) ), ) @@ -65,7 +65,7 @@ def cnn_model(data): size=SIZE, activation="softmax", weight_attr=fluid.param_attr.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01) + initializer=paddle.nn.initializer.Constant(value=0.01) ), ) return predict diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_raw_program_optimizer_fuse_allreduce.py b/python/paddle/fluid/tests/unittests/dist_fleet_raw_program_optimizer_fuse_allreduce.py index ac1a4c632fd49..25f8663c7406a 100644 --- a/python/paddle/fluid/tests/unittests/dist_fleet_raw_program_optimizer_fuse_allreduce.py +++ b/python/paddle/fluid/tests/unittests/dist_fleet_raw_program_optimizer_fuse_allreduce.py @@ -40,7 +40,7 @@ def cnn_model(data): pool_stride=2, act="relu", param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01) + 
initializer=paddle.nn.initializer.Constant(value=0.01) ), ) conv_pool_2 = fluid.nets.simple_img_conv_pool( @@ -51,7 +51,7 @@ def cnn_model(data): pool_stride=2, act="relu", param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01) + initializer=paddle.nn.initializer.Constant(value=0.01) ), ) @@ -65,7 +65,7 @@ def cnn_model(data): size=SIZE, activation="softmax", weight_attr=fluid.param_attr.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01) + initializer=paddle.nn.initializer.Constant(value=0.01) ), ) return predict diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_simnet_bow.py b/python/paddle/fluid/tests/unittests/dist_fleet_simnet_bow.py index b673bfeae16e2..bd4fc90fd244f 100644 --- a/python/paddle/fluid/tests/unittests/dist_fleet_simnet_bow.py +++ b/python/paddle/fluid/tests/unittests/dist_fleet_simnet_bow.py @@ -124,7 +124,8 @@ def train_network( is_distributed=is_distributed, size=[dict_dim, emb_dim], param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), name="__emb__" + initializer=paddle.nn.initializer.Constant(value=0.01), + name="__emb__", ), is_sparse=is_sparse, ) @@ -137,7 +138,7 @@ def train_network( x=q_ss, size=hid_dim, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__q_fc__", learning_rate=base_lr, ), @@ -149,7 +150,7 @@ def train_network( is_distributed=is_distributed, size=[dict_dim, emb_dim], param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__emb__", learning_rate=emb_lr, ), @@ -164,7 +165,8 @@ def train_network( x=pt_ss, size=hid_dim, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), name="__fc__" + initializer=paddle.nn.initializer.Constant(value=0.01), + name="__fc__", ), bias_attr=fluid.ParamAttr(name="__fc_b__"), ) @@ -175,7 +177,8 @@ def train_network( is_distributed=is_distributed, size=[dict_dim, emb_dim], param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), name="__emb__" + initializer=paddle.nn.initializer.Constant(value=0.01), + name="__emb__", ), is_sparse=is_sparse, ) @@ -188,7 +191,8 @@ def train_network( x=nt_ss, size=hid_dim, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), name="__fc__" + initializer=paddle.nn.initializer.Constant(value=0.01), + name="__fc__", ), bias_attr=fluid.ParamAttr(name="__fc_b__"), ) diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_sparse_embedding_ctr.py b/python/paddle/fluid/tests/unittests/dist_fleet_sparse_embedding_ctr.py index a9a2d7be0ba41..1780e7dfe2dde 100644 --- a/python/paddle/fluid/tests/unittests/dist_fleet_sparse_embedding_ctr.py +++ b/python/paddle/fluid/tests/unittests/dist_fleet_sparse_embedding_ctr.py @@ -86,11 +86,11 @@ def net(self, args, batch_size=4, lr=0.01): inference = bool(int(os.getenv("INFERENCE", "0"))) if initializer == 0: - init = fluid.initializer.Constant(value=0.01) + init = paddle.nn.initializer.Constant(value=0.01) elif initializer == 1: - init = fluid.initializer.Uniform() + init = paddle.nn.initializer.Uniform() elif initializer == 2: - init = fluid.initializer.Normal() + init = paddle.nn.initializer.Normal() else: raise ValueError("error initializer code: {}".format(initializer)) @@ -113,7 +113,7 @@ def net(self, args, batch_size=4, lr=0.01): size=dim, activation="relu", weight_attr=fluid.ParamAttr( - 
initializer=fluid.initializer.Constant(value=0.01) + initializer=paddle.nn.initializer.Constant(value=0.01) ), name='dnn-fc-%d' % i, ) @@ -127,7 +127,7 @@ def net(self, args, batch_size=4, lr=0.01): entry=entry, param_attr=fluid.ParamAttr( name="wide_embedding", - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), ), ) diff --git a/python/paddle/fluid/tests/unittests/dist_mnist.py b/python/paddle/fluid/tests/unittests/dist_mnist.py index 87eb22dceac1c..30c1130e33c85 100644 --- a/python/paddle/fluid/tests/unittests/dist_mnist.py +++ b/python/paddle/fluid/tests/unittests/dist_mnist.py @@ -39,7 +39,7 @@ def cnn_model(data): pool_stride=2, act="relu", param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01) + initializer=paddle.nn.initializer.Constant(value=0.01) ), ) conv_pool_2 = fluid.nets.simple_img_conv_pool( @@ -50,7 +50,7 @@ def cnn_model(data): pool_stride=2, act="relu", param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01) + initializer=paddle.nn.initializer.Constant(value=0.01) ), ) @@ -64,7 +64,7 @@ def cnn_model(data): size=SIZE, activation="softmax", weight_attr=fluid.param_attr.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01) + initializer=paddle.nn.initializer.Constant(value=0.01) ), ) return predict diff --git a/python/paddle/fluid/tests/unittests/dist_se_resnext.py b/python/paddle/fluid/tests/unittests/dist_se_resnext.py index db3318d67d88a..6482ac53b09d8 100644 --- a/python/paddle/fluid/tests/unittests/dist_se_resnext.py +++ b/python/paddle/fluid/tests/unittests/dist_se_resnext.py @@ -121,7 +121,7 @@ def net(self, input, class_dim=1000): size=class_dim, activation='softmax', weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.05) + initializer=paddle.nn.initializer.Constant(value=0.05) ), ) return out @@ -174,7 +174,7 @@ def conv_bn_layer( act=None, # avoid pserver CPU init differs from GPU param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.05) + initializer=paddle.nn.initializer.Constant(value=0.05) ), bias_attr=False, ) @@ -187,7 +187,7 @@ def squeeze_excitation(self, input, num_channels, reduction_ratio): x=pool, size=num_channels // reduction_ratio, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.05) + initializer=paddle.nn.initializer.Constant(value=0.05) ), activation='relu', ) @@ -196,7 +196,7 @@ def squeeze_excitation(self, input, num_channels, reduction_ratio): x=squeeze, size=num_channels, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.05) + initializer=paddle.nn.initializer.Constant(value=0.05) ), activation='sigmoid', ) diff --git a/python/paddle/fluid/tests/unittests/dist_text_classification.py b/python/paddle/fluid/tests/unittests/dist_text_classification.py index a287bd8a6c878..d29997ef8a08e 100644 --- a/python/paddle/fluid/tests/unittests/dist_text_classification.py +++ b/python/paddle/fluid/tests/unittests/dist_text_classification.py @@ -59,7 +59,7 @@ def conv_net( size=[dict_dim, emb_dim], is_sparse=False, param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01) + initializer=paddle.nn.initializer.Constant(value=0.01) ), ) @@ -70,7 +70,7 @@ def conv_net( act="tanh", pool_type="max", param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01) + initializer=paddle.nn.initializer.Constant(value=0.01) ), ) @@ -78,7 +78,7 @@ def conv_net( x=[conv_3], size=fc0_dim, 
weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01) + initializer=paddle.nn.initializer.Constant(value=0.01) ), ) @@ -87,7 +87,7 @@ def conv_net( size=class_dim, activation="softmax", weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01) + initializer=paddle.nn.initializer.Constant(value=0.01) ), ) diff --git a/python/paddle/fluid/tests/unittests/dist_transformer.py b/python/paddle/fluid/tests/unittests/dist_transformer.py index 5b0343bd81c24..e9ce91c197c1a 100644 --- a/python/paddle/fluid/tests/unittests/dist_transformer.py +++ b/python/paddle/fluid/tests/unittests/dist_transformer.py @@ -29,7 +29,9 @@ import paddle.fluid.layers as layers import paddle.nn.functional as F -const_para_attr = fluid.ParamAttr(initializer=fluid.initializer.Constant(0.001)) +const_para_attr = fluid.ParamAttr( + initializer=paddle.nn.initializer.Constant(0.001) +) const_bias_attr = const_para_attr # Fix seed for test @@ -1253,8 +1255,8 @@ def pre_post_process_layer(prev_out, out, process_cmd, dropout_rate=0.0): out = layers.layer_norm( out, begin_norm_axis=len(out.shape) - 1, - param_attr=fluid.initializer.Constant(1.0), - bias_attr=fluid.initializer.Constant(0.0), + param_attr=paddle.nn.initializer.Constant(1.0), + bias_attr=paddle.nn.initializer.Constant(0.0), ) elif cmd == "d": # add dropout if dropout_rate: @@ -1292,7 +1294,7 @@ def prepare_encoder( size=[src_vocab_size, src_emb_dim], param_attr=fluid.ParamAttr( name=word_emb_param_name, - initializer=fluid.initializer.ConstantInitializer(0.001), + initializer=paddle.nn.initializer.Constant(0.001), ), ) else: @@ -1301,7 +1303,9 @@ def prepare_encoder( size=[src_vocab_size, src_emb_dim], param_attr=fluid.ParamAttr( name=word_emb_param_name, - initializer=fluid.initializer.Normal(0.0, src_emb_dim**-0.5), + initializer=paddle.nn.initializer.Normal( + 0.0, src_emb_dim**-0.5 + ), ), ) @@ -1312,7 +1316,7 @@ def prepare_encoder( param_attr=fluid.ParamAttr( name=pos_enc_param_name, trainable=False, - initializer=fluid.initializer.ConstantInitializer(0.001), + initializer=paddle.nn.initializer.Constant(0.001), ), ) src_pos_enc.stop_gradient = True diff --git a/python/paddle/fluid/tests/unittests/dist_word2vec.py b/python/paddle/fluid/tests/unittests/dist_word2vec.py index e10131667c745..f5de20385f26b 100644 --- a/python/paddle/fluid/tests/unittests/dist_word2vec.py +++ b/python/paddle/fluid/tests/unittests/dist_word2vec.py @@ -41,7 +41,7 @@ def __network__(words): is_sparse=IS_SPARSE, param_attr=fluid.ParamAttr( name='shared_w', - initializer=fluid.initializer.Constant(value=0.1), + initializer=paddle.nn.initializer.Constant(value=0.1), ), ) embed_second = fluid.layers.embedding( @@ -51,7 +51,7 @@ def __network__(words): is_sparse=IS_SPARSE, param_attr=fluid.ParamAttr( name='shared_w', - initializer=fluid.initializer.Constant(value=0.1), + initializer=paddle.nn.initializer.Constant(value=0.1), ), ) embed_third = fluid.layers.embedding( @@ -61,7 +61,7 @@ def __network__(words): is_sparse=IS_SPARSE, param_attr=fluid.ParamAttr( name='shared_w', - initializer=fluid.initializer.Constant(value=0.1), + initializer=paddle.nn.initializer.Constant(value=0.1), ), ) embed_forth = fluid.layers.embedding( @@ -71,7 +71,7 @@ def __network__(words): is_sparse=IS_SPARSE, param_attr=fluid.ParamAttr( name='shared_w', - initializer=fluid.initializer.Constant(value=0.1), + initializer=paddle.nn.initializer.Constant(value=0.1), ), ) @@ -84,7 +84,7 @@ def __network__(words): size=HIDDEN_SIZE, activation='sigmoid', 
weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.1) + initializer=paddle.nn.initializer.Constant(value=0.1) ), ) predict_word = paddle.static.nn.fc( @@ -92,7 +92,7 @@ def __network__(words): size=dict_size, activation='softmax', weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.1) + initializer=paddle.nn.initializer.Constant(value=0.1) ), ) cost = paddle.nn.functional.cross_entropy( diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/test_auto_parallel_gradient_merge_pass.py b/python/paddle/fluid/tests/unittests/distributed_passes/test_auto_parallel_gradient_merge_pass.py index a99b56974a8ae..9b9d45db082c0 100644 --- a/python/paddle/fluid/tests/unittests/distributed_passes/test_auto_parallel_gradient_merge_pass.py +++ b/python/paddle/fluid/tests/unittests/distributed_passes/test_auto_parallel_gradient_merge_pass.py @@ -26,7 +26,6 @@ import paddle.static as static import paddle.utils as utils from paddle.distributed.fleet import auto -from paddle.fluid.initializer import NumpyArrayInitializer logging.getLogger().setLevel(logging.INFO) paddle.enable_static() @@ -42,8 +41,12 @@ def __init__( np.random.seed(2021) arr0 = np.random.normal(0, 0.02, size=(d_model, dim_feedforward)) arr1 = np.random.normal(0, 0.02, size=(dim_feedforward, d_model)) - weight_attr0 = paddle.ParamAttr(initializer=NumpyArrayInitializer(arr0)) - weight_attr1 = paddle.ParamAttr(initializer=NumpyArrayInitializer(arr1)) + weight_attr0 = paddle.ParamAttr( + initializer=paddle.nn.initializer.Assign(arr0) + ) + weight_attr1 = paddle.ParamAttr( + initializer=paddle.nn.initializer.Assign(arr1) + ) bias_attr = None self.linear0 = nn.Linear( d_model, dim_feedforward, weight_attr0, bias_attr=bias_attr diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/bert_dygraph_model.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/bert_dygraph_model.py index 1ea69dfbb1569..8629a3e185297 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/bert_dygraph_model.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/bert_dygraph_model.py @@ -203,8 +203,8 @@ def __init__(self, config, return_pooled_out=True, use_fp16=False): self._sent_emb_name = "sent_embedding" self._dtype = "float16" if use_fp16 else "float32" - self._param_initializer = fluid.initializer.TruncatedNormal( - scale=config['initializer_range'] + self._param_initializer = paddle.nn.initializer.TruncatedNormal( + std=config['initializer_range'] ) paddle.set_default_dtype(self._dtype) self._src_emb = paddle.nn.Embedding( @@ -317,8 +317,8 @@ def __init__( self._prepostprocess_dropout = config['hidden_dropout_prob'] self._word_emb_name = "word_embedding" - self._param_initializer = fluid.initializer.TruncatedNormal( - scale=config['initializer_range'] + self._param_initializer = paddle.nn.initializer.TruncatedNormal( + std=config['initializer_range'] ) self._weight_sharing = weight_sharing self.use_fp16 = use_fp16 @@ -343,7 +343,7 @@ def __init__( self.mask_lm_out_bias_attr = fluid.ParamAttr( name="mask_lm_out_fc.b_0", - initializer=fluid.initializer.Constant(value=0.0), + initializer=paddle.nn.initializer.Constant(value=0.0), ) if not self._weight_sharing: diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/darknet.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/darknet.py index 783dfff262e8f..1e7950c29e222 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/darknet.py +++ 
b/python/paddle/fluid/tests/unittests/dygraph_to_static/darknet.py @@ -41,7 +41,7 @@ def __init__( padding=padding, groups=groups, weight_attr=ParamAttr( - initializer=fluid.initializer.Normal(0.0, 0.02) + initializer=paddle.nn.initializer.Normal(0.0, 0.02) ), bias_attr=False, ) @@ -49,11 +49,11 @@ def __init__( num_channels=ch_out, is_test=is_test, param_attr=ParamAttr( - initializer=fluid.initializer.Normal(0.0, 0.02), + initializer=paddle.nn.initializer.Normal(0.0, 0.02), regularizer=L2Decay(0.0), ), bias_attr=ParamAttr( - initializer=fluid.initializer.Constant(0.0), + initializer=paddle.nn.initializer.Constant(0.0), regularizer=L2Decay(0.0), ), ) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/seq2seq_dygraph_model.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/seq2seq_dygraph_model.py index 99d90a7f1eaa5..88581c023f3d9 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/seq2seq_dygraph_model.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/seq2seq_dygraph_model.py @@ -26,10 +26,8 @@ INF = 1.0 * 1e5 alpha = 0.6 -uniform_initializer = lambda x: fluid.initializer.UniformInitializer( - low=-x, high=x -) -zero_constant = fluid.initializer.Constant(0.0) +uniform_initializer = lambda x: paddle.nn.initializer.Uniform(low=-x, high=x) +zero_constant = paddle.nn.initializer.Constant(0.0) class BasicLSTMUnit(Layer): diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model.py index c76b4dba9cb8e..7f93c83b91433 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model.py @@ -17,9 +17,9 @@ import paddle import paddle.fluid as fluid import paddle.fluid.param_attr as attr +from paddle.common_ops_import import Variable from paddle.fluid.dygraph import Layer from paddle.jit.api import to_static -from paddle.static import Variable class EmbeddingLayer: @@ -48,7 +48,8 @@ def ops(self): sparse=True, padding_idx=self.padding_idx, weight_attr=attr.ParamAttr( - name=self.name, initializer=fluid.initializer.Xavier() + name=self.name, + initializer=paddle.nn.initializer.XavierUniform(), ), ) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model_v2.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model_v2.py index 99fe330c69241..d8c5956357827 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model_v2.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model_v2.py @@ -15,7 +15,7 @@ from functools import reduce import paddle -from paddle.static import Variable +from paddle.common_ops_import import Variable class EmbeddingLayer: diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_basic_api_transformation.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_basic_api_transformation.py index 1f589b8d6fc8b..e1aaeabd48b8f 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_basic_api_transformation.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_basic_api_transformation.py @@ -118,10 +118,10 @@ def dyfunc_BilinearTensorProduct(layer1, layer2): 4, 1000, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.99) + initializer=paddle.nn.initializer.Constant(value=0.99) ), bias_attr=fluid.ParamAttr( - 
initializer=fluid.initializer.Constant(value=0.5) + initializer=paddle.nn.initializer.Constant(value=0.5) ), ) @@ -138,10 +138,10 @@ def dyfunc_Conv2D(input): out_channels=2, kernel_size=3, weight_attr=paddle.ParamAttr( - initializer=fluid.initializer.Constant(value=0.99) + initializer=paddle.nn.initializer.Constant(value=0.99) ), bias_attr=paddle.ParamAttr( - initializer=fluid.initializer.Constant(value=0.5) + initializer=paddle.nn.initializer.Constant(value=0.5) ), ) res = conv2d(input) @@ -170,10 +170,10 @@ def dyfunc_Conv2DTranspose(input): 12, 12, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.99) + initializer=paddle.nn.initializer.Constant(value=0.99) ), bias_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.5) + initializer=paddle.nn.initializer.Constant(value=0.5) ), ) ret = conv2dTranspose(input) @@ -222,7 +222,7 @@ def dyfunc_Pool2D(input): def dyfunc_Prelu(input): prelu0 = paddle.nn.PReLU( weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(1.0) + initializer=paddle.nn.initializer.Constant(1.0) ), ) res = prelu0(input) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bmn.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bmn.py index a6a9d7281208d..55a93f769e25c 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bmn.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bmn.py @@ -115,11 +115,11 @@ def __init__( k = 1.0 / math.sqrt(fan_in) param_attr = ParamAttr( name=prefix + "_w", - initializer=fluid.initializer.Uniform(low=-k, high=k), + initializer=paddle.nn.initializer.Uniform(low=-k, high=k), ) bias_attr = ParamAttr( name=prefix + "_b", - initializer=fluid.initializer.Uniform(low=-k, high=k), + initializer=paddle.nn.initializer.Uniform(low=-k, high=k), ) self._conv2d = paddle.nn.Conv2D( diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_convert_call.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_convert_call.py index c14631c35b6b4..59df33e5aa9e7 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_convert_call.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_convert_call.py @@ -125,10 +125,10 @@ def __init__(self): out_channels=2, kernel_size=3, weight_attr=paddle.ParamAttr( - initializer=fluid.initializer.Constant(value=0.99) + initializer=paddle.nn.initializer.Constant(value=0.99) ), bias_attr=paddle.ParamAttr( - initializer=fluid.initializer.Constant(value=0.5) + initializer=paddle.nn.initializer.Constant(value=0.5) ), ) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_cycle_gan.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_cycle_gan.py index 0701750e3011a..b3556f0810197 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_cycle_gan.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_cycle_gan.py @@ -352,7 +352,7 @@ def __init__( con_bias_attr = False else: con_bias_attr = fluid.ParamAttr( - initializer=fluid.initializer.Constant(0.0) + initializer=paddle.nn.initializer.Constant(0.0) ) self.conv = paddle.nn.Conv2D( @@ -362,9 +362,7 @@ def __init__( stride=stride, padding=padding, weight_attr=paddle.ParamAttr( - initializer=fluid.initializer.NormalInitializer( - loc=0.0, scale=stddev - ) + initializer=paddle.nn.initializer.Normal(mean=0.0, std=stddev) ), bias_attr=con_bias_attr, ) @@ -378,10 +376,10 @@ def __init__( use_global_stats=True, # set True to use deterministic algorithm 
num_channels=num_filters, param_attr=fluid.ParamAttr( - initializer=fluid.initializer.NormalInitializer(1.0, 0.02) + initializer=paddle.nn.initializer.Normal(1.0, 0.02) ), bias_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(0.0) + initializer=paddle.nn.initializer.Constant(0.0) ), trainable_statistics=True, ) @@ -421,7 +419,7 @@ def __init__( de_bias_attr = False else: de_bias_attr = fluid.ParamAttr( - initializer=fluid.initializer.Constant(0.0) + initializer=paddle.nn.initializer.Constant(0.0) ) self._deconv = paddle.nn.Conv2DTranspose( @@ -431,9 +429,7 @@ def __init__( stride=stride, padding=padding, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.NormalInitializer( - loc=0.0, scale=stddev - ) + initializer=paddle.nn.initializer.Normal(mean=0.0, std=stddev) ), bias_attr=de_bias_attr, ) @@ -444,10 +440,10 @@ def __init__( use_global_stats=True, # set True to use deterministic algorithm num_channels=num_filters, param_attr=fluid.ParamAttr( - initializer=fluid.initializer.NormalInitializer(1.0, 0.02) + initializer=paddle.nn.initializer.Normal(1.0, 0.02) ), bias_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(0.0) + initializer=paddle.nn.initializer.Constant(0.0) ), trainable_statistics=True, ) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py index 94e1dba49313a..0d108b40406ba 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py @@ -98,7 +98,7 @@ def __init__(self, input_dim, grnn_hidden_dim, init_bound, h_0=None): in_features=input_dim, out_features=grnn_hidden_dim * 3, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Uniform( + initializer=paddle.nn.initializer.Uniform( low=-init_bound, high=init_bound ), regularizer=fluid.regularizer.L2DecayRegularizer( @@ -111,7 +111,7 @@ def __init__(self, input_dim, grnn_hidden_dim, init_bound, h_0=None): size=grnn_hidden_dim, h_0=h_0, param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Uniform( + initializer=paddle.nn.initializer.Uniform( low=-init_bound, high=init_bound ), regularizer=fluid.regularizer.L2DecayRegularizer( @@ -124,7 +124,7 @@ def __init__(self, input_dim, grnn_hidden_dim, init_bound, h_0=None): in_features=input_dim, out_features=grnn_hidden_dim * 3, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Uniform( + initializer=paddle.nn.initializer.Uniform( low=-init_bound, high=init_bound ), regularizer=fluid.regularizer.L2DecayRegularizer( @@ -138,7 +138,7 @@ def __init__(self, input_dim, grnn_hidden_dim, init_bound, h_0=None): is_reverse=True, h_0=h_0, param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Uniform( + initializer=paddle.nn.initializer.Uniform( low=-init_bound, high=init_bound ), regularizer=fluid.regularizer.L2DecayRegularizer( @@ -375,7 +375,7 @@ def __init__(self, args, length=None): weight_attr=fluid.ParamAttr( learning_rate=self.emb_lr, name="word_emb", - initializer=fluid.initializer.Uniform( + initializer=paddle.nn.initializer.Uniform( low=-self.init_bound, high=self.init_bound ), ), @@ -415,7 +415,7 @@ def __init__(self, args, length=None): in_features=self.grnn_hidden_dim * 2, out_features=self.num_labels, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Uniform( + initializer=paddle.nn.initializer.Uniform( low=-self.init_bound, high=self.init_bound ), regularizer=fluid.regularizer.L2DecayRegularizer( diff --git 
a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py index d708dc1eadfed..72f3dd7c33190 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py @@ -22,7 +22,6 @@ import paddle import paddle.fluid as fluid -from paddle.fluid.initializer import MSRA from paddle.fluid.param_attr import ParamAttr from paddle.jit.api import to_static from paddle.jit.translated_layer import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX @@ -61,7 +60,8 @@ def __init__( padding=padding, groups=num_groups, weight_attr=ParamAttr( - initializer=MSRA(), name=self.full_name() + "_weights" + initializer=paddle.nn.initializer.KaimingUniform(), + name=self.full_name() + "_weights", ), bias_attr=False, ) @@ -259,7 +259,8 @@ def __init__(self, scale=1.0, class_dim=1000): int(1024 * scale), class_dim, weight_attr=ParamAttr( - initializer=MSRA(), name=self.full_name() + "fc7_weights" + initializer=paddle.nn.initializer.KaimingUniform(), + name=self.full_name() + "fc7_weights", ), bias_attr=ParamAttr(name="fc7_offset"), ) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ptb_lm.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ptb_lm.py index 53687ca6c1ea5..1099f2dad667a 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ptb_lm.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ptb_lm.py @@ -49,26 +49,26 @@ def __init__( for i in range(self._num_layers): weight_1 = self.create_parameter( attr=fluid.ParamAttr( - initializer=fluid.initializer.UniformInitializer( + initializer=paddle.nn.initializer.Uniform( low=-self._init_scale, high=self._init_scale ) ), shape=[self._hidden_size * 2, self._hidden_size * 4], dtype="float32", - default_initializer=fluid.initializer.UniformInitializer( + default_initializer=paddle.nn.initializer.Uniform( low=-self._init_scale, high=self._init_scale ), ) self.weight_1_arr.append(self.add_parameter('w_%d' % i, weight_1)) bias_1 = self.create_parameter( attr=fluid.ParamAttr( - initializer=fluid.initializer.UniformInitializer( + initializer=paddle.nn.initializer.Uniform( low=-self._init_scale, high=self._init_scale ) ), shape=[self._hidden_size * 4], dtype="float32", - default_initializer=fluid.initializer.Constant(0.0), + default_initializer=paddle.nn.initializer.Constant(0.0), ) self.bias_arr.append(self.add_parameter('b_%d' % i, bias_1)) @@ -158,7 +158,7 @@ def __init__( sparse=False, weight_attr=fluid.ParamAttr( name='embedding_para', - initializer=fluid.initializer.UniformInitializer( + initializer=paddle.nn.initializer.Uniform( low=-init_scale, high=init_scale ), ), @@ -167,7 +167,7 @@ def __init__( attr=fluid.ParamAttr(), shape=[self.hidden_size, self.vocab_size], dtype="float32", - default_initializer=fluid.initializer.UniformInitializer( + default_initializer=paddle.nn.initializer.Uniform( low=-self.init_scale, high=self.init_scale ), ) @@ -175,7 +175,7 @@ def __init__( attr=fluid.ParamAttr(), shape=[self.vocab_size], dtype="float32", - default_initializer=fluid.initializer.UniformInitializer( + default_initializer=paddle.nn.initializer.Uniform( low=-self.init_scale, high=self.init_scale ), ) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py index 911ca2ec9016f..407e11349c2de 100644 --- 
a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py @@ -194,7 +194,7 @@ def __init__(self, layers=50, class_dim=102): self.pool2d_avg_output, class_dim, weight_attr=fluid.param_attr.ParamAttr( - initializer=fluid.initializer.Uniform(-stdv, stdv) + initializer=paddle.nn.initializer.Uniform(-stdv, stdv) ), ) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_se_resnet.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_se_resnet.py index e01b77af7655b..723a7c742c198 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_se_resnet.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_se_resnet.py @@ -131,7 +131,7 @@ def __init__(self, num_channels, reduction_ratio): num_channels, num_channels // reduction_ratio, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Uniform(-stdv, stdv) + initializer=paddle.nn.initializer.Uniform(-stdv, stdv) ), ) stdv = 1.0 / math.sqrt(num_channels / 16.0 * 1.0) @@ -139,7 +139,7 @@ def __init__(self, num_channels, reduction_ratio): num_channels // reduction_ratio, num_channels, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Uniform(-stdv, stdv) + initializer=paddle.nn.initializer.Uniform(-stdv, stdv) ), ) @@ -316,7 +316,7 @@ def __init__(self, layers=50, class_dim=102): self.pool2d_avg_output, class_dim, weight_attr=fluid.param_attr.ParamAttr( - initializer=fluid.initializer.Uniform(-stdv, stdv) + initializer=paddle.nn.initializer.Uniform(-stdv, stdv) ), ) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_word2vec.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_word2vec.py index 560132565907e..5eb7cfc1080c7 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_word2vec.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_word2vec.py @@ -230,7 +230,7 @@ def __init__(self, name_scope, vocab_size, embedding_size, init_scale=0.1): self.embedding_size, weight_attr=fluid.ParamAttr( name='embedding_para', - initializer=fluid.initializer.UniformInitializer( + initializer=paddle.nn.initializer.Uniform( low=-0.5 / self.embedding_size, high=0.5 / self.embedding_size, ), @@ -242,7 +242,7 @@ def __init__(self, name_scope, vocab_size, embedding_size, init_scale=0.1): self.embedding_size, weight_attr=fluid.ParamAttr( name='embedding_out_para', - initializer=fluid.initializer.UniformInitializer( + initializer=paddle.nn.initializer.Uniform( low=-0.5 / self.embedding_size, high=0.5 / self.embedding_size, ), diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_dygraph_model.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_dygraph_model.py index 18afc4a4ab9d5..3928c715a6288 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_dygraph_model.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_dygraph_model.py @@ -62,10 +62,10 @@ def __init__(self, process_cmd, d_model, dropout_rate): paddle.nn.LayerNorm( normalized_shape=d_model, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(1.0) + initializer=paddle.nn.initializer.Constant(1.0) ), bias_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(0.0) + initializer=paddle.nn.initializer.Constant(0.0) ), ), ) @@ -295,7 +295,7 @@ def __init__(self, vocab_size, emb_dim, bos_idx=0): vocab_size, emb_dim, weight_attr=fluid.ParamAttr( - 
initializer=fluid.initializer.Normal(0.0, emb_dim**-0.5) + initializer=paddle.nn.initializer.Normal(0.0, emb_dim**-0.5) ), ) @@ -330,7 +330,7 @@ def __init__( max_length, self.emb_dim, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.NumpyArrayInitializer( + initializer=paddle.nn.initializer.Assign( position_encoding_init(max_length, self.emb_dim) ), trainable=False, @@ -522,7 +522,7 @@ def __init__( max_length, self.emb_dim, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.NumpyArrayInitializer( + initializer=paddle.nn.initializer.Assign( position_encoding_init(max_length, self.emb_dim) ), trainable=False, diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/yolov3.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/yolov3.py index 10df42faa2373..dbfc43cfc2432 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/yolov3.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/yolov3.py @@ -253,10 +253,10 @@ def __init__(self, ch_in, is_train=True, use_random=False): stride=1, padding=0, weight_attr=ParamAttr( - initializer=fluid.initializer.Normal(0.0, 0.02) + initializer=paddle.nn.initializer.Normal(0.0, 0.02) ), bias_attr=ParamAttr( - initializer=fluid.initializer.Constant(0.0), + initializer=paddle.nn.initializer.Constant(0.0), regularizer=L2Decay(0.0), ), ), diff --git a/python/paddle/fluid/tests/unittests/fleet_heter_ps_training.py b/python/paddle/fluid/tests/unittests/fleet_heter_ps_training.py index 917beec752d2a..895f71c4858e9 100644 --- a/python/paddle/fluid/tests/unittests/fleet_heter_ps_training.py +++ b/python/paddle/fluid/tests/unittests/fleet_heter_ps_training.py @@ -70,7 +70,7 @@ def net(batch_size=4, lr=0.01): size=[dnn_input_dim, dnn_layer_dims[0]], param_attr=fluid.ParamAttr( name="deep_embedding", - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), ), is_sparse=True, ) @@ -86,7 +86,7 @@ def net(batch_size=4, lr=0.01): size=[lr_input_dim, 1], param_attr=fluid.ParamAttr( name="wide_embedding", - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), ), is_sparse=True, ) @@ -99,7 +99,7 @@ def net(batch_size=4, lr=0.01): size=dim, activation="relu", weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01) + initializer=paddle.nn.initializer.Constant(value=0.01) ), name='dnn-fc-%d' % i, ) diff --git a/python/paddle/fluid/tests/unittests/ir/inference/program_config.py b/python/paddle/fluid/tests/unittests/ir/inference/program_config.py index d48091f6c10c1..1d2b442d2dd05 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/program_config.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/program_config.py @@ -27,7 +27,6 @@ Operator, convert_np_dtype_to_dtype_, ) -from paddle.fluid.initializer import NumpyArrayInitializer from paddle.static.quantization import ( QuantizationFreezePass, QuantizationTransformPass, @@ -305,7 +304,7 @@ def create_fake_model(program_config): shape=tensor_config.shape, type=core.VarDesc.VarType.LOD_TENSOR, name=name, - initializer=NumpyArrayInitializer(tensor_config.data), + initializer=paddle.nn.initializer.Assign(tensor_config.data), ) in_vars = [] for name in sorted(save_var_map.keys()): diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_bias_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_bias_fuse_pass.py index 1f32de177e3ee..536f6c4d606d0 100644 --- 
a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_bias_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_bias_fuse_pass.py @@ -30,7 +30,7 @@ def setUp(self): name="data", shape=[-1, 3, 100, 100], dtype="float32" ) param_attr = fluid.ParamAttr( - initializer=fluid.initializer.Xavier(uniform=False), + initializer=paddle.nn.initializer.XavierNormal(), learning_rate=0.001, ) conv_out = paddle.static.nn.conv2d( @@ -63,7 +63,7 @@ def setUp(self): name="data", shape=[-1, 3, 100, 100], dtype="float32" ) param_attr = fluid.ParamAttr( - initializer=fluid.initializer.Xavier(uniform=False), + initializer=paddle.nn.initializer.XavierNormal(), learning_rate=0.001, ) conv_out = paddle.static.nn.conv2d( @@ -89,7 +89,7 @@ def setUp(self): name="data", shape=[-1, 3, 100, 100], dtype="float32" ) param_attr = fluid.ParamAttr( - initializer=fluid.initializer.Xavier(uniform=False), + initializer=paddle.nn.initializer.XavierNormal(), learning_rate=0.001, ) conv_out = paddle.static.nn.conv2d( @@ -114,7 +114,7 @@ def setUp(self): name="data", shape=[-1, 3, 100, 100], dtype="float32" ) param_attr = fluid.ParamAttr( - initializer=fluid.initializer.Xavier(uniform=False), + initializer=paddle.nn.initializer.XavierNormal(), learning_rate=0.001, ) conv_out = paddle.static.nn.conv2d( @@ -145,7 +145,7 @@ def setUp(self): name="data", shape=[-1, 3, 100, 100], dtype="float32" ) param_attr = fluid.ParamAttr( - initializer=fluid.initializer.Xavier(uniform=False), + initializer=paddle.nn.initializer.XavierNormal(), learning_rate=0.001, ) conv_out = paddle.static.nn.conv2d( @@ -173,7 +173,7 @@ def setUp(self): with fluid.program_guard(self.main_program, self.startup_program): data = fluid.data(name="data", shape=[-1, 3, 5, 5], dtype="float32") param_attr = fluid.ParamAttr( - initializer=fluid.initializer.Xavier(uniform=False), + initializer=paddle.nn.initializer.XavierNormal(), learning_rate=0.001, ) conv_out = paddle.static.nn.conv2d_transpose( diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_subgraph_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_subgraph_pass.py index da9a86725c008..0e7eb56da9133 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_subgraph_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_subgraph_pass.py @@ -180,11 +180,11 @@ def setUp(self): ) param_attr = fluid.ParamAttr( name='instance_norm_w', - initializer=fluid.initializer.Constant(value=1.0), + initializer=paddle.nn.initializer.Constant(value=1.0), ) bias_attr = fluid.ParamAttr( name='instance_norm_b', - initializer=fluid.initializer.Constant(value=0.0), + initializer=paddle.nn.initializer.Constant(value=0.0), ) out = paddle.static.nn.instance_norm( input=data, param_attr=param_attr, bias_attr=bias_attr diff --git a/python/paddle/fluid/tests/unittests/mlu/test_batch_norm_op_mlu_v2.py b/python/paddle/fluid/tests/unittests/mlu/test_batch_norm_op_mlu_v2.py index 17672d668d38a..590ebbf63efa5 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_batch_norm_op_mlu_v2.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_batch_norm_op_mlu_v2.py @@ -114,11 +114,11 @@ def compute_v3(x, is_test, trainable_statistics): shape[1], is_test=is_test, param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(1.0), + initializer=paddle.nn.initializer.Constant(1.0), trainable=False, ), bias_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(0.0), + initializer=paddle.nn.initializer.Constant(0.0),
trainable=False, ), trainable_statistics=trainable_statistics, @@ -262,7 +262,7 @@ def test_global_stats(self): net1 = paddle.nn.BatchNorm( 6, param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(1.0) + initializer=paddle.nn.initializer.Constant(1.0) ), use_global_stats=self.use_global_stats, trainable_statistics=self.trainable_statistics, diff --git a/python/paddle/fluid/tests/unittests/npu/test_adam_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_adam_op_npu.py index 5bf239b5bc77d..646466e9504d4 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_adam_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_adam_op_npu.py @@ -322,12 +322,12 @@ def _test( label_np = np.random.randint(2, size=(2, 1)).astype('int64') weight_attr1 = paddle.ParamAttr( name="weight1", - initializer=fluid.initializer.Constant(value=1.0), + initializer=paddle.nn.initializer.Constant(value=1.0), trainable=True, ) weight_attr2 = paddle.ParamAttr( name="weight2", - initializer=fluid.initializer.Constant(value=2.0), + initializer=paddle.nn.initializer.Constant(value=2.0), trainable=True, ) clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0) diff --git a/python/paddle/fluid/tests/unittests/npu/test_run_program_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_run_program_op_npu.py index 69769bbdc1f08..5369f4d410bda 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_run_program_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_run_program_op_npu.py @@ -298,7 +298,7 @@ def build_model(self): weight_attr = fluid.ParamAttr( name=self.input_names['Params'][0], learning_rate=0.5, - initializer=fluid.initializer.NumpyArrayInitializer( + initializer=paddle.nn.initializer.Assign( self.inputs['Params'][self.input_names['Params'][0]] ), trainable=True, @@ -306,7 +306,7 @@ def build_model(self): bias_attr = fluid.ParamAttr( name=self.input_names['Params'][1], learning_rate=0.5, - initializer=fluid.initializer.NumpyArrayInitializer( + initializer=paddle.nn.initializer.Assign( self.inputs['Params'][self.input_names['Params'][1]] ), trainable=True, diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_sparse_embedding.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_sparse_embedding.py index 11aacd02439e9..96b0b734a174c 100644 --- a/python/paddle/fluid/tests/unittests/parallel_dygraph_sparse_embedding.py +++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_sparse_embedding.py @@ -41,7 +41,7 @@ def __init__( self.hidden_size, sparse=is_sparse, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.UniformInitializer( + initializer=paddle.nn.initializer.Uniform( low=-init_scale, high=init_scale ) ), @@ -50,7 +50,7 @@ def __init__( attr=fluid.ParamAttr(), shape=[self.hidden_size, self.vocab_size], dtype=dtype, - default_initializer=fluid.initializer.UniformInitializer( + default_initializer=paddle.nn.initializer.Uniform( low=-self.init_scale, high=self.init_scale ), ) @@ -58,7 +58,7 @@ def __init__( attr=fluid.ParamAttr(), shape=[self.vocab_size], dtype=dtype, - default_initializer=fluid.initializer.UniformInitializer( + default_initializer=paddle.nn.initializer.Uniform( low=-self.init_scale, high=self.init_scale ), ) diff --git a/python/paddle/fluid/tests/unittests/simple_nets.py b/python/paddle/fluid/tests/unittests/simple_nets.py index a3ff2b6865744..cf4372818ba1e 100644 --- a/python/paddle/fluid/tests/unittests/simple_nets.py +++ b/python/paddle/fluid/tests/unittests/simple_nets.py @@ -26,7 +26,7 @@ def simple_fc_net_with_inputs(img, 
label, class_num=10): size=100, activation='relu', bias_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=1.0) + initializer=paddle.nn.initializer.Constant(value=1.0) ), ) prediction = paddle.static.nn.fc( @@ -53,7 +53,7 @@ def batchnorm_fc_with_inputs(img, label, class_num=10): size=200, activation='relu', bias_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=1.0) + initializer=paddle.nn.initializer.Constant(value=1.0) ), ) diff --git a/python/paddle/fluid/tests/unittests/static_model_parallel_fused_attention.py b/python/paddle/fluid/tests/unittests/static_model_parallel_fused_attention.py index d018c52506bff..37048d7cd256b 100644 --- a/python/paddle/fluid/tests/unittests/static_model_parallel_fused_attention.py +++ b/python/paddle/fluid/tests/unittests/static_model_parallel_fused_attention.py @@ -25,11 +25,9 @@ def get_param_attr(weight, bias): weight_attr = paddle.ParamAttr( - initializer=fluid.initializer.NumpyArrayInitializer(weight) - ) - bias_attr = paddle.ParamAttr( - initializer=fluid.initializer.NumpyArrayInitializer(bias) + initializer=paddle.nn.initializer.Assign(weight) ) + bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Assign(bias)) return weight_attr, bias_attr diff --git a/python/paddle/fluid/tests/unittests/static_model_parallel_fused_feedforward.py b/python/paddle/fluid/tests/unittests/static_model_parallel_fused_feedforward.py index 89cd0453d747b..4fca47635a1de 100644 --- a/python/paddle/fluid/tests/unittests/static_model_parallel_fused_feedforward.py +++ b/python/paddle/fluid/tests/unittests/static_model_parallel_fused_feedforward.py @@ -30,11 +30,9 @@ def get_param_attr(weight, bias): weight_attr = paddle.ParamAttr( - initializer=fluid.initializer.NumpyArrayInitializer(weight) - ) - bias_attr = paddle.ParamAttr( - initializer=fluid.initializer.NumpyArrayInitializer(bias) + initializer=paddle.nn.initializer.Assign(weight) ) + bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Assign(bias)) return weight_attr, bias_attr diff --git a/python/paddle/fluid/tests/unittests/static_model_parallel_fused_multi_transformer.py b/python/paddle/fluid/tests/unittests/static_model_parallel_fused_multi_transformer.py index 047bd3ae8ad27..9c863d6d3be8b 100644 --- a/python/paddle/fluid/tests/unittests/static_model_parallel_fused_multi_transformer.py +++ b/python/paddle/fluid/tests/unittests/static_model_parallel_fused_multi_transformer.py @@ -25,11 +25,9 @@ def get_param_attr(weight, bias): weight_attr = paddle.ParamAttr( - initializer=fluid.initializer.NumpyArrayInitializer(weight) - ) - bias_attr = paddle.ParamAttr( - initializer=fluid.initializer.NumpyArrayInitializer(bias) + initializer=paddle.nn.initializer.Assign(weight) ) + bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Assign(bias)) return weight_attr, bias_attr diff --git a/python/paddle/fluid/tests/unittests/test_adam_op.py b/python/paddle/fluid/tests/unittests/test_adam_op.py index d7052c94720a4..2983e5ca1958e 100644 --- a/python/paddle/fluid/tests/unittests/test_adam_op.py +++ b/python/paddle/fluid/tests/unittests/test_adam_op.py @@ -767,12 +767,12 @@ def _test( label_np = np.random.randint(2, size=(2, 1)).astype('int64') weight_attr1 = paddle.ParamAttr( name="weight1", - initializer=fluid.initializer.Constant(value=1.0), + initializer=paddle.nn.initializer.Constant(value=1.0), trainable=True, ) weight_attr2 = paddle.ParamAttr( name="weight2", - initializer=fluid.initializer.Constant(value=2.0), + 
initializer=paddle.nn.initializer.Constant(value=2.0), trainable=True, ) clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0) @@ -926,7 +926,7 @@ def test_adam_flatten_param_grads_with_regularizer(self): main = fluid.Program() weight_attr = paddle.ParamAttr( name="weight1", - initializer=fluid.initializer.Constant(value=1.0), + initializer=paddle.nn.initializer.Constant(value=1.0), regularizer=fluid.regularizer.L1DecayRegularizer( regularization_coeff=0.1 ), diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py index 25e4ab9aa8b4a..c15f647a380fe 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py @@ -39,7 +39,6 @@ from paddle.distributed.auto_parallel.reshard import Resharder from paddle.distributed.fleet import auto from paddle.fluid import core -from paddle.fluid.initializer import NumpyArrayInitializer if os.getenv("CUDA_VISIBLE_DEVICES") is not None: os.environ["CUDA_VISIBLE_DEVICES"] = "" @@ -373,10 +372,18 @@ def __init__( arr1 = np.random.normal(0, 0.02, size=(dim_feedforward, d_model)) arr2 = np.random.normal(0, 0.02, size=(d_model, dim_feedforward)) arr3 = np.random.normal(0, 0.02, size=(dim_feedforward, d_model)) - weight_attr0 = paddle.ParamAttr(initializer=NumpyArrayInitializer(arr0)) - weight_attr1 = paddle.ParamAttr(initializer=NumpyArrayInitializer(arr1)) - weight_attr2 = paddle.ParamAttr(initializer=NumpyArrayInitializer(arr2)) - weight_attr3 = paddle.ParamAttr(initializer=NumpyArrayInitializer(arr3)) + weight_attr0 = paddle.ParamAttr( + initializer=paddle.nn.initializer.Assign(arr0) + ) + weight_attr1 = paddle.ParamAttr( + initializer=paddle.nn.initializer.Assign(arr1) + ) + weight_attr2 = paddle.ParamAttr( + initializer=paddle.nn.initializer.Assign(arr2) + ) + weight_attr3 = paddle.ParamAttr( + initializer=paddle.nn.initializer.Assign(arr3) + ) bias_attr = None self.linear0 = nn.Linear( d_model, dim_feedforward, weight_attr0, bias_attr=bias_attr diff --git a/python/paddle/fluid/tests/unittests/test_avoid_twice_initialization.py b/python/paddle/fluid/tests/unittests/test_avoid_twice_initialization.py index 77062eee5a376..efbf4a538e009 100644 --- a/python/paddle/fluid/tests/unittests/test_avoid_twice_initialization.py +++ b/python/paddle/fluid/tests/unittests/test_avoid_twice_initialization.py @@ -14,6 +14,7 @@ import unittest +import paddle import paddle.fluid as fluid @@ -22,7 +23,7 @@ def test_avoid_twice_initialization(self): cur_program = fluid.Program() cur_block = cur_program.current_block() var = cur_block.create_parameter( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), shape=[2, 2], dtype='float32', name='var_a', @@ -40,7 +41,7 @@ def test_avoid_twice_initialization(self): attrs={'ring_id': 0}, ) var2 = cur_block.create_parameter( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), shape=[2, 2], dtype='float32', name='var_a', diff --git a/python/paddle/fluid/tests/unittests/test_base_layer.py b/python/paddle/fluid/tests/unittests/test_base_layer.py index 05d9b71c1e437..7414c3732b18f 100644 --- a/python/paddle/fluid/tests/unittests/test_base_layer.py +++ b/python/paddle/fluid/tests/unittests/test_base_layer.py @@ -26,7 +26,7 @@ class L1(fluid.Layer): def __init__(self): super().__init__() self._param_attr = fluid.ParamAttr( - 
initializer=fluid.initializer.Constant(value=0.1) + initializer=paddle.nn.initializer.Constant(value=0.1) ) self.w1 = self.create_parameter( attr=self._param_attr, shape=[2, 2], dtype='float32', is_bias=False diff --git a/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py b/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py index ece07889df4e9..d6127ff5dd78a 100644 --- a/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py +++ b/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py @@ -183,11 +183,11 @@ def compute_v3(x, is_test, trainable_statistics): shape[1], is_test=is_test, param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(1.0), + initializer=paddle.nn.initializer.Constant(1.0), trainable=False, ), bias_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(0.0), + initializer=paddle.nn.initializer.Constant(0.0), trainable=False, ), trainable_statistics=trainable_statistics, @@ -378,7 +378,7 @@ def test_global_stats(self): net1 = paddle.nn.BatchNorm( 6, param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(1.0) + initializer=paddle.nn.initializer.Constant(1.0) ), use_global_stats=self.use_global_stats, trainable_statistics=self.trainable_statistics, diff --git a/python/paddle/fluid/tests/unittests/test_calc_gradient.py b/python/paddle/fluid/tests/unittests/test_calc_gradient.py index d8d20e41aac26..38a1284f0ae4a 100644 --- a/python/paddle/fluid/tests/unittests/test_calc_gradient.py +++ b/python/paddle/fluid/tests/unittests/test_calc_gradient.py @@ -48,7 +48,7 @@ def test1(self): name='x', shape=[1], dtype='float32', - default_initializer=fluid.initializer.Constant(3), + default_initializer=paddle.nn.initializer.Constant(3), ) (grad1,) = fluid.gradients(net(x), x) # 2x = 6 z = net(x - grad1) @@ -69,7 +69,7 @@ def test2(self): name='x', shape=[1], dtype='float32', - default_initializer=fluid.initializer.Constant(1), + default_initializer=paddle.nn.initializer.Constant(1), ) y = x * x (dx1,) = fluid.gradients(y, x) diff --git a/python/paddle/fluid/tests/unittests/test_communicator_geo.py b/python/paddle/fluid/tests/unittests/test_communicator_geo.py index 0e836dca1c2e5..a82c0e023c6c0 100644 --- a/python/paddle/fluid/tests/unittests/test_communicator_geo.py +++ b/python/paddle/fluid/tests/unittests/test_communicator_geo.py @@ -41,7 +41,7 @@ def net(self): size=[10000, 10], param_attr=fluid.ParamAttr( name="embedding", - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), ), is_sparse=True, ) diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_layer.py b/python/paddle/fluid/tests/unittests/test_conv2d_layer.py index edd3d718c437e..7cb0a066141db 100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_layer.py +++ b/python/paddle/fluid/tests/unittests/test_conv2d_layer.py @@ -18,7 +18,6 @@ import paddle import paddle.fluid.dygraph as dg -import paddle.fluid.initializer as I import paddle.nn.functional as F from paddle import fluid, nn @@ -110,11 +109,11 @@ def fluid_layer(self, place): else (-1, self.num_channels, -1, -1) ) x_var = fluid.data("input", input_shape, dtype=self.dtype) - weight_attr = I.NumpyArrayInitializer(self.weight) + weight_attr = paddle.nn.initializer.Assign(self.weight) if self.bias is None: bias_attr = False else: - bias_attr = I.NumpyArrayInitializer(self.bias) + bias_attr = paddle.nn.initializer.Assign(self.bias) if self.padding_mode != 'zeros': x_var = F.pad( x_var, diff --git 
a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_layer.py b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_layer.py index 9eeb4fc82dfb1..50c80c3aa32d6 100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_layer.py +++ b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_layer.py @@ -18,7 +18,6 @@ import paddle import paddle.fluid.dygraph as dg -import paddle.fluid.initializer as I import paddle.nn.functional as F from paddle import fluid, nn @@ -101,11 +100,11 @@ def fluid_layer(self, place): else (-1, self.num_channels, -1, -1) ) x_var = fluid.data("input", input_shape, dtype=self.dtype) - weight_attr = I.NumpyArrayInitializer(self.weight) + weight_attr = paddle.nn.initializer.Assign(self.weight) if self.bias is None: bias_attr = False else: - bias_attr = I.NumpyArrayInitializer(self.bias) + bias_attr = paddle.nn.initializer.Assign(self.bias) y_var = paddle.static.nn.conv2d_transpose( x_var, diff --git a/python/paddle/fluid/tests/unittests/test_conv3d_layer.py b/python/paddle/fluid/tests/unittests/test_conv3d_layer.py index 78276fbf76db1..8ef86daf69a03 100644 --- a/python/paddle/fluid/tests/unittests/test_conv3d_layer.py +++ b/python/paddle/fluid/tests/unittests/test_conv3d_layer.py @@ -18,7 +18,6 @@ import paddle import paddle.fluid.dygraph as dg -import paddle.fluid.initializer as I import paddle.nn.functional as F from paddle import fluid, nn @@ -97,11 +96,11 @@ def fluid_layer(self, place): else (-1, self.num_channels, -1, -1, -1) ) x_var = fluid.data("input", input_shape, dtype=self.dtype) - weight_attr = I.NumpyArrayInitializer(self.weight) + weight_attr = paddle.nn.initializer.Assign(self.weight) if self.bias is None: bias_attr = False else: - bias_attr = I.NumpyArrayInitializer(self.bias) + bias_attr = paddle.nn.initializer.Assign(self.bias) y_var = paddle.static.nn.conv3d( x_var, self.num_filters, diff --git a/python/paddle/fluid/tests/unittests/test_conv3d_transpose_layer.py b/python/paddle/fluid/tests/unittests/test_conv3d_transpose_layer.py index 1ea071142c6c7..82c08348f4bf1 100644 --- a/python/paddle/fluid/tests/unittests/test_conv3d_transpose_layer.py +++ b/python/paddle/fluid/tests/unittests/test_conv3d_transpose_layer.py @@ -18,7 +18,6 @@ import paddle import paddle.fluid.dygraph as dg -import paddle.fluid.initializer as I import paddle.nn.functional as F from paddle import fluid, nn @@ -99,11 +98,11 @@ def fluid_layer(self, place): else (-1, self.num_channels, -1, -1, -1) ) x_var = fluid.data("input", input_shape, dtype=self.dtype) - weight_attr = I.NumpyArrayInitializer(self.weight) + weight_attr = paddle.nn.initializer.Assign(self.weight) if self.bias is None: bias_attr = False else: - bias_attr = I.NumpyArrayInitializer(self.bias) + bias_attr = paddle.nn.initializer.Assign(self.bias) y_var = paddle.static.nn.conv3d_transpose( x_var, self.num_filters, diff --git a/python/paddle/fluid/tests/unittests/test_cuda_random_seed.py b/python/paddle/fluid/tests/unittests/test_cuda_random_seed.py index 32f77ab290b88..b7a0c981bacd6 100644 --- a/python/paddle/fluid/tests/unittests/test_cuda_random_seed.py +++ b/python/paddle/fluid/tests/unittests/test_cuda_random_seed.py @@ -120,15 +120,15 @@ def test_gen_TruncatedNormal_initializer(self): result_1 = paddle.static.nn.fc( x, size=10, - weight_attr=fluid.initializer.TruncatedNormal( - loc=0.0, scale=2.0 + weight_attr=paddle.nn.initializer.TruncatedNormal( + mean=0.0, std=2.0 ), ) result_2 = paddle.static.nn.fc( x, size=10, - weight_attr=fluid.initializer.TruncatedNormal( - loc=0.0, 
scale=2.0 + weight_attr=paddle.nn.initializer.TruncatedNormal( + mean=0.0, std=2.0 ), ) diff --git a/python/paddle/fluid/tests/unittests/test_decoupled_py_reader.py b/python/paddle/fluid/tests/unittests/test_decoupled_py_reader.py index e02282cb9bee1..a5f193daa4c50 100644 --- a/python/paddle/fluid/tests/unittests/test_decoupled_py_reader.py +++ b/python/paddle/fluid/tests/unittests/test_decoupled_py_reader.py @@ -61,7 +61,7 @@ def simple_fc_net(places, use_legacy_py_reader, use_double_buffer): size=hidden_size, activation='tanh', bias_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=1.0) + initializer=paddle.nn.initializer.Constant(value=1.0) ), ) diff --git a/python/paddle/fluid/tests/unittests/test_desc_clone.py b/python/paddle/fluid/tests/unittests/test_desc_clone.py index ecb49c3172fb9..2bf68add10281 100644 --- a/python/paddle/fluid/tests/unittests/test_desc_clone.py +++ b/python/paddle/fluid/tests/unittests/test_desc_clone.py @@ -58,9 +58,7 @@ def cnn_model(data): size=SIZE, activation="softmax", weight_attr=fluid.param_attr.ParamAttr( - initializer=fluid.initializer.NormalInitializer( - loc=0.0, scale=scale - ) + initializer=paddle.nn.initializer.Normal(mean=0.0, std=scale) ), ) return predict diff --git a/python/paddle/fluid/tests/unittests/test_detach.py b/python/paddle/fluid/tests/unittests/test_detach.py index 1a01b7667feb1..d3622bd042de9 100644 --- a/python/paddle/fluid/tests/unittests/test_detach.py +++ b/python/paddle/fluid/tests/unittests/test_detach.py @@ -120,7 +120,7 @@ def detach_multi(self): initializer=paddle.nn.initializer.Constant(5.0) ) linear_b_param_attrs = fluid.ParamAttr( - initializer=fluid.initializer.Constant(6.0) + initializer=paddle.nn.initializer.Constant(6.0) ) linear = Linear( 4, @@ -132,7 +132,7 @@ def detach_multi(self): initializer=paddle.nn.initializer.Constant(7.0) ) linear1_b_param_attrs = fluid.ParamAttr( - initializer=fluid.initializer.Constant(8.0) + initializer=paddle.nn.initializer.Constant(8.0) ) linear1 = Linear( 10, diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_async.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_async.py index c560dfa8dbb0b..f0f85e1645124 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_async.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_async.py @@ -58,7 +58,7 @@ def test_a_sync_optimizer3(self): size=[1000000000, 100000], param_attr=paddle.fluid.ParamAttr( name="embedding", - initializer=paddle.fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), ), is_sparse=True, ) diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_program.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_program.py index bc17b0d67f990..69b341a026762 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_program.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_program.py @@ -89,7 +89,7 @@ def embedding_layer(input): size=[100001, 10], param_attr=fluid.ParamAttr( name="SparseFeatFactors", - initializer=fluid.initializer.Uniform(), + initializer=paddle.nn.initializer.Uniform(), ), ) @@ -103,8 +103,8 @@ def embedding_layer(input): size=400, activation="relu", weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Normal( - scale=1 / math.sqrt(concated.shape[1]) + initializer=paddle.nn.initializer.Normal( + std=1 / math.sqrt(concated.shape[1]) ) ), name="fc1", @@
-116,8 +116,8 @@ def embedding_layer(input): size=400, activation="relu", weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Normal( - scale=1 / math.sqrt(fc1.shape[1]) + initializer=paddle.nn.initializer.Normal( + std=1 / math.sqrt(fc1.shape[1]) ) ), name="fc2", @@ -129,8 +129,8 @@ def embedding_layer(input): size=400, activation="relu", weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Normal( - scale=1 / math.sqrt(fc2.shape[1]) + initializer=paddle.nn.initializer.Normal( + std=1 / math.sqrt(fc2.shape[1]) ) ), name="fc3", @@ -142,8 +142,8 @@ def embedding_layer(input): size=2, activation="softmax", weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Normal( - scale=1 / math.sqrt(fc3.shape[1]) + initializer=paddle.nn.initializer.Normal( + std=1 / math.sqrt(fc3.shape[1]) ) ), ) diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_minimize.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_minimize.py index b60ff0db63e7d..da63b75f50fa4 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_minimize.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_minimize.py @@ -77,7 +77,7 @@ def get_loss(cos_q_pt, cos_q_nt): input=q, size=[dict_dim, emb_dim], param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__emb__", learning_rate=emb_lr, ), @@ -91,7 +91,7 @@ def get_loss(cos_q_pt, cos_q_nt): x=q_ss, size=hid_dim, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__q_fc__", learning_rate=base_lr, ), @@ -107,7 +107,7 @@ def get_loss(cos_q_pt, cos_q_nt): input=pt, size=[dict_dim, emb_dim], param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__emb__", learning_rate=emb_lr, ), @@ -121,7 +121,7 @@ def get_loss(cos_q_pt, cos_q_nt): x=pt_ss, size=hid_dim, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__fc__", learning_rate=base_lr, ), @@ -136,7 +136,7 @@ def get_loss(cos_q_pt, cos_q_nt): input=nt, size=[dict_dim, emb_dim], param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__emb__", learning_rate=emb_lr, ), @@ -150,7 +150,7 @@ def get_loss(cos_q_pt, cos_q_nt): x=nt_ss, size=hid_dim, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__fc__", learning_rate=base_lr, ), diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps.py index a330b45b52228..ea30485e5aba0 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps.py @@ -78,7 +78,7 @@ def get_loss(cos_q_pt, cos_q_nt): is_distributed=is_distributed, size=[dict_dim, emb_dim], param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__emb__", learning_rate=emb_lr, ), @@ -93,7 +93,7 @@ def get_loss(cos_q_pt, cos_q_nt): x=q_ss, size=hid_dim, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__q_fc__", 
learning_rate=base_lr, ), @@ -110,7 +110,7 @@ def get_loss(cos_q_pt, cos_q_nt): is_distributed=is_distributed, size=[dict_dim, emb_dim], param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__emb__", learning_rate=emb_lr, ), @@ -125,7 +125,7 @@ def get_loss(cos_q_pt, cos_q_nt): x=pt_ss, size=hid_dim, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__fc__", learning_rate=base_lr, ), @@ -141,7 +141,7 @@ def get_loss(cos_q_pt, cos_q_nt): is_distributed=is_distributed, size=[dict_dim, emb_dim], param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__emb__", learning_rate=emb_lr, ), @@ -156,7 +156,7 @@ def get_loss(cos_q_pt, cos_q_nt): x=nt_ss, size=hid_dim, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__fc__", learning_rate=base_lr, ), diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps11.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps11.py index 2143dc94d39e0..861e015568370 100755 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps11.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps11.py @@ -77,7 +77,7 @@ def get_loss(cos_q_pt, cos_q_nt): input=q, size=[dict_dim, emb_dim], param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__emb__", learning_rate=emb_lr, ), @@ -91,7 +91,7 @@ def get_loss(cos_q_pt, cos_q_nt): x=q_ss, size=hid_dim, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__q_fc__", learning_rate=base_lr, ), @@ -107,7 +107,7 @@ def get_loss(cos_q_pt, cos_q_nt): input=pt, size=[dict_dim, emb_dim], param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__emb__", learning_rate=emb_lr, ), @@ -121,7 +121,7 @@ def get_loss(cos_q_pt, cos_q_nt): x=pt_ss, size=hid_dim, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__fc__", learning_rate=base_lr, ), @@ -136,7 +136,7 @@ def get_loss(cos_q_pt, cos_q_nt): input=nt, size=[dict_dim, emb_dim], param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__emb__", learning_rate=emb_lr, ), @@ -150,7 +150,7 @@ def get_loss(cos_q_pt, cos_q_nt): x=nt_ss, size=hid_dim, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__fc__", learning_rate=base_lr, ), diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps12.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps12.py index bee3cd9eb2239..1ab2d5178241b 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps12.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps12.py @@ -80,7 +80,7 @@ def get_loss(cos_q_pt, cos_q_nt): input=q, size=[dict_dim, emb_dim], param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + 
initializer=paddle.nn.initializer.Constant(value=0.01), name="__emb__", learning_rate=emb_lr, ), @@ -94,7 +94,7 @@ def get_loss(cos_q_pt, cos_q_nt): x=q_ss, size=hid_dim, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__q_fc__", learning_rate=base_lr, ), @@ -110,7 +110,7 @@ def get_loss(cos_q_pt, cos_q_nt): input=pt, size=[dict_dim, emb_dim], param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__emb__", learning_rate=emb_lr, ), @@ -124,7 +124,7 @@ def get_loss(cos_q_pt, cos_q_nt): x=pt_ss, size=hid_dim, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__fc__", learning_rate=base_lr, ), @@ -139,7 +139,7 @@ def get_loss(cos_q_pt, cos_q_nt): input=nt, size=[dict_dim, emb_dim], param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__emb__", learning_rate=emb_lr, ), @@ -153,7 +153,7 @@ def get_loss(cos_q_pt, cos_q_nt): x=nt_ss, size=hid_dim, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__fc__", learning_rate=base_lr, ), diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps13.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps13.py index 58248d325b145..b17451098f405 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps13.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps13.py @@ -81,7 +81,7 @@ def get_loss(cos_q_pt, cos_q_nt): input=q, size=[dict_dim, emb_dim], param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__emb__", learning_rate=emb_lr, ), @@ -95,7 +95,7 @@ def get_loss(cos_q_pt, cos_q_nt): x=q_ss, size=hid_dim, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__q_fc__", learning_rate=base_lr, ), @@ -111,7 +111,7 @@ def get_loss(cos_q_pt, cos_q_nt): input=pt, size=[dict_dim, emb_dim], param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__emb__", learning_rate=emb_lr, ), @@ -125,7 +125,7 @@ def get_loss(cos_q_pt, cos_q_nt): x=pt_ss, size=hid_dim, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__fc__", learning_rate=base_lr, ), @@ -140,7 +140,7 @@ def get_loss(cos_q_pt, cos_q_nt): input=nt, size=[dict_dim, emb_dim], param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__emb__", learning_rate=emb_lr, ), @@ -154,7 +154,7 @@ def get_loss(cos_q_pt, cos_q_nt): x=nt_ss, size=hid_dim, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__fc__", learning_rate=base_lr, ), diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps2.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps2.py index e207fb859de54..c9e6cb2035d69 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps2.py +++ 
b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps2.py @@ -80,7 +80,7 @@ def get_loss(cos_q_pt, cos_q_nt): input=q, size=[dict_dim, emb_dim], param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__emb__", learning_rate=emb_lr, ), @@ -95,7 +95,7 @@ def get_loss(cos_q_pt, cos_q_nt): x=q_ss, size=hid_dim, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__q_fc__", learning_rate=base_lr, ), @@ -111,7 +111,7 @@ def get_loss(cos_q_pt, cos_q_nt): input=pt, size=[dict_dim, emb_dim], param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__emb__", learning_rate=emb_lr, ), @@ -125,7 +125,7 @@ def get_loss(cos_q_pt, cos_q_nt): x=pt_ss, size=hid_dim, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__fc__", learning_rate=base_lr, ), @@ -140,7 +140,7 @@ def get_loss(cos_q_pt, cos_q_nt): input=nt, size=[dict_dim, emb_dim], param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__emb__", learning_rate=emb_lr, ), @@ -154,7 +154,7 @@ def get_loss(cos_q_pt, cos_q_nt): x=nt_ss, size=hid_dim, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__fc__", learning_rate=base_lr, ), diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps3.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps3.py index 4093fc34cc998..2a5f845b93b64 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps3.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps3.py @@ -78,7 +78,7 @@ def get_loss(cos_q_pt, cos_q_nt): is_distributed=is_distributed, size=[dict_dim, emb_dim], param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__emb__", learning_rate=emb_lr, ), @@ -93,7 +93,7 @@ def get_loss(cos_q_pt, cos_q_nt): x=q_ss, size=hid_dim, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__q_fc__", learning_rate=base_lr, ), @@ -110,7 +110,7 @@ def get_loss(cos_q_pt, cos_q_nt): is_distributed=is_distributed, size=[dict_dim, emb_dim], param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__emb__", learning_rate=emb_lr, ), @@ -125,7 +125,7 @@ def get_loss(cos_q_pt, cos_q_nt): x=pt_ss, size=hid_dim, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__fc__", learning_rate=base_lr, ), @@ -141,7 +141,7 @@ def get_loss(cos_q_pt, cos_q_nt): is_distributed=is_distributed, size=[dict_dim, emb_dim], param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__emb__", learning_rate=emb_lr, ), @@ -156,7 +156,7 @@ def get_loss(cos_q_pt, cos_q_nt): x=nt_ss, size=hid_dim, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + 
initializer=paddle.nn.initializer.Constant(value=0.01), name="__fc__", learning_rate=base_lr, ), diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps4.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps4.py index 025b3e90b37d4..094ea32967205 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps4.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps4.py @@ -77,7 +77,7 @@ def get_loss(cos_q_pt, cos_q_nt): input=q, size=[dict_dim, emb_dim], param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__emb__", learning_rate=emb_lr, ), @@ -91,7 +91,7 @@ def get_loss(cos_q_pt, cos_q_nt): x=q_ss, size=hid_dim, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__q_fc__", learning_rate=base_lr, ), @@ -107,7 +107,7 @@ def get_loss(cos_q_pt, cos_q_nt): input=pt, size=[dict_dim, emb_dim], param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__emb__", learning_rate=emb_lr, ), @@ -121,7 +121,7 @@ def get_loss(cos_q_pt, cos_q_nt): x=pt_ss, size=hid_dim, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__fc__", learning_rate=base_lr, ), @@ -136,7 +136,7 @@ def get_loss(cos_q_pt, cos_q_nt): input=nt, size=[dict_dim, emb_dim], param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__emb__", learning_rate=emb_lr, ), @@ -150,7 +150,7 @@ def get_loss(cos_q_pt, cos_q_nt): x=nt_ss, size=hid_dim, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__fc__", learning_rate=base_lr, ), diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps5.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps5.py index 51bf54b3241b4..40abc45e0ab32 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps5.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps5.py @@ -78,7 +78,7 @@ def get_loss(cos_q_pt, cos_q_nt): is_distributed=is_distributed, size=[dict_dim, emb_dim], param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__emb__", learning_rate=emb_lr, ), @@ -93,7 +93,7 @@ def get_loss(cos_q_pt, cos_q_nt): x=q_ss, size=hid_dim, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__q_fc__", learning_rate=base_lr, ), @@ -110,7 +110,7 @@ def get_loss(cos_q_pt, cos_q_nt): is_distributed=is_distributed, size=[dict_dim, emb_dim], param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__emb__", learning_rate=emb_lr, ), @@ -125,7 +125,7 @@ def get_loss(cos_q_pt, cos_q_nt): x=pt_ss, size=hid_dim, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__fc__", learning_rate=base_lr, ), @@ -141,7 +141,7 @@ def get_loss(cos_q_pt, cos_q_nt): is_distributed=is_distributed, size=[dict_dim, emb_dim], param_attr=fluid.ParamAttr( - 
initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__emb__tmp_", learning_rate=emb_lr, ), @@ -156,7 +156,7 @@ def get_loss(cos_q_pt, cos_q_nt): x=nt_ss, size=hid_dim, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__fc__", learning_rate=base_lr, ), diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps6.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps6.py index 165a8b6240aaf..a5811d4e0f12a 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps6.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps6.py @@ -77,7 +77,7 @@ def get_loss(cos_q_pt, cos_q_nt): input=q, size=[dict_dim, emb_dim], param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__emb__", learning_rate=emb_lr, ), @@ -91,7 +91,7 @@ def get_loss(cos_q_pt, cos_q_nt): x=q_ss, size=hid_dim, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__q_fc__", learning_rate=base_lr, ), @@ -107,7 +107,7 @@ def get_loss(cos_q_pt, cos_q_nt): input=pt, size=[dict_dim, emb_dim], param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__emb__", learning_rate=emb_lr, ), @@ -121,7 +121,7 @@ def get_loss(cos_q_pt, cos_q_nt): x=pt_ss, size=hid_dim, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__fc__", learning_rate=base_lr, ), @@ -136,7 +136,7 @@ def get_loss(cos_q_pt, cos_q_nt): input=nt, size=[dict_dim, emb_dim], param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__emb__", learning_rate=emb_lr, ), @@ -150,7 +150,7 @@ def get_loss(cos_q_pt, cos_q_nt): x=nt_ss, size=hid_dim, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__fc__", learning_rate=base_lr, ), diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_sparse_embedding_ctr.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_sparse_embedding_ctr.py index 517232fa54eb8..fae692f8fd57c 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_sparse_embedding_ctr.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_sparse_embedding_ctr.py @@ -211,7 +211,7 @@ def net(): datas = [dnn_data, lr_data, label] inference = True - init = fluid.initializer.Uniform() + init = paddle.nn.initializer.Uniform() dnn_layer_dims = [128, 64, 32] dnn_embedding = fluid.contrib.layers.sparse_embedding( @@ -232,7 +232,7 @@ def net(): size=dim, activation="relu", weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01) + initializer=paddle.nn.initializer.Constant(value=0.01) ), name='dnn-fc-%d' % i, ) @@ -245,7 +245,7 @@ def net(): is_test=inference, param_attr=fluid.ParamAttr( name="wide_embedding", - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), ), ) diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_spmt.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_spmt.py index ba6e67a035095..ebcbfb9e4c4a6 100644 --- 
a/python/paddle/fluid/tests/unittests/test_dist_fleet_spmt.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_spmt.py @@ -75,7 +75,7 @@ def get_loss(cos_q_pt, cos_q_nt): input=q, size=[dict_dim, emb_dim], param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__emb__", learning_rate=emb_lr, ), @@ -89,7 +89,7 @@ def get_loss(cos_q_pt, cos_q_nt): x=q_ss, size=hid_dim, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__q_fc__", learning_rate=base_lr, ), @@ -105,7 +105,7 @@ def get_loss(cos_q_pt, cos_q_nt): input=pt, size=[dict_dim, emb_dim], param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__emb__", learning_rate=emb_lr, ), @@ -119,7 +119,7 @@ def get_loss(cos_q_pt, cos_q_nt): x=pt_ss, size=hid_dim, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__fc__", learning_rate=base_lr, ), @@ -134,7 +134,7 @@ def get_loss(cos_q_pt, cos_q_nt): input=nt, size=[dict_dim, emb_dim], param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__emb__", learning_rate=emb_lr, ), @@ -148,7 +148,7 @@ def get_loss(cos_q_pt, cos_q_nt): x=nt_ss, size=hid_dim, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), + initializer=paddle.nn.initializer.Constant(value=0.01), name="__fc__", learning_rate=base_lr, ), diff --git a/python/paddle/fluid/tests/unittests/test_dist_sparse_load_ps0.py b/python/paddle/fluid/tests/unittests/test_dist_sparse_load_ps0.py index 368be77fdbbfb..5ab7ad21dbdc9 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_sparse_load_ps0.py +++ b/python/paddle/fluid/tests/unittests/test_dist_sparse_load_ps0.py @@ -38,9 +38,7 @@ def net(self, emb_array, fc_array): size=[10, 10], param_attr=fluid.ParamAttr( name="embedding", - initializer=fluid.initializer.NumpyArrayInitializer( - emb_array - ), + initializer=paddle.nn.initializer.Assign(emb_array), ), ) @@ -50,9 +48,7 @@ def net(self, emb_array, fc_array): activation="relu", weight_attr=fluid.ParamAttr( name='fc', - initializer=fluid.initializer.NumpyArrayInitializer( - fc_array - ), + initializer=paddle.nn.initializer.Assign(fc_array), ), ) loss = paddle.mean(fc1) diff --git a/python/paddle/fluid/tests/unittests/test_dist_train.py b/python/paddle/fluid/tests/unittests/test_dist_train.py index 828b07baf7bbc..548f2bf8a0c83 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_train.py +++ b/python/paddle/fluid/tests/unittests/test_dist_train.py @@ -87,7 +87,9 @@ def init_serv(self, place): dtype='float32', name="X", ) - fluid.initializer.Constant(value=1.0)(x, main.global_block()) + paddle.nn.initializer.Constant(value=1.0)( + x, main.global_block() + ) ops._scale(x=x, scale=10.0, out=out_var) self.server_exe = fluid.Executor(place) @@ -108,7 +110,7 @@ def init_client(self, place, port): x = paddle.static.data(shape=[32, 32], dtype='float32', name='X') x.persistable = True - fluid.initializer.Constant(value=2.3)(x, main.global_block()) + paddle.nn.initializer.Constant(value=2.3)(x, main.global_block()) get_var = main.global_block().create_var( name="scale_0.tmp_0", # server side var @@ -116,7 +118,9 @@ def init_client(self, place, port): 
persistable=False, shape=[32, 32], ) - fluid.initializer.Constant(value=2.3)(get_var, main.global_block()) + paddle.nn.initializer.Constant(value=2.3)( + get_var, main.global_block() + ) # NOTE(zjl): `Send` is async send, which means that the sent # variable would be needed even though `Send` op runs. @@ -135,7 +139,7 @@ def run_local(self, place): main = fluid.Program() with fluid.program_guard(main): x = paddle.static.data(shape=[32, 32], dtype='float32', name='X') - fluid.initializer.Constant(value=2.3)(x, main.global_block()) + paddle.nn.initializer.Constant(value=2.3)(x, main.global_block()) o = paddle.scale(x=x, scale=10.0) exe = fluid.Executor(place) self.local_out = exe.run(main, fetch_list=[o]) diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py index e9b8f773c743b..e79a2f7276c00 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py +++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py @@ -356,7 +356,9 @@ def net_conf(self): size=[dict_size, embedding_size], param_attr=fluid.ParamAttr( name='emb', - initializer=fluid.initializer.Uniform(-init_width, init_width), + initializer=paddle.nn.initializer.Uniform( + -init_width, init_width + ), ), ) @@ -365,7 +367,8 @@ def net_conf(self): is_sparse=True, size=[dict_size, embedding_size], param_attr=fluid.ParamAttr( - name='emb_w', initializer=fluid.initializer.Constant(value=0.0) + name='emb_w', + initializer=paddle.nn.initializer.Constant(value=0.0), ), ) @@ -374,7 +377,8 @@ def net_conf(self): is_sparse=True, size=[dict_size, 1], param_attr=fluid.ParamAttr( - name='emb_b', initializer=fluid.initializer.Constant(value=0.0) + name='emb_b', + initializer=paddle.nn.initializer.Constant(value=0.0), ), ) @@ -1327,7 +1331,7 @@ def network_with_table(self, is_sparse, is_distributed): shape=[num_total_classes, 10], dtype='float32', name='nce_w', - initializer=fluid.initializer.ConstantInitializer(), + initializer=paddle.nn.initializer.Constant(), ) ) b_param = ( @@ -1337,7 +1341,7 @@ def network_with_table(self, is_sparse, is_distributed): shape=[num_total_classes, 1], dtype='float32', name='nce_b', - initializer=fluid.initializer.ConstantInitializer(), + initializer=paddle.nn.initializer.Constant(), ) ) @@ -1405,7 +1409,7 @@ def network_with_table(self, is_sparse, is_distributed): shape=[num_total_classes, 10], dtype='float32', name='hs_w', - initializer=fluid.initializer.ConstantInitializer(), + initializer=paddle.nn.initializer.Constant(), ) ) b_param = ( @@ -1415,7 +1419,7 @@ def network_with_table(self, is_sparse, is_distributed): shape=[3, 1], dtype='float32', name='hs_b', - initializer=fluid.initializer.ConstantInitializer(), + initializer=paddle.nn.initializer.Constant(), ) ) @@ -1424,7 +1428,7 @@ def network_with_table(self, is_sparse, is_distributed): is_sparse=is_sparse, size=[3, 3], param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Normal( + initializer=paddle.nn.initializer.Normal( scale=1 / math.sqrt(num_total_classes) ) ), diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_delete_vars.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_delete_vars.py index a12a17636bfc1..46977b13d7700 100644 --- a/python/paddle/fluid/tests/unittests/test_eager_deletion_delete_vars.py +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_delete_vars.py @@ -41,7 +41,7 @@ def simple_fc_net(): size=200, activation='tanh', bias_attr=fluid.ParamAttr( - 
initializer=fluid.initializer.Constant(value=1.0) + initializer=paddle.nn.initializer.Constant(value=1.0) ), ) prediction = paddle.static.nn.fc(hidden, size=10, activation='softmax') diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_padding_rnn.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_padding_rnn.py index 5657eb174c303..05df1e96d7505 100644 --- a/python/paddle/fluid/tests/unittests/test_eager_deletion_padding_rnn.py +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_padding_rnn.py @@ -129,7 +129,7 @@ def padding_rnn(input_embedding, len=3, init_hidden=None, init_cell=None): [hidden_size * 2, hidden_size * 4], dtype="float32", name="fc_weight1_" + str(i), - default_initializer=fluid.initializer.UniformInitializer( + default_initializer=paddle.nn.initializer.Uniform( low=-init_scale, high=init_scale ), ) @@ -138,7 +138,7 @@ def padding_rnn(input_embedding, len=3, init_hidden=None, init_cell=None): [hidden_size * 4], dtype="float32", name="fc_bias1_" + str(i), - default_initializer=fluid.initializer.Constant(0.0), + default_initializer=paddle.nn.initializer.Constant(0.0), ) bias_arr.append(bias_1) @@ -250,7 +250,7 @@ def encoder_static( [hidden_size * 2, hidden_size * 4], dtype="float32", name="fc_weight1_" + str(i), - default_initializer=fluid.initializer.UniformInitializer( + default_initializer=paddle.nn.initializer.Uniform( low=-init_scale, high=init_scale ), ) @@ -259,7 +259,7 @@ def encoder_static( [hidden_size * 4], dtype="float32", name="fc_bias1_" + str(i), - default_initializer=fluid.initializer.Constant(0.0), + default_initializer=paddle.nn.initializer.Constant(0.0), ) bias_arr.append(bias_1) @@ -368,7 +368,7 @@ def encoder_static( is_sparse=False, param_attr=fluid.ParamAttr( name='embedding_para', - initializer=fluid.initializer.UniformInitializer( + initializer=paddle.nn.initializer.Uniform( low=-init_scale, high=init_scale ), ), @@ -406,7 +406,7 @@ def encoder_static( [hidden_size, vocab_size], dtype="float32", name="softmax_weight", - default_initializer=fluid.initializer.UniformInitializer( + default_initializer=paddle.nn.initializer.Uniform( low=-init_scale, high=init_scale ), ) @@ -414,7 +414,7 @@ def encoder_static( [vocab_size], dtype="float32", name='softmax_bias', - default_initializer=fluid.initializer.UniformInitializer( + default_initializer=paddle.nn.initializer.Uniform( low=-init_scale, high=init_scale ), ) diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_recurrent_op.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_recurrent_op.py index bd4e08819570f..d9ce93c913017 100644 --- a/python/paddle/fluid/tests/unittests/test_eager_deletion_recurrent_op.py +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_recurrent_op.py @@ -312,7 +312,7 @@ def create_rnn_op(self): size=self.input_dim, weight_attr=ParamAttr( name='W', - initializer=fluid.initializer.ConstantInitializer(1.0), + initializer=paddle.nn.initializer.Constant(1.0), ), bias_attr=False, ) @@ -321,7 +321,7 @@ def create_rnn_op(self): size=self.input_dim, weight_attr=ParamAttr( name='U', - initializer=fluid.initializer.ConstantInitializer(0.0), + initializer=paddle.nn.initializer.Constant(0.0), ), bias_attr=False, ) diff --git a/python/paddle/fluid/tests/unittests/test_egr_python_api.py b/python/paddle/fluid/tests/unittests/test_egr_python_api.py index 9f05d354c463d..3fb03ac89f0d7 100644 --- a/python/paddle/fluid/tests/unittests/test_egr_python_api.py +++ b/python/paddle/fluid/tests/unittests/test_egr_python_api.py @@ 
-911,19 +911,19 @@ def func_fp16_initilaizer(self): 1, 3, bias_attr=False, - weight_attr=paddle.fluid.initializer.Uniform(), + weight_attr=paddle.nn.initializer.Uniform(), ) linear3 = paddle.nn.Linear( 1, 3, bias_attr=False, - weight_attr=paddle.fluid.initializer.TruncatedNormalInitializer(), + weight_attr=paddle.nn.initializer.TruncatedNormal(), ) linear4 = paddle.nn.Linear( 1, 3, bias_attr=False, - weight_attr=paddle.fluid.initializer.MSRAInitializer(), + weight_attr=paddle.nn.initializer.KaimingUniform(), ) res = [ linear1.weight.numpy(), diff --git a/python/paddle/fluid/tests/unittests/test_functional_conv2d.py b/python/paddle/fluid/tests/unittests/test_functional_conv2d.py index ce83ba62acb97..00cc6c07aac8b 100644 --- a/python/paddle/fluid/tests/unittests/test_functional_conv2d.py +++ b/python/paddle/fluid/tests/unittests/test_functional_conv2d.py @@ -19,7 +19,6 @@ import paddle import paddle.fluid.dygraph as dg -import paddle.fluid.initializer as I import paddle.nn.functional as F from paddle import fluid @@ -97,10 +96,10 @@ def static_graph_case_1(self): padding=self.padding, dilation=self.dilation, groups=self.groups, - param_attr=I.NumpyArrayInitializer(self.weight), + param_attr=paddle.nn.initializer.Assign(self.weight), bias_attr=False if self.no_bias - else I.NumpyArrayInitializer(self.bias), + else paddle.nn.initializer.Assign(self.bias), act=self.act, data_format=self.data_format, ) @@ -515,10 +514,10 @@ def static_graph_case(self): padding=self.padding, dilation=self.dilation, groups=self.groups, - param_attr=I.NumpyArrayInitializer(self.filter), + param_attr=paddle.nn.initializer.Assign(self.filter), bias_attr=False if self.bias is None - else I.NumpyArrayInitializer(self.bias), + else paddle.nn.initializer.Assign(self.bias), act=None, data_format=self.data_format, ) diff --git a/python/paddle/fluid/tests/unittests/test_functional_conv2d_transpose.py b/python/paddle/fluid/tests/unittests/test_functional_conv2d_transpose.py index f45cf48afbf0d..2981748cf6178 100644 --- a/python/paddle/fluid/tests/unittests/test_functional_conv2d_transpose.py +++ b/python/paddle/fluid/tests/unittests/test_functional_conv2d_transpose.py @@ -19,7 +19,6 @@ import paddle import paddle.fluid.dygraph as dg -import paddle.fluid.initializer as I import paddle.nn.functional as F from paddle import fluid @@ -99,10 +98,10 @@ def static_graph_case_1(self): padding=self.padding, dilation=self.dilation, groups=self.groups, - param_attr=I.NumpyArrayInitializer(self.weight), + param_attr=paddle.nn.initializer.Assign(self.weight), bias_attr=False if self.no_bias - else I.NumpyArrayInitializer(self.bias), + else paddle.nn.initializer.Assign(self.bias), data_format=self.data_format, ) exe = fluid.Executor(self.place) @@ -523,10 +522,10 @@ def static_graph_case(self): padding=self.padding, dilation=self.dilation, groups=self.groups, - param_attr=I.NumpyArrayInitializer(self.filter), + param_attr=paddle.nn.initializer.Assign(self.filter), bias_attr=False if self.bias is None - else I.NumpyArrayInitializer(self.bias), + else paddle.nn.initializer.Assign(self.bias), act=None, data_format=self.data_format, ) diff --git a/python/paddle/fluid/tests/unittests/test_functional_conv3d.py b/python/paddle/fluid/tests/unittests/test_functional_conv3d.py index bdd8360f97174..62322f8e3dc8f 100644 --- a/python/paddle/fluid/tests/unittests/test_functional_conv3d.py +++ b/python/paddle/fluid/tests/unittests/test_functional_conv3d.py @@ -19,7 +19,6 @@ import paddle import paddle.fluid.dygraph as dg -import paddle.fluid.initializer 
as I import paddle.nn.functional as F from paddle import fluid @@ -97,10 +96,10 @@ def static_graph_case_1(self): padding=self.padding, dilation=self.dilation, groups=self.groups, - param_attr=I.NumpyArrayInitializer(self.weight), + param_attr=paddle.nn.initializer.Assign(self.weight), bias_attr=False if self.no_bias - else I.NumpyArrayInitializer(self.bias), + else paddle.nn.initializer.Assign(self.bias), act=self.act, data_format=self.data_format, ) @@ -490,10 +489,10 @@ def static_graph_case(self): padding=self.padding, dilation=self.dilation, groups=self.groups, - param_attr=I.NumpyArrayInitializer(self.filter), + param_attr=paddle.nn.initializer.Assign(self.filter), bias_attr=False if self.bias is None - else I.NumpyArrayInitializer(self.bias), + else paddle.nn.initializer.Assign(self.bias), act=None, data_format=self.data_format, ) diff --git a/python/paddle/fluid/tests/unittests/test_functional_conv3d_transpose.py b/python/paddle/fluid/tests/unittests/test_functional_conv3d_transpose.py index ae402c874e639..7a8549b1240aa 100644 --- a/python/paddle/fluid/tests/unittests/test_functional_conv3d_transpose.py +++ b/python/paddle/fluid/tests/unittests/test_functional_conv3d_transpose.py @@ -19,7 +19,6 @@ import paddle import paddle.fluid.dygraph as dg -import paddle.fluid.initializer as I import paddle.nn.functional as F from paddle import fluid @@ -99,10 +98,10 @@ def static_graph_case_1(self): padding=self.padding, dilation=self.dilation, groups=self.groups, - param_attr=I.NumpyArrayInitializer(self.weight), + param_attr=paddle.nn.initializer.Assign(self.weight), bias_attr=False if self.no_bias - else I.NumpyArrayInitializer(self.bias), + else paddle.nn.initializer.Assign(self.bias), act=self.act, data_format=self.data_format, ) @@ -548,10 +547,10 @@ def static_graph_case(self): padding=self.padding, dilation=self.dilation, groups=self.groups, - param_attr=I.NumpyArrayInitializer(self.filter), + param_attr=paddle.nn.initializer.Assign(self.filter), bias_attr=False if self.bias is None - else I.NumpyArrayInitializer(self.bias), + else paddle.nn.initializer.Assign(self.bias), act=None, data_format=self.data_format, ) diff --git a/python/paddle/fluid/tests/unittests/test_fuse_bn_act_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_bn_act_pass.py index 4d7fb60d4660e..83574bae6b462 100644 --- a/python/paddle/fluid/tests/unittests/test_fuse_bn_act_pass.py +++ b/python/paddle/fluid/tests/unittests/test_fuse_bn_act_pass.py @@ -37,11 +37,11 @@ def build_program(self, main_program, startup_program, use_cuda, seed=1): ) param_attr = fluid.ParamAttr( name='batch_norm_w', - initializer=fluid.initializer.Constant(value=1.0), + initializer=paddle.nn.initializer.Constant(value=1.0), ) bias_attr = fluid.ParamAttr( name='batch_norm_b', - initializer=fluid.initializer.Constant(value=0.0), + initializer=paddle.nn.initializer.Constant(value=0.0), ) hidden2 = paddle.static.nn.batch_norm( input=hidden1, diff --git a/python/paddle/fluid/tests/unittests/test_fuse_bn_add_act_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_bn_add_act_pass.py index d981ccbe14ccb..c00f10d91d4b4 100644 --- a/python/paddle/fluid/tests/unittests/test_fuse_bn_add_act_pass.py +++ b/python/paddle/fluid/tests/unittests/test_fuse_bn_add_act_pass.py @@ -31,33 +31,33 @@ class TestFusedBnAddActAPI(unittest.TestCase): def setUp(self): self.conv_param_attr1 = fluid.ParamAttr( name='conv2d_1.weight', - initializer=fluid.initializer.Xavier(uniform=False), + initializer=paddle.nn.initializer.XavierNormal(), learning_rate=0.001, ) 
self.conv_param_attr2 = fluid.ParamAttr( name='conv2d_2.weight', - initializer=fluid.initializer.Xavier(uniform=False), + initializer=paddle.nn.initializer.XavierNormal(), learning_rate=0.001, ) self.bn_param_attr1 = fluid.ParamAttr( name='batch_norm_w_1', - initializer=fluid.initializer.Constant(value=1.0), + initializer=paddle.nn.initializer.Constant(value=1.0), ) self.bn_bias_attr1 = fluid.ParamAttr( name='batch_norm_b_1', - initializer=fluid.initializer.Constant(value=0.0), + initializer=paddle.nn.initializer.Constant(value=0.0), ) self.bn_param_attr2 = fluid.ParamAttr( name='batch_norm_w_2', - initializer=fluid.initializer.Constant(value=1.0), + initializer=paddle.nn.initializer.Constant(value=1.0), ) self.bn_bias_attr2 = fluid.ParamAttr( name='batch_norm_b_2', - initializer=fluid.initializer.Constant(value=0.0), + initializer=paddle.nn.initializer.Constant(value=0.0), ) self.fc_param_attr = fluid.ParamAttr( name='fc.weight', - initializer=fluid.initializer.Xavier(uniform=False), + initializer=paddle.nn.initializer.XavierNormal(), ) def build_fused_program( diff --git a/python/paddle/fluid/tests/unittests/test_fused_multi_transformer_op.py b/python/paddle/fluid/tests/unittests/test_fused_multi_transformer_op.py index 8068387cfdcba..9264c8f2e77c6 100644 --- a/python/paddle/fluid/tests/unittests/test_fused_multi_transformer_op.py +++ b/python/paddle/fluid/tests/unittests/test_fused_multi_transformer_op.py @@ -53,7 +53,7 @@ def setUp(self): self.__class__.no_need_check_grad = False bias_attr = paddle.fluid.ParamAttr( - initializer=paddle.fluid.initializer.Constant(value=0.0005) + initializer=paddle.nn.initializer.Constant(value=0.0005) ) self.q_proj = Linear( self.embed_dim, @@ -1027,16 +1027,16 @@ def config(self): self.has_attn_mask = False self.x_type = np.float32 self.weight_attr = paddle.ParamAttr( - initializer=paddle.fluid.initializer.Constant(0.0) + initializer=paddle.nn.initializer.Constant(0.0) ) self.bias_attr = paddle.ParamAttr( - initializer=paddle.fluid.initializer.Constant(0.0005) + initializer=paddle.nn.initializer.Constant(0.0005) ) self.ln_w_attr = paddle.ParamAttr( - initializer=paddle.fluid.initializer.Constant(1.0) + initializer=paddle.nn.initializer.Constant(1.0) ) self.ln_b_attr = paddle.ParamAttr( - initializer=paddle.fluid.initializer.Constant(0.0) + initializer=paddle.nn.initializer.Constant(0.0) ) def test_fused_multi_transformer_op(self): diff --git a/python/paddle/fluid/tests/unittests/test_generator_dataloader.py b/python/paddle/fluid/tests/unittests/test_generator_dataloader.py index fcbc91edee31e..b0625050b889f 100644 --- a/python/paddle/fluid/tests/unittests/test_generator_dataloader.py +++ b/python/paddle/fluid/tests/unittests/test_generator_dataloader.py @@ -62,7 +62,7 @@ def simple_fc_net(places, use_legacy_py_reader, use_double_buffer): size=hidden_size, activation='tanh', bias_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=1.0) + initializer=paddle.nn.initializer.Constant(value=1.0) ), ) diff --git a/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py index 75e5d1ee2ee15..abf0ba0ac2650 100644 --- a/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py +++ b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py @@ -20,7 +20,6 @@ import paddle import paddle.fluid as fluid -import paddle.fluid.initializer as I import paddle.nn.functional as F paddle.enable_static() @@ -302,7 +301,7 @@ def hs_net_conf(self, is_sparse):
is_sparse=is_sparse, size=[3, 3], param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Normal(scale=1 / math.sqrt(3)) + initializer=paddle.nn.initializer.Normal(std=1 / math.sqrt(3)) ), ) @@ -555,8 +554,8 @@ def test_dygraph_api(self): x, labels, self.num_classes, weight, bias, path_table, path_code ) - weight_attr = I.NumpyArrayInitializer(self.weight_np) - bias_attr = I.NumpyArrayInitializer(self.bias_np) + weight_attr = paddle.nn.initializer.Assign(self.weight_np) + bias_attr = paddle.nn.initializer.Assign(self.bias_np) m = paddle.nn.HSigmoidLoss( self.feature_size, self.num_classes, @@ -593,10 +592,10 @@ def test_static_api(self): ) weight_attr = paddle.framework.ParamAttr( - initializer=I.NumpyArrayInitializer(self.weight_np) + initializer=paddle.nn.initializer.Assign(self.weight_np) ) bias_attr = paddle.framework.ParamAttr( - initializer=I.NumpyArrayInitializer(self.bias_np) + initializer=paddle.nn.initializer.Assign(self.bias_np) ) m = paddle.nn.HSigmoidLoss( self.feature_size, @@ -636,8 +635,8 @@ def test_fluid_api(self): if self.is_custom: path_table = fluid.data('path_table', [-1, -1], 'int64') path_code = fluid.data('path_code', [-1, -1], 'int64') - weight_attr = I.NumpyArrayInitializer(self.weight_np) - bias_attr = I.NumpyArrayInitializer(self.bias_np) + weight_attr = paddle.nn.initializer.Assign(self.weight_np) + bias_attr = paddle.nn.initializer.Assign(self.bias_np) loss = paddle.nn.HSigmoidLoss( feature_size=x.shape[1], num_classes=self.num_classes, diff --git a/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py b/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py index f34c8d6a2a858..51e32c5259f45 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py @@ -123,7 +123,7 @@ def __init__(self, num_users, num_items, matrix): shape=matrix.shape, dtype=matrix.dtype, is_bias=False, - default_initializer=fluid.initializer.NumpyArrayInitializer(matrix), + default_initializer=paddle.nn.initializer.Assign(matrix), ) self._rating_matrix.stop_gradient = True diff --git a/python/paddle/fluid/tests/unittests/test_imperative_lod_tensor_to_selected_rows.py b/python/paddle/fluid/tests/unittests/test_imperative_lod_tensor_to_selected_rows.py index 0eb037bc6a02e..af6e32ac6b897 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_lod_tensor_to_selected_rows.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_lod_tensor_to_selected_rows.py @@ -47,7 +47,7 @@ def __init__( sparse=is_sparse, weight_attr=fluid.ParamAttr( name='embedding_para', - initializer=fluid.initializer.UniformInitializer( + initializer=paddle.nn.initializer.Uniform( low=-init_scale, high=init_scale ), ), @@ -56,7 +56,7 @@ def __init__( attr=fluid.ParamAttr(), shape=[self.vocab_size], dtype=dtype, - default_initializer=fluid.initializer.UniformInitializer( + default_initializer=paddle.nn.initializer.Uniform( low=-self.init_scale, high=self.init_scale ), ) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py b/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py index 12be3af2d9cf9..5c48252cb0b7f 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py @@ -77,12 +77,12 @@ def __init__( filter_size = 3 conv_std_0 = (2.0 / (filter_size**2 * channels[0])) ** 0.5 conv_param_0 = fluid.ParamAttr( - 
initializer=fluid.initializer.Normal(0.0, conv_std_0) + initializer=paddle.nn.initializer.Normal(0.0, conv_std_0) ) conv_std_1 = (2.0 / (filter_size**2 * channels[1])) ** 0.5 conv_param_1 = fluid.ParamAttr( - initializer=fluid.initializer.Normal(0.0, conv_std_1) + initializer=paddle.nn.initializer.Normal(0.0, conv_std_1) ) self.conv_0_layer = paddle.nn.Conv2D( @@ -200,10 +200,11 @@ def __init__( super().__init__() self.rnn_hidden_size = rnn_hidden_size para_attr = fluid.ParamAttr( - initializer=fluid.initializer.Normal(0.0, 0.02) + initializer=paddle.nn.initializer.Normal(0.0, 0.02) ) bias_attr = fluid.ParamAttr( - initializer=fluid.initializer.Normal(0.0, 0.02), learning_rate=2.0 + initializer=paddle.nn.initializer.Normal(0.0, 0.02), + learning_rate=2.0, ) if fluid.framework._non_static_mode(): h_0 = np.zeros( diff --git a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py index 6eb0c9d6e6c03..8917230d52c4e 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py @@ -51,26 +51,26 @@ def _create_parameter(self): for i in range(self._num_layers): weight_1 = self.create_parameter( attr=fluid.ParamAttr( - initializer=fluid.initializer.UniformInitializer( + initializer=paddle.nn.initializer.Uniform( low=-self._init_scale, high=self._init_scale ) ), shape=[self._hidden_size * 2, self._hidden_size * 4], dtype="float32", - default_initializer=fluid.initializer.UniformInitializer( + default_initializer=paddle.nn.initializer.Uniform( low=-self._init_scale, high=self._init_scale ), ) self.weight_1_arr.append(self.add_parameter('w_%d' % i, weight_1)) bias_1 = self.create_parameter( attr=fluid.ParamAttr( - initializer=fluid.initializer.UniformInitializer( + initializer=paddle.nn.initializer.Uniform( low=-self._init_scale, high=self._init_scale ) ), shape=[self._hidden_size * 4], dtype="float32", - default_initializer=fluid.initializer.Constant(0.0), + default_initializer=paddle.nn.initializer.Constant(0.0), ) self.bias_arr.append(self.add_parameter('b_%d' % i, bias_1)) @@ -176,7 +176,7 @@ def __init__( sparse=is_sparse, weight_attr=fluid.ParamAttr( name='embedding_para', - initializer=fluid.initializer.UniformInitializer( + initializer=paddle.nn.initializer.Uniform( low=-init_scale, high=init_scale ), ), @@ -185,7 +185,7 @@ def __init__( attr=fluid.ParamAttr(), shape=[self.hidden_size, self.vocab_size], dtype="float32", - default_initializer=fluid.initializer.UniformInitializer( + default_initializer=paddle.nn.initializer.Uniform( low=-self.init_scale, high=self.init_scale ), ) @@ -193,7 +193,7 @@ def __init__( attr=fluid.ParamAttr(), shape=[self.vocab_size], dtype="float32", - default_initializer=fluid.initializer.UniformInitializer( + default_initializer=paddle.nn.initializer.Uniform( low=-self.init_scale, high=self.init_scale ), ) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_resnet.py b/python/paddle/fluid/tests/unittests/test_imperative_resnet.py index bc46ad12d3df0..2936b0730386f 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_resnet.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_resnet.py @@ -226,7 +226,7 @@ def __init__(self, layers=50, class_dim=102, use_cudnn=True): self.pool2d_avg_output, class_dim, weight_attr=fluid.param_attr.ParamAttr( - initializer=fluid.initializer.Uniform(-stdv, stdv) + initializer=paddle.nn.initializer.Uniform(-stdv, stdv) ), ) diff --git 
a/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py b/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py index 7fd322d358366..2ef0b8afcc5c7 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py @@ -48,26 +48,26 @@ def __init__( for i in range(self._num_layers): weight_1 = self.create_parameter( attr=fluid.ParamAttr( - initializer=fluid.initializer.UniformInitializer( + initializer=paddle.nn.initializer.Uniform( low=-self._init_scale, high=self._init_scale ) ), shape=[self._hidden_size * 2, self._hidden_size * 4], dtype="float32", - default_initializer=fluid.initializer.UniformInitializer( + default_initializer=paddle.nn.initializer.Uniform( low=-self._init_scale, high=self._init_scale ), ) self.weight_1_arr.append(self.add_parameter('w_%d' % i, weight_1)) bias_1 = self.create_parameter( attr=fluid.ParamAttr( - initializer=fluid.initializer.UniformInitializer( + initializer=paddle.nn.initializer.Uniform( low=-self._init_scale, high=self._init_scale ) ), shape=[self._hidden_size * 4], dtype="float32", - default_initializer=fluid.initializer.Constant(0.0), + default_initializer=paddle.nn.initializer.Constant(0.0), ) self.bias_arr.append(self.add_parameter('b_%d' % i, bias_1)) @@ -172,7 +172,7 @@ def __init__( sparse=False, weight_attr=fluid.ParamAttr( name='embedding_para', - initializer=fluid.initializer.UniformInitializer( + initializer=paddle.nn.initializer.Uniform( low=-init_scale, high=init_scale ), ), @@ -182,7 +182,7 @@ def __init__( attr=fluid.ParamAttr(), shape=[self.hidden_size, self.vocab_size], dtype="float32", - default_initializer=fluid.initializer.UniformInitializer( + default_initializer=paddle.nn.initializer.Uniform( low=-self.init_scale, high=self.init_scale ), ) @@ -190,7 +190,7 @@ def __init__( attr=fluid.ParamAttr(), shape=[self.vocab_size], dtype="float32", - default_initializer=fluid.initializer.UniformInitializer( + default_initializer=paddle.nn.initializer.Uniform( low=-self.init_scale, high=self.init_scale ), ) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_selected_rows_to_lod_tensor.py b/python/paddle/fluid/tests/unittests/test_imperative_selected_rows_to_lod_tensor.py index 647710fba61f1..fb833c6525846 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_selected_rows_to_lod_tensor.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_selected_rows_to_lod_tensor.py @@ -48,7 +48,7 @@ def __init__( sparse=is_sparse, weight_attr=fluid.ParamAttr( name='embedding_para', - initializer=fluid.initializer.UniformInitializer( + initializer=paddle.nn.initializer.Uniform( low=-init_scale, high=init_scale ), ), @@ -57,7 +57,7 @@ def __init__( attr=fluid.ParamAttr(), shape=[self.hidden_size, self.hidden_size], dtype=dtype, - default_initializer=fluid.initializer.UniformInitializer( + default_initializer=paddle.nn.initializer.Uniform( low=-self.init_scale, high=self.init_scale ), ) @@ -65,7 +65,7 @@ def __init__( attr=fluid.ParamAttr(), shape=[self.hidden_size], dtype=dtype, - default_initializer=fluid.initializer.UniformInitializer( + default_initializer=paddle.nn.initializer.Uniform( low=-self.init_scale, high=self.init_scale ), ) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py b/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py index e171899289aa4..46bd8890d21da 100644 --- 
a/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py @@ -399,10 +399,10 @@ def __init__(self, d_model, process_cmd, shape_len=None): self._layer_norm = paddle.nn.LayerNorm( normalized_shape=d_model, weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(1.0) + initializer=paddle.nn.initializer.Constant(1.0) ), bias_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(0.0) + initializer=paddle.nn.initializer.Constant(0.0) ), ) @@ -662,7 +662,9 @@ def __init__( sparse=is_sparse, weight_attr=fluid.ParamAttr( name=word_emb_param_name, - initializer=fluid.initializer.Normal(0.0, src_emb_dim**-0.5), + initializer=paddle.nn.initializer.Normal( + 0.0, src_emb_dim**-0.5 + ), ), ) @@ -676,7 +678,7 @@ def __init__( sparse=is_sparse, weight_attr=fluid.ParamAttr( name=pos_enc_param_name, - initializer=fluid.initializer.NumpyArrayInitializer(pos_inp), + initializer=paddle.nn.initializer.Assign(pos_inp), trainable=False, ), ) diff --git a/python/paddle/fluid/tests/unittests/test_initializer.py b/python/paddle/fluid/tests/unittests/test_initializer.py index 07d9d7b48c29f..f87e62cb02098 100644 --- a/python/paddle/fluid/tests/unittests/test_initializer.py +++ b/python/paddle/fluid/tests/unittests/test_initializer.py @@ -20,7 +20,6 @@ import paddle import paddle.fluid as fluid import paddle.fluid.framework as framework -import paddle.fluid.initializer as initializer from paddle.fluid.core import VarDesc from paddle.regularizer import L2Decay @@ -67,7 +66,7 @@ def test_constant_initializer_default_value(self, dtype="float32"): shape=[5, 10], lod_level=0, name="param", - initializer=initializer.ConstantInitializer(), + initializer=paddle.nn.initializer.Constant(), ) num_ops = 1 self.assertEqual(len(block.ops), num_ops) @@ -86,7 +85,7 @@ def test_constant_initializer(self, dtype="float32"): shape=[5, 10], lod_level=0, name="param", - initializer=initializer.ConstantInitializer(2.3), + initializer=paddle.nn.initializer.Constant(2.3), ) num_ops = 1 self.assertEqual(len(block.ops), num_ops) @@ -119,7 +118,7 @@ def test_uniform_initializer_default_value(self, dtype="float32"): shape=[5, 10], lod_level=0, name="param", - initializer=initializer.UniformInitializer(), + initializer=paddle.nn.initializer.Uniform(), ) num_ops = 2 if dtype == "float16" else 1 self.assertEqual(len(block.ops), num_ops) @@ -141,14 +140,14 @@ def test_uniform_initializer_random_seed(self): shape=[5, 10], lod_level=0, name="param1", - initializer=initializer.UniformInitializer(), + initializer=paddle.nn.initializer.Uniform(), ) block.create_parameter( dtype="float32", shape=[5, 10], lod_level=0, name="param2", - initializer=initializer.UniformInitializer(seed=456), + initializer=paddle.nn.initializer.UniformInitializer(seed=456), ) init_op = block.ops[1] self.assertEqual(init_op.attr("seed"), 456) @@ -165,7 +164,9 @@ def test_uniform_initializer(self, dtype="float32"): shape=[5, 10], lod_level=0, name="param", - initializer=initializer.UniformInitializer(-4.2, 3.1, 123), + initializer=paddle.nn.initializer.UniformInitializer( + -4.2, 3.1, 123 + ), ) num_ops = 2 if dtype == "float16" else 1 self.assertEqual(len(block.ops), num_ops) @@ -186,7 +187,9 @@ def test_uniform_initializer_two_op(self, dtype="float32"): shape=[5, 10], lod_level=0, name="param", - initializer=initializer.UniformInitializer(-4.2, float(i), 123), + initializer=paddle.nn.initializer.UniformInitializer( + -4.2, float(i), 123 + ), ) 
num_ops = 2 if dtype == "float16" else 1 self.assertEqual(len(block.ops), num_ops) @@ -226,7 +229,7 @@ def test_normal_initializer_default_value(self): shape=[5, 10], lod_level=0, name="param", - initializer=initializer.NormalInitializer(), + initializer=paddle.nn.initializer.Normal(), ) self.assertEqual(len(block.ops), 1) init_op = block.ops[0] @@ -245,7 +248,9 @@ def test_normal_initializer(self, dtype="float32"): shape=[5, 10], lod_level=0, name="param", - initializer=initializer.NormalInitializer(2.3, 1.9, 123), + initializer=paddle.nn.initializer.NormalInitializer( + 2.3, 1.9, 123 + ), ) num_ops = 1 self.assertEqual(len(block.ops), num_ops) @@ -278,7 +283,7 @@ def test_uniform_xavier_initializer(self): shape=[5, 10], lod_level=0, name="param", - initializer=initializer.XavierInitializer(), + initializer=paddle.nn.initializer.XavierUniform(), ) self.assertEqual(len(block.ops), 1) init_op = block.ops[0] @@ -300,7 +305,7 @@ def test_uniform_xavier_initializer_conv(self): shape=[5, 10, 15, 20], lod_level=0, name="param", - initializer=initializer.XavierInitializer(), + initializer=paddle.nn.initializer.XavierUniform(), ) self.assertEqual(len(block.ops), 1) init_op = block.ops[0] @@ -325,7 +330,7 @@ def test_normal_xavier_initializer(self): shape=[5, 10], lod_level=0, name="param", - initializer=initializer.XavierInitializer(uniform=False), + initializer=paddle.nn.initializer.XavierNormal(), ) self.assertEqual(len(block.ops), 1) init_op = block.ops[0] @@ -347,7 +352,7 @@ def test_normal_xavier_initializer_conv(self): shape=[5, 10, 15, 20], lod_level=0, name="param", - initializer=initializer.XavierInitializer(uniform=False), + initializer=paddle.nn.initializer.XavierNormal(), ) self.assertEqual(len(block.ops), 1) init_op = block.ops[0] @@ -372,7 +377,7 @@ def test_xavier_initializer_supplied_arguments( shape=[5, 10], lod_level=0, name="param", - initializer=initializer.XavierInitializer( + initializer=paddle.nn.initializer.XavierInitializer( uniform=uniform, fan_in=12, fan_out=23, seed=134 ), ) @@ -421,7 +426,7 @@ def test_uniform_msra_initializer(self): shape=[5, 10], lod_level=0, name="param", - initializer=initializer.MSRAInitializer(), + initializer=paddle.nn.initializer.KaimingUniform(), ) self.assertEqual(len(block.ops), 1) init_op = block.ops[0] @@ -443,7 +448,7 @@ def test_uniform_msra_initializer_conv(self): shape=[5, 10, 15, 20], lod_level=0, name="param", - initializer=initializer.MSRAInitializer(), + initializer=paddle.nn.initializer.KaimingUniform(), ) self.assertEqual(len(block.ops), 1) init_op = block.ops[0] @@ -466,7 +471,7 @@ def test_normal_msra_initializer(self): shape=[5, 10], lod_level=0, name="param", - initializer=initializer.MSRAInitializer(uniform=False), + initializer=paddle.nn.initializer.KaimingNormal(), ) self.assertEqual(len(block.ops), 1) init_op = block.ops[0] @@ -488,7 +493,7 @@ def test_normal_msra_initializer_conv(self): shape=[5, 10, 15, 20], lod_level=0, name="param", - initializer=initializer.MSRAInitializer(uniform=False), + initializer=paddle.nn.initializer.KaimingNormal(), ) self.assertEqual(len(block.ops), 1) init_op = block.ops[0] @@ -509,7 +514,9 @@ def test_msra_initializer_supplied_arguments(self, dtype="float32"): shape=[5, 10], lod_level=0, name="param", - initializer=initializer.MSRAInitializer(fan_in=12, seed=134), + initializer=paddle.nn.initializer.MSRAInitializer( + fan_in=12, seed=134 + ), ) num_ops = 2 if dtype == "float16" else 1 self.assertEqual(len(block.ops), num_ops) @@ -542,7 +549,7 @@ def test_bilinear_initializer(self, 
dtype="float32"): shape=[8, 1, 3, 3], lod_level=0, name="param", - initializer=initializer.BilinearInitializer(), + initializer=paddle.nn.initializer.Bilinear(), ) num_ops = 2 if dtype in ["float16", "uint16", "float64"] else 1 self.assertEqual(len(block.ops), num_ops) @@ -576,7 +583,7 @@ def func_test_case(self): w_attr = paddle.ParamAttr( learning_rate=0.0, regularizer=L2Decay(0.0), - initializer=initializer.BilinearInitializer(), + initializer=paddle.nn.initializer.Bilinear(), ) data = paddle.rand([B, 3, H, W], dtype='float32') conv_up = paddle.nn.Conv2DTranspose( @@ -597,7 +604,7 @@ def func_test_case_fp16(self): w_attr = paddle.ParamAttr( learning_rate=0.0, regularizer=L2Decay(0.0), - initializer=initializer.BilinearInitializer(), + initializer=paddle.nn.initializer.Bilinear(), ) conv2d = paddle.nn.Conv2D(1, 2, 3, weight_attr=w_attr) paddle.set_default_dtype("float32") @@ -632,7 +639,7 @@ def test_numpy_array_initializer(self, dtype="float32"): shape=np_array.shape, lod_level=0, name="param", - initializer=initializer.NumpyArrayInitializer(np_array), + initializer=paddle.nn.initializer.Assign(np_array), ) num_ops = 2 if dtype in ["float16", "uint16"] else 1 self.assertEqual(len(block.ops), num_ops) @@ -657,7 +664,9 @@ def test_set_global_weight_initilizer(self): """Test Set Global Param initilizer with UniformInitializer""" main_prog = framework.Program() startup_prog = framework.Program() - fluid.set_global_initializer(initializer.Uniform(low=-0.5, high=0.5)) + fluid.set_global_initializer( + paddle.nn.initializer.Uniform(low=-0.5, high=0.5) + ) with fluid.program_guard(main_prog, startup_prog): x = fluid.data(name="x", shape=[1, 3, 32, 32]) # default initilizer of param in layers.conv2d is NormalInitializer @@ -683,8 +692,8 @@ def test_set_global_bias_initilizer(self): main_prog = framework.Program() startup_prog = framework.Program() fluid.set_global_initializer( - initializer.Uniform(low=-0.5, high=0.5), - bias_init=initializer.Normal(loc=0.0, scale=2.0), + paddle.nn.initializer.Uniform(low=-0.5, high=0.5), + bias_init=paddle.nn.initializer.Normal(0.0, 2.0), ) with fluid.program_guard(main_prog, startup_prog): x = fluid.data(name="x", shape=[1, 3, 32, 32]) @@ -746,9 +755,7 @@ def test_xvarier_initializer(self, dtype="float32"): tensor = paddle.zeros([1024, 1024, 16]) tensor.stop_gradient = False - xavier_ = paddle.fluid.initializer.XavierInitializer( - uniform=False, fan_in=3, fan_out=5 - ) + xavier_ = paddle.nn.initializer.XavierNormal(fan_in=3, fan_out=5) xavier_(tensor) hist, _ = output_hist(tensor.numpy()) @@ -771,9 +778,7 @@ def test_msra_initializer(self, dtype="float32"): tensor = paddle.zeros([1024, 1024, 16]) tensor.stop_gradient = False - msra_ = paddle.fluid.initializer.MSRAInitializer( - uniform=False, fan_in=4 - ) + msra_ = paddle.nn.initializer.KaimingNormal(fan_in=4) msra_(tensor) hist, _ = output_hist(tensor.numpy()) @@ -1188,7 +1193,7 @@ def func_kaiminguniform_initializer_fan_in_zero(self): def test_type_error(self): self.assertRaises( - ValueError, self.func_kaiminguniform_initializer_fan_in_zero + ZeroDivisionError, self.func_kaiminguniform_initializer_fan_in_zero ) diff --git a/python/paddle/fluid/tests/unittests/test_ir_inplace_pass.py b/python/paddle/fluid/tests/unittests/test_ir_inplace_pass.py index bc4ef3d386ccb..7dcf964c41e31 100644 --- a/python/paddle/fluid/tests/unittests/test_ir_inplace_pass.py +++ b/python/paddle/fluid/tests/unittests/test_ir_inplace_pass.py @@ -34,7 +34,7 @@ def fc_with_batchnorm(use_feed): size=200, activation='tanh', 
bias_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=1.0) + initializer=paddle.nn.initializer.Constant(value=1.0) ), ) diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 51715e2ae1ce2..192585e6c16db 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -138,7 +138,9 @@ def test_linear(self): name='data', shape=[3, 32, 32], dtype='float32' ) linear = paddle.nn.Linear( - 32, 4, bias_attr=fluid.initializer.ConstantInitializer(value=1) + 32, + 4, + bias_attr=paddle.nn.initializer.Constant(value=1), ) ret = linear(t) static_ret = self.get_static_graph_result( @@ -147,7 +149,9 @@ def test_linear(self): with self.dynamic_graph(): t = base.to_variable(inp) linear = paddle.nn.Linear( - 32, 4, bias_attr=fluid.initializer.ConstantInitializer(value=1) + 32, + 4, + bias_attr=paddle.nn.initializer.Constant(value=1), ) dy_ret = linear(t) dy_ret_value = dy_ret.numpy() @@ -162,7 +166,7 @@ def test_Variable(): linear = paddle.nn.Linear( 32, 4, - bias_attr=fluid.initializer.ConstantInitializer(value=1), + bias_attr=paddle.nn.initializer.Constant(value=1), ) linear_ret1 = linear(inp) @@ -175,7 +179,7 @@ def test_type(): linear = paddle.nn.Linear( 32, 4, - bias_attr=fluid.initializer.ConstantInitializer(value=1), + bias_attr=paddle.nn.initializer.Constant(value=1), ) linear_ret2 = linear(inp) @@ -248,7 +252,7 @@ def test_Variable(): linear = paddle.nn.Linear( 32, 4, - bias_attr=fluid.initializer.ConstantInitializer(value=1), + bias_attr=paddle.nn.initializer.Constant(value=1), ) linear_ret1 = linear(inp) @@ -261,7 +265,7 @@ def test_type(): linear = paddle.nn.Linear( 32, 4, - bias_attr=fluid.initializer.ConstantInitializer(value=1), + bias_attr=paddle.nn.initializer.Constant(value=1), ) linear_ret2 = linear(inp) @@ -396,7 +400,7 @@ def test_conv2d_transpose(self): num_filters=10, filter_size=27, act='sigmoid', - bias_attr=fluid.initializer.ConstantInitializer(value=1), + bias_attr=paddle.nn.initializer.Constant(value=1), ) static_rlt = self.get_static_graph_result( feed={'pixel': inp_np}, fetch_list=[out] @@ -409,7 +413,7 @@ def test_conv2d_transpose(self): 3, 10, 27, - bias_attr=fluid.initializer.ConstantInitializer(value=1), + bias_attr=paddle.nn.initializer.Constant(value=1), ) out = conv2d_transpose(img) out = paddle.nn.functional.sigmoid(out) @@ -421,7 +425,7 @@ def test_conv2d_transpose(self): 3, 10, 27, - bias_attr=fluid.initializer.ConstantInitializer(value=1), + bias_attr=paddle.nn.initializer.Constant(value=1), ) dy_rlt = conv2d_transpose(base.to_variable(inp_np)) dy_rlt = paddle.nn.functional.sigmoid(dy_rlt) @@ -433,9 +437,7 @@ def test_conv2d_transpose(self): images = np.ones([2, 3, 5, 5], dtype='float32') custom_weight = np.random.randn(3, 3, 2, 2).astype("float32") weight_attr = fluid.ParamAttr( - initializer=fluid.initializer.NumpyArrayInitializer( - custom_weight - ) + initializer=paddle.nn.initializer.Assign(custom_weight) ) conv2d1 = paddle.nn.Conv2DTranspose(3, 3, [2, 2]) conv2d2 = paddle.nn.Conv2DTranspose( @@ -503,7 +505,7 @@ def test_bilinear_tensor_product(self): data_x, data_y, 6, - bias_attr=fluid.initializer.ConstantInitializer(value=1), + bias_attr=paddle.nn.initializer.Constant(value=1), act='sigmoid', ) @@ -518,7 +520,7 @@ def test_bilinear_tensor_product(self): 3, 3, 6, - bias_attr=fluid.initializer.ConstantInitializer(value=1), + bias_attr=paddle.nn.initializer.Constant(value=1), ) out = btp(data_x, data_y) out = 
paddle.nn.functional.sigmoid(out) @@ -530,7 +532,7 @@ def test_bilinear_tensor_product(self): 3, 3, 6, - bias_attr=fluid.initializer.ConstantInitializer(value=1), + bias_attr=paddle.nn.initializer.Constant(value=1), ) dy_rlt = btp(base.to_variable(inp_np_x), base.to_variable(inp_np_y)) dy_rlt = paddle.nn.functional.sigmoid(dy_rlt) @@ -566,9 +568,7 @@ def test_bilinear_tensor_product(self): with self.dynamic_graph(): custom_weight = np.random.randn(6, 3, 3).astype("float32") weight_attr = fluid.ParamAttr( - initializer=fluid.initializer.NumpyArrayInitializer( - custom_weight - ) + initializer=paddle.nn.initializer.Assign(custom_weight) ) btp1 = paddle.nn.Bilinear(3, 3, 6) btp2 = paddle.nn.Bilinear(3, 3, 6, weight_attr=weight_attr) @@ -641,9 +641,7 @@ def test_embeding(self): with self.dynamic_graph(): custom_weight = np.random.randn(dict_size, 32).astype("float32") weight_attr = fluid.ParamAttr( - initializer=fluid.initializer.NumpyArrayInitializer( - custom_weight - ) + initializer=paddle.nn.initializer.Assign(custom_weight) ) emb1 = paddle.nn.Embedding(dict_size, 32, sparse=False) emb2 = paddle.nn.Embedding( @@ -741,9 +739,7 @@ def test_conv3d(self): images = np.ones([2, 3, 6, 6, 6], dtype='float32') custom_weight = np.random.randn(3, 3, 2, 2, 2).astype("float32") weight_attr = fluid.ParamAttr( - initializer=fluid.initializer.NumpyArrayInitializer( - custom_weight - ) + initializer=paddle.nn.initializer.Assign(custom_weight) ) conv3d1 = paddle.nn.Conv3D( in_channels=3, out_channels=3, kernel_size=2 @@ -798,8 +794,8 @@ def test_group_norm(self): ret = paddle.static.nn.group_norm( input=X, groups=2, - param_attr=fluid.initializer.Uniform(low=-0.5, high=0.5), - bias_attr=fluid.initializer.ConstantInitializer(value=1), + param_attr=paddle.nn.initializer.Uniform(low=-0.5, high=0.5), + bias_attr=paddle.nn.initializer.Constant(value=1), ) static_ret = self.get_static_graph_result( feed={ @@ -818,8 +814,8 @@ def test_group_norm(self): groupNorm = paddle.nn.GroupNorm( num_channels=shape[1], num_groups=2, - weight_attr=fluid.initializer.Uniform(low=-0.5, high=0.5), - bias_attr=fluid.initializer.ConstantInitializer(value=1), + weight_attr=paddle.nn.initializer.Uniform(low=-0.5, high=0.5), + bias_attr=paddle.nn.initializer.Constant(value=1), ) ret = groupNorm(X) static_ret2 = self.get_static_graph_result( @@ -836,8 +832,8 @@ def test_group_norm(self): groupNorm = paddle.nn.GroupNorm( num_channels=shape[1], num_groups=2, - weight_attr=fluid.initializer.Uniform(low=-0.5, high=0.5), - bias_attr=fluid.initializer.ConstantInitializer(value=1), + weight_attr=paddle.nn.initializer.Uniform(low=-0.5, high=0.5), + bias_attr=paddle.nn.initializer.Constant(value=1), ) dy_ret = groupNorm(base.to_variable(input)) dy_rlt_value = dy_ret.numpy() @@ -990,9 +986,7 @@ def test_conv3d_transpose(self): images = np.ones([2, 3, 6, 6, 6], dtype='float32') custom_weight = np.random.randn(3, 3, 2, 2, 2).astype("float32") weight_attr = fluid.ParamAttr( - initializer=fluid.initializer.NumpyArrayInitializer( - custom_weight - ) + initializer=paddle.nn.initializer.Assign(custom_weight) ) conv3d1 = paddle.nn.Conv3DTranspose( in_channels=3, @@ -2213,13 +2207,13 @@ def test_batch_fc(self): param_attr=fluid.ParamAttr( learning_rate=1.0, name="w_0", - initializer=fluid.initializer.Xavier(uniform=False), + initializer=paddle.nn.initializer.XavierNormal(), ), bias_size=[16, 10], bias_attr=fluid.ParamAttr( learning_rate=1.0, name="b_0", - initializer=fluid.initializer.Xavier(uniform=False), + 
initializer=paddle.nn.initializer.XavierNormal(), ), act="relu", ) @@ -2238,7 +2232,7 @@ def test_rank_attention(self): rank_param_attr=fluid.ParamAttr( learning_rate=1.0, name="ubm_rank_param.w_0", - initializer=fluid.initializer.Xavier(uniform=False), + initializer=paddle.nn.initializer.XavierNormal(), ), max_rank=3, ) diff --git a/python/paddle/fluid/tests/unittests/test_linear.py b/python/paddle/fluid/tests/unittests/test_linear.py index 71f5c831ae4b6..36496004b18d5 100644 --- a/python/paddle/fluid/tests/unittests/test_linear.py +++ b/python/paddle/fluid/tests/unittests/test_linear.py @@ -50,14 +50,14 @@ def paddle_nn_layer(self, place): learning_rate=1.0, trainable=False, regularizer=None, - initializer=paddle.fluid.initializer.ConstantInitializer(value=1.0), + initializer=paddle.nn.initializer.Constant(value=1.0), ) bias_attr = fluid.ParamAttr( name="linear_bias", learning_rate=1.0, trainable=False, regularizer=None, - initializer=paddle.fluid.initializer.ConstantInitializer(value=1.0), + initializer=paddle.nn.initializer.Constant(value=1.0), ) linear = paddle.nn.Linear( 2, 2, weight_attr=weight_attr, bias_attr=bias_attr diff --git a/python/paddle/fluid/tests/unittests/test_lookup_table_bf16_op.py b/python/paddle/fluid/tests/unittests/test_lookup_table_bf16_op.py index cc11e96f5a915..649a2e5937c3c 100644 --- a/python/paddle/fluid/tests/unittests/test_lookup_table_bf16_op.py +++ b/python/paddle/fluid/tests/unittests/test_lookup_table_bf16_op.py @@ -217,7 +217,7 @@ class TestEmbeddingLayerBF16ConstantInitializer(unittest.TestCase): """ def set_initializer(self): - self.initializer = fluid.initializer.Constant(value=self.value) + self.initializer = paddle.nn.initializer.Constant(value=self.value) def setUp(self): self.ids_shape = [4, 1] diff --git a/python/paddle/fluid/tests/unittests/test_lookup_table_v2_bf16_op.py b/python/paddle/fluid/tests/unittests/test_lookup_table_v2_bf16_op.py index 8cbc6242b3af9..0f6affcd26c07 100644 --- a/python/paddle/fluid/tests/unittests/test_lookup_table_v2_bf16_op.py +++ b/python/paddle/fluid/tests/unittests/test_lookup_table_v2_bf16_op.py @@ -84,7 +84,7 @@ class TestEmbeddingLayerBF16ConstantInitializer(unittest.TestCase): """ def set_initializer(self): - self.initializer = fluid.initializer.Constant(value=self.value) + self.initializer = paddle.nn.initializer.Constant(value=self.value) def setUp(self): self.op_type = "lookup_table_v2" diff --git a/python/paddle/fluid/tests/unittests/test_lookup_table_v2_op.py b/python/paddle/fluid/tests/unittests/test_lookup_table_v2_op.py index 74b6eec7198c6..6aea5ef118c11 100644 --- a/python/paddle/fluid/tests/unittests/test_lookup_table_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_lookup_table_v2_op.py @@ -209,9 +209,7 @@ def get_w_grad(self, is_sparse): param_attr=fluid.ParamAttr( name="emb_weight", learning_rate=10, - initializer=fluid.initializer.NumpyArrayInitializer( - self.w_data - ), + initializer=paddle.nn.initializer.Assign(self.w_data), ), is_sparse=is_sparse, ) diff --git a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_iterable_dataset_static.py b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_iterable_dataset_static.py index a4dc9f33279db..bdc4af3bdcd32 100644 --- a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_iterable_dataset_static.py +++ b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_iterable_dataset_static.py @@ -58,10 +58,10 @@ def simple_fc_net_static(): label = fluid.data(name='label', shape=[None, 1], 
dtype='int64') hidden = image param_attr = fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.8) + initializer=paddle.nn.initializer.Constant(value=0.8) ) bias_attr = fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.5) + initializer=paddle.nn.initializer.Constant(value=0.5) ) for hidden_size in [10, 20, 30]: hidden = paddle.static.nn.fc( diff --git a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_static.py b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_static.py index 24c008a60271f..a38c77386a67a 100644 --- a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_static.py +++ b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_static.py @@ -58,10 +58,10 @@ def simple_fc_net_static(): label = fluid.data(name='label', shape=[None, 1], dtype='int64') hidden = image param_attr = fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.8) + initializer=paddle.nn.initializer.Constant(value=0.8) ) bias_attr = fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.5) + initializer=paddle.nn.initializer.Constant(value=0.5) ) for hidden_size in [10, 20, 30]: hidden = paddle.static.nn.fc( diff --git a/python/paddle/fluid/tests/unittests/test_nce.py b/python/paddle/fluid/tests/unittests/test_nce.py index e2923da7113df..80787e7fd3f38 100644 --- a/python/paddle/fluid/tests/unittests/test_nce.py +++ b/python/paddle/fluid/tests/unittests/test_nce.py @@ -19,7 +19,6 @@ import paddle import paddle.fluid as fluid -import paddle.fluid.initializer as initializer from paddle.fluid import Program, program_guard @@ -199,7 +198,7 @@ def train_network( shape=[num_total_classes, 10], dtype='float32', name='nce_w', - initializer=initializer.ConstantInitializer(), + initializer=paddle.nn.initializer.Constant(), ) ) b_param = ( @@ -209,7 +208,7 @@ def train_network( shape=[num_total_classes, 1], dtype='float32', name='nce_b', - initializer=initializer.ConstantInitializer(), + initializer=paddle.nn.initializer.Constant(), ) ) diff --git a/python/paddle/fluid/tests/unittests/test_nn_functional_embedding_static.py b/python/paddle/fluid/tests/unittests/test_nn_functional_embedding_static.py index d89af631baa45..95df8aa0be0ac 100644 --- a/python/paddle/fluid/tests/unittests/test_nn_functional_embedding_static.py +++ b/python/paddle/fluid/tests/unittests/test_nn_functional_embedding_static.py @@ -27,7 +27,7 @@ def test_1(self): with fluid.program_guard(prog): def test_bad_x(): - initializer = fluid.initializer.NumpyArrayInitializer( + initializer = paddle.nn.initializer.Assign( np.random.random(size=(128, 100)) ) @@ -59,7 +59,7 @@ def test_2(self): with fluid.program_guard(prog): def test_bad_x(): - initializer = fluid.initializer.NumpyArrayInitializer( + initializer = paddle.nn.initializer.Assign( np.random.random(size=(128, 100)) ) diff --git a/python/paddle/fluid/tests/unittests/test_optimizer_grad.py b/python/paddle/fluid/tests/unittests/test_optimizer_grad.py index fc5fbec82cd0a..626521577d173 100644 --- a/python/paddle/fluid/tests/unittests/test_optimizer_grad.py +++ b/python/paddle/fluid/tests/unittests/test_optimizer_grad.py @@ -81,20 +81,20 @@ def build_net(self, cond_i, use_bf16=False): dtype="float32", shape=self.shape, attr=fluid.ParamAttr(learning_rate=self.param_lr, name="param_x"), - default_initializer=fluid.initializer.NumpyArrayInitializer(self.x), + default_initializer=paddle.nn.initializer.Assign(self.x), ) param_y = paddle.create_parameter( dtype="float32", shape=self.shape, 
attr=fluid.ParamAttr(learning_rate=self.param_lr, name="param_y"), - default_initializer=fluid.initializer.NumpyArrayInitializer(self.y), + default_initializer=paddle.nn.initializer.Assign(self.y), ) param_z = paddle.create_parameter( dtype="float32", shape=self.shape, attr=fluid.ParamAttr(learning_rate=self.param_lr, name="param_z"), - default_initializer=fluid.initializer.NumpyArrayInitializer(self.z), + default_initializer=paddle.nn.initializer.Assign(self.z), ) sum_xy = paddle.add(param_x, param_y, name='sum_xy') diff --git a/python/paddle/fluid/tests/unittests/test_optimizer_in_control_flow.py b/python/paddle/fluid/tests/unittests/test_optimizer_in_control_flow.py index 3b32c9ca4ee78..ab9b99d8cb249 100644 --- a/python/paddle/fluid/tests/unittests/test_optimizer_in_control_flow.py +++ b/python/paddle/fluid/tests/unittests/test_optimizer_in_control_flow.py @@ -51,10 +51,10 @@ def double_fc_net(image): size=FC_SIZE, activation='relu', weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.99) + initializer=paddle.nn.initializer.Constant(value=0.99) ), bias_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.5) + initializer=paddle.nn.initializer.Constant(value=0.5) ), name="hidden", ) @@ -64,10 +64,10 @@ def double_fc_net(image): size=CLASS_NUM, activation='softmax', weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=1.2) + initializer=paddle.nn.initializer.Constant(value=1.2) ), bias_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.8) + initializer=paddle.nn.initializer.Constant(value=0.8) ), name="prediction", ) diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py index 850ddc379c609..1f6429620f689 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py @@ -33,7 +33,7 @@ def simple_fc_net(use_feed): size=200, activation='tanh', bias_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=1.0) + initializer=paddle.nn.initializer.Constant(value=1.0) ), ) prediction = paddle.static.nn.fc(hidden, size=10, activation='softmax') @@ -56,7 +56,7 @@ def fc_with_batchnorm(use_feed): size=200, activation='tanh', bias_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=1.0) + initializer=paddle.nn.initializer.Constant(value=1.0) ), ) diff --git a/python/paddle/fluid/tests/unittests/test_parameter.py b/python/paddle/fluid/tests/unittests/test_parameter.py index 5ce6f31318395..909feb2a48ff3 100644 --- a/python/paddle/fluid/tests/unittests/test_parameter.py +++ b/python/paddle/fluid/tests/unittests/test_parameter.py @@ -23,7 +23,6 @@ from paddle.fluid.dygraph import guard from paddle.fluid.executor import Executor from paddle.fluid.framework import ParamBase, Variable, default_main_program -from paddle.fluid.initializer import ConstantInitializer paddle.enable_static() main_program = default_main_program() @@ -38,7 +37,7 @@ def test_parameter(self): name='fc.w', shape=shape, dtype='float32', - initializer=ConstantInitializer(val), + initializer=paddle.nn.initializer.Constant(val), ) self.assertIsNotNone(param) self.assertEqual('fc.w', param.name) diff --git a/python/paddle/fluid/tests/unittests/test_prelu_op.py b/python/paddle/fluid/tests/unittests/test_prelu_op.py index 9c95d5b946ce4..4a4d5921bbb94 100644 --- a/python/paddle/fluid/tests/unittests/test_prelu_op.py +++ 
b/python/paddle/fluid/tests/unittests/test_prelu_op.py @@ -153,7 +153,7 @@ def test_dygraph_api(self): x = paddle.to_tensor(self.x_np) m = paddle.nn.PReLU( weight_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(0.5) + initializer=paddle.nn.initializer.Constant(0.5) ) ) out = m(x) @@ -438,7 +438,7 @@ def prelu_t(x, mode, param_attr=None, name=None, data_format='NCHW'): shape=alpha_shape, dtype='float32', is_bias=False, - default_initializer=fluid.initializer.ConstantInitializer(0.25), + default_initializer=paddle.nn.initializer.Constant(0.25), ) out = helper.create_variable_for_type_inference(dtype) helper.append_op( diff --git a/python/paddle/fluid/tests/unittests/test_program_prune_backward.py b/python/paddle/fluid/tests/unittests/test_program_prune_backward.py index 5364dcaa6e14a..885c8fa829aa9 100755 --- a/python/paddle/fluid/tests/unittests/test_program_prune_backward.py +++ b/python/paddle/fluid/tests/unittests/test_program_prune_backward.py @@ -40,7 +40,7 @@ def simple_fc_net_with_accuracy(use_feed): size=200, activation='relu', bias_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=1.0) + initializer=paddle.nn.initializer.Constant(value=1.0) ), ) prediction = paddle.static.nn.fc(hidden, size=10, activation='softmax') diff --git a/python/paddle/fluid/tests/unittests/test_prune.py b/python/paddle/fluid/tests/unittests/test_prune.py index 30e3aefe0a738..a93516da417a4 100644 --- a/python/paddle/fluid/tests/unittests/test_prune.py +++ b/python/paddle/fluid/tests/unittests/test_prune.py @@ -170,7 +170,7 @@ def net1(self): w_param_attrs = fluid.ParamAttr( name="fc_weight", learning_rate=0.5, - initializer=fluid.initializer.Constant(1.0), + initializer=paddle.nn.initializer.Constant(1.0), trainable=True, ) y = paddle.static.nn.fc( @@ -198,13 +198,13 @@ def net2(self): w1_param_attrs = fluid.ParamAttr( name="fc_weight1", learning_rate=0.5, - initializer=fluid.initializer.Constant(1.0), + initializer=paddle.nn.initializer.Constant(1.0), trainable=True, ) w2_param_attrs = fluid.ParamAttr( name="fc_weight2", learning_rate=0.5, - initializer=fluid.initializer.Constant(1.0), + initializer=paddle.nn.initializer.Constant(1.0), trainable=True, ) y1 = paddle.static.nn.fc( diff --git a/python/paddle/fluid/tests/unittests/test_py_func_op.py b/python/paddle/fluid/tests/unittests/test_py_func_op.py index a90e37a4755c2..526e08e9d5940 100644 --- a/python/paddle/fluid/tests/unittests/test_py_func_op.py +++ b/python/paddle/fluid/tests/unittests/test_py_func_op.py @@ -79,7 +79,7 @@ def simple_fc_net(img, label, use_py_func_op): hidden, size=200, bias_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=1.0) + initializer=paddle.nn.initializer.Constant(value=1.0) ), ) if not use_py_func_op: diff --git a/python/paddle/fluid/tests/unittests/test_random_seed.py b/python/paddle/fluid/tests/unittests/test_random_seed.py index 856b2be783d36..0798fa8864f42 100644 --- a/python/paddle/fluid/tests/unittests/test_random_seed.py +++ b/python/paddle/fluid/tests/unittests/test_random_seed.py @@ -378,15 +378,15 @@ def test_gen_TruncatedNormal_initializer(self): result_1 = paddle.static.nn.fc( x, size=10, - weight_attr=fluid.initializer.TruncatedNormal( - loc=0.0, scale=2.0 + weight_attr=paddle.nn.initializer.TruncatedNormal( + mean=0.0, std=2.0 ), ) result_2 = paddle.static.nn.fc( x, size=10, - weight_attr=fluid.initializer.TruncatedNormal( - loc=0.0, scale=2.0 + weight_attr=paddle.nn.initializer.TruncatedNormal( + mean=0.0, std=2.0 ), ) diff --git 
a/python/paddle/fluid/tests/unittests/test_recurrent_op.py b/python/paddle/fluid/tests/unittests/test_recurrent_op.py index 4ea5ed0e0d35f..8991b143846c4 100644 --- a/python/paddle/fluid/tests/unittests/test_recurrent_op.py +++ b/python/paddle/fluid/tests/unittests/test_recurrent_op.py @@ -301,7 +301,7 @@ def create_rnn_op(self): size=self.input_dim, weight_attr=ParamAttr( name='W', - initializer=fluid.initializer.ConstantInitializer(1.0), + initializer=paddle.nn.initializer.Constant(1.0), ), bias_attr=False, ) @@ -310,7 +310,7 @@ def create_rnn_op(self): size=self.input_dim, weight_attr=ParamAttr( name='U', - initializer=fluid.initializer.ConstantInitializer(0.0), + initializer=paddle.nn.initializer.Constant(0.0), ), bias_attr=False, ) @@ -686,7 +686,7 @@ def create_rnn_op(self): size=self.input_dim, weight_attr=ParamAttr( name="W", - initializer=fluid.initializer.ConstantInitializer(1.0), + initializer=paddle.nn.initializer.Constant(1.0), ), bias_attr=False, ) @@ -695,7 +695,7 @@ def create_rnn_op(self): size=self.input_dim, weight_attr=ParamAttr( name="U", - initializer=fluid.initializer.ConstantInitializer(0.0), + initializer=paddle.nn.initializer.Constant(0.0), ), bias_attr=False, ) diff --git a/python/paddle/fluid/tests/unittests/test_row_conv_op.py b/python/paddle/fluid/tests/unittests/test_row_conv_op.py index d160a9982577f..408a5f8a7405e 100644 --- a/python/paddle/fluid/tests/unittests/test_row_conv_op.py +++ b/python/paddle/fluid/tests/unittests/test_row_conv_op.py @@ -197,7 +197,7 @@ def check_identity(self): out = paddle.static.nn.row_conv( x, self.context_length, - param_attr=fluid.initializer.NumpyArrayInitializer(self.w), + param_attr=paddle.nn.initializer.Assign(self.w), ) place = fluid.CPUPlace() exe = fluid.Executor(place) diff --git a/python/paddle/fluid/tests/unittests/test_run_program_op.py b/python/paddle/fluid/tests/unittests/test_run_program_op.py index fe012ded3993e..73ad833a3efe2 100644 --- a/python/paddle/fluid/tests/unittests/test_run_program_op.py +++ b/python/paddle/fluid/tests/unittests/test_run_program_op.py @@ -403,7 +403,7 @@ def build_model(self): weight_attr = fluid.ParamAttr( name=self.input_names['Params'][0], learning_rate=0.5, - initializer=fluid.initializer.NumpyArrayInitializer( + initializer=paddle.nn.initializer.Assign( self.inputs['Params'][self.input_names['Params'][0]] ), trainable=True, @@ -411,7 +411,7 @@ def build_model(self): bias_attr = fluid.ParamAttr( name=self.input_names['Params'][1], learning_rate=0.5, - initializer=fluid.initializer.NumpyArrayInitializer( + initializer=paddle.nn.initializer.Assign( self.inputs['Params'][self.input_names['Params'][1]] ), trainable=True, @@ -469,7 +469,7 @@ def build_model(self): param_attr=fluid.ParamAttr( name="emb_weight", learning_rate=10, - initializer=fluid.initializer.NumpyArrayInitializer( + initializer=paddle.nn.initializer.Assign( self.inputs['Params'][self.input_names['Params'][0]] ), ), diff --git a/python/paddle/fluid/tests/unittests/test_set_bool_attr.py b/python/paddle/fluid/tests/unittests/test_set_bool_attr.py index c599f08ae2bf9..3424d393952b3 100644 --- a/python/paddle/fluid/tests/unittests/test_set_bool_attr.py +++ b/python/paddle/fluid/tests/unittests/test_set_bool_attr.py @@ -26,11 +26,11 @@ def test_set_bool_attr(self): ) param_attr = fluid.ParamAttr( name='batch_norm_w', - initializer=fluid.initializer.Constant(value=1.0), + initializer=paddle.nn.initializer.Constant(value=1.0), ) bias_attr = fluid.ParamAttr( name='batch_norm_b', - 
initializer=fluid.initializer.Constant(value=0.0), + initializer=paddle.nn.initializer.Constant(value=0.0), ) bn = paddle.static.nn.batch_norm( input=x, param_attr=param_attr, bias_attr=bias_attr diff --git a/python/paddle/fluid/tests/unittests/test_sgd_op_bf16.py b/python/paddle/fluid/tests/unittests/test_sgd_op_bf16.py index 89515c931c250..c63be2c6f2be8 100644 --- a/python/paddle/fluid/tests/unittests/test_sgd_op_bf16.py +++ b/python/paddle/fluid/tests/unittests/test_sgd_op_bf16.py @@ -322,7 +322,7 @@ def _check_output( print(e) def _set_initializer(self): - self.initializer = fluid.initializer.Constant(value=self.value) + self.initializer = paddle.nn.initializer.Constant(value=self.value) def _data_reader(self): for sample in range(self.sample_count): diff --git a/python/paddle/fluid/tests/unittests/test_static_save_load.py b/python/paddle/fluid/tests/unittests/test_static_save_load.py index d043e3785c498..0ac2644d90a11 100644 --- a/python/paddle/fluid/tests/unittests/test_static_save_load.py +++ b/python/paddle/fluid/tests/unittests/test_static_save_load.py @@ -59,26 +59,26 @@ def __init__( for i in range(self._num_layers): weight_1 = self.create_parameter( attr=fluid.ParamAttr( - initializer=fluid.initializer.UniformInitializer( + initializer=paddle.nn.initializer.Uniform( low=-self._init_scale, high=self._init_scale ) ), shape=[self._hidden_size * 2, self._hidden_size * 4], dtype="float32", - default_initializer=fluid.initializer.UniformInitializer( + default_initializer=paddle.nn.initializer.Uniform( low=-self._init_scale, high=self._init_scale ), ) self.weight_1_arr.append(self.add_parameter('w_%d' % i, weight_1)) bias_1 = self.create_parameter( attr=fluid.ParamAttr( - initializer=fluid.initializer.UniformInitializer( + initializer=paddle.nn.initializer.Uniform( low=-self._init_scale, high=self._init_scale ) ), shape=[self._hidden_size * 4], dtype="float32", - default_initializer=fluid.initializer.Constant(0.0), + default_initializer=paddle.nn.initializer.Constant(0.0), ) self.bias_arr.append(self.add_parameter('b_%d' % i, bias_1)) @@ -184,7 +184,7 @@ def __init__( embedding_dim=hidden_size, weight_attr=fluid.ParamAttr( name='embedding_para', - initializer=fluid.initializer.UniformInitializer( + initializer=paddle.nn.initializer.Uniform( low=-init_scale, high=init_scale ), ), @@ -193,7 +193,7 @@ def __init__( attr=fluid.ParamAttr(), shape=[self.hidden_size, self.vocab_size], dtype="float32", - default_initializer=fluid.initializer.UniformInitializer( + default_initializer=paddle.nn.initializer.Uniform( low=-self.init_scale, high=self.init_scale ), ) @@ -201,7 +201,7 @@ def __init__( attr=fluid.ParamAttr(), shape=[self.vocab_size], dtype="float32", - default_initializer=fluid.initializer.UniformInitializer( + default_initializer=paddle.nn.initializer.Uniform( low=-self.init_scale, high=self.init_scale ), ) diff --git a/python/paddle/fluid/tests/unittests/test_tdm_child_op.py b/python/paddle/fluid/tests/unittests/test_tdm_child_op.py index 2481a48f01793..eaa139714660f 100644 --- a/python/paddle/fluid/tests/unittests/test_tdm_child_op.py +++ b/python/paddle/fluid/tests/unittests/test_tdm_child_op.py @@ -151,9 +151,7 @@ def test_shape(self): node_nums=26, child_nums=2, param_attr=fluid.ParamAttr( - initializer=fluid.initializer.NumpyArrayInitializer( - tree_info_np - ) + initializer=paddle.nn.initializer.Assign(tree_info_np) ), ) diff --git a/python/paddle/fluid/tests/unittests/test_tdm_sampler_op.py b/python/paddle/fluid/tests/unittests/test_tdm_sampler_op.py index 
217d84b4b9f8a..c54c6c0c9de02 100644 --- a/python/paddle/fluid/tests/unittests/test_tdm_sampler_op.py +++ b/python/paddle/fluid/tests/unittests/test_tdm_sampler_op.py @@ -290,12 +290,10 @@ def test_shape(self): layer_node_num_list, leaf_node_num, tree_travel_attr=fluid.ParamAttr( - initializer=fluid.initializer.NumpyArrayInitializer( - travel_array - ) + initializer=paddle.nn.initializer.Assign(travel_array) ), tree_layer_attr=fluid.ParamAttr( - initializer=fluid.initializer.NumpyArrayInitializer(layer_array) + initializer=paddle.nn.initializer.Assign(layer_array) ), output_positive=True, output_list=True, diff --git a/python/paddle/fluid/tests/unittests/test_uniform_random_op.py b/python/paddle/fluid/tests/unittests/test_uniform_random_op.py index c31d763dbff7c..407d70b4dadf3 100644 --- a/python/paddle/fluid/tests/unittests/test_uniform_random_op.py +++ b/python/paddle/fluid/tests/unittests/test_uniform_random_op.py @@ -286,7 +286,7 @@ def test_api(self): y = paddle.static.nn.fc( x, size=16, - weight_attr=fluid.initializer.Uniform( + weight_attr=paddle.nn.initializer.UniformInitializer( low=-0.5, high=0.5, seed=10, diff --git a/python/paddle/fluid/tests/unittests/test_weight_normalization.py b/python/paddle/fluid/tests/unittests/test_weight_normalization.py index 17a05bdb01caa..f649fe1a28152 100644 --- a/python/paddle/fluid/tests/unittests/test_weight_normalization.py +++ b/python/paddle/fluid/tests/unittests/test_weight_normalization.py @@ -20,7 +20,6 @@ import paddle import paddle.fluid as fluid import paddle.fluid.core as core -from paddle.fluid.initializer import ConstantInitializer from paddle.fluid.param_attr import WeightNormParamAttr @@ -44,7 +43,7 @@ def set_program(cls): weight_attr=WeightNormParamAttr( dim=None, name='weight_norm_param', - initializer=ConstantInitializer(1.0), + initializer=paddle.nn.initializer.Constant(1.0), ), bias_attr=False, activation=None, diff --git a/python/paddle/fluid/tests/unittests/transformer_model.py b/python/paddle/fluid/tests/unittests/transformer_model.py index 42436b6e242b4..d847ac9ee4433 100644 --- a/python/paddle/fluid/tests/unittests/transformer_model.py +++ b/python/paddle/fluid/tests/unittests/transformer_model.py @@ -76,8 +76,8 @@ def __compute_qkv(queries, keys, values, n_head, d_key, d_value): q = paddle.static.nn.fc( x=queries, size=d_key * n_head, - weight_attr=fluid.initializer.Xavier( - uniform=False, fan_in=d_model * d_key, fan_out=n_head * d_key + weight_attr=paddle.nn.initializer.XavierNormal( + fan_in=d_model * d_key, fan_out=n_head * d_key ), bias_attr=False, num_flatten_dims=2, @@ -85,8 +85,8 @@ def __compute_qkv(queries, keys, values, n_head, d_key, d_value): k = paddle.static.nn.fc( x=keys, size=d_key * n_head, - weight_attr=fluid.initializer.Xavier( - uniform=False, fan_in=d_model * d_key, fan_out=n_head * d_key + weight_attr=paddle.nn.initializer.XavierNormal( + fan_in=d_model * d_key, fan_out=n_head * d_key ), bias_attr=False, num_flatten_dims=2, @@ -94,8 +94,7 @@ def __compute_qkv(queries, keys, values, n_head, d_key, d_value): v = paddle.static.nn.fc( x=values, size=d_value * n_head, - weight_attr=fluid.initializer.Xavier( - uniform=False, + weight_attr=paddle.nn.initializer.XavierNormal( fan_in=d_model * d_value, fan_out=n_head * d_value, ), @@ -187,7 +186,7 @@ def __softmax(x, eps=1e-9): proj_out = paddle.static.nn.fc( x=out, size=d_model, - weight_attr=fluid.initializer.Xavier(uniform=False), + weight_attr=paddle.nn.initializer.XavierNormal(), bias_attr=False, num_flatten_dims=2, ) @@ -204,7 +203,7 @@ def 
positionwise_feed_forward(x, d_inner_hid, d_hid): x, size=d_inner_hid, num_flatten_dims=2, - weight_attr=fluid.initializer.Uniform( + weight_attr=paddle.nn.initializer.Uniform( low=-(d_hid**-0.5), high=(d_hid**-0.5) ), activation="relu", @@ -213,7 +212,7 @@ def positionwise_feed_forward(x, d_inner_hid, d_hid): x=hidden, size=d_hid, num_flatten_dims=2, - weight_attr=fluid.initializer.Uniform( + weight_attr=paddle.nn.initializer.Uniform( low=-(d_inner_hid**-0.5), high=(d_inner_hid**-0.5) ), ) @@ -235,8 +234,8 @@ def pre_post_process_layer(prev_out, out, process_cmd, dropout=0.0): out = paddle.static.nn.layer_norm( out, begin_norm_axis=len(out.shape) - 1, - param_attr=fluid.initializer.Constant(1.0), - bias_attr=fluid.initializer.Constant(0.0), + param_attr=paddle.nn.initializer.Constant(1.0), + bias_attr=paddle.nn.initializer.Constant(0.0), ) elif cmd == "d": # add dropout if dropout: @@ -269,7 +268,7 @@ def prepare_encoder( src_word, size=[src_vocab_size, src_emb_dim], padding_idx=src_pad_idx, - param_attr=fluid.initializer.Normal(0.0, 1.0), + param_attr=paddle.nn.initializer.Normal(0.0, 1.0), ) src_pos_enc = layers.embedding( src_pos, @@ -587,7 +586,7 @@ def transformer( x=paddle.static.nn.fc( x=dec_output, size=trg_vocab_size, - weight_attr=fluid.initializer.Xavier(uniform=False), + weight_attr=paddle.nn.initializer.XavierNormal(), bias_attr=False, num_flatten_dims=2, ), diff --git a/python/paddle/fluid/tests/unittests/xpu/test_batch_norm_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_batch_norm_op_xpu.py index d3909193cd6ce..3ee0469b6145d 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_batch_norm_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_batch_norm_op_xpu.py @@ -369,7 +369,7 @@ def test_global_stats(self): net1 = paddle.nn.BatchNorm( 6, param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(1.0) + initializer=paddle.nn.initializer.Constant(1.0) ), use_global_stats=self.use_global_stats, trainable_statistics=self.trainable_statistics, diff --git a/python/paddle/fluid/tests/unittests/xpu/test_fused_resnet_basic_block_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_fused_resnet_basic_block_op_xpu.py index 3518083d75678..1764400403f26 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_fused_resnet_basic_block_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_fused_resnet_basic_block_op_xpu.py @@ -73,34 +73,34 @@ def Base(self): paddle.disable_static() conv1_weight = fluid.ParamAttr( - initializer=fluid.initializer.Xavier(uniform=False), + initializer=paddle.nn.initializer.XavierNormal(), learning_rate=0.001, ) conv2_weight = fluid.ParamAttr( - initializer=fluid.initializer.Xavier(uniform=False), + initializer=paddle.nn.initializer.XavierNormal(), learning_rate=0.001, ) conv3_weight = fluid.ParamAttr( - initializer=fluid.initializer.Xavier(uniform=False), + initializer=paddle.nn.initializer.XavierNormal(), learning_rate=0.001, ) bn1_weight = fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=1.0) + initializer=paddle.nn.initializer.Constant(value=1.0) ) bn1_bias = fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.0) + initializer=paddle.nn.initializer.Constant(value=0.0) ) bn2_weight = fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=1.0) + initializer=paddle.nn.initializer.Constant(value=1.0) ) bn2_bias = fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.0) + initializer=paddle.nn.initializer.Constant(value=0.0) ) bn3_weight = fluid.ParamAttr( - 
initializer=fluid.initializer.Constant(value=1.0) + initializer=paddle.nn.initializer.Constant(value=1.0) ) bn3_bias = fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.0) + initializer=paddle.nn.initializer.Constant(value=0.0) ) self.conv1 = nn.Conv2D( @@ -173,34 +173,34 @@ def FusedResNetBasicBlock(self): paddle.disable_static() fused_conv1_weight = fluid.ParamAttr( - initializer=fluid.initializer.Xavier(uniform=False), + initializer=paddle.nn.initializer.XavierNormal(), learning_rate=0.001, ) fused_conv2_weight = fluid.ParamAttr( - initializer=fluid.initializer.Xavier(uniform=False), + initializer=paddle.nn.initializer.XavierNormal(), learning_rate=0.001, ) fused_conv3_weight = fluid.ParamAttr( - initializer=fluid.initializer.Xavier(uniform=False), + initializer=paddle.nn.initializer.XavierNormal(), learning_rate=0.001, ) fused_bn1_weight = fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=1.0) + initializer=paddle.nn.initializer.Constant(value=1.0) ) fused_bn1_bias = fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.0) + initializer=paddle.nn.initializer.Constant(value=0.0) ) fused_bn2_weight = fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=1.0) + initializer=paddle.nn.initializer.Constant(value=1.0) ) fused_bn2_bias = fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.0) + initializer=paddle.nn.initializer.Constant(value=0.0) ) fused_bn3_weight = fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=1.0) + initializer=paddle.nn.initializer.Constant(value=1.0) ) fused_bn3_bias = fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.0) + initializer=paddle.nn.initializer.Constant(value=0.0) ) if self.has_shortcut: diff --git a/python/paddle/fluid/tests/unittests/xpu/test_prelu_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_prelu_op_xpu.py index 2f699ca3c026d..666c29f7fcaa8 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_prelu_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_prelu_op_xpu.py @@ -163,7 +163,7 @@ def prelu_t(x, mode, param_attr=None, name=None, data_format='NCHW'): shape=alpha_shape, dtype='float32', is_bias=False, - default_initializer=fluid.initializer.ConstantInitializer(0.25), + default_initializer=paddle.nn.initializer.Constant(0.25), ) out = helper.create_variable_for_type_inference(dtype) helper.append_op( diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index 29901363dbeef..32486a8dadd2b 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -39,7 +39,7 @@ import numpy as np from .ps_dispatcher import RoundRobin, PSDispatcher -from .. import core, framework, unique_name, initializer +from .. 
import core, framework, unique_name from ..framework import ( Program, default_main_program, @@ -2856,7 +2856,7 @@ def _get_lr_ops(self): dtype=var.dtype, shape=var.shape, persistable=var.persistable, - initializer=initializer.Constant(1), + initializer=paddle.nn.initializer.Constant(1), ) op_role_attr_name = ( core.op_proto_and_checker_maker.kOpRoleAttrName() diff --git a/python/paddle/incubate/asp/asp.py b/python/paddle/incubate/asp/asp.py index df1c81bffe835..7bf04dc151c7f 100644 --- a/python/paddle/incubate/asp/asp.py +++ b/python/paddle/incubate/asp/asp.py @@ -24,7 +24,6 @@ import paddle from paddle.fluid import core, global_scope, program_guard from paddle.fluid.framework import dygraph_only -from paddle.fluid.initializer import ConstantInitializer from paddle.incubate import asp from .supported_layer_list import ( @@ -882,7 +881,9 @@ def _create_mask_variables(cls, main_program, startup_program, params): name=ASPHelper._get_mask_name(param.name), shape=param.shape, dtype=param.dtype, - default_initializer=ConstantInitializer(value=1.0), + default_initializer=paddle.nn.initializer.Constant( + value=1.0 + ), ) mask_param.stop_gradient = True mask_param.trainable = False diff --git a/python/paddle/nn/decode.py b/python/paddle/nn/decode.py index 4ad72077014f0..4ce504d8f8b66 100644 --- a/python/paddle/nn/decode.py +++ b/python/paddle/nn/decode.py @@ -19,8 +19,8 @@ import numpy as np import paddle +from paddle.common_ops_import import default_main_program from paddle.framework import _non_static_mode -from paddle.static import default_main_program from ..fluid.data_feeder import convert_dtype from ..fluid.layers.utils import flatten, map_structure diff --git a/python/paddle/nn/functional/common.py b/python/paddle/nn/functional/common.py index 57a1e0023d4fc..d8777d2c4779d 100644 --- a/python/paddle/nn/functional/common.py +++ b/python/paddle/nn/functional/common.py @@ -16,10 +16,10 @@ import paddle from paddle import _C_ops, _legacy_C_ops +from paddle.common_ops_import import Variable, default_main_program from paddle.fluid.layer_helper import LayerHelper from paddle.fluid.layers.tensor import fill_constant from paddle.framework import core, in_dynamic_mode -from paddle.static import Variable, default_main_program from paddle.tensor.creation import full from ...fluid.data_feeder import ( diff --git a/python/paddle/nn/functional/conv.py b/python/paddle/nn/functional/conv.py index 74a97e25938ed..82d25747eadb0 100644 --- a/python/paddle/nn/functional/conv.py +++ b/python/paddle/nn/functional/conv.py @@ -22,6 +22,7 @@ from paddle.fluid.framework import _global_flags, in_dygraph_mode from paddle.tensor.math import _add_with_axis +from ...common_ops_import import Variable from ...device import get_cudnn_version from ...fluid.data_feeder import check_dtype, check_variable_and_dtype from ...fluid.layer_helper import LayerHelper @@ -32,7 +33,6 @@ convert_to_list, ) from ...framework import no_grad -from ...static import Variable from ...tensor.manipulation import squeeze, unsqueeze __all__ = [] diff --git a/python/paddle/nn/functional/extension.py b/python/paddle/nn/functional/extension.py index 3b566b3de3044..533bf138a1a49 100644 --- a/python/paddle/nn/functional/extension.py +++ b/python/paddle/nn/functional/extension.py @@ -18,6 +18,7 @@ from paddle import _C_ops, _legacy_C_ops, in_dynamic_mode +from ...common_ops_import import Variable from ...fluid.data_feeder import ( check_dtype, check_type, @@ -26,7 +27,6 @@ from ...fluid.framework import in_dygraph_mode from ...fluid.layer_helper import 
LayerHelper from ...framework import convert_np_dtype_to_dtype_, core -from ...static import Variable from ...tensor.creation import assign from ...tensor.layer_function_generator import templatedoc diff --git a/python/paddle/nn/functional/input.py b/python/paddle/nn/functional/input.py index 8964b69df2a71..eccaffcb729a8 100644 --- a/python/paddle/nn/functional/input.py +++ b/python/paddle/nn/functional/input.py @@ -14,10 +14,10 @@ from paddle import _C_ops +from ...common_ops_import import Variable from ...fluid.data_feeder import check_variable_and_dtype from ...fluid.framework import in_dygraph_mode from ...fluid.layer_helper import LayerHelper -from ...static import Variable __all__ = [] diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index 90697cb63476f..001efd74a6733 100644 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -20,10 +20,10 @@ from paddle.framework import core from paddle.utils import deprecated +from ...common_ops_import import Variable from ...fluid.data_feeder import check_variable_and_dtype from ...fluid.framework import _current_expected_place, in_dygraph_mode from ...fluid.layer_helper import LayerHelper -from ...static import Variable from ...tensor.manipulation import reshape __all__ = [] diff --git a/python/paddle/nn/functional/vision.py b/python/paddle/nn/functional/vision.py index 4f164e991f328..1178928acc2da 100644 --- a/python/paddle/nn/functional/vision.py +++ b/python/paddle/nn/functional/vision.py @@ -15,10 +15,10 @@ from paddle import _C_ops, _legacy_C_ops, in_dynamic_mode from paddle.fluid.framework import in_dygraph_mode +from ...common_ops_import import Variable from ...device import get_cudnn_version, is_compiled_with_rocm from ...fluid.data_feeder import check_variable_and_dtype from ...fluid.layer_helper import LayerHelper -from ...static import Variable __all__ = [] diff --git a/python/paddle/nn/initializer/Bilinear.py b/python/paddle/nn/initializer/Bilinear.py new file mode 100644 index 0000000000000..b3a1766d07ccc --- /dev/null +++ b/python/paddle/nn/initializer/Bilinear.py @@ -0,0 +1,182 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +from paddle import _C_ops + +from ...fluid import core, framework, unique_name +from ...fluid.framework import _current_expected_place, in_dygraph_mode +from .initializer import Initializer + +__all__ = [] + + +class Bilinear(Initializer): + """ + This initializer can be used in transposed convolution operator to + act as upsampling. Users can upsample a feature map with shape of + (B, C, H, W) by any integer factor. The usage is: + + Examples: + + .. 
code-block:: python + + import math + + import paddle + import paddle.nn as nn + from paddle.regularizer import L2Decay + + factor = 2 + C = 2 + B = 8 + H = W = 32 + w_attr = paddle.ParamAttr(learning_rate=0., + regularizer=L2Decay(0.), + initializer=nn.initializer.Bilinear()) + data = paddle.rand([B, 3, H, W], dtype='float32') + conv_up = nn.Conv2DTranspose(3, + out_channels=C, + kernel_size=2 * factor - factor % 2, + padding=int( + math.ceil((factor - 1) / 2.)), + stride=factor, + weight_attr=w_attr, + bias_attr=False) + x = conv_up(data) + + Where, `out_channels=C` and `groups=C` means this is channel-wise transposed + convolution. The filter shape will be (C, 1, K, K) where K is `kernel_size`, + This initializer will set a (K, K) interpolation kernel for every channel + of the filter identically. The resulting shape of the output feature map + will be (B, C, factor * H, factor * W). Note that the learning rate and the + weight decay are set to 0 in order to keep coefficient values of bilinear + interpolation unchanged during training. + + """ + + def __init__(self): + """Constructor for BilinearInitializer.""" + super().__init__() + + def forward(self, var, block=None): + """Initialize the input tensor with Bilinear initialization. + + Args: + var(Tensor): Tensor that needs to be initialized. + block(Block, optional): The block in which initialization ops + should be added. Used in static graph only, default None. + + Returns: + The initialization op + """ + block = self._check_block(block) + + if not isinstance(var, framework.Variable): + raise ValueError("var must be framework.Variable.") + + if not isinstance(block, framework.Block): + raise ValueError("block must be framework.Block.") + + shape = var.shape + if len(shape) != 4: + raise ValueError("the length of shape must be 4.") + if shape[2] != shape[3]: + raise ValueError("shape[2] must be equal to shape[3].") + + weight = np.zeros(np.prod(var.shape), dtype='float32') + size = shape[3] + # factor + f = np.ceil(size / 2.0) + # center + c = (2 * f - 1 - f % 2) / (2.0 * f) + for i in range(np.prod(shape)): + x = i % size + y = (i / size) % size + weight[i] = (1 - abs(x / f - c)) * (1 - abs(y / f - c)) + weight = np.reshape(weight, shape) + + # to be compatible of fp16 initalizers + if var.dtype in [ + core.VarDesc.VarType.FP16, + core.VarDesc.VarType.BF16, + core.VarDesc.VarType.FP64, + ]: + out_dtype = core.VarDesc.VarType.FP32 + out_var = block.create_var( + name=unique_name.generate( + ".".join(['bilinear_init', var.name, 'tmp']) + ), + shape=var.shape, + dtype=out_dtype, + type=core.VarDesc.VarType.LOD_TENSOR, + persistable=False, + ) + else: + out_dtype = var.dtype + out_var = var + + if out_dtype == core.VarDesc.VarType.FP32: + value_name = "fp32_values" + values = [float(v) for v in weight.flat] + else: + raise TypeError("Unsupported dtype %s", var.dtype) + + if np.prod(shape) > 1024 * 1024: + raise ValueError("The size of input is too big. 
") + + if in_dygraph_mode(): + _C_ops.assign_value_( + out_var, + list(shape), + out_dtype, + values, + _current_expected_place(), + ) + if var.dtype in [ + core.VarDesc.VarType.FP16, + core.VarDesc.VarType.BF16, + core.VarDesc.VarType.FP64, + ]: + var_tmp = _C_ops.cast(out_var, var.dtype) + var_tmp._share_underline_tensor_to(var) + else: + out_var._share_underline_tensor_to(var) + return None + else: + op = block.append_op( + type='assign_value', + outputs={'Out': [out_var]}, + attrs={ + 'dtype': out_dtype, + 'shape': list(shape), + value_name: values, + }, + ) + + if var.dtype in [ + core.VarDesc.VarType.FP16, + core.VarDesc.VarType.BF16, + core.VarDesc.VarType.FP64, + ]: + block.append_op( + type="cast", + inputs={"X": out_var}, + outputs={"Out": var}, + attrs={"in_dtype": out_var.dtype, "out_dtype": var.dtype}, + ) + + var.op = op + return op diff --git a/python/paddle/nn/initializer/__init__.py b/python/paddle/nn/initializer/__init__.py index e078e19ed2b4d..6ef516c8b6af5 100644 --- a/python/paddle/nn/initializer/__init__.py +++ b/python/paddle/nn/initializer/__init__.py @@ -13,9 +13,9 @@ # limitations under the License. # TODO: define the initializers to create a Parameter in neural network -from ...fluid.initializer import Bilinear # noqa: F401 from ...fluid.initializer import set_global_initializer # noqa: F401 -from ...fluid.initializer import calculate_gain # noqa: F401 + +from .Bilinear import Bilinear # noqa: F401 from .constant import Constant # noqa: F401 @@ -36,6 +36,15 @@ from .dirac import Dirac # noqa: F401 +from .initializer import Initializer, calculate_gain # noqa: F401 +from .uniform import UniformInitializer # noqa: F401 +from .constant import ConstantInitializer # noqa: F401 +from .normal import NormalInitializer # noqa: F401 +from .normal import TruncatedNormalInitializer # noqa: F401 +from .xavier import XavierInitializer # noqa: F401 +from .kaiming import MSRAInitializer # noqa: F401 +from .assign import NumpyArrayInitializer # noqa: F401 + __all__ = [ # noqa 'Bilinear', 'Constant', diff --git a/python/paddle/nn/initializer/assign.py b/python/paddle/nn/initializer/assign.py index 052da37af244e..3ab5a896e463a 100644 --- a/python/paddle/nn/initializer/assign.py +++ b/python/paddle/nn/initializer/assign.py @@ -12,20 +12,134 @@ # See the License for the specific language governing permissions and # limitations under the License. import paddle +from paddle import _C_ops +from ...fluid import core, framework, unique_name from ...fluid.data_feeder import check_type -from ...fluid.initializer import NumpyArrayInitializer +from ...fluid.framework import _current_expected_place, in_dygraph_mode +from .initializer import Initializer __all__ = [] +class NumpyArrayInitializer(Initializer): + """Init an parameter with an numpy array + This api initialize the tensor by numpy array. + + Args: + value (numpy): numpy array to initialize the tensor + + Returns: + A Tensor initialized by numpy. + + """ + + def __init__(self, value): + import numpy + + assert isinstance(value, numpy.ndarray) + super().__init__() + self._value = value + + def forward(self, var, block=None): + """Initialize the input tensor with Numpy array. + + Args: + var(Tensor): Tensor that needs to be initialized. + block(Block, optional): The block in which initialization ops + should be added. Used in static graph only, default None. 
+ + Returns: + The initialization op + """ + block = self._check_block(block) + + assert isinstance(var, framework.Variable) + assert isinstance(block, framework.Block) + + # to be compatible of fp16 initalizers + if var.dtype in [core.VarDesc.VarType.FP16, core.VarDesc.VarType.BF16]: + out_dtype = core.VarDesc.VarType.FP32 + np_value = self._value.astype("float32") + out_var = block.create_var( + name=unique_name.generate( + ".".join(['numpy_array_init', var.name, 'tmp']) + ), + shape=var.shape, + dtype=out_dtype, + type=core.VarDesc.VarType.LOD_TENSOR, + persistable=False, + ) + else: + out_var = var + out_dtype = var.dtype + np_value = self._value + + if out_dtype == core.VarDesc.VarType.FP32: + value_name = "fp32_values" + values = [float(v) for v in np_value.flat] + elif out_dtype == core.VarDesc.VarType.INT32: + value_name = "int32_values" + values = [int(v) for v in np_value.flat] + else: + raise ValueError("Unsupported dtype %s", self._value.dtype) + if self._value.size > 1024 * 1024 * 1024: + raise ValueError( + "The size of input is too big. Please consider " + "saving it to file and 'load_op' to load it" + ) + + if in_dygraph_mode(): + _C_ops.assign_value_( + out_var, + list(self._value.shape), + out_dtype, + values, + _current_expected_place(), + ) + if var.dtype in [ + core.VarDesc.VarType.FP16, + core.VarDesc.VarType.BF16, + ]: + var_tmp = _C_ops.cast(out_var, var.dtype) + var_tmp._share_underline_tensor_to(var) + else: + out_var._share_underline_tensor_to(var) + return None + else: + op = block.append_op( + type='assign_value', + outputs={'Out': out_var}, + attrs={ + 'dtype': out_dtype, + 'shape': list(self._value.shape), + value_name: values, + }, + stop_gradient=True, + ) + + if var.dtype in [ + core.VarDesc.VarType.FP16, + core.VarDesc.VarType.BF16, + ]: + block.append_op( + type="cast", + inputs={"X": out_var}, + outputs={"Out": var}, + attrs={"in_dtype": out_var.dtype, "out_dtype": var.dtype}, + ) + + var.op = op + return op + + class Assign(NumpyArrayInitializer): """Init an parameter with a numpy array, list, or tensor. Args: value (Tensor|numpy.ndarray|list|tuple): numpy array, list, tuple, or tensor to initialize the parameter. - name(str, optional): The default value is None. Normally there is no need for user to set this - property. For more information, please refer to :ref:`api_guide_Name`. + name(str, optional): Normally there is no need for user to set this + property. For more information, please refer to :ref:`api_guide_Name`. Default is None. Returns: A parameter initialized by the input numpy array, list, or tensor. diff --git a/python/paddle/nn/initializer/constant.py b/python/paddle/nn/initializer/constant.py index 637ae6299005c..0016467f117b0 100644 --- a/python/paddle/nn/initializer/constant.py +++ b/python/paddle/nn/initializer/constant.py @@ -12,12 +12,75 @@ # See the License for the specific language governing permissions and # limitations under the License. +from paddle import _C_ops + +from ...fluid import core, framework +from ...fluid.framework import _current_expected_place, in_dygraph_mode + # TODO: define the initializers of Constant in neural network -from ...fluid.initializer import ConstantInitializer +from .initializer import Initializer __all__ = [] +class ConstantInitializer(Initializer): + """Implements the constant initializer + + Args: + value (float32, optional): constant value to initialize the variable. Default: 0.0. 
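A brief usage sketch (editor's addition, not part of this patch) of the replacement pattern the test diffs above apply: the private NumpyArrayInitializer and ConstantInitializer now live under paddle.nn.initializer and are normally reached through their public Assign and Constant aliases. The layer and shapes below are illustrative only.

    import numpy as np
    import paddle

    custom_weight = np.random.randn(32, 4).astype("float32")  # illustrative shape
    weight_attr = paddle.ParamAttr(
        # replaces fluid.initializer.NumpyArrayInitializer(custom_weight)
        initializer=paddle.nn.initializer.Assign(custom_weight)
    )
    bias_attr = paddle.ParamAttr(
        # replaces fluid.initializer.ConstantInitializer(value=1)
        initializer=paddle.nn.initializer.Constant(value=1.0)
    )
    linear = paddle.nn.Linear(32, 4, weight_attr=weight_attr, bias_attr=bias_attr)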
+ + """ + + def __init__(self, value=0.0, force_cpu=False): + assert value is not None + super().__init__() + self._value = value + self._force_cpu = force_cpu + + def forward(self, var, block=None): + """Initialize the input tensor with constant. + + Args: + var(Tensor): Tensor that needs to be initialized. + block(Block, optional): The block in which initialization ops + should be added. Used in static graph only, default None. + + Returns: + The initialization op + """ + block = self._check_block(block) + + assert isinstance(var, framework.Variable) or isinstance( + var, framework.EagerParamBase + ) + assert isinstance(block, framework.Block) + + if in_dygraph_mode(): + place = _current_expected_place() + if self._force_cpu: + place = core.CPUPlace() + _C_ops.full_( + var, var.shape, str(float(self._value)), var.dtype, place + ) + return None + else: + op = block.append_op( + type="fill_constant", + outputs={"Out": var}, + attrs={ + "shape": var.shape, + "dtype": int(var.dtype), + "value": float(self._value), + 'str_value': str(float(self._value)), + 'force_cpu': self._force_cpu, + }, + stop_gradient=True, + ) + + var.op = op + return op + + class Constant(ConstantInitializer): """Implement the constant initializer. diff --git a/python/paddle/nn/initializer/dirac.py b/python/paddle/nn/initializer/dirac.py index 0917859415d36..3abcc300bc64e 100644 --- a/python/paddle/nn/initializer/dirac.py +++ b/python/paddle/nn/initializer/dirac.py @@ -20,7 +20,7 @@ from ...fluid.core import VarDesc from ...fluid.data_feeder import check_variable_and_dtype from ...fluid.framework import _current_expected_place -from ...fluid.initializer import Initializer +from .initializer import Initializer __all__ = [] diff --git a/python/paddle/nn/initializer/initializer.py b/python/paddle/nn/initializer/initializer.py new file mode 100644 index 0000000000000..c320fa68cd114 --- /dev/null +++ b/python/paddle/nn/initializer/initializer.py @@ -0,0 +1,159 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import functools +import math + +import numpy as np + +from ...fluid.framework import default_main_program, in_dygraph_mode +from ...fluid.lazy_init import lazy_init_helper + +__all__ = [] + + +class Initializer: + """Base class for parameter initializers + + Defines the common interface of parameter initializers. + They add operations to the init program that are used + to initialize parameter. Users should not use this class + directly, but need to use one of its implementations. 
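A hedged sketch (editor's addition) of the contract the Initializer base class above defines: concrete subclasses implement forward(var, block) and are applied through __call__, which in dygraph mode fills the parameter in place. The shape and value below are illustrative.

    import paddle

    paddle.disable_static()
    p = paddle.create_parameter(shape=[3, 4], dtype="float32")
    init = paddle.nn.initializer.Constant(0.5)  # one concrete implementation
    init(p)                                     # __call__ -> forward(p) in dygraph mode
    print(float(p.mean()))                      # expected to print 0.5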
+ """ + + def __init__(self): + pass + + def __call__(self, param, block=None): + if not lazy_init_helper().state: + return self.forward(param, block) + + return self._lazy_init(param, block) + + def forward(self, param, block=None): + """Add corresponding initialization operations to the network""" + raise NotImplementedError() + + def _lazy_init(self, param, block=None): + """ + Apply lazy initialization + """ + assert in_dygraph_mode() + + def init_op_creator(forward, param, block): + new_var = param._to_static_var(True, block=block) + # Record initializer operator + with lazy_init_helper(): + forward(new_var, block) + + # Add hook function for initializing param in dygraph mode + param.set_init_func(functools.partial(self.forward, param, block)) + param._init_op_creator = functools.partial( + init_op_creator, self.forward, param + ) + + return param + + def _check_block(self, block): + if block is None: + block = default_main_program().global_block() + + return block + + def _compute_fans(self, var): + """Compute the fan_in and the fan_out for layers + + This method computes the fan_in and the fan_out + for neural network layers, if not specified. It is + not possible to perfectly estimate fan_in and fan_out. + This method will estimate it correctly for matrix multiply and + convolutions. + + Args: + var: variable for which fan_in and fan_out have to be computed + + Returns: + tuple of two integers (fan_in, fan_out) + """ + shape = var.shape + if not shape or len(shape) == 0: + fan_in = fan_out = 1 + elif len(shape) == 1: + fan_in = fan_out = shape[0] + elif len(shape) == 2: + # This is the case for simple matrix multiply + fan_in = shape[0] + fan_out = shape[1] + else: + # Assume this to be a convolutional kernel + # In PaddlePaddle, the shape of the kernel is like: + # [num_filters, num_filter_channels, ...] where the remaining + # dimensions are the filter_size + receptive_field_size = np.prod(shape[2:]) + fan_in = shape[1] * receptive_field_size + fan_out = shape[0] * receptive_field_size + + return (fan_in, fan_out) + + +def calculate_gain(nonlinearity, param=None): + """ + Get the recommended ``gain`` value of some nonlinearity function. ``gain`` value can be used in some + ``paddle.nn.initializer`` api to adjust the initialization value. + + Args: + nonlinearity(str): name of nonlinearity activation function. If it is a linear function, such as: + `linear/conv1d/conv2d/conv3d/conv1d_transpose/conv2d_transpose/conv3d_transpose` , 1.0 will be returned. + param(bool|int|float, optional): optional parameter for somme nonlinearity function. Now, it only applies to + 'leaky_relu'. Default: None, it will be calculated as 0.01 in the formula. + + Returns: + A float value, which is the recommended gain for this nonlinearity function. + + Examples: + .. 
code-block:: python + + import paddle + gain = paddle.nn.initializer.calculate_gain('tanh') # 5.0 / 3 + gain = paddle.nn.initializer.calculate_gain('leaky_relu', param=1.0) # 1.0 = math.sqrt(2.0 / (1+param^2)) + initializer = paddle.nn.initializer.Orthogonal(gain) + + """ + if param is None: + param = 0.01 + else: + assert isinstance(param, (bool, int, float)) + param = float(param) + recommended_gain = { + 'sigmoid': 1, + 'linear': 1, + 'conv1d': 1, + 'conv2d': 1, + 'conv3d': 1, + 'conv1d_transpose': 1, + 'conv2d_transpose': 1, + 'conv3d_transpose': 1, + 'tanh': 5.0 / 3, + 'relu': math.sqrt(2.0), + 'leaky_relu': math.sqrt(2.0 / (1 + param**2)), + 'selu': 3.0 / 4, + } + if nonlinearity in recommended_gain.keys(): + return recommended_gain[nonlinearity] + else: + raise ValueError( + "nonlinearity function {} is not suppported now.".format( + nonlinearity + ) + ) diff --git a/python/paddle/nn/initializer/kaiming.py b/python/paddle/nn/initializer/kaiming.py index f214e46fa4b2b..c3a8732315db3 100644 --- a/python/paddle/nn/initializer/kaiming.py +++ b/python/paddle/nn/initializer/kaiming.py @@ -13,11 +13,185 @@ # limitations under the License. # TODO: define the initializers of Kaiming functions in neural network -from ...fluid.initializer import MSRAInitializer +import math + +from paddle import _C_ops + +from ...fluid import core, framework, unique_name +from ...fluid.framework import _current_expected_place, in_dygraph_mode +from .initializer import Initializer, calculate_gain __all__ = [] +class MSRAInitializer(Initializer): + r"""Implements the MSRA initializer a.k.a. Kaiming Initializer + + This class implements the weight initialization from the paper + `Delving Deep into Rectifiers: Surpassing Human-Level Performance on + ImageNet Classification `_ + by Kaiming He, Xiangyu Zhang, Shaoqing Ren and Jian Sun. This is a + robust initialization method that particularly considers the rectifier + nonlinearities. In case of Uniform distribution, the range is [-x, x], where + + .. math:: + + x = gain \times \sqrt{\frac{3}{fan\_in}} + + In case of Normal distribution, the mean is 0 and the standard deviation + is + + .. math:: + + \frac{gain}{\sqrt{{fan\_in}}} + + Args: + uniform (bool, optional): whether to use uniform or normal distribution. Default is True. + fan_in (float32|None, optional): fan_in (in_features) of trainable Tensor, If None, it will be infered automaticly. If you don't want to use in_features of the Tensor, you can set the value of 'fan_in' smartly by yourself. Default is None. + seed (int32, optional): random seed. Default is 0. + negative_slope (float, optional): negative_slope (only used with leaky_relu). Default is 0.0. + nonlinearity(str, optional): the non-linear function. Default is relu. + + Note: + It is recommended to set fan_in to None for most cases. + + """ + + def __init__( + self, + uniform=True, + fan_in=None, + seed=0, + negative_slope=0, + nonlinearity='relu', + ): + """Constructor for MSRAInitializer""" + assert uniform is not None + assert seed is not None + super().__init__() + self._uniform = uniform + self._fan_in = fan_in + self._seed = seed + self._negative_slope = negative_slope + self._nonlinearity = nonlinearity + + def forward(self, var, block=None): + """Initialize the input tensor with MSRA initialization. + + Args: + var(Tensor): Tensor that needs to be initialized. + block(Block, optional): The block in which initialization ops + should be added. Used in static graph only, default None. 
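A short usage sketch (editor's addition, not part of this patch): the MSRAInitializer above backs the public KaimingNormal and KaimingUniform wrappers shown later in this file, using the uniform bound gain * sqrt(3 / fan_in) and the normal std gain / sqrt(fan_in) from the class docstring. The conv layer below is illustrative only.

    import paddle

    w_attr = paddle.ParamAttr(
        initializer=paddle.nn.initializer.KaimingUniform()  # bound = gain * sqrt(3 / fan_in)
    )
    conv = paddle.nn.Conv2D(3, 16, kernel_size=3, weight_attr=w_attr)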
+ + Returns: + The initialization op + """ + block = self._check_block(block) + + assert isinstance(var, framework.Variable) + assert isinstance(block, framework.Block) + f_in, f_out = self._compute_fans(var) + + # If fan_in is passed, use it + fan_in = f_in if self._fan_in is None else self._fan_in + + if self._seed == 0: + self._seed = block.program.random_seed + + # to be compatible of fp16 initalizers + if var.dtype == core.VarDesc.VarType.FP16 or ( + var.dtype == core.VarDesc.VarType.BF16 and not self._uniform + ): + out_dtype = core.VarDesc.VarType.FP32 + out_var = block.create_var( + name=unique_name.generate( + ".".join(['masra_init', var.name, 'tmp']) + ), + shape=var.shape, + dtype=out_dtype, + type=core.VarDesc.VarType.LOD_TENSOR, + persistable=False, + ) + else: + out_dtype = var.dtype + out_var = var + + if in_dygraph_mode(): + if self._uniform: + gain = calculate_gain(self._nonlinearity, self._negative_slope) + limit = gain * math.sqrt(3.0 / float(fan_in)) + out_var = _C_ops.uniform( + var.shape, + out_dtype, + -limit, + limit, + self._seed, + _current_expected_place(), + ) + else: + gain = calculate_gain(self._nonlinearity, self._negative_slope) + std = gain / math.sqrt(float(fan_in)) + place = _current_expected_place() + out_var = _C_ops.gaussian( + out_var.shape, 0.0, std, self._seed, out_dtype, place + ) + + if var.dtype == core.VarDesc.VarType.FP16 or ( + var.dtype == core.VarDesc.VarType.BF16 and not self._uniform + ): + var_tmp = _C_ops.cast(out_var, var.dtype) + var_tmp._share_underline_tensor_to(var) + else: + out_var._share_underline_tensor_to(var) + return None + else: + if self._uniform: + gain = calculate_gain(self._nonlinearity, self._negative_slope) + limit = gain * math.sqrt(3.0 / float(fan_in)) + op = block.append_op( + type="uniform_random", + inputs={}, + outputs={"Out": out_var}, + attrs={ + "shape": out_var.shape, + "dtype": int(out_dtype), + "min": -limit, + "max": limit, + "seed": self._seed, + }, + stop_gradient=True, + ) + + else: + gain = calculate_gain(self._nonlinearity, self._negative_slope) + std = gain / math.sqrt(float(fan_in)) + op = block.append_op( + type="gaussian_random", + outputs={"Out": out_var}, + attrs={ + "shape": out_var.shape, + "dtype": int(out_dtype), + "mean": 0.0, + "std": std, + "seed": self._seed, + }, + stop_gradient=True, + ) + + if var.dtype == core.VarDesc.VarType.FP16 or ( + var.dtype == core.VarDesc.VarType.BF16 and not self._uniform + ): + block.append_op( + type="cast", + inputs={"X": out_var}, + outputs={"Out": var}, + attrs={"in_dtype": out_var.dtype, "out_dtype": var.dtype}, + ) + + var.op = op + return op + + class KaimingNormal(MSRAInitializer): r"""Implements the Kaiming Normal initializer @@ -36,9 +210,9 @@ class KaimingNormal(MSRAInitializer): \frac{gain}{\sqrt{{fan\_in}}} Args: - fan_in (float32|None, optional): fan_in (in_features) of trainable Tensor, If None, it will be infered automaticly. If you don't want to use in_features of the Tensor, you can set the value of 'fan_in' smartly by yourself. default is None. - negative_slope (float, optional): negative_slope (only used with leaky_relu). default is 0.0. - nonlinearity(str, optional): the non-linear function. default is relu. + fan_in (float32|None, optional): fan_in (in_features) of trainable Tensor, If None, it will be infered automaticly. If you don't want to use in_features of the Tensor, you can set the value of 'fan_in' smartly by yourself. Default is None. + negative_slope (float, optional): negative_slope (only used with leaky_relu). 
Default is 0.0. + nonlinearity(str, optional): the non-linear function. Default is relu. Note: It is recommended to set fan_in to None for most cases. @@ -84,9 +258,9 @@ class KaimingUniform(MSRAInitializer): x = gain \times \sqrt{\frac{3}{fan\_in}} Args: - fan_in (float32|None, optional): fan_in (in_features) of trainable Tensor, If None, it will be infered automaticly. If you don't want to use in_features of the Tensor, you can set the value of 'fan_in' smartly by yourself. default is None. - negative_slope (float, optional): negative_slope (only used with leaky_relu). default is 0.0. - nonlinearity(str, optional): the non-linear function. default is relu. + fan_in (float32|None, optional): fan_in (in_features) of trainable Tensor, If None, it will be infered automaticly. If you don't want to use in_features of the Tensor, you can set the value of 'fan_in' smartly by yourself. Default is None. + negative_slope (float, optional): negative_slope (only used with leaky_relu). Default is 0.0. + nonlinearity(str, optional): the non-linear function. Default is relu. Note: It is recommended to set fan_in to None for most cases. diff --git a/python/paddle/nn/initializer/normal.py b/python/paddle/nn/initializer/normal.py index 5ead30f4f1e3e..030ec95940db6 100644 --- a/python/paddle/nn/initializer/normal.py +++ b/python/paddle/nn/initializer/normal.py @@ -12,19 +12,99 @@ # See the License for the specific language governing permissions and # limitations under the License. -from ...fluid.initializer import NormalInitializer, TruncatedNormalInitializer +from paddle import _C_ops + +from ...fluid import core, framework, unique_name +from ...fluid.data_feeder import check_variable_and_dtype +from ...fluid.framework import _current_expected_place, in_dygraph_mode +from .initializer import Initializer __all__ = [] +class NormalInitializer(Initializer): + """Implements the Random Normal(Gaussian) distribution initializer + + Args: + loc (float, optional): mean of the normal distribution. Default is 0.0. + scale (float, optional): standard deviation of the normal distribution. Default is 1.0. + seed (int, optional): random seed. Default is 0. + + """ + + def __init__(self, loc=0.0, scale=1.0, seed=0): + assert loc is not None + assert scale is not None + assert seed is not None + super().__init__() + self._mean = loc + self._std_dev = scale + self._seed = seed + + def forward(self, var, block=None): + """Initialize the input tensor with Normal distribution. + + Args: + var(Tensor): Tensor that needs to be initialized. + block(Block, optional): The block in which initialization ops + should be added. Used in static graph only, default None. 
+ + Returns: + The initialization op + """ + block = self._check_block(block) + + assert isinstance(block, framework.Block) + + check_variable_and_dtype( + var, + "Out", + ["uint16", "float16", "float32", "float64"], + "guassian_random", + ) + + if self._seed == 0: + self._seed = block.program.random_seed + + if in_dygraph_mode(): + place = _current_expected_place() + out_var = _C_ops.gaussian( + var.shape, + self._mean, + self._std_dev, + self._seed, + var.dtype, + place, + ) + out_var._share_underline_tensor_to(var) + return None + + else: + op = block.append_op( + type="gaussian_random", + outputs={"Out": var}, + attrs={ + "shape": var.shape, + "dtype": var.dtype, + "mean": self._mean, + "std": self._std_dev, + "seed": self._seed, + "use_mkldnn": False, + }, + stop_gradient=True, + ) + var.op = op + return op + + class Normal(NormalInitializer): """The Random Normal (Gaussian) distribution initializer. Args: - mean (float, optional): mean of the normal distribution. The default value is 0.0. - std (float, optional): standard deviation of the normal distribution. The default value is 1.0. + mean (float, optional): mean of the normal distribution. Default is 0.0. + std (float, optional): standard deviation of the normal distribution. Default is 1.0. name(str, optional): The default value is None. Normally there is no need for user to set this - property. For more information, please refer to :ref:`api_guide_Name`. + property. For more information, please refer to :ref:`api_guide_Name`. Default: None. Returns: A parameter initialized by Random Normal (Gaussian) distribution. @@ -58,12 +138,113 @@ def __init__(self, mean=0.0, std=1.0, name=None): super().__init__(loc=mean, scale=std, seed=0) +class TruncatedNormalInitializer(Initializer): + """Implements the Random TruncatedNormal(Gaussian) distribution initializer + + Args: + loc (float, optional): Mean of the normal distribution. Default is :math:`0.0`. + scale (float, optional): Standard deviation of the normal distribution. Default is :math:`1.0`. + seed (int, optional): random seed. Default is 0. + + """ + + def __init__(self, loc=0.0, scale=1.0, seed=0): + assert loc is not None + assert scale is not None + assert seed is not None + super().__init__() + self._mean = loc + self._std_dev = scale + self._seed = seed + + def forward(self, var, block=None): + """Initialize the input tensor with TruncatedNormal distribution. + + Args: + var(Tensor): Tensor that needs to be initialized. + block(Block, optional): The block in which initialization ops + should be added. Used in static graph only, default None. 
+ + Returns: + The initialization op + """ + block = self._check_block(block) + + assert isinstance(var, framework.Variable) + assert isinstance(block, framework.Block) + + if self._seed == 0: + self._seed = block.program.random_seed + + # to be compatible of fp16 initalizers + if var.dtype in [core.VarDesc.VarType.FP16, core.VarDesc.VarType.BF16]: + out_dtype = core.VarDesc.VarType.FP32 + out_var = block.create_var( + name=unique_name.generate( + ".".join(['truncated_gaussian_random', var.name, 'tmp']) + ), + shape=var.shape, + dtype=out_dtype, + type=core.VarDesc.VarType.LOD_TENSOR, + persistable=False, + ) + else: + out_dtype = var.dtype + out_var = var + + if in_dygraph_mode(): + out_var = _C_ops.truncated_gaussian_random( + var.shape, + self._mean, + self._std_dev, + self._seed, + out_dtype, + _current_expected_place(), + ) + if var.dtype in [ + core.VarDesc.VarType.FP16, + core.VarDesc.VarType.BF16, + ]: + var_tmp = _C_ops.cast(out_var, var.dtype) + var_tmp._share_underline_tensor_to(var) + else: + out_var._share_underline_tensor_to(var) + return None + + else: + op = block.append_op( + type="truncated_gaussian_random", + outputs={"Out": out_var}, + attrs={ + "shape": var.shape, + "dtype": out_dtype, + "mean": self._mean, + "std": self._std_dev, + "seed": self._seed, + }, + stop_gradient=True, + ) + + if var.dtype in [ + core.VarDesc.VarType.FP16, + core.VarDesc.VarType.BF16, + ]: + block.append_op( + type="cast", + inputs={"X": out_var}, + outputs={"Out": var}, + attrs={"in_dtype": out_var.dtype, "out_dtype": var.dtype}, + ) + var.op = op + return op + + class TruncatedNormal(TruncatedNormalInitializer): """The truncated normal distribution (Gaussian distribution) initializer. Args: - mean (float, optional): Mean of the normal distribution. The default value is :math:`0.0`. - std (float, optional): Standard deviation of the normal distribution. The default value is :math:`1.0`. + mean (float, optional): Mean of the normal distribution. Default is :math:`0.0`. + std (float, optional): Standard deviation of the normal distribution. Default is :math:`1.0`. name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. Returns: diff --git a/python/paddle/nn/initializer/orthogonal.py b/python/paddle/nn/initializer/orthogonal.py index 0bbfd9eaaaa86..65a496f2b1069 100644 --- a/python/paddle/nn/initializer/orthogonal.py +++ b/python/paddle/nn/initializer/orthogonal.py @@ -18,7 +18,7 @@ from ...fluid import framework from ...fluid.data_feeder import check_variable_and_dtype from ...fluid.dygraph import no_grad -from ...fluid.initializer import Initializer +from .initializer import Initializer __all__ = [] diff --git a/python/paddle/nn/initializer/uniform.py b/python/paddle/nn/initializer/uniform.py index 011cb6eff6dfa..cd64a15b7519e 100644 --- a/python/paddle/nn/initializer/uniform.py +++ b/python/paddle/nn/initializer/uniform.py @@ -12,17 +12,144 @@ # See the License for the specific language governing permissions and # limitations under the License. 
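The Normal and TruncatedNormal initializers rewritten above, like the Uniform initializer in the hunk that follows, are normally consumed through paddle.ParamAttr rather than instantiated and called by hand. A minimal dygraph sketch, assuming only the public paddle.nn.initializer API these hunks re-export (the layer sizes and std values are illustrative, not taken from the patch):

import paddle

# Each parameter picks its own initializer via ParamAttr; the forward()
# paths defined above run when the parameter is created.
linear = paddle.nn.Linear(
    in_features=8,
    out_features=4,
    weight_attr=paddle.ParamAttr(
        initializer=paddle.nn.initializer.TruncatedNormal(mean=0.0, std=0.02)
    ),
    bias_attr=paddle.ParamAttr(
        initializer=paddle.nn.initializer.Normal(mean=0.0, std=1.0)
    ),
)
print(linear.weight.shape)  # [8, 4]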
-from ...fluid.initializer import UniformInitializer +from paddle import _C_ops + +from ...fluid import core, framework, unique_name +from ...fluid.data_feeder import check_variable_and_dtype +from ...fluid.framework import _current_expected_place, in_dygraph_mode +from .initializer import Initializer __all__ = [] +class UniformInitializer(Initializer): + """Implements the random uniform distribution initializer + + Args: + low (float, optional): Lower boundary of the uniform distribution. Default is :math:`-1.0`. + high (float, optional): Upper boundary of the uniform distribution. Default is :math:`1.0`. + seed (int, optional): Random seed. Default is 0. + diag_num (int, optional): the number of diagonal elements to initialize. + If set to 0, diagonal initialization will be not performed. Default is 0. + diag_step (int, optional): Step size between two diagonal elements, + which is generally the width of the square matrix. Default is 0. + diag_val (float, optional): the value of the diagonal element to be initialized, + default 1.0. It takes effect only if the diag_num is greater than 0. Default is :math:`1.0`. + + """ + + def __init__( + self, low=-1.0, high=1.0, seed=0, diag_num=0, diag_step=0, diag_val=1.0 + ): + assert low is not None + assert high is not None + assert high >= low + assert seed is not None + assert diag_num is not None + assert diag_step is not None + assert diag_val is not None + if diag_num > 0 or diag_step > 0: + assert diag_num > 0 and diag_step > 0 + super().__init__() + self._low = low + self._high = high + self._seed = seed + self._diag_num = diag_num + self._diag_step = diag_step + self._diag_val = diag_val + + def forward(self, var, block=None): + """Initialize the input tensor with Uniform distribution. + + Args: + var(Tensor): Tensor that needs to be initialized. + block(Block, optional): The block in which initialization ops + should be added. Used in static graph only, default None. 
+ + Returns: + The initialization op + """ + block = self._check_block(block) + + assert isinstance(block, framework.Block) + if not in_dygraph_mode(): + check_variable_and_dtype( + var, + "Out", + ["uint16", "float16", "float32", "float64"], + "uniform_random", + ) + + if self._seed == 0: + self._seed = block.program.random_seed + + # to be compatible of fp16 initializers + if var.dtype == core.VarDesc.VarType.FP16: + out_dtype = core.VarDesc.VarType.FP32 + out_var = block.create_var( + name=unique_name.generate( + ".".join(['uniform_random', var.name, 'tmp']) + ), + shape=var.shape, + dtype=out_dtype, + type=core.VarDesc.VarType.LOD_TENSOR, + persistable=False, + ) + else: + out_dtype = var.dtype + out_var = var + + if in_dygraph_mode(): + out_var = _C_ops.uniform( + var.shape, + out_dtype, + self._low, + self._high, + self._seed, + _current_expected_place(), + ) + if var.dtype == core.VarDesc.VarType.FP16: + var_tmp = _C_ops.cast(out_var, var.dtype) + var_tmp._share_underline_tensor_to(var) + else: + out_var._share_underline_tensor_to(var) + return None + else: + op = block.append_op( + type="uniform_random", + inputs={}, + outputs={"Out": out_var}, + attrs={ + "shape": var.shape, + "dtype": out_dtype, + "min": self._low, + "max": self._high, + "seed": self._seed, + "diag_num": self._diag_num, + "diag_step": self._diag_step, + "diag_val": self._diag_val, + }, + stop_gradient=True, + ) + + if var.dtype == core.VarDesc.VarType.FP16: + block.append_op( + type="cast", + inputs={"X": out_var}, + outputs={"Out": var}, + attrs={"in_dtype": out_var.dtype, "out_dtype": var.dtype}, + ) + + var.op = op + return op + + class Uniform(UniformInitializer): """The uniform distribution initializer. Args: - low (float, optional): Lower boundary of the uniform distribution. The default value is :math:`-1.0`. - high (float, optional): Upper boundary of the uniform distribution. The default value is :math:`1.0`. + low (float, optional): Lower boundary of the uniform distribution. Default is :math:`-1.0`. + high (float, optional): Upper boundary of the uniform distribution. Default is :math:`1.0`. name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. Returns: diff --git a/python/paddle/nn/initializer/xavier.py b/python/paddle/nn/initializer/xavier.py index 35e104edba111..6d17c029f587c 100644 --- a/python/paddle/nn/initializer/xavier.py +++ b/python/paddle/nn/initializer/xavier.py @@ -12,11 +12,183 @@ # See the License for the specific language governing permissions and # limitations under the License. -from ...fluid.initializer import XavierInitializer +import math + +from paddle import _C_ops + +from ...fluid import core, framework, unique_name +from ...fluid.data_feeder import check_variable_and_dtype +from ...fluid.framework import _current_expected_place, in_dygraph_mode +from .initializer import Initializer __all__ = [] +class XavierInitializer(Initializer): + r""" + This class implements the Xavier weight initializer from the paper + `Understanding the difficulty of training deep feedforward neural + networks `_ + by Xavier Glorot and Yoshua Bengio. + + This initializer is designed to keep the scale of the gradients + approximately same in all the layers. In case of Uniform distribution, + the range is [-x, x], where + + .. math:: + + x = \sqrt{\\frac{6.0}{fan\_in + fan\_out}} + + In case of Normal distribution, the mean is 0 and the standard deviation + is + + .. 
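A similar sketch for the uniform path, assuming the public Uniform wrapper defined later in this file on top of the UniformInitializer above; the bounds and the seed are illustrative assumptions:

import paddle

paddle.seed(2023)  # illustrative seed so the uniform_random draw is repeatable
conv = paddle.nn.Conv2D(
    in_channels=3,
    out_channels=16,
    kernel_size=3,
    weight_attr=paddle.ParamAttr(
        initializer=paddle.nn.initializer.Uniform(low=-0.1, high=0.1)
    ),
)
# Every element lands inside [low, high].
assert float(conv.weight.abs().max()) <= 0.1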
math:: + + \sqrt{\\frac{2.0}{fan\_in + fan\_out}} + + + Args: + uniform (bool, optional): whether to use uniform ,if False use normal distribution. Default is True. + fan_in (float, optional): fan_in for Xavier initialization. If None, it is + inferred from the variable. Default is None. + fan_out (float, optional): fan_out for Xavier initialization. If None, it is + inferred from the variable. Default is None. + seed (int, optional): Random seed. Default is 0. + + Note: + It is recommended to set fan_in and fan_out to None for most cases. + + """ + + def __init__(self, uniform=True, fan_in=None, fan_out=None, seed=0): + assert uniform is not None + assert seed is not None + super().__init__() + self._uniform = uniform + self._fan_in = fan_in + self._fan_out = fan_out + self._seed = seed + + def forward(self, var, block=None): + """Initialize the input tensor with Xavier initialization. + + Args: + var(Tensor): Tensor that needs to be initialized. + block(Block, optional): The block in which initialization ops + should be added. Used in static graph only, default None. + + Returns: + The initialization op + """ + block = self._check_block(block) + + assert isinstance(block, framework.Block) + check_variable_and_dtype( + var, + "Out", + ["uint16", "float16", "float32", "float64"], + "xavier_init", + ) + + f_in, f_out = self._compute_fans(var) + + # If fan_in and fan_out are passed, use them + fan_in = f_in if self._fan_in is None else self._fan_in + fan_out = f_out if self._fan_out is None else self._fan_out + + if self._seed == 0: + self._seed = block.program.random_seed + + # to be compatible of fp16 initalizers + if var.dtype == core.VarDesc.VarType.FP16 or ( + var.dtype == core.VarDesc.VarType.BF16 and not self._uniform + ): + out_dtype = core.VarDesc.VarType.FP32 + out_var = block.create_var( + name=unique_name.generate( + ".".join(['xavier_init', var.name, 'tmp']) + ), + shape=var.shape, + dtype=out_dtype, + type=core.VarDesc.VarType.LOD_TENSOR, + persistable=False, + ) + else: + out_dtype = var.dtype + out_var = var + + if in_dygraph_mode(): + if self._uniform: + limit = math.sqrt(6.0 / float(fan_in + fan_out)) + out_var = _C_ops.uniform( + out_var.shape, + out_dtype, + -limit, + limit, + self._seed, + _current_expected_place(), + ) + else: + std = math.sqrt(2.0 / float(fan_in + fan_out)) + + place = _current_expected_place() + out_var = _C_ops.gaussian( + out_var.shape, 0.0, std, self._seed, out_dtype, place + ) + + if var.dtype == core.VarDesc.VarType.FP16 or ( + var.dtype == core.VarDesc.VarType.BF16 and not self._uniform + ): + var_tmp = _C_ops.cast(out_var, var.dtype) + var_tmp._share_underline_tensor_to(var) + else: + out_var._share_underline_tensor_to(var) + return None + else: + if self._uniform: + limit = math.sqrt(6.0 / float(fan_in + fan_out)) + op = block.append_op( + type="uniform_random", + inputs={}, + outputs={"Out": out_var}, + attrs={ + "shape": out_var.shape, + "dtype": out_dtype, + "min": -limit, + "max": limit, + "seed": self._seed, + }, + stop_gradient=True, + ) + else: + std = math.sqrt(2.0 / float(fan_in + fan_out)) + op = block.append_op( + type="gaussian_random", + outputs={"Out": out_var}, + attrs={ + "shape": out_var.shape, + "dtype": out_var.dtype, + "mean": 0.0, + "std": std, + "seed": self._seed, + }, + stop_gradient=True, + ) + + if var.dtype == core.VarDesc.VarType.FP16 or ( + var.dtype == core.VarDesc.VarType.BF16 and not self._uniform + ): + block.append_op( + type="cast", + inputs={"X": out_var}, + outputs={"Out": var}, + attrs={"in_dtype": 
out_var.dtype, "out_dtype": var.dtype}, + ) + + var.op = op + return op + + class XavierNormal(XavierInitializer): r""" This class implements the Xavier weight initializer from the paper @@ -31,9 +203,9 @@ class XavierNormal(XavierInitializer): Args: fan_in (float, optional): fan_in for Xavier initialization, which is - inferred from the Tensor. The default value is None. + inferred from the Tensor. Default is None. fan_out (float, optional): fan_out for Xavier initialization, which is - inferred from the Tensor. The default value is None. + inferred from the Tensor. Default is None. name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. Returns: @@ -83,9 +255,9 @@ class XavierUniform(XavierInitializer): Args: fan_in (float, optional): fan_in for Xavier initialization, which is - inferred from the Tensor. The default value is None. + inferred from the Tensor. Default is None. fan_out (float, optional): fan_out for Xavier initialization, which is - inferred from the Tensor. The default value is None. + inferred from the Tensor. Default is None. name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. Returns: diff --git a/python/paddle/nn/layer/rnn.py b/python/paddle/nn/layer/rnn.py index 2617c76ae6e79..4bf31ca30ea28 100644 --- a/python/paddle/nn/layer/rnn.py +++ b/python/paddle/nn/layer/rnn.py @@ -20,15 +20,20 @@ import paddle from paddle import _C_ops, _legacy_C_ops, framework, in_dynamic_mode +from paddle.common_ops_import import Variable from paddle.fluid.data_feeder import check_type, check_variable_and_dtype -from paddle.fluid.framework import _non_static_mode, in_dygraph_mode +from paddle.fluid.framework import ( + _non_static_mode, + default_startup_program, + in_dygraph_mode, + program_guard, +) from paddle.fluid.layers import control_flow, sequence_lod, utils from paddle.fluid.layers.utils import flatten, map_structure from paddle.framework import core from paddle.nn import Layer from paddle.nn import functional as F from paddle.nn import initializer as I -from paddle.static import Variable, default_startup_program, program_guard from paddle.tensor.manipulation import tensor_array_to_tensor from .container import LayerList diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py index cad226952be41..d9e1cd456042c 100644 --- a/python/paddle/optimizer/optimizer.py +++ b/python/paddle/optimizer/optimizer.py @@ -34,7 +34,6 @@ from ..fluid import framework, unique_name from ..fluid.backward import _get_no_grad_set_name, append_backward from ..fluid.framework import Parameter, program_guard -from ..fluid.initializer import Constant from ..fluid.layer_helper import LayerHelper from .lr import LRScheduler @@ -453,7 +452,8 @@ def _create_global_learning_rate(self): lr_value = float(self._learning_rate()) self.helper.set_variable_initializer( - lr_var, initializer=Constant(value=lr_value) + lr_var, + initializer=paddle.nn.initializer.Constant(value=lr_value), ) elif isinstance(self._learning_rate, float): # only create global lr_var once @@ -726,7 +726,10 @@ def _add_accumulator( else: with device_guard(device): self.helper.set_variable_initializer( - var, initializer=Constant(value=float(fill_value)) + var, + initializer=paddle.nn.initializer.Constant( + value=float(fill_value) + ), ) if framework._non_static_mode(): diff --git a/python/paddle/static/nn/common.py b/python/paddle/static/nn/common.py index 
1581f299214df..ef49b5642a37c 100644 --- a/python/paddle/static/nn/common.py +++ b/python/paddle/static/nn/common.py @@ -28,9 +28,9 @@ from paddle.fluid import core from paddle.fluid.data_feeder import check_dtype from paddle.fluid.framework import Variable, _non_static_mode, static_only -from paddle.fluid.initializer import Constant, Normal from paddle.fluid.layers.layer_function_generator import templatedoc from paddle.fluid.param_attr import ParamAttr +from paddle.nn.initializer import Constant, Normal __all__ = [] @@ -1012,7 +1012,7 @@ def _get_default_param_initializer(): "filter size.".format(filter_elem_num) ) std = (2.0 / filter_elem_num) ** 0.5 - return Normal(0.0, std, 0) + return Normal(0.0, std) filter_param = helper.create_parameter( attr=helper.param_attr, @@ -1315,7 +1315,7 @@ def _get_default_param_initializer(): ) std = (2.0 / filter_elem_num) ** 0.5 - return Normal(0.0, std, 0) + return Normal(0.0, std) filter_param = helper.create_parameter( attr=helper.param_attr, @@ -2286,7 +2286,7 @@ def _get_default_param_initializer(): "filter size.".format(filter_elem_num) ) std = (2.0 / filter_elem_num) ** 0.5 - return paddle.nn.initializer.normal.NormalInitializer(0.0, std, 0) + return paddle.nn.initializer.normal.Normal(0.0, std) filter_param = helper.create_parameter( attr=helper.param_attr, @@ -2757,7 +2757,7 @@ def batch_norm( attr=helper.param_attr, shape=param_shape, dtype=dtype, - default_initializer=paddle.fluid.initializer.Constant(1.0), + default_initializer=paddle.nn.initializer.Constant(1.0), ) bias = helper.create_parameter( attr=helper.bias_attr, shape=param_shape, dtype=dtype, is_bias=True @@ -2766,7 +2766,7 @@ def batch_norm( mean = helper.create_parameter( attr=paddle.ParamAttr( name=moving_mean_name, - initializer=paddle.fluid.initializer.Constant(0.0), + initializer=paddle.nn.initializer.Constant(0.0), trainable=False, do_model_average=do_model_average_for_mean_and_var, ), @@ -2778,7 +2778,7 @@ def batch_norm( variance = helper.create_parameter( attr=paddle.ParamAttr( name=moving_variance_name, - initializer=paddle.fluid.initializer.Constant(1.0), + initializer=paddle.nn.initializer.Constant(1.0), trainable=False, do_model_average=do_model_average_for_mean_and_var, ), diff --git a/python/paddle/static/nn/loss.py b/python/paddle/static/nn/loss.py index 20c7641e2d9de..3f464928c289d 100644 --- a/python/paddle/static/nn/loss.py +++ b/python/paddle/static/nn/loss.py @@ -16,12 +16,12 @@ import numpy as np from paddle.fluid.framework import static_only -from paddle.fluid.initializer import NumpyArrayInitializer # TODO: define loss functions of neural network from paddle.fluid.layer_helper import LayerHelper from paddle.fluid.layers.layer_function_generator import templatedoc from paddle.fluid.param_attr import ParamAttr +from paddle.nn.initializer import Assign from ...fluid.data_feeder import check_variable_and_dtype @@ -209,7 +209,7 @@ def _init_by_numpy_array(numpy_array): attr=ParamAttr(), shape=numpy_array.shape, dtype=numpy_array.dtype, - default_initializer=NumpyArrayInitializer(numpy_array), + default_initializer=Assign(numpy_array), ) ret.stop_gradient = True return ret diff --git a/python/paddle/static/nn/metric.py b/python/paddle/static/nn/metric.py index 7406525b9df0a..bcb3cfc130fcd 100644 --- a/python/paddle/static/nn/metric.py +++ b/python/paddle/static/nn/metric.py @@ -18,9 +18,9 @@ from paddle import _legacy_C_ops from paddle.fluid.data_feeder import check_variable_and_dtype from paddle.fluid.framework import Variable, _non_static_mode, _varbase_creator 
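The static/nn/common.py hunks above swap the removed three-argument fluid Normal(0.0, std, 0) for the two-argument paddle.nn.initializer.Normal, which takes no seed. A hedged sketch of the same default-initializer pattern; the helper name and sizes here are illustrative, not part of the patch:

import paddle

def _default_conv_weight_initializer(kernel_size, in_channels):
    # Mirrors the _get_default_param_initializer pattern above: std is derived
    # from the filter size, and the new public Normal takes mean/std only.
    filter_elem_num = in_channels * kernel_size[0] * kernel_size[1]
    std = (2.0 / filter_elem_num) ** 0.5
    return paddle.nn.initializer.Normal(0.0, std)

weight_attr = paddle.ParamAttr(
    initializer=_default_conv_weight_initializer((3, 3), in_channels=16)
)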
-from paddle.fluid.initializer import Constant from paddle.fluid.layer_helper import LayerHelper from paddle.fluid.layers import tensor +from paddle.nn.initializer import ConstantInitializer __all__ = [] @@ -266,7 +266,8 @@ def auc( for var in [batch_stat_pos, batch_stat_neg, stat_pos, stat_neg]: helper.set_variable_initializer( - var, Constant(value=0.0, force_cpu=False) + var, + ConstantInitializer(value=0.0, force_cpu=False), ) # "InsTagWeight": [ins_tag_weight] diff --git a/python/paddle/tensor/array.py b/python/paddle/tensor/array.py index 70b606c3c6fbe..84fc94b5eec85 100644 --- a/python/paddle/tensor/array.py +++ b/python/paddle/tensor/array.py @@ -14,9 +14,9 @@ # Define functions about array. +from ..common_ops_import import Variable from ..fluid.data_feeder import check_type, check_variable_and_dtype from ..framework import LayerHelper, core, in_dygraph_mode -from ..static import Variable __all__ = [] diff --git a/python/paddle/tensor/attribute.py b/python/paddle/tensor/attribute.py index 37a1aaf3c86d9..c79c9553c2f08 100644 --- a/python/paddle/tensor/attribute.py +++ b/python/paddle/tensor/attribute.py @@ -19,10 +19,10 @@ import paddle from paddle import _C_ops +from ..common_ops_import import Variable from ..fluid.data_feeder import check_type, check_variable_and_dtype from ..fluid.framework import in_dygraph_mode from ..framework import LayerHelper, core -from ..static import Variable from .creation import _complex_to_real_dtype, assign __all__ = [] diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py index 7523845c2b8b2..808e4d86d6032 100644 --- a/python/paddle/tensor/creation.py +++ b/python/paddle/tensor/creation.py @@ -35,7 +35,6 @@ _in_eager_without_dygraph_check, device_guard, ) -from ..fluid.initializer import Constant, Initializer from ..fluid.layers import utils from ..fluid.param_attr import ParamAttr from ..framework import ( @@ -140,7 +139,10 @@ def create_global_var( stop_gradient=True, ) helper.set_variable_initializer( - var, initializer=Constant(value=float(value), force_cpu=force_cpu) + var, + initializer=paddle.nn.initializer.ConstantInitializer( + value=float(value), force_cpu=force_cpu + ), ) return var @@ -214,7 +216,7 @@ def create_parameter( check_type( default_initializer, 'default_initializer', - (type(None), Initializer), + (type(None), paddle.nn.initializer.Initializer), 'create_parameter', ) diff --git a/python/paddle/tensor/layer_function_generator.py b/python/paddle/tensor/layer_function_generator.py index 299e41d2aea94..6d9c5fe288057 100644 --- a/python/paddle/tensor/layer_function_generator.py +++ b/python/paddle/tensor/layer_function_generator.py @@ -19,6 +19,7 @@ from paddle import _C_ops, _legacy_C_ops +from ..common_ops_import import Variable from ..fluid.data_feeder import check_variable_and_dtype from ..fluid.proto import framework_pb2 from ..framework import ( @@ -28,7 +29,6 @@ core, in_dygraph_mode, ) -from ..static import Variable __all__ = [] diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py index 10c8c24a78724..c59202977fde9 100644 --- a/python/paddle/tensor/linalg.py +++ b/python/paddle/tensor/linalg.py @@ -18,13 +18,13 @@ from paddle import _C_ops from paddle.common_ops_import import VarDesc +from ..common_ops_import import Variable from ..fluid.data_feeder import ( check_dtype, check_type, check_variable_and_dtype, ) from ..framework import LayerHelper, in_dygraph_mode -from ..static import Variable from .creation import full from .logic import logical_not from .manipulation 
import cast diff --git a/python/paddle/tensor/logic.py b/python/paddle/tensor/logic.py old mode 100755 new mode 100644 index 375f3614e5e30..ad6c30e319a81 --- a/python/paddle/tensor/logic.py +++ b/python/paddle/tensor/logic.py @@ -16,9 +16,9 @@ import paddle +from ..common_ops_import import Variable from ..fluid.data_feeder import check_type, check_variable_and_dtype from ..fluid.framework import global_var -from ..static import Variable from .layer_function_generator import templatedoc if global_var._in_eager_mode_: diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py index b5308e6cee63d..b9feee2fe1dd9 100644 --- a/python/paddle/tensor/manipulation.py +++ b/python/paddle/tensor/manipulation.py @@ -20,7 +20,7 @@ from paddle import _C_ops from paddle.utils.inplace_utils import inplace_apis_in_dygraph_only -from ..common_ops_import import fill_constant +from ..common_ops_import import Variable, fill_constant from ..fluid.data_feeder import ( check_dtype, check_type, @@ -35,7 +35,6 @@ dygraph_only, in_dygraph_mode, ) -from ..static import Variable from .creation import _complex_to_real_dtype, _real_to_complex_dtype, zeros __all__ = [] diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index 81b092f4c38b4..6f797b82e1d08 100644 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -25,6 +25,7 @@ # TODO: define math functions from paddle.utils.inplace_utils import inplace_apis_in_dygraph_only +from ..common_ops_import import Variable from ..fluid.data_feeder import ( check_dtype, check_type, @@ -38,7 +39,6 @@ core, in_dygraph_mode, ) -from ..static import Variable from .creation import _complex_to_real_dtype from .layer_function_generator import generate_layer_fn, templatedoc from .manipulation import cast diff --git a/python/paddle/tensor/random.py b/python/paddle/tensor/random.py index 59958df236131..ff48780423fd6 100644 --- a/python/paddle/tensor/random.py +++ b/python/paddle/tensor/random.py @@ -16,8 +16,8 @@ import paddle from paddle import _C_ops, _legacy_C_ops +from paddle.common_ops_import import Variable from paddle.fluid.framework import _current_expected_place, in_dygraph_mode -from paddle.static import Variable from ..fluid.data_feeder import ( check_dtype, diff --git a/python/paddle/tensor/stat.py b/python/paddle/tensor/stat.py index cc94aee415541..f9784478393dc 100644 --- a/python/paddle/tensor/stat.py +++ b/python/paddle/tensor/stat.py @@ -18,9 +18,9 @@ from paddle import _C_ops, _legacy_C_ops from paddle.fluid.framework import in_dygraph_mode +from ..common_ops_import import Variable from ..fluid.data_feeder import check_type, check_variable_and_dtype from ..framework import LayerHelper, core -from ..static import Variable from .math import _get_reduce_axis_with_tensor from .search import where diff --git a/python/paddle/vision/ops.py b/python/paddle/vision/ops.py index 0d43bd0fc54ce..2cd582884abf4 100755 --- a/python/paddle/vision/ops.py +++ b/python/paddle/vision/ops.py @@ -19,11 +19,11 @@ from ..fluid.data_feeder import check_type, check_variable_and_dtype from ..fluid.framework import Variable, in_dygraph_mode -from ..fluid.initializer import Normal from ..fluid.layer_helper import LayerHelper from ..fluid.layers import utils from ..framework import _current_expected_place from ..nn import BatchNorm2D, Conv2D, Layer, ReLU, Sequential +from ..nn.initializer import Normal __all__ = [ # noqa 'yolo_loss', @@ -1120,7 +1120,7 @@ def __init__( def _get_default_param_initializer(): filter_elem_num = 
np.prod(self._kernel_size) * self._in_channels std = (2.0 / filter_elem_num) ** 0.5 - return Normal(0.0, std, 0) + return Normal(0.0, std) self.weight = self.create_parameter( shape=filter_shape, From ec6e0a2c117d0763fe5e6d0eeff238bf4bd5b97b Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Thu, 2 Feb 2023 09:43:40 +0800 Subject: [PATCH 86/89] jit layer optimzer model param memory usage (#50135) * jit layer support multi thread --- paddle/fluid/jit/engine/interpreter_engine.cc | 7 ++--- paddle/fluid/jit/engine/interpreter_engine.h | 4 +-- paddle/fluid/jit/engine/predictor_engine.cc | 26 +++++++++++-------- paddle/fluid/jit/engine/predictor_engine.h | 3 ++- paddle/fluid/jit/function_utils.cc | 6 ++--- paddle/fluid/jit/function_utils.h | 4 +-- paddle/fluid/jit/layer.cc | 8 +++--- paddle/fluid/jit/layer.h | 8 +++--- paddle/fluid/jit/serializer.cc | 26 +++++++++++-------- paddle/fluid/jit/serializer.h | 4 +-- 10 files changed, 53 insertions(+), 43 deletions(-) diff --git a/paddle/fluid/jit/engine/interpreter_engine.cc b/paddle/fluid/jit/engine/interpreter_engine.cc index 36f8a2271d1ef..b16d0c98dee81 100644 --- a/paddle/fluid/jit/engine/interpreter_engine.cc +++ b/paddle/fluid/jit/engine/interpreter_engine.cc @@ -25,9 +25,10 @@ namespace paddle { namespace jit { -InterpreterEngine::InterpreterEngine(const std::shared_ptr &info, - const VariableMap ¶ms_dict, - const phi::Place &place) +InterpreterEngine::InterpreterEngine( + const std::shared_ptr &info, + const std::shared_ptr ¶ms_dict, + const phi::Place &place) : info_(info), params_dict_(params_dict), place_(place) { info_->RemoveDescFeedFetch(); PADDLE_ENFORCE_GT( diff --git a/paddle/fluid/jit/engine/interpreter_engine.h b/paddle/fluid/jit/engine/interpreter_engine.h index d7aa5d610a50e..367bc1b86dcc6 100644 --- a/paddle/fluid/jit/engine/interpreter_engine.h +++ b/paddle/fluid/jit/engine/interpreter_engine.h @@ -36,7 +36,7 @@ using InterpreterCore = framework::InterpreterCore; class InterpreterEngine : public BaseEngine { public: InterpreterEngine(const std::shared_ptr &info, - const VariableMap ¶ms_dict, + const std::shared_ptr ¶ms_dict, const phi::Place &place); ~InterpreterEngine() noexcept {} @@ -54,7 +54,7 @@ class InterpreterEngine : public BaseEngine { private: std::shared_ptr info_; - VariableMap params_dict_; + std::shared_ptr params_dict_; framework::Scope scope_; phi::Place place_; std::shared_ptr inner_interpreter_; diff --git a/paddle/fluid/jit/engine/predictor_engine.cc b/paddle/fluid/jit/engine/predictor_engine.cc index bac6f993b04f6..d18f4f487dbe2 100644 --- a/paddle/fluid/jit/engine/predictor_engine.cc +++ b/paddle/fluid/jit/engine/predictor_engine.cc @@ -27,11 +27,15 @@ static bool PaddleTensorToDenseTensor(const PaddleTensor &pt, DenseTensor *t, const platform::Place &place); -PredictorEngine::PredictorEngine(const std::shared_ptr &info, - const VariableMap ¶ms_dict, - const phi::Place &place) - : info_(info), scope_(new framework::Scope()), place_(place) { - utils::ShareParamsIntoScope(info_->ParamNames(), params_dict, scope_.get()); +PredictorEngine::PredictorEngine( + const std::shared_ptr &info, + const std::shared_ptr ¶ms_dict, + const phi::Place &place) + : info_(info), + params_dict_(params_dict), + scope_(new framework::Scope()), + place_(place) { + utils::ShareParamsIntoScope(info_->ParamNames(), params_dict_, scope_.get()); VLOG(6) << framework::GenScopeTreeDebugInfo(scope_.get()); // TODO(Aurelius84): Expose AnalysisConfig to user. 
@@ -66,6 +70,12 @@ PredictorEngine::PredictorEngine( predictor_(std::dynamic_pointer_cast( predictor)) {} +std::unique_ptr PredictorEngine::Clone(void *stream) { + auto *x = new PredictorEngine( + info_, scope_, place_, std::move(predictor_->Clone(stream))); + return std::unique_ptr(x); +} + std::vector PredictorEngine::operator()( const std::vector &inputs) { auto dense_tensors = utils::ToDenseTensors(inputs); @@ -199,11 +209,5 @@ static bool PaddleTensorToDenseTensor(const PaddleTensor &pt, return true; } -std::unique_ptr PredictorEngine::Clone(void *stream) { - auto *x = new PredictorEngine( - info_, scope_, place_, std::move(predictor_->Clone(stream))); - return std::unique_ptr(x); -} - } // namespace jit } // namespace paddle diff --git a/paddle/fluid/jit/engine/predictor_engine.h b/paddle/fluid/jit/engine/predictor_engine.h index ad07a7a7ffbf5..b2da6f4210a37 100644 --- a/paddle/fluid/jit/engine/predictor_engine.h +++ b/paddle/fluid/jit/engine/predictor_engine.h @@ -31,7 +31,7 @@ namespace jit { class PredictorEngine : public BaseEngine { public: PredictorEngine(const std::shared_ptr &info, - const VariableMap ¶ms_dict, + const std::shared_ptr ¶ms_dict, const phi::Place &place); PredictorEngine(const std::shared_ptr &info, @@ -50,6 +50,7 @@ class PredictorEngine : public BaseEngine { private: std::shared_ptr info_; + std::shared_ptr params_dict_; std::shared_ptr scope_; phi::Place place_; std::shared_ptr predictor_; diff --git a/paddle/fluid/jit/function_utils.cc b/paddle/fluid/jit/function_utils.cc index b67b5ba5b0518..3bd8c23411315 100644 --- a/paddle/fluid/jit/function_utils.cc +++ b/paddle/fluid/jit/function_utils.cc @@ -71,18 +71,18 @@ void ShareIntoScope(const std::vector &ordered_input_names, } void ShareParamsIntoScope(const std::vector ¶m_names, - const VariableMap ¶ms_dict, + const std::shared_ptr ¶ms_dict, framework::Scope *scope) { for (size_t i = 0; i < param_names.size(); ++i) { std::string name = param_names[i]; - PADDLE_ENFORCE_EQ(params_dict.count(name), + PADDLE_ENFORCE_EQ(params_dict->count(name), 1, phi::errors::InvalidArgument( "Parameter named %s is not existed in params_dict. 
" "Please check that your model was saved correctly", name)); - auto ¶m = params_dict.find(name)->second; + auto ¶m = params_dict->find(name)->second; auto &dense_tensor = param->Get(); auto *var = scope->Var(name); auto *dst_tensor = var->GetMutable(); diff --git a/paddle/fluid/jit/function_utils.h b/paddle/fluid/jit/function_utils.h index d61b720cec88f..5daa5ada200f4 100644 --- a/paddle/fluid/jit/function_utils.h +++ b/paddle/fluid/jit/function_utils.h @@ -51,14 +51,14 @@ void ShareIntoScope(const std::vector &ordered_input_names, framework::Scope *scope); void ShareParamsIntoScope(const std::vector ¶m_names, - const VariableMap ¶ms_dict, + const std::shared_ptr ¶ms_dict, framework::Scope *scope); void RemoveFeedFetch(framework::ProgramDesc *program_desc); template std::shared_ptr MakeEngine(const std::shared_ptr &info, - const VariableMap ¶ms_dict, + const std::shared_ptr ¶ms_dict, const phi::Place &place) { return std::make_shared(info, params_dict, place); } diff --git a/paddle/fluid/jit/layer.cc b/paddle/fluid/jit/layer.cc index 332c53a8e3649..2e8dba0f5a731 100644 --- a/paddle/fluid/jit/layer.cc +++ b/paddle/fluid/jit/layer.cc @@ -26,8 +26,8 @@ namespace paddle { namespace jit { -Layer::Layer(const VariableMap& params_map, - const VariableMap& attrs_map, +Layer::Layer(const std::shared_ptr& params_map, + const std::shared_ptr& attrs_map, const FunctionInfoMap& info_map, const phi::Place& place) : params_map_(params_map), @@ -80,12 +80,12 @@ std::vector Layer::FunctionNames() const { #define PD_SPECIALZE_ATTRIBUTE_TYPE(T) \ template <> \ T Layer::Attribute(const std::string& name) const { \ - if (attrs_map_.find(name) == attrs_map_.end()) { \ + if (attrs_map_->find(name) == attrs_map_->end()) { \ PADDLE_THROW(phi::errors::NotFound( \ "Attribute can not found %s, please check if it exists.")); \ return T(); \ } \ - auto var = attrs_map_.at(name); \ + auto var = attrs_map_->at(name); \ T ret = var->Get(); \ return ret; \ } diff --git a/paddle/fluid/jit/layer.h b/paddle/fluid/jit/layer.h index ed8b739a0b72f..4f76a41d06f3e 100644 --- a/paddle/fluid/jit/layer.h +++ b/paddle/fluid/jit/layer.h @@ -43,8 +43,8 @@ using FunctionInfoMap = class Layer { public: - Layer(const VariableMap& params_map, - const VariableMap& attrs_map_, + Layer(const std::shared_ptr& params_map, + const std::shared_ptr& attrs_map_, const FunctionInfoMap& info_map, const phi::Place& place); @@ -70,8 +70,8 @@ class Layer { std::shared_ptr Clone(void* stream = nullptr); private: - VariableMap params_map_; - VariableMap attrs_map_; + std::shared_ptr params_map_; + std::shared_ptr attrs_map_; FunctionInfoMap info_map_; phi::Place place_; std::shared_ptr unit_; diff --git a/paddle/fluid/jit/serializer.cc b/paddle/fluid/jit/serializer.cc index 436717a8dc389..21a187ad67100 100644 --- a/paddle/fluid/jit/serializer.cc +++ b/paddle/fluid/jit/serializer.cc @@ -58,12 +58,12 @@ Layer Deserializer::operator()(const std::string& path, info_map[func_name]->SetProgramFilePath(it.second); } - VariableMap params_dict; - VariableMap attrs_dict; - ReadTensorData(path + PDPARAMS_SUFFIX, param_names_set, place, ¶ms_dict); + auto params_dict = std::make_shared(); + auto attrs_dict = std::make_shared(); + ReadTensorData(path + PDPARAMS_SUFFIX, param_names_set, place, params_dict); if (utils::FileExists(path + PROPERTY_SUFFIX)) { - ReadAttributeData(path + PROPERTY_SUFFIX, &attrs_dict); + ReadAttributeData(path + PROPERTY_SUFFIX, attrs_dict); VLOG(3) << "Read Property Success!"; } @@ -90,10 +90,11 @@ Layer Deserializer::operator()(const 
std::string& path, return layer; } -void Deserializer::ReadTensorData(const std::string& file_name, - const std::set& var_name, - const phi::Place& place, - VariableMap* params_dict) const { +void Deserializer::ReadTensorData( + const std::string& file_name, + const std::set& var_name, + const phi::Place& place, + std::shared_ptr params_dict) const { VLOG(3) << "ReadTensorData from: " << file_name; std::ifstream fin(file_name, std::ios::binary); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); @@ -108,12 +109,15 @@ void Deserializer::ReadTensorData(const std::string& file_name, } } -void Deserializer::ReadAttributeData(const std::string& file_path, - VariableMap* attrs_dict) const { +void Deserializer::ReadAttributeData( + const std::string& file_path, + std::shared_ptr attrs_dict) const { VLOG(3) << "ReadPropertyData from: " << file_path; Property p; p.Deserialization(file_path); - *attrs_dict = static_cast(p.Values()); + for (auto& it : p.Values()) { + attrs_dict->emplace(it.first, it.second); + } return; } diff --git a/paddle/fluid/jit/serializer.h b/paddle/fluid/jit/serializer.h index b93eaa44fe632..926e9a6afda37 100644 --- a/paddle/fluid/jit/serializer.h +++ b/paddle/fluid/jit/serializer.h @@ -55,11 +55,11 @@ class Deserializer { void ReadTensorData(const std::string& file_name, const std::set& var_name, const phi::Place& place, - VariableMap* params_dict) const; + std::shared_ptr params_dict) const; // property pb void ReadAttributeData(const std::string& file_path, - VariableMap* attrs_dict) const; + std::shared_ptr attrs_dict) const; // void ReadExtraInfo(const std::string& file_name) const; From 14dd68e1d7d05552f0f5de02adb5de76271f71d0 Mon Sep 17 00:00:00 2001 From: liuruyan <44316842+liuruyan@users.noreply.github.com> Date: Thu, 2 Feb 2023 10:15:18 +0800 Subject: [PATCH 87/89] Fix the FP16 precision problem of add_n. (#50129) --- paddle/phi/kernels/gpu/add_n_kernel.cu | 20 +++--- .../fluid/tests/unittests/test_add_n_op.py | 64 +++++++++++++++++++ 2 files changed, 75 insertions(+), 9 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/test_add_n_op.py diff --git a/paddle/phi/kernels/gpu/add_n_kernel.cu b/paddle/phi/kernels/gpu/add_n_kernel.cu index f32ba597f5b68..69bc248a7e2f2 100644 --- a/paddle/phi/kernels/gpu/add_n_kernel.cu +++ b/paddle/phi/kernels/gpu/add_n_kernel.cu @@ -14,11 +14,10 @@ #include "paddle/phi/kernels/add_n_kernel.h" -#include "paddle/phi/kernels/impl/add_n_kernel_impl.h" - #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/memory/memcpy.h" - +#include "paddle/phi/common/amp_type_traits.h" +#include "paddle/phi/kernels/impl/add_n_kernel_impl.h" namespace phi { #define CEIL_DIV(x, y) (((x) + (y)-1) / (y)) @@ -38,16 +37,18 @@ __global__ void Sum2CUDAKernel(const T *in_0, template __global__ void SumArrayCUDAKernel( T **in, T *out, int64_t N, size_t in_size, bool read_dst) { + using MPType = typename phi::dtype::MPTypeTrait::Type; int id = blockIdx.x * blockDim.x + threadIdx.x; while (id < N) { - T total(read_dst ? out[id] : static_cast(0)); + MPType total(read_dst ? 
static_cast(out[id]) + : static_cast(0)); for (int i = 0; i < in_size; ++i) { const T *tmp = in[i]; if (tmp) { - total += tmp[id]; + total += static_cast(tmp[id]); } } - out[id] = total; + out[id] = static_cast(total); id += blockDim.x * gridDim.x; } } @@ -116,11 +117,12 @@ void AddNKernel(const Context &dev_ctx, int64_t length_0 = in_0.numel(); int64_t length_1 = in_1.numel(); if (length_0 && length_1 && in_0.IsInitialized() && in_1.IsInitialized()) { + using MPType = typename phi::dtype::MPTypeTrait::Type; auto result = EigenVector::Flatten(*out); auto &place = *dev_ctx.eigen_device(); - auto in_0_e = EigenVector::Flatten(in_0); - auto in_1_e = EigenVector::Flatten(in_1); - result.device(place) = in_0_e + in_1_e; + auto in_0_e = EigenVector::Flatten(in_0).template cast(); + auto in_1_e = EigenVector::Flatten(in_1).template cast(); + result.device(place) = (in_0_e + in_1_e).template cast(); } else if (length_0 && in_0.IsInitialized()) { auto result = EigenVector::Flatten(*out); auto &place = *dev_ctx.eigen_device(); diff --git a/python/paddle/fluid/tests/unittests/test_add_n_op.py b/python/paddle/fluid/tests/unittests/test_add_n_op.py new file mode 100644 index 0000000000000..3ca485b1419fd --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_add_n_op.py @@ -0,0 +1,64 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import unittest + +import numpy as np + +import paddle + + +class TestAddnOp(unittest.TestCase): + def setUp(self): + np.random.seed(20) + l = 32 + self.x_np = np.random.random([l, 16, 256]) + + def check_main(self, x_np, dtype, axis=None): + paddle.disable_static() + x = [] + for i in range(x_np.shape[0]): + val = paddle.to_tensor(x_np[i].astype(dtype)) + val.stop_gradient = False + x.append(val) + y = paddle.add_n(x) + x_g = paddle.grad(y, x) + y_np = y.numpy().astype('float32') + x_g_np = [] + for val in x_g: + x_g_np.append(val.numpy().astype('float32')) + paddle.enable_static() + return y_np, x_g_np + + def test_add_n_fp16(self): + if not paddle.is_compiled_with_cuda(): + return + y_np_16, x_g_np_16 = self.check_main(self.x_np, 'float16') + y_np_32, x_g_np_32 = self.check_main(self.x_np, 'float32') + + np.testing.assert_allclose(y_np_16, y_np_32, rtol=1e-03) + for i in range(len(x_g_np_32)): + np.testing.assert_allclose(x_g_np_16[i], x_g_np_32[i], rtol=1e-03) + + def test_add_n_api(self): + if not paddle.is_compiled_with_cuda(): + return + + y_np_32, x_g_np_32 = self.check_main(self.x_np, 'float32') + y_np_gt = np.sum(self.x_np, axis=0).astype('float32') + + np.testing.assert_allclose(y_np_32, y_np_gt, rtol=1e-06) + + +if __name__ == "__main__": + unittest.main() From e48c882f42fe3bca4c1d707098be99fd7ab04659 Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Thu, 2 Feb 2023 10:43:01 +0800 Subject: [PATCH 88/89] pass PYTHON_EXECUTABLE envs to thirdparty cinn (#50142) --- cmake/external/cinn.cmake | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/cmake/external/cinn.cmake b/cmake/external/cinn.cmake index 3ec194a6bfb37..5e23a0f36f04a 100644 --- a/cmake/external/cinn.cmake +++ b/cmake/external/cinn.cmake @@ -40,7 +40,10 @@ set(CINN_OPTIONAL_ARGS -DWITH_MKL_CBLAS=${WITH_MKL} -DWITH_MKLDNN=${WITH_MKL} -DPUBLISH_LIBS=ON - -DWITH_TESTING=ON) + -DWITH_TESTING=ON + -DPYTHON_EXECUTABLE=${PYTHON_EXECUTABLE} + -DPYTHON_INCLUDE_DIR=${PYTHON_INCLUDE_DIR} + -DPYTHON_LIBRARIES=${PYTHON_LIBRARIES}) set(CINN_BUILD_COMMAND ${CMAKE_COMMAND} --build . --target cinnapi -j) set(CINN_BINARY_DIR ${CINN_PREFIX_DIR}/src/external_cinn-build) set(CINN_LIB_NAME "libcinnapi.so") From 3c557e2fdd1a42d46fa98faadbd1c1664e6c1ad8 Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Thu, 2 Feb 2023 10:55:23 +0800 Subject: [PATCH 89/89] [BugFix]Fix bugs when compile with OneDNN (#50096) * fix bugs * fix ci bugs --- paddle/fluid/framework/feed_fetch_type.h | 5 +++ paddle/fluid/framework/string_array.h | 12 ++++-- paddle/fluid/operators/controlflow/feed_op.cc | 38 ------------------- paddle/phi/kernels/funcs/CMakeLists.txt | 2 +- 4 files changed, 15 insertions(+), 42 deletions(-) diff --git a/paddle/fluid/framework/feed_fetch_type.h b/paddle/fluid/framework/feed_fetch_type.h index 571667bff47eb..e51f22a2c3c18 100644 --- a/paddle/fluid/framework/feed_fetch_type.h +++ b/paddle/fluid/framework/feed_fetch_type.h @@ -26,6 +26,11 @@ namespace framework { using FeedType = paddle::variant; +template <> +struct PhiVectorType { + const char *type_name = "PhiVectorFeedType"; +}; + using FeedList = paddle::framework::PhiVector; using FetchType = paddle::variant +struct PhiVectorType; + +template <> +struct PhiVectorType { + const char* type_name = "PhiVectorString"; +}; + template class PhiVector : public phi::ExtendedTensor, public phi::TypeInfoTraits> { @@ -129,9 +137,7 @@ class PhiVector : public phi::ExtendedTensor, public: /// \brief Returns the name of the class for type traits. /// \return The name of the class. 
- static const char* name() { - return (std::string("PhiVector_") + std::string(typeid(T).name())).c_str(); - } + static const char* name() { return PhiVectorType().type_name; } size_t size() const { return data_.size(); } diff --git a/paddle/fluid/operators/controlflow/feed_op.cc b/paddle/fluid/operators/controlflow/feed_op.cc index 09684b8d737ba..9d266b81d0bab 100644 --- a/paddle/fluid/operators/controlflow/feed_op.cc +++ b/paddle/fluid/operators/controlflow/feed_op.cc @@ -267,44 +267,6 @@ PD_REGISTER_GENERAL_KERNEL( ALL_LAYOUT, paddle::operators::FeedStringsKernel, ALL_DTYPE) {} -#elif defined(PADDLE_WITH_ASCEND_CL) -PD_REGISTER_GENERAL_KERNEL( - feed_dense_tensor, - npu, - ALL_LAYOUT, - paddle::operators::FeedDenseTensorKernel, - ALL_DTYPE) {} -PD_REGISTER_GENERAL_KERNEL( - feed_sparse_coo_tensor, - npu, - ALL_LAYOUT, - paddle::operators::FeedSparseCooTensorKernel, - ALL_DTYPE) {} -PD_REGISTER_GENERAL_KERNEL( - feed_strings, - npu, - ALL_LAYOUT, - paddle::operators::FeedStringsKernel, - ALL_DTYPE) {} -#elif defined(PADDLE_WITH_MLU) -PD_REGISTER_GENERAL_KERNEL( - feed_dense_tensor, - CustomMLU, - ALL_LAYOUT, - paddle::operators::FeedDenseTensorKernel, - ALL_DTYPE) {} -PD_REGISTER_GENERAL_KERNEL( - feed_sparse_coo_tensor, - CustomMLU, - ALL_LAYOUT, - paddle::operators::FeedSparseCooTensorKernel, - ALL_DTYPE) {} -PD_REGISTER_GENERAL_KERNEL( - feed_strings, - CustomMLU, - ALL_LAYOUT, - paddle::operators::FeedStringsKernel, - ALL_DTYPE) {} #endif #ifdef PADDLE_WITH_CUSTOM_DEVICE namespace paddle { diff --git a/paddle/phi/kernels/funcs/CMakeLists.txt b/paddle/phi/kernels/funcs/CMakeLists.txt index e4f779c807570..da8f47c7bffd4 100644 --- a/paddle/phi/kernels/funcs/CMakeLists.txt +++ b/paddle/phi/kernels/funcs/CMakeLists.txt @@ -27,7 +27,7 @@ math_library(sequence_scale) cc_library( phi_data_layout_transform SRCS data_layout_transform.cc - DEPS tensor) + DEPS tensor blas) if(WITH_GPU OR WITH_ROCM) if(MKL_FOUND AND WITH_ONEMKL)