Merge pull request PaddlePaddle#71 from tiancaitzp/paddlebox
Fix dense tensor missed copy at pass start; fix NaN problem in wasq.
tiancaitzp committed May 6, 2024
2 parents e0f1c18 + b2e6f74 commit 80dfe79
Showing 4 changed files with 33 additions and 11 deletions.
31 changes: 21 additions & 10 deletions paddle/fluid/framework/details/nan_inf_utils_detail.cc
@@ -417,8 +417,8 @@ void CheckVarHasNanOrInf(const std::string& op_type,

Tensor y_tensor;
bool* y_ptr = y_tensor.mutable_data<bool>({1}, place);
int r = xpu::check_nan_or_inf<XPUType>(dev_ctx->x_context(),
x,
int r = xpu::check_nan_or_inf<XPUType>(dev_ctx->x_context(),
x,
y_ptr,
tensor->numel());
PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS,
@@ -729,11 +729,16 @@ void CheckVarHasNanOrInfRet(const std::string& op_type,
} else if (var->IsType<phi::SelectedRows>()) {
tensor = &var->Get<phi::SelectedRows>().value();
} else {
LOG(WARNING) << "op_type: " << op_type << ", var_name: " << var_name << " var->IsType invalid";
return;
}
if (tensor->memory_size() == 0) {
return;
}
if (tensor->numel() == 0) {
LOG(WARNING) << "op_type: " << op_type << ", var_name: " << var_name << " tensor->numel() is zero";
return;
}
if (platform::is_cpu_place(tensor->place())) {
int64_t numel = tensor->numel();
auto dtype = framework::TransToProtoVarType(tensor->type());
@@ -749,7 +754,7 @@ void CheckVarHasNanOrInfRet(const std::string& op_type,
#ifdef PADDLE_WITH_XPU
if (framework::TransToProtoVarType(tensor->dtype()) !=
proto::VarType::FP32) {
LOG(WARNING) << "skip check_nan_inf, tensor type:" << tensor->dtype() << " not float32!";
// LOG(WARNING) << "skip op_type: " << op_type << "check_nan_inf, tensor type:" << tensor->dtype() << " not float32!";
return;
}

@@ -775,26 +780,28 @@ void CheckVarHasNanOrInfRet(const std::string& op_type,

Tensor y_tensor;
bool* y_ptr = y_tensor.mutable_data<bool>({1}, place);
int r = xpu::check_nan_or_inf<XPUType>(dev_ctx->x_context(),
x,
VLOG(1) << "Check its output indeed:" << var_name;
int r = xpu::check_nan_or_inf<XPUType>(dev_ctx->x_context(),
x,
y_ptr,
tensor->numel());
PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS,
platform::errors::External(
"The check_nan_or_inf XPU OP return wrong value[%d %s]",
r, XPUAPIErrorMsg[r]));
dev_ctx->Wait();

if (r != xpu::Error_t::SUCCESS) {
LOG(WARNING) << "op_type: " << op_type << ", var_name: " << var_name << "check_failed!";
return;
}
bool check_res = false;
bool* res_ptr = &check_res;
memory::Copy(platform::CPUPlace(),
static_cast<void*>(res_ptr),
y_tensor.place(),
static_cast<const void*>(y_tensor.data<bool>()),
y_tensor.numel() * sizeof(bool));
VLOG(3) << "CheckVarHasNanOrInfRet check_res = " << check_res;
VLOG(1) << "CheckVarHasNanOrInfRet check_res = " << check_res;
if (check_res) {
get_cpu_nan_inf_num()++;
VLOG(0) << "op_type: " << op_type << ", var_name: " << var_name << " check nan failed!";
}
return;
#endif
@@ -832,13 +839,16 @@ bool CheckOpHasNanOrInfRet(const framework::OperatorBase& op,

if (IsSkipOp(op)) return false;
if (op_var_nan_inf_white_list().count(op.Type()) == 0) {
VLOG(1) << "Check op:" << op.Type();
// NOTE: vname may be destructed at the end of this function.
for (auto& vname : op.OutputVars(true)) {
auto* var = exec_scope.FindVar(vname);
if (var == nullptr) continue;
VLOG(1) << "Check its output:" << vname;
CheckVarHasNanOrInfRet(op.Type(), var, vname, place);
}
} else {
VLOG(1) << "Check op:" << op.Type();
for (auto& vname : op.OutputVars(true)) {
bool need_check = true;
for (auto& white_vname : op_var_nan_inf_white_list().at(op.Type())) {
@@ -850,6 +860,7 @@ bool CheckOpHasNanOrInfRet(const framework::OperatorBase& op,
if (!need_check) continue;
auto* var = exec_scope.FindVar(vname);
if (var == nullptr) continue;
VLOG(1) << "Check its output:" << vname;
CheckVarHasNanOrInfRet(op.Type(), var, vname, place);
}
}
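For context, the check pattern this file implements on XPU — reduce the tensor to a single device-side has-NaN/Inf flag, copy that flag back to the host, then branch — can be sketched as a host-side analogue in plain C++. This is illustrative only; on the real path `xpu::check_nan_or_inf` performs the reduction on device, which is why the diff needs the explicit `memory::Copy` to `CPUPlace` before it can test `check_res`:

```cpp
#include <cmath>
#include <cstddef>

// Host-side analogue of the device reduction: scan a float buffer and
// collapse it to the single bool that the diff copies into y_tensor.
bool HasNanOrInf(const float* data, std::size_t numel) {
  for (std::size_t i = 0; i < numel; ++i) {
    if (std::isnan(data[i]) || std::isinf(data[i])) return true;
  }
  return false;
}
```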
5 changes: 5 additions & 0 deletions paddle/fluid/framework/operator.cc
@@ -1763,11 +1763,14 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_XPU_KP)
if (framework::details::CheckOpHasNanOrInfRet(*this, exec_scope, place)) {
framework::details::DumpAllScope(exec_scope, place);
VLOG(0) << "op_type: " << Type() << ", CheckOpHasNanOrInf failed!!";
// dump current op data
for (auto& iname : InputVars()) {
auto* var = exec_scope.FindVar(iname);
if (var == nullptr) continue;
VLOG(0) << "op_input: " << iname;
std::ostringstream os;
os << "op type: " << type_ << "\n";
os << "input name:" << iname << ", ";
if (var->IsType<framework::LoDTensor>()) {
os << var->Get<framework::LoDTensor>();
Expand All @@ -1780,7 +1783,9 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
for (auto& iname : OutputVars(true)) {
auto* var = exec_scope.FindVar(iname);
if (var == nullptr) continue;
VLOG(0) << "op_output: " << iname;
std::ostringstream os;
os << "op type: " << type_ << "\n";
os << "output name:" << iname << ", ";
if (var->IsType<framework::LoDTensor>()) {
os << var->Get<framework::LoDTensor>();
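The added lines make the failure dump self-describing: the op type is prepended to every input and output entry before the tensor contents are streamed. The accumulation pattern is plain `std::ostringstream`; a minimal standalone sketch of the same idea (hypothetical `DumpVars` helper, `std::cerr` standing in for `VLOG(0)`):

```cpp
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Sketch: log each variable name with its op type prepended, mirroring
// the diagnostic block the diff adds to OperatorWithKernel::RunImpl.
void DumpVars(const std::string& op_type,
              const std::vector<std::string>& names,
              const char* direction) {  // "input" or "output"
  for (const auto& name : names) {
    std::ostringstream os;
    os << "op type: " << op_type << "\n";
    os << direction << " name:" << name;
    std::cerr << os.str() << std::endl;
  }
}
```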
2 changes: 1 addition & 1 deletion paddle/fluid/framework/tensor_util.cc
@@ -117,7 +117,7 @@ void TensorCopyImpl(const TENSOR& src,
memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
} else if ((platform::is_xpu_place(src_place) || platform::is_xpul3_place(src_place)) &&
(platform::is_xpu_place(dst_place) || platform::is_xpul3_place(dst_place))) {
if (src_ptr == dst_ptr) {
if (src_ptr == dst_ptr && src_place == dst_place) {
VLOG(3) << "Skip copy the same data async from " << src_place << " to "
<< dst_place;
return;
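This one-line change is the "missed copy" fix from the commit message: between ordinary XPU memory and XPU-L3 memory, equal raw pointers no longer guarantee the same physical buffer, so the copy may only be skipped when the places match as well. A minimal sketch of the strengthened guard (simplified types; `Place` stands in for Paddle's place class and `std::memcpy` for `memory::Copy`):

```cpp
#include <cstring>

struct Place {
  int device;
  bool is_l3;  // XPU-L3 vs. ordinary XPU memory
  bool operator==(const Place& o) const {
    return device == o.device && is_l3 == o.is_l3;
  }
};

// Copy unless source and destination are the same pointer AND live in
// the same memory space -- pointer equality alone is not enough.
void GuardedCopy(void* dst, const Place& dst_place,
                 const void* src, const Place& src_place,
                 std::size_t size) {
  if (dst == src && dst_place == src_place) return;  // genuinely the same buffer
  std::memcpy(dst, src, size);
}
```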
6 changes: 6 additions & 0 deletions paddle/fluid/operators/pull_box_extended_sparse_op.h
@@ -145,6 +145,9 @@ static void PullBoxExtendedSparseFunctor(
}
auto *output = outputs[embedx_offset]->mutable_data<T>(ctx.GetPlace());
all_values[i] = reinterpret_cast<float*>(output);
if (outputs[embedx_offset]->numel() == 0) {
all_values[i] = nullptr;
}
++embedx_offset;
} else {
all_values[i] = 0;
@@ -159,6 +162,9 @@ static void PullBoxExtendedSparseFunctor(
}
auto *output_extend = outputs_extend[expand_offset]->mutable_data<T>(ctx.GetPlace());
all_values[i + slot_size] = reinterpret_cast<float*>(output_extend);
if (outputs_extend[expand_offset]->numel() == 0) {
all_values[i + slot_size] = nullptr;
}
++expand_offset;
} else {
all_values[i + slot_size] = 0;
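The added guards clear an output's slot in the pointer table when its tensor is empty, so the downstream batched pull kernel can skip it instead of writing through a stale pointer. A minimal sketch of the idea (plain `std::vector` standing in for Paddle tensors; names are illustrative):

```cpp
#include <cstddef>
#include <vector>

// Sketch: collect raw output pointers for a batched pull, but pass
// nullptr for empty outputs so the kernel knows to skip those slots.
std::vector<float*> CollectOutputs(const std::vector<std::vector<float>*>& outs) {
  std::vector<float*> ptrs(outs.size());
  for (std::size_t i = 0; i < outs.size(); ++i) {
    ptrs[i] = outs[i]->empty() ? nullptr : outs[i]->data();
  }
  return ptrs;
}
```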
