Merge pull request PaddlePaddle#71 from tiancaitzp/paddlebox
Fix dense tensor missed copy at pass start; fix NaN problem in wasq.
tiancaitzp committed May 6, 2024
2 parents e0f1c18 + b2e6f74 commit 80dfe79
Showing 4 changed files with 33 additions and 11 deletions.
31 changes: 21 additions & 10 deletions paddle/fluid/framework/details/nan_inf_utils_detail.cc
@@ -417,8 +417,8 @@ void CheckVarHasNanOrInf(const std::string& op_type,

Tensor y_tensor;
bool* y_ptr = y_tensor.mutable_data<bool>({1}, place);
int r = xpu::check_nan_or_inf<XPUType>(dev_ctx->x_context(),
x,
int r = xpu::check_nan_or_inf<XPUType>(dev_ctx->x_context(),
x,
y_ptr,
tensor->numel());
PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS,
@@ -729,11 +729,16 @@ void CheckVarHasNanOrInfRet(const std::string& op_type,
} else if (var->IsType<phi::SelectedRows>()) {
tensor = &var->Get<phi::SelectedRows>().value();
} else {
LOG(WARNING) << "op_type: " << op_type << ", var_name: " << var_name << " var->IsType invalid";
return;
}
if (tensor->memory_size() == 0) {
return;
}
if (tensor->numel() == 0) {
LOG(WARNING) << "op_type: " << op_type << ", var_name: " << var_name << " tensor->numel() is zero";
return;
}
if (platform::is_cpu_place(tensor->place())) {
int64_t numel = tensor->numel();
auto dtype = framework::TransToProtoVarType(tensor->type());
@@ -749,7 +754,7 @@ void CheckVarHasNanOrInfRet(const std::string& op_type,
#ifdef PADDLE_WITH_XPU
if (framework::TransToProtoVarType(tensor->dtype()) !=
proto::VarType::FP32) {
LOG(WARNING) << "skip check_nan_inf, tensor type:" << tensor->dtype() << " not float32!";
// LOG(WARNING) << "skip op_type: " << op_type << "check_nan_inf, tensor type:" << tensor->dtype() << " not float32!";
return;
}

@@ -775,26 +780,28 @@ void CheckVarHasNanOrInfRet(const std::string& op_type,

Tensor y_tensor;
bool* y_ptr = y_tensor.mutable_data<bool>({1}, place);
int r = xpu::check_nan_or_inf<XPUType>(dev_ctx->x_context(),
x,
VLOG(1) << "Check its output indeed:" << var_name;
int r = xpu::check_nan_or_inf<XPUType>(dev_ctx->x_context(),
x,
y_ptr,
tensor->numel());
PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS,
platform::errors::External(
"The check_nan_or_inf XPU OP return wrong value[%d %s]",
r, XPUAPIErrorMsg[r]));
dev_ctx->Wait();

if (r != xpu::Error_t::SUCCESS) {
LOG(WARNING) << "op_type: " << op_type << ", var_name: " << var_name << "check_failed!";
return;
}
bool check_res = false;
bool* res_ptr = &check_res;
memory::Copy(platform::CPUPlace(),
static_cast<void*>(res_ptr),
y_tensor.place(),
static_cast<const void*>(y_tensor.data<bool>()),
y_tensor.numel() * sizeof(bool));
VLOG(3) << "CheckVarHasNanOrInfRet check_res = " << check_res;
VLOG(1) << "CheckVarHasNanOrInfRet check_res = " << check_res;
if (check_res) {
get_cpu_nan_inf_num()++;
VLOG(0) << "op_type: " << op_type << ", var_name: " << var_name << " check nan failed!";
}
return;
#endif
@@ -832,13 +839,16 @@ bool CheckOpHasNanOrInfRet(const framework::OperatorBase& op,

if (IsSkipOp(op)) return false;
if (op_var_nan_inf_white_list().count(op.Type()) == 0) {
VLOG(1) << "Check op:" << op.Type();
// NOTE: vname may be destructed at the end of this function.
for (auto& vname : op.OutputVars(true)) {
auto* var = exec_scope.FindVar(vname);
if (var == nullptr) continue;
VLOG(1) << "Check its output:" << vname;
CheckVarHasNanOrInfRet(op.Type(), var, vname, place);
}
} else {
VLOG(1) << "Check op:" << op.Type();
for (auto& vname : op.OutputVars(true)) {
bool need_check = true;
for (auto& white_vname : op_var_nan_inf_white_list().at(op.Type())) {
@@ -850,6 +860,7 @@ bool CheckOpHasNanOrInfRet(const framework::OperatorBase& op,
if (!need_check) continue;
auto* var = exec_scope.FindVar(vname);
if (var == nullptr) continue;
VLOG(1) << "Check its output:" << vname;
CheckVarHasNanOrInfRet(op.Type(), var, vname, place);
}
}
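For context, the check pattern this file implements on XPU — reduce the tensor to a single device-side has-NaN/Inf flag, copy that flag back to the host, then branch — can be sketched as a host-side analogue in plain C++. This is illustrative only; on the real path `xpu::check_nan_or_inf` performs the reduction on device, which is why the diff needs the explicit `memory::Copy` to `CPUPlace` before it can test `check_res`:

```cpp
#include <cmath>
#include <cstddef>

// Host-side analogue of the device reduction: scan a float buffer and
// collapse it to the single bool that the diff copies into y_tensor.
bool HasNanOrInf(const float* data, std::size_t numel) {
  for (std::size_t i = 0; i < numel; ++i) {
    if (std::isnan(data[i]) || std::isinf(data[i])) return true;
  }
  return false;
}
```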
5 changes: 5 additions & 0 deletions paddle/fluid/framework/operator.cc
@@ -1763,11 +1763,14 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_XPU_KP)
if (framework::details::CheckOpHasNanOrInfRet(*this, exec_scope, place)) {
framework::details::DumpAllScope(exec_scope, place);
VLOG(0) << "op_type: " << Type() << ", CheckOpHasNanOrInf failed!!";
// dump current op data
for (auto& iname : InputVars()) {
auto* var = exec_scope.FindVar(iname);
if (var == nullptr) continue;
VLOG(0) << "op_input: " << iname;
std::ostringstream os;
os << "op type: " << type_ << "\n";
os << "input name:" << iname << ", ";
if (var->IsType<framework::LoDTensor>()) {
os << var->Get<framework::LoDTensor>();
Expand All @@ -1780,7 +1783,9 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
for (auto& iname : OutputVars(true)) {
auto* var = exec_scope.FindVar(iname);
if (var == nullptr) continue;
VLOG(0) << "op_output: " << iname;
std::ostringstream os;
os << "op type: " << type_ << "\n";
os << "output name:" << iname << ", ";
if (var->IsType<framework::LoDTensor>()) {
os << var->Get<framework::LoDTensor>();
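The added lines make the failure dump self-describing: the op type is prepended to every input and output entry before the tensor contents are streamed. The accumulation pattern is plain `std::ostringstream`; a minimal standalone sketch of the same idea (hypothetical `DumpVars` helper, `std::cerr` standing in for `VLOG(0)`):

```cpp
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Sketch: log each variable name with its op type prepended, mirroring
// the diagnostic block the diff adds to OperatorWithKernel::RunImpl.
void DumpVars(const std::string& op_type,
              const std::vector<std::string>& names,
              const char* direction) {  // "input" or "output"
  for (const auto& name : names) {
    std::ostringstream os;
    os << "op type: " << op_type << "\n";
    os << direction << " name:" << name;
    std::cerr << os.str() << std::endl;
  }
}
```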
2 changes: 1 addition & 1 deletion paddle/fluid/framework/tensor_util.cc
@@ -117,7 +117,7 @@ void TensorCopyImpl(const TENSOR& src,
memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
} else if ((platform::is_xpu_place(src_place) || platform::is_xpul3_place(src_place)) &&
(platform::is_xpu_place(dst_place) || platform::is_xpul3_place(dst_place))) {
if (src_ptr == dst_ptr) {
if (src_ptr == dst_ptr && src_place == dst_place) {
VLOG(3) << "Skip copy the same data async from " << src_place << " to "
<< dst_place;
return;
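This one-line change is the "missed copy" fix from the commit message: between ordinary XPU memory and XPU-L3 memory, equal raw pointers no longer guarantee the same physical buffer, so the copy may only be skipped when the places match as well. A minimal sketch of the strengthened guard (simplified types; `Place` stands in for Paddle's place class and `std::memcpy` for `memory::Copy`):

```cpp
#include <cstring>

struct Place {
  int device;
  bool is_l3;  // XPU-L3 vs. ordinary XPU memory
  bool operator==(const Place& o) const {
    return device == o.device && is_l3 == o.is_l3;
  }
};

// Copy unless source and destination are the same pointer AND live in
// the same memory space -- pointer equality alone is not enough.
void GuardedCopy(void* dst, const Place& dst_place,
                 const void* src, const Place& src_place,
                 std::size_t size) {
  if (dst == src && dst_place == src_place) return;  // genuinely the same buffer
  std::memcpy(dst, src, size);
}
```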
6 changes: 6 additions & 0 deletions paddle/fluid/operators/pull_box_extended_sparse_op.h
@@ -145,6 +145,9 @@ static void PullBoxExtendedSparseFunctor(
}
auto *output = outputs[embedx_offset]->mutable_data<T>(ctx.GetPlace());
all_values[i] = reinterpret_cast<float*>(output);
if (outputs[embedx_offset]->numel() == 0) {
all_values[i] = nullptr;
}
++embedx_offset;
} else {
all_values[i] = 0;
@@ -159,6 +162,9 @@ static void PullBoxExtendedSparseFunctor(
}
auto *output_extend = outputs_extend[expand_offset]->mutable_data<T>(ctx.GetPlace());
all_values[i + slot_size] = reinterpret_cast<float*>(output_extend);
if (outputs_extend[expand_offset]->numel() == 0) {
all_values[i + slot_size] = nullptr;
}
++expand_offset;
} else {
all_values[i + slot_size] = 0;
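The added guards clear an output's slot in the pointer table when its tensor is empty, so the downstream batched pull kernel can skip it instead of writing through a stale pointer. A minimal sketch of the idea (plain `std::vector` standing in for Paddle tensors; names are illustrative):

```cpp
#include <cstddef>
#include <vector>

// Sketch: collect raw output pointers for a batched pull, but pass
// nullptr for empty outputs so the kernel knows to skip those slots.
std::vector<float*> CollectOutputs(const std::vector<std::vector<float>*>& outs) {
  std::vector<float*> ptrs(outs.size());
  for (std::size_t i = 0; i < outs.size(); ++i) {
    ptrs[i] = outs[i]->empty() ? nullptr : outs[i]->data();
  }
  return ptrs;
}
```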
