
Commit

Merge pull request PaddlePaddle#4 from tsocha/onednn-3.0
Onednn 3.0
jczaja committed Mar 16, 2023
2 parents a86c93f + 1ad1558 commit 4dc7495
Showing 500 changed files with 10,582 additions and 4,498 deletions.
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -61,7 +61,7 @@ repos:
- id: black
files: (.*\.(py|pyi|bzl)|BUILD|.*\.BUILD|WORKSPACE)$
- repo: https://github.com/pycqa/isort
rev: 5.10.1
rev: 5.11.5
hooks:
- id: isort
- repo: https://github.com/PyCQA/flake8
9 changes: 6 additions & 3 deletions cmake/external/cinn.cmake
@@ -17,8 +17,8 @@ if(NOT WITH_CINN)
endif()

if(NOT CINN_GIT_TAG)
# 2023.01.12 commit
set(CINN_GIT_TAG 5d1ae0f4b8e3f7cd5b16dfc76d2161bf77e938ac)
# 2023.01.28 commit
set(CINN_GIT_TAG 1449890f7724babf2a343c6f8073bd28a7bbc683)
endif()

message(STATUS "CINN version: " ${CINN_GIT_TAG})
@@ -40,7 +40,10 @@ set(CINN_OPTIONAL_ARGS
-DWITH_MKL_CBLAS=${WITH_MKL}
-DWITH_MKLDNN=${WITH_MKL}
-DPUBLISH_LIBS=ON
-DWITH_TESTING=ON)
-DWITH_TESTING=ON
-DPYTHON_EXECUTABLE=${PYTHON_EXECUTABLE}
-DPYTHON_INCLUDE_DIR=${PYTHON_INCLUDE_DIR}
-DPYTHON_LIBRARIES=${PYTHON_LIBRARIES})
set(CINN_BUILD_COMMAND ${CMAKE_COMMAND} --build . --target cinnapi -j)
set(CINN_BINARY_DIR ${CINN_PREFIX_DIR}/src/external_cinn-build)
set(CINN_LIB_NAME "libcinnapi.so")
2 changes: 1 addition & 1 deletion cmake/external/cutlass.cmake
@@ -17,7 +17,7 @@ include(ExternalProject)
set(CUTLASS_PREFIX_DIR ${THIRD_PARTY_PATH}/cutlass)

set(CUTLASS_REPOSITORY https://github.com/NVIDIA/cutlass.git)
set(CUTLASS_TAG v2.10.0)
set(CUTLASS_TAG v2.11.0)

include_directories("${THIRD_PARTY_PATH}/cutlass/src/extern_cutlass/")
include_directories("${THIRD_PARTY_PATH}/cutlass/src/extern_cutlass/include/")
2 changes: 1 addition & 1 deletion cmake/external/pybind11.cmake
@@ -16,7 +16,7 @@ include(ExternalProject)

set(PYBIND_PREFIX_DIR ${THIRD_PARTY_PATH}/pybind)
set(PYBIND_REPOSITORY ${GIT_URL}/pybind/pybind11.git)
set(PYBIND_TAG v2.4.3)
set(PYBIND_TAG v2.6.0)

set(PYBIND_INCLUDE_DIR ${THIRD_PARTY_PATH}/pybind/src/extern_pybind/include)
include_directories(${PYBIND_INCLUDE_DIR})
2 changes: 1 addition & 1 deletion cmake/external/xpu.cmake
@@ -7,7 +7,7 @@ set(XPU_PROJECT "extern_xpu")
set(XPU_API_LIB_NAME "libxpuapi.so")
set(XPU_RT_LIB_NAME "libxpurt.so")

set(XPU_BASE_DATE "20230114")
set(XPU_BASE_DATE "20230119")
set(XPU_XCCL_BASE_VERSION "1.0.7")

if(NOT DEFINED XPU_BASE_URL)
26 changes: 20 additions & 6 deletions paddle/fluid/distributed/fleet_executor/carrier.cc
@@ -15,6 +15,7 @@
#include "paddle/fluid/distributed/fleet_executor/carrier.h"

#include <algorithm>
#include <vector>

#include "paddle/fluid/distributed/fleet_executor/global.h"
#include "paddle/fluid/distributed/fleet_executor/interceptor.h"
@@ -24,6 +25,7 @@
#include "paddle/fluid/framework/garbage_collector.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/framework/variable_helper.h"

namespace paddle {
@@ -55,23 +57,34 @@ void Carrier::Init(
framework::Scope* scope,
int64_t num_micro_batches,
const platform::Place& place,
const std::vector<std::string>& inference_root_scope_vars) {
const std::vector<std::string>& inference_root_scope_vars,
const std::vector<framework::Scope*>& micro_scope_list) {
rank_ = rank;
interceptor_id_to_rank_ = interceptor_id_to_rank;
interceptor_id_to_node_ = interceptor_id_to_node;
place_ = place;
root_scope_ = scope;
dev_ctx_ = platform::DeviceContextPool::Instance().Get(place_);
bool need_create_scope = micro_scope_list.empty();

PADDLE_ENFORCE_NOT_NULL(
root_scope_,
platform::errors::InvalidArgument("root_scope can not be nullptr"));
minibatch_scope_ = &root_scope_->NewScope();
microbatch_scopes_.resize(num_micro_batches);
for (int i = 0; i < num_micro_batches; ++i) {
microbatch_scopes_[i] = &minibatch_scope_->NewScope();
CopyParameters(i, program, inference_root_scope_vars);

if (need_create_scope) {
minibatch_scope_ = &root_scope_->NewScope();
microbatch_scopes_.resize(num_micro_batches);
for (int i = 0; i < num_micro_batches; ++i) {
microbatch_scopes_[i] = &minibatch_scope_->NewScope();
CopyParameters(i, program, inference_root_scope_vars);
}
} else {
microbatch_scopes_ = micro_scope_list;
for (int i = 0; i < num_micro_batches; ++i) {
CopyParameters(i, program, inference_root_scope_vars);
}
}

// Add source and sink interceptor id to rank
interceptor_id_to_rank_.emplace(SOURCE_ID, rank);
interceptor_id_to_rank_.emplace(SINK_ID, rank);
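Note: the new micro_scope_list parameter lets the caller hand Carrier::Init pre-built micro-batch scopes instead of having the carrier create them (the argument defaults to an empty list, see carrier.h below). A minimal standalone sketch of that branching, using a hypothetical Scope stand-in rather than Paddle's framework::Scope:

#include <memory>
#include <vector>

struct Scope {  // simplified stand-in for framework::Scope
  Scope* NewScope() {
    children.push_back(std::make_unique<Scope>());
    return children.back().get();
  }
  std::vector<std::unique_ptr<Scope>> children;
};

// Reuse caller-provided micro-batch scopes when given; otherwise create one
// fresh scope per micro batch under a new minibatch scope.
void InitMicroScopes(Scope* root_scope,
                     int num_micro_batches,
                     const std::vector<Scope*>& micro_scope_list,
                     std::vector<Scope*>* microbatch_scopes) {
  bool need_create_scope = micro_scope_list.empty();
  if (need_create_scope) {
    Scope* minibatch_scope = root_scope->NewScope();
    microbatch_scopes->resize(num_micro_batches);
    for (int i = 0; i < num_micro_batches; ++i) {
      (*microbatch_scopes)[i] = minibatch_scope->NewScope();
      // The real code also calls CopyParameters(i, ...) for each new scope here.
    }
  } else {
    *microbatch_scopes = micro_scope_list;  // externally owned scopes
    // The real code still calls CopyParameters(i, ...) once per micro batch.
  }
}

Either way the parameters are copied into every micro-batch scope; only the ownership of the scopes changes.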
@@ -166,6 +179,7 @@ void Carrier::Start() {
"Using carrier before initialized."));
InterceptorMessage start_msg;
start_msg.set_dst_id(SOURCE_ID);
start_msg.set_src_id(SOURCE_ID);
start_msg.set_message_type(START);
Send(start_msg);
// TODO(wangxi): async step
4 changes: 3 additions & 1 deletion paddle/fluid/distributed/fleet_executor/carrier.h
@@ -25,6 +25,7 @@
#include "paddle/fluid/distributed/fleet_executor/interceptor.h"
#include "paddle/fluid/distributed/fleet_executor/interceptor_message.pb.h"
#include "paddle/fluid/distributed/fleet_executor/task_loop_thread_pool.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/errors.h"
@@ -60,7 +61,8 @@ class Carrier final {
framework::Scope* scope,
int64_t num_micro_batches,
const platform::Place& place,
const std::vector<std::string>& inference_root_scope_vars = {});
const std::vector<std::string>& inference_root_scope_vars = {},
const std::vector<framework::Scope*>& micro_scope_list = {});

void CopyParameters(
int microbatch_id,
42 changes: 25 additions & 17 deletions paddle/fluid/distributed/fleet_executor/compute_interceptor.cc
@@ -50,14 +50,17 @@ void ComputeInterceptor::IncreaseReady(int64_t up_id) {
auto max_ready_size = it->second.first;
auto ready_size = it->second.second;
ready_size += 1;
PADDLE_ENFORCE_LE(ready_size,
max_ready_size,
platform::errors::OutOfRange(
"upstream=%lld ready_size must <= max_ready_size, but "
"now ready_size=%lld, max_ready_size=%lld",
up_id,
ready_size,
max_ready_size));
if (max_ready_size != INFINITE_BUFFER_SIZE) {
PADDLE_ENFORCE_LE(
ready_size,
max_ready_size,
platform::errors::OutOfRange(
"upstream=%lld ready_size must <= max_ready_size, but "
"now ready_size=%lld, max_ready_size=%lld",
up_id,
ready_size,
max_ready_size));
}
it->second.second = ready_size;
}

@@ -96,6 +99,9 @@ bool ComputeInterceptor::CanWriteOutput() {
for (auto& outs : out_buffs_) {
auto max_buffer_size = outs.second.first;
auto used_size = outs.second.second;
if (max_buffer_size == INFINITE_BUFFER_SIZE) {
continue;
}
// full, return false
if (used_size == max_buffer_size) {
VLOG(3) << "Interceptor " << GetInterceptorId()
@@ -112,15 +118,17 @@ void ComputeInterceptor::SendDataReadyToDownStream() {
auto max_buff_size = outs.second.first;
auto used_size = outs.second.second;
used_size += 1;
PADDLE_ENFORCE_LE(
used_size,
max_buff_size,
platform::errors::OutOfRange("downstream=%lld used buff size must <= "
"max_buff_size, but now used_size=%lld, "
"max_buff_size=%lld",
down_id,
used_size,
max_buff_size));
if (max_buff_size != INFINITE_BUFFER_SIZE) {
PADDLE_ENFORCE_LE(
used_size,
max_buff_size,
platform::errors::OutOfRange("downstream=%lld used buff size must <= "
"max_buff_size, but now used_size=%lld, "
"max_buff_size=%lld",
down_id,
used_size,
max_buff_size));
}
outs.second.second = used_size;

InterceptorMessage ready_msg;
2 changes: 2 additions & 0 deletions paddle/fluid/distributed/fleet_executor/compute_interceptor.h
@@ -22,6 +22,8 @@
namespace paddle {
namespace distributed {

const int64_t INFINITE_BUFFER_SIZE = -1;

class ComputeInterceptor : public Interceptor {
public:
ComputeInterceptor(int64_t interceptor_id, TaskNode* node);
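The INFINITE_BUFFER_SIZE constant added above is a sentinel: a buffer whose maximum size is -1 is treated as unbounded, and the matching changes in compute_interceptor.cc skip the ready-size and used-size capacity checks for such buffers. A small standalone sketch of the pattern, with a plain assert standing in for PADDLE_ENFORCE_LE (illustrative names, not the actual Paddle code):

#include <cassert>
#include <cstdint>

const int64_t INFINITE_BUFFER_SIZE = -1;  // -1 means "no capacity limit"

// Bump the ready counter for one upstream buffer; only bounded buffers
// enforce the max_ready_size limit.
void IncreaseReady(int64_t max_ready_size, int64_t* ready_size) {
  *ready_size += 1;
  if (max_ready_size != INFINITE_BUFFER_SIZE) {
    assert(*ready_size <= max_ready_size);  // PADDLE_ENFORCE_LE in the real code
  }
}

The same guard appears in CanWriteOutput() and SendDataReadyToDownStream(), so an unbounded buffer is never reported as full.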
41 changes: 24 additions & 17 deletions paddle/fluid/distributed/fleet_executor/fleet_executor.cc
@@ -14,6 +14,7 @@
#include "paddle/fluid/distributed/fleet_executor/fleet_executor.h"

#include <algorithm>
#include <vector>

#include "paddle/fluid/distributed/fleet_executor/global.h"
#include "paddle/fluid/distributed/fleet_executor/message_bus.h"
@@ -24,6 +25,7 @@
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/variable.h"

namespace paddle {
namespace distributed {
@@ -59,7 +61,8 @@ void FleetExecutor::Init(
int64_t num_micro_batches,
const std::vector<TaskNode*>& task_nodes,
const std::unordered_map<int64_t, int64_t>& task_id_to_rank,
const std::vector<std::string>& inference_root_scope_vars) {
const std::vector<std::string>& inference_root_scope_vars,
const std::vector<framework::Scope*>& micro_scope_list) {
PADDLE_ENFORCE_GT(task_nodes.size(),
0,
platform::errors::InvalidArgument(
@@ -108,21 +111,22 @@
task_node->SetUnusedVars(unused_vars);
if (task_node->type() == "Cond") {
std::vector<std::string> while_block_vars;
std::vector<std::string> vars_in_parent;
std::vector<std::string> vars_in_sub;
for (auto& var : program_desc.Block(0).AllVars()) {
vars_in_parent.emplace_back(var->Name());
}
VLOG(3) << "Vars in while sub block:";
for (auto& var : program_desc.Block(1).AllVars()) {
vars_in_sub.emplace_back(var->Name());
VLOG(3) << var->Name();
while_block_vars.emplace_back(var->Name());
}
for (const auto& pair : unused_vars) {
if (pair.first->Type() == "while") {
for (const auto& var_name : pair.second) {
while_block_vars.emplace_back(var_name);
}
}
}
VLOG(3) << "Vars below will be removed after while:";
for (const auto& name : while_block_vars) {
VLOG(3) << name;
}
std::sort(vars_in_parent.begin(), vars_in_parent.end());
std::sort(vars_in_sub.begin(), vars_in_sub.end());
std::set_difference(vars_in_sub.begin(),
vars_in_sub.end(),
vars_in_parent.begin(),
vars_in_parent.end(),
std::back_inserter(while_block_vars));
task_node->SetWhileBlockVars(while_block_vars);
}
int64_t interceptor_id = task_node->task_id();
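This hunk swaps how while_block_vars is computed: one version derives it with std::set_difference of the sub-block's variable names against the parent block's, the other simply collects every variable declared in the while sub-block plus any variables the while op reports as unused. A standalone sketch of the latter selection (illustrative names only, not Paddle's API):

#include <string>
#include <vector>

// Variables to drop after the while loop: all sub-block vars plus the
// while op's unused vars.
std::vector<std::string> CollectWhileBlockVars(
    const std::vector<std::string>& sub_block_vars,
    const std::vector<std::string>& while_op_unused_vars) {
  std::vector<std::string> while_block_vars = sub_block_vars;
  while_block_vars.insert(while_block_vars.end(),
                          while_op_unused_vars.begin(),
                          while_op_unused_vars.end());
  return while_block_vars;
}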
@@ -144,7 +148,8 @@
place,
num_micro_batches,
program_desc,
inference_root_scope_vars);
inference_root_scope_vars,
micro_scope_list);
GlobalVal<MessageBus>::Get()->Barrier();
}

Expand All @@ -154,15 +159,17 @@ void FleetExecutor::InitCarrier(
const platform::Place& place,
int64_t num_micro_batches,
const framework::ProgramDesc& program_desc,
const std::vector<std::string>& inference_root_scope_vars) {
const std::vector<std::string>& inference_root_scope_vars,
const std::vector<framework::Scope*>& micro_scope_list) {
carrier->Init(exe_desc_.cur_rank(),
runtime_graph_->interceptor_id_to_rank(),
runtime_graph_->interceptor_id_to_node(),
program_desc,
scope,
num_micro_batches,
place,
inference_root_scope_vars);
inference_root_scope_vars,
micro_scope_list);
}

void FleetExecutor::InitMessageBus() {
7 changes: 5 additions & 2 deletions paddle/fluid/distributed/fleet_executor/fleet_executor.h
@@ -18,6 +18,7 @@

#include "paddle/fluid/distributed/fleet_executor/carrier.h"
#include "paddle/fluid/distributed/fleet_executor/fleet_executor_desc.pb.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/platform/macros.h"
#include "paddle/fluid/platform/place.h"

@@ -45,7 +46,8 @@ class FleetExecutor final {
int64_t num_micro_batches,
const std::vector<TaskNode*>& task_nodes,
const std::unordered_map<int64_t, int64_t>& task_id_to_rank,
const std::vector<std::string>& inference_root_scope_vars = {});
const std::vector<std::string>& inference_root_scope_vars = {},
const std::vector<framework::Scope*>& micro_scope_list = {});
void Run(const std::string& carrier_id);

private:
@@ -57,7 +59,8 @@
const platform::Place& place,
int64_t num_micro_batches,
const framework::ProgramDesc& program_desc,
const std::vector<std::string>& inference_root_scope_vars = {});
const std::vector<std::string>& inference_root_scope_vars = {},
const std::vector<framework::Scope*>& micro_scope_list = {});
FleetExecutorDesc exe_desc_;
std::shared_ptr<RuntimeGraph> runtime_graph_;
std::unordered_set<std::string> carrier_ids_;
@@ -99,7 +99,6 @@ paddle::experimental::Tensor add_n_ad_func(
egr::EagerUtils::SetHistory(out_autograd_meta, grad_node);
}
grad_node->SetGradInMeta(out, 0);
egr::EagerUtils::CheckAndRetainGrad(out);
// Set TensorWrappers for Forward Outputs if needed
}

@@ -162,7 +162,6 @@ paddle::experimental::Tensor conv2d_ad_func(
egr::EagerUtils::SetHistory(out_autograd_meta, grad_node);
}
grad_node->SetGradInMeta(out, 0);
egr::EagerUtils::CheckAndRetainGrad(out);
// Set TensorWrappers for Forward Outputs if needed
}

@@ -159,8 +159,6 @@ Conv2dGradNodeFinal::operator()(
}
grad_node->SetGradInMeta(grad_input, 0);
grad_node->SetGradInMeta(grad_filter, 1);
egr::EagerUtils::CheckAndRetainGrad(grad_input);
egr::EagerUtils::CheckAndRetainGrad(grad_filter);
// Set TensorWrappers for Forward Outputs if needed
}

