Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GPUDirect RDMA Out-of-Band Tensor Transport #11392

Merged
merged 48 commits into from Aug 9, 2017
Merged
Show file tree
Hide file tree
Changes from 40 commits
Commits
Show all changes
48 commits
Select commit Hold shift + click to select a range
f7f555f
GPU Direct RDMA Out-of-Band Tensor Transport
byronyi May 4, 2017
114b110
[WIP] GPU Direct with customized allocator
byronyi Jul 4, 2017
5b5661b
[WIP] Data race problem
byronyi Jul 7, 2017
dd7e2fc
[WIP] Refactor and add checksum for GDR
byronyi Jul 9, 2017
6c91828
[WIP] Add debug string to checksum check
byronyi Jul 9, 2017
a1323e5
Final piece of host memory fallback
byronyi Jul 9, 2017
44d74bd
Bugfix on memory region management
byronyi Jul 9, 2017
d3b84ee
Add RDMA library headers as third party dependency
byronyi Jul 10, 2017
be585a2
Revert "Add RDMA library headers as third party dependency"
byronyi Jul 11, 2017
2852d34
make buildifier happy
byronyi Jul 11, 2017
9638a5f
fix errors for non-RDMA target
byronyi Jul 11, 2017
4808c65
fix good path
byronyi Jul 11, 2017
f07170e
fix dangling pointer
byronyi Jul 11, 2017
947876e
add compile switch for GDR
byronyi Jul 13, 2017
946afcb
make buildifier happy
byronyi Jul 13, 2017
cdc98e1
tidy source format using clang-format --style=google
byronyi Jul 13, 2017
e6cc8ee
using buildifier to auto format
byronyi Jul 13, 2017
8858452
fix macro
byronyi Jul 13, 2017
d3ae026
fix build config
byronyi Jul 13, 2017
afd3c36
Fix a performance bug (and #11411 hopefully)
byronyi Jul 14, 2017
0cb5019
fix kUnknownNumaNode
byronyi Jul 17, 2017
074b48a
prepare for a cleaned up refactoring
byronyi Jul 31, 2017
915f2b2
remove unnecessary changes
byronyi Jul 31, 2017
2e874df
finishing moving to contrib
byronyi Aug 1, 2017
1e8b668
several quick fixes
byronyi Aug 1, 2017
7739c56
fix tests
byronyi Aug 2, 2017
e7eaf28
remove wrong stop() call in server join
byronyi Aug 2, 2017
0191ccf
fix a init race condition for gdr w.r.t. cuda
byronyi Aug 2, 2017
e4c1a99
better reporting of errors via errno
byronyi Aug 3, 2017
59e7b7d
add check for wildcard and loopback address
byronyi Aug 3, 2017
3c54f86
do not visit gpu allocators if no gpu is active
byronyi Aug 3, 2017
3b8f30c
Revert "do not visit gpu allocators if no gpu is active"
byronyi Aug 3, 2017
054daeb
fix for latest gcc and cpu-only build
byronyi Aug 3, 2017
6720470
complain louder on server side
byronyi Aug 3, 2017
9557643
reduce CPU overhead using event mode
byronyi Aug 3, 2017
71eab60
leave platform-neutral gdr to future work
byronyi Aug 4, 2017
4882418
add docs
byronyi Aug 4, 2017
7240ae3
fix interference with MKL CPU allocator
byronyi Aug 4, 2017
c20e8f4
fix testing if a tensor is on host
byronyi Aug 4, 2017
e841bc8
add several TODOs and checks
byronyi Aug 4, 2017
0a1e5c5
add a readme
byronyi Aug 5, 2017
5af83a5
Update README.md
byronyi Aug 5, 2017
bfb8f24
update readme.md
byronyi Aug 7, 2017
9bb2c8e
fix several issues under VLOG
byronyi Aug 7, 2017
554c3e4
fix typo in readme
byronyi Aug 7, 2017
f9a9d18
fix a race condition of GDR
byronyi Aug 8, 2017
db5501e
merge HEAD from master
byronyi Aug 8, 2017
241c020
remove unintended changes
byronyi Aug 8, 2017
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 2 additions & 0 deletions configure.py
Expand Up @@ -911,6 +911,8 @@ def main():
'with_hdfs_support', False)
set_build_var(environ_cp, 'TF_ENABLE_XLA', 'XLA JIT', 'with_xla_support',
False)
set_build_var(environ_cp, 'TF_NEED_GDR', 'GDR', 'with_gdr_support',
False)
set_build_var(environ_cp, 'TF_NEED_VERBS', 'VERBS', 'with_verbs_support',
False)

Expand Down
6 changes: 6 additions & 0 deletions tensorflow/BUILD
Expand Up @@ -182,6 +182,12 @@ config_setting(
visibility = ["//visibility:public"],
)

# Matches builds invoked with `--define with_gdr_support=true`, so that
# select() expressions elsewhere can key on "//tensorflow:with_gdr_support".
config_setting(
    name = "with_gdr_support",
    values = {
        "define": "with_gdr_support=true",
    },
    visibility = ["//visibility:public"],
)

config_setting(
name = "with_verbs_support",
values = {"define": "with_verbs_support=true"},
Expand Down
122 changes: 122 additions & 0 deletions tensorflow/contrib/gdr/BUILD
@@ -0,0 +1,122 @@
# Description:
# GPU Direct RDMA Out-of-Band Tensor transport for TensorFlow.

# Unless a target overrides it, everything declared in this package is
# visible to all packages under //tensorflow.
package(default_visibility = ["//tensorflow:__subpackages__"])

licenses(["notice"])  # Apache 2.0

# Make the LICENSE file referenceable as a label from other packages.
exports_files(["LICENSE"])

# Every file beneath this package, except METADATA and OWNERS files at any
# depth.
filegroup(
    name = "all_files",
    srcs = glob(
        include = ["**/*"],
        exclude = [
            "**/METADATA",
            "**/OWNERS",
        ],
    ),
    visibility = ["//tensorflow:__subpackages__"],
)

# All .cc and .h files beneath this package.
# NOTE(review): the glob is attached through `data` rather than `srcs`;
# confirm with the consumers of this target that this is intentional.
filegroup(
    name = "c_srcs",
    data = glob(
        include = [
            "**/*.cc",
            "**/*.h",
        ],
    ),
)

load(
"//tensorflow:tensorflow.bzl",
"tf_cuda_library",
)

# For platform specific build config
load(
"//tensorflow/core:platform/default/build_config.bzl",
"tf_proto_library_cc",
)

# C++ proto library generated from gdr.proto. Other targets in this file
# depend on it through the ":gdr_proto_cc" label.
tf_proto_library_cc(
    name = "gdr_proto",
    srcs = ["gdr.proto"],
    cc_api_version = 2,
    visibility = ["//tensorflow:__subpackages__"],
)

# Library built from gdr_memory_manager.{h,cc}.
# The ibverbs and rdmacm libraries are linked only when the
# "//tensorflow:with_gdr_support" config setting matches; in every other
# configuration no extra link options are added.
tf_cuda_library(
    name = "gdr_memory_manager",
    srcs = ["gdr_memory_manager.cc"],
    hdrs = ["gdr_memory_manager.h"],
    linkopts = select({
        "//tensorflow:with_gdr_support": [
            "-libverbs",
            "-lrdmacm",
        ],
        "//conditions:default": [],
    }),
    deps = [
        ":gdr_proto_cc",
        "//tensorflow/core:framework",
        "//tensorflow/core:gpu_runtime",
        "//tensorflow/core:lib",
    ],
)

# Library built from gdr_worker.{h,cc}. It depends on :gdr_memory_manager
# and on the distributed runtime's worker and gRPC worker service targets.
tf_cuda_library(
    name = "gdr_worker",
    srcs = ["gdr_worker.cc"],
    hdrs = ["gdr_worker.h"],
    deps = [
        ":gdr_memory_manager",
        "//tensorflow/core:core_cpu_internal",
        "//tensorflow/core:framework",
        "//tensorflow/core:gpu_runtime",
        "//tensorflow/core:lib",
        "//tensorflow/core:lib_internal",
        "//tensorflow/core/distributed_runtime:graph_mgr",
        "//tensorflow/core/distributed_runtime:rendezvous_mgr_interface",
        "//tensorflow/core/distributed_runtime:worker",
        "//tensorflow/core/distributed_runtime:worker_cache",
        "//tensorflow/core/distributed_runtime:worker_env",
        "//tensorflow/core/distributed_runtime:worker_session",
        "//tensorflow/core/distributed_runtime/rpc:grpc_worker_service",
    ],
)

# Library built from gdr_rendezvous_mgr.{h,cc}. It depends on
# :gdr_memory_manager and the distributed runtime's base rendezvous manager.
cc_library(
    name = "gdr_rendezvous_mgr",
    srcs = ["gdr_rendezvous_mgr.cc"],
    hdrs = ["gdr_rendezvous_mgr.h"],
    deps = [
        ":gdr_memory_manager",
        "//tensorflow/core:core_cpu_internal",
        "//tensorflow/core:framework",
        "//tensorflow/core:lib",
        "//tensorflow/core/distributed_runtime:base_rendezvous_mgr",
        "//tensorflow/core/distributed_runtime:worker_cache",
        "//tensorflow/core/distributed_runtime:worker_env",
        "//tensorflow/core/distributed_runtime:worker_interface",
    ],
)

# Library built from gdr_server_lib.{h,cc}, pulling together the memory
# manager, rendezvous manager and worker targets declared in this file.
# alwayslink is set so that the library's objects are linked into dependents
# even when none of its symbols are referenced directly.
cc_library(
    name = "gdr_server_lib",
    srcs = ["gdr_server_lib.cc"],
    hdrs = ["gdr_server_lib.h"],
    linkstatic = 1,  # Seems to be needed since alwayslink is broken in bazel
    deps = [
        ":gdr_memory_manager",
        ":gdr_rendezvous_mgr",
        ":gdr_worker",
        "//tensorflow/core:lib",
        "//tensorflow/core/distributed_runtime:server_lib",
        "//tensorflow/core/distributed_runtime/rpc:grpc_server_lib",
    ],
    alwayslink = 1,
)
15 changes: 15 additions & 0 deletions tensorflow/contrib/gdr/gdr.proto
@@ -0,0 +1,15 @@
syntax = "proto3";

package tensorflow;
option cc_enable_arenas = true;

import "google/protobuf/any.proto";

// NOTE(review): judging by its name, this message describes a memory region
// on a remote peer. The per-field notes below are inferred from field names
// only; confirm them against the code that produces and consumes the message.
message RemoteMemoryRegion {
  // Presumably the address of the peer that owns the region.
  string host = 1;

  // Carried as a string rather than a numeric type.
  string port = 2;

  // Presumably the address of the region on the remote side.
  uint64 addr = 3;

  // Presumably the RDMA remote key for the region; verify.
  uint32 rkey = 4;

  // Presumably identifies the tensor associated with the region; verify.
  uint32 tensor_key = 5;

  // Presumably a checksum over the region's contents; verify.
  uint64 checksum = 6;
}