Skip to content

Commit

Permalink
[FEAT] Support building or writing graph from/to GAR format data (#1185)
Browse files Browse the repository at this point in the history
What do these changes do?
-------------------------
The [GAR format](https://github.com/alibaba/GraphAr) is a data format
for graphs that preserves the CSR/CSC structure information, which makes
it easy to build the CSR/CSC of a graph from GAR format data.

This PR adds support for loading a property graph from GAR format data
and for writing an in-memory property graph to GAR format data. The PR
mainly includes:
- Add `GARFragmentLoader` and `GARArrowFragmentBuilder` to support
loading arrow property fragment from GAR format data.
- Add `ArrowFragmentWriter` to support dumping arrow property graph to
GAR format data.
- Add the [`GraphAr`](https://github.com/alibaba/GraphAr) project as a
submodule to provide the API for accessing GAR format data

The following items still needed to be done:
- [x] Add `GraphAr` as submodule and use compile option to check if
compile with GraphAr
- [x] Add test to check the correctness of loading and writing

Related issue number
--------------------
apache/incubator-graphar#40
apache/incubator-graphar#39

---------

Signed-off-by: acezen <qiaozi.zwb@alibaba-inc.com>
Signed-off-by: Tao He <linzhu.ht@alibaba-inc.com>
Co-authored-by: Tao He <linzhu.ht@alibaba-inc.com>
  • Loading branch information
acezen and sighingnow committed Feb 16, 2023
1 parent 55a22bc commit 0eda206
Show file tree
Hide file tree
Showing 26 changed files with 2,680 additions and 8 deletions.
18 changes: 13 additions & 5 deletions .github/workflows/build-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ jobs:
os: [ubuntu-20.04]
malloc: [dlmalloc, mimalloc]
# metadata: [etcd, redis]
metadata: [etcd] # disable redis for now as it seems has some nondeterministic bugs
metadata: [etcd] # disable redis for now as it seems has some nondeterministic bugs
exclude:
- malloc: mimalloc
metadata: redis
Expand All @@ -42,7 +42,7 @@ jobs:
steps:
- uses: actions/checkout@v3
with:
submodules: true
submodules: recursive

- name: Generate Summary for Submodules
run: |
Expand Down Expand Up @@ -213,7 +213,8 @@ jobs:
-DBUILD_VINEYARD_HOSSEINMOEIN_DATAFRAME=ON \
-DBUILD_VINEYARD_FUSE=ON \
-DBUILD_VINEYARD_FUSE_PARQUET=ON \
-DBUILD_VINEYARD_TESTS=ON
-DBUILD_VINEYARD_GRAPH_WITH_GAR=ON \
-DBUILD_VINEYARD_TESTS_ALL=ON
if [ "${{ matrix.metadata }}" == "redis" ]; then
cmake .. -DBUILD_VINEYARD_SERVER_REDIS=ON
Expand Down Expand Up @@ -289,6 +290,13 @@ jobs:
- name: Build
run: |
# Workaround to [Errno 28] No space left on device
# https://github.com/actions/runner-images/issues/2840#issuecomment-790492173
sudo rm -rf /usr/share/dotnet
sudo rm -rf /opt/ghc
sudo rm -rf "/usr/local/share/boost"
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib:/usr/local/lib64
pushd build
Expand Down Expand Up @@ -398,7 +406,7 @@ jobs:
rm -rf default.etcd
rm -rf /dev/shm/etcd*
python3 test/runner.py $RUNNER_ARGS --with-io --with-migration
- name: Run FUSE Tests
run: |
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib:/usr/local/lib64:/usr/local/lib/x86_64-linux-gnu
Expand All @@ -410,7 +418,7 @@ jobs:
rm -rf default.etcd
rm -rf /dev/shm/etcd*
python3 test/runner.py $RUNNER_ARGS --with-fuse
- name: Find vineyard using CMake
run: |
cmake -S test/vineyard-cmake-example -B build/vineyard-cmake-example
Expand Down
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -46,3 +46,6 @@
path = thirdparty/libcuckoo
url = https://github.com/efficient/libcuckoo.git
shallow = true
[submodule "modules/graph/thirdparty/GraphAr"]
path = modules/graph/thirdparty/GraphAr
url = https://github.com/alibaba/GraphAr.git
7 changes: 4 additions & 3 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ if(POLICY CMP0069)
cmake_policy(SET CMP0069 NEW)
set(CMAKE_POLICY_DEFAULT_CMP0069 NEW)
endif()

project(vineyard LANGUAGES C CXX VERSION ${VINEYARD_VERSION})

option(BUILD_SHARED_LIBS "Build shared libraries" ON)
Expand Down Expand Up @@ -59,6 +59,7 @@ option(BUILD_VINEYARD_BENCHMARKS "Generate make targets for vineyard benchmarks"
option(BUILD_VINEYARD_BENCHMARKS_ALL "Include make targets for vineyard benchmarks to ALL" OFF)
option(BUILD_VINEYARD_COVERAGE "Build vineyard with coverage information, requires build with Debug" OFF)
option(BUILD_VINEYARD_PROFILING "Build vineyard with profiling information" OFF)
option(BUILD_VINEYARD_GRAPH_WITH_GAR "Building vineyard's graph data with GraphAr support" OFF)

include(CheckCXXCompilerFlag)
include(CheckLibraryExists)
Expand Down Expand Up @@ -649,7 +650,7 @@ if(BUILD_VINEYARD_SERVER)
"src/common/memory/*.cc"
"src/common/util/*.cc"
"src/common/memory/gpu/*.cc"

)
if(BUILD_VINEYARD_SERVER_REDIS)
list(APPEND SERVER_SRC_FILES "thirdparty/redis-plus-plus-shim/recipes/redlock.cpp")
Expand Down Expand Up @@ -677,7 +678,7 @@ if(BUILD_VINEYARD_SERVER)
enable_language(CUDA)
find_package(CUDA REQUIRED)
add_definitions(-DENABLE_GPU)
include_directories(${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
include_directories(${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
target_link_libraries(vineyardd PUBLIC ${CUDA_LIBRARIES})
endif()

Expand Down
24 changes: 24 additions & 0 deletions modules/basic/ds/arrow_utils.cc
Original file line number Diff line number Diff line change
Expand Up @@ -514,6 +514,30 @@ Status ConcatenateTables(
return Status::OK();
}

/**
 * Concatenate multiple arrow tables column-wise (side by side) into one.
 *
 * The columns and fields of every table are appended, in input order, after
 * those of the first table. Column names are not de-duplicated.
 *
 * @param tables The tables to combine; must be non-empty and every table
 *               must have the same number of rows.
 * @param table  Output: the combined table. When `tables` holds exactly one
 *               element that element is returned as-is. Left untouched when
 *               an error status is returned.
 * @return Status::OK() on success, Status::Invalid(...) when `tables` is
 *         empty or when the row counts of the tables differ.
 */
Status ConcatenateTablesColumnWise(
    const std::vector<std::shared_ptr<arrow::Table>>& tables,
    std::shared_ptr<arrow::Table>& table) {
  // Indexing tables[0] on an empty vector is undefined behaviour, so reject
  // empty input explicitly.
  if (tables.empty()) {
    return Status::Invalid("No tables to concatenate column-wise");
  }
  if (tables.size() == 1) {
    table = tables[0];
    return Status::OK();
  }

  const auto num_rows = tables[0]->num_rows();
  std::vector<std::shared_ptr<arrow::ChunkedArray>> columns =
      tables[0]->columns();
  std::vector<std::shared_ptr<arrow::Field>> fields = tables[0]->fields();
  for (size_t i = 1; i < tables.size(); ++i) {
    // arrow::Table::Make does not validate column lengths, so placing
    // columns of different lengths side by side would silently yield an
    // invalid table.
    if (tables[i]->num_rows() != num_rows) {
      return Status::Invalid(
          "Cannot concatenate tables column-wise: row counts differ");
    }

    const std::vector<std::shared_ptr<arrow::ChunkedArray>>& right_columns =
        tables[i]->columns();
    columns.insert(columns.end(), right_columns.begin(), right_columns.end());

    const std::vector<std::shared_ptr<arrow::Field>>& right_fields =
        tables[i]->fields();
    fields.insert(fields.end(), right_fields.begin(), right_fields.end());
  }
  table =
      arrow::Table::Make(arrow::schema(std::move(fields)), std::move(columns));
  return Status::OK();
}

std::shared_ptr<arrow::RecordBatch> AddMetadataToRecordBatch(
std::shared_ptr<arrow::RecordBatch> const& batch,
std::map<std::string, std::string> const& meta) {
Expand Down
7 changes: 7 additions & 0 deletions modules/basic/ds/arrow_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,13 @@ Status ConcatenateTables(
const std::vector<std::shared_ptr<arrow::Table>>& tables,
std::shared_ptr<arrow::Table>& table);

/**
 * @brief Concatenate multiple arrow tables column-wise into a single table.
 *
 * The columns and fields of each input table are appended, in order, after
 * those of the first table. If `tables` contains exactly one table, that
 * table is returned unchanged.
 *
 * @param tables The input tables. NOTE(review): callers are expected to pass
 *               at least one table, and tables with matching row counts.
 * @param table  Output parameter receiving the combined table.
 * @return The status of the operation.
 */
Status ConcatenateTablesColumnWise(
const std::vector<std::shared_ptr<arrow::Table>>& tables,
std::shared_ptr<arrow::Table>& table);

/**
* @brief Add extra metadata mapping to existing recordbatch.
*/
Expand Down
19 changes: 19 additions & 0 deletions modules/graph/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ file(GLOB_RECURSE GRAPH_SRC_FILES "${CMAKE_CURRENT_SOURCE_DIR}" "fragment/*.cc"
"loader/*.cc"
"utils/*.cc"
"vertex_map/*.cc"
"writer/*.cc"
)

add_library(vineyard_graph ${GRAPH_SRC_FILES})
Expand Down Expand Up @@ -76,6 +77,24 @@ else()
)
endif()

# Optional GraphAr (GAR) support for the graph module, controlled by the
# BUILD_VINEYARD_GRAPH_WITH_GAR option.
if(BUILD_VINEYARD_GRAPH_WITH_GAR)
# ENABLE_GAR is PUBLIC so that targets linking vineyard_graph also see the
# GAR-guarded declarations (e.g. headers wrapped in `#ifdef ENABLE_GAR`).
target_compile_definitions(vineyard_graph PUBLIC -DENABLE_GAR)
# Prefer an already-installed GraphAr; QUIET keeps a failed lookup silent
# because the bundled submodule is used as the fallback below.
find_package(gar QUIET)
if (gar_FOUND)
message(STATUS "-- Found GraphAr: ${GAR_LIBRARIES}")
target_include_directories(vineyard_graph PUBLIC ${GAR_INCLUDE_DIRS})
target_link_libraries(vineyard_graph PUBLIC ${GAR_LIBRARIES})
else()
# No installed GraphAr found: build it from the git submodule instead.
# NOTE(review): add_subdirectory_static is a project-defined macro that is
# not visible here; presumably it adds the subdirectory as a static build.
message(STATUS "-- Building GraphAr from submodule: ${CMAKE_CURRENT_SOURCE_DIR}/thirdparty/GraphAr")
add_subdirectory_static("${CMAKE_CURRENT_SOURCE_DIR}/thirdparty/GraphAr"
"${CMAKE_CURRENT_BINARY_DIR}/thirdparty/GraphAr"
)
# Link against the `gar` target defined by the submodule build.
target_link_libraries(vineyard_graph PUBLIC gar)
endif()
endif()

target_include_directories(vineyard_graph PUBLIC
$<BUILD_INTERFACE:${LIBGRAPELITE_INCLUDE_DIRS}>
$<INSTALL_INTERFACE:include>
Expand Down
104 changes: 104 additions & 0 deletions modules/graph/fragment/gar_fragment_builder.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
/** Copyright 2020-2023 Alibaba Group Holding Limited.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

#ifndef MODULES_GRAPH_FRAGMENT_GAR_FRAGMENT_BUILDER_H_
#define MODULES_GRAPH_FRAGMENT_GAR_FRAGMENT_BUILDER_H_

#ifdef ENABLE_GAR

#include <memory>
#include <vector>

#include "arrow/api.h"
#include "arrow/io/api.h"

#include "grape/worker/comm_spec.h"

#include "client/client.h"

#include "graph/loader/fragment_loader_utils.h"

namespace vineyard {

/**
 * @brief Fragment builder that is initialised from vertex tables and
 * CSR/CSC edge tables and extends ArrowFragmentBaseBuilder.
 *
 * NOTE(review): per the accompanying change description this builder is
 * meant for constructing arrow property fragments from GAR format data; the
 * member definitions live outside this header.
 *
 * @tparam OID_T        Original vertex id type.
 * @tparam VID_T        Internal vertex id type.
 * @tparam VERTEX_MAP_T Vertex map type; defaults to an ArrowVertexMap keyed
 *                      by the internal representation of OID_T.
 */
template <typename OID_T, typename VID_T,
typename VERTEX_MAP_T =
ArrowVertexMap<typename InternalType<OID_T>::type, VID_T>>
class GARFragmentBuilder
: public ArrowFragmentBaseBuilder<OID_T, VID_T, VERTEX_MAP_T> {
using Base = ArrowFragmentBaseBuilder<OID_T, VID_T, VERTEX_MAP_T>;

using oid_t = OID_T;
using vid_t = VID_T;
using internal_oid_t = typename InternalType<oid_t>::type;
using eid_t = property_graph_types::EID_TYPE;
using label_id_t = property_graph_types::LABEL_ID_TYPE;
using vertex_map_t = VERTEX_MAP_T;
using nbr_unit_t = property_graph_utils::NbrUnit<vid_t, eid_t>;
using vid_array_t = vineyard::ArrowArrayType<vid_t>;

public:
/**
 * @brief Construct a builder bound to a vineyard client and a vertex map.
 *
 * @param client Vineyard client; stored by reference, so it must outlive
 *               this builder.
 * @param vm_ptr Shared pointer to the vertex map, kept by the builder.
 */
explicit GARFragmentBuilder(vineyard::Client& client,
std::shared_ptr<vertex_map_t> vm_ptr)
: ArrowFragmentBaseBuilder<oid_t, vid_t, vertex_map_t>(client),
client_(client),
vm_ptr_(vm_ptr) {}

/**
 * @brief Override of the base builder's Build hook.
 *
 * @param client Vineyard client passed in by the caller.
 * @return The status of the build.
 */
vineyard::Status Build(vineyard::Client& client) override;

/**
 * @brief Initialise the builder from vertex and edge tables.
 *
 * @param fid             Id of the fragment being built.
 * @param fnum            Fragment count (presumably the total number of
 *                        fragments; confirm against the implementation).
 * @param vertex_tables   Vertex tables, consumed by rvalue reference.
 * @param csr_edge_tables Edge tables in CSR layout, consumed by rvalue
 *                        reference.
 * @param csc_edge_tables Edge tables in CSC layout, consumed by rvalue
 *                        reference.
 * @param directed        Whether the graph is directed; defaults to true.
 * @param concurrency     Concurrency hint; defaults to 1.
 * @return An empty result on success, or an error.
 */
boost::leaf::result<void> Init(
fid_t fid, fid_t fnum,
std::vector<std::shared_ptr<arrow::Table>>&& vertex_tables,
std::vector<EdgeTableInfo>&& csr_edge_tables,
std::vector<EdgeTableInfo>&& csc_edge_tables, bool directed = true,
int concurrency = 1);

/**
 * @brief Set the property graph schema, consumed by rvalue reference.
 *
 * @param schema The schema to use.
 * @return An empty result on success, or an error.
 */
boost::leaf::result<void> SetPropertyGraphSchema(
PropertyGraphSchema&& schema);

private:
// Initialise vertex data from the given tables. Expected table layout:
// | prop_0 | prop_1 | ... |
boost::leaf::result<void> initVertices(
std::vector<std::shared_ptr<arrow::Table>>&& vertex_tables);

// Initialise edge data from the CSR and CSC edge tables. Expected table
// layout:
// | src_id(generated) | dst_id(generated) | prop_0 | prop_1
// | ... |
boost::leaf::result<void> initEdges(
std::vector<EdgeTableInfo>&& csr_edge_tables,
std::vector<EdgeTableInfo>&& csc_edge_tables, int concurrency);

// Non-owning reference to the client supplied at construction.
vineyard::Client& client_;
// NOTE(review): presumably per-label inner / outer / total vertex counts;
// confirm against the implementation.
std::vector<vid_t> ivnums_, ovnums_, tvnums_;

std::vector<std::shared_ptr<arrow::Table>> vertex_tables_;
// NOTE(review): presumably outer-vertex gid lists and gid-to-lid maps;
// confirm against the implementation.
std::vector<std::shared_ptr<vid_array_t>> ovgid_lists_;
std::vector<typename ArrowFragment<OID_T, VID_T>::ovg2l_map_t> ovg2l_maps_;

std::vector<std::shared_ptr<arrow::Table>> edge_tables_;
std::vector<std::shared_ptr<arrow::Int64Array>> offset_arrays_;

// Two-level lists of neighbour-unit builders and int64 offset arrays for
// incoming (ie_) and outgoing (oe_) edges. NOTE(review): the meaning of the
// two index levels is not visible in this header.
std::vector<std::vector<std::shared_ptr<PodArrayBuilder<nbr_unit_t>>>>
ie_lists_, oe_lists_;
std::vector<std::vector<std::shared_ptr<arrow::Int64Array>>>
ie_offsets_lists_, oe_offsets_lists_;

// Vertex map supplied at construction.
std::shared_ptr<vertex_map_t> vm_ptr_;

IdParser<vid_t> vid_parser_;
};

} // namespace vineyard

#endif // ENABLE_GAR
#endif // MODULES_GRAPH_FRAGMENT_GAR_FRAGMENT_BUILDER_H_

0 comments on commit 0eda206

Please sign in to comment.