diff --git a/.github/workflows/flex.yml b/.github/workflows/flex.yml index fb5381ef6fea..6f5a798f5fa1 100644 --- a/.github/workflows/flex.yml +++ b/.github/workflows/flex.yml @@ -26,7 +26,7 @@ jobs: runs-on: ubuntu-20.04 if: ${{ github.repository == 'alibaba/GraphScope' }} container: - image: registry.cn-hongkong.aliyuncs.com/graphscope/hqps-server-base:v0.0.4 + image: registry.cn-hongkong.aliyuncs.com/graphscope/hqps-server-base:v0.0.6 steps: - uses: actions/checkout@v3 @@ -55,7 +55,9 @@ jobs: mkdir build && cd build cmake .. && sudo make -j$(nproc) export FLEX_DATA_DIR=../../../../storages/rt_mutable_graph/modern_graph/ - ./run_grin_test + ./run_grin_test flex://../../../../interactive/examples/modern_graph/ \ + ../../../../interactive/examples/modern_graph/modern_graph.yaml \ + ../../../../interactive/examples/modern_graph/bulk_load.yaml - name: Test Graph Loading on modern graph env: diff --git a/.github/workflows/hqps-db-ci.yml b/.github/workflows/hqps-db-ci.yml index 289add5288c9..8f875b64ae36 100644 --- a/.github/workflows/hqps-db-ci.yml +++ b/.github/workflows/hqps-db-ci.yml @@ -28,7 +28,7 @@ jobs: runs-on: ubuntu-20.04 if: ${{ github.repository == 'alibaba/GraphScope' }} container: - image: registry.cn-hongkong.aliyuncs.com/graphscope/hqps-server-base:v0.0.4 + image: registry.cn-hongkong.aliyuncs.com/graphscope/hqps-server-base:v0.0.6 steps: - uses: actions/checkout@v3 diff --git a/.gitignore b/.gitignore index 50f7e42c339a..477bf177b5ef 100644 --- a/.gitignore +++ b/.gitignore @@ -90,8 +90,8 @@ core.* # Flex related flex/docs/ -flex/interactive/data/*/indices/ -flex/interactive/data/*/plugins/ +flex/interactive/data/* flex/interactive/logs/* flex/interactive/examples/sf0.1-raw/ -flex/interactive/.running \ No newline at end of file +flex/interactive/.running +flex/interactive/.env \ No newline at end of file diff --git a/flex/.devcontainer.json b/flex/.devcontainer.json index 7bd8fe4dd2b2..6118d0fbdae8 100644 --- a/flex/.devcontainer.json +++ 
b/flex/.devcontainer.json @@ -3,7 +3,7 @@ { "name": "GraphScope", // Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile - "image": "registry.cn-hongkong.aliyuncs.com/graphscope/hqps-server-base:v0.0.4", + "image": "registry.cn-hongkong.aliyuncs.com/graphscope/hqps-server-base:v0.0.6", // Features to add to the dev container. More info: https://containers.dev/features. "features": { "ghcr.io/devcontainers/features/common-utils:2": { diff --git a/flex/CMakeLists.txt b/flex/CMakeLists.txt index 5169aa6257af..abe74ef335b3 100644 --- a/flex/CMakeLists.txt +++ b/flex/CMakeLists.txt @@ -19,6 +19,7 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../) set(DEFAULT_BUILD_TYPE "Release") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17 -mno-avx512f -fPIC") +set(CMAKE_CXX_FLAGS_DEBUG "-g3 -O0") add_compile_definitions(FLEX_VERSION="${FLEX_VERSION}") @@ -61,6 +62,20 @@ find_package(Boost REQUIRED COMPONENTS system filesystem # required by folly context program_options regex thread) +#find arrow---------------------------------------------------------------------- +include("cmake/FindArrow.cmake") +if (NOT ARROW_FOUND) + message(FATAL_ERROR "arrow not found, please install the arrow library") +else () + include_directories(SYSTEM ${ARROW_INCLUDE_DIRS}) + if (TARGET arrow_shared) + set(ARROW_SHARED_LIB arrow_shared) + endif() + if (TARGET arrow_static) + set(ARROW_STATIC_LIB arrow_static) + endif() +endif () + # Find Doxygen if (BUILD_DOC) find_package(Doxygen) diff --git a/flex/Dockerfile b/flex/Dockerfile index d047cab1e212..7d5874a60dd9 100644 --- a/flex/Dockerfile +++ b/flex/Dockerfile @@ -5,7 +5,7 @@ ARG CI=false SHELL ["/bin/bash", "-c"] -RUN apt update && apt -y install locales && locale-gen en_US.UTF-8 +RUN apt-get update && apt-get -y install locales && locale-gen en_US.UTF-8 ENV LANG en_US.UTF-8 ENV LANGUAGE en_US:en ENV LC_ALL en_US.UTF-8 @@ -15,29 +15,32 @@ ENV TZ=Asia/Shanghai RUN ln -snf /usr/share/zoneinfo/$TZ 
/etc/localtime && echo $TZ > /etc/timezone # install dependencies -RUN apt install -y \ - ninja-build ragel libhwloc-dev libnuma-dev libpciaccess-dev vim wget \ - git g++ libgoogle-glog-dev cmake libopenmpi-dev default-jdk libcrypto++-dev \ - libboost-all-dev libxml2-dev -RUN apt install -y xfslibs-dev libgnutls28-dev liblz4-dev maven openssl pkg-config \ - libsctp-dev gcc make python3 systemtap-sdt-dev libtool libyaml-cpp-dev \ - libc-ares-dev stow libfmt-dev diffutils valgrind doxygen python3-pip net-tools +RUN apt-get update && apt-get install -y vim wget \ + git g++ libgoogle-glog-dev cmake libopenmpi-dev default-jdk \ + libboost-all-dev +RUN apt-get install -y maven openssl \ + gcc make python3 libyaml-cpp-dev \ + libc-ares-dev doxygen python3-pip net-tools curl # install libgrape-lite -RUN cd /root && \ +RUN cd /tmp && \ git clone https://github.com/alibaba/libgrape-lite.git -b v0.3.2 --single-branch && cd libgrape-lite && \ - mkdir build && cd build && cmake .. && make -j && make install + mkdir build && cd build && cmake .. && make -j && make install && rm -rf /tmp/libgrape-lite -RUN cp /usr/local/lib/libgrape-lite.so /usr/lib/libgrape-lite.so - -RUN git clone https://github.com/alibaba/hiactor.git -b v0.1.1 --single-branch && cd hiactor && \ +RUN cd /tmp && git clone https://github.com/alibaba/hiactor.git -b v0.1.1 --single-branch && cd hiactor && \ git submodule update --init --recursive && ./seastar/seastar/install-dependencies.sh && mkdir build && cd build && \ cmake -DHiactor_DEMOS=OFF -DHiactor_TESTING=OFF -DHiactor_DPDK=OFF -DHiactor_CXX_DIALECT=gnu++17 -DSeastar_CXX_FLAGS="-DSEASTAR_DEFAULT_ALLOCATOR -mno-avx512" .. 
&& \ - make -j && make install + make -j && make install && rm -rf /tmp/hiactor #install protobuf RUN apt-get install -y protobuf-compiler libprotobuf-dev +#install arrow +RUN cd /tmp && apt-get install -y -V ca-certificates lsb-release wget && \ + curl -o apache-arrow-apt-source-latest.deb https://apache.jfrog.io/artifactory/arrow/$(lsb_release --id --short | tr 'A-Z' 'a-z')/apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb && \ + apt-get install -y ./apache-arrow-apt-source-latest.deb && \ + apt-get update && apt-get install -y libarrow-dev=6.0.1-1 + RUN apt-get install -y sudo # Add graphscope user with user id 1001 @@ -49,8 +52,9 @@ USER graphscope WORKDIR /home/graphscope RUN curl -sf -L https://static.rust-lang.org/rustup.sh | \ - sh -s -- -y --profile minimal && \ + sh -s -- -y --profile minimal --default-toolchain=1.70.0 && \ chmod +x "$HOME/.cargo/env" && \ echo "$source $HOME/.cargo/env" >> ~/.bashrc && \ source "$HOME/.cargo/env" && \ + echo "1.70.0" > rust-toolchain && \ bash -c "rustup component add rustfmt" diff --git a/flex/bin/load_plan_and_gen.sh b/flex/bin/load_plan_and_gen.sh index 354112fe9499..1a2179e22249 100755 --- a/flex/bin/load_plan_and_gen.sh +++ b/flex/bin/load_plan_and_gen.sh @@ -191,6 +191,7 @@ compile_hqps_so() { cur_dir=${work_dir} mkdir -p ${cur_dir} output_cc_path="${cur_dir}/${query_name}.cc" + dst_yaml_path="${output_dir}/${query_name}.yaml" if [[ $(uname) == "Linux" ]]; then output_so_path="${cur_dir}/lib${query_name}.so" dst_so_path="${output_dir}/lib${query_name}.so" @@ -294,6 +295,12 @@ compile_hqps_so() { echo "Copy failed, ${dst_so_path} not exists." exit 1 fi + # copy the generated yaml + cp ${output_yaml_path} ${output_dir} + if [ ! -f ${dst_yaml_path} ]; then + echo "Copy failed, ${dst_yaml_path} not exists." 
+ exit 1 + fi echo "Finish copying, output to ${dst_so_path}" } diff --git a/flex/bin/sync_server.cc b/flex/bin/sync_server.cc index 67b3d5c59f3d..e95376fcb5ef 100644 --- a/flex/bin/sync_server.cc +++ b/flex/bin/sync_server.cc @@ -120,11 +120,11 @@ std::tuple parse_from_server_config( << engine_type_str; } } - auto shard_num_node = engine_node["shared_num"]; + auto shard_num_node = engine_node["shard_num"]; if (shard_num_node) { shard_num = shard_num_node.as(); } else { - LOG(INFO) << "shared_num not found, use default value " + LOG(INFO) << "shard_num not found, use default value " << DEFAULT_SHARD_NUM; } auto host_node = engine_node["hosts"]; @@ -167,7 +167,9 @@ void load_plugins(const bpo::variables_map& vm) { } } -void init_codegen_proxy(const bpo::variables_map& vm) { +void init_codegen_proxy(const bpo::variables_map& vm, + const std::string& graph_schema_file, + const std::string& engine_config_file) { std::string codegen_dir = parse_codegen_dir(vm); std::string codegen_bin; std::string gie_home; @@ -181,25 +183,6 @@ void init_codegen_proxy(const bpo::variables_map& vm) { LOG(FATAL) << "codegen bin not exists: " << codegen_bin; } } - std::string ir_compiler_properties; - std::string compiler_graph_schema; - if (vm.count("ir-compiler-prop") == 0) { - LOG(FATAL) << "ir-compiler-prop is not specified"; - } else { - ir_compiler_properties = vm["ir-compiler-prop"].as(); - if (!std::filesystem::exists(ir_compiler_properties)) { - LOG(FATAL) << "ir-compiler-prop not exists: " << ir_compiler_properties; - } - } - if (vm.count("compiler-graph-schema") == 0) { - LOG(FATAL) << "compiler-graph-schema is not specified"; - } else { - compiler_graph_schema = vm["compiler-graph-schema"].as(); - if (!std::filesystem::exists(compiler_graph_schema)) { - LOG(FATAL) << "compiler-graph-schema not exists: " - << compiler_graph_schema; - } - } if (vm.count("gie-home") == 0) { LOG(FATAL) << "gie-home is not specified"; } else { @@ -208,9 +191,8 @@ void init_codegen_proxy(const 
bpo::variables_map& vm) { LOG(FATAL) << "gie-home not exists: " << gie_home; } } - server::CodegenProxy::get().Init(codegen_dir, codegen_bin, - ir_compiler_properties, - compiler_graph_schema, gie_home); + server::CodegenProxy::get().Init(codegen_dir, codegen_bin, graph_schema_file, + engine_config_file, gie_home); } } // namespace gs @@ -227,11 +209,7 @@ int main(int argc, char** argv) { "data-path,a", bpo::value(), "data directory path")( "bulk-load,l", bpo::value(), "bulk-load config file")( "plugin-dir,p", bpo::value(), "plugin directory path")( - "gie-home,h", bpo::value(), "path to gie home")( - "ir-compiler-prop,i", bpo::value(), - "ir compiler property file")("compiler-graph-schema,z", - bpo::value(), - "compiler graph schema file"); + "gie-home,h", bpo::value(), "path to gie home"); setenv("TZ", "Asia/Shanghai", 1); tzset(); @@ -251,9 +229,10 @@ int main(int argc, char** argv) { std::string data_path; std::string bulk_load_config_path; std::string plugin_dir; + std::string server_config_path; if (vm.count("server-config") != 0) { - std::string server_config_path = vm["server-config"].as(); + server_config_path = vm["server-config"].as(); // check file exists if (!std::filesystem::exists(server_config_path)) { LOG(ERROR) << "server-config not exists: " << server_config_path; @@ -295,7 +274,7 @@ int main(int argc, char** argv) { // loading plugin gs::load_plugins(vm); - gs::init_codegen_proxy(vm); + gs::init_codegen_proxy(vm, graph_schema_path, server_config_path); server::HQPSService::get().init(shard_num, http_port, false); server::HQPSService::get().run_and_wait_for_exit(); diff --git a/flex/cmake/FindArrow.cmake b/flex/cmake/FindArrow.cmake new file mode 100644 index 000000000000..b26ca9adf752 --- /dev/null +++ b/flex/cmake/FindArrow.cmake @@ -0,0 +1,438 @@ +# The file cmake/FindArrow.cmake is referered from project +# https://github.com/apache/arrow +# +# https://github.com/apache/arrow/blob/master/cpp/cmake_modules/FindArrow.cmake +# +# which has the 
following license: +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# - Find Arrow (arrow/api.h, libarrow.a, libarrow.so) +# This module defines +# ARROW_FOUND, whether Arrow has been found +# ARROW_FULL_SO_VERSION, full shared object version of found Arrow "100.0.0" +# ARROW_IMPORT_LIB, path to libarrow's import library (Windows only) +# ARROW_INCLUDE_DIR, directory containing headers +# ARROW_LIBS, deprecated. Use ARROW_LIB_DIR instead +# ARROW_LIB_DIR, directory containing Arrow libraries +# ARROW_SHARED_IMP_LIB, deprecated. 
Use ARROW_IMPORT_LIB instead +# ARROW_SHARED_LIB, path to libarrow's shared library +# ARROW_SO_VERSION, shared object version of found Arrow such as "100" +# ARROW_STATIC_LIB, path to libarrow.a +# ARROW_VERSION, version of found Arrow +# ARROW_VERSION_MAJOR, major version of found Arrow +# ARROW_VERSION_MINOR, minor version of found Arrow +# ARROW_VERSION_PATCH, patch version of found Arrow + +if(DEFINED ARROW_FOUND) + return() +endif() + +include(FindPkgConfig) +include(FindPackageHandleStandardArgs) + +set(ARROW_SEARCH_LIB_PATH_SUFFIXES) +if(CMAKE_LIBRARY_ARCHITECTURE) + list(APPEND ARROW_SEARCH_LIB_PATH_SUFFIXES "lib/${CMAKE_LIBRARY_ARCHITECTURE}") +endif() +list(APPEND ARROW_SEARCH_LIB_PATH_SUFFIXES + "lib64" + "lib32" + "lib" + "bin") +set(ARROW_CONFIG_SUFFIXES + "_RELEASE" + "_RELWITHDEBINFO" + "_MINSIZEREL" + "_DEBUG" + "") +if(CMAKE_BUILD_TYPE) + string(TOUPPER ${CMAKE_BUILD_TYPE} ARROW_CONFIG_SUFFIX_PREFERRED) + set(ARROW_CONFIG_SUFFIX_PREFERRED "_${ARROW_CONFIG_SUFFIX_PREFERRED}") + list(INSERT ARROW_CONFIG_SUFFIXES 0 "${ARROW_CONFIG_SUFFIX_PREFERRED}") +endif() + +if(NOT DEFINED ARROW_MSVC_STATIC_LIB_SUFFIX) + if(MSVC) + set(ARROW_MSVC_STATIC_LIB_SUFFIX "_static") + else() + set(ARROW_MSVC_STATIC_LIB_SUFFIX "") + endif() +endif() + +# Internal function. +# +# Set shared library name for ${base_name} to ${output_variable}. +# +# Example: +# arrow_build_shared_library_name(ARROW_SHARED_LIBRARY_NAME arrow) +# # -> ARROW_SHARED_LIBRARY_NAME=libarrow.so on Linux +# # -> ARROW_SHARED_LIBRARY_NAME=libarrow.dylib on macOS +# # -> ARROW_SHARED_LIBRARY_NAME=arrow.dll with MSVC on Windows +# # -> ARROW_SHARED_LIBRARY_NAME=libarrow.dll with MinGW on Windows +function(arrow_build_shared_library_name output_variable base_name) + set(${output_variable} + "${CMAKE_SHARED_LIBRARY_PREFIX}${base_name}${CMAKE_SHARED_LIBRARY_SUFFIX}" + PARENT_SCOPE) +endfunction() + +# Internal function. +# +# Set import library name for ${base_name} to ${output_variable}. 
+# This is useful only for MSVC build. Import library is used only +# with MSVC build. +# +# Example: +# arrow_build_import_library_name(ARROW_IMPORT_LIBRARY_NAME arrow) +# # -> ARROW_IMPORT_LIBRARY_NAME=arrow on Linux (meaningless) +# # -> ARROW_IMPORT_LIBRARY_NAME=arrow on macOS (meaningless) +# # -> ARROW_IMPORT_LIBRARY_NAME=arrow.lib with MSVC on Windows +# # -> ARROW_IMPORT_LIBRARY_NAME=libarrow.dll.a with MinGW on Windows +function(arrow_build_import_library_name output_variable base_name) + set(${output_variable} + "${CMAKE_IMPORT_LIBRARY_PREFIX}${base_name}${CMAKE_IMPORT_LIBRARY_SUFFIX}" + PARENT_SCOPE) +endfunction() + +# Internal function. +# +# Set static library name for ${base_name} to ${output_variable}. +# +# Example: +# arrow_build_static_library_name(ARROW_STATIC_LIBRARY_NAME arrow) +# # -> ARROW_STATIC_LIBRARY_NAME=libarrow.a on Linux +# # -> ARROW_STATIC_LIBRARY_NAME=libarrow.a on macOS +# # -> ARROW_STATIC_LIBRARY_NAME=arrow.lib with MSVC on Windows +# # -> ARROW_STATIC_LIBRARY_NAME=libarrow.dll.a with MinGW on Windows +function(arrow_build_static_library_name output_variable base_name) + set( + ${output_variable} + "${CMAKE_STATIC_LIBRARY_PREFIX}${base_name}${ARROW_MSVC_STATIC_LIB_SUFFIX}${CMAKE_STATIC_LIBRARY_SUFFIX}" + PARENT_SCOPE) +endfunction() + +# Internal function. +# +# Set macro value for ${macro_name} in ${header_content} to ${output_variable}. +# +# Example: +# arrow_extract_macro_value(version_major +# "ARROW_VERSION_MAJOR" +# "#define ARROW_VERSION_MAJOR 1.0.0") +# # -> version_major=1.0.0 +function(arrow_extract_macro_value output_variable macro_name header_content) + string(REGEX MATCH "#define +${macro_name} +[^\r\n]+" macro_definition + "${header_content}") + string(REGEX + REPLACE "^#define +${macro_name} +(.+)$" "\\1" macro_value "${macro_definition}") + set(${output_variable} "${macro_value}" PARENT_SCOPE) +endfunction() + +# Internal macro only for arrow_find_package. +# +# Find package in HOME. 
+macro(arrow_find_package_home) + find_path(${prefix}_include_dir "${header_path}" + PATHS "${home}" + PATH_SUFFIXES "include" + NO_DEFAULT_PATH) + set(include_dir "${${prefix}_include_dir}") + set(${prefix}_INCLUDE_DIR "${include_dir}" PARENT_SCOPE) + + if(MSVC) + set(CMAKE_SHARED_LIBRARY_SUFFIXES_ORIGINAL ${CMAKE_FIND_LIBRARY_SUFFIXES}) + # .dll isn't found by find_library with MSVC because .dll isn't included in + # CMAKE_FIND_LIBRARY_SUFFIXES. + list(APPEND CMAKE_FIND_LIBRARY_SUFFIXES "${CMAKE_SHARED_LIBRARY_SUFFIX}") + endif() + find_library(${prefix}_shared_lib + NAMES "${shared_lib_name}" + PATHS "${home}" + PATH_SUFFIXES ${ARROW_SEARCH_LIB_PATH_SUFFIXES} + NO_DEFAULT_PATH) + if(MSVC) + set(CMAKE_SHARED_LIBRARY_SUFFIXES ${CMAKE_FIND_LIBRARY_SUFFIXES_ORIGINAL}) + endif() + set(shared_lib "${${prefix}_shared_lib}") + set(${prefix}_SHARED_LIB "${shared_lib}" PARENT_SCOPE) + if(shared_lib) + add_library(${target_shared} SHARED IMPORTED) + set_target_properties(${target_shared} PROPERTIES IMPORTED_LOCATION "${shared_lib}") + if(include_dir) + set_target_properties(${target_shared} + PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${include_dir}") + endif() + find_library(${prefix}_import_lib + NAMES "${import_lib_name}" + PATHS "${home}" + PATH_SUFFIXES ${ARROW_SEARCH_LIB_PATH_SUFFIXES} + NO_DEFAULT_PATH) + set(import_lib "${${prefix}_import_lib}") + set(${prefix}_IMPORT_LIB "${import_lib}" PARENT_SCOPE) + if(import_lib) + set_target_properties(${target_shared} PROPERTIES IMPORTED_IMPLIB "${import_lib}") + endif() + endif() + + find_library(${prefix}_static_lib + NAMES "${static_lib_name}" + PATHS "${home}" + PATH_SUFFIXES ${ARROW_SEARCH_LIB_PATH_SUFFIXES} + NO_DEFAULT_PATH) + set(static_lib "${${prefix}_static_lib}") + set(${prefix}_STATIC_LIB "${static_lib}" PARENT_SCOPE) + if(static_lib) + add_library(${target_static} STATIC IMPORTED) + set_target_properties(${target_static} PROPERTIES IMPORTED_LOCATION "${static_lib}") + if(include_dir) + 
set_target_properties(${target_static} + PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${include_dir}") + endif() + endif() +endmacro() + +# Internal macro only for arrow_find_package. +# +# Find package by CMake package configuration. +macro(arrow_find_package_cmake_package_configuration) + find_package(${cmake_package_name} CONFIG) + if(${cmake_package_name}_FOUND) + set(${prefix}_USE_CMAKE_PACKAGE_CONFIG TRUE PARENT_SCOPE) + if(TARGET ${target_shared}) + foreach(suffix ${ARROW_CONFIG_SUFFIXES}) + get_target_property(shared_lib ${target_shared} IMPORTED_LOCATION${suffix}) + if(shared_lib) + # Remove shared library version: + # libarrow.so.100.0.0 -> libarrow.so + # Because ARROW_HOME and pkg-config approaches don't add + # shared library version. + string(REGEX + REPLACE "(${CMAKE_SHARED_LIBRARY_SUFFIX})[.0-9]+$" "\\1" shared_lib + "${shared_lib}") + set(${prefix}_SHARED_LIB "${shared_lib}" PARENT_SCOPE) + break() + endif() + endforeach() + endif() + if(TARGET ${target_static}) + foreach(suffix ${ARROW_CONFIG_SUFFIXES}) + get_target_property(static_lib ${target_static} IMPORTED_LOCATION${suffix}) + if(static_lib) + set(${prefix}_STATIC_LIB "${static_lib}" PARENT_SCOPE) + break() + endif() + endforeach() + endif() + endif() +endmacro() + +# Internal macro only for arrow_find_package. +# +# Find package by pkg-config. +macro(arrow_find_package_pkg_config) + pkg_check_modules(${prefix}_PC ${pkg_config_name}) + if(${prefix}_PC_FOUND) + set(${prefix}_USE_PKG_CONFIG TRUE PARENT_SCOPE) + + set(include_dir "${${prefix}_PC_INCLUDEDIR}") + set(lib_dir "${${prefix}_PC_LIBDIR}") + set(shared_lib_paths "${${prefix}_PC_LINK_LIBRARIES}") + # Use the first shared library path as the IMPORTED_LOCATION + # for ${target_shared}. This assumes that the first shared library + # path is the shared library path for this module. + list(GET shared_lib_paths 0 first_shared_lib_path) + # Use the rest shared library paths as the INTERFACE_LINK_LIBRARIES + # for ${target_shared}. 
This assumes that the rest shared library + # paths are dependency library paths for this module. + list(LENGTH shared_lib_paths n_shared_lib_paths) + if(n_shared_lib_paths LESS_EQUAL 1) + set(rest_shared_lib_paths) + else() + list(SUBLIST + shared_lib_paths + 1 + -1 + rest_shared_lib_paths) + endif() + + set(${prefix}_VERSION "${${prefix}_PC_VERSION}" PARENT_SCOPE) + set(${prefix}_INCLUDE_DIR "${include_dir}" PARENT_SCOPE) + set(${prefix}_SHARED_LIB "${first_shared_lib_path}" PARENT_SCOPE) + + add_library(${target_shared} SHARED IMPORTED) + set_target_properties(${target_shared} + PROPERTIES INTERFACE_INCLUDE_DIRECTORIES + "${include_dir}" + INTERFACE_LINK_LIBRARIES + "${rest_shared_lib_paths}" + IMPORTED_LOCATION + "${first_shared_lib_path}") + get_target_property(shared_lib ${target_shared} IMPORTED_LOCATION) + + find_library(${prefix}_static_lib + NAMES "${static_lib_name}" + PATHS "${lib_dir}" + NO_DEFAULT_PATH) + set(static_lib "${${prefix}_static_lib}") + set(${prefix}_STATIC_LIB "${static_lib}" PARENT_SCOPE) + if(static_lib) + add_library(${target_static} STATIC IMPORTED) + set_target_properties(${target_static} + PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${include_dir}" + IMPORTED_LOCATION "${static_lib}") + endif() + endif() +endmacro() + +function(arrow_find_package + prefix + home + base_name + header_path + cmake_package_name + pkg_config_name) + arrow_build_shared_library_name(shared_lib_name ${base_name}) + arrow_build_import_library_name(import_lib_name ${base_name}) + arrow_build_static_library_name(static_lib_name ${base_name}) + + set(target_shared ${base_name}_shared) + set(target_static ${base_name}_static) + + if(home) + arrow_find_package_home() + set(${prefix}_FIND_APPROACH "HOME: ${home}" PARENT_SCOPE) + else() + arrow_find_package_cmake_package_configuration() + if(${cmake_package_name}_FOUND) + set(${prefix}_FIND_APPROACH + "CMake package configuration: ${cmake_package_name}" + PARENT_SCOPE) + else() + arrow_find_package_pkg_config() + 
set(${prefix}_FIND_APPROACH "pkg-config: ${pkg_config_name}" PARENT_SCOPE) + endif() + endif() + + if(NOT include_dir) + if(TARGET ${target_shared}) + get_target_property(include_dir ${target_shared} INTERFACE_INCLUDE_DIRECTORIES) + elseif(TARGET ${target_static}) + get_target_property(include_dir ${target_static} INTERFACE_INCLUDE_DIRECTORIES) + endif() + endif() + if(include_dir) + set(${prefix}_INCLUDE_DIR "${include_dir}" PARENT_SCOPE) + endif() + + if(shared_lib) + get_filename_component(lib_dir "${shared_lib}" DIRECTORY) + elseif(static_lib) + get_filename_component(lib_dir "${static_lib}" DIRECTORY) + else() + set(lib_dir NOTFOUND) + endif() + set(${prefix}_LIB_DIR "${lib_dir}" PARENT_SCOPE) + # For backward compatibility + set(${prefix}_LIBS "${lib_dir}" PARENT_SCOPE) +endfunction() + +if(NOT "$ENV{ARROW_HOME}" STREQUAL "") + file(TO_CMAKE_PATH "$ENV{ARROW_HOME}" ARROW_HOME) +endif() +arrow_find_package(ARROW + "${ARROW_HOME}" + arrow + arrow/api.h + Arrow + arrow) + +if(ARROW_HOME) + if(ARROW_INCLUDE_DIR) + file(READ "${ARROW_INCLUDE_DIR}/arrow/util/config.h" ARROW_CONFIG_H_CONTENT) + arrow_extract_macro_value(ARROW_VERSION_MAJOR "ARROW_VERSION_MAJOR" + "${ARROW_CONFIG_H_CONTENT}") + arrow_extract_macro_value(ARROW_VERSION_MINOR "ARROW_VERSION_MINOR" + "${ARROW_CONFIG_H_CONTENT}") + arrow_extract_macro_value(ARROW_VERSION_PATCH "ARROW_VERSION_PATCH" + "${ARROW_CONFIG_H_CONTENT}") + if("${ARROW_VERSION_MAJOR}" STREQUAL "" + OR "${ARROW_VERSION_MINOR}" STREQUAL "" + OR "${ARROW_VERSION_PATCH}" STREQUAL "") + set(ARROW_VERSION "0.0.0") + else() + set(ARROW_VERSION + "${ARROW_VERSION_MAJOR}.${ARROW_VERSION_MINOR}.${ARROW_VERSION_PATCH}") + endif() + + arrow_extract_macro_value(ARROW_SO_VERSION_QUOTED "ARROW_SO_VERSION" + "${ARROW_CONFIG_H_CONTENT}") + string(REGEX REPLACE "^\"(.+)\"$" "\\1" ARROW_SO_VERSION "${ARROW_SO_VERSION_QUOTED}") + arrow_extract_macro_value(ARROW_FULL_SO_VERSION_QUOTED "ARROW_FULL_SO_VERSION" + "${ARROW_CONFIG_H_CONTENT}") + 
string(REGEX + REPLACE "^\"(.+)\"$" "\\1" ARROW_FULL_SO_VERSION + "${ARROW_FULL_SO_VERSION_QUOTED}") + endif() +else() + if(ARROW_USE_CMAKE_PACKAGE_CONFIG) + find_package(Arrow CONFIG) + elseif(ARROW_USE_PKG_CONFIG) + pkg_get_variable(ARROW_SO_VERSION arrow so_version) + pkg_get_variable(ARROW_FULL_SO_VERSION arrow full_so_version) + endif() +endif() + +set(ARROW_ABI_VERSION ${ARROW_SO_VERSION}) + +mark_as_advanced(ARROW_ABI_VERSION + ARROW_CONFIG_SUFFIXES + ARROW_FULL_SO_VERSION + ARROW_IMPORT_LIB + ARROW_INCLUDE_DIR + ARROW_LIBS + ARROW_LIB_DIR + ARROW_SEARCH_LIB_PATH_SUFFIXES + ARROW_SHARED_IMP_LIB + ARROW_SHARED_LIB + ARROW_SO_VERSION + ARROW_STATIC_LIB + ARROW_VERSION + ARROW_VERSION_MAJOR + ARROW_VERSION_MINOR + ARROW_VERSION_PATCH) + +find_package_handle_standard_args(Arrow REQUIRED_VARS + # The first required variable is shown + # in the found message. So this list is + # not sorted alphabetically. + ARROW_INCLUDE_DIR + ARROW_LIB_DIR + ARROW_FULL_SO_VERSION + ARROW_SO_VERSION + VERSION_VAR + ARROW_VERSION) +set(ARROW_FOUND ${Arrow_FOUND}) + +if(Arrow_FOUND AND NOT Arrow_FIND_QUIETLY) + message(STATUS "Arrow version: ${ARROW_VERSION} (${ARROW_FIND_APPROACH})") + message(STATUS "Arrow SO and ABI version: ${ARROW_SO_VERSION}") + message(STATUS "Arrow full SO version: ${ARROW_FULL_SO_VERSION}") + message(STATUS "Found the Arrow core shared library: ${ARROW_SHARED_LIB}") + message(STATUS "Found the Arrow core import library: ${ARROW_IMPORT_LIB}") + message(STATUS "Found the Arrow core static library: ${ARROW_STATIC_LIB}") +endif() diff --git a/flex/engines/graph_db/database/graph_db.cc b/flex/engines/graph_db/database/graph_db.cc index dd80d49f9ab0..0d0ce3681538 100644 --- a/flex/engines/graph_db/database/graph_db.cc +++ b/flex/engines/graph_db/database/graph_db.cc @@ -62,13 +62,17 @@ void GraphDB::Init(const Schema& schema, const LoadingConfig& load_config, LOG(INFO) << "Initializing graph db through bulk loading"; { MutablePropertyFragment graph; - 
graph.Init(schema, load_config, thread_num); + auto loader = LoaderFactory::CreateFragmentLoader(schema, load_config, + thread_num); + loader->LoadFragment(graph); graph.Serialize(data_dir_path.string()); } graph_.Deserialize(data_dir_path.string()); } else { LOG(INFO) << "Initializing empty graph db"; - graph_.Init(schema, load_config, thread_num); + auto loader = + LoaderFactory::CreateFragmentLoader(schema, load_config, thread_num); + loader->LoadFragment(graph_); graph_.Serialize(data_dir_path.string()); } } else { diff --git a/flex/engines/graph_db/database/graph_db.h b/flex/engines/graph_db/database/graph_db.h index 2029f5299208..a85bae4d99e3 100644 --- a/flex/engines/graph_db/database/graph_db.h +++ b/flex/engines/graph_db/database/graph_db.h @@ -30,6 +30,7 @@ #include "flex/engines/graph_db/database/single_vertex_insert_transaction.h" #include "flex/engines/graph_db/database/update_transaction.h" #include "flex/engines/graph_db/database/version_manager.h" +#include "flex/storages/rt_mutable_graph/loader/loader_factory.h" #include "flex/storages/rt_mutable_graph/loading_config.h" #include "flex/storages/rt_mutable_graph/mutable_property_fragment.h" diff --git a/flex/engines/graph_db/grin/CMakeLists.txt b/flex/engines/graph_db/grin/CMakeLists.txt index 3a066292b992..12b016724d28 100644 --- a/flex/engines/graph_db/grin/CMakeLists.txt +++ b/flex/engines/graph_db/grin/CMakeLists.txt @@ -33,6 +33,20 @@ if (GLOG_FOUND) set(CMAKE_REQUIRED_LIBRARIES "${CMAKE_REQUIRED_LIBRARIES} ${GLOG_LIBRARIES}") endif () +#find arrow---------------------------------------------------------------------- +include("../../../../flex/cmake/FindArrow.cmake") +if (NOT ARROW_FOUND) + message(FATAL_ERROR "arrow not found, please install the arrow library") +else () + include_directories(SYSTEM ${ARROW_INCLUDE_DIRS}) + if (TARGET arrow_shared) + set(ARROW_SHARED_LIB arrow_shared) + endif() + if (TARGET arrow_static) + set(ARROW_STATIC_LIB arrow_static) + endif() +endif () + 
find_package(yaml-cpp REQUIRED) include_directories(SYSTEM ${yaml-cpp_INCLUDE_DIRS}) @@ -57,13 +71,18 @@ add_custom_target(grin_clformat COMMENT "Running clang-format." VERBATIM) -file(GLOB SOURCES "src/*.cc" "src/topology/*.cc" "src/property/*.cc" "src/index/*.cc" "src/common/*.cc" "../../../utils/property/*.cc" "../../../storages/rt_mutable_graph/*.cc") +file(GLOB SOURCES "src/*.cc" "src/topology/*.cc" "src/property/*.cc" "src/index/*.cc" "src/common/*.cc" "../../../utils/property/*.cc" "../../../utils/*.cc" + "../../../storages/rt_mutable_graph/*.cc" "../../../storages/rt_mutable_graph/loader/*.cc") add_library(flex_grin SHARED ${SOURCES}) target_link_libraries(flex_grin ${LIBGRAPELITE_LIBRARIES} ${GFLAGS_LIBRARIES} ${CMAKE_DL_LIBS} ${YAML_CPP_LIBRARIES}) - +if (ARROW_SHARED_LIB) + target_link_libraries(flex_grin ${ARROW_SHARED_LIB}) +else() + target_link_libraries(flex_grin ${ARROW_STATIC_LIB}) +endif() add_executable(run_grin_test test/test.c) target_include_directories(run_grin_test PRIVATE ${LIBGRAPELITE_INCLUDE_DIRS}/grape/analytical_apps fragment) target_link_libraries(run_grin_test flex_grin ${LIBGRAPELITE_LIBRARIES} ${GFLAGS_LIBRARIES} ${CMAKE_DL_LIBS}) -target_link_libraries(run_grin_test) + diff --git a/flex/engines/graph_db/grin/src/predefine.h b/flex/engines/graph_db/grin/src/predefine.h index 4eb4020bf864..ad6df6d85555 100644 --- a/flex/engines/graph_db/grin/src/predefine.h +++ b/flex/engines/graph_db/grin/src/predefine.h @@ -1,6 +1,7 @@ #include #include "grin/predefine.h" +#include "storages/rt_mutable_graph/loader/loader_factory.h" #include "storages/rt_mutable_graph/loading_config.h" #include "storages/rt_mutable_graph/mutable_property_fragment.h" diff --git a/flex/engines/graph_db/grin/src/topology/structure.cc b/flex/engines/graph_db/grin/src/topology/structure.cc index f79b3cab40bf..ba72fd91b75a 100644 --- a/flex/engines/graph_db/grin/src/topology/structure.cc +++ b/flex/engines/graph_db/grin/src/topology/structure.cc @@ -24,7 +24,8 @@ 
limitations under the License. * flex://{path_to_yaml} * @return A graph handle. */ -GRIN_GRAPH grin_get_graph_from_storage(const char* uri) { +GRIN_GRAPH grin_get_graph_from_storage(const char* uri, const char* schema_file, + const char* bulk_load_file) { std::string _uri(uri); std::string::size_type pos = _uri.find("://"); if (pos == std::string::npos) { @@ -35,9 +36,9 @@ GRIN_GRAPH grin_get_graph_from_storage(const char* uri) { return GRIN_NULL_GRAPH; } _uri = _uri.substr(pos + 3); - std::string graph_schema_path = _uri + "/modern_graph.yaml"; + std::string graph_schema_path = schema_file; std::string data_path = uri; - std::string bulk_load_config_path = _uri + "/bulk_load.yaml"; + std::string bulk_load_config_path = bulk_load_file; if (!std::filesystem::exists(graph_schema_path) || !(std::filesystem::exists(bulk_load_config_path))) { return GRIN_NULL_GRAPH; @@ -47,7 +48,9 @@ GRIN_GRAPH grin_get_graph_from_storage(const char* uri) { gs::LoadingConfig::ParseFromYaml(schema, bulk_load_config_path); GRIN_GRAPH_T* g = new GRIN_GRAPH_T(); - g->g.Init(schema, loading_config); + auto loader = + gs::LoaderFactory::CreateFragmentLoader(schema, loading_config, 1); + loader->LoadFragment(g->g); init_cache(g); return g; } diff --git a/flex/engines/graph_db/grin/test/test.c b/flex/engines/graph_db/grin/test/test.c index 4612d041885b..19c95e43b701 100644 --- a/flex/engines/graph_db/grin/test/test.c +++ b/flex/engines/graph_db/grin/test/test.c @@ -1,8 +1,8 @@ #include +#include #include #include #include -#include #include "grin/predefine.h" @@ -126,7 +126,8 @@ const char* v_names[][4] = {{"josh", "vadas", "peter", "marko"}, {"lop", "ripple", "wrong", "wrong"}}; // TODO align with order in local graph -GRIN_GRAPH get_graph(const char* uri_str, int p) { +GRIN_GRAPH get_graph(const char* uri_str, const char* schema_file, + const char* bulk_load_file, int p) { #ifdef GRIN_ENABLE_GRAPH_PARTITION GRIN_PARTITIONED_GRAPH pg = grin_get_partitioned_graph_from_storage(argv[1]); 
GRIN_PARTITION_LIST local_partitions = grin_get_local_partition_list(pg); @@ -144,7 +145,7 @@ GRIN_GRAPH get_graph(const char* uri_str, int p) { grin_destroy_partition_list(pg, local_partitions); grin_destroy_partitioned_graph(pg); #else - GRIN_GRAPH g = grin_get_graph_from_storage(uri_str); + GRIN_GRAPH g = grin_get_graph_from_storage(uri_str, schema_file, bulk_load_file); #endif return g; } @@ -181,10 +182,11 @@ GRIN_VERTEX get_one_person(GRIN_GRAPH g) { return v; } -void test_property_type(const char* uri_str) { +void test_property_type(const char* uri_str, const char* schema_file, + const char* bulk_load_file) { printf("+++++++++++++++++++++ Test property/type +++++++++++++++++++++\n"); - GRIN_GRAPH g = get_graph(uri_str, 0); + GRIN_GRAPH g = get_graph(uri_str, schema_file,bulk_load_file, 0); printf("------------ Vertex Type ------------\n"); GRIN_VERTEX_TYPE_LIST vtl = grin_get_vertex_type_list(g); @@ -334,9 +336,11 @@ void test_property_type(const char* uri_str) { grin_destroy_graph(g); } -void test_property_vertex_property_value(const char* uri_str) { +void test_property_vertex_property_value(const char* uri_str, + const char* schema_file, + const char* bulk_load_file) { printf("------------ Test Vertex property value ------------\n"); - GRIN_GRAPH g = get_graph(uri_str, 0); + GRIN_GRAPH g = get_graph(uri_str, schema_file,bulk_load_file, 0); // value check printf("------ check value ------\n"); @@ -469,9 +473,11 @@ void test_property_vertex_property_value(const char* uri_str) { } void test_property_edge_property_value(const char* uri_str, + const char* schema_file, + const char* bulk_load_file, GRIN_DIRECTION dir) { printf("------------ Test Edge property value ------------\n"); - GRIN_GRAPH g = get_graph(uri_str, 0); + GRIN_GRAPH g = get_graph(uri_str, schema_file,bulk_load_file, 0); // value check printf("------ check value ------\n"); @@ -626,11 +632,11 @@ void test_property_edge_property_value(const char* uri_str, } #ifdef GRIN_ENABLE_VERTEX_PRIMARY_KEYS 
-void test_property_primary_key(const char* uri_str) { +void test_property_primary_key(const char* uri_str, const char*schema_file, const char* bulk_load_file) { printf( "+++++++++++++++++++++ Test property/primary key " "+++++++++++++++++++++\n"); - GRIN_GRAPH g = get_graph(uri_str, 0); + GRIN_GRAPH g = get_graph(uri_str, schema_file,bulk_load_file, 0); GRIN_VERTEX_TYPE_LIST vtl = grin_get_vertex_types_with_primary_keys(g); size_t vtl_size = grin_get_vertex_type_list_size(g, vtl); printf("vertex type num with primary key: %zu\n", vtl_size); @@ -683,9 +689,9 @@ void test_property_primary_key(const char* uri_str) { } #endif -void test_error_code(const char* uri_str) { +void test_error_code(const char* uri_str,const char* schema_file, const char* bulk_load_file) { printf("+++++++++++++++++++++ Test error code +++++++++++++++++++++\n"); - GRIN_GRAPH g = get_graph(uri_str, 0); + GRIN_GRAPH g = get_graph(uri_str, schema_file,bulk_load_file, 0); GRIN_VERTEX_TYPE vt1 = grin_get_vertex_type_by_name(g, "person"); GRIN_VERTEX_TYPE vt2 = grin_get_vertex_type_by_name(g, "software"); @@ -700,24 +706,25 @@ void test_error_code(const char* uri_str) { assert(grin_get_last_error_code() == INVALID_VALUE); } -void test_property(const char* uri_str) { - test_property_type(uri_str); - test_property_vertex_property_value(uri_str); - test_property_edge_property_value(uri_str, OUT); - test_property_edge_property_value(uri_str, IN); +void test_property(const char* uri_str, const char* schema_file, + const char* bulk_load_file) { + test_property_type(uri_str, schema_file, bulk_load_file); + test_property_vertex_property_value(uri_str, schema_file, bulk_load_file); + test_property_edge_property_value(uri_str, schema_file, bulk_load_file, OUT); + test_property_edge_property_value(uri_str, schema_file, bulk_load_file, IN); #ifdef GRIN_ENABLE_VERTEX_PRIMARY_KEYS - test_property_primary_key(uri_str); + test_property_primary_key(uri_str,schema_file, bulk_load_file); #endif #ifdef 
GRIN_WITH_VERTEX_PROPERTY_NAME - test_error_code(uri_str); + test_error_code(uri_str,schema_file, bulk_load_file); #endif } /** void test_partition_reference(const char* uri_str) { printf("+++++++++++++++++++++ Test partition/reference -+++++++++++++++++++++\n"); ++++++++++++++++++++++\n"); GRIN_PARTITIONED_GRAPH pg = -grin_get_partitioned_graph_from_storage(argv[1]); +grin_get_partitioned_graph_from_storage(argv[1]); GRIN_PARTITION_LIST local_partitions = grin_get_local_partition_list(pg); assert(grin_get_partition_list_size(pg, local_partitions) >= 2); @@ -827,10 +834,11 @@ void test_partition(const char* uri_str) { #endif } */ -void test_topology_structure(const char* uri_str) { +void test_topology_structure(const char* uri_str, const char* schema_file, + const char* bulk_load_file) { printf( "+++++++++++++++++++++ Test topology/structure +++++++++++++++++++++\n"); - GRIN_GRAPH g = get_graph(uri_str, 0); + GRIN_GRAPH g = get_graph(uri_str, schema_file,bulk_load_file, 0); #ifndef GRIN_WITH_VERTEX_PROPERTY printf("vertex num: %zu\n", grin_get_vertex_num(g)); #endif @@ -841,11 +849,12 @@ void test_topology_structure(const char* uri_str) { grin_destroy_graph(g); } -void test_topology_vertex_list(const char* uri_str) { +void test_topology_vertex_list(const char* uri_str, const char* schema_file, + const char* bulk_load_file) { printf( "+++++++++++++++++++++ Test topology/vertex_list " "+++++++++++++++++++++\n"); - GRIN_GRAPH g = get_graph(uri_str, 0); + GRIN_GRAPH g = get_graph(uri_str, schema_file,bulk_load_file, 0); FOR_VERTEX_LIST_BEGIN(g, vl) FOR_VERTEX_BEGIN(g, vl, v) @@ -860,7 +869,8 @@ void test_topology_vertex_list(const char* uri_str) { grin_destroy_graph(g); } -void test_topology_adjacent_list(const char* uri_str, GRIN_DIRECTION dir) { +void test_topology_adjacent_list(const char* uri_str, const char* schema_file, + const char* bulk_load_file, GRIN_DIRECTION dir) { if (dir == IN) { printf( "+++++++++++++++++++++ Test topology/adjacent_list IN " @@ -871,7 
+881,7 @@ void test_topology_adjacent_list(const char* uri_str, GRIN_DIRECTION dir) { "+++++++++++++++++++++\n"); } - GRIN_GRAPH g = get_graph(uri_str, 0); + GRIN_GRAPH g = get_graph(uri_str, schema_file,bulk_load_file, 0); FOR_VERTEX_LIST_BEGIN(g, vl) FOR_VERTEX_BEGIN(g, vl, v) @@ -938,18 +948,20 @@ void test_topology_adjacent_list(const char* uri_str, GRIN_DIRECTION dir) { grin_destroy_graph(g); } -void test_topology(const char* uri_str) { - test_topology_structure(uri_str); - test_topology_vertex_list(uri_str); - test_topology_adjacent_list(uri_str, OUT); - test_topology_adjacent_list(uri_str, IN); +void test_topology(const char* uri_str, const char* schema_file, + const char* bulk_load_file) { + test_topology_structure(uri_str, schema_file, bulk_load_file); + test_topology_vertex_list(uri_str, schema_file, bulk_load_file); + test_topology_adjacent_list(uri_str, schema_file, bulk_load_file, OUT); + test_topology_adjacent_list(uri_str, schema_file, bulk_load_file, IN); } #if defined(GRIN_ASSUME_ALL_VERTEX_LIST_SORTED) && \ defined(GRIN_ENABLE_VERTEX_LIST_ARRAY) -void test_index_order(const char* uri_str) { +void test_index_order(const char* uri_str, const char* schema_file, + const char* bulk_load_file) { printf("+++++++++++++++++++++ Test index order +++++++++++++++++++++\n"); - GRIN_GRAPH g = get_graph(uri_str, 0); + GRIN_GRAPH g = get_graph(uri_str, schema_file,bulk_load_file, 0); FOR_VERTEX_LIST_BEGIN(g, vl) FOR_VERTEX_BEGIN(g, vl, v) @@ -996,10 +1008,11 @@ void test_index_order(const char* uri_str) { } #endif -void test_index_internal_id(const char* uri_str) { +void test_index_internal_id(const char* uri_str, const char* schema_file, + const char* bulk_load_file) { printf( "+++++++++++++++++++++ Test index internal id +++++++++++++++++++++\n"); - GRIN_GRAPH g = get_graph(uri_str, 0); + GRIN_GRAPH g = get_graph(uri_str, schema_file,bulk_load_file, 0); FOR_VERTEX_LIST_BEGIN(g, vl) long long int min = grin_get_vertex_internal_id_lower_bound_by_type(g, __vt); @@ 
-1018,18 +1031,19 @@ void test_index_internal_id(const char* uri_str) { grin_destroy_graph(g); } -void test_index(const char* uri_str) { +void test_index(const char* uri_str, const char* schema_file, + const char* bulk_load_file) { #if defined(GRIN_ASSUME_ALL_VERTEX_LIST_SORTED) && \ defined(GRIN_ENABLE_VERTEX_LIST_ARRAY) - test_index_order(uri_str); + test_index_order(uri_str, schema_file, bulk_load_file); #endif #ifdef GRIN_ENABLE_VERTEX_INTERNAL_ID_INDEX - test_index_internal_id(uri_str); + test_index_internal_id(uri_str, schema_file, bulk_load_file); #endif } -void test_vertex_property_value(const char* uri_str) { - GRIN_GRAPH g = get_graph(uri_str, 0); +void test_vertex_property_value(const char* uri_str, const char* schema_file,const char* bulk_load_file) { + GRIN_GRAPH g = get_graph(uri_str, schema_file,bulk_load_file, 0); GRIN_VERTEX_TYPE vt = grin_get_vertex_type_by_name(g, "person"); GRIN_VERTEX_PROPERTY vp = grin_get_vertex_property_by_name(g, vt, "age"); GRIN_VERTEX v = get_one_person(g); @@ -1049,17 +1063,23 @@ void test_vertex_property_value(const char* uri_str) { grin_destroy_graph(g); } -void test_perf(const char* uri_str) { test_vertex_property_value(uri_str); } - +void test_perf(const char* uri_str,const char* schema_file,const char* bulk_load_file) { test_vertex_property_value(uri_str, schema_file, bulk_load_file); } +// uri_str = +//"flex://" +// "../../../../storages/rt_mutable_graph/modern_graph/"; int main(int argc, char** argv) { - const char* uri_str = - "flex://" - "../../../../storages/rt_mutable_graph/modern_graph/"; + if (argc != 4) { + printf("Usage: %s \n", argv[0]); + return 1; + } + const char* uri_str = argv[1]; + const char* schema_file = argv[2]; + const char* bulk_load_file = argv[3]; - test_index(uri_str); - test_property(uri_str); + test_index(uri_str, schema_file, bulk_load_file); + test_property(uri_str, schema_file, bulk_load_file); // test_partition(uri_str); - test_topology(uri_str); - test_perf(uri_str); + 
test_topology(uri_str, schema_file, bulk_load_file); + test_perf(uri_str, schema_file, bulk_load_file); return 0; } diff --git a/flex/engines/hqps_db/core/operator/edge_expand.h b/flex/engines/hqps_db/core/operator/edge_expand.h index 4d365ed38aa9..cfa6cf081b0d 100644 --- a/flex/engines/hqps_db/core/operator/edge_expand.h +++ b/flex/engines/hqps_db/core/operator/edge_expand.h @@ -731,20 +731,25 @@ class EdgeExpand { // Expand from multi label vertices and though multi edge labels. // result in general edge set. auto src_label = cur_vertex_set.GetLabel(); - LOG(INFO) << "[EdgeExpandEMultiTriplet] real labels: " - << gs::to_string(edge_labels); + LOG(INFO) << "[EdgeExpandEMultiTriplet] real labels: "; + for (auto i = 0; i < edge_labels.size(); ++i) { + LOG(INFO) << std::to_string(edge_labels[i][0]) << " " + << std::to_string(edge_labels[i][1]) << " " + << std::to_string(edge_labels[i][2]); + } // for each triplet, returns a vector of edge iters. auto& vertices = cur_vertex_set.GetVertices(); using sub_graph_t = typename GRAPH_INTERFACE::sub_graph_t; using edge_iter_t = typename sub_graph_t::iterator; std::vector sub_graphs; + auto prop_names_vec = prop_names_to_vec(prop_names); for (auto i = 0; i < edge_labels.size(); ++i) { // Check whether the edge triplet match input vertices. 
// return a hanlder to get edges - auto sub_graph_vec = - graph.GetSubGraph(edge_labels[i][0], edge_labels[i][1], - edge_labels[i][2], gs::to_string(direction)); + auto sub_graph_vec = graph.GetSubGraph( + edge_labels[i][0], edge_labels[i][1], edge_labels[i][2], + gs::to_string(direction), prop_names_vec[i]); for (auto sub_graph : sub_graph_vec) { sub_graphs.emplace_back(sub_graph); } @@ -811,7 +816,8 @@ class EdgeExpand { } auto set = UnTypedEdgeSet( - vertices, label_indices, label_vec, std::move(label_to_subgraphs)); + vertices, label_indices, label_vec, std::move(label_to_subgraphs), + direction); return std::make_pair(std::move(set), std::move(offsets)); } @@ -854,12 +860,13 @@ class EdgeExpand { using sub_graph_t = typename GRAPH_INTERFACE::sub_graph_t; using edge_iter_t = typename sub_graph_t::iterator; std::vector sub_graphs; + auto prop_names_vec = prop_names_to_vec(prop_names); for (auto i = 0; i < edge_labels.size(); ++i) { // Check whether the edge triplet match input vertices. 
// return a hanlder to get edges - auto sub_graph_vec = - graph.GetSubGraph(edge_labels[i][0], edge_labels[i][1], - edge_labels[i][2], gs::to_string(direction)); + auto sub_graph_vec = graph.GetSubGraph( + edge_labels[i][0], edge_labels[i][1], edge_labels[i][2], + gs::to_string(direction), prop_names_vec[i]); for (auto sub_graph : sub_graph_vec) { sub_graphs.emplace_back(sub_graph); } @@ -936,7 +943,8 @@ class EdgeExpand { } auto set = UnTypedEdgeSet( - vertices, label_indices, label_vec, std::move(label_to_subgraphs)); + vertices, label_indices, label_vec, std::move(label_to_subgraphs), + direction); return std::make_pair(std::move(set), std::move(offsets)); } @@ -1490,7 +1498,26 @@ class EdgeExpand { << gs::to_string(edge_label_id) << ", new vertices count: " << tmp_offset.back(); } -}; // namespace gs + + template + static void emplace_prop_names_to_vec( + std::vector>& vec_vec_prop_names, + std::tuple...>& prop_names, + std::index_sequence) { + (vec_vec_prop_names.emplace_back(array_to_vec(std::get(prop_names))), + ...); + } + template + static std::vector> prop_names_to_vec( + std::tuple...>& prop_names) { + std::vector> vec_vec_prop_names; + vec_vec_prop_names.reserve(sizeof...(PropTuple)); + emplace_prop_names_to_vec( + vec_vec_prop_names, prop_names, + std::make_index_sequence()); + return vec_vec_prop_names; + } +}; } // namespace gs diff --git a/flex/engines/hqps_db/core/operator/sink.h b/flex/engines/hqps_db/core/operator/sink.h index 97f3c397c775..397b7cd6605e 100644 --- a/flex/engines/hqps_db/core/operator/sink.h +++ b/flex/engines/hqps_db/core/operator/sink.h @@ -119,7 +119,7 @@ template )>::type* = nullptr> void template_set_value(common::Value* value, T v) { - value->set_str(v.data(), v.size()); + value->mutable_str()->assign(v.data(), v.size()); } template -struct CppTypeToPropertyType; - -template <> -struct CppTypeToPropertyType { - static constexpr PropertyType value = PropertyType::kInt32; -}; - -template <> -struct CppTypeToPropertyType { - 
static constexpr PropertyType value = PropertyType::kInt64; -}; - -template <> -struct CppTypeToPropertyType { - static constexpr PropertyType value = PropertyType::kDouble; -}; - -template <> -struct CppTypeToPropertyType { - static constexpr PropertyType value = PropertyType::kString; -}; - -template <> -struct CppTypeToPropertyType { - static constexpr PropertyType value = PropertyType::kString; -}; - template struct return_type; diff --git a/flex/engines/hqps_db/core/utils/keyed.h b/flex/engines/hqps_db/core/utils/keyed.h index 2bd8dff06708..58dda429a2e3 100644 --- a/flex/engines/hqps_db/core/utils/keyed.h +++ b/flex/engines/hqps_db/core/utils/keyed.h @@ -383,6 +383,21 @@ struct KeyedAggT, } }; +template +struct KeyedAggT, AggFunc::COUNT, + std::tuple, + std::integer_sequence> { + using agg_res_t = Collection; + using aggregate_res_builder_t = CountBuilder; + + static aggregate_res_builder_t create_agg_builder( + const FlatEdgeSet& set, const GI& graph, + std::tuple>& selectors) { + return CountBuilder(); + } +}; + template static inline auto insert_into_builder_v2_impl( diff --git a/flex/engines/hqps_db/database/adj_list.h b/flex/engines/hqps_db/database/adj_list.h index 2f28afcdde78..fb8f8e996a01 100644 --- a/flex/engines/hqps_db/database/adj_list.h +++ b/flex/engines/hqps_db/database/adj_list.h @@ -49,9 +49,14 @@ class EdgeIter { inline label_id_t GetSrcLabel() const { return label_triplet_[0]; } inline Any GetData() const { return ptr1_->get_data(); } - inline bool IsValid() const { return ptr1_->is_valid(); } + inline bool IsValid() const { return ptr1_ && ptr1_->is_valid(); } - size_t Size() const { return ptr1_->size(); } + size_t Size() const { + if (ptr1_) { + return ptr1_->size(); + } + return 0; + } private: std::shared_ptr ptr1_; @@ -66,21 +71,28 @@ class SubGraph { using iterator = EdgeIter; using label_id_t = LabelT; SubGraph(const MutableCsrBase* first, - const std::array& label_triplet) - : first_(first), label_triplet_(label_triplet) {} + const 
std::array& label_triplet, + const std::vector& prop_names) + : first_(first), label_triplet_(label_triplet), prop_names_(prop_names) {} inline iterator get_edges(VID_T vid) const { - return iterator(label_triplet_, first_->edge_iter(vid)); + if (first_) { + return iterator(label_triplet_, first_->edge_iter(vid)); + } + return iterator(label_triplet_, nullptr); } label_id_t GetSrcLabel() const { return label_triplet_[0]; } label_id_t GetEdgeLabel() const { return label_triplet_[2]; } label_id_t GetDstLabel() const { return label_triplet_[1]; } + const std::vector& GetPropNames() const { return prop_names_; } + private: const MutableCsrBase* first_; // We assume first is out edge, second is in edge. std::array label_triplet_; + std::vector prop_names_; }; template diff --git a/flex/engines/hqps_db/database/mutable_csr_interface.h b/flex/engines/hqps_db/database/mutable_csr_interface.h index 5531246ddbf2..0f1b9f3c89fa 100644 --- a/flex/engines/hqps_db/database/mutable_csr_interface.h +++ b/flex/engines/hqps_db/database/mutable_csr_interface.h @@ -498,21 +498,21 @@ class MutableCSRInterface { // get edges with input vids. return a edge list. 
std::vector> GetSubGraph(const label_id_t src_label_id, const label_id_t dst_label_id, - const label_id_t edge_label_id, - const std::string& direction_str) const { + const label_id_t edge_label_id, const std::string& direction_str, + const std::vector& prop_names) const { const MutableCsrBase *csr = nullptr, *other_csr = nullptr; if (direction_str == "out" || direction_str == "Out" || direction_str == "OUT") { csr = db_session_.graph().get_oe_csr(src_label_id, dst_label_id, edge_label_id); - return std::vector{ - sub_graph_t{csr, {src_label_id, dst_label_id, edge_label_id}}}; + return std::vector{sub_graph_t{ + csr, {src_label_id, dst_label_id, edge_label_id}, prop_names}}; } else if (direction_str == "in" || direction_str == "In" || direction_str == "IN") { csr = db_session_.graph().get_ie_csr(src_label_id, dst_label_id, edge_label_id); - return std::vector{ - sub_graph_t{csr, {src_label_id, dst_label_id, edge_label_id}}}; + return std::vector{sub_graph_t{ + csr, {src_label_id, dst_label_id, edge_label_id}, prop_names}}; } else if (direction_str == "both" || direction_str == "Both" || direction_str == "BOTH") { csr = db_session_.graph().get_oe_csr(src_label_id, dst_label_id, @@ -520,8 +520,11 @@ class MutableCSRInterface { other_csr = db_session_.graph().get_ie_csr(src_label_id, dst_label_id, edge_label_id); return std::vector{ - sub_graph_t{csr, {src_label_id, dst_label_id, edge_label_id}}, - sub_graph_t{other_csr, {dst_label_id, src_label_id, edge_label_id}}}; + sub_graph_t{ + csr, {src_label_id, dst_label_id, edge_label_id}, prop_names}, + sub_graph_t{other_csr, + {dst_label_id, src_label_id, edge_label_id}, + prop_names}}; } else { throw std::runtime_error("Not implemented - " + direction_str); } diff --git a/flex/engines/hqps_db/structures/multi_edge_set/flat_edge_set.h b/flex/engines/hqps_db/structures/multi_edge_set/flat_edge_set.h index 047f2f02e865..110e0b320bd9 100644 --- a/flex/engines/hqps_db/structures/multi_edge_set/flat_edge_set.h +++ 
b/flex/engines/hqps_db/structures/multi_edge_set/flat_edge_set.h @@ -648,112 +648,6 @@ class SingleLabelEdgeSet { Direction direction_; }; -// template -// class FlatEdgeSet { -// public: -// // TODO: use std::tuple is enough -// using ele_tuple_t = std::tuple; -// using index_ele_tuple_t = std::tuple; -// using iterator = FlatEdgeSetIter; -// using self_type_t = FlatEdgeSet; -// using flat_t = self_type_t; -// using data_tuple_t = ele_tuple_t; - -// static constexpr bool is_multi_label = false; -// static constexpr bool is_collection = false; -// static constexpr bool is_edge_set = true; -// static constexpr bool is_multi_src = false; -// static constexpr bool is_multi_dst_label = false; - -// FlatEdgeSet(std::vector&& vec, -// std::vector>&& label_triplet, -// std::vector&& label_triplet_ind, Direction& dire) -// : vec_(std::move(vec)), -// label_triplet_(std::move(label_triplet)), -// label_triplet_ind_(std::move(label_triplet_ind)), -// direction_(dire) { -// CHECK(label_triplet_ind_.size() == vec_.size()); -// } - -// iterator begin() const { return iterator(vec_, 0); } - -// iterator end() const { return iterator(vec_, vec_.size()); } - -// template -// flat_t Flat( -// std::vector>& index_ele_tuple) -// const -// { -// std::vector> res; -// std::vector label_triplet_ind; -// res.reserve(index_ele_tuple.size()); -// label_triplet_ind.reserve(index_ele_tuple.size()); -// for (auto i = 0; i < index_ele_tuple.size(); ++i) { -// auto cur_ind_ele = std::get(index_ele_tuple[i]); -// res.push_back(std::get<1>(cur_ind_ele)); -// label_triplet_ind.push_back(label_triplet_ind_[std::get<0>(cur_ind_ele)]); -// } -// return FlatEdgeSet(std::move(res), label_triplet_, -// std::move(label_triplet_ind)); -// } -// template -// void fillBuiltinProps(std::vector>& tuples, -// PropNameArray& prop_names, -// std::vector& repeat_array) { -// fillBuiltinPropsImpl(tuples, prop_names, repeat_array, -// std::make_index_sequence()); -// } - -// // fill builtin props withour repeat 
array. -// template -// void fillBuiltinProps(std::vector>& tuples, -// PropNameArray& prop_names) { -// std::vector repeat_array(vec_.size(), 1); -// fillBuiltinPropsImpl(tuples, prop_names, repeat_array, -// std::make_index_sequence()); -// } - -// template -// std::pair, -// std::vector> GetVertices(VOpt v_opt, std::array& labels, -// EXPR& expr) const { -// // We only contains one label for dst vertices. -// CHECK(v_opt == VOpt::End); -// std::vector offsets; -// std::vector vids; -// offsets.reserve(Size()); -// offsets.emplace_back(0); -// // TODO: check labels. -// bool flag = false; -// for (auto l : labels) { -// if (l == dst_label_) { -// flag = true; -// } -// } -// if (flag) { -// for (auto iter : *this) { -// vids.emplace_back(iter.GetDst()); -// offsets.emplace_back(vids.size()); -// } -// } else { -// size_t size = Size(); -// for (auto i = 0; i < size; ++i) { -// offsets.emplace_back(0); -// } -// } -// auto set = make_default_row_vertex_set(std::move(vids), dst_label_); -// return std::make_pair(std::move(set), std::move(offsets)); -// } - -// size_t Size() const { return vec_.size(); } - -// private: -// std::vector vec_; -// std::vector> label_triplet_; -// std::vector label_triplet_ind_; -// Direction direction_; -// }; } // namespace gs #endif // ENGINES_HQPS_ENGINE_DS_MULTI_EDGE_SET_FLAT_EDGE_SET_H_ \ No newline at end of file diff --git a/flex/engines/hqps_db/structures/multi_edge_set/untyped_edge_set.h b/flex/engines/hqps_db/structures/multi_edge_set/untyped_edge_set.h index 4e586b0287a5..fcdc9d1a42a8 100644 --- a/flex/engines/hqps_db/structures/multi_edge_set/untyped_edge_set.h +++ b/flex/engines/hqps_db/structures/multi_edge_set/untyped_edge_set.h @@ -23,6 +23,7 @@ #include "flex/engines/hqps_db/core/utils/hqps_utils.h" #include "flex/engines/hqps_db/structures/multi_vertex_set/general_vertex_set.h" +#include "flex/utils/arrow_utils.h" #include "grape/grape.h" namespace gs { @@ -170,12 +171,14 @@ class UnTypedEdgeSet { const std::vector& 
src_v, const std::vector& label_indices, const std::vector& labels, - std::unordered_map>&& adj_lists) + std::unordered_map>&& adj_lists, + const Direction& direction) : src_vertices_(src_v), label_indices_(label_indices), src_labels_(labels), adj_lists_(std::move(adj_lists)), - size_(0) { + size_(0), + direction_(direction) { sanity_check(); } @@ -367,6 +370,67 @@ class UnTypedEdgeSet { LOG(FATAL) << "not implemented, and should not be called"; } + template ::type* = nullptr> + auto ProjectWithRepeatArray(const std::vector& repeat_array, + KeyAlias& key_alias) const { + using dst_ele_tuple_t = std::tuple; + CHECK(repeat_array.size() == Size()); + size_t real_size = 0; + for (auto v : repeat_array) { + real_size += v; + } + std::vector dst_eles; + dst_eles.reserve(real_size); + auto edge_label_triplets = get_edge_triplets(); + auto edge_iters = generate_iters(); + std::vector label_triplet_indices; + label_triplet_indices.reserve(real_size); + std::vector sizes; + sizes.emplace_back(0); + for (auto i = 0; i < edge_label_triplets.size(); ++i) { + sizes.emplace_back(sizes.back() + edge_label_triplets[i].size()); + } + + // 0,2,4 + size_t cur_ind = 0; + for (auto i = 0; i < src_vertices_.size(); ++i) { + auto src_vid = src_vertices_[i]; + auto& cur_edge_iters = edge_iters[i]; + auto src_label_ind = label_indices_[i]; + auto src_label = src_labels_[src_label_ind]; + auto cur_triplets_vec = edge_label_triplets[src_label_ind]; + CHECK(cur_triplets_vec.size() == cur_edge_iters.size()); + + for (auto j = 0; j < cur_edge_iters.size(); ++j) { + auto& cur_iter = cur_edge_iters[j]; + while (cur_iter.IsValid()) { + auto dst_vid = cur_iter.GetDstId(); + auto data = cur_iter.GetData(); + for (auto k = 0; k < repeat_array[cur_ind]; ++k) { + dst_eles.emplace_back(std::make_tuple(src_vid, dst_vid, data)); + label_triplet_indices.emplace_back(sizes[src_label_ind] + j); + } + cur_iter.Next(); + cur_ind += 1; + } + } + } + std::vector> res_label_triplets; + // put edge_label_triplets 
into res_label_triplets + for (auto i = 0; i < edge_label_triplets.size(); ++i) { + auto& cur_triplets_vec = edge_label_triplets[i]; + for (auto j = 0; j < cur_triplets_vec.size(); ++j) { + res_label_triplets.emplace_back(cur_triplets_vec[j]); + } + } + std::vector> prop_names = get_prop_namees(); + CHECK(prop_names.size() == res_label_triplets.size()); + return FlatEdgeSet( + std::move(dst_eles), std::move(res_label_triplets), prop_names, + std::move(label_triplet_indices), direction_); + } + private: std::pair, std::unordered_map> preprocess_getting_labels(const std::vector& req_labels, @@ -419,6 +483,35 @@ class UnTypedEdgeSet { << " vertices, with " << edge_iter_vecs.size() << " iters"; return edge_iter_vecs; } + + std::vector>> get_edge_triplets() const { + std::vector>> ret; + for (auto iter : adj_lists_) { + auto& sub_graphs = iter.second; + std::vector> tmp; + for (auto i = 0; i < sub_graphs.size(); ++i) { + auto& sub_graph = sub_graphs[i]; + tmp.emplace_back(std::array({sub_graph.GetSrcLabel(), + sub_graph.GetDstLabel(), + sub_graph.GetEdgeLabel()})); + } + ret.emplace_back(std::move(tmp)); + } + return ret; + } + + std::vector> get_prop_namees() const { + std::vector> ret; + for (auto iter : adj_lists_) { + auto& sub_graphs = iter.second; + for (auto i = 0; i < sub_graphs.size(); ++i) { + auto& sub_graph = sub_graphs[i]; + ret.push_back(sub_graph.GetPropNames()); + } + } + return ret; + } + void sanity_check() { CHECK(src_vertices_.size() == label_indices_.size()); for (auto v : label_indices_) { @@ -436,6 +529,7 @@ class UnTypedEdgeSet { std::unordered_map> adj_lists_; // match src_label to all triplet. 
mutable size_t size_; // computed lazily + Direction direction_; }; } // namespace gs diff --git a/flex/interactive/README.md b/flex/interactive/README.md index fd6ca0e3e239..34be2f2596d4 100755 --- a/flex/interactive/README.md +++ b/flex/interactive/README.md @@ -3,3 +3,11 @@ GraphScope Interactive is a specialized construction of [GraphScope Flex](https://github.com/alibaba/GraphScope/tree/main/flex), designed to handle concurrent graph queries at an impressive speed. Its primary goal is to process as many queries as possible within a given timeframe, emphasizing a high query throughput rate. For the full documentation of GraphScope Interactive, please refer to [GraphScope-Interactive](https://graphscope.io/docs/interactive_engine/graphscope_interactive). + + +# problems +- 不在任何`.yaml`中配置graph_name相关的内容,`graph_name`相关必须在命令行指定 +- compiler: + - 去掉对workspace+data+graph.name的约定俗称的拼接,必须显式指定 +- gs interactive logs get server_log/compiler_log + diff --git a/flex/interactive/bin/db_admin.sh b/flex/interactive/bin/db_admin.sh deleted file mode 100755 index 8f98befcc53d..000000000000 --- a/flex/interactive/bin/db_admin.sh +++ /dev/null @@ -1,654 +0,0 @@ -#!/bin/bash -# Copyright 2020 Alibaba Group Holding Limited. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -# The product name -DB_PROD_NAME="interactive" - -# colored error and info functions to wrap messages. 
-RED='\033[0;31m' -GREEN='\033[0;32m' -NC='\033[0m' # No Color -err() { - echo -e "${RED}[$(date +'%Y-%m-%d %H:%M:%S')] -ERROR- $* ${NC}" >&2 -} - -info() { - echo -e "${GREEN}[$(date +'%Y-%m-%d %H:%M:%S')] -INFO- $* ${NC}" -} - -################## Some Util Functions ################## -function parse_yaml { - local prefix=$2 - local s='[[:space:]]*' w='[a-zA-Z0-9_]*' fs=$(echo @|tr @ '\034') - sed -ne "s|^\($s\):|\1|" \ - -e "s|^\($s\)\($w\)$s:$s[\"']\(.*\)[\"']$s\$|\1$fs\2$fs\3|p" \ - -e "s|^\($s\)\($w\)$s:$s\(.*\)$s\$|\1$fs\2$fs\3|p" $1 | - awk -F$fs '{ - indent = length($1)/2; - vname[indent] = $2; - for (i in vname) {if (i > indent) {delete vname[i]}} - if (length($3) > 0) { - vn=""; for (i=0; i/dev/null 2>&1 - pwd -P -)" -info "HOST_DB_HOME = ${HOST_DB_HOME}" - -#################### DEFINE CONSTANTS #################### -GRAPHSCOPE_GROUP_ID=1001 - -# the configuration directory -HOST_DB_CONF_DIR="${HOST_DB_HOME}/conf" -# the data directory -HOST_DB_DATA_DIR="${HOST_DB_HOME}/data" -# the log directory -HOST_DB_LOG_DIR="${HOST_DB_HOME}/logs" -HOST_DB_SERVER_OUTPUT_LOG="${HOST_DB_LOG_DIR}/server.log" -HOST_DB_COMPILER_OUTPUT_LOG="${HOST_DB_LOG_DIR}/compiler.log" -HOST_DB_INTERACTIVE_YAML="${HOST_DB_CONF_DIR}/interactive.yaml" -HOST_DB_EXAMPLE_DATASET_DIR=${HOST_DB_HOME}/"examples/sf0.1-raw/" -HOST_DB_RUNNING_FILE="${HOST_DB_HOME}/.running" -# will export DOCKER_DB_HOME, if not set, exist -get_docker_workspace_from_yaml "${HOST_DB_INTERACTIVE_YAML}" - -DOCKER_DB_GRAPHSCOPE_HOME="/home/graphscope/GraphScope" -DOCKER_DB_DATA_DIR="${DOCKER_DB_HOME}/data" -DOCKER_DB_LOG_DIR="${DOCKER_DB_HOME}/logs" -DOCKER_DB_CONF_DIR="${DOCKER_DB_HOME}/conf" -DOCKER_DB_IR_CONF_FILE="${DOCKER_DB_HOME}/conf/interactive.properties" -DOCKER_DB_GIE_HOME="${DOCKER_DB_GRAPHSCOPE_HOME}/interactive_engine/" -DOCKER_DB_INTERACTIVE_YAML="${DOCKER_DB_HOME}/conf/interactive.yaml" -DOCKER_DB_SERVER_BIN="${DOCKER_DB_GRAPHSCOPE_HOME}/flex/build/bin/sync_server" 
-DOCKER_DB_COMPILER_BIN="com.alibaba.graphscope.GraphServer" -DOCKER_DB_GEN_BIN="${DOCKER_DB_GRAPHSCOPE_HOME}/flex/bin/load_plan_and_gen.sh" -DOCKER_DB_SERVER_OUTPUT_LOG=${DOCKER_DB_LOG_DIR}/server.log -DOCKER_DB_COMPILER_OUTPUT_LOG=${DOCKER_DB_LOG_DIR}/compiler.log -export DOCKER_DB_CONNECTOR_PORT=7687 -DB_CONNECT_DEFAULT_PORT=7687 -# update the port by parsing the yaml file -DOCKER_DB_CONNECTOR_PORT=$(parse_yaml "${HOST_DB_INTERACTIVE_YAML}" | grep "compiler_endpoint_boltConnector_port" | awk -F "=" '{print $2}') -#remove "" and space -DOCKER_DB_CONNECTOR_PORT=$(echo "${DOCKER_DB_CONNECTOR_PORT}" | sed 's/^"//' | sed 's/"$//') - -EXAMPLE_DATA_SET_URL="https://github.com/GraphScope/gstest.git" - -################### IMAGE VERSION ################### -GIE_DB_IMAGE_VERSION="v0.0.1" -GIE_DB_IMAGE_NAME="registry.cn-hongkong.aliyuncs.com/graphscope/${DB_PROD_NAME}" -GIE_DB_CONTAINER_NAME="${DB_PROD_NAME}-server" - - -#################### DEFINE FUNCTIONS #################### -function check_running_containers_and_exit(){ - # check if there is any running containers - info "Check running containers and exit" - running_containers=$(docker ps -a --format "{{.Names}}" | grep "${GIE_DB_CONTAINER_NAME}") - if [ -n "${running_containers}" ]; then - err "There are running containers: ${running_containers}, please stop them first." 
- exit 1 - fi - info "finish check" -} - -function check_container_running(){ - if [ "$(docker inspect -f '{{.State.Running}}' "${GIE_DB_CONTAINER_NAME}")" = "true" ]; then - info "container ${GIE_DB_CONTAINER_NAME} is running" - else - info "container ${GIE_DB_CONTAINER_NAME} is not running" - # start the container - docker start "${GIE_DB_CONTAINER_NAME}" - fi -} - -function ensure_container_running(){ - if [ "$(docker inspect -f '{{.State.Running}}' "${GIE_DB_CONTAINER_NAME}")" = "true" ]; then - info "container ${GIE_DB_CONTAINER_NAME} is running" - else - info "container ${GIE_DB_CONTAINER_NAME} is not running" - # start the container - docker start "${GIE_DB_CONTAINER_NAME}" - fi -} - -function check_process_running_in_container(){ - local container_name=$1 - local process_name=$2 - local error_msg=$3 - local process_id=$(docker top "${container_name}" | grep "${process_name}" | awk '{print $2}\') - if [ -z "${process_id}" ]; then - err "process ${process_name} is not running in container ${container_name}" - err "${error_msg}" - exit 1 - fi - info "process ${process_name} is running in container ${container_name}, process id is ${process_id}" -} - - -#################### DEFINE USAGE #################### -# parse the args and set the variables. -function usage() { - init_usage - start_usage - stop_usage - restart_usage - compile_usage - show_stored_procedure_usage - download_dataset_usage - destroy_usage -} - -function init_usage() { - cat << EOF - db_admin.sh init -p[---publish] - -v[--volume] - --version - Init the database, create the containers. --publish and --volume can be used multiple times. -EOF -} - -function start_usage() { - cat << EOF - db_admin.sh start -n [--name] -b [--bulk-load] -r[--root-data-dir] - Start the database with the given graph. graph schema file should be placed at ./data/{graph_name}/graph.yaml. - If mode is override, we need to clear the data directory first. 
-EOF -} - -function stop_usage() { - cat << EOF - db_admin.sh stop - Stop the database with the given graph. -EOF -} - -function restart_usage() { - cat << EOF - db_admin.sh restart - Restart the database with current running graph. -EOF -} - -function compile_usage(){ - cat << EOF - db_admin.sh compile -g[--graph] -i ${DOCKER_DB_COMPILER_OUTPUT_LOG} 2>&1 &" - cmd=${cmd}"\"" - info "Running cmd: ${cmd}" - eval ${cmd} - sleep 6 - check_process_running_in_container ${GIE_DB_CONTAINER_NAME} ${DOCKER_DB_COMPILER_BIN} "check ${HOST_DB_COMPILER_OUTPUT_LOG} to see more details" - info "Successfuly start compiler" - info "DataBase service is running..., port is open on :${DOCKER_DB_CONNECTOR_PORT}" - - # if do_start success, we should write current args to ${HOST_DB_RUNNING_FILE} - echo "GRAPH_NAME=${GRAPH_NAME}" > ${HOST_DB_RUNNING_FILE} - echo "BULK_LOAD_FILE=${BULK_LOAD_FILE}" >> ${HOST_DB_RUNNING_FILE} - echo "ROOT_DATA_DIR=${root_data_dir}" >> ${HOST_DB_RUNNING_FILE} -# info "Successfuly write running args to ${HOST_DB_RUNNING_FILE}" -} - - -#################### Stop database #################### -function do_stop(){ - # stop the container - docker stop ${GIE_DB_CONTAINER_NAME} - info "Successfuly stop database" -} - - -#################### Get database status #################### -function do_status() { - if [ "$(docker inspect -f '{{.State.Running}}' "${GIE_DB_CONTAINER_NAME}")" = "true" ]; then - info "container ${GIE_DB_CONTAINER_NAME} is running" - else - info "container ${GIE_DB_CONTAINER_NAME} is not running" - info "Please start database first" - fi - # the container is running but the process is not running - check_process_running_in_container ${GIE_DB_CONTAINER_NAME} ${DOCKER_DB_SERVER_BIN} "check ${HOST_DB_SERVER_OUTPUT_LOG} to see more details" - check_process_running_in_container ${GIE_DB_CONTAINER_NAME} ${DOCKER_DB_COMPILER_BIN} "check ${HOST_DB_COMPILER_OUTPUT_LOG} to see more details" - info "Database service is running..., port is open on 
:${DOCKER_DB_CONNECTOR_PORT}" -} - - -#################### Download dataset #################### -function do_download_dataset(){ - git clone ${EXAMPLE_DATA_SET_URL} ${HOST_DB_EXAMPLE_DATASET_DIR} - info "Successfuly download dataset to: ${HOST_DB_EXAMPLE_DATASET_DIR}" -} - - -#################### Restart #################### -function do_restart() { - # if the container is not running, exit - if [ "$(docker inspect -f '{{.State.Running}}' "${GIE_DB_CONTAINER_NAME}")" = "false" ]; then - info "container ${GIE_DB_CONTAINER_NAME} is not running" - info "Please start database first" - exit 1 - fi - info "Stopping database first..." - do_stop - info "Successfuly stop database" - # read args from cached file. - # get num lines in file ${HOST_DB_RUNNING_FILE} - num_lines=$(wc -l < ${HOST_DB_RUNNING_FILE}) - if [ ${num_lines} -ne 3 ]; then - err "Error: ${HOST_DB_RUNNING_FILE} should have 3 lines, but got ${num_lines}, something wrong with the file ${HOST_DB_RUNNING_FILE}" - exit 1 - fi - # read args from file - GRAPH_NAME=$(sed -n '1p' ${HOST_DB_RUNNING_FILE} | cut -d '=' -f 2) - BULK_LOAD_FILE=$(sed -n '2p' ${HOST_DB_RUNNING_FILE} | cut -d '=' -f 2) - ROOT_DATA_DIR=$(sed -n '3p' ${HOST_DB_RUNNING_FILE} | cut -d '=' -f 2) - do_start -n ${GRAPH_NAME} -b ${BULK_LOAD_FILE} -r ${ROOT_DATA_DIR} - info "Finish restart database" -} - -# the compiled dynamic libs will be placed at data/${graph_name}/plugins/ -# after compilation, the user need to write the cooresponding yaml, telling the compiler about -# the input and output of the stored procedure -function do_compile() { - # check args num == 4 - # start container - ensure_container_running - if [ $# -ne 4 ]; then - err "stored_procedure command need 2 args, but got $#" - compile_usage - exit 1 - fi - graph_name="" - file_path="" # file path - output_dir="" - - while [[ $# -gt 0 ]]; do - key="$1" - case $key in - -g | --graph) - graph_name="$2" - info "graph_name = ${graph_name}" - shift # past argument - shift - ;; - -i | 
--input) - file_path="$2" - shift # past argument - shift - ;; - *) - err "unknown option $1" - compile_usage - exit 1 - ;; - esac - done - - # check graph_name - if [ -z "${graph_name}" ]; then - err "graph_name is empty" - compile_usage - exit 1 - fi - - # check file_path - if [ -z "${file_path}" ]; then - err "file_path is empty" - compile_usage - exit 1 - fi - - # get real file_path - file_name=$(basename "${file_path}") - real_file_path=$(realpath "${file_path}") - # check exists - if [ ! -f "${real_file_path}" ]; then - err "file ${real_file_path} not exist" - exit 1 - fi - # check graph dir exists - graph_dir="${HOST_DB_HOME}/data/${graph_name}" - if [ ! -d "${graph_dir}" ]; then - err "graph ${graph_name} not exist" - exit 1 - fi - mkdir -p "${graph_dir}/plugins" - - DOCKER_OUTPUT_DIR="${DOCKER_DB_HOME}/data/${graph_name}/plugins" - HOST_OUTPUT_DIR="${HOST_DB_HOME}/data/${graph_name}/plugins" - DOCKER_DB_GRAPH_SCHEMA="${DOCKER_DB_HOME}/data/${graph_name}/graph.json" - DOCKER_REAL_FILE_PATH="/tmp/${file_name}" - # docker cp file to container - cmd="docker cp ${real_file_path} ${GIE_DB_CONTAINER_NAME}:${DOCKER_REAL_FILE_PATH}" - eval ${cmd} || exit 1 - - cmd="docker exec ${GIE_DB_CONTAINER_NAME} bash -c \"" - cmd=${cmd}" ${DOCKER_DB_GEN_BIN}" - cmd=${cmd}" --engine_type=hqps" - cmd=${cmd}" --input=${DOCKER_REAL_FILE_PATH}" - cmd=${cmd}" --work_dir=/tmp/codegen/" - cmd=${cmd}" --ir_conf=${DOCKER_DB_IR_CONF_FILE}" - cmd=${cmd}" --graph_schema_path=${DOCKER_DB_GRAPH_SCHEMA}" - cmd=${cmd}" --gie_home=${DOCKER_DB_GIE_HOME}" - cmd=${cmd}" --output_dir=${DOCKER_OUTPUT_DIR}" - cmd=${cmd}" \"" - - echo "Running cmd: ${cmd}" - eval ${cmd} || exit 1 - # check output exists - # get the file_name of file_path - file_name="${file_name%.*}" - output_file="${HOST_OUTPUT_DIR}/lib${file_name}.so" - - if [ ! 
-f "${output_file}" ]; then - err "output file ${output_file} not exist, compilation failed" - exit 1 - fi - info "success generate dynamic lib ${output_file}, please create the cooresponding yaml file ${HOST_OUTPUT_DIR}/${file_name}.yaml." -} - -#################### Entry #################### -if [ $# -eq 0 ]; then - usage - exit 1 -fi - -while [[ $# -gt 0 ]]; do - key="$1" - - case $key in - -h | --help) - usage - exit - ;; - init) - shift - info "Start initiating database..." - do_init "$@" - exit 0 - ;; - start) - shift - info "Start database service..." - do_start "$@" - exit 0 - ;; - status) - shift - do_status "$@" - exit 0 - ;; - stop) - shift - do_stop "$@" - exit 0 - ;; - restart) - shift - do_restart # restart current graph - exit 0 - ;; - compile) - shift - do_compile "$@" - exit 0 - ;; - show_stored_procedure) - shift - do_show_stored_procedure "$@" - exit 0 - ;; - destroy) - shift - do_destroy "$@" - exit 0 - ;; - download_dataset) - shift - do_download_dataset - exit 0 - ;; - *) # unknown option - err "unknown option $1" - usage - exit 1 - ;; - esac -done - - - - diff --git a/flex/interactive/bin/gs_interactive b/flex/interactive/bin/gs_interactive new file mode 100755 index 000000000000..a0890560fede --- /dev/null +++ b/flex/interactive/bin/gs_interactive @@ -0,0 +1,1346 @@ +#!/bin/bash +# Copyright 2020 Alibaba Group Holding Limited. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +# The product name +DB_PROD_NAME="interactive" + +# colored error and info functions to wrap messages. +RED='\033[0;31m' +GREEN='\033[0;32m' +NC='\033[0m' # No Color +err() { + echo -e "${RED}[$(date +'%Y-%m-%d %H:%M:%S')] -ERROR- $* ${NC}" >&2 +} + +info() { + echo -e "${GREEN}[$(date +'%Y-%m-%d %H:%M:%S')] -INFO- $* ${NC}" +} + +################## Some Util Functions ################## + +function parse_yaml { + local prefix=$2 + local separator=${3:-_} + + local indexfix + # Detect awk flavor + if awk --version 2>&1 | grep -q "GNU Awk" ; then + # GNU Awk detected + indexfix=-1 + elif awk -Wv 2>&1 | grep -q "mawk" ; then + # mawk detected + indexfix=0 + fi + + local s='[[:space:]]*' sm='[ \t]*' w='[a-zA-Z0-9_]*' fs=${fs:-$(echo @|tr @ '\034')} i=${i:- } + cat $1 | \ + awk -F$fs "{multi=0; + if(match(\$0,/$sm\|$sm$/)){multi=1; sub(/$sm\|$sm$/,\"\");} + if(match(\$0,/$sm>$sm$/)){multi=2; sub(/$sm>$sm$/,\"\");} + while(multi>0){ + str=\$0; gsub(/^$sm/,\"\", str); + indent=index(\$0,str); + indentstr=substr(\$0, 0, indent+$indexfix) \"$i\"; + obuf=\$0; + getline; + while(index(\$0,indentstr)){ + obuf=obuf substr(\$0, length(indentstr)+1); + if (multi==1){obuf=obuf \"\\\\n\";} + if (multi==2){ + if(match(\$0,/^$sm$/)) + obuf=obuf \"\\\\n\"; + else obuf=obuf \" \"; + } + getline; + } + sub(/$sm$/,\"\",obuf); + print obuf; + multi=0; + if(match(\$0,/$sm\|$sm$/)){multi=1; sub(/$sm\|$sm$/,\"\");} + if(match(\$0,/$sm>$sm$/)){multi=2; sub(/$sm>$sm$/,\"\");} + } + print}" | \ + sed -e "s|^\($s\)?|\1-|" \ + -ne "s|^$s#.*||;s|$s#[^\"']*$||;s|^\([^\"'#]*\)#.*|\1|;t1;t;:1;s|^$s\$||;t2;p;:2;d" | \ + sed -ne "s|,$s\]$s\$|]|" \ + -e ":1;s|^\($s\)\($w\)$s:$s\(&$w\)\?$s\[$s\(.*\)$s,$s\(.*\)$s\]|\1\2: \3[\4]\n\1$i- \5|;t1" \ + -e "s|^\($s\)\($w\)$s:$s\(&$w\)\?$s\[$s\(.*\)$s\]|\1\2: \3\n\1$i- \4|;" \ + -e ":2;s|^\($s\)-$s\[$s\(.*\)$s,$s\(.*\)$s\]|\1- [\2]\n\1$i- \3|;t2" \ + -e "s|^\($s\)-$s\[$s\(.*\)$s\]|\1-\n\1$i- \2|;p" | \ + sed -ne "s|,$s}$s\$|}|" \ + -e 
":1;s|^\($s\)-$s{$s\(.*\)$s,$s\($w\)$s:$s\(.*\)$s}|\1- {\2}\n\1$i\3: \4|;t1" \ + -e "s|^\($s\)-$s{$s\(.*\)$s}|\1-\n\1$i\2|;" \ + -e ":2;s|^\($s\)\($w\)$s:$s\(&$w\)\?$s{$s\(.*\)$s,$s\($w\)$s:$s\(.*\)$s}|\1\2: \3 {\4}\n\1$i\5: \6|;t2" \ + -e "s|^\($s\)\($w\)$s:$s\(&$w\)\?$s{$s\(.*\)$s}|\1\2: \3\n\1$i\4|;p" | \ + sed -e "s|^\($s\)\($w\)$s:$s\(&$w\)\(.*\)|\1\2:\4\n\3|" \ + -e "s|^\($s\)-$s\(&$w\)\(.*\)|\1- \3\n\2|" | \ + sed -ne "s|^\($s\):|\1|" \ + -e "s|^\($s\)\(---\)\($s\)||" \ + -e "s|^\($s\)\(\.\.\.\)\($s\)||" \ + -e "s|^\($s\)-$s[\"']\(.*\)[\"']$s\$|\1$fs$fs\2|p;t" \ + -e "s|^\($s\)\($w\)$s:$s[\"']\(.*\)[\"']$s\$|\1$fs\2$fs\3|p;t" \ + -e "s|^\($s\)-$s\(.*\)$s\$|\1$fs$fs\2|" \ + -e "s|^\($s\)\($w\)$s:$s[\"']\?\(.*\)$s\$|\1$fs\2$fs\3|" \ + -e "s|^\($s\)[\"']\?\([^&][^$fs]\+\)[\"']$s\$|\1$fs$fs$fs\2|" \ + -e "s|^\($s\)[\"']\?\([^&][^$fs]\+\)$s\$|\1$fs$fs$fs\2|" \ + -e "s|$s\$||p" | \ + awk -F$fs "{ + gsub(/\t/,\" \",\$1); + if(NF>3){if(value!=\"\"){value = value \" \";}value = value \$4;} + else { + if(match(\$1,/^&/)){anchor[substr(\$1,2)]=full_vn;getline}; + indent = length(\$1)/length(\"$i\"); + vname[indent] = \$2; + value= \$3; + for (i in vname) {if (i > indent) {delete vname[i]; idx[i]=0}} + if(length(\$2)== 0){ vname[indent]= ++idx[indent] }; + vn=\"\"; for (i=0; i0)&&index(val, ref)==1){ + tmpval=assignment[val]; + sub(ref,full_vn,val); + if(match(val,\"$separator\$\")){ + gsub(ref,full_vn,tmpval); + } else if (length(tmpval) > 0) { + printf(\"%s=\\\"%s\\\"\n\", val, tmpval); + } + assignment[val]=tmpval; + } + } + } + } else if (length(value) > 0) { + printf(\"%s=\\\"%s\\\"\n\", full_vn, value); + } + }END{ + for(val in assignment){ + if(match(val,\"$separator\$\")) + printf(\"%s=\\\"%s\\\"\n\", val, assignment[val]); + } + }" +} + +# check if the file exists, if not, exit. +function check_file_exists(){ + if [ ! -f "$1" ]; then + err "file $1 not exists" + exit 1 + fi +} +function check_directory_exists(){ + if [ ! 
-d "$1" ]; then + err "directory $1 not exists" + exit 1 + fi +} + +HOST_DB_HOME="$( + cd "$(dirname "$0")/../" >/dev/null 2>&1 + pwd -P +)" +info "HOST_DB_HOME = ${HOST_DB_HOME}" + +################### GET USER INFO ################### +# get uid +uid=$(id -u) +# get group id +gid=$(id -g) + + +#################### DEFINE CONSTANTS #################### + +# the log directory +# HOST_DB_INTERACTIVE_YAML="${HOST_DB_CONF_DIR}/interactive.yaml" +HOST_DB_RUNNING_FILE="${HOST_DB_HOME}/.running" +HOST_DB_ENV_FILE="${HOST_DB_HOME}/.env" + +DOCKER_DB_GRAPHSCOPE_HOME="/home/graphscope/GraphScope" +DOCKER_DB_GIE_HOME="${DOCKER_DB_GRAPHSCOPE_HOME}/interactive_engine/" +DOCKER_DB_SERVER_BIN="${DOCKER_DB_GRAPHSCOPE_HOME}/flex/build/bin/sync_server" +DOCKER_DB_GRAPH_IMPORT_BIN="${DOCKER_DB_GRAPHSCOPE_HOME}/flex/build/tests/rt_mutable_graph/test_graph_loading" +DOCKER_DB_COMPILER_BIN="com.alibaba.graphscope.GraphServer" +DOCKER_DB_GEN_BIN="${DOCKER_DB_GRAPHSCOPE_HOME}/flex/bin/load_plan_and_gen.sh" +HOST_DB_TMP_DIR="/tmp" + +#################### DEFINE DEFAULT CONSTANTS #################### +DATABASE_VERSION="v0.0.2" +DATABASE_DEFAULT_GRAPH_NAME="modern" +DATABASE_CURRENT_GRAPH_NAME="modern" +DATABASE_DEFAULT_GRAPH_DOCKER_PATH="/home/graphscope/default_graph/${DATABASE_DEFAULT_GRAPH_NAME}" +DATABASE_DEFAULT_GRAPH_MOUNT_CMD="${HOST_DB_HOME}/examples/modern_graph/:${DATABASE_DEFAULT_GRAPH_DOCKER_PATH}" +DATABASE_VOLUMES="${DATABASE_DEFAULT_GRAPH_MOUNT_CMD}" +DATABASE_LOG_LEVEL="INFO" +DATABASE_PORTS="" + +## compiler related default configuration +DATABASE_COMPILER_PLANNER_IS_ON="true" +DATABASE_COMPILER_PLANNER_OPT="RBO" +DATABASE_COMPILER_PLANNER_RULES="FilterMatchRule,FilterIntoJoinRule,NotExistToAntiJoinRule" +DATABASE_COMPILER_ENDPOINT_ADDRESS="localhost" +DATABASE_COMPILER_BOLT_PORT="7687" +DATABASE_COMPILER_QUERY_TIMEOUT="20000" + +## hiactor related default configuration +DATABASE_COMPUTE_ENGINE_PORT="10000" +DATABASE_COMPUTE_ENGINE_SHARD_NUM=1 + +## directories 
+DATABASE_WORKSPACE="/home/graphscope/workspace/" +DATABASE_DATA_DIR_NAME="data" +DATABASE_LOG_DIR_NAME="logs" +DATABASE_CONF_DIR_NAME="conf" + + +################### IMAGE VERSION ################### +GIE_DB_IMAGE_VERSION="v0.0.2" +#GIE_DB_IMAGE_NAME="registry.cn-hongkong.aliyuncs.com/graphscope/${DB_PROD_NAME}" +GIE_DB_IMAGE_NAME="interactive" +GIE_DB_CONTAINER_NAME="${DB_PROD_NAME}-server" + + +#################### Prepare uncreated directories #################### + +info "Finish create log dir" + +#################### DEFINE FUNCTIONS #################### +function check_running_containers_and_exit(){ + # check if there is any running containers + info "Check running containers and exit" + running_containers=$(docker ps -a --format "{{.Names}}" | grep "${GIE_DB_CONTAINER_NAME}") + if [ -n "${running_containers}" ]; then + err "There are running containers: ${running_containers}, please stop them first." + exit 1 + fi + info "finish check" +} + +function check_container_running(){ + if [ "$(docker inspect -f '{{.State.Running}}' "${GIE_DB_CONTAINER_NAME}")" = "true" ]; then + info "container ${GIE_DB_CONTAINER_NAME} is running" + else + info "container ${GIE_DB_CONTAINER_NAME} is not running" + # start the container + docker start "${GIE_DB_CONTAINER_NAME}" + fi +} + +function ensure_container_running(){ + if [ "$(docker inspect -f '{{.State.Running}}' "${GIE_DB_CONTAINER_NAME}")" = "true" ]; then + info "container ${GIE_DB_CONTAINER_NAME} is running" + else + info "container ${GIE_DB_CONTAINER_NAME} is not running" + # start the container + docker start "${GIE_DB_CONTAINER_NAME}" + fi +} + +function check_process_running_in_container(){ + local container_name=$1 + local process_name=$2 + local error_msg=$3 + local process_id=$(docker top "${container_name}" | grep "${process_name}" | awk '{print $2}') + if [ -z "${process_id}" ]; then + err "process ${process_name} is not running in container ${container_name}" + err "${error_msg}" + exit 1 + fi + info 
"process ${process_name} is running in container ${container_name}, process id is ${process_id}" +} + +function check_process_not_running_in_container(){ + local container_name=$1 + local process_name=$2 + local error_msg=$3 + local process_id=$(docker top "${container_name}" | grep "${process_name}" | awk '{print $2}') + if [ -z "${process_id}" ]; then + info "process ${process_name} is not running in container ${container_name}" + else + err "process ${process_name} is running in container ${container_name}, process id is ${process_id}" + err "${error_msg}" + exit 1 + fi +} + +# check the given graph is locked or not. +function check_graph_not_running(){ + info "Check graph whether is not running" + if [ $# -ne 1 ]; then + err "Expect graph name given." + exit 1 + fi + local graph_name=$1 + # check whether .lock is present in container's data/${graph_name}/ directory + . ${HOST_DB_ENV_FILE} + local lock_file="${DATABASE_WORKSPACE}/data/${graph_name}/.lock" + info "Check lock file ${lock_file}" + # check lock_file whether exists in container, if not exists, exit 0, else exit 1 + docker exec "${GIE_DB_CONTAINER_NAME}" bash -c "[ ! 
-f ${lock_file} ]" +} + +function update_init_config_from_yaml(){ + if [ $# -ne 1 ]; then + err "Expect configuration file given" + exit 1 + fi + config_file=$1 + eval $(parse_yaml "${config_file}") + # update workspace if exists + if [ -n "${workspace}" ]; then + DATABASE_WORKSPACE="${workspace}" + fi + # update database version if exists + if [ -n "${version}" ]; then + DATABASE_VERSION="${version}" + fi + # append the found volumes to DATABASE_VOLUMES + # map the HOST_DB_HOME/data/ to ${DATABASE_WORKSPACE}/data + DATABASE_VOLUMES="${DATABASE_VOLUMES},${HOST_DB_HOME}/data:${DATABASE_WORKSPACE}/data" + + x=1 + while true; do + volume_x_key="volume_${x}" + volume_x=$(eval echo "\$${volume_x_key}") + if [ -z "${volume_x}" ]; then + break + fi + DATABASE_VOLUMES="${DATABASE_VOLUMES},${volume_x}" + x=$((x + 1)) + done + # append compiler port and engine port to DATABASE_PORTS + DATABASE_PORTS="${DATABASE_COMPILER_BOLT_PORT}:${DATABASE_COMPILER_BOLT_PORT}" + DATABASE_PORTS="${DATABASE_PORTS},${DATABASE_COMPUTE_ENGINE_PORT}:${DATABASE_COMPUTE_ENGINE_PORT}" +} + +function update_engine_config_from_yaml(){ + if [ $# -ne 1 ]; then + err "Expect configuration file given" + exit 1 + fi + config_file=$1 + eval $(parse_yaml "${config_file}") + if [ -n "${log_level}" ]; then + DATABASE_LOG_LEVEL="${log_level}" + fi + # default_graph + if [ -n "${default_graph}" ]; then + DATABASE_CURRENT_GRAPH_NAME="${default_graph}" + fi + # compiler + if [ -n ${compiler_planner_is_on} ]; then + DATABASE_COMPILER_PLANNER_IS_ON="${compiler_planner_is_on}" + fi + info "Found compiler planner opt: ${compiler_planner_is_on}, ${DATABASE_COMPILER_PLANNER_IS_ON}" + if [ -n ${compiler_planner_opt} ]; then + DATABASE_COMPILER_PLANNER_OPT="${compiler_planner_opt}" + fi + # append the founded compiler planner rules to DATABASE_COMPILER_PLANNER_RULES + x=1 + while true; do + compiler_planner_rules_x_key="compiler_planner_rules_${x}" + compiler_planner_rules_x=$(eval echo 
"\$${compiler_planner_rules_x_key}") + if [ -z "${compiler_planner_rules_x}" ]; then + break + fi + # check compiler_planner_rules_x present in DATABASE_COMPILER_PLANNER_RULES, if not, append + if [[ ! "${DATABASE_COMPILER_PLANNER_RULES}" =~ "${compiler_planner_rules_x}" ]]; then + DATABASE_COMPILER_PLANNER_RULES="${DATABASE_COMPILER_PLANNER_RULES},${compiler_planner_rules_x}" + fi + x=$((x + 1)) + done + if [ -n "${compiler_endpoint_address}" ]; then + DATABASE_COMPILER_ENDPOINT_ADDRESS="${compiler_endpoint_address}" + fi + if [ -n "${compiler_endpoint_bolt_connector_port}" ]; then + DATABASE_COMPILER_BOLT_PORT="${compiler_endpoint_bolt_connector_port}" + fi + if [ -n "${compiler_query_timeout}" ]; then + DATABASE_COMPILER_QUERY_TIMEOUT="${compiler_query_timeout}" + fi +} + + +#################### DEFINE USAGE #################### + +function init_usage() { + cat << EOF + db_admin.sh init -c [--config] + Init the database, create the containers. Specify the database version and volume mounting in the config yaml. +EOF +} + +function destroy_usage() { + cat << EOF + db_admin.sh destroy + Destroy the current database, remove the container. +EOF +} + +function create_usage() { + cat << EOF + db_admin.sh database create -n [--name] -c [--config] + Create a graph in database, with the provided schema file. + User should import data to the created graph. +EOF +} + +function remove_usage() { + cat << EOF + db_admin.sh database remove -n [--name] + Remove the database with the given graph. +EOF +} + +function import_usage() { + cat << EOF + db_admin.sh database import -n [--name] -c [--config] + Load the raw data specified in bulk load file to the specified graph. +EOF +} + +function database_usage(){ + create_usage + remove_usage + import_usage +} + + +function start_usage() { + cat << EOF + db_admin.sh service start -n [--name] -c [--config] + Start the graph service on the specified graph, with the provided engine config file. 
+EOF +} + +function stop_usage() { + cat << EOF + db_admin.sh service stop + Stop the database with the given graph. +EOF +} + +function restart_usage() { + cat << EOF + db_admin.sh service restart -c [--config] [engine config file] + Restart the database with current running graph. Can update with new engine config file. +EOF +} + +function get_log_usage() { + cat << EOF + db_admin.sh service get_log -o [--output] output directory + Get the log of the specified service/compiler, and write to the output file. +EOF +} + +function services_usage(){ + start_usage + stop_usage + restart_usage + get_log_usage +} + +function compile_usage(){ + cat << EOF + db_admin.sh procedure compile -g[--graph] -i + Compile cypher/.cc to dynamic library, according to the schema of graph. The output library will be placed at ./data/{graph_name}/lib. +EOF +} + +function show_stored_procedure_usage(){ + cat << EOF + db_admin.sh procedure show -n[--name] graph_name + Show all stored procedure for the given graph. +EOF +} + + +function procedure_usage(){ + compile_usage + show_stored_procedure_usage +} + +# parse the args and set the variables. 
+function usage() { + init_usage + destroy_usage + database_usage + services_usage + procedure_usage +} + +################### Generate config file ################### +function do_gen_conf(){ + # receive only one args, the config file + while [[ $# -gt 0 ]]; do + key="$1" + case $key in + -o | --output) + output_config_file="$2" + shift + shift + ;; + *) + err "unknown option $1" + exit 1 + ;; + esac + done + + #if output_config_file exists, remove + if [ -f "${output_config_file}" ]; then + rm "${output_config_file}" + fi + + # echo directories + echo "directories:" >> ${output_config_file} + echo " workspace: ${DATABASE_WORKSPACE}" >> ${output_config_file} + echo " subdirs:" >> ${output_config_file} + echo " data: ${DATABASE_DATA_DIR_NAME}" >> ${output_config_file} + echo " logs: ${DATABASE_LOG_DIR_NAME}" >> ${output_config_file} + echo " conf: ${DATABASE_CONF_DIR_NAME}" >> ${output_config_file} + + # log level + echo "log_level: ${DATABASE_LOG_LEVEL}" >> ${output_config_file} + + # current graph + echo "default_graph: ${DATABASE_CURRENT_GRAPH_NAME}" >> ${output_config_file} + + + #compute_engine + echo "compute_engine:" >> ${output_config_file} + echo " type: hiactor" >> ${output_config_file} + echo " hosts:" >> ${output_config_file} + echo " - localhost:${DATABASE_COMPUTE_ENGINE_PORT}" >> ${output_config_file} + echo " shard_num: ${DATABASE_COMPUTE_ENGINE_SHARD_NUM}" >> ${output_config_file} + + + #compiler + echo "compiler:" >> ${output_config_file} + echo " planner:" >> ${output_config_file} + echo " is_on: ${DATABASE_COMPILER_PLANNER_IS_ON}" >> ${output_config_file} + echo " opt: ${DATABASE_COMPILER_PLANNER_OPT}" >> ${output_config_file} + # split compiler planner rules and put as sequences in yaml + echo " rules:" >> ${output_config_file} + IFS=',' read -ra RULES_ARRAY <<<"${DATABASE_COMPILER_PLANNER_RULES}" + for rule in "${RULES_ARRAY[@]}"; do + echo " - ${rule}" >> ${output_config_file} + done + echo " endpoint:" >> ${output_config_file} + echo " 
default_listen_address: ${DATABASE_COMPILER_ENDPOINT_ADDRESS}" >> ${output_config_file} + echo " bolt_connector:" >> ${output_config_file} + echo " port: ${DATABASE_COMPILER_BOLT_PORT}" >> ${output_config_file} + echo " gremlin_connector:" >> ${output_config_file} + echo " disabled: true" >> ${output_config_file} + echo " port: 8182" >> ${output_config_file} + echo " query_timeout: ${DATABASE_COMPILER_QUERY_TIMEOUT}" >> ${output_config_file} + info "Finish generate config file ${output_config_file}" +} + +function generate_real_engine_conf(){ + # expect two args + if [ $# -ne 2 ]; then + err "Expect two args, but got $#" + exit 1 + fi + engine_config_file=$1 + real_engine_config_file=$2 + if [ -z "${engine_config_file}" ]; then + info "engine config file is not specified, using default engine config" + do_gen_conf -o ${real_engine_config_file} + else + check_file_exists "${engine_config_file}" + update_engine_config_from_yaml "${engine_config_file}" + do_gen_conf -o ${real_engine_config_file} + fi +} + + +#################### Init database #################### +# Init the current data base. +# create a user with same user id in container +function do_init(){ + # check running containers and exit + check_running_containers_and_exit + info "Ok, no running instance found, start init database..." + # if no containers running, procede to init + +# check args num 1, and get the first args as CONFIG_FILE + if [ $# -eq 0 ]; then + err "init command need 1 args, but got $#" + init_usage + exit 1 + fi + + while [[ $# -gt 0 ]]; do + key="$1" + case $key in + -c | --config) + config_file="$2" + shift # past argument + shift + ;; + *) + err "unknown option $1" + init_usage + exit 1 + ;; + esac + done + + check_file_exists "${config_file}" + + # parse yaml config + # eval $(parse_yaml "${config_file}") + + # Parse the configuration presented in yaml, and override the default values. + update_init_config_from_yaml "${config_file}" + + #0. 
Found workspace + info "Found docker db home: ${DATABASE_WORKSPACE}" + # put docker_workspace into env + echo "export DATABASE_WORKSPACE=${DATABASE_WORKSPACE}" >> ${HOST_DB_ENV_FILE} + echo "export DATABASE_DATA_DIR_NAME=${DATABASE_DATA_DIR_NAME}" >> ${HOST_DB_ENV_FILE} + info "Found databse version: ${DATABASE_VERSION}" + + #2. Found mounting volumes from yaml file + mount_cmd="" + # split DATABASE_VOLUMES and append to mount_cmd + IFS=',' read -ra VOLUME_ARRAY <<<"${DATABASE_VOLUMES}" + for volume in "${VOLUME_ARRAY[@]}"; do + # split with : and check host path exists + volume_value_array=(${volume//:/ }) + # if volume_value_array length is not 2, error + if [ ${#volume_value_array[@]} -ne 2 ]; then + err "volume ${volume_value_array} is not valid, should be :" + exit 1 + fi + # get host_path + host_path=${volume_value_array[0]} + docker_path=${volume_value_array[1]} + # check host_path exists + info "Found host path: ${host_path}" + check_directory_exists "${host_path}" || (err "host path ${host_path} not exists" && exit 1) + mount_cmd="${mount_cmd} -v ${volume}" + done +# mount_cmd="${mount_cmd} -v /etc/passwd:/etc/passwd:ro -v /etc/group:/etc/group:ro" + + info "Found docker volumes: ${mount_cmd}" + + #3. get mapped port + port_cmd="" + # split the DATABASE_PORTS and append to port_cmd + IFS=',' read -ra DATABASE_PORTS_ARRAY <<<"${DATABASE_PORTS}" + for ports in "${DATABASE_PORTS_ARRAY[@]}"; do + port_x_value_array=(${ports//:/ }) + # if volume_x_value_array length is not 2, error + if [ ${#port_x_value_array[@]} -ne 2 ]; then + err "port ${port_x_value_array} is not valid, should be :" + exit 1 + fi + # get host_path + host_port=${port_x_value_array[0]} + docker_port=${port_x_value_array[1]} + #check port are int + if ! [[ "${host_port}" =~ ^[0-9]+$ ]]; then + err "host port ${host_port} is not valid" + exit 1 + fi + if ! 
[[ "${docker_port}" =~ ^[0-9]+$ ]]; then + err "docker port ${docker_port} is not valid" + exit 1 + fi + port_cmd="${port_cmd} -p ${host_port}:${docker_port}" + done + info "Found docker port: ${port_cmd}" + + # get uid + local uid=$(id -u) + local gid=$(id -g) + # get group name + local group_name=$(id -gn) + # get username + local username=$(id -un) + + GIE_DB_IMAGE_NAME_TAG="${GIE_DB_IMAGE_NAME}:${DATABASE_VERSION}" + cmd="docker run -it -d --privileged --name ${GIE_DB_CONTAINER_NAME}" + # create user in container + cmd="${cmd} ${port_cmd} ${mount_cmd} ${GIE_DB_IMAGE_NAME_TAG} bash" + + info "Running cmd: ${cmd}" + eval ${cmd} || docker rm "${GIE_DB_CONTAINER_NAME}" + + info "Finish init database" + + # create the workspace directory in container + docker exec -u graphscope "${GIE_DB_CONTAINER_NAME}" bash -c "mkdir -p ${DATABASE_WORKSPACE}" || exit 1 + docker exec -u graphscope "${GIE_DB_CONTAINER_NAME}" bash -c "sudo chown -R graphscope:graphscope ${DATABASE_WORKSPACE}" || exit 1 + docker exec -u graphscope "${GIE_DB_CONTAINER_NAME}" bash -c "mkdir -p ${DATABASE_WORKSPACE}/logs" || exit 1 + docker exec -u graphscope "${GIE_DB_CONTAINER_NAME}" bash -c "mkdir -p ${DATABASE_WORKSPACE}/conf" || exit 1 +} + + +#################### Create graph #################### +function do_create(){ + while [[ $# -gt 0 ]]; do + key="$1" + case $key in + -n | --name) + graph_name="$2" + shift # past argument + shift + ;; + -c | --config) + schema_file="$2" + shift + shift + ;; + *) + err "unknown option $1" + create_usage + exit 1 + ;; + esac + done + # check graph_name is set + if [ -z "${graph_name}" ]; then + err "graph name is not specified" + create_usage + exit 1 + fi + check_file_exists "${schema_file}" + # check graph is running inside docker + check_graph_not_running ${graph_name} || err "Can not create graph ${graph_name}, since a graph with same nameing running." + # create the graph directory in the docker's workspace + . 
${HOST_DB_ENV_FILE} + docker_graph_dir="${DATABASE_WORKSPACE}/data/${graph_name}" + docker_graph_schema_file="${docker_graph_dir}/graph.yaml" + # check docker_graph_schema_file exists in the container, if exists, tell user to remove it first + docker exec "${GIE_DB_CONTAINER_NAME}" bash -c "[ -f ${docker_graph_schema_file} ] && echo \"graph ${graph_name} already exists, please remove it first\" && exit 1 || exit 0" || exit 1 + # create the graph directory in the docker's workspace + docker exec "${GIE_DB_CONTAINER_NAME}" bash -c "mkdir -p ${docker_graph_dir}" || exit 1 + # create plugins dir + docker exec "${GIE_DB_CONTAINER_NAME}" bash -c "mkdir -p ${docker_graph_dir}/plugins" || exit 1 + # copy the schema file to the docker's workspace + docker cp "${schema_file}" "${GIE_DB_CONTAINER_NAME}:${docker_graph_schema_file}" || exit 1 + info "Successfuly create graph ${graph_name}" + #TODO: support creating an empty graph +} + +###################Remove graph#################### +function do_remove(){ + while [[ $# -gt 0 ]]; do + key="$1" + case $key in + -n | --name) + graph_name="$2" + shift # past argument + shift + ;; + *) + err "unknown option $1" + create_usage + exit 1 + ;; + esac + done + # check graph_name is set + if [ -z "${graph_name}" ]; then + err "graph name is not specified" + remove_usage + exit 1 + fi + # check graph is running inside docker + check_graph_not_running ${graph_name} || err "Can not remove graph ${graph_name}, since a graph with same nameing running." + . 
${HOST_DB_ENV_FILE} + docker_graph_dir="${DATABASE_WORKSPACE}/data/${graph_name}" + # rm -rf the graph directory in the docker's workspace + docker exec "${GIE_DB_CONTAINER_NAME}" bash -c "rm -rf ${docker_graph_dir}" || exit 1 +} + +#################### Import #################### +function do_import(){ + while [[ $# -gt 0 ]]; do + key="$1" + case $key in + -n | --name) + graph_name="$2" + shift # past argument + shift + ;; + -c | --config) + bulk_load_file="$2" + shift + shift + ;; + *) + err "unknown option $1" + import_usage + exit 1 + ;; + esac + done + info "Import data to graph ${graph_name} from ${bulk_load_file}" + # check if the container is running + check_container_running + # check if the bulk_load_file exists + check_file_exists "${bulk_load_file}" + info "bulk_load_file ${bulk_load_file} exists" + + check_graph_not_running ${graph_name} || info "Can not import data to graph ${graph_name}, since it is already running." + . ${HOST_DB_ENV_FILE} + # check graph_schema_file exists in container, if not, let user create graph first + docker_graph_schema_file="${DATABASE_WORKSPACE}/data/${graph_name}/graph.yaml" + docker exec "${GIE_DB_CONTAINER_NAME}" bash -c "[ -f ${docker_graph_schema_file} ] || (echo \"graph ${graph_name} not exists, please create it first\" && exit 1)" + info "Graph Schema exists" + # copy the bulk_load_file to container + bulk_load_file_name=$(basename "${bulk_load_file}") + docker_bulk_load_file="/tmp/${bulk_load_file_name}" + docker cp "${bulk_load_file}" "${GIE_DB_CONTAINER_NAME}:${docker_bulk_load_file}" + + docker_graph_data_dir="${DATABASE_WORKSPACE}/data//${graph_name}/indices" + # currently we can only overwrite the indices, so if it exists, remove it first + docker exec "${GIE_DB_CONTAINER_NAME}" bash -c "[ -d ${docker_graph_data_dir} ] && rm -rf ${docker_graph_data_dir} || exit 0" || exit 1 + + cmd="docker exec ${GIE_DB_CONTAINER_NAME} bash -c \"" + cmd="${cmd} ${DOCKER_DB_GRAPH_IMPORT_BIN} ${docker_graph_schema_file} 
${docker_bulk_load_file} ${docker_graph_data_dir}" + cmd="${cmd} \"" + + info "Running cmd: ${cmd}" + eval ${cmd} || (echo "Fail to import graph to database " && exit 1) + info "Successfuly import data to graph ${graph_name}" +} + +#################### Destroy #################### +function do_destroy() { + info "Destroy database" + docker stop "${GIE_DB_CONTAINER_NAME}" + docker rm "${GIE_DB_CONTAINER_NAME}" + . ${HOST_DB_ENV_FILE} + # rm host data/* + rm -rf ${HOST_DB_HOME}/data/* + + #rm .running + rm ${HOST_DB_RUNNING_FILE} + rm ${HOST_DB_ENV_FILE} + + + info "Finish destroy database" +} + +#################### Start database #################### +function do_start(){ + . ${HOST_DB_ENV_FILE} + info "Starting database..." + + # check whether the .running file exists, if exists, exit + check_process_not_running_in_container "${GIE_DB_CONTAINER_NAME}" "${DOCKER_DB_SERVER_BIN}" "Database is already running" + check_process_not_running_in_container "${GIE_DB_CONTAINER_NAME}" "${DOCKER_DB_COMPILER_BIN}" "Compiler is already running" + + # set trap to do_stop + trap do_stop SIGINT SIGTERM + + graph_name="" + engine_config_file="" + while [[ $# -gt 0 ]]; do + key="$1" + case $key in + -n | --name) + graph_name="$2" + shift # past argument + shift + ;; + -c | --config) + engine_config_file="$2" + shift + shift + ;; + *) + err "unknown option $1" + start_usage + exit 1 + ;; + esac + done + # try parse default_graph from engine_config_file + # generate real engine config file, put it at /tmp/real_engine_config.yaml + if [ -z "${graph_name}" ]; then + graph_name=${DATABASE_CURRENT_GRAPH_NAME} + info "Using user specified graph ${graph_name}" + else + DATABASE_CURRENT_GRAPH_NAME=${graph_name} + fi + + real_engine_config_file="/tmp/real_engine_config.yaml" + generate_real_engine_conf "${engine_config_file}" "${real_engine_config_file}" + + # copy engine config file to container + dst_engine_config_file="${DATABASE_WORKSPACE}/conf/engine_config.yaml" + docker cp 
"${real_engine_config_file}" "${GIE_DB_CONTAINER_NAME}:${dst_engine_config_file}" || (echo "fail to copy $engine_config_file to container" && exit 1) + + + if [ -z "${graph_name}" ]; then + info "graph name is not specified" + info "Using default graph [modern]" + graph_name="modern" + fi + + # check if modern_graph exists in container, get the result as bool + docker_graph_schema_file="${DATABASE_WORKSPACE}/data/${graph_name}/graph.yaml" + wal_file="${DATABASE_WORKSPACE}/data/${graph_name}/indices/init_snapshot.bin" + docker exec "${GIE_DB_CONTAINER_NAME}" bash -c "( [ -f ${docker_graph_schema_file} ] && [ -f ${wal_file} ] && echo \"true\" e) || echo \"false\"" > /tmp/graph_exists + graph_exists=$(cat /tmp/graph_exists) + if [ "${graph_exists}" = "false" ]; then + info "graph ${graph_name} not exists, create it first" + # remove the data/${graph_name} directory in container + docker exec "${GIE_DB_CONTAINER_NAME}" bash -c "rm -rf ${DATABASE_WORKSPACE}/data/${graph_name}" + do_create -n ${graph_name} -c ${HOST_DB_HOME}/examples/modern_graph/modern_graph.yaml + do_import -n ${graph_name} -c ${HOST_DB_HOME}/examples/modern_graph/bulk_load.yaml + info "Successfuly create and import graph ${graph_name}" + else + info "graph ${graph_name} exists, skip create and import" + fi + + # copy to container + + do_stop + ensure_container_running + + # the bulk_load_file shoud place inside ${DATABASE_WORKSPACE}. and should use relative path + info "In start datebase, received graph_name = ${graph_name}, engine_config_file = ${engine_config_file}" + . 
${HOST_DB_ENV_FILE} + docker_server_log_path="${DATABASE_WORKSPACE}/logs/server.log" + graph_schema_file="${DATABASE_WORKSPACE}/data/${graph_name}/graph.yaml" + csr_data_dir="${DATABASE_WORKSPACE}/data/${graph_name}/indices" + docker_graph_plugin_dir=${DATABASE_WORKSPACE}/data/${graph_name}/plugins/ + cmd="docker exec ${GIE_DB_CONTAINER_NAME} bash -c \"" + cmd="${cmd} ${DOCKER_DB_SERVER_BIN} -c ${dst_engine_config_file}" + cmd="${cmd} -g ${graph_schema_file} --data-path ${csr_data_dir}" + cmd="${cmd} --gie-home ${DOCKER_DB_GIE_HOME}" + cmd="${cmd} --plugin-dir ${docker_graph_plugin_dir}" + cmd="${cmd} > ${docker_server_log_path} 2>&1 & \"" + echo "Running cmd: ${cmd}" + # eval command, if fails exist + eval ${cmd} || (echo "Fail to launch hqps server" && exit 1) + sleep 4 + # check whether the process is running + check_process_running_in_container ${GIE_DB_CONTAINER_NAME} ${DOCKER_DB_SERVER_BIN} ", use gs_interactive service get_log -o [dir] to see get logs" + info "Successfuly start server" + + # start compiler + docker_compiler_log_path="${DATABASE_WORKSPACE}/logs/compiler.log" + cmd="docker exec ${GIE_DB_CONTAINER_NAME} bash -c \"" + cmd=${cmd}"java -cp \"${DOCKER_DB_GIE_HOME}/compiler/target/libs/*:${DOCKER_DB_GIE_HOME}/compiler/target/compiler-0.0.1-SNAPSHOT.jar\" " + cmd=${cmd}" -Djna.library.path=${DOCKER_DB_GIE_HOME}/executor/ir/target/release" + cmd=${cmd}" -Dgraph.schema=${graph_schema_file}" + # should error be reported? 
+ # cmd=${cmd}" -Dgraph.stored.procedures.uri=file:${docker_graph_plugin_dir}" + cmd=${cmd}" ${DOCKER_DB_COMPILER_BIN} ${dst_engine_config_file} > ${docker_compiler_log_path} 2>&1 &" + cmd=${cmd}"\"" + info "Running cmd: ${cmd}" + eval ${cmd} + sleep 6 + check_process_running_in_container ${GIE_DB_CONTAINER_NAME} ${DOCKER_DB_COMPILER_BIN} ", use gs_interactive service get_log -o [dir] to see more details" + info "Successfuly start compiler" + # get cypher port from engine config file + # bolt_connector_port=$(parse_yaml "${engine_config_file}" | grep "compiler_endpoint_bolt_connector_port" | awk -F "=" '{print $2}') + info "DataBase service is running..., port is open on :${DATABASE_COMPILER_BOLT_PORT}" + + # if do_start success, we should write current args to ${HOST_DB_RUNNING_FILE} + echo "GRAPH_NAME=${graph_name}" > ${HOST_DB_RUNNING_FILE} + echo "ENGINE_CONFIG_FILE=${engine_config_file}" >> ${HOST_DB_RUNNING_FILE} + # create .lock file + docker_graph_lock_file="${DATABASE_WORKSPACE}/data/${graph_name}/.lock" + docker exec "${GIE_DB_CONTAINER_NAME}" bash -c "touch ${docker_graph_lock_file}" || exit 1 +} + + +#################### Stop database #################### +function do_stop(){ + # if container is not running, do nothing + if [ -f "${HOST_DB_RUNNING_FILE}" ]; then + . 
${HOST_DB_ENV_FILE} + else + info "No running database found, do nothing" + fi + # get graph_name from ${HOST_DB_RUNNING_FILE} + local graph_name=$(sed -n '1p' ${HOST_DB_RUNNING_FILE} | cut -d '=' -f 2) + docker_graph_lock_file="${DATABASE_WORKSPACE}/data/${graph_name}/.lock" + docker exec "${GIE_DB_CONTAINER_NAME}" bash -c "rm -f ${docker_graph_lock_file}" || exit 1 + info "Successfuly remove ${docker_graph_lock_file} file" + # stop the SERVER_BIN process and graph_server process + docker exec "${GIE_DB_CONTAINER_NAME}" bash -c "pkill -f ${DOCKER_DB_SERVER_BIN}" + docker exec "${GIE_DB_CONTAINER_NAME}" bash -c "pkill -f ${DOCKER_DB_COMPILER_BIN}" + sleep 5 + info "Successfuly stop database" +} + + +#################### Get database status #################### +function do_status() { + if [ "$(docker inspect -f '{{.State.Running}}' "${GIE_DB_CONTAINER_NAME}")" = "true" ]; then + info "container ${GIE_DB_CONTAINER_NAME} is running" + else + info "container ${GIE_DB_CONTAINER_NAME} is not running" + info "Please start database first" + fi + . ${HOST_DB_ENV_FILE} + # the container is running but the process is not running + check_process_running_in_container ${GIE_DB_CONTAINER_NAME} ${DOCKER_DB_SERVER_BIN} "The service is stopped or down. Use gs_interactive service get_log -o [dir] to see more details" + check_process_running_in_container ${GIE_DB_CONTAINER_NAME} ${DOCKER_DB_COMPILER_BIN} "The service is stopped or down. 
Use gs_interactive service get_log -o [dir] to see more details" + # get cypher port from engine config file in container + + docker_engine_config_file="${DATABASE_WORKSPACE}/conf/engine_config.yaml" + # copy the engine config file to host's tmp directory + docker cp "${GIE_DB_CONTAINER_NAME}:${docker_engine_config_file}" "${HOST_DB_TMP_DIR}/engine_config.yaml" || exit 1 + eval $(parse_yaml "${HOST_DB_TMP_DIR}/engine_config.yaml") + info "Database service is running..., port is open on :${compiler_endpoint_bolt_connector_port}" +} + + + +#################### Restart #################### +function do_restart() { + # read args from cached file. + # get num lines in file ${HOST_DB_RUNNING_FILE} + num_lines=$(wc -l < ${HOST_DB_RUNNING_FILE}) + if [ ${num_lines} -ne 2 ]; then + err "Error: ${HOST_DB_RUNNING_FILE} should have 2 lines, but got ${num_lines}, something wrong with the file ${HOST_DB_RUNNING_FILE}" + exit 1 + fi + # read args from file + GRAPH_NAME=$(sed -n '1p' ${HOST_DB_RUNNING_FILE} | cut -d '=' -f 2) + ENGINE_CONFIG_FILE=$(sed -n '2p' ${HOST_DB_RUNNING_FILE} | cut -d '=' -f 2) + # parse current args, override the args from file + info "Restarting database..." + while [[ $# -gt 0 ]]; do + key="$1" + case $key in + -n | --name) + GRAPH_NAME="$2" + shift # past argument + shift + ;; + -c | --config) + ENGINE_CONFIG_FILE="$2" + shift + shift + ;; + *) + err "unknown option $1" + restart_usage + exit 1 + ;; + esac + done + do_stop + info "Successfuly stop database" + do_start -n ${GRAPH_NAME} -c ${ENGINE_CONFIG_FILE} + info "Finish restarting database..." +} + +#################### Get log #################### +function do_log(){ + . 
${HOST_DB_ENV_FILE} + while [[ $# -gt 0 ]]; do + key="$1" + case $key in + -o | --output) + directory="$2" + shift # past argument + shift + ;; + *) + err "unknown option $1" + get_log_usage + exit 1 + ;; + esac + done + # check directory is set + if [ -z "${directory}" ]; then + err "output directory is not specified" + get_log_usage + exit 1 + fi + # get log directory in container + docker_log_dir="${DATABASE_WORKSPACE}/logs" + # copy ${docker_log_dir}/compiler.log and ${docker_log_dir}/server.log to ${directory} + docker_compiler_log="${docker_log_dir}/compiler.log" + docker_server_log="${docker_log_dir}/server.log" + # docker cp + docker cp "${GIE_DB_CONTAINER_NAME}:${docker_compiler_log}" "${directory}/compiler.log" || exit 1 + docker cp "${GIE_DB_CONTAINER_NAME}:${docker_server_log}" "${directory}/server.log" || exit 1 + info "Successfuly get log to ${directory}, please check compiler.log and server.log" +} + +# the compiled dynamic libs will be placed at data/${graph_name}/plugins/ +# after compilation, the user need to write the cooresponding yaml, telling the compiler about +# the input and output of the stored procedure +function do_compile() { + ensure_container_running + if [ $# -ne 6 ]; then + err "stored_procedure command need 6 args, but got $#" + compile_usage + exit 1 + fi + + while [[ $# -gt 0 ]]; do + key="$1" + case $key in + -g | --graph) + graph_name="$2" + info "graph_name = ${graph_name}" + shift # past argument + shift + ;; + -i | --input) + file_path="$2" + shift # past argument + shift + ;; + -c | --config) + engine_config="$2" + shift + shift + ;; + *) + err "unknown option $1" + compile_usage + exit 1 + ;; + esac + done + + # check graph_name + if [ -z "${graph_name}" ]; then + err "graph_name is empty" + compile_usage + exit 1 + fi + + # check file_path + check_file_exists "${file_path}" + # check engine_config + check_file_exists "${engine_config}" + # copy engine_config to container + . 
${HOST_DB_ENV_FILE} + # generate_real_engine_conf + real_engine_config_file="/tmp/real_engine_config.yaml" + generate_real_engine_conf "${engine_config}" "${real_engine_config_file}" + # copy to container + docker_engine_config="${DATABASE_WORKSPACE}/conf/engine_config.yaml" + docker cp "${real_engine_config_file}" "${GIE_DB_CONTAINER_NAME}:${docker_engine_config}" || exit 1 + + # get real file_path + file_name=$(basename "${file_path}") + real_file_path=$(realpath "${file_path}") + # check exists + if [ ! -f "${real_file_path}" ]; then + err "file ${real_file_path} not exist" + exit 1 + fi + docker_graph_dir="${DATABASE_WORKSPACE}/data/${graph_name}" + docker_graph_schema="${docker_graph_dir}/graph.yaml" + docker exec "${GIE_DB_CONTAINER_NAME}" bash -c "[ -d ${docker_graph_dir} ] || (echo \"graph ${graph_name} not exists, please create it first\" && exit 1)" + + container_output_dir="${DATABASE_WORKSPACE}/data/${graph_name}/plugins" + cotainer_input_path="/tmp/${file_name}" + # docker cp file to container + cmd="docker cp ${real_file_path} ${GIE_DB_CONTAINER_NAME}:${cotainer_input_path}" + eval ${cmd} || exit 1 + + cmd="docker exec ${GIE_DB_CONTAINER_NAME} bash -c \"" + cmd=${cmd}" ${DOCKER_DB_GEN_BIN}" + cmd=${cmd}" --engine_type=hqps" + cmd=${cmd}" --input=${cotainer_input_path}" + cmd=${cmd}" --work_dir=/tmp/codegen/" + cmd=${cmd}" --ir_conf=${docker_engine_config}" + cmd=${cmd}" --graph_schema_path=${docker_graph_schema}" + cmd=${cmd}" --gie_home=${DOCKER_DB_GIE_HOME}" + cmd=${cmd}" --output_dir=${container_output_dir}" + cmd=${cmd}" \"" + + echo "Running cmd: ${cmd}" + eval ${cmd} || exit 1 + # check output exists + # get the file_name of file_path + file_name="${file_name%.*}" + output_file="${HOST_DB_HOME}/data/${graph_name}/plugins/lib${file_name}.so" + + if [ ! -f "${output_file}" ]; then + err "output file ${output_file} not exist, compilation failed" + exit 1 + fi + info "success generate dynamic lib ${output_file}." 
+} + +function do_database(){ + while [[ $# -gt 0 ]]; do + key="$1" + case $key in + create) + shift + do_create "$@" + exit 0 + ;; + remove) + shift + do_remove "$@" + exit 0 + ;; + import) + shift + do_import "$@" + exit 0 + ;; + *) + err "unknown option $1" + database_usage + exit 1 + ;; + esac + done +} + +function do_service(){ + while [[ $# -gt 0 ]]; do + key="$1" + case $key in + start) + shift + do_start "$@" + exit 0 + ;; + stop) + shift + do_stop "$@" + exit 0 + ;; + restart) + shift + do_restart "$@" + exit 0 + ;; + status) + shift + do_status "$@" + exit 0 + ;; + get_log) + shift + do_log "$@" + exit 0 + ;; + *) + err "unknown option $1" + services_usage + exit 1 + ;; + esac + done +} + +function do_procedure(){ + while [[ $# -gt 0 ]]; do + key="$1" + case $key in + compile) + shift + do_compile "$@" + exit 0 + ;; + show) + shift + err "show stored procedure not implemented yet." + exit 1 + ;; + *) + err "unknown option $1" + procedure_usage + exit 1 + ;; + esac + done + procedure_usage +} + +#################### Entry #################### +if [ $# -eq 0 ]; then + usage + exit 1 +fi + +while [[ $# -gt 0 ]]; do + key="$1" + + case $key in + -h | --help) + usage + exit + ;; + init) + shift + info "Start initiating database..." 
+ do_init "$@" + exit 0 + ;; + database) + shift + do_database "$@" + exit 0 + ;; + service) + shift + do_service "$@" + exit 0 + ;; + procedure) + shift + do_procedure "$@" + exit 0 + ;; + destroy) + shift + do_destroy "$@" + exit 0 + ;; + gen_conf) + shift + do_gen_conf "$@" + exit 0 + ;; + *) # unknown option + err "unknown option $1" + usage + exit 1 + ;; + esac +done + + + + diff --git a/flex/interactive/conf/engine_config.yaml b/flex/interactive/conf/engine_config.yaml new file mode 100644 index 000000000000..0bb4487bda46 --- /dev/null +++ b/flex/interactive/conf/engine_config.yaml @@ -0,0 +1,13 @@ +log_level: INFO # default INFO +default_graph: modern # configure the graph to be loaded while starting the service, if graph name not specified +compute_engine: + shard_num: 1 # the number of shared workers, default 1 +compiler: + planner: + is_on: true + opt: RBO + rules: + - FilterMatchRule + - FilterIntoJoinRule + - NotExistToAntiJoinRule + query_timeout: 20000 # query timeout in milliseconds, default 2000 \ No newline at end of file diff --git a/flex/interactive/conf/interactive.properties b/flex/interactive/conf/interactive.properties deleted file mode 100755 index bd1ee716ef00..000000000000 --- a/flex/interactive/conf/interactive.properties +++ /dev/null @@ -1,8 +0,0 @@ -engine.type: hiactor -hiactor.hosts: localhost:10000 -graph.store: exp -graph.schema: file:../data/ldbc/graph.json -graph.stored.procedures.uri: file:/tmp -graph.planner: {"isOn":true,"opt":"RBO","rules":["FilterMatchRule"]} -gremlin.server.disabled: true -neo4j.bolt.server.port: 7687 \ No newline at end of file diff --git a/flex/interactive/conf/interactive.yaml b/flex/interactive/conf/interactive.yaml index 969b6ca80401..12ca707b2fab 100755 --- a/flex/interactive/conf/interactive.yaml +++ b/flex/interactive/conf/interactive.yaml @@ -1,23 +1,3 @@ ---- -version: 0.0.1 -directories: - workspace: /home/graphscope/interactive/ - subdirs: - data: data # by default data, relative to ${workspace} 
- conf: conf # by default conf, relative to ${workspace} - logs: logs # by default logs, relative to ${workspace} -logLevel: INFO # default INFO -default_graph: modern # configure the graph to be loaded while starting the service, if graph name not specified - # may include other configuration items of other engines -compute_engine: - type: hiactor - hosts: - - localhost:10000 # currently only one host can be specified - shared_num: 1 # the number of shared workers, default 1 -compiler: - planner: {"isOn":true,"opt":"RBO","rules":["FilterMatchRule"]} # Confirm这个配置 - endpoint: - default_listen_address: localhost # default localhost - bolt_connector: # for cypher, there may be other connectors, such as bolt_connector, https_connector - enabled: true # default false - port: 7687 +version: v0.0.2 +volumes: + - /home/zhanglei/code/lei/gie-db/GraphScope/flex/interactive/examples/modern_graph:/home/modern_graph/ \ No newline at end of file diff --git a/flex/interactive/data/ldbc/graph.json b/flex/interactive/data/ldbc/graph.json deleted file mode 100755 index f16dd1710336..000000000000 --- a/flex/interactive/data/ldbc/graph.json +++ /dev/null @@ -1,128 +0,0 @@ -{ - "entities": [ - { - "label": { - "id": 1, - "name": "software" - }, - "columns": [ - { - "key": { - "id": 4, - "name": "id" - }, - "data_type": 1, - "is_primary_key": false - }, - { - "key": { - "id": 0, - "name": "name" - }, - "data_type": 4, - "is_primary_key": false - }, - { - "key": { - "id": 2, - "name": "lang" - }, - "data_type": 4, - "is_primary_key": false - } - ] - }, - { - "label": { - "id": 0, - "name": "person" - }, - "columns": [ - { - "key": { - "id": 4, - "name": "id" - }, - "data_type": 1, - "is_primary_key": false - }, - { - "key": { - "id": 0, - "name": "name" - }, - "data_type": 4, - "is_primary_key": false - }, - { - "key": { - "id": 1, - "name": "age" - }, - "data_type": 1, - "is_primary_key": false - } - ] - } - ], - "relations": [ - { - "label": { - "id": 0, - "name": "knows" - }, - 
"entity_pairs": [ - { - "src": { - "id": 0, - "name": "person" - }, - "dst": { - "id": 0, - "name": "person" - } - } - ], - "columns": [ - { - "key": { - "id": 3, - "name": "weight" - }, - "data_type": 3, - "is_primary_key": false - } - ] - }, - { - "label": { - "id": 1, - "name": "created" - }, - "entity_pairs": [ - { - "src": { - "id": 0, - "name": "person" - }, - "dst": { - "id": 1, - "name": "software" - } - } - ], - "columns": [ - { - "key": { - "id": 3, - "name": "weight" - }, - "data_type": 3, - "is_primary_key": false - } - ] - } - ], - "is_table_id": true, - "is_column_id": false -} \ No newline at end of file diff --git a/flex/interactive/data/ldbc/graph.yaml b/flex/interactive/data/ldbc/graph.yaml deleted file mode 100755 index c37d4731b071..000000000000 --- a/flex/interactive/data/ldbc/graph.yaml +++ /dev/null @@ -1,70 +0,0 @@ -name: modern # then must have a modern dir under ${data} directory -store_type: mutable_csr # v6d, groot, gart -stored_procedures: - directory: plugins # default plugins, relative to ${workspace}/${name} -schema: - vertex_types: - - type_name: person - x_csr_params: - max_vertex_num: 100 - properties: - - property_id: 0 - property_name: id - property_type: - primitive_type: DT_SIGNED_INT64 - - property_id: 1 - property_name: name - property_type: - primitive_type: DT_STRING - - property_id: 2 - property_name: age - property_type: - primitive_type: DT_SIGNED_INT32 - primary_keys: - - id - - type_name: software - x_csr_params: - max_vertex_num: 100 - properties: - - property_id: 0 - property_name: id - property_type: - primitive_type: DT_SIGNED_INT64 - x_csr_params: - - property_id: 1 - property_name: name - property_type: - primitive_type: DT_STRING - - property_id: 2 - property_name: lang - property_type: - primitive_type: DT_STRING - primary_keys: - - id - edge_types: - - type_name: knows - x_csr_params: - incoming_edge_strategy: None - outgoing_edge_strategy: Multiple - vertex_type_pair_relations: - source_vertex: person - 
destination_vertex: person - relation: MANY_TO_MANY - properties: - - property_id: 0 - property_name: weight - property_type: - primitive_type: DT_DOUBLE - - type_name: created - x_csr_params: - incoming_edge_strategy: None - outgoing_edge_strategy: Single - vertex_type_pair_relations: - source_vertex: person - destination_vertex: software - relation: ONE_TO_MANY - properties: - - property_id: 0 - property_name: weight - property_type: - primitive_type: DT_DOUBLE diff --git a/flex/interactive/docker/interactive-runtime.Dockerfile b/flex/interactive/docker/interactive-runtime.Dockerfile index e5a480644a90..399b67f91d4a 100755 --- a/flex/interactive/docker/interactive-runtime.Dockerfile +++ b/flex/interactive/docker/interactive-runtime.Dockerfile @@ -1,4 +1,4 @@ -FROM registry.cn-hongkong.aliyuncs.com/graphscope/hqps-server-base:v0.0.4 +FROM registry.cn-hongkong.aliyuncs.com/graphscope/hqps-server-base:v0.0.6 ARG CI=false # change bash as default @@ -9,4 +9,5 @@ RUN cd /home/graphscope/ && git clone -b main --single-branch https://github.com cd GraphScope/flex && mkdir build && cd build && cmake .. -DBUILD_DOC=OFF && sudo make -j install # install graphscope GIE -RUN . /home/graphscope/.cargo/env && cd /home/graphscope/GraphScope/interactive_engine/compiler && make build +RUN . 
/home/graphscope/.cargo/env && cd /home/graphscope/GraphScope/interactive_engine && \ + mvn clean install -DskipTests -Drevision=0.0.1-SNAPSHOT -Pexperimental diff --git a/flex/interactive/examples/modern_graph b/flex/interactive/examples/modern_graph deleted file mode 120000 index 8ed59122aab3..000000000000 --- a/flex/interactive/examples/modern_graph +++ /dev/null @@ -1 +0,0 @@ -../../storages/rt_mutable_graph/modern_graph/ \ No newline at end of file diff --git a/flex/storages/rt_mutable_graph/modern_graph/bulk_load.yaml b/flex/interactive/examples/modern_graph/bulk_load.yaml similarity index 83% rename from flex/storages/rt_mutable_graph/modern_graph/bulk_load.yaml rename to flex/interactive/examples/modern_graph/bulk_load.yaml index b4e690f5bf49..4601075c8266 100644 --- a/flex/storages/rt_mutable_graph/modern_graph/bulk_load.yaml +++ b/flex/interactive/examples/modern_graph/bulk_load.yaml @@ -3,11 +3,21 @@ loading_config: data_source: scheme: file # file, oss, s3, hdfs; only file is supported now # location: # specify it or use FLEX_DATA_DIR env. + location: /home/graphscope/default_graph/ import_option: init # append, overwrite, only init is supported now format: type: csv metadata: delimiter: "|" # other loading configuration places here + header_row: true # whether to use the first row as the header + quoting: false + quote_char: '"' + double_quote: true + escape_char: '\' + escaping: false + block_size: 4MB + batch_reader: true + vertex_mappings: - type_name: person # must align with the schema inputs: @@ -31,7 +41,7 @@ vertex_mappings: column_mappings: - column: index: 0 # can be omitted if the index is the same as the property index - name: id # can be omitted if the name is not known + name: id # can be omitted if the name is not known or header_row is set to true. 
property: id # must align with the schema - column: index: 1 diff --git a/flex/interactive/examples/modern_graph/count_vertex_num.cypher b/flex/interactive/examples/modern_graph/count_vertex_num.cypher new file mode 100644 index 000000000000..cca16c40269d --- /dev/null +++ b/flex/interactive/examples/modern_graph/count_vertex_num.cypher @@ -0,0 +1 @@ +MATCH(v:person { id: $personId}) RETURN COUNT(v); \ No newline at end of file diff --git a/flex/storages/rt_mutable_graph/modern_graph/modern_graph.yaml b/flex/interactive/examples/modern_graph/modern_graph.yaml similarity index 96% rename from flex/storages/rt_mutable_graph/modern_graph/modern_graph.yaml rename to flex/interactive/examples/modern_graph/modern_graph.yaml index 7823b3fd7561..df9b605292c8 100644 --- a/flex/storages/rt_mutable_graph/modern_graph/modern_graph.yaml +++ b/flex/interactive/examples/modern_graph/modern_graph.yaml @@ -2,8 +2,6 @@ name: modern # then must have a modern dir under ${data} directory store_type: mutable_csr # v6d, groot, gart stored_procedures: directory: plugins # default plugins, relative to ${workspace}/${name} - enable_lists: - - ldbc_ic1 schema: vertex_types: - type_id: 0 diff --git a/flex/storages/rt_mutable_graph/modern_graph/person.csv b/flex/interactive/examples/modern_graph/person.csv similarity index 100% rename from flex/storages/rt_mutable_graph/modern_graph/person.csv rename to flex/interactive/examples/modern_graph/person.csv diff --git a/flex/storages/rt_mutable_graph/modern_graph/person_created_software.csv b/flex/interactive/examples/modern_graph/person_created_software.csv similarity index 100% rename from flex/storages/rt_mutable_graph/modern_graph/person_created_software.csv rename to flex/interactive/examples/modern_graph/person_created_software.csv diff --git a/flex/storages/rt_mutable_graph/modern_graph/person_knows_person.csv b/flex/interactive/examples/modern_graph/person_knows_person.csv similarity index 100% rename from 
flex/storages/rt_mutable_graph/modern_graph/person_knows_person.csv rename to flex/interactive/examples/modern_graph/person_knows_person.csv diff --git a/flex/storages/rt_mutable_graph/modern_graph/software.csv b/flex/interactive/examples/modern_graph/software.csv similarity index 100% rename from flex/storages/rt_mutable_graph/modern_graph/software.csv rename to flex/interactive/examples/modern_graph/software.csv diff --git a/flex/resources/queries/ic/adhoc/ic6_adhoc.cypher b/flex/resources/queries/ic/adhoc/ic6_adhoc.cypher index a5598d9e7ea9..be628ae543e5 100644 --- a/flex/resources/queries/ic/adhoc/ic6_adhoc.cypher +++ b/flex/resources/queries/ic/adhoc/ic6_adhoc.cypher @@ -1,3 +1,3 @@ -MATCH (p_:PERSON {id: 6597069812321})-[:KNOWS*1..3]-(other:PERSON)<-[:HASCREATOR]-(p:POST)-[:HASTAG]->(t:TAG {name: "William_Wordsworth"}), +MATCH (p_:PERSON {id: 30786325579101})-[:KNOWS*1..3]-(other:PERSON)<-[:HASCREATOR]-(p:POST)-[:HASTAG]->(t:TAG {name: "Shakira"}), (p:POST)-[:HASTAG]->(otherTag:TAG) WHERE otherTag <> t RETURN otherTag.name as name, count(distinct p) as postCnt -ORDER BY postCnt desc, name asc LIMIT 10 \ No newline at end of file +ORDER BY postCnt desc, name asc LIMIT 10; \ No newline at end of file diff --git a/flex/storages/rt_mutable_graph/CMakeLists.txt b/flex/storages/rt_mutable_graph/CMakeLists.txt index ecd664f540b8..35ed3b1379fe 100644 --- a/flex/storages/rt_mutable_graph/CMakeLists.txt +++ b/flex/storages/rt_mutable_graph/CMakeLists.txt @@ -7,6 +7,12 @@ file(GLOB_RECURSE RT_MUTABLE_GRAPH_SRC_FILES "${CMAKE_CURRENT_SOURCE_DIR}/*.cc") add_library(flex_rt_mutable_graph SHARED ${RT_MUTABLE_GRAPH_SRC_FILES}) target_link_libraries(flex_rt_mutable_graph ${LIBGRAPELITE_LIBRARIES} ${YAML_CPP_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT}) +if (ARROW_SHARED_LIB) + target_link_libraries(flex_rt_mutable_graph ${ARROW_SHARED_LIB}) +else() + target_link_libraries(flex_rt_mutable_graph ${ARROW_STATIC_LIB}) +endif() + install(TARGETS flex_rt_mutable_graph RUNTIME DESTINATION 
bin ARCHIVE DESTINATION lib diff --git a/flex/storages/rt_mutable_graph/loader/basic_fragment_loader.cc b/flex/storages/rt_mutable_graph/loader/basic_fragment_loader.cc new file mode 100644 index 000000000000..4da485d490d3 --- /dev/null +++ b/flex/storages/rt_mutable_graph/loader/basic_fragment_loader.cc @@ -0,0 +1,95 @@ + +/** Copyright 2020 Alibaba Group Holding Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flex/storages/rt_mutable_graph/loader/basic_fragment_loader.h" + +namespace gs { + +BasicFragmentLoader::BasicFragmentLoader(const Schema& schema) + : schema_(schema), + vertex_label_num_(schema_.vertex_label_num()), + edge_label_num_(schema_.edge_label_num()) { + vertex_data_.resize(vertex_label_num_); + ie_.resize(vertex_label_num_ * vertex_label_num_ * edge_label_num_, NULL); + oe_.resize(vertex_label_num_ * vertex_label_num_ * edge_label_num_, NULL); + lf_indexers_.resize(vertex_label_num_); + + init_vertex_data(); +} + +void BasicFragmentLoader::init_vertex_data() { + for (label_t v_label = 0; v_label < vertex_label_num_; v_label++) { + auto& v_data = vertex_data_[v_label]; + auto label_name = schema_.get_vertex_label_name(v_label); + auto& property_types = schema_.get_vertex_properties(v_label); + auto& property_names = schema_.get_vertex_property_names(v_label); + v_data.init(property_names, property_types, + schema_.get_vertex_storage_strategies(label_name), + schema_.get_max_vnum(label_name)); + } + VLOG(10) << 
"Finish init vertex data"; +} + +void BasicFragmentLoader::LoadFragment(MutablePropertyFragment& res_fragment) { + CHECK(res_fragment.ie_.empty()) << "Fragment is not empty"; + CHECK(res_fragment.oe_.empty()) << "Fragment is not empty"; + CHECK(res_fragment.vertex_data_.empty()) << "Fragment is not empty"; + + res_fragment.schema_ = schema_; + res_fragment.vertex_label_num_ = vertex_label_num_; + res_fragment.edge_label_num_ = edge_label_num_; + res_fragment.ie_.swap(ie_); + res_fragment.oe_.swap(oe_); + res_fragment.vertex_data_.swap(vertex_data_); + res_fragment.lf_indexers_.swap(lf_indexers_); + VLOG(10) << "Finish Building Fragment, " << res_fragment.vertex_label_num_ + << " vertices labels, " << res_fragment.edge_label_num_ + << " edges labels"; +} + +void BasicFragmentLoader::AddVertexBatch( + label_t v_label, const std::vector& vids, + const std::vector>& props) { + auto& table = vertex_data_[v_label]; + CHECK(props.size() == table.col_num()); + for (auto i = 0; i < props.size(); ++i) { + CHECK(props[i].size() == vids.size()) + << "vids size: " << vids.size() << ", props size: " << props.size() + << ", props[i] size: " << props[i].size(); + } + auto dst_columns = table.column_ptrs(); + for (auto j = 0; j < props.size(); ++j) { + auto& cur_vec = props[j]; + for (auto i = 0; i < vids.size(); ++i) { + auto index = vids[i]; + dst_columns[j]->set_any(index, cur_vec[i]); + } + } +} + +void BasicFragmentLoader::FinishAddingVertex( + label_t v_label, const IdIndexer& indexer) { + CHECK(v_label < vertex_label_num_); + build_lf_indexer(indexer, lf_indexers_[v_label]); +} + +const LFIndexer& BasicFragmentLoader::GetLFIndexer( + label_t v_label) const { + CHECK(v_label < vertex_label_num_); + return lf_indexers_[v_label]; +} + +} // namespace gs diff --git a/flex/storages/rt_mutable_graph/loader/basic_fragment_loader.h b/flex/storages/rt_mutable_graph/loader/basic_fragment_loader.h new file mode 100644 index 000000000000..85fed7cd2c24 --- /dev/null +++ 
b/flex/storages/rt_mutable_graph/loader/basic_fragment_loader.h @@ -0,0 +1,135 @@ +/** Copyright 2020 Alibaba Group Holding Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef STORAGES_RT_MUTABLE_GRAPH_LOADER_BASIC_FRAGMENT_LOADER_H_ +#define STORAGES_RT_MUTABLE_GRAPH_LOADER_BASIC_FRAGMENT_LOADER_H_ + +#include "flex/storages/rt_mutable_graph/mutable_property_fragment.h" +#include "flex/storages/rt_mutable_graph/schema.h" + +namespace gs { + +template +TypedMutableCsrBase* create_typed_csr(EdgeStrategy es) { + if (es == EdgeStrategy::kSingle) { + return new SingleMutableCsr(); + } else if (es == EdgeStrategy::kMultiple) { + return new MutableCsr(); + } else if (es == EdgeStrategy::kNone) { + return new EmptyCsr(); + } + LOG(FATAL) << "not support edge strategy or edge data type"; +} + +// FragmentLoader should use this BasicFragmentLoader to construct +// mutable_csr_fragment. 
+class BasicFragmentLoader { + public: + BasicFragmentLoader(const Schema& schema); + + void LoadFragment(MutablePropertyFragment& res_fragment); + + // props vector is column_num X batch_size + void AddVertexBatch(label_t v_label, const std::vector& vids, + const std::vector>& props); + + inline void SetVertexProperty(label_t v_label, size_t col_ind, vid_t vid, + Any&& prop) { + auto& table = vertex_data_[v_label]; + auto dst_columns = table.column_ptrs(); + CHECK(col_ind < dst_columns.size()); + dst_columns[col_ind]->set_any(vid, prop); + } + + void FinishAddingVertex(label_t v_label, + const IdIndexer& indexer); + + template + void AddNoPropEdgeBatch(label_t src_label_id, label_t dst_label_id, + label_t edge_label_id) { + size_t index = src_label_id * vertex_label_num_ * edge_label_num_ + + dst_label_id * edge_label_num_ + edge_label_id; + CHECK(ie_[index] == NULL); + CHECK(oe_[index] == NULL); + auto src_label_name = schema_.get_vertex_label_name(src_label_id); + auto dst_label_name = schema_.get_vertex_label_name(dst_label_id); + auto edge_label_name = schema_.get_edge_label_name(edge_label_id); + EdgeStrategy oe_strategy = schema_.get_outgoing_edge_strategy( + src_label_name, dst_label_name, edge_label_name); + EdgeStrategy ie_strategy = schema_.get_incoming_edge_strategy( + src_label_name, dst_label_name, edge_label_name); + ie_[index] = create_typed_csr(ie_strategy); + oe_[index] = create_typed_csr(oe_strategy); + ie_[index]->batch_init(0, {}); + oe_[index]->batch_init(0, {}); + } + + template + void PutEdges(label_t src_label_id, label_t dst_label_id, + label_t edge_label_id, + const std::vector>& edges, + const std::vector& ie_degree, + const std::vector& oe_degree) { + size_t index = src_label_id * vertex_label_num_ * edge_label_num_ + + dst_label_id * edge_label_num_ + edge_label_id; + auto& src_indexer = lf_indexers_[src_label_id]; + auto& dst_indexer = lf_indexers_[dst_label_id]; + CHECK(ie_[index] == NULL); + CHECK(oe_[index] == NULL); + auto 
src_label_name = schema_.get_vertex_label_name(src_label_id); + auto dst_label_name = schema_.get_vertex_label_name(dst_label_id); + auto edge_label_name = schema_.get_edge_label_name(edge_label_id); + EdgeStrategy oe_strategy = schema_.get_outgoing_edge_strategy( + src_label_name, dst_label_name, edge_label_name); + EdgeStrategy ie_strategy = schema_.get_incoming_edge_strategy( + src_label_name, dst_label_name, edge_label_name); + auto ie_csr = create_typed_csr(ie_strategy); + auto oe_csr = create_typed_csr(oe_strategy); + CHECK(ie_degree.size() == dst_indexer.size()); + CHECK(oe_degree.size() == src_indexer.size()); + + ie_csr->batch_init(dst_indexer.size(), ie_degree); + oe_csr->batch_init(src_indexer.size(), oe_degree); + + for (auto& edge : edges) { + ie_csr->batch_put_edge(std::get<1>(edge), std::get<0>(edge), + std::get<2>(edge)); + oe_csr->batch_put_edge(std::get<0>(edge), std::get<1>(edge), + std::get<2>(edge)); + } + ie_[index] = ie_csr; + oe_[index] = oe_csr; + VLOG(10) << "Finish adding edge batch of size: " << edges.size(); + } + + Table& GetVertexTable(size_t ind) { + CHECK(ind < vertex_data_.size()); + return vertex_data_[ind]; + } + + // get lf_indexer + const LFIndexer& GetLFIndexer(label_t v_label) const; + + private: + void init_vertex_data(); + const Schema& schema_; + size_t vertex_label_num_, edge_label_num_; + std::vector> lf_indexers_; + std::vector ie_, oe_; + std::vector vertex_data_; +}; +} // namespace gs + +#endif // STORAGES_RT_MUTABLE_GRAPH_LOADER_BASIC_FRAGMENT_LOADER_H_ \ No newline at end of file diff --git a/flex/storages/rt_mutable_graph/loader/csv_fragment_loader.cc b/flex/storages/rt_mutable_graph/loader/csv_fragment_loader.cc new file mode 100644 index 000000000000..65606d9aec2c --- /dev/null +++ b/flex/storages/rt_mutable_graph/loader/csv_fragment_loader.cc @@ -0,0 +1,1364 @@ +/** Copyright 2020 Alibaba Group Holding Limited. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flex/storages/rt_mutable_graph/loader/csv_fragment_loader.h" +#include "flex/engines/hqps_db/core/utils/hqps_utils.h" + +namespace gs { + +static void preprocess_line(char* line) { + size_t len = strlen(line); + while (len >= 0) { + if (line[len] != '\0' && line[len] != '\n' && line[len] != '\r' && + line[len] != ' ' && line[len] != '\t') { + break; + } else { + --len; + } + } + line[len + 1] = '\0'; +} + +static std::vector read_header(const std::string& file_name, + char delimiter) { + char line_buf[4096]; + FILE* fin = fopen(file_name.c_str(), "r"); + if (fgets(line_buf, 4096, fin) == NULL) { + LOG(FATAL) << "Failed to read header from file: " << file_name; + } + preprocess_line(line_buf); + const char* cur = line_buf; + std::vector res_vec; + while (*cur != '\0') { + const char* tmp = cur; + while (*tmp != '\0' && *tmp != delimiter) { + ++tmp; + } + + std::string_view sv(cur, tmp - cur); + res_vec.emplace_back(sv); + cur = tmp + 1; + } + return res_vec; +} + +static void put_delimiter_option(const LoadingConfig& loading_config, + arrow::csv::ParseOptions& parse_options) { + auto delimiter_str = loading_config.GetDelimiter(); + if (delimiter_str.size() != 1) { + LOG(FATAL) << "Delimiter should be a single character"; + } + parse_options.delimiter = delimiter_str[0]; +} + +static bool put_skip_rows_option(const LoadingConfig& loading_config, + arrow::csv::ReadOptions& read_options) { + bool 
header_row = loading_config.GetHasHeaderRow(); + if (header_row) { + read_options.skip_rows = 1; + } else { + read_options.skip_rows = 0; + } + return header_row; +} + +static void put_escape_char_option(const LoadingConfig& loading_config, + arrow::csv::ParseOptions& parse_options) { + auto escape_str = loading_config.GetEscapeChar(); + if (escape_str.size() != 1) { + LOG(FATAL) << "Escape char should be a single character"; + } + parse_options.escape_char = escape_str[0]; + parse_options.escaping = loading_config.GetIsEscaping(); +} + +static void put_block_size_option(const LoadingConfig& loading_config, + arrow::csv::ReadOptions& read_options) { + auto batch_size = loading_config.GetBatchSize(); + if (batch_size <= 0) { + LOG(FATAL) << "Block size should be positive"; + } + read_options.block_size = batch_size; +} + +static void put_quote_char_option(const LoadingConfig& loading_config, + arrow::csv::ParseOptions& parse_options) { + auto quoting_str = loading_config.GetQuotingChar(); + if (quoting_str.size() != 1) { + LOG(FATAL) << "Quote char should be a single character"; + } + parse_options.quote_char = quoting_str[0]; + parse_options.quoting = loading_config.GetIsQuoting(); + parse_options.double_quote = loading_config.GetIsDoubleQuoting(); +} + +static void put_column_names_option(const LoadingConfig& loading_config, + bool header_row, + const std::string& file_path, + char delimiter, + arrow::csv::ReadOptions& read_options) { + std::vector all_column_names; + if (header_row) { + all_column_names = read_header(file_path, delimiter); + // It is possible that there exists duplicate column names in the header, + // transform them to unique names + std::unordered_map name_count; + for (auto& name : all_column_names) { + if (name_count.find(name) == name_count.end()) { + name_count[name] = 1; + } else { + name_count[name]++; + } + } + VLOG(10) << "before Got all column names: " << all_column_names.size() + << gs::to_string(all_column_names); + for (auto i = 0; 
i < all_column_names.size(); ++i) { + auto& name = all_column_names[i]; + if (name_count[name] > 1) { + auto cur_cnt = name_count[name]; + name_count[name] -= 1; + all_column_names[i] = name + "_" + std::to_string(cur_cnt); + } + } + VLOG(10) << "Got all column names: " << all_column_names.size() + << gs::to_string(all_column_names); + } else { + // just get the number of columns. + size_t num_cols = 0; + { + auto tmp = read_header(file_path, delimiter); + num_cols = tmp.size(); + } + all_column_names.resize(num_cols); + for (auto i = 0; i < all_column_names.size(); ++i) { + all_column_names[i] = std::string("f") + std::to_string(i); + } + } + read_options.column_names = all_column_names; + VLOG(10) << "Got all column names: " << all_column_names.size() + << gs::to_string(all_column_names); +} + +static void check_edge_invariant( + const Schema& schema, + const std::vector>& + column_mappings, + size_t src_col_ind, size_t dst_col_ind, label_t src_label_i, + label_t dst_label_i, label_t edge_label_i) { + // TODO(zhanglei): Check column mappings after multiple property on edge is + // supported + if (column_mappings.size() > 1) { + LOG(FATAL) << "Edge column mapping must be less than 1"; + } + if (column_mappings.size() > 0) { + auto& mapping = column_mappings[0]; + if (std::get<0>(mapping) == src_col_ind || + std::get<0>(mapping) == dst_col_ind) { + LOG(FATAL) << "Edge column mappings must not contain src_col_ind or " + "dst_col_ind"; + } + auto src_label_name = schema.get_vertex_label_name(src_label_i); + auto dst_label_name = schema.get_vertex_label_name(dst_label_i); + auto edge_label_name = schema.get_edge_label_name(edge_label_i); + // check property exists in schema + if (!schema.edge_has_property(src_label_name, dst_label_name, + edge_label_name, std::get<2>(mapping))) { + LOG(FATAL) << "property " << std::get<2>(mapping) + << " not exists in schema for edge triplet " << src_label_name + << " -> " << edge_label_name << " -> " << dst_label_name; + } + } +} + 
+static void set_vertex_properties(gs::ColumnBase* col, + std::shared_ptr array, + const std::vector& vids) { + auto type = array->type(); + size_t cur_ind = 0; + if (type == arrow::int64()) { + for (auto j = 0; j < array->num_chunks(); ++j) { + auto casted = + std::static_pointer_cast(array->chunk(j)); + for (auto k = 0; k < casted->length(); ++k) { + col->set_any( + vids[cur_ind++], + std::move(AnyConverter::to_any(casted->Value(k)))); + } + } + } else if (type == arrow::int32()) { + for (auto j = 0; j < array->num_chunks(); ++j) { + auto casted = + std::static_pointer_cast(array->chunk(j)); + for (auto k = 0; k < casted->length(); ++k) { + col->set_any( + vids[cur_ind++], + std::move(AnyConverter::to_any(casted->Value(k)))); + } + } + } else if (type == arrow::float64()) { + for (auto j = 0; j < array->num_chunks(); ++j) { + auto casted = + std::static_pointer_cast(array->chunk(j)); + for (auto k = 0; k < casted->length(); ++k) { + col->set_any(vids[cur_ind++], + std::move(AnyConverter::to_any(casted->Value(k)))); + } + } + } else if (type == arrow::large_utf8()) { + for (auto j = 0; j < array->num_chunks(); ++j) { + auto casted = + std::static_pointer_cast(array->chunk(j)); + for (auto k = 0; k < casted->length(); ++k) { + auto str = casted->GetView(k); + std::string_view str_view(str.data(), str.size()); + col->set_any( + vids[cur_ind++], + std::move(AnyConverter::to_any(str_view))); + } + for (auto k = 0; k < std::min((int64_t) 10, casted->length()); ++k) { + VLOG(10) << "set vertex property: " << vids[k] << ", " + << casted->GetString(k) << " " + << col->get(vids[k]).to_string(); + } + } + } else if (type == arrow::utf8()) { + for (auto j = 0; j < array->num_chunks(); ++j) { + auto casted = + std::static_pointer_cast(array->chunk(j)); + for (auto k = 0; k < casted->length(); ++k) { + auto str = casted->GetView(k); + std::string_view str_view(str.data(), str.size()); + col->set_any( + vids[cur_ind++], + std::move(AnyConverter::to_any(str_view))); + } + } + } 
else { + LOG(FATAL) << "Not support type: " << type->ToString(); + } +} + +template +static void append_edges( + std::shared_ptr src_col, + std::shared_ptr dst_col, + const LFIndexer& src_indexer, const LFIndexer& dst_indexer, + std::vector>& edata_cols, + std::vector>& parsed_edges, + std::vector& ie_degree, std::vector& oe_degree) { + CHECK(src_col->length() == dst_col->length()); + + auto old_size = parsed_edges.size(); + parsed_edges.resize(old_size + src_col->length()); + VLOG(10) << "resize parsed_edges from" << old_size << " to " + << parsed_edges.size(); + + auto src_col_thread = std::thread([&]() { + size_t cur_ind = old_size; + for (auto i = 0; i < src_col->length(); ++i) { + auto src_vid = src_indexer.get_index(src_col->Value(i)); + std::get<0>(parsed_edges[cur_ind++]) = src_vid; + oe_degree[src_vid]++; + } + }); + auto dst_col_thread = std::thread([&]() { + size_t cur_ind = old_size; + for (auto i = 0; i < dst_col->length(); ++i) { + auto dst_vid = dst_indexer.get_index(dst_col->Value(i)); + std::get<1>(parsed_edges[cur_ind++]) = dst_vid; + ie_degree[dst_vid]++; + } + }); + src_col_thread.join(); + dst_col_thread.join(); + + // if EDATA_T is grape::EmptyType, no need to read columns + if constexpr (!std::is_same::value) { + CHECK(edata_cols.size() == 1); + auto edata_col = edata_cols[0]; + CHECK(src_col->length() == edata_col->length()); + size_t cur_ind = old_size; + auto type = edata_col->type(); + if (type != CppTypeToArrowType::TypeValue()) { + LOG(FATAL) << "Inconsistent data type, expect " + << CppTypeToArrowType::TypeValue()->ToString() + << ", but got " << type->ToString(); + } + + using arrow_array_type = + typename gs::CppTypeToArrowType::ArrayType; + // cast chunk to EDATA_T array + auto data = std::static_pointer_cast(edata_col); + for (auto j = 0; j < edata_col->length(); ++j) { + if constexpr (std::is_same::value || + std::is_same::value) { + std::get<2>(parsed_edges[cur_ind++]) = data->GetString(j); + } else { + 
std::get<2>(parsed_edges[cur_ind++]) = data->Value(j); + } + } + VLOG(10) << "Finish inserting: " << src_col->length() << " edges"; + } +} + +template +static void append_edges( + std::shared_ptr src_col, + std::shared_ptr dst_col, + const LFIndexer& src_indexer, const LFIndexer& dst_indexer, + std::vector>& edata_cols, + std::vector>& parsed_edges, + std::vector& ie_degree, std::vector& oe_degree) { + CHECK(src_col->length() == dst_col->length()); + CHECK(src_col->type() == arrow::int64()); + CHECK(dst_col->type() == arrow::int64()); + + auto old_size = parsed_edges.size(); + parsed_edges.resize(old_size + src_col->length()); + VLOG(10) << "resize parsed_edges from" << old_size << " to " + << parsed_edges.size(); + + auto src_col_thread = std::thread([&]() { + size_t cur_ind = old_size; + for (auto i = 0; i < src_col->num_chunks(); ++i) { + auto chunk = src_col->chunk(i); + CHECK(chunk->type() == arrow::int64()); + auto casted_chunk = std::static_pointer_cast(chunk); + for (auto j = 0; j < casted_chunk->length(); ++j) { + auto src_vid = src_indexer.get_index(casted_chunk->Value(j)); + std::get<0>(parsed_edges[cur_ind++]) = src_vid; + oe_degree[src_vid]++; + } + } + }); + auto dst_col_thread = std::thread([&]() { + size_t cur_ind = old_size; + for (auto i = 0; i < dst_col->num_chunks(); ++i) { + auto chunk = dst_col->chunk(i); + CHECK(chunk->type() == arrow::int64()); + auto casted_chunk = std::static_pointer_cast(chunk); + for (auto j = 0; j < casted_chunk->length(); ++j) { + auto dst_vid = dst_indexer.get_index(casted_chunk->Value(j)); + std::get<1>(parsed_edges[cur_ind++]) = dst_vid; + ie_degree[dst_vid]++; + } + } + }); + + // if EDATA_T is grape::EmptyType, no need to read columns + auto edata_col_thread = std::thread([&]() { + if constexpr (!std::is_same::value) { + CHECK(edata_cols.size() == 1); + auto edata_col = edata_cols[0]; + CHECK(src_col->length() == edata_col->length()); + // iterate and put data + size_t cur_ind = old_size; + auto type = 
edata_col->type(); + if (type != CppTypeToArrowType::TypeValue()) { + LOG(FATAL) << "Inconsistent data type, expect " + << CppTypeToArrowType::TypeValue()->ToString() + << ", but got " << type->ToString(); + } + + using arrow_array_type = + typename gs::CppTypeToArrowType::ArrayType; + for (auto i = 0; i < edata_col->num_chunks(); ++i) { + auto chunk = edata_col->chunk(i); + auto casted_chunk = std::static_pointer_cast(chunk); + for (auto j = 0; j < casted_chunk->length(); ++j) { + if constexpr (std::is_same::value || + std::is_same::value) { + std::get<2>(parsed_edges[cur_ind++]) = casted_chunk->GetView(j); + } else { + std::get<2>(parsed_edges[cur_ind++]) = casted_chunk->Value(j); + } + } + } + } + }); + src_col_thread.join(); + dst_col_thread.join(); + edata_col_thread.join(); + VLOG(10) << "Finish inserting: " << src_col->length() << " edges"; +} + +// Create VertexTableReader +std::shared_ptr +CSVFragmentLoader::createVertexTableReader(label_t v_label, + const std::string& v_file) { + // Create options. 
+ arrow::csv::ConvertOptions convert_options; + arrow::csv::ReadOptions read_options; + arrow::csv::ParseOptions parse_options; + fillVertexReaderMeta(read_options, parse_options, convert_options, v_file, + v_label); + + auto read_result = arrow::io::ReadableFile::Open(v_file); + if (!read_result.ok()) { + LOG(FATAL) << "Fail to open: " << v_file + << " error: " << read_result.status().message(); + } + std::shared_ptr file = read_result.ValueOrDie(); + auto res = + arrow::csv::TableReader::Make(arrow::io::IOContext(), file, read_options, + parse_options, convert_options); + if (!res.ok()) { + LOG(FATAL) << "Fail to create StreamingReader for file: " << v_file + << " error: " << res.status().message(); + } + return res.ValueOrDie(); +} + +std::shared_ptr +CSVFragmentLoader::createVertexStreamReader(label_t v_label, + const std::string& v_file) { + arrow::csv::ConvertOptions convert_options; + arrow::csv::ReadOptions read_options; + arrow::csv::ParseOptions parse_options; + fillVertexReaderMeta(read_options, parse_options, convert_options, v_file, + v_label); + + auto read_result = arrow::io::ReadableFile::Open(v_file); + if (!read_result.ok()) { + LOG(FATAL) << "Fail to open: " << v_file + << " error: " << read_result.status().message(); + } + std::shared_ptr file = read_result.ValueOrDie(); + auto res = arrow::csv::StreamingReader::Make(arrow::io::IOContext(), file, + read_options, parse_options, + convert_options); + if (!res.ok()) { + LOG(FATAL) << "Fail to create StreamingReader for file: " << v_file + << " error: " << res.status().message(); + } + return res.ValueOrDie(); +} + +std::shared_ptr +CSVFragmentLoader::createEdgeStreamReader(label_t src_label_id, + label_t dst_label_id, + label_t label_id, + const std::string& e_file) { + arrow::csv::ConvertOptions convert_options; + arrow::csv::ReadOptions read_options; + arrow::csv::ParseOptions parse_options; + + fillEdgeReaderMeta(read_options, parse_options, convert_options, e_file, + src_label_id, dst_label_id, 
label_id); + + auto read_result = arrow::io::ReadableFile::Open(e_file); + if (!read_result.ok()) { + LOG(FATAL) << "Fail to open: " << e_file + << " error: " << read_result.status().message(); + } + std::shared_ptr file = read_result.ValueOrDie(); + auto res = arrow::csv::StreamingReader::Make(arrow::io::IOContext(), file, + read_options, parse_options, + convert_options); + if (!res.ok()) { + LOG(FATAL) << "Fail to create StreamingReader for file: " << e_file + << " error: " << res.status().message(); + } + return res.ValueOrDie(); +} + +std::shared_ptr +CSVFragmentLoader::createEdgeTableReader(label_t src_label_id, + label_t dst_label_id, label_t label_id, + const std::string& e_file) { + arrow::csv::ConvertOptions convert_options; + arrow::csv::ReadOptions read_options; + arrow::csv::ParseOptions parse_options; + + fillEdgeReaderMeta(read_options, parse_options, convert_options, e_file, + src_label_id, dst_label_id, label_id); + + auto read_result = arrow::io::ReadableFile::Open(e_file); + if (!read_result.ok()) { + LOG(FATAL) << "Fail to open: " << e_file + << " error: " << read_result.status().message(); + } + std::shared_ptr file = read_result.ValueOrDie(); + auto res = + arrow::csv::TableReader::Make(arrow::io::IOContext(), file, read_options, + parse_options, convert_options); + if (!res.ok()) { + LOG(FATAL) << "Fail to create TableReader for file: " << e_file + << " error: " << res.status().message(); + } + return res.ValueOrDie(); +} + +void CSVFragmentLoader::addVertexBatch( + label_t v_label_id, IdIndexer& indexer, + std::shared_ptr& primary_key_col, + const std::vector>& property_cols) { + size_t row_num = primary_key_col->length(); + CHECK_EQ(primary_key_col->type()->id(), arrow::Type::INT64); + auto col_num = property_cols.size(); + for (size_t i = 0; i < col_num; ++i) { + CHECK_EQ(property_cols[i]->length(), row_num); + } + auto casted_array = + std::static_pointer_cast(primary_key_col); + std::vector> prop_vec(property_cols.size()); + + double t = 
-grape::GetCurrentTime(); + vid_t vid; + std::vector vids; + vids.reserve(row_num); + for (auto i = 0; i < row_num; ++i) { + if (!indexer.add(casted_array->Value(i), vid)) { + LOG(FATAL) << "Duplicate vertex id: " << casted_array->Value(i) << " for " + << schema_.get_vertex_label_name(v_label_id); + } + vids.emplace_back(vid); + } + + t += grape::GetCurrentTime(); + for (double tmp = convert_to_internal_vertex_time_; + !convert_to_internal_vertex_time_.compare_exchange_weak(tmp, tmp + t);) { + } + + t = -grape::GetCurrentTime(); + for (auto j = 0; j < property_cols.size(); ++j) { + auto array = property_cols[j]; + auto chunked_array = std::make_shared(array); + set_vertex_properties( + basic_fragment_loader_.GetVertexTable(v_label_id).column_ptrs()[j], + chunked_array, vids); + } + + t += grape::GetCurrentTime(); + for (double tmp = basic_frag_loader_vertex_time_; + !basic_frag_loader_vertex_time_.compare_exchange_weak(tmp, tmp + t);) {} + + VLOG(10) << "Insert rows: " << row_num; +} + +void CSVFragmentLoader::addVertexBatch( + label_t v_label_id, IdIndexer& indexer, + std::shared_ptr& primary_key_col, + const std::vector>& property_cols) { + size_t row_num = primary_key_col->length(); + std::vector vids; + vids.reserve(row_num); + CHECK_EQ(primary_key_col->type()->id(), arrow::Type::INT64); + // check row num + auto col_num = property_cols.size(); + for (size_t i = 0; i < col_num; ++i) { + CHECK_EQ(property_cols[i]->length(), row_num); + } + std::vector> prop_vec(property_cols.size()); + + double t = -grape::GetCurrentTime(); + for (auto i = 0; i < primary_key_col->num_chunks(); ++i) { + auto chunk = primary_key_col->chunk(i); + auto casted_array = std::static_pointer_cast(chunk); + for (auto j = 0; j < casted_array->length(); ++j) { + vid_t vid; + if (!indexer.add(casted_array->Value(j), vid)) { + LOG(FATAL) << "Duplicate vertex id: " << casted_array->Value(j) + << " for " << schema_.get_vertex_label_name(v_label_id); + } + vids.emplace_back(vid); + } + } + + t 
+= grape::GetCurrentTime(); + for (double tmp = convert_to_internal_vertex_time_; + !convert_to_internal_vertex_time_.compare_exchange_weak(tmp, tmp + t);) { + } + + t = -grape::GetCurrentTime(); + for (auto i = 0; i < property_cols.size(); ++i) { + auto array = property_cols[i]; + auto& table = basic_fragment_loader_.GetVertexTable(v_label_id); + auto& col_ptrs = table.column_ptrs(); + set_vertex_properties(col_ptrs[i], array, vids); + } + t += grape::GetCurrentTime(); + for (double tmp = basic_frag_loader_vertex_time_; + !basic_frag_loader_vertex_time_.compare_exchange_weak(tmp, tmp + t);) {} + + VLOG(10) << "Insert rows: " << row_num; +} + +void CSVFragmentLoader::addVerticesImplWithTableReader( + const std::string& v_file, label_t v_label_id, + IdIndexer& indexer) { + auto vertex_column_mappings = + loading_config_.GetVertexColumnMappings(v_label_id); + auto primary_key = schema_.get_vertex_primary_key(v_label_id)[0]; + size_t primary_key_ind = std::get<2>(primary_key); + auto reader = createVertexTableReader(v_label_id, v_file); + std::shared_ptr table; + double t = -grape::GetCurrentTime(); + auto result = reader->Read(); + t += grape::GetCurrentTime(); + for (double tmp = read_vertex_table_time_; + !read_vertex_table_time_.compare_exchange_weak(tmp, tmp + t);) {} + + auto status = result.status(); + if (!status.ok()) { + LOG(FATAL) << "Failed to read next batch from file " << v_file + << status.message(); + } + table = result.ValueOrDie(); + if (table == nullptr) { + LOG(FATAL) << "Empty file: " << v_file; + } + auto header = table->schema()->field_names(); + auto schema_column_names = schema_.get_vertex_property_names(v_label_id); + CHECK(schema_column_names.size() + 1 == header.size()); + VLOG(10) << "Find header of size: " << header.size(); + + auto columns = table->columns(); + CHECK(primary_key_ind < columns.size()); + auto primary_key_column = columns[primary_key_ind]; + auto other_columns_array = columns; + 
other_columns_array.erase(other_columns_array.begin() + primary_key_ind); + VLOG(10) << "Reading record batch of size: " << table->num_rows(); + addVertexBatch(v_label_id, indexer, primary_key_column, other_columns_array); +} + +void CSVFragmentLoader::addVerticesImplWithStreamReader( + const std::string& v_file, label_t v_label_id, + IdIndexer& indexer) { + auto vertex_column_mappings = + loading_config_.GetVertexColumnMappings(v_label_id); + auto primary_key = schema_.get_vertex_primary_key(v_label_id)[0]; + auto primary_key_name = std::get<1>(primary_key); + size_t primary_key_ind = std::get<2>(primary_key); + auto reader = createVertexStreamReader(v_label_id, v_file); + std::shared_ptr record_batch; + bool first_batch = true; + while (true) { + double t = -grape::GetCurrentTime(); + auto status = reader->ReadNext(&record_batch); + t += grape::GetCurrentTime(); + for (double tmp = read_vertex_table_time_; + !read_vertex_table_time_.compare_exchange_weak(tmp, tmp + t);) {} + if (!status.ok()) { + LOG(FATAL) << "Failed to read next batch from file " << v_file + << status.message(); + } + if (record_batch == nullptr) { + break; + } + if (first_batch) { + // get header + auto header = record_batch->schema()->field_names(); + auto schema_column_names = schema_.get_vertex_property_names(v_label_id); + CHECK(schema_column_names.size() + 1 == header.size()); + VLOG(10) << "Find header of size: " << header.size(); + first_batch = false; + } + + auto columns = record_batch->columns(); + CHECK(primary_key_ind < columns.size()); + auto primary_key_column = columns[primary_key_ind]; + auto other_columns_array = columns; + other_columns_array.erase(other_columns_array.begin() + primary_key_ind); + VLOG(10) << "Reading record batch of size: " << record_batch->num_rows(); + addVertexBatch(v_label_id, indexer, primary_key_column, + other_columns_array); + } +} + +void CSVFragmentLoader::addVerticesImpl(label_t v_label_id, + const std::string& v_label_name, + const std::vector 
v_files, + IdIndexer& indexer) { + VLOG(10) << "Parsing vertex file:" << v_files.size() << " for label " + << v_label_name; + + for (auto& v_file : v_files) { + if (loading_config_.GetIsBatchReader()) { + addVerticesImplWithStreamReader(v_file, v_label_id, indexer); + } else { + addVerticesImplWithTableReader(v_file, v_label_id, indexer); + } + } + + VLOG(10) << "Finish parsing vertex file:" << v_files.size() << " for label " + << v_label_name; +} + +void CSVFragmentLoader::addVertices(label_t v_label_id, + const std::vector& v_files) { + auto primary_keys = schema_.get_vertex_primary_key(v_label_id); + + if (primary_keys.size() != 1) { + LOG(FATAL) << "Only support one primary key for vertex."; + } + if (std::get<0>(primary_keys[0]) != PropertyType::kInt64) { + LOG(FATAL) << "Only support int64_t primary key for vertex."; + } + + std::string v_label_name = schema_.get_vertex_label_name(v_label_id); + VLOG(10) << "Start init vertices for label " << v_label_name << " with " + << v_files.size() << " files."; + + IdIndexer indexer; + + addVerticesImpl(v_label_id, v_label_name, v_files, indexer); + + if (indexer.bucket_count() == 0) { + indexer._rehash(schema_.get_max_vnum(v_label_name)); + } + basic_fragment_loader_.FinishAddingVertex(v_label_id, indexer); + + VLOG(10) << "Finish init vertices for label " << v_label_name; +} + +template +void CSVFragmentLoader::addEdgesImplWithTableReader( + const std::string& filename, label_t src_label_id, label_t dst_label_id, + label_t e_label_id, std::vector& ie_degree, + std::vector& oe_degree, + std::vector>& parsed_edges) { + const auto& src_indexer = basic_fragment_loader_.GetLFIndexer(src_label_id); + const auto& dst_indexer = basic_fragment_loader_.GetLFIndexer(dst_label_id); + auto reader = + createEdgeTableReader(src_label_id, dst_label_id, e_label_id, filename); + std::shared_ptr table; + double t = -grape::GetCurrentTime(); + auto result = reader->Read(); + t += grape::GetCurrentTime(); + for (double tmp = 
read_edge_table_time_; + !read_edge_table_time_.compare_exchange_weak(tmp, tmp + t);) {} + + auto status = result.status(); + if (!status.ok()) { + LOG(FATAL) << "Failed to read Table from file " << filename + << status.message(); + } + table = result.ValueOrDie(); + if (table == nullptr) { + LOG(FATAL) << "Empty file: " << filename; + } + auto header = table->schema()->field_names(); + auto schema_column_names = + schema_.get_edge_property_names(src_label_id, dst_label_id, e_label_id); + auto schema_column_types = + schema_.get_edge_properties(src_label_id, dst_label_id, e_label_id); + CHECK(schema_column_names.size() + 2 == header.size()); + CHECK(schema_column_types.size() + 2 == header.size()); + VLOG(10) << "Find header of size: " << header.size(); + + auto columns = table->columns(); + CHECK(columns.size() >= 2); + auto src_col = columns[0]; + auto dst_col = columns[1]; + CHECK(src_col->type() == arrow::int64()) + << "src_col type: " << src_col->type()->ToString(); + CHECK(dst_col->type() == arrow::int64()) + << "dst_col type: " << dst_col->type()->ToString(); + + std::vector> property_cols; + for (auto i = 2; i < columns.size(); ++i) { + property_cols.emplace_back(columns[i]); + } + CHECK(property_cols.size() <= 1) + << "Currently only support at most one property on edge"; + { + CHECK(src_col->length() == dst_col->length()); + CHECK(src_col->type() == arrow::int64()); + CHECK(dst_col->type() == arrow::int64()); + t = -grape::GetCurrentTime(); + append_edges(src_col, dst_col, src_indexer, dst_indexer, property_cols, + parsed_edges, ie_degree, oe_degree); + t += grape::GetCurrentTime(); + for (double tmp = convert_to_internal_edge_time_; + !convert_to_internal_edge_time_.compare_exchange_weak(tmp, tmp + t);) { + } + } +} + +template +void CSVFragmentLoader::addEdgesImplWithStreamReader( + const std::string& filename, label_t src_label_id, label_t dst_label_id, + label_t e_label_id, std::vector& ie_degree, + std::vector& oe_degree, + std::vector>& 
parsed_edges) { + const auto& src_indexer = basic_fragment_loader_.GetLFIndexer(src_label_id); + const auto& dst_indexer = basic_fragment_loader_.GetLFIndexer(dst_label_id); + auto reader = + createEdgeStreamReader(src_label_id, dst_label_id, e_label_id, filename); + std::shared_ptr record_batch; + // read first batch + bool first_batch = true; + while (true) { + double t = -grape::GetCurrentTime(); + auto status = reader->ReadNext(&record_batch); + t += grape::GetCurrentTime(); + for (double tmp = read_edge_table_time_; + !read_edge_table_time_.compare_exchange_weak(tmp, tmp + t);) {} + if (!status.ok()) { + LOG(FATAL) << "Failed to read next batch from file " << filename + << status.message(); + } + if (record_batch == nullptr) { + break; + } + if (first_batch) { + auto header = record_batch->schema()->field_names(); + auto schema_column_names = schema_.get_edge_property_names( + src_label_id, dst_label_id, e_label_id); + auto schema_column_types = + schema_.get_edge_properties(src_label_id, dst_label_id, e_label_id); + CHECK(schema_column_names.size() + 2 == header.size()) + << "schema size: " << schema_column_names.size() + << " header size: " << header.size(); + CHECK(schema_column_types.size() + 2 == header.size()) + << "schema size: " << schema_column_types.size() + << " header size: " << header.size(); + VLOG(10) << "Find header of size: " << header.size(); + first_batch = false; + } + + // copy the table to csr. + auto columns = record_batch->columns(); + // We assume the src_col and dst_col will always be put at front. 
+ CHECK(columns.size() >= 2); + auto src_col = columns[0]; + auto dst_col = columns[1]; + CHECK(src_col->type() == arrow::int64()) + << "src_col type: " << src_col->type()->ToString(); + CHECK(dst_col->type() == arrow::int64()) + << "dst_col type: " << dst_col->type()->ToString(); + + std::vector> property_cols; + for (auto i = 2; i < columns.size(); ++i) { + property_cols.emplace_back(columns[i]); + } + CHECK(property_cols.size() <= 1) + << "Currently only support at most one property on edge"; + { + // add edges to vector + CHECK(src_col->length() == dst_col->length()); + CHECK(src_col->type() == arrow::int64()); + CHECK(dst_col->type() == arrow::int64()); + auto src_casted_array = + std::static_pointer_cast(src_col); + auto dst_casted_array = + std::static_pointer_cast(dst_col); + t = -grape::GetCurrentTime(); + append_edges(src_casted_array, dst_casted_array, src_indexer, dst_indexer, + property_cols, parsed_edges, ie_degree, oe_degree); + t += grape::GetCurrentTime(); + for (double tmp = convert_to_internal_edge_time_; + !convert_to_internal_edge_time_.compare_exchange_weak(tmp, tmp + t); + tmp = convert_to_internal_edge_time_) {} + } + } +} + +template +void CSVFragmentLoader::addEdgesImpl(label_t src_label_id, label_t dst_label_id, + label_t e_label_id, + const std::vector& e_files) { + auto edge_column_mappings = loading_config_.GetEdgeColumnMappings( + src_label_id, dst_label_id, e_label_id); + auto src_dst_col_pair = + loading_config_.GetEdgeSrcDstCol(src_label_id, dst_label_id, e_label_id); + if (src_dst_col_pair.first.size() != 1 || + src_dst_col_pair.second.size() != 1) { + LOG(FATAL) << "We currently only support one src primary key and one " + "dst primary key"; + } + size_t src_col_ind = src_dst_col_pair.first[0]; + size_t dst_col_ind = src_dst_col_pair.second[0]; + CHECK(src_col_ind != dst_col_ind); + + check_edge_invariant(schema_, edge_column_mappings, src_col_ind, dst_col_ind, + src_label_id, dst_label_id, e_label_id); + + std::vector> 
parsed_edges; + std::vector ie_degree, oe_degree; + const auto& src_indexer = basic_fragment_loader_.GetLFIndexer(src_label_id); + const auto& dst_indexer = basic_fragment_loader_.GetLFIndexer(dst_label_id); + ie_degree.resize(dst_indexer.size()); + oe_degree.resize(src_indexer.size()); + VLOG(10) << "src indexer size: " << src_indexer.size() + << " dst indexer size: " << dst_indexer.size(); + + for (auto filename : e_files) { + VLOG(10) << "processing " << filename << " with src_col_id " << src_col_ind + << " and dst_col_id " << dst_col_ind; + if (loading_config_.GetIsBatchReader()) { + VLOG(1) << "Using batch reader"; + addEdgesImplWithStreamReader(filename, src_label_id, dst_label_id, + e_label_id, ie_degree, oe_degree, + parsed_edges); + } else { + VLOG(1) << "Using table reader"; + addEdgesImplWithTableReader(filename, src_label_id, dst_label_id, + e_label_id, ie_degree, oe_degree, + parsed_edges); + } + } + double t = -grape::GetCurrentTime(); + basic_fragment_loader_.PutEdges(src_label_id, dst_label_id, e_label_id, + parsed_edges, ie_degree, oe_degree); + t += grape::GetCurrentTime(); + // basic_frag_loader_edge_time_.fetch_add(t); + for (double tmp = basic_frag_loader_edge_time_; + !basic_frag_loader_edge_time_.compare_exchange_weak(tmp, tmp + t);) {} + VLOG(10) << "Finish putting: " << parsed_edges.size() << " edges"; +} + +void CSVFragmentLoader::addEdges(label_t src_label_i, label_t dst_label_i, + label_t edge_label_i, + const std::vector& filenames) { + auto src_label_name = schema_.get_vertex_label_name(src_label_i); + auto dst_label_name = schema_.get_vertex_label_name(dst_label_i); + auto edge_label_name = schema_.get_edge_label_name(edge_label_i); + if (filenames.size() <= 0) { + LOG(FATAL) << "No edge files found for src label: " << src_label_name + << " dst label: " << dst_label_name + << " edge label: " << edge_label_name; + } + if (filenames.size() <= 0) { + LOG(FATAL) << "No edge files found for src label: " << src_label_name + << " dst label: 
" << dst_label_name + << " edge label: " << edge_label_name; + } + VLOG(10) << "Init edges src label: " << src_label_name + << " dst label: " << dst_label_name + << " edge label: " << edge_label_name + << " filenames: " << filenames.size(); + auto& property_types = schema_.get_edge_properties( + src_label_name, dst_label_name, edge_label_name); + size_t col_num = property_types.size(); + CHECK_LE(col_num, 1) << "Only single or no property is supported for edge."; + + if (col_num == 0) { + if (filenames.empty()) { + basic_fragment_loader_.AddNoPropEdgeBatch( + src_label_i, dst_label_i, edge_label_i); + } else { + addEdgesImpl(src_label_i, dst_label_i, edge_label_i, + filenames); + } + } else if (property_types[0] == PropertyType::kDate) { + if (filenames.empty()) { + basic_fragment_loader_.AddNoPropEdgeBatch(src_label_i, dst_label_i, + edge_label_i); + } else { + addEdgesImpl(src_label_i, dst_label_i, edge_label_i, filenames); + } + } else if (property_types[0] == PropertyType::kInt32) { + if (filenames.empty()) { + basic_fragment_loader_.AddNoPropEdgeBatch(src_label_i, dst_label_i, + edge_label_i); + } else { + addEdgesImpl(src_label_i, dst_label_i, edge_label_i, filenames); + } + } else if (property_types[0] == PropertyType::kInt64) { + if (filenames.empty()) { + basic_fragment_loader_.AddNoPropEdgeBatch( + src_label_i, dst_label_i, edge_label_i); + } else { + addEdgesImpl(src_label_i, dst_label_i, edge_label_i, filenames); + } + } else if (property_types[0] == PropertyType::kString) { + if (filenames.empty()) { + basic_fragment_loader_.AddNoPropEdgeBatch( + src_label_i, dst_label_i, edge_label_i); + } else { + LOG(FATAL) << "Unsupported edge property type."; + } + } else if (property_types[0] == PropertyType::kDouble) { + if (filenames.empty()) { + basic_fragment_loader_.AddNoPropEdgeBatch( + src_label_i, dst_label_i, edge_label_i); + } else { + addEdgesImpl(src_label_i, dst_label_i, edge_label_i, filenames); + } + } else { + LOG(FATAL) << "Unsupported edge 
property type." << property_types[0]; + } +} + +void CSVFragmentLoader::loadVertices() { + auto vertex_sources = loading_config_.GetVertexLoadingMeta(); + if (vertex_sources.empty()) { + LOG(INFO) << "Skip loading vertices since no vertex source is specified."; + return; + } + + if (thread_num_ == 1) { + LOG(INFO) << "Loading vertices with single thread..."; + for (auto iter = vertex_sources.begin(); iter != vertex_sources.end(); + ++iter) { + auto v_label_id = iter->first; + auto v_files = iter->second; + addVertices(v_label_id, v_files); + } + } else { + // copy vertex_sources and edge sources to vector, since we need to + // use multi-thread loading. + std::vector>> vertex_files; + for (auto iter = vertex_sources.begin(); iter != vertex_sources.end(); + ++iter) { + vertex_files.emplace_back(iter->first, iter->second); + } + LOG(INFO) << "Parallel loading with " << thread_num_ << " threads, " + << " " << vertex_files.size() << " vertex files, "; + std::atomic v_ind(0); + std::vector threads(thread_num_); + for (int i = 0; i < thread_num_; ++i) { + threads[i] = std::thread([&]() { + while (true) { + size_t cur = v_ind.fetch_add(1); + if (cur >= vertex_files.size()) { + break; + } + auto v_label_id = vertex_files[cur].first; + addVertices(v_label_id, vertex_files[cur].second); + } + }); + } + for (auto& thread : threads) { + thread.join(); + } + + LOG(INFO) << "Finished loading vertices"; + } +} + +void CSVFragmentLoader::fillVertexReaderMeta( + arrow::csv::ReadOptions& read_options, + arrow::csv::ParseOptions& parse_options, + arrow::csv::ConvertOptions& convert_options, const std::string& v_file, + label_t v_label) const { + auto time_stamp_parser = arrow::TimestampParser::MakeISO8601(); + convert_options.timestamp_parsers.emplace_back(time_stamp_parser); + + put_delimiter_option(loading_config_, parse_options); + bool header_row = put_skip_rows_option(loading_config_, read_options); + put_column_names_option(loading_config_, header_row, v_file, + 
parse_options.delimiter, read_options); + put_escape_char_option(loading_config_, parse_options); + put_quote_char_option(loading_config_, parse_options); + put_block_size_option(loading_config_, read_options); + + // parse all column_names + + std::vector included_col_names; + std::vector included_col_indices; + std::vector mapped_property_names; + + auto cur_label_col_mapping = loading_config_.GetVertexColumnMappings(v_label); + auto primary_keys = schema_.get_vertex_primary_key(v_label); + CHECK(primary_keys.size() == 1); + auto primary_key = primary_keys[0]; + + if (cur_label_col_mapping.size() == 0) { + // use default mapping, we assume the order of the columns in the file is + // the same as the order of the properties in the schema, except for + // primary key. + auto primary_key_name = std::get<1>(primary_key); + auto primary_key_ind = std::get<2>(primary_key); + auto property_names = schema_.get_vertex_property_names(v_label); + // for example, schema is : (name,age) + // file header is (id,name,age), the primary key is id. + // so, the mapped_property_names are: (id,name,age) + CHECK(property_names.size() + 1 == read_options.column_names.size()); + // insert primary_key to property_names + property_names.insert(property_names.begin() + primary_key_ind, + primary_key_name); + + for (auto i = 0; i < read_options.column_names.size(); ++i) { + included_col_names.emplace_back(read_options.column_names[i]); + included_col_indices.emplace_back(i); + // We assume the order of the columns in the file is the same as the + // order of the properties in the schema, except for primary key. 
+ mapped_property_names.emplace_back(property_names[i]); + } + } else { + for (auto i = 0; i < cur_label_col_mapping.size(); ++i) { + auto& [col_id, col_name, property_name] = cur_label_col_mapping[i]; + if (col_name.empty()) { + // use default mapping + col_name = read_options.column_names[col_id]; + } + included_col_names.emplace_back(col_name); + included_col_indices.emplace_back(col_id); + mapped_property_names.emplace_back(property_name); + } + } + + VLOG(10) << "Include columns: " << included_col_names.size() + << gs::to_string(included_col_names); + // if empty, then means need all columns + convert_options.include_columns = included_col_names; + + // put column_types, col_name : col_type + std::unordered_map> arrow_types; + { + auto property_types = schema_.get_vertex_properties(v_label); + auto property_names = schema_.get_vertex_property_names(v_label); + CHECK(property_types.size() == property_names.size()); + + for (auto i = 0; i < property_types.size(); ++i) { + // for each schema' property name, get the index of the column in + // vertex_column mapping, and bind the type with the column name + auto property_type = property_types[i]; + auto property_name = property_names[i]; + size_t ind = mapped_property_names.size(); + for (auto i = 0; i < mapped_property_names.size(); ++i) { + if (mapped_property_names[i] == property_name) { + ind = i; + break; + } + } + if (ind == mapped_property_names.size()) { + LOG(FATAL) << "The specified property name: " << property_name + << " does not exist in the vertex column mapping for " + "vertex label: " + << schema_.get_vertex_label_name(v_label) + << " please " + "check your configuration"; + } + VLOG(10) << "vertex_label: " << schema_.get_vertex_label_name(v_label) + << " property_name: " << property_name + << " property_type: " << property_type << " ind: " << ind; + arrow_types.insert( + {included_col_names[ind], PropertyTypeToArrowType(property_type)}); + } + { + // add primary key types; + auto primary_key_name = 
std::get<1>(primary_key); + auto primary_key_type = std::get<0>(primary_key); + size_t ind = mapped_property_names.size(); + for (auto i = 0; i < mapped_property_names.size(); ++i) { + if (mapped_property_names[i] == primary_key_name) { + ind = i; + break; + } + } + if (ind == mapped_property_names.size()) { + LOG(FATAL) << "The specified property name: " << primary_key_name + << " does not exist in the vertex column mapping, please " + "check your configuration"; + } + arrow_types.insert( + {included_col_names[ind], PropertyTypeToArrowType(primary_key_type)}); + } + + convert_options.column_types = arrow_types; + } +} + +void CSVFragmentLoader::fillEdgeReaderMeta( + arrow::csv::ReadOptions& read_options, + arrow::csv::ParseOptions& parse_options, + arrow::csv::ConvertOptions& convert_options, const std::string& e_file, + label_t src_label_id, label_t dst_label_id, label_t label_id) const { + auto time_stamp_parser = + arrow::TimestampParser::MakeISO8601(); // 2011-08-17T14:26:59.961+0000 + convert_options.timestamp_parsers.emplace_back(time_stamp_parser); + + put_delimiter_option(loading_config_, parse_options); + bool header_row = put_skip_rows_option(loading_config_, read_options); + put_column_names_option(loading_config_, header_row, e_file, + parse_options.delimiter, read_options); + put_escape_char_option(loading_config_, parse_options); + put_quote_char_option(loading_config_, parse_options); + put_block_size_option(loading_config_, read_options); + + auto src_dst_cols = + loading_config_.GetEdgeSrcDstCol(src_label_id, dst_label_id, label_id); + + // parse all column_names + // Get all column names(header, and always skip the first row) + std::vector included_col_names; + std::vector mapped_property_names; + + { + // add src and dst primary col, to included_columns, put src_col and + // dst_col at the first of included_columns. 
+ CHECK(src_dst_cols.first.size() == 1 && src_dst_cols.second.size() == 1); + auto src_col_ind = src_dst_cols.first[0]; + auto dst_col_ind = src_dst_cols.second[0]; + CHECK(src_col_ind >= 0 && src_col_ind < read_options.column_names.size()); + CHECK(dst_col_ind >= 0 && dst_col_ind < read_options.column_names.size()); + + included_col_names.emplace_back(read_options.column_names[src_col_ind]); + included_col_names.emplace_back(read_options.column_names[dst_col_ind]); + } + + auto cur_label_col_mapping = loading_config_.GetEdgeColumnMappings( + src_label_id, dst_label_id, label_id); + if (cur_label_col_mapping.empty()) { + // use default mapping, we assume the order of the columns in the file is + // the same as the order of the properties in the schema, + auto edge_prop_names = + schema_.get_edge_property_names(src_label_id, dst_label_id, label_id); + for (auto i = 0; i < edge_prop_names.size(); ++i) { + auto property_name = edge_prop_names[i]; + included_col_names.emplace_back(property_name); + mapped_property_names.emplace_back(property_name); + } + } else { + // add the property columns into the included columns + for (auto i = 0; i < cur_label_col_mapping.size(); ++i) { + // TODO: make the property column's names are in same order with schema. 
+ auto& [col_id, col_name, property_name] = cur_label_col_mapping[i]; + if (col_name.empty()) { + // use default mapping + col_name = read_options.column_names[col_id]; + } + included_col_names.emplace_back(col_name); + mapped_property_names.emplace_back(property_name); + } + } + + VLOG(10) << "Include Edge columns: " << gs::to_string(included_col_names); + // if empty, then means need all columns + convert_options.include_columns = included_col_names; + + // put column_types, col_name : col_type + std::unordered_map> arrow_types; + { + auto property_types = + schema_.get_edge_properties(src_label_id, dst_label_id, label_id); + auto property_names = + schema_.get_edge_property_names(src_label_id, dst_label_id, label_id); + CHECK(property_types.size() == property_names.size()); + for (auto i = 0; i < property_types.size(); ++i) { + // for each schema' property name, get the index of the column in + // vertex_column mapping, and bind the type with the column name + auto property_type = property_types[i]; + auto property_name = property_names[i]; + size_t ind = mapped_property_names.size(); + for (auto i = 0; i < mapped_property_names.size(); ++i) { + if (mapped_property_names[i] == property_name) { + ind = i; + break; + } + } + if (ind == mapped_property_names.size()) { + LOG(FATAL) << "The specified property name: " << property_name + << " does not exist in the vertex column mapping, please " + "check your configuration"; + } + VLOG(10) << "vertex_label: " << schema_.get_edge_label_name(label_id) + << " property_name: " << property_name + << " property_type: " << property_type << " ind: " << ind; + arrow_types.insert({included_col_names[ind + 2], + PropertyTypeToArrowType(property_type)}); + } + { + // add src and dst primary col, to included_columns and column types. 
+ auto src_dst_cols = loading_config_.GetEdgeSrcDstCol( + src_label_id, dst_label_id, label_id); + CHECK(src_dst_cols.first.size() == 1 && src_dst_cols.second.size() == 1); + auto src_col_ind = src_dst_cols.first[0]; + auto dst_col_ind = src_dst_cols.second[0]; + CHECK(src_col_ind >= 0 && src_col_ind < read_options.column_names.size()); + CHECK(dst_col_ind >= 0 && dst_col_ind < read_options.column_names.size()); + PropertyType src_col_type, dst_col_type; + { + auto src_primary_keys = schema_.get_vertex_primary_key(src_label_id); + CHECK(src_primary_keys.size() == 1); + src_col_type = std::get<0>(src_primary_keys[0]); + arrow_types.insert({read_options.column_names[src_col_ind], + PropertyTypeToArrowType(src_col_type)}); + } + { + auto dst_primary_keys = schema_.get_vertex_primary_key(dst_label_id); + CHECK(dst_primary_keys.size() == 1); + dst_col_type = std::get<0>(dst_primary_keys[0]); + arrow_types.insert({read_options.column_names[dst_col_ind], + PropertyTypeToArrowType(dst_col_type)}); + } + } + + convert_options.column_types = arrow_types; + + VLOG(10) << "Column types: "; + for (auto iter : arrow_types) { + VLOG(10) << iter.first << " : " << iter.second->ToString(); + } + } +} + +void CSVFragmentLoader::loadEdges() { + auto& edge_sources = loading_config_.GetEdgeLoadingMeta(); + + if (edge_sources.empty()) { + LOG(INFO) << "Skip loading edges since no edge source is specified."; + return; + } + + if (thread_num_ == 1) { + LOG(INFO) << "Loading edges with single thread..."; + for (auto iter = edge_sources.begin(); iter != edge_sources.end(); ++iter) { + auto& src_label_id = std::get<0>(iter->first); + auto& dst_label_id = std::get<1>(iter->first); + auto& e_label_id = std::get<2>(iter->first); + auto& e_files = iter->second; + + addEdges(src_label_id, dst_label_id, e_label_id, e_files); + } + } else { + std::vector>> + edge_files; + for (auto iter = edge_sources.begin(); iter != edge_sources.end(); ++iter) { + edge_files.emplace_back(iter->first, 
iter->second); + } + LOG(INFO) << "Parallel loading with " << thread_num_ << " threads, " + << edge_files.size() << " edge files."; + std::atomic e_ind(0); + std::vector threads(thread_num_); + for (int i = 0; i < thread_num_; ++i) { + threads[i] = std::thread([&]() { + while (true) { + size_t cur = e_ind.fetch_add(1); + if (cur >= edge_files.size()) { + break; + } + auto& edge_file = edge_files[cur]; + auto src_label_id = std::get<0>(edge_file.first); + auto dst_label_id = std::get<1>(edge_file.first); + auto e_label_id = std::get<2>(edge_file.first); + auto& file_names = edge_file.second; + addEdges(src_label_id, dst_label_id, e_label_id, file_names); + } + }); + } + for (auto& thread : threads) { + thread.join(); + } + LOG(INFO) << "Finished loading edges"; + } +} + +void CSVFragmentLoader::LoadFragment(MutablePropertyFragment& fragment) { + loadVertices(); + loadEdges(); + + return basic_fragment_loader_.LoadFragment(fragment); +} + +} // namespace gs diff --git a/flex/storages/rt_mutable_graph/loader/csv_fragment_loader.h b/flex/storages/rt_mutable_graph/loader/csv_fragment_loader.h new file mode 100644 index 000000000000..54433e3876ae --- /dev/null +++ b/flex/storages/rt_mutable_graph/loader/csv_fragment_loader.h @@ -0,0 +1,153 @@ + +/** Copyright 2020 Alibaba Group Holding Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef STORAGES_RT_MUTABLE_GRAPH_LOADER_CSV_FRAGMENT_LOADER_H_ +#define STORAGES_RT_MUTABLE_GRAPH_LOADER_CSV_FRAGMENT_LOADER_H_ + +#include "flex/storages/rt_mutable_graph/loader/basic_fragment_loader.h" +#include "flex/storages/rt_mutable_graph/loader/i_fragment_loader.h" +#include "flex/storages/rt_mutable_graph/loading_config.h" +#include "flex/storages/rt_mutable_graph/mutable_property_fragment.h" + +#include +#include +#include +#include "arrow/util/value_parsing.h" + +#include "grape/util.h" + +namespace gs { + +// LoadFragment for csv files. +class CSVFragmentLoader : public IFragmentLoader { + public: + CSVFragmentLoader(const Schema& schema, const LoadingConfig& loading_config, + int32_t thread_num) + : loading_config_(loading_config), + schema_(schema), + thread_num_(thread_num), + basic_fragment_loader_(schema_), + read_vertex_table_time_(0), + read_edge_table_time_(0), + convert_to_internal_vertex_time_(0), + convert_to_internal_edge_time_(0), + basic_frag_loader_vertex_time_(0), + basic_frag_loader_edge_time_(0) { + vertex_label_num_ = schema_.vertex_label_num(); + edge_label_num_ = schema_.edge_label_num(); + } + + ~CSVFragmentLoader() {} + + FragmentLoaderType GetFragmentLoaderType() const override { + return FragmentLoaderType::kCSVFragmentLoader; + } + + void LoadFragment(MutablePropertyFragment& fragment) override; + + private: + void loadVertices(); + + void loadEdges(); + + void addVertices(label_t v_label_id, const std::vector& v_files); + + void addVerticesImpl(label_t v_label_id, const std::string& v_label_name, + const std::vector v_file, + IdIndexer& indexer); + + void addVerticesImplWithStreamReader(const std::string& filename, + label_t v_label_id, + IdIndexer& indexer); + + void addVerticesImplWithTableReader(const std::string& filename, + label_t v_label_id, + IdIndexer& indexer); + + void addVertexBatch( + label_t v_label_id, IdIndexer& indexer, + std::shared_ptr& primary_key_col, + const std::vector>& property_cols); + + void 
addVertexBatch( + label_t v_label_id, IdIndexer& indexer, + std::shared_ptr& primary_key_col, + const std::vector>& property_cols); + + void addEdges(label_t src_label_id, label_t dst_label_id, label_t e_label_id, + const std::vector& e_files); + + template + void addEdgesImpl(label_t src_label_id, label_t dst_label_id, + label_t e_label_id, + const std::vector& e_files); + + template + void addEdgesImplWithStreamReader( + const std::string& file_name, label_t src_label_id, label_t dst_label_id, + label_t e_label_id, std::vector& ie_degree, + std::vector& oe_degree, + std::vector>& edges); + + template + void addEdgesImplWithTableReader( + const std::string& filename, label_t src_label_id, label_t dst_label_id, + label_t e_label_id, std::vector& ie_degree, + std::vector& oe_degree, + std::vector>& edges); + + std::shared_ptr createVertexStreamReader( + label_t v_label, const std::string& v_file); + + std::shared_ptr createVertexTableReader( + label_t v_label, const std::string& v_file); + + std::shared_ptr createEdgeStreamReader( + label_t src_label_id, label_t dst_label_id, label_t e_label, + const std::string& e_file); + + std::shared_ptr createEdgeTableReader( + label_t src_label_id, label_t dst_label_id, label_t e_label, + const std::string& e_file); + + void fillEdgeReaderMeta(arrow::csv::ReadOptions& read_options, + arrow::csv::ParseOptions& parse_options, + arrow::csv::ConvertOptions& convert_options, + const std::string& e_file, label_t src_label_id, + label_t dst_label_id, label_t label_id) const; + + void fillVertexReaderMeta(arrow::csv::ReadOptions& read_options, + arrow::csv::ParseOptions& parse_options, + arrow::csv::ConvertOptions& convert_options, + const std::string& v_file, label_t v_label) const; + + const LoadingConfig& loading_config_; + const Schema& schema_; + size_t vertex_label_num_, edge_label_num_; + int32_t thread_num_; + + mutable BasicFragmentLoader basic_fragment_loader_; + + std::atomic read_vertex_table_time_, read_edge_table_time_; 
+ std::atomic convert_to_internal_vertex_time_, + convert_to_internal_edge_time_; + std::atomic basic_frag_loader_vertex_time_, + basic_frag_loader_edge_time_; +}; + +} // namespace gs + +#endif // STORAGES_RT_MUTABLE_GRAPH_LOADER_CSV_FRAGMENT_LOADER_H_ \ No newline at end of file diff --git a/flex/storages/rt_mutable_graph/loader/i_fragment_loader.h b/flex/storages/rt_mutable_graph/loader/i_fragment_loader.h new file mode 100644 index 000000000000..c332fa35e7e0 --- /dev/null +++ b/flex/storages/rt_mutable_graph/loader/i_fragment_loader.h @@ -0,0 +1,36 @@ +/** Copyright 2020 Alibaba Group Holding Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef STORAGES_RT_MUTABLE_GRAPH_LOADER_FRAGMENT_LOADER_H_ +#define STORAGES_RT_MUTABLE_GRAPH_LOADER_FRAGMENT_LOADER_H_ + +#include "flex/storages/rt_mutable_graph/mutable_property_fragment.h" + +namespace gs { + +enum class FragmentLoaderType { kCSVFragmentLoader }; + +// For different input format, we should implement different fragment loader. 
+class IFragmentLoader { + public: + virtual ~IFragmentLoader() = default; + // get the fragment loader type + virtual FragmentLoaderType GetFragmentLoaderType() const = 0; + virtual void LoadFragment(MutablePropertyFragment& fragment) = 0; +}; + +} // namespace gs + +#endif // STORAGES_RT_MUTABLE_GRAPH_LOADER_FRAGMENT_LOADER_H_ \ No newline at end of file diff --git a/flex/storages/rt_mutable_graph/loader/loader_factory.cc b/flex/storages/rt_mutable_graph/loader/loader_factory.cc new file mode 100644 index 000000000000..4cf80e049eaa --- /dev/null +++ b/flex/storages/rt_mutable_graph/loader/loader_factory.cc @@ -0,0 +1,30 @@ +/** Copyright 2020 Alibaba Group Holding Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "flex/storages/rt_mutable_graph/loader/loader_factory.h" + +namespace gs { + +std::shared_ptr LoaderFactory::CreateFragmentLoader( + const Schema& schema, const LoadingConfig& loading_config, int thread_num) { + if (loading_config.GetFormat() == "csv") { + return std::make_shared(schema, loading_config, + thread_num); + } else { + LOG(FATAL) << "Unsupported format: " << loading_config.GetFormat(); + } +} + +} // namespace gs \ No newline at end of file diff --git a/flex/storages/rt_mutable_graph/loader/loader_factory.h b/flex/storages/rt_mutable_graph/loader/loader_factory.h new file mode 100644 index 000000000000..7c45081402fb --- /dev/null +++ b/flex/storages/rt_mutable_graph/loader/loader_factory.h @@ -0,0 +1,33 @@ +/** Copyright 2020 Alibaba Group Holding Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef STORAGES_RT_MUTABLE_GRAPH_LOADER_LOADER_FACTORY_H_ +#define STORAGES_RT_MUTABLE_GRAPH_LOADER_LOADER_FACTORY_H_ + +#include +#include "flex/storages/rt_mutable_graph/loader/csv_fragment_loader.h" +#include "flex/storages/rt_mutable_graph/loader/i_fragment_loader.h" +#include "flex/storages/rt_mutable_graph/loading_config.h" + +namespace gs { +class LoaderFactory { + public: + static std::shared_ptr CreateFragmentLoader( + const Schema& schema, const LoadingConfig& loading_config, + int thread_num); +}; +} // namespace gs + +#endif // STORAGES_RT_MUTABLE_GRAPH_LOADER_LOADER_FACTORY_H_ \ No newline at end of file diff --git a/flex/storages/rt_mutable_graph/loading_config.cc b/flex/storages/rt_mutable_graph/loading_config.cc index b6153dc10d6e..39e99ecb13e0 100644 --- a/flex/storages/rt_mutable_graph/loading_config.cc +++ b/flex/storages/rt_mutable_graph/loading_config.cc @@ -14,13 +14,16 @@ */ #include "flex/storages/rt_mutable_graph/loading_config.h" - #include #include +#include +#include #include #include +#include "flex/engines/hqps_db/core/utils/hqps_utils.h" namespace gs { + namespace config_parsing { // fetch the primary key of the src and dst vertex label in the edge file, @@ -47,8 +50,9 @@ static bool fetch_src_dst_column_mapping(const Schema& schema, YAML::Node node, for (auto i = 0; i < column_mappings.size(); ++i) { auto column_mapping = column_mappings[i]["column"]; auto name = column_mapping["name"].as(); - if (name != schema_primary_key[i].second) { - LOG(ERROR) << "Expect column name [" << schema_primary_key[i].second + if (name != std::get<1>(schema_primary_key[i])) { + LOG(ERROR) << "Expect column name [" + << std::get<1>(schema_primary_key[i]) << "] for source vertex mapping, at index: " << i << ", got: " << name; return false; @@ -65,11 +69,36 @@ static bool fetch_src_dst_column_mapping(const Schema& schema, YAML::Node node, return true; } +// Function to parse memory size represented as a string +uint64_t parse_block_size(const 
std::string& memorySizeStr) { + // Create a stringstream to parse the memory size string + std::istringstream ss(memorySizeStr); + + // Extract the numeric part of the string + uint64_t memorySize; + ss >> memorySize; + + // Handle unit prefixes (e.g., KB, MB, GB, etc.) + std::string unit; + ss >> unit; + + // Convert the value to bytes based on the unit + if (unit == "KB") { + memorySize *= 1024; + } else if (unit == "MB") { + memorySize *= 1024 * 1024; + } else if (unit == "GB") { + memorySize *= 1024 * 1024 * 1024; + } + + return memorySize; +} + // Parse the mappings and check whether property exists in the schema. template static bool parse_column_mappings( YAML::Node node, const Schema& schema, const std::string& label_name, - std::vector>& column_mappings, + std::vector>& column_mappings, FUNC condition) { if (!node.IsSequence()) { LOG(ERROR) << "column_mappings should be a sequence"; @@ -87,6 +116,12 @@ static bool parse_column_mappings( LOG(ERROR) << "Expect column index for column mapping"; return false; } + std::string column_name; + if (!get_scalar(column_mapping, "name", column_name)) { + VLOG(10) << "Column name for col_id: " << column_id + << " is not set, make it empty"; + } + std::string property_name; // property name is optional. if (!get_scalar(node[i], "property", property_name)) { LOG(ERROR) << "Expect property name for column mapping"; @@ -97,7 +132,7 @@ static bool parse_column_mappings( << "the schema for label : " << label_name; return false; } - column_mappings.emplace_back(column_id, property_name); + column_mappings.emplace_back(column_id, column_name, property_name); } // If no column mapping is set, use default mapping. 
return true; @@ -107,7 +142,8 @@ static bool parse_column_mappings( static bool parse_vertex_files( YAML::Node node, const Schema& schema, const std::string& data_location, std::unordered_map>& files, - std::unordered_map>>& + std::unordered_map< + label_t, std::vector>>& vertex_mapping) { std::string label_name; if (!get_scalar(node, "type_name", label_name)) { @@ -149,8 +185,8 @@ static bool parse_vertex_files( } else { // if no column_mappings is given, use default mapping. VLOG(10) << "No vertex mapping is given, use default mapping"; - vertex_mapping.emplace(label_id, - std::vector>()); + vertex_mapping.emplace( + label_id, std::vector>()); } if (files_node) { if (!files_node.IsSequence()) { @@ -177,7 +213,8 @@ static bool parse_vertex_files( static bool parse_vertices_files_schema( YAML::Node node, const Schema& schema, const std::string& data_location, std::unordered_map>& files, - std::unordered_map>>& + std::unordered_map< + label_t, std::vector>>& column_mappings) { if (!node.IsSequence()) { LOG(ERROR) << "vertex is not set properly"; @@ -198,10 +235,10 @@ static bool parse_edge_files( std::unordered_map< std::tuple, std::vector, boost::hash>>& files, - std::unordered_map>, - boost::hash>& - edge_mapping, + std::unordered_map< + typename LoadingConfig::edge_triplet_type, + std::vector>, + boost::hash>& edge_mapping, std::unordered_map, std::vector>, boost::hash>& @@ -302,8 +339,9 @@ static bool parse_edge_files( } else { VLOG(10) << "No edge column mapping is given, use default mapping"; // use default mapping - edge_mapping.emplace(std::tuple{src_label_id, dst_label_id, edge_label_id}, - std::vector>{}); + edge_mapping.emplace( + std::tuple{src_label_id, dst_label_id, edge_label_id}, + std::vector>{}); } YAML::Node files_node = node["inputs"]; @@ -335,10 +373,10 @@ static bool parse_edges_files_schema( std::unordered_map< std::tuple, std::vector, boost::hash>>& files, - std::unordered_map>, - boost::hash>& - edge_mapping, + std::unordered_map< + typename 
LoadingConfig::edge_triplet_type, + std::vector>, + boost::hash>& edge_mapping, std::unordered_map, std::pair, std::vector>, boost::hash>& @@ -365,8 +403,8 @@ static bool parse_bulk_load_config_file(const std::string& config_file, std::string data_location; load_config.scheme_ = "file"; // default data source is file load_config.method_ = "init"; - load_config.delimiter_ = "|"; load_config.format_ = "csv"; + if (root["loading_config"]) { auto loading_config_node = root["loading_config"]; if (loading_config_node["data_source"]) { @@ -379,9 +417,44 @@ static bool parse_bulk_load_config_file(const std::string& config_file, if (format_node) { get_scalar(format_node, "type", load_config.format_); if (load_config.format_ == "csv") { - if (format_node["meta_data"]) { - get_scalar(format_node["meta_data"], "delimiter", - load_config.delimiter_); + // set default delimiter before we parsing meta_data + load_config.metadata_[reader_options::DELIMITER] = "|"; + load_config.metadata_[reader_options::HEADER_ROW] = "true"; + load_config.metadata_[reader_options::QUOTING] = "false"; + load_config.metadata_[reader_options::QUOTE_CHAR] = "\""; + load_config.metadata_[reader_options::DOUBLE_QUOTE] = "false"; + load_config.metadata_[reader_options::ESCAPE_CHAR] = "\\"; + load_config.metadata_[reader_options::ESCAPING] = "false"; + load_config.metadata_[reader_options::BATCH_SIZE_KEY] = + std::to_string(reader_options::DEFAULT_BLOCK_SIZE); + load_config.metadata_[reader_options::BATCH_READER] = "false"; + // put all key values in meta_data into metadata_ + if (format_node["metadata"]) { + auto meta_data_node = format_node["metadata"]; + if (!meta_data_node.IsMap()) { + LOG(ERROR) << "metadata should be a map"; + return false; + } + for (auto it = meta_data_node.begin(); it != meta_data_node.end(); + ++it) { + // override previous settings. 
+ auto key = it->first.as(); + VLOG(1) << "Got metadata key: " << key + << " value: " << it->second.as(); + if (reader_options::CSV_META_KEY_WORDS.find(key) != + reader_options::CSV_META_KEY_WORDS.end()) { + if (key == reader_options::BATCH_SIZE_KEY) { + // special case for block size + // parse block size (MB, b, KB, B) to bytes + auto block_size_str = it->second.as(); + auto block_size = parse_block_size(block_size_str); + load_config.metadata_[reader_options::BATCH_SIZE_KEY] = + std::to_string(block_size); + } else { + load_config.metadata_[key] = it->second.as(); + } + } + } } } else { LOG(ERROR) << "Only support csv format now"; @@ -389,13 +462,6 @@ static bool parse_bulk_load_config_file(const std::string& config_file, } } } - // only delimeter with | is supported now - if (load_config.GetValidDelimiters().find(load_config.delimiter_) == - load_config.GetValidDelimiters().end()) { - LOG(ERROR) << "Not valid delimiter: " << load_config.delimiter_ - << ", supported delimeters: '|'"; - return false; - } if (load_config.method_ != "init") { LOG(ERROR) << "Only support init method now"; return false; @@ -412,8 +478,10 @@ static bool parse_bulk_load_config_file(const std::string& config_file, } LOG(INFO) << "scheme: " << load_config.scheme_ << ", data_location: " << data_location - << ", method: " << load_config.method_ - << ", delimiter: " << load_config.delimiter_; + << ", method: " << load_config.method_ << ", delimiter: " + << load_config.metadata_[reader_options::DELIMITER] + << ", include header row: " + << load_config.metadata_[reader_options::HEADER_ROW]; if (root["vertex_mappings"]) { VLOG(10) << "vertex_mappings is set"; @@ -453,22 +521,16 @@ LoadingConfig LoadingConfig::ParseFromYaml(const Schema& schema, } LoadingConfig::LoadingConfig(const Schema& schema) - : schema_(schema), - scheme_("file"), - delimiter_("|"), - method_("init"), - format_("csv") {} + : schema_(schema), scheme_("file"), method_("init"), format_("csv") {} 
LoadingConfig::LoadingConfig(const Schema& schema, const std::string& data_source, const std::string& delimiter, const std::string& method, const std::string& format) - : schema_(schema), - scheme_(data_source), - delimiter_(delimiter), - method_(method), - format_(format) {} + : schema_(schema), scheme_(data_source), method_(method), format_(format) { + metadata_[reader_options::DELIMITER] = delimiter; +} bool LoadingConfig::AddVertexSources(const std::string& label, const std::string& file_path) { @@ -495,18 +557,63 @@ bool LoadingConfig::AddEdgeSources(const std::string& src_label, } void LoadingConfig::SetScheme(const std::string& scheme) { scheme_ = scheme; } -void LoadingConfig::SetDelimiter(const std::string& delimiter) { - delimiter_ = delimiter; +void LoadingConfig::SetDelimiter(const char& delimiter) { + metadata_[reader_options::DELIMITER] = std::string(1, delimiter); } void LoadingConfig::SetMethod(const std::string& method) { method_ = method; } // getters const std::string& LoadingConfig::GetScheme() const { return scheme_; } -const std::string& LoadingConfig::GetDelimiter() const { return delimiter_; } +const std::string& LoadingConfig::GetDelimiter() const { + return metadata_.at(reader_options::DELIMITER); +} + +bool LoadingConfig::GetHasHeaderRow() const { + auto str = metadata_.at(reader_options::HEADER_ROW); + return str == "true" || str == "True" || str == "TRUE"; +} + +const std::string& LoadingConfig::GetFormat() const { return format_; } const std::string& LoadingConfig::GetMethod() const { return method_; } +const std::string& LoadingConfig::GetEscapeChar() const { + return metadata_.at(reader_options::ESCAPE_CHAR); +} + +bool LoadingConfig::GetIsEscaping() const { + auto str = metadata_.at(reader_options::ESCAPING); + return str == "true" || str == "True" || str == "TRUE"; +} + +const std::string& LoadingConfig::GetQuotingChar() const { + return metadata_.at(reader_options::QUOTE_CHAR); +} + +bool LoadingConfig::GetIsQuoting() const { + 
auto str = metadata_.at(reader_options::QUOTING); + return str == "true" || str == "True" || str == "TRUE"; +} + +bool LoadingConfig::GetIsDoubleQuoting() const { + auto str = metadata_.at(reader_options::DOUBLE_QUOTE); + return str == "true" || str == "True" || str == "TRUE"; +} + +int32_t LoadingConfig::GetBatchSize() const { + if (metadata_.find(reader_options::BATCH_SIZE_KEY) == metadata_.end()) { + return reader_options::DEFAULT_BLOCK_SIZE; + } + auto str = metadata_.at(reader_options::BATCH_SIZE_KEY); + return std::stoi(str); +} + +bool LoadingConfig::GetIsBatchReader() const { + auto str = metadata_.at(reader_options::BATCH_READER); + return str == "true" || str == "True" || str == "TRUE"; +} + const std::unordered_map>& LoadingConfig::GetVertexLoadingMeta() const { @@ -520,14 +627,14 @@ LoadingConfig::GetEdgeLoadingMeta() const { return edge_loading_meta_; } -const std::vector>& +const std::vector>& LoadingConfig::GetVertexColumnMappings(label_t label_id) const { CHECK(vertex_column_mappings_.find(label_id) != vertex_column_mappings_.end()); return vertex_column_mappings_.at(label_id); } -const std::vector>& +const std::vector>& LoadingConfig::GetEdgeColumnMappings(label_t src_label_id, label_t dst_label_id, label_t edge_label_id) const { auto key = std::make_tuple(src_label_id, dst_label_id, edge_label_id); @@ -543,10 +650,4 @@ LoadingConfig::GetEdgeSrcDstCol(label_t src_label_id, label_t dst_label_id, return edge_src_dst_col_.at(key); } -const std::unordered_set& LoadingConfig::GetValidDelimiters() { - return LoadingConfig::valid_delimiter_; -} -// define delimeter here -const std::unordered_set LoadingConfig::valid_delimiter_ = {"|"}; - } // namespace gs diff --git a/flex/storages/rt_mutable_graph/loading_config.h b/flex/storages/rt_mutable_graph/loading_config.h index f5193e6105f4..15f2544fae2b 100644 --- a/flex/storages/rt_mutable_graph/loading_config.h +++ b/flex/storages/rt_mutable_graph/loading_config.h @@ -22,11 +22,41 @@ #include #include #include 
+#include "arrow/api.h" +#include "arrow/csv/options.h" #include "flex/storages/rt_mutable_graph/schema.h" +#include "flex/utils/arrow_utils.h" #include "flex/utils/yaml_utils.h" +#include "boost/algorithm/string.hpp" + namespace gs { +namespace reader_options { +static const int32_t DEFAULT_BLOCK_SIZE = (1 << 20); // 1MB + +// KEY_WORDS for configurations +static const char* DELIMITER = "delimiter"; +static const char* HEADER_ROW = "header_row"; +static const char* INCLUDE_COLUMNS = "include_columns"; +static const char* COLUMN_TYPES = "column_types"; +static const char* ESCAPING = "escaping"; +static const char* ESCAPE_CHAR = "escape_char"; +static const char* QUOTING = "quoting"; +static const char* QUOTE_CHAR = "quote_char"; +static const char* DOUBLE_QUOTE = "double_quote"; +static const char* BATCH_SIZE_KEY = "batch_size"; +// whether or not to use record batch reader. If true, the reader will read +// data in batches, otherwise, the reader will read data row by row. +static const char* BATCH_READER = "batch_reader"; + +static const std::unordered_set CSV_META_KEY_WORDS = { + DELIMITER, HEADER_ROW, INCLUDE_COLUMNS, COLUMN_TYPES, + ESCAPING, ESCAPE_CHAR, QUOTING, QUOTE_CHAR, + DOUBLE_QUOTE, BATCH_SIZE_KEY, BATCH_READER}; + +} // namespace reader_options + class LoadingConfig; namespace config_parsing { @@ -42,7 +72,6 @@ class LoadingConfig { using edge_triplet_type = std::tuple; // src_label_t, dst_label_t, edge_label_t - static const std::unordered_set valid_delimiter_; // Check whether loading config file is consistent with schema static LoadingConfig ParseFromYaml(const Schema& schema, @@ -66,13 +95,22 @@ class LoadingConfig { size_t dst_pri_key_ind, const std::string& file_path); void SetScheme(const std::string& data_source); - void SetDelimiter(const std::string& delimiter); + void SetDelimiter(const char& delimiter); void SetMethod(const std::string& method); // getters const std::string& GetScheme() const; const std::string& GetDelimiter() const; const 
std::string& GetMethod() const; + const std::string& GetFormat() const; + bool GetHasHeaderRow() const; + const std::string& GetEscapeChar() const; + bool GetIsEscaping() const; + const std::string& GetQuotingChar() const; + bool GetIsQuoting() const; + bool GetIsDoubleQuoting() const; + int32_t GetBatchSize() const; + bool GetIsBatchReader() const; const std::unordered_map>& GetVertexLoadingMeta() const; const std::unordered_map, @@ -81,33 +119,36 @@ class LoadingConfig { // Get vertex column mappings. Each element in the vector is a pair of // . - const std::vector>& GetVertexColumnMappings( - label_t label_id) const; + const std::vector>& + GetVertexColumnMappings(label_t label_id) const; // Get edge column mappings. Each element in the vector is a pair of - // . - const std::vector>& GetEdgeColumnMappings( - label_t src_label_id, label_t dst_label_id, label_t edge_label_id) const; + // . + const std::vector>& + GetEdgeColumnMappings(label_t src_label_id, label_t dst_label_id, + label_t edge_label_id) const; // Get src_id and dst_id column index for edge label. const std::pair, std::vector>& GetEdgeSrcDstCol( label_t src_label_id, label_t dst_label_id, label_t edge_label_id) const; - static const std::unordered_set& GetValidDelimiters(); - private: const Schema& schema_; - std::string scheme_; // "file", "hdfs", "oss", "s3" - std::string delimiter_; // "\t", ",", " ", "|" - std::string method_; // init, append, overwrite - std::string format_; // csv, tsv, json, parquet + std::string scheme_; // "file", "hdfs", "oss", "s3" + std::string method_; // init, append, overwrite + std::string format_; // csv, tsv, json, parquet + + // meta_data, stores all the meta info about loading + std::unordered_map metadata_; std::unordered_map> vertex_loading_meta_; // > std::unordered_map>> + std::vector>> vertex_column_mappings_; // match which column in file to which property - // in schema + // in schema. 
{col_ind, col_name, + // schema_prop_name} + // col_name can be empty std::unordered_map, boost::hash> @@ -116,10 +157,11 @@ class LoadingConfig { // // All Edge Files share the same File schema. std::unordered_map>, + std::vector>, boost::hash> edge_column_mappings_; // match which column in file to which property in - // schema + // schema, {col_ind, col_name, schema_prop_name} + // col_name can be empty std::unordered_map, std::vector>, diff --git a/flex/storages/rt_mutable_graph/mutable_csr.h b/flex/storages/rt_mutable_graph/mutable_csr.h index 5fa6b245103a..2ef6569eeff5 100644 --- a/flex/storages/rt_mutable_graph/mutable_csr.h +++ b/flex/storages/rt_mutable_graph/mutable_csr.h @@ -703,6 +703,51 @@ class SingleMutableCsr : public TypedMutableCsrBase { mmap_array nbr_list_; }; +template +class EmptyCsr : public TypedMutableCsrBase { + using slice_t = MutableNbrSlice; + + public: + EmptyCsr() = default; + ~EmptyCsr() = default; + + void batch_init(vid_t vnum, const std::vector& degree) override {} + + slice_t get_edges(vid_t i) const override { return slice_t::empty(); } + + void put_generic_edge(vid_t src, vid_t dst, const Any& data, timestamp_t ts, + ArenaAllocator& alloc) override {} + + void Serialize(const std::string& path) override {} + + void Deserialize(const std::string& path) override {} + + void batch_put_edge(vid_t src, vid_t dst, const EDATA_T& data, + timestamp_t ts = 0) override {} + + void ingest_edge(vid_t src, vid_t dst, grape::OutArchive& arc, timestamp_t ts, + ArenaAllocator& alloc) override { + EDATA_T value; + arc >> value; + } + + void peek_ingest_edge(vid_t src, vid_t dst, grape::OutArchive& arc, + const timestamp_t ts, ArenaAllocator& alloc) override {} + + std::shared_ptr edge_iter( + vid_t v) const override { + return std::make_shared>( + MutableNbrSlice::empty()); + } + MutableCsrConstEdgeIterBase* edge_iter_raw(vid_t v) const override { + return new TypedMutableCsrConstEdgeIter( + MutableNbrSlice::empty()); + } + std::shared_ptr 
edge_iter_mut(vid_t v) override { + return std::make_shared>( + MutableNbrSliceMut::empty()); + } +}; } // namespace gs #endif // GRAPHSCOPE_GRAPH_MUTABLE_CSR_H_ diff --git a/flex/storages/rt_mutable_graph/mutable_property_fragment.cc b/flex/storages/rt_mutable_graph/mutable_property_fragment.cc index 6312cb52f822..b4b6142ab7a8 100644 --- a/flex/storages/rt_mutable_graph/mutable_property_fragment.cc +++ b/flex/storages/rt_mutable_graph/mutable_property_fragment.cc @@ -15,44 +15,10 @@ #include "flex/storages/rt_mutable_graph/mutable_property_fragment.h" -namespace gs { - -void preprocess_line(char* line) { - size_t len = strlen(line); - while (len >= 0) { - if (line[len] != '\0' && line[len] != '\n' && line[len] != '\r' && - line[len] != ' ' && line[len] != '\t') { - break; - } else { - --len; - } - } - line[len + 1] = '\0'; -} +#include "flex/engines/hqps_db/core/utils/hqps_utils.h" +#include "flex/utils/property/types.h" -void get_header_row(const std::string& file_name, std::vector& header) { - char line_buf[4096]; - FILE* fin = fopen(file_name.c_str(), "r"); - if (fgets(line_buf, 4096, fin) == NULL) { - LOG(FATAL) << "Failed to read header from file: " << file_name; - return; - } - preprocess_line(line_buf); - ParseRecord(line_buf, header); -} - -std::vector> generate_default_column_mapping( - const std::string& file_name, std::string primary_key_name, - const std::vector& column_names) { - std::vector> column_mapping; - for (size_t i = 0; i < column_names.size(); ++i) { - auto col_name = column_names[i]; - if (col_name != primary_key_name) { - column_mapping.emplace_back(i, col_name); - } - } - return column_mapping; -} +namespace gs { MutablePropertyFragment::MutablePropertyFragment() {} @@ -69,471 +35,6 @@ MutablePropertyFragment::~MutablePropertyFragment() { } } -// vertex_column_mappings is a vector of pairs, each pair is (column_ind in -// file, the cooresponding property name in schema). 
-void MutablePropertyFragment::initVertices( - label_t v_label_i, const std::vector& filenames, - const std::vector>& vertex_column_mappings) { - // Check primary key num and type. - auto primary_keys = schema_.get_vertex_primary_key(v_label_i); - if (primary_keys.size() != 1) { - LOG(FATAL) << "Only support one primary key for vertex."; - } - if (primary_keys[0].first != PropertyType::kInt64) { - LOG(FATAL) << "Only support int64_t primary key for vertex."; - } - IdIndexer indexer; - std::string v_label_name = schema_.get_vertex_label_name(v_label_i); - VLOG(10) << "Start init vertices for label " << v_label_i << " with " - << filenames.size() << " files."; - auto& table = vertex_data_[v_label_i]; - auto& property_types = schema_.get_vertex_properties(v_label_name); - auto& property_names = schema_.get_vertex_property_names(v_label_name); - - // col num should be property num - 1, because one column will be used as - // primary key - CHECK(property_types.size() > 0); - size_t col_num = property_types.size(); - - // create real property_types_vec for table - VLOG(10) << "Init table for table: " << v_label_name - << ", with property num: " << col_num; - table.init(property_names, property_types, - schema_.get_vertex_storage_strategies(v_label_name), - schema_.get_max_vnum(v_label_name)); - // Match the records read from the file with the schema - parseVertexFiles(v_label_name, filenames, vertex_column_mappings, indexer); - if (indexer.bucket_count() == 0) { - indexer._rehash(schema_.get_max_vnum(v_label_name)); - } - build_lf_indexer(indexer, lf_indexers_[v_label_i]); -} - -template -class EmptyCsr : public TypedMutableCsrBase { - using slice_t = MutableNbrSlice; - - public: - EmptyCsr() = default; - ~EmptyCsr() = default; - - void batch_init(vid_t vnum, const std::vector& degree) override {} - - slice_t get_edges(vid_t i) const override { return slice_t::empty(); } - - void put_generic_edge(vid_t src, vid_t dst, const Any& data, timestamp_t ts, - ArenaAllocator& 
alloc) override {} - - void Serialize(const std::string& path) override {} - - void Deserialize(const std::string& path) override {} - - void batch_put_edge(vid_t src, vid_t dst, const EDATA_T& data, - timestamp_t ts = 0) override {} - - void ingest_edge(vid_t src, vid_t dst, grape::OutArchive& arc, timestamp_t ts, - ArenaAllocator& alloc) override { - EDATA_T value; - arc >> value; - } - - void peek_ingest_edge(vid_t src, vid_t dst, grape::OutArchive& arc, - const timestamp_t ts, ArenaAllocator& alloc) override {} - - std::shared_ptr edge_iter( - vid_t v) const override { - return std::make_shared>( - MutableNbrSlice::empty()); - } - MutableCsrConstEdgeIterBase* edge_iter_raw(vid_t v) const override { - return new TypedMutableCsrConstEdgeIter( - MutableNbrSlice::empty()); - } - std::shared_ptr edge_iter_mut(vid_t v) override { - return std::make_shared>( - MutableNbrSliceMut::empty()); - } -}; - -template -TypedMutableCsrBase* create_typed_csr(EdgeStrategy es) { - if (es == EdgeStrategy::kSingle) { - return new SingleMutableCsr(); - } else if (es == EdgeStrategy::kMultiple) { - return new MutableCsr(); - } else if (es == EdgeStrategy::kNone) { - return new EmptyCsr(); - } - LOG(FATAL) << "not support edge strategy or edge data type"; -} - -template -std::pair construct_empty_csr( - EdgeStrategy ie_strategy, EdgeStrategy oe_strategy) { - TypedMutableCsrBase* ie_csr = create_typed_csr(ie_strategy); - TypedMutableCsrBase* oe_csr = create_typed_csr(oe_strategy); - ie_csr->batch_init(0, {}); - oe_csr->batch_init(0, {}); - return std::make_pair(ie_csr, oe_csr); -} - -// each file name is tuple src_column_id indicate which column is src id, dst_column_id -// indicate which column is dst id default is 0, 1 -template -std::pair construct_csr( - const Schema& schema, const std::vector& filenames, - size_t src_col_ind, size_t dst_col_ind, - const std::vector& property_types, - const std::vector>& column_mappings, - EdgeStrategy ie_strategy, EdgeStrategy oe_strategy, - const 
LFIndexer& src_indexer, const LFIndexer& dst_indexer) { - TypedMutableCsrBase* ie_csr = create_typed_csr(ie_strategy); - TypedMutableCsrBase* oe_csr = create_typed_csr(oe_strategy); - - std::vector odegree(src_indexer.size(), 0); - std::vector idegree(dst_indexer.size(), 0); - - std::vector> parsed_edges; - vid_t src_index, dst_index; - char line_buf[4096]; - oid_t src, dst; - EDATA_T data; - - size_t col_num = property_types.size(); - std::vector header(col_num + 2); - for (auto& item : header) { - item.type = PropertyType::kString; - } - - // fetch header first - get_header_row(filenames[0], header); // filenames must not be empty - // check header matches schema - - for (auto filename : filenames) { - VLOG(10) << "processing " << filename << " with src_col_id " << src_col_ind - << " and dst_col_id " << dst_col_ind; - FILE* fin = fopen(filename.c_str(), "r"); - if (fgets(line_buf, 4096, fin) == NULL) { - continue; - } - preprocess_line(line_buf); // do nothing - - // if match the default configuration, use ParseRecordX to fasten the - // parsing - if (src_col_ind == 0 && dst_col_ind == 1) { - while (fgets(line_buf, 4096, fin) != NULL) { - // ParseRecord src_id, dst_id, data from row. - ParseRecordX(line_buf, src, dst, data); - src_index = src_indexer.get_index(src); - dst_index = dst_indexer.get_index(dst); - ++idegree[dst_index]; - ++odegree[src_index]; - parsed_edges.emplace_back(src_index, dst_index, data); - } - } else { - std::vector row(col_num + 2); - CHECK(src_col_ind < row.size() && dst_col_ind < row.size()); - row[src_col_ind].type = PropertyType::kInt64; - row[dst_col_ind].type = PropertyType::kInt64; - int32_t data_col_id = -1; - // the left column is must the edata. 
- for (auto i = 0; i < row.size(); ++i) { - if (row[i].type == PropertyType::kEmpty) { - // The index 0's type must exists - row[i].type = property_types[0]; - data_col_id = i; - break; - } - } - CHECK(data_col_id != -1); - while (fgets(line_buf, 4096, fin) != NULL) { - // ParseRecord src_id, dst_id, data from row. - ParseRecord(line_buf, row); - src_index = src_indexer.get_index(row[src_col_ind].AsInt64()); - dst_index = dst_indexer.get_index(row[dst_col_ind].AsInt64()); - ConvertAny::to(row[data_col_id], data); - ++idegree[dst_index]; - ++odegree[src_index]; - parsed_edges.emplace_back(src_index, dst_index, data); - } - } - - fclose(fin); - } - - ie_csr->batch_init(dst_indexer.size(), idegree); - oe_csr->batch_init(src_indexer.size(), odegree); - - for (auto& edge : parsed_edges) { - ie_csr->batch_put_edge(std::get<1>(edge), std::get<0>(edge), - std::get<2>(edge)); - oe_csr->batch_put_edge(std::get<0>(edge), std::get<1>(edge), - std::get<2>(edge)); - } - - return std::make_pair(ie_csr, oe_csr); -} - -void MutablePropertyFragment::initEdges( - label_t src_label_i, label_t dst_label_i, label_t edge_label_i, - const std::vector& filenames, - const std::vector>& column_mappings, - size_t src_col_ind, size_t dst_col_ind) { - auto src_label_name = schema_.get_vertex_label_name(src_label_i); - auto dst_label_name = schema_.get_vertex_label_name(dst_label_i); - auto edge_label_name = schema_.get_edge_label_name(edge_label_i); - if (filenames.size() <= 0) { - LOG(FATAL) << "No edge files found for src label: " << src_label_name - << " dst label: " << dst_label_name - << " edge label: " << edge_label_name; - } - if (filenames.size() <= 0) { - LOG(FATAL) << "No edge files found for src label: " << src_label_name - << " dst label: " << dst_label_name - << " edge label: " << edge_label_name; - } - VLOG(10) << "Init edges src label: " << src_label_name - << " dst label: " << dst_label_name - << " edge label: " << edge_label_name - << " filenames: " << filenames.size(); - auto& 
property_types = schema_.get_edge_properties( - src_label_name, dst_label_name, edge_label_name); - size_t col_num = property_types.size(); - CHECK_LE(col_num, 1) << "Only single or no property is supported for edge."; - - size_t index = src_label_i * vertex_label_num_ * edge_label_num_ + - dst_label_i * edge_label_num_ + edge_label_i; - EdgeStrategy oe_strtagy = schema_.get_outgoing_edge_strategy( - src_label_name, dst_label_name, edge_label_name); - EdgeStrategy ie_strtagy = schema_.get_incoming_edge_strategy( - src_label_name, dst_label_name, edge_label_name); - - { - // check column mappings consistent, - // TODO(zhanglei): Check column mappings after multiple property on edge is - // supported - if (column_mappings.size() > 1) { - LOG(FATAL) << "Edge column mapping must be less than 1"; - } - if (column_mappings.size() > 0) { - auto& mapping = column_mappings[0]; - if (mapping.first == src_col_ind || mapping.first == dst_col_ind) { - LOG(FATAL) << "Edge column mappings must not contain src_col_ind or " - "dst_col_ind"; - } - // check property exists in schema - if (!schema_.edge_has_property(src_label_name, dst_label_name, - edge_label_name, mapping.second)) { - LOG(FATAL) << "property " << mapping.second - << " not exists in schema for edge triplet " - << src_label_name << " -> " << edge_label_name << " -> " - << dst_label_name; - } - } - } - - if (col_num == 0) { - if (filenames.empty()) { - std::tie(ie_[index], oe_[index]) = - construct_empty_csr(ie_strtagy, oe_strtagy); - } else { - std::tie(ie_[index], oe_[index]) = construct_csr( - schema_, filenames, src_col_ind, dst_col_ind, property_types, - column_mappings, ie_strtagy, oe_strtagy, lf_indexers_[src_label_i], - lf_indexers_[dst_label_i]); - } - } else if (property_types[0] == PropertyType::kDate) { - if (filenames.empty()) { - std::tie(ie_[index], oe_[index]) = - construct_empty_csr(ie_strtagy, oe_strtagy); - } else { - std::tie(ie_[index], oe_[index]) = construct_csr( - schema_, filenames, 
src_col_ind, dst_col_ind, property_types, - column_mappings, ie_strtagy, oe_strtagy, lf_indexers_[src_label_i], - lf_indexers_[dst_label_i]); - } - } else if (property_types[0] == PropertyType::kInt32) { - if (filenames.empty()) { - std::tie(ie_[index], oe_[index]) = - construct_empty_csr(ie_strtagy, oe_strtagy); - } else { - std::tie(ie_[index], oe_[index]) = construct_csr( - schema_, filenames, src_col_ind, dst_col_ind, property_types, - column_mappings, ie_strtagy, oe_strtagy, lf_indexers_[src_label_i], - lf_indexers_[dst_label_i]); - } - } else if (property_types[0] == PropertyType::kInt64) { - if (filenames.empty()) { - std::tie(ie_[index], oe_[index]) = - construct_empty_csr(ie_strtagy, oe_strtagy); - } else { - std::tie(ie_[index], oe_[index]) = construct_csr( - schema_, filenames, src_col_ind, dst_col_ind, property_types, - column_mappings, ie_strtagy, oe_strtagy, lf_indexers_[src_label_i], - lf_indexers_[dst_label_i]); - } - } else if (property_types[0] == PropertyType::kString) { - if (filenames.empty()) { - std::tie(ie_[index], oe_[index]) = - construct_empty_csr(ie_strtagy, oe_strtagy); - } else { - LOG(FATAL) << "Unsupported edge property type."; - } - } else if (property_types[0] == PropertyType::kDouble) { - if (filenames.empty()) { - std::tie(ie_[index], oe_[index]) = - construct_empty_csr(ie_strtagy, oe_strtagy); - } else { - std::tie(ie_[index], oe_[index]) = construct_csr( - schema_, filenames, src_col_ind, dst_col_ind, property_types, - column_mappings, ie_strtagy, oe_strtagy, lf_indexers_[src_label_i], - lf_indexers_[dst_label_i]); - } - } else { - LOG(FATAL) << "Unsupported edge property type."; - } -} - -void MutablePropertyFragment::Init(const Schema& schema, - const LoadingConfig& loading_config, - int thread_num) { - schema_ = schema; - vertex_label_num_ = schema_.vertex_label_num(); - edge_label_num_ = schema_.edge_label_num(); - vertex_data_.resize(vertex_label_num_); - ie_.resize(vertex_label_num_ * vertex_label_num_ * edge_label_num_, 
NULL); - oe_.resize(vertex_label_num_ * vertex_label_num_ * edge_label_num_, NULL); - lf_indexers_.resize(vertex_label_num_); - - auto& vertex_sources = loading_config.GetVertexLoadingMeta(); - auto& edge_sources = loading_config.GetEdgeLoadingMeta(); - - if (thread_num == 1) { - if (vertex_sources.empty()) { - LOG(INFO) << "Skip loading vertices since no vertex source is specified."; - } else { - for (auto iter = vertex_sources.begin(); iter != vertex_sources.end(); - ++iter) { - auto v_label_id = iter->first; - auto v_files = iter->second; - initVertices(v_label_id, v_files, - loading_config.GetVertexColumnMappings(v_label_id)); - } - } - - if (edge_sources.empty()) { - LOG(INFO) << "Skip loading edges since no edge source is specified."; - } else { - LOG(INFO) << "Loading edges..."; - for (auto iter = edge_sources.begin(); iter != edge_sources.end(); - ++iter) { - // initEdges(iter->first, iter->second); - auto& src_label_id = std::get<0>(iter->first); - auto& dst_label_id = std::get<1>(iter->first); - auto& e_label_id = std::get<2>(iter->first); - auto& e_files = iter->second; - auto src_dst_col_pair = loading_config.GetEdgeSrcDstCol( - src_label_id, dst_label_id, e_label_id); - // We currenly only support one src primary key and one dst primary key - if (src_dst_col_pair.first.size() != 1 || - src_dst_col_pair.second.size() != 1) { - LOG(FATAL) << "We currenly only support one src primary key and one " - "dst primary key"; - } - initEdges(src_label_id, dst_label_id, e_label_id, e_files, - loading_config.GetEdgeColumnMappings( - src_label_id, dst_label_id, e_label_id), - src_dst_col_pair.first[0], src_dst_col_pair.second[0]); - } - } - - } else { - // copy vertex_sources and edge sources to vector, since we need to - // use multi-thread loading. 
- std::vector>> vertex_files; - for (auto iter = vertex_sources.begin(); iter != vertex_sources.end(); - ++iter) { - vertex_files.emplace_back(iter->first, iter->second); - } - std::vector>> - edge_files; - for (auto iter = edge_sources.begin(); iter != edge_sources.end(); ++iter) { - edge_files.emplace_back(iter->first, iter->second); - } - LOG(INFO) << "Parallel loading with " << thread_num << " threads, " - << " " << vertex_files.size() << " vertex files, " - << edge_files.size() << " edge files."; - { - if (vertex_sources.empty()) { - LOG(INFO) - << "Skip loading vertices since no vertex source is specified."; - } else { - std::atomic v_ind(0); - std::vector threads(thread_num); - for (int i = 0; i < thread_num; ++i) { - threads[i] = std::thread([&]() { - while (true) { - size_t cur = v_ind.fetch_add(1); - if (cur >= vertex_files.size()) { - break; - } - auto v_label_id = vertex_files[cur].first; - initVertices(v_label_id, vertex_files[cur].second, - loading_config.GetVertexColumnMappings(v_label_id)); - } - }); - } - for (auto& thrd : threads) { - thrd.join(); - } - - LOG(INFO) << "finished loading vertices"; - } - } - { - if (edge_sources.empty()) { - LOG(INFO) << "Skip loading edges since no edge source is specified."; - } else { - std::atomic e_ind(0); - std::vector threads(thread_num); - for (int i = 0; i < thread_num; ++i) { - threads[i] = std::thread([&]() { - while (true) { - size_t cur = e_ind.fetch_add(1); - if (cur >= edge_files.size()) { - break; - } - auto& edge_file = edge_files[cur]; - auto src_label_id = std::get<0>(edge_file.first); - auto dst_label_id = std::get<1>(edge_file.first); - auto e_label_id = std::get<2>(edge_file.first); - auto& file_names = edge_file.second; - auto src_dst_col_pair = loading_config.GetEdgeSrcDstCol( - src_label_id, dst_label_id, e_label_id); - if (src_dst_col_pair.first.size() != 1 || - src_dst_col_pair.second.size() != 1) { - LOG(FATAL) - << "We currenly only support one src primary key and one " - "dst primary 
key"; - } - initEdges(src_label_id, dst_label_id, e_label_id, file_names, - loading_config.GetEdgeColumnMappings( - src_label_id, dst_label_id, e_label_id), - src_dst_col_pair.first[0], src_dst_col_pair.second[0]); - } - }); - } - for (auto& thrd : threads) { - thrd.join(); - } - LOG(INFO) << "finished loading edges"; - } - } - } -} - void MutablePropertyFragment::IngestEdge(label_t src_label, vid_t src_lid, label_t dst_label, vid_t dst_lid, label_t edge_label, timestamp_t ts, @@ -802,150 +303,4 @@ const MutableCsrBase* MutablePropertyFragment::get_ie_csr( return ie_[index]; } -void MutablePropertyFragment::parseVertexFiles( - const std::string& vertex_label, const std::vector& filenames, - const std::vector>& - in_vertex_column_mappings, - IdIndexer& indexer) { - if (filenames.empty()) { - return; - } - LOG(INFO) << "Parsing vertex files for label " << vertex_label; - auto vertex_column_mappings = in_vertex_column_mappings; - - size_t label_index = schema_.get_vertex_label_id(vertex_label); - auto& table = vertex_data_[label_index]; - auto& property_types = schema_.get_vertex_properties(vertex_label); - size_t col_num = property_types.size(); - auto primary_key = schema_.get_vertex_primary_key(label_index)[0]; - auto primary_key_name = primary_key.second; - - // vertex_column_mappings can be empty, empty means the each column in the - // file is mapped to the same column in the table. 
- std::vector header(col_num + 1); - { - for (auto i = 0; i < header.size(); ++i) { - header[i].type = PropertyType::kString; - } - } - std::vector properties(col_num + 1); - std::vector column_names(col_num + 1); - size_t primary_key_ind = col_num + 1; - - // First get header - get_header_row(filenames[0], header); - // construct column_names - for (auto i = 0; i < header.size(); ++i) { - column_names[i] = - std::string(header[i].value.s.data(), header[i].value.s.size()); - } - - if (vertex_column_mappings.empty()) { - vertex_column_mappings = generate_default_column_mapping( - filenames[0], primary_key_name, column_names); - VLOG(10) << "vertex_column_mappings is empty, " - "generate_default_column_mapping returns " - << vertex_column_mappings.size() << " mappings"; - } - for (auto i = 0; i < properties.size(); ++i) { - if (column_names[i] == primary_key_name) { - primary_key_ind = i; - break; - } - VLOG(10) << " compare: " << column_names[i] << " " << primary_key_name; - } - CHECK(primary_key_ind != col_num + 1); - { - // reset header of table with primary key removed - std::vector header_col_names; - for (auto i = 0; i < column_names.size(); ++i) { - if (i != primary_key_ind) { - header_col_names.emplace_back(column_names[i]); - } - } - table.reset_header(header_col_names); - VLOG(10) << "reset header of table with primary key removed: " - << header_col_names.size(); - } - - for (auto i = 0; i < properties.size(); ++i) { - if (i < primary_key_ind) { - properties[i].type = property_types[i]; - } else if (i > primary_key_ind) { - properties[i].type = property_types[i - 1]; - } else { - properties[i].type = primary_key.first; - } - } - - char line_buf[4096]; - // we can't assume oid will be the first column. - oid_t oid; - vid_t v_index; - - std::vector file_col_to_schema_col_ind; - { - // parse from vertex_column_mappings, vertex_column_mappings doesn't - // contains primary key. 
- size_t max_ind = 0; - for (auto& pair : vertex_column_mappings) { - max_ind = std::max(max_ind, pair.first); - } - file_col_to_schema_col_ind.resize(max_ind + 1, -1); - for (auto& pair : vertex_column_mappings) { - // if meet primary key, skip it. - if (pair.second == primary_key_name) { - VLOG(10) << "Skip primary key column " << pair.first << ", " - << pair.second; - continue; - } - if (file_col_to_schema_col_ind[pair.first] == -1) { - if (schema_.vertex_has_property(vertex_label, pair.second)) { - auto& prop_names = schema_.get_vertex_property_names(vertex_label); - // find index of pair.second in prop_names - auto iter = - std::find(prop_names.begin(), prop_names.end(), pair.second); - // must be a valid iter. - if (iter == prop_names.end()) { - LOG(FATAL) << "Column " << pair.first << " is mapped to a column " - << "that does not exist in schema: " << pair.second; - } - file_col_to_schema_col_ind[pair.first] = - std::distance(prop_names.begin(), iter); - VLOG(10) << "Column " << std::to_string(pair.first) - << " is mapped to column " << pair.second << " in schema.: " - << std::to_string(file_col_to_schema_col_ind[pair.first]); - } else { - LOG(FATAL) << "Column " << pair.first << " is mapped to a column " - << "that does not exist in schema: " << pair.second; - } - } else { - LOG(FATAL) << "Column " << pair.first << " is mapped to multiple " - << "columns in bulk loading file."; - } - } - } - - for (auto filename : filenames) { - VLOG(10) << "Processing file: " << filename; - FILE* fin = fopen(filename.c_str(), "r"); - // Just read first line, and do nothing, the header of file is not needed. 
- if (fgets(line_buf, 4096, fin) == NULL) { - continue; - } - preprocess_line(line_buf); - while (fgets(line_buf, 4096, fin) != NULL) { - preprocess_line(line_buf); - ParseRecord(line_buf, properties); - oid = properties[primary_key_ind].AsInt64(); - if (indexer.add(oid, v_index)) { - // insert properties except for primary_key_ind - table.insert(v_index, properties, file_col_to_schema_col_ind); - } - } - - fclose(fin); - } -} - } // namespace gs diff --git a/flex/storages/rt_mutable_graph/mutable_property_fragment.h b/flex/storages/rt_mutable_graph/mutable_property_fragment.h index 67e34df41cdd..6fd2a7f4ae59 100644 --- a/flex/storages/rt_mutable_graph/mutable_property_fragment.h +++ b/flex/storages/rt_mutable_graph/mutable_property_fragment.h @@ -22,9 +22,9 @@ #include "flex/storages/rt_mutable_graph/schema.h" -#include "flex/storages/rt_mutable_graph/loading_config.h" #include "flex/storages/rt_mutable_graph/mutable_csr.h" #include "flex/storages/rt_mutable_graph/types.h" +#include "flex/utils/arrow_utils.h" #include "flex/utils/id_indexer.h" #include "flex/utils/property/table.h" #include "flex/utils/yaml_utils.h" @@ -39,9 +39,6 @@ class MutablePropertyFragment { ~MutablePropertyFragment(); - void Init(const Schema& schema, const LoadingConfig& loading_config, - int thread_num = 1); - void IngestEdge(label_t src_label, vid_t src_lid, label_t dst_label, vid_t dst_lid, label_t edge_label, timestamp_t ts, grape::OutArchive& arc, ArenaAllocator& alloc); @@ -95,23 +92,6 @@ class MutablePropertyFragment { const MutableCsrBase* get_ie_csr(label_t label, label_t neighbor_label, label_t edge_label) const; - void parseVertexFiles( - const std::string& vertex_label, - const std::vector& filenames, - const std::vector>& vertex_column_mappings, - IdIndexer& indexer); - - void initVertices(label_t v_label_i, - const std::vector& vertex_files, - const std::vector>& - vertex_column_mappings); - - void initEdges( - label_t src_label_i, label_t dst_label_i, label_t edge_label_i, 
- const std::vector& edge_files, - const std::vector>& edge_col_mappings, - size_t src_col_id, size_t dst_col_id); - Schema schema_; std::vector> lf_indexers_; std::vector ie_, oe_; diff --git a/flex/storages/rt_mutable_graph/schema.cc b/flex/storages/rt_mutable_graph/schema.cc index 26e5bc7e8d1b..b42f344dd519 100644 --- a/flex/storages/rt_mutable_graph/schema.cc +++ b/flex/storages/rt_mutable_graph/schema.cc @@ -25,7 +25,8 @@ Schema::~Schema() = default; void Schema::add_vertex_label( const std::string& label, const std::vector& property_types, const std::vector& property_names, - const std::vector>& primary_key, + const std::vector>& + primary_key, const std::vector& strategies, size_t max_vnum) { label_t v_label_id = vertex_label_to_index(label); vproperties_[v_label_id] = property_types; @@ -103,6 +104,12 @@ const std::vector& Schema::get_vertex_property_names( return vprop_names_[index]; } +const std::vector& Schema::get_vertex_property_names( + label_t label) const { + CHECK(label < vprop_names_.size()); + return vprop_names_[label]; +} + const std::vector& Schema::get_vertex_storage_strategies( const std::string& label) const { label_t index; @@ -137,6 +144,15 @@ const std::vector& Schema::get_edge_properties( return eproperties_.at(index); } +const std::vector& Schema::get_edge_properties( + label_t src_label, label_t dst_label, label_t label) const { + CHECK(src_label < vlabel_indexer_.size()); + CHECK(dst_label < vlabel_indexer_.size()); + CHECK(label < elabel_indexer_.size()); + uint32_t index = generate_edge_label(src_label, dst_label, label); + return eproperties_.at(index); +} + PropertyType Schema::get_edge_property(label_t src, label_t dst, label_t edge) const { uint32_t index = generate_edge_label(src, dst, edge); @@ -154,6 +170,16 @@ const std::vector& Schema::get_edge_property_names( return eprop_names_.at(index); } +const std::vector& Schema::get_edge_property_names( + const label_t& src_label, const label_t& dst_label, + const label_t& label) 
const { + CHECK(src_label < vlabel_indexer_.size()); + CHECK(dst_label < vlabel_indexer_.size()); + CHECK(label < elabel_indexer_.size()); + uint32_t index = generate_edge_label(src_label, dst_label, label); + return eprop_names_.at(index); +} + bool Schema::valid_edge_property(const std::string& src_label, const std::string& dst_label, const std::string& label) const { @@ -210,7 +236,7 @@ std::string Schema::get_edge_label_name(label_t index) const { return ret; } -const std::vector>& +const std::vector>& Schema::get_vertex_primary_key(label_t index) const { CHECK(v_primary_keys_.size() > index); return v_primary_keys_.at(index); @@ -511,7 +537,7 @@ static bool parse_vertex_schema(YAML::Node node, Schema& schema) { // remove primary key from properties. std::vector primary_key_inds(primary_key_node.size(), -1); - std::vector> primary_keys; + std::vector> primary_keys; for (auto i = 0; i < primary_key_node.size(); ++i) { auto cur_primary_key = primary_key_node[i]; std::string primary_key_name = primary_key_node[0].as(); @@ -531,7 +557,8 @@ static bool parse_vertex_schema(YAML::Node node, Schema& schema) { return false; } primary_keys.emplace_back(property_types[primary_key_inds[i]], - property_names[primary_key_inds[i]]); + property_names[primary_key_inds[i]], + primary_key_inds[i]); // remove primary key from properties. 
property_names.erase(property_names.begin() + primary_key_inds[i]); property_types.erase(property_types.begin() + primary_key_inds[i]); @@ -738,7 +765,7 @@ bool Schema::vertex_has_primary_key(const std::string& label, CHECK(v_label_id < vprop_names_.size()); auto& keys = v_primary_keys_[v_label_id]; for (auto i = 0; i < keys.size(); ++i) { - if (keys[i].second == prop) { + if (std::get<1>(keys[i]) == prop) { return true; } } diff --git a/flex/storages/rt_mutable_graph/schema.h b/flex/storages/rt_mutable_graph/schema.h index fe217b3a2bc7..c0a017088d79 100644 --- a/flex/storages/rt_mutable_graph/schema.h +++ b/flex/storages/rt_mutable_graph/schema.h @@ -23,11 +23,6 @@ namespace gs { -static constexpr const char* DT_SIGNED_INT32 = "DT_SIGNED_INT32"; -static constexpr const char* DT_STRING = "DT_STRING"; -static constexpr const char* DT_SIGNED_INT64 = "DT_SIGNED_INT64"; -static constexpr const char* DT_DOUBLE = "DT_DOUBLE"; - class Schema { public: using label_type = label_t; @@ -37,7 +32,8 @@ class Schema { void add_vertex_label( const std::string& label, const std::vector& property_types, const std::vector& property_names, - const std::vector>& primary_key, + const std::vector>& + primary_key, const std::vector& strategies = {}, size_t max_vnum = static_cast(1) << 32); @@ -69,6 +65,9 @@ class Schema { const std::vector& get_vertex_properties(label_t label) const; + const std::vector& get_vertex_property_names( + label_t label) const; + const std::vector& get_vertex_storage_strategies( const std::string& label) const; @@ -81,12 +80,20 @@ class Schema { const std::string& src_label, const std::string& dst_label, const std::string& label) const; + const std::vector& get_edge_properties(label_t src_label, + label_t dst_label, + label_t label) const; + PropertyType get_edge_property(label_t src, label_t dst, label_t edge) const; const std::vector& get_edge_property_names( const std::string& src_label, const std::string& dst_label, const std::string& label) const; + const 
std::vector& get_edge_property_names( + const label_t& src_label, const label_t& dst_label, + const label_t& label) const; + bool vertex_has_property(const std::string& label, const std::string& prop) const; @@ -124,9 +131,11 @@ class Schema { std::string get_edge_label_name(label_t index) const; - const std::vector>& + const std::vector>& get_vertex_primary_key(label_t index) const; + const std::string& get_vertex_primary_key_name(label_t index) const; + void Serialize(std::unique_ptr& writer); void Deserialize(std::unique_ptr& reader); @@ -150,8 +159,9 @@ class Schema { IdIndexer elabel_indexer_; std::vector> vproperties_; std::vector> vprop_names_; - std::vector>> - v_primary_keys_; + std::vector>> + v_primary_keys_; // the third element is the index of the property in the + // vertex property list std::vector> vprop_storage_; std::map> eproperties_; std::map> eprop_names_; diff --git a/flex/storages/rt_mutable_graph/types.h b/flex/storages/rt_mutable_graph/types.h index 3b9700b4c772..f73c7cc2fcd3 100644 --- a/flex/storages/rt_mutable_graph/types.h +++ b/flex/storages/rt_mutable_graph/types.h @@ -31,6 +31,11 @@ using vid_t = uint32_t; using oid_t = int64_t; using label_t = uint8_t; +static constexpr const char* DT_SIGNED_INT32 = "DT_SIGNED_INT32"; +static constexpr const char* DT_SIGNED_INT64 = "DT_SIGNED_INT64"; +static constexpr const char* DT_DOUBLE = "DT_DOUBLE"; +static constexpr const char* DT_STRING = "DT_STRING"; + } // namespace gs #endif // STORAGES_RT_MUTABLE_GRAPH_TYPES_H_ diff --git a/flex/tests/hqps/hqps_cypher_test.sh b/flex/tests/hqps/hqps_cypher_test.sh index 4de7a77e0805..5696a898f350 100644 --- a/flex/tests/hqps/hqps_cypher_test.sh +++ b/flex/tests/hqps/hqps_cypher_test.sh @@ -76,6 +76,9 @@ kill_service(){ info "Kill Service success" } +# kill service when exit +trap kill_service EXIT + create_ir_conf(){ rm ${HQPS_IR_CONF} || true echo "engine.type: hiactor" >> ${HQPS_IR_CONF} diff --git a/flex/tests/hqps/match_query.h 
b/flex/tests/hqps/match_query.h index 189885b12428..a5aa3c1c2686 100644 --- a/flex/tests/hqps/match_query.h +++ b/flex/tests/hqps/match_query.h @@ -519,5 +519,59 @@ class MatchQuery9 : public HqpsAppBase { } }; +class MatchQuery10 : public HqpsAppBase { + public: + using Engine = SyncEngine; + using label_id_t = typename gs::MutableCSRInterface::label_id_t; + using vertex_id_t = typename gs::MutableCSRInterface::vertex_id_t; + // Query function for query class + results::CollectiveResults Query(const gs::MutableCSRInterface& graph) const { + auto ctx0 = Engine::template ScanVertex( + graph, 1, Filter()); + + auto edge_expand_opt0 = + gs::make_edge_expand_multie_opt, + std::tuple>( + gs::Direction::Out, + std::array, 2>{ + std::array{1, 1, 8}, + std::array{1, 2, 9}}, + std::tuple{PropTupleArrayT>{"creationDate"}, + PropTupleArrayT>{"creationDate"}}); + auto ctx1 = + Engine::template EdgeExpandE( + graph, std::move(ctx0), std::move(edge_expand_opt0)); + + auto get_v_opt1 = make_getv_opt( + gs::VOpt::End, + std::array{(label_id_t) 0, (label_id_t) 1}); + auto ctx2 = Engine::template GetV( + graph, std::move(ctx1), std::move(get_v_opt1)); + auto ctx3 = Engine::Project( + graph, std::move(ctx2), + std::tuple{gs::make_mapper_with_variable( + gs::PropertySelector("")), + gs::make_mapper_with_variable( + gs::PropertySelector(""))}); + auto agg_func2 = gs::make_aggregate_prop( + std::tuple{gs::PropertySelector("None")}, + std::integer_sequence{}); + + auto ctx4 = Engine::GroupByWithoutKey(graph, std::move(ctx3), + std::tuple{std::move(agg_func2)}); + for (auto iter : ctx4) { + VLOG(10) << "ctx4: " << gs::to_string(iter.GetAllElement()); + } + return Engine::Sink(ctx4, std::array{2}); + } + // Wrapper query function for query class + results::CollectiveResults Query(const gs::MutableCSRInterface& graph, + Decoder& decoder) const override { + // decoding params from decoder, and call real query func + + return Query(graph); + } +}; + } // namespace gs #endif // 
TESTS_HQPS_MATCH_QUERY_H_ \ No newline at end of file diff --git a/flex/tests/hqps/query_test.cc b/flex/tests/hqps/query_test.cc index 13d1850d3e49..38eb0cea291c 100644 --- a/flex/tests/hqps/query_test.cc +++ b/flex/tests/hqps/query_test.cc @@ -36,145 +36,159 @@ int main(int argc, char** argv) { db.Init(schema, loading_config, data_dir, 1); auto& sess = gs::GraphDB::get().GetSession(0); - { - auto& graph = sess.graph(); - auto max_v_num = graph.vertex_num(1); - std::vector vids(max_v_num); - for (gs::MutableCSRInterface::vertex_id_t i = 0; i < max_v_num; ++i) { - vids[i] = i; - } - gs::MutableCSRInterface interface(sess); - std::array prop_names{"creationDate"}; - auto edges = - interface.GetEdges(1, 1, 8, vids, "Both", INT_MAX, prop_names); - double t = -grape::GetCurrentTime(); - size_t cnt = 0; - for (auto i = 0; i < vids.size(); ++i) { - auto adj_list = edges.get(i); - for (auto iter : adj_list) { - VLOG(10) << iter.neighbor() << ", " << gs::to_string(iter.properties()); - cnt += 1; - } - } - t += grape::GetCurrentTime(); - LOG(INFO) << "visiting edges: cost: " << t << ", num edges: " << cnt; - - // visiting vertices properties - auto vertex_prop = - interface.GetVertexPropsFromVid(1, vids, {"id"}); - for (auto i = 0; i < 10; ++i) { - VLOG(10) << "vid: " << vids[i] - << ", prop: " << gs::to_string(vertex_prop[i]); - } - } - - { - gs::SampleQuery query; - std::vector encoder_array; - gs::Encoder input_encoder(encoder_array); - input_encoder.put_long(19791209300143); - input_encoder.put_long(1354060800000); - std::vector output_array; - gs::Encoder output(output_array); - gs::Decoder input(encoder_array.data(), encoder_array.size()); - - gs::MutableCSRInterface graph(sess); - query.Query(graph, input); - LOG(INFO) << "Finish Sample query"; - } - { - gs::MatchQuery query; - std::vector encoder_array; - gs::Encoder input_encoder(encoder_array); - std::vector output_array; - gs::Encoder output(output_array); - gs::Decoder input(encoder_array.data(), 
encoder_array.size()); - - gs::MutableCSRInterface graph(sess); - query.Query(graph, input); - LOG(INFO) << "Finish MatchQuery test"; - } - - { - gs::MatchQuery1 query; - std::vector encoder_array; - gs::Encoder input_encoder(encoder_array); - std::vector output_array; - gs::Encoder output(output_array); - gs::Decoder input(encoder_array.data(), encoder_array.size()); - - gs::MutableCSRInterface graph(sess); - auto res = query.Query(graph, input); - LOG(INFO) << "Finish MatchQuery1 test"; - } - - { - gs::MatchQuery2 query; - std::vector encoder_array; - gs::Encoder input_encoder(encoder_array); - std::vector output_array; - gs::Encoder output(output_array); - gs::Decoder input(encoder_array.data(), encoder_array.size()); - - gs::MutableCSRInterface graph(sess); - query.Query(graph, input); - LOG(INFO) << "Finish MatchQuery2 test"; - } - - { - gs::MatchQuery3 query; - std::vector encoder_array; - gs::Encoder input_encoder(encoder_array); - std::vector output_array; - gs::Encoder output(output_array); - gs::Decoder input(encoder_array.data(), encoder_array.size()); - - gs::MutableCSRInterface graph(sess); - query.Query(graph, input); - LOG(INFO) << "Finish MatchQuery3 test"; - } - - { - gs::MatchQuery4 query; - std::vector encoder_array; - gs::Encoder input_encoder(encoder_array); - std::vector output_array; - gs::Encoder output(output_array); - gs::Decoder input(encoder_array.data(), encoder_array.size()); - - gs::MutableCSRInterface graph(sess); - query.Query(graph, input); - LOG(INFO) << "Finish MatchQuery4 test"; - } - - { - gs::MatchQuery5 query; - std::vector encoder_array; - gs::Encoder input_encoder(encoder_array); - std::vector output_array; - gs::Encoder output(output_array); - gs::Decoder input(encoder_array.data(), encoder_array.size()); - - gs::MutableCSRInterface graph(sess); - query.Query(graph, input); - LOG(INFO) << "Finish MatchQuery5 test"; - } - - { - gs::MatchQuery7 query; - std::vector encoder_array; - gs::Encoder input_encoder(encoder_array); - 
std::vector output_array; - gs::Encoder output(output_array); - gs::Decoder input(encoder_array.data(), encoder_array.size()); - - gs::MutableCSRInterface graph(sess); - query.Query(graph, input); - LOG(INFO) << "Finish MatchQuery7 test"; - } + // { + // auto& graph = sess.graph(); + // auto max_v_num = graph.vertex_num(1); + // std::vector vids(max_v_num); + // for (gs::MutableCSRInterface::vertex_id_t i = 0; i < max_v_num; ++i) { + // vids[i] = i; + // } + // gs::MutableCSRInterface interface(sess); + // std::array prop_names{"creationDate"}; + // auto edges = + // interface.GetEdges(1, 1, 8, vids, "Both", INT_MAX, + // prop_names); + // double t = -grape::GetCurrentTime(); + // size_t cnt = 0; + // for (auto i = 0; i < vids.size(); ++i) { + // auto adj_list = edges.get(i); + // for (auto iter : adj_list) { + // VLOG(10) << iter.neighbor() << ", " << + // gs::to_string(iter.properties()); cnt += 1; + // } + // } + // t += grape::GetCurrentTime(); + // LOG(INFO) << "visiting edges: cost: " << t << ", num edges: " << cnt; + + // // visiting vertices properties + // auto vertex_prop = + // interface.GetVertexPropsFromVid(1, vids, {"id"}); + // for (auto i = 0; i < 10; ++i) { + // VLOG(10) << "vid: " << vids[i] + // << ", prop: " << gs::to_string(vertex_prop[i]); + // } + // } + + // { + // gs::SampleQuery query; + // std::vector encoder_array; + // gs::Encoder input_encoder(encoder_array); + // input_encoder.put_long(19791209300143); + // input_encoder.put_long(1354060800000); + // std::vector output_array; + // gs::Encoder output(output_array); + // gs::Decoder input(encoder_array.data(), encoder_array.size()); + + // gs::MutableCSRInterface graph(sess); + // query.Query(graph, input); + // LOG(INFO) << "Finish Sample query"; + // } + // { + // gs::MatchQuery query; + // std::vector encoder_array; + // gs::Encoder input_encoder(encoder_array); + // std::vector output_array; + // gs::Encoder output(output_array); + // gs::Decoder input(encoder_array.data(), 
encoder_array.size()); + + // gs::MutableCSRInterface graph(sess); + // query.Query(graph, input); + // LOG(INFO) << "Finish MatchQuery test"; + // } + + // { + // gs::MatchQuery1 query; + // std::vector encoder_array; + // gs::Encoder input_encoder(encoder_array); + // std::vector output_array; + // gs::Encoder output(output_array); + // gs::Decoder input(encoder_array.data(), encoder_array.size()); + + // gs::MutableCSRInterface graph(sess); + // auto res = query.Query(graph, input); + // LOG(INFO) << "Finish MatchQuery1 test"; + // } + + // { + // gs::MatchQuery2 query; + // std::vector encoder_array; + // gs::Encoder input_encoder(encoder_array); + // std::vector output_array; + // gs::Encoder output(output_array); + // gs::Decoder input(encoder_array.data(), encoder_array.size()); + + // gs::MutableCSRInterface graph(sess); + // query.Query(graph, input); + // LOG(INFO) << "Finish MatchQuery2 test"; + // } + + // { + // gs::MatchQuery3 query; + // std::vector encoder_array; + // gs::Encoder input_encoder(encoder_array); + // std::vector output_array; + // gs::Encoder output(output_array); + // gs::Decoder input(encoder_array.data(), encoder_array.size()); + + // gs::MutableCSRInterface graph(sess); + // query.Query(graph, input); + // LOG(INFO) << "Finish MatchQuery3 test"; + // } + + // { + // gs::MatchQuery4 query; + // std::vector encoder_array; + // gs::Encoder input_encoder(encoder_array); + // std::vector output_array; + // gs::Encoder output(output_array); + // gs::Decoder input(encoder_array.data(), encoder_array.size()); + + // gs::MutableCSRInterface graph(sess); + // query.Query(graph, input); + // LOG(INFO) << "Finish MatchQuery4 test"; + // } + + // { + // gs::MatchQuery5 query; + // std::vector encoder_array; + // gs::Encoder input_encoder(encoder_array); + // std::vector output_array; + // gs::Encoder output(output_array); + // gs::Decoder input(encoder_array.data(), encoder_array.size()); + + // gs::MutableCSRInterface graph(sess); + // 
query.Query(graph, input); + // LOG(INFO) << "Finish MatchQuery5 test"; + // } + + // { + // gs::MatchQuery7 query; + // std::vector encoder_array; + // gs::Encoder input_encoder(encoder_array); + // std::vector output_array; + // gs::Encoder output(output_array); + // gs::Decoder input(encoder_array.data(), encoder_array.size()); + + // gs::MutableCSRInterface graph(sess); + // query.Query(graph, input); + // LOG(INFO) << "Finish MatchQuery7 test"; + // } + + // { + // gs::MatchQuery9 query; + // std::vector encoder_array; + // gs::Encoder input_encoder(encoder_array); + // std::vector output_array; + // gs::Encoder output(output_array); + // gs::Decoder input(encoder_array.data(), encoder_array.size()); + + // gs::MutableCSRInterface graph(sess); + // query.Query(graph, input); + // LOG(INFO) << "Finish MatchQuery9 test"; + // } { - gs::MatchQuery9 query; + gs::MatchQuery10 query; std::vector encoder_array; gs::Encoder input_encoder(encoder_array); std::vector output_array; @@ -183,7 +197,7 @@ int main(int argc, char** argv) { gs::MutableCSRInterface graph(sess); query.Query(graph, input); - LOG(INFO) << "Finish MatchQuery9 test"; + LOG(INFO) << "Finish MatchQuery10 test"; } LOG(INFO) << "Finish context test."; diff --git a/flex/tests/rt_mutable_graph/test_graph_loading.cc b/flex/tests/rt_mutable_graph/test_graph_loading.cc index 5c3d0ed36609..e0280a836b03 100644 --- a/flex/tests/rt_mutable_graph/test_graph_loading.cc +++ b/flex/tests/rt_mutable_graph/test_graph_loading.cc @@ -57,7 +57,7 @@ int main(int argc, char** argv) { } for (auto i = 0; i < graph.schema().edge_label_num(); ++i) { - LOG(INFO) << "vertex label " << i + LOG(INFO) << "edge label " << i << " name: " << graph.schema().get_edge_label_name(i); } diff --git a/flex/utils/arrow_utils.cc b/flex/utils/arrow_utils.cc new file mode 100644 index 000000000000..50356db839eb --- /dev/null +++ b/flex/utils/arrow_utils.cc @@ -0,0 +1,50 @@ +/** Copyright 2020 Alibaba Group Holding Limited. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "flex/utils/arrow_utils.h" + +namespace gs { +std::shared_ptr PropertyTypeToArrowType(PropertyType type) { + switch (type) { + case PropertyType::kInt32: + return arrow::int32(); + case PropertyType::kInt64: + return arrow::int64(); + case PropertyType::kDouble: + return arrow::float64(); + case PropertyType::kDate: + return arrow::timestamp(arrow::TimeUnit::MILLI); + case PropertyType::kString: + return arrow::large_utf8(); + case PropertyType::kEmpty: + return arrow::null(); + default: + LOG(FATAL) << "Unexpected property type: " << static_cast(type); + return nullptr; + } +} + +template +void emplace_into_vector(const std::shared_ptr& array, + std::vector& vec) { + using arrow_array_type = typename gs::CppTypeToArrowType::ArrayType; + for (auto i = 0; i < array->num_chunks(); ++i) { + auto casted = std::static_pointer_cast(array->chunk(i)); + for (auto k = 0; k < casted->length(); ++k) { + vec.emplace_back(AnyConverter::to_any(casted->Value(k))); + } + } +} + +} // namespace gs diff --git a/flex/utils/arrow_utils.h b/flex/utils/arrow_utils.h new file mode 100644 index 000000000000..d022ce56dbcc --- /dev/null +++ b/flex/utils/arrow_utils.h @@ -0,0 +1,91 @@ +/** Copyright 2020 Alibaba Group Holding Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef UTILS_ARROW_UTILS_H_ +#define UTILS_ARROW_UTILS_H_ + +#include +#include +#include "flex/utils/property/types.h" + +namespace gs { + +// arrow related; + +// convert c++ type to arrow type. support other types likes emptyType, Date +template +struct CppTypeToArrowType {}; + +template <> +struct CppTypeToArrowType { + using Type = arrow::Int64Type; + using ArrayType = arrow::Int64Array; + static std::shared_ptr TypeValue() { return arrow::int64(); } +}; + +template <> +struct CppTypeToArrowType { + using Type = arrow::Int32Type; + using ArrayType = arrow::Int32Array; + static std::shared_ptr TypeValue() { return arrow::int32(); } +}; + +template <> +struct CppTypeToArrowType { + using Type = arrow::DoubleType; + using ArrayType = arrow::DoubleArray; + static std::shared_ptr TypeValue() { + return arrow::float64(); + } +}; + +template <> +struct CppTypeToArrowType { + using Type = arrow::Int64Type; + using ArrayType = arrow::Int64Array; + static std::shared_ptr TypeValue() { return arrow::int64(); } +}; + +template +struct CppTypeToPropertyType; + +template <> +struct CppTypeToPropertyType { + static constexpr PropertyType value = PropertyType::kInt32; +}; + +template <> +struct CppTypeToPropertyType { + static constexpr PropertyType value = PropertyType::kInt64; +}; + +template <> +struct CppTypeToPropertyType { + static constexpr PropertyType value = PropertyType::kDouble; +}; + +template <> +struct CppTypeToPropertyType { + static constexpr PropertyType value = PropertyType::kString; +}; + +template <> +struct CppTypeToPropertyType { 
+ static constexpr PropertyType value = PropertyType::kString; +}; + +std::shared_ptr PropertyTypeToArrowType(PropertyType type); +} // namespace gs + +#endif // UTILS_ARROW_UTILS_H_ diff --git a/flex/utils/property/table.cc b/flex/utils/property/table.cc index ec60f06e42c1..20a2aa836f75 100644 --- a/flex/utils/property/table.cc +++ b/flex/utils/property/table.cc @@ -134,6 +134,8 @@ const std::shared_ptr Table::get_column_by_id(size_t index) const { size_t Table::col_num() const { return columns_.size(); } std::vector>& Table::columns() { return columns_; } +// get column pointers +std::vector& Table::column_ptrs() { return column_ptrs_; } void Table::insert(size_t index, const std::vector& values) { assert(values.size() == columns_.size()); diff --git a/flex/utils/property/table.h b/flex/utils/property/table.h index 5f5b1eca4aeb..f6f05aa5248a 100644 --- a/flex/utils/property/table.h +++ b/flex/utils/property/table.h @@ -59,6 +59,7 @@ class Table { size_t col_num() const; std::vector>& columns(); + std::vector& column_ptrs(); void insert(size_t index, const std::vector& values); diff --git a/flex/utils/property/types.cc b/flex/utils/property/types.cc index 461dcd165238..1412378bde84 100644 --- a/flex/utils/property/types.cc +++ b/flex/utils/property/types.cc @@ -20,97 +20,6 @@ namespace gs { -inline void ParseInt32(const std::string_view& str, int& val) { - sscanf(str.data(), "%d", &val); -} - -inline void ParseInt64(const std::string_view& str, int64_t& val) { -#ifdef __APPLE__ - sscanf(str.data(), "%lld", &val); -#else - sscanf(str.data(), "%" SCNd64, &val); -#endif -} - -inline void ParseDate(const std::string_view& str, Date& date) { - date.reset(str.data()); -} - -inline void ParseString(const std::string_view& str, std::string_view& val) { - val = str; -} - -inline void ParseDouble(const std::string_view& str, double& val) { - sscanf(str.data(), "%lf", &val); -} - -void ParseRecord(const char* line, std::vector& rec) { - const char* cur = line; - for (auto& 
item : rec) { - const char* ptr = cur; - while (*ptr != '\0' && *ptr != '|') { - ++ptr; - } - std::string_view sv(cur, ptr - cur); - if (item.type == PropertyType::kInt32) { - ParseInt32(sv, item.value.i); - } else if (item.type == PropertyType::kInt64) { - ParseInt64(sv, item.value.l); - } else if (item.type == PropertyType::kDate) { - ParseDate(sv, item.value.d); - } else if (item.type == PropertyType::kString) { - ParseString(sv, item.value.s); - } else if (item.type == PropertyType::kDouble) { - ParseDouble(sv, item.value.db); - } - cur = ptr + 1; - } -} - -void ParseRecordX(const char* line, int64_t& src, int64_t& dst, int& prop) { -#ifdef __APPLE__ - sscanf(line, "%lld|%lld|%d", &src, &dst, &prop); -#else - sscanf(line, "%" SCNd64 "|%" SCNd64 "|%d", &src, &dst, &prop); -#endif -} - -void ParseRecordX(const char* line, int64_t& src, int64_t& dst, Date& prop) { -#ifdef __APPLE__ - sscanf(line, "%lld|%lld", &src, &dst); -#else - sscanf(line, "%" SCNd64 "|%" SCNd64 "", &src, &dst); -#endif - const char* ptr = strrchr(line, '|') + 1; - prop.reset(ptr); -} - -void ParseRecordX(const char* line, int64_t& src, int64_t& dst, - grape::EmptyType& prop) { -#ifdef __APPLE__ - sscanf(line, "%lld|%lld", &src, &dst); -#else - sscanf(line, "%" SCNd64 "|%" SCNd64 "", &src, &dst); -#endif -} - -void ParseRecordX(const char* line, int64_t& src, int64_t& dst, double& prop) { -#ifdef __APPLE__ - sscanf(line, "%lld|%lld|%lf", &src, &dst, &prop); -#else - sscanf(line, "%" SCNd64 "|%" SCNd64 "|%lf", &src, &dst, &prop); -#endif -} - -void ParseRecordX(const char* line, int64_t& src, int64_t& dst, int64_t& prop) { -#ifdef __APPLE__ - // parseRecordX for edge with int64 property - sscanf(line, "%lld|%lld|%lld", &src, &dst, &prop); -#else - sscanf(line, "%" SCNd64 "|%" SCNd64 "|%" SCNd64 "", &src, &dst, &prop); -#endif -} - grape::InArchive& operator<<(grape::InArchive& in_archive, const Any& value) { switch (value.type) { case PropertyType::kInt32: @@ -177,67 +86,7 @@ grape::OutArchive& 
operator>>(grape::OutArchive& out_archive, return out_archive; } -// date format: -// YYYY-MM-DD'T'hh:mm:ss.SSSZZZZ -// 2010-04-25T05:45:11.772+0000 - -inline static uint32_t char_to_digit(char c) { return (c - '0'); } - -inline static uint32_t str_4_to_number(const char* str) { - return char_to_digit(str[0]) * 1000u + char_to_digit(str[1]) * 100u + - char_to_digit(str[2]) * 10u + char_to_digit(str[3]); -} - -inline static uint32_t str_3_to_number(const char* str) { - return char_to_digit(str[0]) * 100u + char_to_digit(str[1]) * 10u + - char_to_digit(str[2]); -} - -inline static uint32_t str_2_to_number(const char* str) { - return char_to_digit(str[0]) * 10u + char_to_digit(str[1]); -} - Date::Date(int64_t x) : milli_second(x) {} -Date::Date(const char* str) { reset(str); } - -void Date::reset(const char* str) { - if (str[4] == '-') { - struct tm v; - memset(&v, 0, sizeof(v)); - v.tm_year = str_4_to_number(str) - 1900; - v.tm_mon = str_2_to_number(&str[5]) - 1; - v.tm_mday = str_2_to_number(&str[8]); - if (str[10] == '|') { - milli_second = mktime(&v); - milli_second *= 1000l; - milli_second += 8 * 60 * 60 * 1000l; - return; - } - v.tm_hour = str_2_to_number(&str[11]); - v.tm_min = str_2_to_number(&str[14]); - v.tm_sec = str_2_to_number(&str[17]); - - milli_second = (mktime(&v)); - - milli_second *= 1000l; - milli_second += str_3_to_number(&str[20]); - bool zone_flag = (str[23] == '+') ? 
1u : 0u; - uint32_t zone_hour = str_2_to_number(&str[24]); - uint32_t zone_minute = str_2_to_number(&str[26]); - milli_second += 8 * 60 * 60 * 1000l; - if (zone_flag) { - milli_second += (zone_hour * 60 * 60l + zone_minute * 60l) * 1000l; - } else { - milli_second -= (zone_hour * 60 * 60l + zone_minute * 60l) * 1000l; - } - } else { -#ifdef __APPLE__ - sscanf(str, "%lld", &milli_second); -#else - sscanf(str, "%" SCNd64, &milli_second); -#endif - } -} std::string Date::to_string() const { return std::to_string(milli_second); } diff --git a/flex/utils/property/types.h b/flex/utils/property/types.h index eda3841e5d11..39a3b4887fa5 100644 --- a/flex/utils/property/types.h +++ b/flex/utils/property/types.h @@ -45,9 +45,7 @@ struct Date { Date() = default; ~Date() = default; Date(int64_t x); - Date(const char* str); - void reset(const char* str); std::string to_string() const; int64_t milli_second; @@ -155,6 +153,28 @@ struct Any { return AnyConverter::to_any(value); } + bool operator==(const Any& other) const { + if (type == other.type) { + if (type == PropertyType::kInt32) { + return value.i == other.value.i; + } else if (type == PropertyType::kInt64) { + return value.l == other.value.l; + } else if (type == PropertyType::kDate) { + return value.d.milli_second == other.value.d.milli_second; + } else if (type == PropertyType::kString) { + return value.s == other.value.s; + } else if (type == PropertyType::kEmpty) { + return true; + } else if (type == PropertyType::kDouble) { + return value.db == other.value.db; + } else { + return false; + } + } else { + return false; + } + } + PropertyType type; AnyValue value; }; @@ -392,18 +412,6 @@ struct AnyConverter { } }; -void ParseRecord(const char* line, std::vector& rec); - -void ParseRecordX(const char* line, int64_t& src, int64_t& dst, int& prop); - -void ParseRecordX(const char* line, int64_t& src, int64_t& dst, Date& prop); - -void ParseRecordX(const char* line, int64_t& src, int64_t& dst, - grape::EmptyType& prop); -void 
ParseRecordX(const char* line, int64_t& src, int64_t& dst, double& prop); - -void ParseRecordX(const char* line, int64_t& src, int64_t& dst, int64_t& prop); - grape::InArchive& operator<<(grape::InArchive& in_archive, const Any& value); grape::OutArchive& operator>>(grape::OutArchive& out_archive, Any& value); diff --git a/interactive_engine/compiler/src/main/java/com/alibaba/graphscope/common/config/YamlConfigs.java b/interactive_engine/compiler/src/main/java/com/alibaba/graphscope/common/config/YamlConfigs.java index 460e0ad2fc0b..a87192612e03 100644 --- a/interactive_engine/compiler/src/main/java/com/alibaba/graphscope/common/config/YamlConfigs.java +++ b/interactive_engine/compiler/src/main/java/com/alibaba/graphscope/common/config/YamlConfigs.java @@ -225,6 +225,7 @@ private static void flattenAndConvert( if (value instanceof Map) { flattenAndConvert((Map) value, properties, key); } else { + System.out.println("key: " + key + ", value: " + value); properties.put(key, value.toString()); } } diff --git a/interactive_engine/compiler/src/main/java/com/alibaba/graphscope/common/ir/meta/reader/LocalMetaDataReader.java b/interactive_engine/compiler/src/main/java/com/alibaba/graphscope/common/ir/meta/reader/LocalMetaDataReader.java index 09ffb4b5eede..0672077739d8 100644 --- a/interactive_engine/compiler/src/main/java/com/alibaba/graphscope/common/ir/meta/reader/LocalMetaDataReader.java +++ b/interactive_engine/compiler/src/main/java/com/alibaba/graphscope/common/ir/meta/reader/LocalMetaDataReader.java @@ -58,7 +58,10 @@ public List getStoredProcedures() throws IOException { List procedureInputs = Lists.newArrayList(); if (enableProcedureList.isEmpty()) { for (File file : procedureDir.listFiles()) { - procedureInputs.add(new FileInputStream(file)); + // if file is .yaml or .yml file + if (file.getName().endsWith(".yaml") || file.getName().endsWith(".yml")){ + procedureInputs.add(new FileInputStream(file)); + } } } else { Map procedureInputMap =