From 88ee80581b9ee4a7abab0d0dde1198f0ee2f1efd Mon Sep 17 00:00:00 2001 From: cooper-lzy <78672629+cooper-lzy@users.noreply.github.com> Date: Wed, 15 Feb 2023 10:28:21 +0800 Subject: [PATCH] exchange update export --- .../about-exchange/ex-ug-what-is-exchange.md | 2 +- .../use-exchange/ex-ug-export-from-nebula.md | 253 ++++++++++++++---- 2 files changed, 205 insertions(+), 50 deletions(-) diff --git a/docs-2.0/nebula-exchange/about-exchange/ex-ug-what-is-exchange.md b/docs-2.0/nebula-exchange/about-exchange/ex-ug-what-is-exchange.md index 797acfefde6..52752caaccd 100644 --- a/docs-2.0/nebula-exchange/about-exchange/ex-ug-what-is-exchange.md +++ b/docs-2.0/nebula-exchange/about-exchange/ex-ug-what-is-exchange.md @@ -113,7 +113,7 @@ Exchange {{exchange.release}} supports converting data from the following format In addition to importing data as nGQL statements, Exchange supports generating SST files for data sources and then [importing SST](../use-exchange/ex-ug-import-from-sst.md) files via Console. -In addition, Exchange Enterprise Edition also supports [exporting data to a CSV file](../use-exchange/ex-ug-export-from-nebula.md) using NebulaGraph as data sources. +In addition, Exchange Enterprise Edition also supports [exporting data to a CSV file or another graph space](../use-exchange/ex-ug-export-from-nebula.md) using NebulaGraph as data sources. ## Release note diff --git a/docs-2.0/nebula-exchange/use-exchange/ex-ug-export-from-nebula.md b/docs-2.0/nebula-exchange/use-exchange/ex-ug-export-from-nebula.md index b6bbd818784..3266b123991 100644 --- a/docs-2.0/nebula-exchange/use-exchange/ex-ug-export-from-nebula.md +++ b/docs-2.0/nebula-exchange/use-exchange/ex-ug-export-from-nebula.md @@ -1,14 +1,10 @@ # Export data from NebulaGraph -This topic uses an example to illustrate how to use Exchange to export data from NebulaGraph to a CSV file. 
+Exchange allows you to export data from NebulaGraph to a CSV file or another NebulaGraph space (supporting different NebulaGraph clusters). This topic describes the specific procedure. !!! enterpriseonly - Only Exchange Enterprise Edition supports exporting data from NebulaGraph to a CSV file. - -!!! note - - SSL encryption is not supported when exporting data from NebulaGraph. + Only Exchange Enterprise Edition supports exporting data from NebulaGraph. ## Preparation @@ -53,81 +49,238 @@ As the data source, NebulaGraph stores the [basketballplayer dataset](https://do 2. Modify the configuration file. - Exchange Enterprise Edition provides the configuration template `export_application.conf` for exporting NebulaGraph data. For details, see [Exchange parameters](../parameter-reference/ex-ug-parameter.md). The core content of the configuration file used in this example is as follows: + Exchange Enterprise Edition provides the configuration templates `export_to_csv.conf` and `export_to_nebula.conf` for exporting NebulaGraph data. For details, see [Exchange parameters](../parameter-reference/ex-ug-parameter.md). The core content of the configuration file used in this example is as follows: + - Export to a CSV file: + ```conf - ... 
+ # Use the command to submit the exchange job: + + # spark-submit \ + # --master "spark://master_ip:7077" \ + # --driver-memory=2G --executor-memory=30G \ + # --total-executor-cores=60 --executor-cores=20 \ + # --class com.vesoft.nebula.exchange.Exchange \ + # nebula-exchange-3.0-SNAPSHOT.jar -c export_to_csv.conf + + { + # Spark config + spark: { + app: { + name: NebulaGraph Exchange + } + } + + # Nebula Graph config + # if you export nebula data to csv, please ignore these nebula config + nebula: { + address:{ + graph:["127.0.0.1:9669"] + + # the address of any of the meta services + meta:["127.0.0.1:9559"] + } + user: root + pswd: nebula + space: test + + # nebula client connection parameters + connection { + # socket connect & execute timeout, unit: millisecond + timeout: 30000 + } + + error: { + # max number of failures, if the number of failures is bigger than max, then exit the application. + max: 32 + # failed data will be recorded in output path, format with ngql + output: /tmp/errors + } + + # use google's RateLimiter to limit the requests send to NebulaGraph + rate: { + # the stable throughput of RateLimiter + limit: 1024 + # Acquires a permit from RateLimiter, unit: MILLISECONDS + # if it can't be obtained within the specified timeout, then give up the request. + timeout: 1000 + } + } # Processing tags - # There are tag config examples for different dataSources. tags: [ - # export NebulaGraph tag data to csv, only support export to CSV for now. { - name: player + # you can ignore the tag name when export nebula data to csv + name: tag-name-1 type: { - source: Nebula - sink: CSV + source: nebula + sink: csv } - # the path to save the NebulaGrpah data, make sure the path doesn't exist. 
- path:"hdfs://192.168.8.177:9000/vertex/player" - # if no need to export any properties when export NebulaGraph tag data - # if noField is configured true, just export vertexId - noField:false - # define properties to export from NebulaGraph tag data - # if return.fields is configured as empty list, then export all properties - return.fields:[] - # nebula space partition number - partition:10 - } - - ... + # config the fields you want to export from nebula + fields: [nebula-field-0, nebula-field-1, nebula-field-2] + noFields:false # default false, if true, just export id + partition: 60 + # config the path to save your csv file. if your file is not in hdfs, config "file:///path/test.csv" + path: "hdfs://ip:port/path/person" + separator: "," + header: true + } ] - # Processing edges - # There are edge config examples for different dataSources. + # process edges edges: [ - # export NebulaGraph tag data to csv, only support export to CSV for now. { - name: follow + # you can ignore the edge name when export nebula data to csv + name: edge-name-1 type: { - source: Nebula - sink: CSV + source: nebula + sink: csv } - # the path to save the NebulaGrpah data, make sure the path doesn't exist. - path:"hdfs://192.168.8.177:9000/edge/follow" - # if no need to export any properties when export NebulaGraph edge data - # if noField is configured true, just export src,dst,rank - noField:false - # define properties to export from NebulaGraph edge data - # if return.fields is configured as empty list, then export all properties - return.fields:[] - # nebula space partition number - partition:10 + # config the fields you want to export from nebula + fields: [nebula-field-0, nebula-field-1, nebula-field-2] + noFields:false # default false, if true, just export id + partition: 60 + # config the path to save your csv file. 
if your file is not in hdfs, config "file:///path/test.csv" + path: "hdfs://ip:port/path/friend" + separator: "," + header: true } + ] + } + ``` + + - Export to another graph space: + + ```conf + # Use the command to submit the exchange job: + + # spark-submit \ + # --master "spark://master_ip:7077" \ + # --driver-memory=2G --executor-memory=30G \ + # --total-executor-cores=60 --executor-cores=20 \ + # --class com.vesoft.nebula.exchange.Exchange \ + # nebula-exchange-3.0-SNAPSHOT.jar -c export_to_nebula.conf - ... + { + # Spark config + spark: { + app: { + name: NebulaGraph Exchange + } + } + # Nebula Graph config, just config the sink nebula information + nebula: { + address:{ + graph:["127.0.0.1:9669"] + + # the address of any of the meta services + meta:["127.0.0.1:9559"] + } + user: root + pswd: nebula + space: test + + # nebula client connection parameters + connection { + # socket connect & execute timeout, unit: millisecond + timeout: 30000 + } + + error: { + # max number of failures, if the number of failures is bigger than max, then exit the application. + max: 32 + # failed data will be recorded in output path, format with ngql + output: /tmp/errors + } + + # use google's RateLimiter to limit the requests sent to NebulaGraph + rate: { + # the stable throughput of RateLimiter + limit: 1024 + # Acquires a permit from RateLimiter, unit: MILLISECONDS + # if it can't be obtained within the specified timeout, then give up the request. + timeout: 1000 + } + } + + # Processing tags + tags: [ + { + name: tag-name-1 + type: { + source: nebula + sink: client + } + # data source nebula config + metaAddress:"127.0.0.1:9559" + space:"test" + label:"person" + # mapping the fields of the original NebulaGraph to the fields of the target NebulaGraph. 
+ fields: [source_nebula-field-0, source_nebula-field-1, source_nebula-field-2] + nebula.fields: [target_nebula-field-0, target_nebula-field-1, target_nebula-field-2] + limit:10000 + vertex: _vertexId # must be `_vertexId` + batch: 2000 + partition: 60 + } ] + + # process edges + edges: [ + { + name: edge-name-1 + type: { + source: nebula + sink: client + } + # data source nebula config + metaAddress:"127.0.0.1:9559" + space:"test" + label:"friend" + fields: [source_nebula-field-0, source_nebula-field-1, source_nebula-field-2] + nebula.fields: [target_nebula-field-0, target_nebula-field-1, target_nebula-field-2] + limit:1000 + source: _srcId # must be `_srcId` + target: _dstId # must be `_dstId` + ranking: source_nebula-field-2 + batch: 2000 + partition: 60 + } + ] } ``` 3. Export data from NebulaGraph with the following command. + !!! note + + The parameters of the Driver and Executor processes can be modified based on your own machine configuration. + ```bash - /bin/spark-submit --master "local" --class com.vesoft.nebula.exchange.Exchange nebula-exchange-x.y.z.jar_path> -c + /bin/spark-submit --master "spark://<master_ip>:7077" \ + --driver-memory=2G --executor-memory=30G \ + --total-executor-cores=60 --executor-cores=20 \ + --class com.vesoft.nebula.exchange.Exchange <nebula-exchange-x.y.z.jar_path> \ + -c <conf_file_path> ``` - The command used in this example is as follows. + The following is an example command to export the data to a CSV file. ```bash - $ ./spark-submit --master "local" --class com.vesoft.nebula.exchange.Exchange \ - ~/exchange-ent/nebula-exchange-ent-{{exchange.release}}.jar -c ~/exchange-ent/export_application.conf + $ ./spark-submit --master "spark://192.168.10.100:7077" \ + --driver-memory=2G --executor-memory=30G \ + --total-executor-cores=60 --executor-cores=20 \ + --class com.vesoft.nebula.exchange.Exchange ~/exchange-ent/nebula-exchange-ent-{{exchange.release}}.jar \ + -c ~/exchange-ent/export_to_csv.conf ``` 4. Check the exported data. - 1. 
Check whether the CSV file is successfully generated under the target path. + - Export to a CSV file: + + Check whether the CSV file is successfully generated under the target path, and check the contents of the CSV file to ensure that the data export is successful. ```bash $ hadoop fs -ls /vertex/player @@ -145,4 +298,6 @@ As the data source, NebulaGraph stores the [basketballplayer dataset](https://do -rw-r--r-- 3 nebula supergroup 119 2021-11-05 07:36 /vertex/player/ part-00009-17293020-ba2e-4243-b834-34495c0536b3-c000.csv ``` - 2. Check the contents of the CSV file to ensure that the data export is successful. + - Export to another graph space: + + Log in to the new graph space and check the statistics through `SUBMIT JOB STATS` and `SHOW STATS` commands to ensure the data export is successful.