From 06e8900af5e7a34b6de1957840ad275c5a632296 Mon Sep 17 00:00:00 2001
From: Anqi
Date: Tue, 7 Mar 2023 16:31:37 +0800
Subject: [PATCH] fix version validation (#88)

* fix the version validation info
* update version match
* fix assert info
* update Nebula to NebulaGraph
---
 README.md                                   | 92 ++++++++++++-------
 README_CN.md                                | 50 +++++++---
 .../nebula/connector/NebulaConfig.scala     |  3 +-
 .../connector/utils/SparkValidate.scala     |  4 +-
 4 files changed, 97 insertions(+), 52 deletions(-)

diff --git a/README.md b/README.md
index 430fcc5c..29ad0aee 100644
--- a/README.md
+++ b/README.md
@@ -1,15 +1,15 @@
-# Nebula Spark Connector
+# NebulaGraph Spark Connector
 [中文版](https://github.com/vesoft-inc/nebula-spark-connector/blob/master/README_CN.md)

 ## Introduction

-Nebula Spark Connector 2.0/3.0 only supports Nebula Graph 2.x/3.x. If you are using Nebula Graph v1.x, please use [Nebula Spark Connector v1.0](https://github.com/vesoft-inc/nebula-java/tree/v1.0/tools/nebula-spark) .
+NebulaGraph Spark Connector 2.0/3.0 only supports NebulaGraph 2.x/3.x. If you are using NebulaGraph v1.x, please use [NebulaGraph Spark Connector v1.0](https://github.com/vesoft-inc/nebula-java/tree/v1.0/tools/nebula-spark).

-Nebula Spark Connector support spark 2.2 and 2.4.
+NebulaGraph Spark Connector supports Spark 2.2 and 2.4.

 ## How to Compile

-1. Package Nebula Spark Connector.
+1. Package NebulaGraph Spark Connector.

    ```bash
    $ git clone https://github.com/vesoft-inc/nebula-spark-connector.git
@@ -24,27 +24,43 @@ Nebula Spark Connector support spark 2.2 and 2.4.
    After the packaging, you can see the newly generated nebula-spark-connector-3.0-SNAPSHOT.jar under the nebula-spark-connector/nebula-spark-connector/target/ directory.

-## New Features (Compared to Nebula Spark Connector 1.0)
+## New Features (Compared to NebulaGraph Spark Connector 1.0)
 * Supports more connection configurations, such as timeout, connectionRetry, and executionRetry.
 * Supports more data configurations, such as whether vertexId can be written as vertex's property, whether srcId, dstId and rank can be written as edge's properties.
 * Spark Reader Supports non-property, all-property, and specific-properties read.
-* Spark Reader Supports reading data from Nebula Graph to Graphx as VertexRD and EdgeRDD, it also supports String type vertexId.
-* Nebula Spark Connector 2.0 uniformly uses SparkSQL's DataSourceV2 for data source expansion.
-* Nebula Spark Connector 2.1.0 support UPDATE write mode to NebulaGraph, see [Update Vertex](https://docs.nebula-graph.io/2.0.1/3.ngql-guide/12.vertex-statements/2.update-vertex/) .
-* Nebula Spark Connector 2.5.0 support DELETE write mode to NebulaGraph, see [Delete Vertex](https://docs.nebula-graph.io/master/3.ngql-guide/12.vertex-statements/4.delete-vertex/)
+* Spark Reader supports reading data from NebulaGraph to GraphX as VertexRDD and EdgeRDD; it also supports String-type vertexId.
+* NebulaGraph Spark Connector 2.0 uniformly uses SparkSQL's DataSourceV2 for data source expansion.
+* NebulaGraph Spark Connector 2.1.0 supports the UPDATE write mode to NebulaGraph, see [Update Vertex](https://docs.nebula-graph.io/2.0.1/3.ngql-guide/12.vertex-statements/2.update-vertex/).
+* NebulaGraph Spark Connector 2.5.0 supports the DELETE write mode to NebulaGraph, see [Delete Vertex](https://docs.nebula-graph.io/master/3.ngql-guide/12.vertex-statements/4.delete-vertex/).

 ## How to Use

-  If you use Maven to manage your project, add the following dependency to your pom.xml:
+  If you use Maven to manage your project, add one of the following dependencies to your pom.xml:
  ```
  <!-- for Spark 2.4 -->
  <dependency>
    <groupId>com.vesoft</groupId>
    <artifactId>nebula-spark-connector</artifactId>
    <version>3.0-SNAPSHOT</version>
  </dependency>
+
+  <!-- for Spark 2.2 -->
+  <dependency>
+    <groupId>com.vesoft</groupId>
+    <artifactId>nebula-spark-connector_2.2</artifactId>
+    <version>3.0-SNAPSHOT</version>
+  </dependency>
+
+  <!-- for Spark 3.x -->
+  <dependency>
+    <groupId>com.vesoft</groupId>
+    <artifactId>nebula-spark-connector_3.0</artifactId>
+    <version>3.0-SNAPSHOT</version>
+  </dependency>
  ```

-  Write DataFrame `INSERT` into Nebula Graph as Vertices:
+  Write DataFrame `INSERT` into NebulaGraph as Vertices:
  ```
  val config = NebulaConnectionConfig
    .builder()
@@ -61,7 +77,7 @@ Nebula Spark Connector support spark 2.2 and 2.4.
    .build()
  df.write.nebula(config, nebulaWriteVertexConfig).writeVertices()
  ```
-  Write DataFrame `UPDATE` into Nebula Graph as Vertices:
+  Write DataFrame `UPDATE` into NebulaGraph as Vertices:
  ```
  val config = NebulaConnectionConfig
    .builder()
@@ -79,7 +95,7 @@ Nebula Spark Connector support spark 2.2 and 2.4.
    .build()
  df.write.nebula(config, nebulaWriteVertexConfig).writeVertices()
  ```
-  Write DataFrame `DELETE` into Nebula Graph as Vertices:
+  Write DataFrame `DELETE` into NebulaGraph as Vertices:
  ```
  val config = NebulaConnectionConfig
    .builder()
@@ -96,7 +112,7 @@ Nebula Spark Connector support spark 2.2 and 2.4.
    .build()
  df.write.nebula(config, nebulaWriteVertexConfig).writeVertices()
  ```
-  Read vertices from Nebula Graph:
+  Read vertices from NebulaGraph:
  ```
  val config = NebulaConnectionConfig
    .builder()
@@ -115,7 +131,7 @@ Nebula Spark Connector support spark 2.2 and 2.4.
  val vertex = spark.read.nebula(config, nebulaReadVertexConfig).loadVerticesToDF()
  ```

-  Read vertices and edges from Nebula Graph to construct Graphx's graph:
+  Read vertices and edges from NebulaGraph to construct GraphX's graph:
  ```
  val config = NebulaConnectionConfig
    .builder()
@@ -148,7 +164,7 @@ Nebula Spark Connector support spark 2.2 and 2.4.

 For more information on usage, please refer to [Example](https://github.com/vesoft-inc/nebula-spark-connector/tree/master/example/src/main/scala/com/vesoft/nebula/examples/connector).

-## PySpark with Nebula Spark Connector
+## PySpark with NebulaGraph Spark Connector

 Below is an example of calling nebula-spark-connector jar package in pyspark.

@@ -276,7 +292,7 @@ For more options, i.e.
delete edge with vertex being deleted, refer to [nebula/c
  val DELETE_EDGE: String = "deleteEdge"
  ```

-### Call Nebula Spark Connector in PySpark shell and .py file
+### Call NebulaGraph Spark Connector in PySpark shell and .py file

 Also, below are examples on how we run above code with pyspark shell or in python code files:

@@ -307,21 +323,27 @@ df = spark.read.format(
    "partitionNumber", 1).load()
 ```

-## Version match
-
-There are the version correspondence between Nebula Spark Connector and Nebula:
-
-| Nebula Spark Connector Version | Nebula Version |
-|:------------------------------:|:--------------:|
-| 2.0.0                          | 2.0.0, 2.0.1   |
-| 2.0.1                          | 2.0.0, 2.0.1   |
-| 2.1.0                          | 2.0.0, 2.0.1   |
-| 2.5.0                          | 2.5.0, 2.5.1   |
-| 2.5.1                          | 2.5.0, 2.5.1   |
-| 2.6.0                          | 2.6.0, 2.6.1   |
-| 2.6.1                          | 2.6.0, 2.6.1   |
-| 3.0.0                          | 3.0.x, 3.1.x   |
-| 3.0-SNAPSHOT                   | nightly        |
+## Compatibility matrix
+
+The version correspondence between NebulaGraph Spark Connector, NebulaGraph, and Spark is as follows:
+
+| NebulaGraph Spark Connector Version         | NebulaGraph Version | Spark Version |
+|:-------------------------------------------:|:-------------------:|:-------------:|
+| nebula-spark-connector-2.0.0.jar             | 2.0.0, 2.0.1        | 2.4.*         |
+| nebula-spark-connector-2.0.1.jar             | 2.0.0, 2.0.1        | 2.4.*         |
+| nebula-spark-connector-2.1.0.jar             | 2.0.0, 2.0.1        | 2.4.*         |
+| nebula-spark-connector-2.5.0.jar             | 2.5.0, 2.5.1        | 2.4.*         |
+| nebula-spark-connector-2.5.1.jar             | 2.5.0, 2.5.1        | 2.4.*         |
+| nebula-spark-connector-2.6.0.jar             | 2.6.0, 2.6.1        | 2.4.*         |
+| nebula-spark-connector-2.6.1.jar             | 2.6.0, 2.6.1        | 2.4.*         |
+| nebula-spark-connector-3.0.0.jar             | 3.x                 | 2.4.*         |
+| nebula-spark-connector-3.3.0.jar             | 3.x                 | 2.4.*         |
+| nebula-spark-connector_2.2-3.3.0.jar         | 3.x                 | 2.2.*         |
+| nebula-spark-connector-3.4.0.jar             | 3.x                 | 2.4.*         |
+| nebula-spark-connector_2.2-3.4.0.jar         | 3.x                 | 2.2.*         |
+| nebula-spark-connector-3.0-SNAPSHOT.jar      | nightly             | 2.4.*         |
+| nebula-spark-connector_2.2-3.0-SNAPSHOT.jar  | nightly             | 2.2.*         |
+| nebula-spark-connector_3.0-3.0-SNAPSHOT.jar  | nightly             | 3.*           |
+
 ## Performance
 We use LDBC dataset to test nebula-spark-connector's performance, here's the result.

@@ -332,7 +354,7 @@ We choose tag Comment and edge REPLY_OF for space sf30 and sf100 to test the con
 And the application's resources are: standalone mode with three workers, 2G driver-memory, 3 num-executors, 30G executor-memory and 20 executor-cores.
 The ReadNebulaConfig has 2000 limit and 100 partitionNum,
-the same partition number with nebula space parts.
+the same partition number as the NebulaGraph space parts.

 |data type|ldbc 67.12million with No Property| ldbc 220 million with No Property|ldbc 67.12million with All Property|ldbc 220 million with All Property|

@@ -360,8 +382,8 @@ The writeConfig has 2000 batch sizes, and the DataFrame has 60 partitions.

 ## How to Contribute

-Nebula Spark Connector is a completely opensource project, opensource enthusiasts are welcome to participate in the following ways:
+NebulaGraph Spark Connector is a completely open-source project; open-source enthusiasts are welcome to participate in the following ways:

-- Go to [Nebula Graph Forum](https://discuss.nebula-graph.com.cn/ "go to“Nebula Graph Forum") to discuss with other users. You can raise your own questions, help others' problems, share your thoughts.
+- Go to [NebulaGraph Forum](https://discuss.nebula-graph.com.cn/ "go to NebulaGraph Forum") to discuss with other users. You can raise your own questions, help answer others' questions, and share your thoughts.
 - Write or improve documents.
 - Submit code to add new features or fix bugs.
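As a quick reference for the compatibility matrix above, here is a plain-Scala sketch (illustrative only, not part of this patch) that maps a Spark version string to the connector artifact listed in the table. The helper name `artifactFor` is hypothetical; only the artifact names and the Spark version patterns come from the matrix itself.

```scala
// Illustrative sketch: choose the connector artifact for a given Spark version,
// following the compatibility matrix above. Not part of the patch.
object ConnectorArtifactPicker {
  def artifactFor(sparkVersion: String): String =
    if (sparkVersion.matches("2\\.2.*")) "nebula-spark-connector_2.2"      // Spark 2.2.*
    else if (sparkVersion.matches("2\\.4.*")) "nebula-spark-connector"     // Spark 2.4.*
    else if (sparkVersion.matches("3\\..*")) "nebula-spark-connector_3.0"  // Spark 3.*
    else sys.error(s"Unsupported Spark version: $sparkVersion")

  def main(args: Array[String]): Unit = {
    // Example versions are placeholders, not tested releases.
    Seq("2.2.3", "2.4.8", "3.3.1").foreach { v =>
      println(s"Spark $v -> ${artifactFor(v)}")
    }
  }
}
```

The same regex-style matching (`"2.4.*"` treated as a pattern rather than an exact string) is what the `SparkValidate` change at the end of this patch relies on.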
diff --git a/README_CN.md b/README_CN.md
index 3040d932..1d1dc7ac 100644
--- a/README_CN.md
+++ b/README_CN.md
@@ -29,13 +29,28 @@ Nebula Spark Connector supports Spark 2.2 and 2.4.
 * Nebula Spark Connector 2.5.0 adds the DELETE write mode; see [Delete Vertex](https://docs.nebula-graph.com.cn/2.5.1/3.ngql-guide/12.vertex-statements/4.delete-vertex/) for details.

 ## How to Use
-  If you use Maven to manage your project, add the dependency to your pom.xml:
+  If you use Maven to manage your project, add one of the following dependencies to your pom.xml:
  ```
  <dependency>
    <groupId>com.vesoft</groupId>
    <artifactId>nebula-spark-connector</artifactId>
    <version>3.0-SNAPSHOT</version>
  </dependency>
+
+  <dependency>
+    <groupId>com.vesoft</groupId>
+    <artifactId>nebula-spark-connector_2.2</artifactId>
+    <version>3.0-SNAPSHOT</version>
+  </dependency>
+
+  <dependency>
+    <groupId>com.vesoft</groupId>
+    <artifactId>nebula-spark-connector_3.0</artifactId>
+    <version>3.0-SNAPSHOT</version>
+  </dependency>
  ```

 Write DataFrame `INSERT` into Nebula Graph as vertices:
@@ -305,19 +320,26 @@ df = spark.read.format(
 ```
 ## Version match
-The version correspondence between Nebula Spark Connector and Nebula is as follows:
-
-| Nebula Spark Connector Version | Nebula Version |
-|:------------------------------:|:--------------:|
-| 2.0.0                          | 2.0.0, 2.0.1   |
-| 2.0.1                          | 2.0.0, 2.0.1   |
-| 2.1.0                          | 2.0.0, 2.0.1   |
-| 2.5.0                          | 2.5.0, 2.5.1   |
-| 2.5.1                          | 2.5.0, 2.5.1   |
-| 2.6.0                          | 2.6.0, 2.6.1   |
-| 2.6.1                          | 2.6.0, 2.6.1   |
-| 3.0.0                          | 3.0.0          |
-| 3.0-SNAPSHOT                   | nightly        |
+The version correspondence between Nebula Spark Connector, Nebula, and Spark is as follows:
+
+| Nebula Spark Connector Version               | Nebula Version | Spark Version |
+|:--------------------------------------------:|:--------------:|:-------------:|
+| nebula-spark-connector-2.0.0.jar             | 2.0.0, 2.0.1   | 2.4.*         |
+| nebula-spark-connector-2.0.1.jar             | 2.0.0, 2.0.1   | 2.4.*         |
+| nebula-spark-connector-2.1.0.jar             | 2.0.0, 2.0.1   | 2.4.*         |
+| nebula-spark-connector-2.5.0.jar             | 2.5.0, 2.5.1   | 2.4.*         |
+| nebula-spark-connector-2.5.1.jar             | 2.5.0, 2.5.1   | 2.4.*         |
+| nebula-spark-connector-2.6.0.jar             | 2.6.0, 2.6.1   | 2.4.*         |
+| nebula-spark-connector-2.6.1.jar             | 2.6.0, 2.6.1   | 2.4.*         |
+| nebula-spark-connector-3.0.0.jar             | 3.x            | 2.4.*         |
+| nebula-spark-connector-3.3.0.jar             | 3.x            | 2.4.*         |
+| nebula-spark-connector_2.2-3.3.0.jar         | 3.x            | 2.2.*         |
+| nebula-spark-connector-3.4.0.jar             | 3.x            | 2.4.*         |
+| nebula-spark-connector_2.2-3.4.0.jar         | 3.x            | 2.2.*         |
+| nebula-spark-connector-3.0-SNAPSHOT.jar      | nightly        | 2.4.*         |
+| nebula-spark-connector_2.2-3.0-SNAPSHOT.jar  | nightly        | 2.2.*         |
+| nebula-spark-connector_3.0-3.0-SNAPSHOT.jar  | nightly        | 3.*           |
+
 ## Performance
 We use the LDBC dataset to test nebula-spark-connector's performance; the results are as follows:

diff --git a/nebula-spark-common/src/main/scala/com/vesoft/nebula/connector/NebulaConfig.scala b/nebula-spark-common/src/main/scala/com/vesoft/nebula/connector/NebulaConfig.scala
index f6fa9629..2aadccde 100644
--- a/nebula-spark-common/src/main/scala/com/vesoft/nebula/connector/NebulaConfig.scala
+++ b/nebula-spark-common/src/main/scala/com/vesoft/nebula/connector/NebulaConfig.scala
@@ -784,7 +784,8 @@ object ReadNebulaConfig {
    assert(space != null && !space.isEmpty, s"config space is empty.")
    assert(label != null && !label.isEmpty, s"config label is empty.")
    assert(limit > 0, s"config limit must be positive, your limit is $limit")
-   assert(partitionNum > 0, s"config partitionNum must be positive, your partitionNum is $limit")
+   assert(partitionNum > 0,
+          s"config partitionNum must be positive, your partitionNum is $partitionNum")
    if (noColumn && returnCols.nonEmpty) {
      LOG.warn(
        s"noColumn is true, returnCols will be invalidate "
diff --git a/nebula-spark-common/src/main/scala/com/vesoft/nebula/connector/utils/SparkValidate.scala b/nebula-spark-common/src/main/scala/com/vesoft/nebula/connector/utils/SparkValidate.scala
index 6f507d77..fc9d231b 100644
--- a/nebula-spark-common/src/main/scala/com/vesoft/nebula/connector/utils/SparkValidate.scala
+++ 
b/nebula-spark-common/src/main/scala/com/vesoft/nebula/connector/utils/SparkValidate.scala
@@ -12,8 +12,8 @@ object SparkValidate {
    val sparkVersion = SparkSession.getActiveSession.map(_.version).getOrElse("UNKNOWN")
    if (sparkVersion != "UNKNOWN" && !supportedVersions.exists(sparkVersion.matches)) {
      throw new RuntimeException(
-       s"""Your current spark version ${sparkVersion} is not supported by the current NebulaGraph Exchange.
-         | please visit https://github.com/vesoft-inc/nebula-exchange#version-match to know which Exchange you need.
+       s"""Your current Spark version ${sparkVersion} is not supported by the current NebulaGraph Spark Connector.
+         | Please visit https://github.com/vesoft-inc/nebula-spark-connector#version-match to know which Connector you need.
          | """.stripMargin)
    }
  }
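To make the matching rule in the hunk above concrete: `supportedVersions` holds regex patterns such as `"2.4.*"`, so `String.matches` is used rather than an exact comparison, and an `UNKNOWN` version is deliberately allowed through. The standalone sketch below mirrors that check for illustration only; the real `SparkValidate.validate` reads the version from the active `SparkSession` instead of taking it as a parameter.

```scala
// Standalone sketch (not part of the patch) mirroring the SparkValidate check above.
// The pattern list and versions are illustrative; "2.4.*" is treated as a regex.
object ValidateSketch {
  def validate(sparkVersion: String, supportedVersions: String*): Unit = {
    if (sparkVersion != "UNKNOWN" && !supportedVersions.exists(sparkVersion.matches)) {
      throw new RuntimeException(
        s"Spark $sparkVersion is not supported; expected one of ${supportedVersions.mkString(", ")}")
    }
  }

  def main(args: Array[String]): Unit = {
    validate("2.4.8", "2.4.*")   // passes: "2.4.8".matches("2.4.*") is true
    validate("UNKNOWN", "2.4.*") // passes: an unknown version is not rejected
    validate("3.3.1", "2.4.*")   // throws RuntimeException
  }
}
```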