In [1]:
# Get Connector JAR name
import glob
import os

# Locate the connector assembly JAR produced by the build and publish its path
# through the CONNECTOR_JAR environment variable.
jar_pattern = "/spark-connector/connector/target/scala-2.12/spark-vertica-connector-assembly-*"
files = glob.glob(jar_pattern)
if not files:
    # Fail with an actionable message instead of a bare IndexError on files[0].
    raise FileNotFoundError(
        f"No connector assembly JAR matches {jar_pattern!r}; build the connector first."
    )
# glob.glob returns matches in arbitrary order, so choose deterministically:
# when several assemblies are present, use the most recently modified one.
os.environ["CONNECTOR_JAR"] = max(files, key=os.path.getmtime)
print(os.environ["CONNECTOR_JAR"])

/spark-connector/connector/target/scala-2.12/spark-vertica-connector-assembly-3.3.3.jar


In [2]:
# Create the Spark session and context
from pyspark.sql import SparkSession

# Apply each setting to the session builder in turn, then start (or reuse) the
# session. The connector JAR path is read from the CONNECTOR_JAR environment
# variable, so a KeyError here means that variable has not been set.
builder = SparkSession.builder
for key, value in (
    ("spark.master", "spark://spark:7077"),
    ("spark.driver.memory", "2G"),
    ("spark.executor.memory", "1G"),
    ("spark.jars", os.environ["CONNECTOR_JAR"]),
):
    builder = builder.config(key, value)

spark = builder.getOrCreate()
sc = spark.sparkContext

In [3]:
# Report the Spark version and master URL (one per line), then render every
# configuration entry of the active context.
print(sc.version, sc.master, sep="\n")
spark_conf = sc.getConf()
display(spark_conf.getAll())

3.3.0
spark://spark:7077


[('spark.driver.extraJavaOptions',
  '-XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --add-opens=java.base/sun.util.calendar=ALL-UNNAMED --add-opens=java.security.jgss/sun.security.krb5=ALL-UNNAMED'),
 ('spark.driver.memory', '2G'),
 ('spark.app.startTime', '1665765493603'),
 ('spark.repl.local.jars',
  'file:///spark-connector/connector/target/scala-2.12/spark-vertica-connector-assembly-3.3.3.jar'),
 ('spark.master', 'spark://spa

In [4]:
# Perform a simple write then read using the Spark Connector
columns = ["language", "rating"]
data = [("Scala", 71), ("Java", 89), ("C++", 67), ("Python", 94)]
rdd = sc.parallelize(data)
source_df = rdd.toDF(columns)

# Data source name and options shared by the write and the read, defined once
# so the two calls cannot drift apart.
vertica_format = "com.vertica.spark.datasource.VerticaSource"
vertica_options = {
    "host": "vertica",
    "user": "dbadmin",
    # NOTE(review): empty password is hardcoded; presumably acceptable only for
    # a local test database - confirm before reusing against a real one.
    "password": "",
    "db": "docker",
    "table": "jupytertest",
    "staging_fs_url": "webhdfs://hdfs:50070/jupytertest",
}

# Write the sample rows, replacing any existing contents of the target table.
source_df.write.mode("overwrite").save(format=vertica_format, **vertica_options)

# Read the same table back; `df` ends up bound to the read-back frame.
df = spark.read.load(format=vertica_format, **vertica_options)

# show() already triggers the read, so no separate collect() is needed: its
# result was discarded and it only forced a second full read into the driver.
df.show()

+--------+------+
|language|rating|
+--------+------+
|     C++|    67|
|    Java|    89|
|  Python|    94|
|   Scala|    71|
+--------+------+

