In [None]:
# Get Connector JAR name
import glob
import os

# Locate the Vertica Spark connector JAR and publish its path through the
# CONNECTOR_JAR environment variable so later cells can hand it to Spark.
# glob.glob() returns matches in arbitrary order, so sort them to make the
# selected JAR deterministic when more than one file matches.
files = sorted(glob.glob("/spark-connector/vertica-spark-*"))
if not files:
    # Fail with an actionable message instead of an opaque IndexError on files[0].
    raise FileNotFoundError(
        "No connector JAR matching /spark-connector/vertica-spark-* was found"
    )

os.environ["CONNECTOR_JAR"] = files[0]
print(os.environ["CONNECTOR_JAR"])

In [None]:
# Create the Spark session and context
# Import only the name this notebook uses instead of star-importing pyspark.sql,
# which would fill the notebook namespace with unrelated names.
from pyspark.sql import SparkSession

# Build (or reuse) a session against the standalone master and ship the
# connector JAR whose path was stored in the CONNECTOR_JAR environment variable.
spark = (
    SparkSession.builder
    .config("spark.master", "spark://spark:7077")
    .config("spark.driver.memory", "2G")
    .config("spark.executor.memory", "1G")
    .config("spark.jars", os.environ["CONNECTOR_JAR"])
    .getOrCreate()
)

# Keep a handle on the underlying SparkContext for the cells below.
sc = spark.sparkContext

In [None]:
# Report the Spark version and master URL (one per line), then render the
# context's configuration key/value pairs.
print(sc.version, sc.master, sep="\n")
display(sc.getConf().getAll())

In [None]:
# Load the example faithful.csv data into a DataFrame (first row is the header,
# column types are inferred), then show some of its contents and its schema.
df = (
    spark.read
    .format('com.databricks.spark.csv')
    .options(header='true', inferschema='true')
    .load("/spark-connector/examples/jupyter/faithful.csv")
)
df.show()
df.printSchema()

# Write the DataFrame to a table named LR_Example on the Vertica DB, using
# "overwrite" save mode and the given staging file-system URL.
(
    df.write
    .format("com.vertica.spark.datasource.VerticaSource")
    .mode("overwrite")
    .options(
        host="vertica",
        user="dbadmin",
        password="",
        db="docker",
        table="LR_Example",
        staging_fs_url="webhdfs://hdfs:50070/jupytertest",
    )
    .save()
)



In [None]:
# Bring in Spark ML's feature-vector assembler and linear regression estimator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

# Read the LR_Example table back from Vertica into a Spark DataFrame
df = (
    spark.read
    .format("com.vertica.spark.datasource.VerticaSource")
    .options(
        host="vertica",
        user="dbadmin",
        password="",
        db="docker",
        table="LR_Example",
        staging_fs_url="webhdfs://hdfs:50070/jupytertest",
    )
    .load()
)

# LinearRegression expects its inputs packed into a single vector column.
# "waiting" is the only feature here, so it is the sole input column.
featureassembler = VectorAssembler(inputCols=["waiting"], outputCol="features")
output = featureassembler.transform(df)

# Preview the table with the new "features" column appended
output.show()

# Configure the estimator and fit it in one step: "features" is the input and
# "eruptions" (the eruption duration) is the label being predicted.
lr = LinearRegression(
    featuresCol='features',
    labelCol='eruptions',
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8,
).fit(output)

# Evaluate the fitted model on the same data and show the resulting predictions
pred_results = lr.evaluate(output)
pred_results.predictions.show()

In [None]:
# Plot the true and predicted eruption durations against the row id
import matplotlib.pyplot as plt

# Collect all three columns in a single ordered pass. Collecting them from
# separate DataFrames with separate collect() calls gives no guarantee that the
# rows line up, and without an explicit sort the line plot may zig-zag.
rows = (
    pred_results.predictions
    .select('id', 'eruptions', 'prediction')
    .orderBy('id')
    .collect()
)

x = [row['id'] for row in rows]
y1 = [row['eruptions'] for row in rows]    # label column: the true duration
y2 = [row['prediction'] for row in rows]   # model output: the predicted duration

fig, ax = plt.subplots()

# Each series carries the legend label that matches its column
# (the labels were previously swapped).
ax.plot(x, y1, label="true duration")
ax.plot(x, y2, label="predicted duration")

ax.set_xlabel('id')
ax.set_ylabel('eruption duration')
ax.set_title('Duration of Each Faithful Geyser Eruption')
ax.legend()

plt.show()