# Introduction

This notebook shows how to use the confluent-weaviate connector to stream records from a Confluent Kafka topic into Weaviate with Spark Structured Streaming.

# Imports

In [1]:
from pyspark.sql import SparkSession
from confluent_kafka.schema_registry import SchemaRegistryClient
import pyspark.sql.functions as fn
from pyspark.sql.types import StringType
from pyspark.sql.avro.functions import from_avro
import weaviate
import time
import os
import json

# Setup

Setup weaviate (embedded):

In [2]:
# Connect through the client configured with embedded options (the
# "Started ... weaviate-embedded" line in the output comes from this call).
client = weaviate.Client(embedded_options=weaviate.embedded.EmbeddedOptions())

# Clear the schema so the demo does not depend on classes left by earlier runs.
client.schema.delete_all()

# The connector takes scheme and host as separate options, so keep only the
# part of the URL that follows "://".
# NOTE(review): `_connection` is a private attribute of the client and may
# change between weaviate-client releases.
weaviate_url = client._connection.url
weaviate_host = weaviate_url.split("://")[1]

Started /home/vscode/.cache/weaviate-embedded: process ID 36101


{"action":"startup","default_vectorizer_module":"none","level":"info","msg":"the default vectorizer modules is set to \"none\", as a result all new schema classes without an explicit vectorizer setting, will use this vectorizer","time":"2023-09-06T18:28:48Z"}
{"action":"startup","auto_schema_enabled":true,"level":"info","msg":"auto schema enabled setting is set to \"true\"","time":"2023-09-06T18:28:48Z"}
{"action":"hnsw_vector_cache_prefill","count":3000,"index_id":"clickstream_Z9fQTswAb5Ga","level":"info","limit":1000000000000,"msg":"prefilled vector cache","time":"2023-09-06T18:28:48Z","took":41208}
{"action":"grpc_startup","level":"info","msg":"grpc server listening at [::]:50051","time":"2023-09-06T18:28:48Z"}
{"action":"restapi_management","level":"info","msg":"Serving weaviate at http://127.0.0.1:6666","time":"2023-09-06T18:28:49Z"}


Setup the spark session:

In [3]:
# Connector jar, read from the local build output directory (the path is
# relative to the notebook's working directory).
CONFLUENT_WEAVIATE_JAR = "../target/scala-2.12/confluent-connector-assembly-0.0.1.jar"

# Maven coordinates handed to Spark via `spark.jars.packages`.
jar_packages = [
    "org.apache.spark:spark-avro_2.12:3.4.1",
    "org.apache.spark:spark-sql-kafka-0-10_2.12:3.4.1",
]

spark = (
    SparkSession.builder
    .appName("demo-confluent-weaviate-integration")
    .config("spark.jars", CONFLUENT_WEAVIATE_JAR)
    .config("spark.jars.packages", ",".join(jar_packages))
    .getOrCreate()
)

:: loading settings :: url = jar:file:/home/vscode/.local/lib/python3.9/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/vscode/.ivy2/cache
The jars for the packages stored in: /home/vscode/.ivy2/jars
org.apache.spark#spark-avro_2.12 added as a dependency
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-f8e54b8a-8cfc-45d7-b5cd-503d0316f094;1.0
	confs: [default]
	found org.apache.spark#spark-avro_2.12;3.4.1 in central
	found org.tukaani#xz;1.9 in central
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.4.1 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.4.1 in central
	found org.apache.kafka#kafka-clients;3.3.2 in central
	found org.lz4#lz4-java;1.8.0 in central
	found org.xerial.snappy#snappy-java;1.1.10.1 in central
	found org.slf4j#slf4j-api;2.0.6 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.4 in central
	found org.apache.hadoop#hadoop-client-api;3.3.4 in central
	found commons-logging#commons-logging;1.1.3 in central
	found com.google.code.find

Grab the creds:

In [None]:
# All connection settings are read from the environment so that no secret is
# stored in the notebook. A variable that is not set yields None.

# Kafka cluster and topic
confluentClusterName = os.getenv("CONFLUENT_CLUSTER_NAME")
confluentBootstrapServers = os.getenv("CONFLUENT_BOOTSTRAP_SERVERS")
confluentTopicName = os.getenv("CONFLUENT_TOPIC_NAME")

# Kafka API credentials
confluentApiKey = os.getenv("CONFLUENT_API_KEY")
confluentSecret = os.getenv("CONFLUENT_SECRET")

# Schema Registry endpoint and credentials
schemaRegistryUrl = os.getenv("SCHEMA_REGISTRY_URL")
confluentRegistryApiKey = os.getenv("CONFLUENT_REGISTRY_API_KEY")
confluentRegistrySecret = os.getenv("CONFLUENT_REGISTRY_SECRET")

# Demo

Create the schema in Weaviate:

In [None]:
# Load the class definition from the project's JSON resource file.
# The handle is deliberately not called `f`: that name is used for the
# micro-batch function passed to foreachBatch, and rebinding it here would
# break the stream if this cell were re-run after that function is defined.
# The encoding is pinned so the file is not decoded with the platform's
# locale default.
with open("../src/it/resources/schema.json", "r", encoding="utf-8") as schema_file:
    weaviate_schema = json.load(schema_file)

# Register the class in Weaviate before any objects are written to it.
client.schema.create_class(weaviate_schema)


{"action":"hnsw_vector_cache_prefill","count":1000,"index_id":"clickstream_Z9fQTswAb5Ga","level":"info","limit":1000000000000,"msg":"prefilled vector cache","time":"2023-09-06T18:22:42Z","took":46166}


Create a stream to read from Confluent:

In [None]:
# Streaming DataFrame over the Confluent topic. Authentication is SASL/PLAIN
# over TLS, with the API key and secret embedded in the JAAS configuration.
clickstreamDF = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", confluentBootstrapServers)
    .option("subscribe", confluentTopicName)
    .option("startingOffsets", "latest")
    .option("kafka.security.protocol", "SASL_SSL")
    .option(
        "kafka.sasl.jaas.config",
        "org.apache.kafka.common.security.plain.PlainLoginModule required username='{}' password='{}';".format(
            confluentApiKey, confluentSecret
        ),
    )
    .option("kafka.ssl.endpoint.identification.algorithm", "https")
    .option("kafka.sasl.mechanism", "PLAIN")
    .option("failOnDataLoss", "false")
    .option("name", "clickStreamReadFromConfluent")
    .load()
)

Define a function that writes each micro-batch to Weaviate:

In [None]:
def f(df, ephoch_id):
    """Write one streaming micro-batch to Weaviate through the connector.

    Used as the ``foreachBatch`` callback of the streaming query. Connection
    settings are taken from notebook-level variables: ``weaviate_host``,
    ``weaviate_schema``, ``schemaRegistryUrl``, ``confluentRegistryApiKey``
    and ``confluentRegistrySecret``.

    Args:
        df: The micro-batch DataFrame handed over by ``foreachBatch``.
        ephoch_id: Identifier of the micro-batch supplied by ``foreachBatch``;
            not used. The name is a misspelling of "epoch_id" that is kept so
            the signature stays unchanged.

    Returns:
        None. The batch is written as a side effect.
    """
    (
        df.write.format("io.weaviate.confluent.Weaviate")
        .option("batchsize", 200)
        .option("scheme", "http")
        .option("host", weaviate_host)
        .option("className", weaviate_schema["class"])
        # Pass the real Schema Registry settings; the `...` placeholders that
        # used to be here would have been sent to the connector as the
        # string "Ellipsis".
        .option("schemaRegistryUrl", schemaRegistryUrl)
        .option("schemaRegistryApiKey", confluentRegistryApiKey)
        .option("schemaRegistryApiSecret", confluentRegistrySecret)
        .mode("append")
        .save()
    )

Start writing:

In [None]:
# Start the streaming query; `f` is invoked once per micro-batch.
query = (
    clickstreamDF.writeStream
    .foreachBatch(f)
    .queryName("write_stream_to_weaviate")
    .start()
)

# Alternative kept from the original notebook (console sink):
# query = (
#     clickstreamDF.writeStream
#     .foreachBatch(f)
#     .outputMode("append")
#     .format("console")
#     .start()
# )

23/09/06 18:22:43 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-e90dea43-b317-404b-a1f5-c990a9e71084. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
23/09/06 18:22:43 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


Stop writing after 30 seconds:

In [None]:
# Let the stream run for 30 seconds, then shut it down.
# The stop is in a `finally` so that interrupting the cell during the sleep
# still terminates the query. Otherwise it would keep running in the
# background under the name "write_stream_to_weaviate".
try:
    time.sleep(30)
finally:
    query.stop()

23/09/06 18:22:44 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.
23/09/06 18:22:53 ERROR Utils: Aborting task                        (0 + 1) / 1]
java.lang.IllegalArgumentException: Unknown timestampString: 55651-07-02 19:54:41.0
	at io.weaviate.confluent.writer.WeaviateDataWriter.convertTimestampToRfc3339(WeaviateDataWriter.scala:151)
	at io.weaviate.confluent.writer.WeaviateDataWriter.getValueFromField(WeaviateDataWriter.scala:222)
	at io.weaviate.confluent.writer.WeaviateDataWriter.$anonfun$buildWeaviateObject$1(WeaviateDataWriter.scala:109)
	at io.weaviate.confluent.writer.WeaviateDataWriter.$anonfun$buildWeaviateObject$1$adapted(WeaviateDataWriter.scala:106)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
	at scala.co

Check the data in Weaviate:

In [None]:
# Query Weaviate for the objects of the demo class. As the last expression in
# the cell, the response is displayed as the cell output.
client.data_object.get(class_name=weaviate_schema["class"])

{'deprecations': [], 'objects': []}