# Introduction

This notebook shows how to use the Confluent-Weaviate connector to stream data from a Confluent Kafka topic into Weaviate with Spark Structured Streaming.

# Imports

In [None]:
import json
import os
import time

import weaviate
from pyspark.sql import SparkSession

# Setup

Set up Weaviate (embedded):

In [None]:
# Create a Weaviate client configured with (default) embedded options.
embedded_options = weaviate.embedded.EmbeddedOptions()
client = weaviate.Client(embedded_options=embedded_options)

# Clear the whole schema before the demo defines its own class.
client.schema.delete_all()

# The Spark writer is given the scheme and the host as separate options,
# so keep only the part of the connection URL that follows "://".
weaviate_url = client._connection.url
url_parts = weaviate_url.split("://")
weaviate_host = url_parts[1]

Set up the Spark session:

In [None]:
# Path to the connector jar, relative to the notebook's working directory.
CONFLUENT_WEAVIATE_JAR = "../target/scala-2.12/confluent-connector_2.12-3.4.0_0.0.1.jar"

# Maven coordinates passed to Spark through `spark.jars.packages`.
jar_packages = [
    "org.apache.spark:spark-avro_2.12:3.4.1",
    "org.apache.spark:spark-sql-kafka-0-10_2.12:3.4.1",
]

# Configure the builder one setting at a time, then create (or reuse) the session.
spark_builder = SparkSession.builder.appName("demo-confluent-weaviate-integration")
spark_builder = spark_builder.config("spark.jars.packages", ",".join(jar_packages))
spark_builder = spark_builder.config("spark.jars", CONFLUENT_WEAVIATE_JAR)
spark_builder = spark_builder.config("spark.streaming.stopGracefullyOnShutdown", "true")
spark = spark_builder.getOrCreate()

Read the Confluent connection settings and credentials from environment variables:

In [None]:
# Every setting is read from the environment; a variable that is not set yields None.

# Kafka cluster and topic
confluentClusterName = os.getenv("CONFLUENT_CLUSTER_NAME")
confluentBootstrapServers = os.getenv("CONFLUENT_BOOTSTRAP_SERVERS")
confluentTopicName = os.getenv("CONFLUENT_TOPIC_NAME")

# Kafka API credentials
confluentApiKey = os.getenv("CONFLUENT_API_KEY")
confluentSecret = os.getenv("CONFLUENT_SECRET")

# Schema registry endpoint and credentials
schemaRegistryUrl = os.getenv("SCHEMA_REGISTRY_URL")
confluentRegistryApiKey = os.getenv("CONFLUENT_REGISTRY_API_KEY")
confluentRegistrySecret = os.getenv("CONFLUENT_REGISTRY_SECRET")

# Demo

Create the schema in Weaviate:

In [None]:
# Load the class definition from schema.json. The file handle gets a
# descriptive name instead of `f` so that re-running this cell cannot
# overwrite the micro-batch callback `f` defined further down the notebook.
# JSON text is read as UTF-8 rather than the platform's default encoding.
with open("../src/it/resources/schema.json", "r", encoding="utf-8") as schema_file:
    weaviate_schema = json.load(schema_file)

# Register the class with Weaviate.
client.schema.create_class(weaviate_schema)

Create a Spark Structured Streaming `DataFrame` to read streaming data from a Confluent Kafka topic:

In [None]:
# SASL/PLAIN login configuration filled in with the Confluent API key and secret.
sasl_jaas_config = (
    "org.apache.kafka.common.security.plain.PlainLoginModule required username='{}' password='{}';"
).format(confluentApiKey, confluentSecret)

# All reader options in one place; they are applied in this order.
kafka_source_options = {
    "kafka.bootstrap.servers": confluentBootstrapServers,
    "subscribe": confluentTopicName,
    "startingOffsets": "latest",
    "kafka.security.protocol": "SASL_SSL",
    "kafka.sasl.jaas.config": sasl_jaas_config,
    "kafka.ssl.endpoint.identification.algorithm": "https",
    "kafka.sasl.mechanism": "PLAIN",
    "failOnDataLoss": "false",
    "name": "clickStreamReadFromConfluent",
}

clickstreamDF = spark.readStream.format("kafka").options(**kafka_source_options).load()

Define a function to run on each microbatch:

In [None]:
# Running total of rows written to Weaviate across all micro-batches.
total_rows_processed = 0


def f(df, batch_id):
    """Write one micro-batch to Weaviate and update the running row count.

    Intended to be passed to ``writeStream.foreachBatch``.

    Args:
        df: The micro-batch DataFrame.
        batch_id: Identifier of the micro-batch; only used in the log line.

    Side effects:
        Prints the batch size, writes the batch through the
        ``io.weaviate.confluent.Weaviate`` data source, and adds the batch
        size to the module-level ``total_rows_processed`` once the write has
        completed.

    Raises:
        Whatever the count or the write raises; the counter is left
        untouched in that case.
    """
    global total_rows_processed

    # The batch is evaluated twice (count + write); cache it so the source is
    # not read a second time, and always release the cache afterwards.
    df.persist()
    try:
        row_count = df.count()
        print(f"Number of rows in the batch with batch id {batch_id}: {row_count}")

        (
            df.write.format("io.weaviate.confluent.Weaviate")
            .option("batchsize", 200)
            .option("scheme", "http")
            .option("host", weaviate_host)
            .option("className", weaviate_schema["class"])
            # Schema-registry settings come from the environment variables
            # read earlier in the notebook (previously `...` placeholders).
            .option("schemaRegistryUrl", schemaRegistryUrl)
            .option("schemaRegistryApiKey", confluentRegistryApiKey)
            .option("schemaRegistryApiSecret", confluentRegistrySecret)
            .mode("append")
            .save()
        )

        # Count the rows only after the write succeeded, so a failed or
        # interrupted batch does not inflate the total.
        total_rows_processed += row_count
    finally:
        df.unpersist()

Start writing the stream:

In [None]:
# Hand every micro-batch of the Kafka stream to `f`, name the query, and start it.
stream_writer = clickstreamDF.writeStream.foreachBatch(f)
stream_writer = stream_writer.queryName("write_stream_to_weaviate")
query = stream_writer.start()

Stop writing after 30 seconds:

In [None]:
# NOTE: stopping the query like this is NOT a graceful shutdown of the stream!
# The easiest way to shut down gracefully is to pause the source connector.
run_seconds = 30  # how long the stream is left running before it is stopped
time.sleep(run_seconds)
query.stop()

Compare the number of rows processed and the number of objects in Weaviate:

In [None]:
# Ask Weaviate for the object count of the demo class.
results = client.query.aggregate(weaviate_schema["class"]).with_meta_count().do()

# Walk the aggregate response one level at a time down to the count.
class_aggregates = results["data"]["Aggregate"][weaviate_schema["class"]]
first_aggregate = class_aggregates[0]
total_objects_in_weaviate = first_aggregate["meta"]["count"]

# The number of rows seen by the micro-batch callback must equal the number of stored objects.
assert total_rows_processed == total_objects_in_weaviate, (
    f"Total rows processed {total_rows_processed} does not match total objects in weaviate {total_objects_in_weaviate}"
)

Look at some of the objects in Weaviate:

In [None]:
# Fetch up to three objects of the demo class; as the last expression of the
# cell, the result is displayed as the cell output.
client.data_object.get(
    class_name=weaviate_schema["class"],
    limit=3,
)