In [17]:
from pyspark.sql import SparkSession

spark = (
    SparkSession
        .builder
        .remote("sc://spark-connect-server:15002")
        .appName("streaming-example")
        .config("hive.metastore.uris", "thrift://hive-cluster-metastore:9083")
        .enableHiveSupport()
        .getOrCreate()
)

spark.addArtifacts("/stackable/spark/connect/spark-connect-4.0.1.jar")

In [14]:
# Clear checkpoint directory
# This will delete the checkpoint directory to reset word counts

CHECKPOINT_LOCATION = "/tmp/wordcount-checkpoint"

def delete_checkpoint():
    """Delete the checkpoint directory to reset word counts using a regular Spark session"""
    # Stop the query first if it exists
    try:
        if 'query' in globals():
            query.stop()
            print("✓ Stopped streaming query")
    except Exception as e:
        print(f"Note stopping query: {e}")
    
    # Create a regular Spark session (not Spark Connect) for filesystem operations
    # This session has access to SparkContext which is needed for filesystem APIs
    deleted = False
    spark_fs = None
    
    try:
        # Create a regular Spark session for filesystem operations
        # Using master "local[*]" for local execution, or configure for your cluster
        spark_fs = (
            SparkSession
            .builder
            .appName("checkpoint-delete")
            .master("local[*]")  # Change this to your cluster master if needed
            .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.13:4.0.1")
            .getOrCreate()
        )
        
        # Now we can access SparkContext
        sc = spark_fs.sparkContext
        fs = sc._jvm.org.apache.hadoop.fs.FileSystem.get(
            sc._jvm.java.net.URI("file:///"),
            sc._jsc.hadoopConfiguration()
        )
        path = sc._jvm.org.apache.hadoop.fs.Path(CHECKPOINT_LOCATION)
        
        if fs.exists(path):
            fs.delete(path, True)  # True = recursive
            print(f"✓ Deleted checkpoint directory: {CHECKPOINT_LOCATION}")
            deleted = True
        else:
            print(f"✓ Checkpoint directory does not exist: {CHECKPOINT_LOCATION}")
            deleted = True  # Consider it "deleted" if it doesn't exist
            
    except Exception as e:
        print(f"Could not delete checkpoint using Spark filesystem APIs: {e}")
        print("\nNote: If the checkpoint is on a remote server's local filesystem,")
        print("you may need to delete it manually using kubectl.")
    finally:
        # Clean up the filesystem Spark session
        if spark_fs is not None:
            try:
                spark_fs.stop()
            except:
                pass
    
    if not deleted:
        print(f"\n⚠ Could not delete checkpoint automatically from notebook.")
        print(f"Checkpoint location: {CHECKPOINT_LOCATION}")
        print("\nPlease delete manually using kubectl:")
        print(f"  kubectl exec -n bigdata -it deployment/spark-connect-server -- rm -rf {CHECKPOINT_LOCATION}")
        print("\nOr change CHECKPOINT_LOCATION in Cell 2 to a new path to start fresh.")

# Uncomment the line below to delete the checkpoint
delete_checkpoint()


✓ Stopped streaming query
Could not delete checkpoint using Spark filesystem APIs: [CONNECT_URL_NOT_SET] Cannot create a Spark Connect session because the Spark Connect remote URL has not been set. Please define the remote URL by setting either the 'spark.remote' option or the 'SPARK_REMOTE' environment variable.

Note: If the checkpoint is on a remote server's local filesystem,
you may need to delete it manually using kubectl.

⚠ Could not delete checkpoint automatically from notebook.
Checkpoint location: /tmp/wordcount-checkpoint

Please delete manually using kubectl:
  kubectl exec -n bigdata -it deployment/spark-connect-server -- rm -rf /tmp/wordcount-checkpoint

Or change CHECKPOINT_LOCATION in Cell 2 to a new path to start fresh.


In [None]:
from pyspark.sql.functions import explode, split, col, lower, regexp_replace, count as spark_count, concat_ws, lit, to_json, struct

# Stop any existing streaming query
try:
    if 'query' in globals():
        query.stop()
        print("Stopped previous query")
except NameError:
    pass
except Exception as e:
    print(f"Note: {e}")

# Read streaming data from Kafka
df = spark \
    .readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "kafka-broker.bigdata.svc.cluster.local:9092") \
    .option("subscribe", "sparktest") \
    .option("failOnDataLoss", "false") \
    .option("startingOffsets", "latest") \
    .load()

# Process the stream for word count
# 1. Extract the value as string
# 2. Split into words (split on whitespace and punctuation)
# 3. Explode to get one word per row
# 4. Clean words (lowercase, remove empty strings)
# 5. Count occurrences
words_df = df.selectExpr("CAST(value AS STRING) as text") \
    .select(explode(split(lower(regexp_replace(col("text"), "[^a-zA-Z0-9\\s]", " ")), "\\s+")).alias("word")) \
    .filter(col("word") != "") \
    .groupBy("word") \
    .agg(spark_count("*").alias("count"))

# Format word counts as JSON for Kafka output
# Each row will be: {"word": "the", "count": 5}
# Kafka requires key and value to be binary, so we cast to string then to binary
output_df = words_df.select(
    col("word").cast("string").alias("key"),
    to_json(struct("word", "count")).cast("string").alias("value")
)

# Start the streaming query writing to Kafka topic
# Using trigger with 1 second interval for faster batch processing (demo purposes)
query = output_df \
    .writeStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "kafka-broker.bigdata.svc.cluster.local:9092") \
    .option("topic", "wordcount-output") \
    .option("checkpointLocation", "/tmp/wordcount-checkpoint") \
    .outputMode("update") \
    .trigger(processingTime="1 second") \
    .start()

print("Word count streaming query started!")
print("Word counts are being written to 'wordcount-output' Kafka topic.")
print("\nTo view the results, run:")
print("kubectl exec -n bigdata -it deployment/broker -- /opt/kafka/bin/kafka-console-consumer.sh --bootstrap-server broker:29092 --topic wordcount-output --from-beginning")

Stopped previous query
Word count streaming query started!
Word counts are being written to 'wordcount-output' Kafka topic.

To view the results, run:
kubectl exec -n bigdata -it deployment/broker -- /opt/kafka/bin/kafka-console-consumer.sh --bootstrap-server broker:29092 --topic wordcount-output --from-beginning


In [None]:
# Commands to view word count results:
#
# 1. Create the output topic (if it doesn't exist):
#    kubectl exec -n bigdata -it deployment/broker -- /opt/kafka/bin/kafka-topics.sh --create --topic wordcount-output --bootstrap-server broker:29092 --partitions 1 --replication-factor 1
#
# 2. View word counts from the output topic (from beginning):
#    kubectl exec -n bigdata -it deployment/broker -- /opt/kafka/bin/kafka-console-consumer.sh --bootstrap-server broker:29092 --topic wordcount-output --from-beginning
#
# 3. View only new messages (real-time):
#    kubectl exec -n bigdata -it deployment/broker -- /opt/kafka/bin/kafka-console-consumer.sh --bootstrap-server broker:29092 --topic wordcount-output
#
# 4. List all topics:
#    kubectl exec -n bigdata -it deployment/broker -- /opt/kafka/bin/kafka-topics.sh --list --bootstrap-server broker:29092


In [None]:
# Keep the query running
# Press Ctrl+C or interrupt the kernel to stop
query.awaitTermination()


In [6]:
!pip install kafka-python

Defaulting to user installation because normal site-packages is not writeable
Collecting kafka-python
  Downloading kafka_python-2.2.15-py2.py3-none-any.whl (309 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.8/309.8 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: kafka-python
Successfully installed kafka-python-2.2.15


In [None]:
# Helper: Send test messages with text for word counting
import kafka
import json
import time

# Create producer
producer = kafka.KafkaProducer(
    bootstrap_servers=['kafka-broker.bigdata.svc.cluster.local:9092'],
    value_serializer=lambda v: v.encode('utf-8') if isinstance(v, str) else v
)

# Sample sentences for word count testing
test_messages = [
    "The quick brown fox jumps over the lazy dog",
    "Spark streaming is awesome for real-time processing",
    "Kafka and Spark work great together",
    "Word count is a classic example of stream processing",
    "The fox jumps and the dog runs",
    "Real-time analytics with Spark and Kafka",
    "Streaming data processing made easy",
    "Count words in real-time with Spark Streaming"
]

print("Sending test messages for word counting...")
for i, message in enumerate(test_messages):
    producer.send('sparktest', value=message)
    print(f"Sent: {message}")
    time.sleep(0.5)  # Wait 0.5 seconds between messages (faster for demo)

producer.flush()
print("\nAll test messages sent! Check the streaming output above for word counts.")


Sending test messages for word counting...
Sent: The quick brown fox jumps over the lazy dog
Sent: Spark streaming is awesome for real-time processing
Sent: Kafka and Spark work great together
Sent: Word count is a classic example of stream processing
Sent: The fox jumps and the dog runs
Sent: Real-time analytics with Spark and Kafka
Sent: Streaming data processing made easy
Sent: Count words in real-time with Spark Streaming

All test messages sent! Check the streaming output above for word counts.
