In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col, sum, to_timestamp
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType
from datetime import datetime
import os
import psycopg2

In [None]:
# Build (or reuse) the SparkSession for the purchase-event streaming job.
# The Kafka connector and client jars are resolved via spark.jars.packages at start-up.
spark = (
    SparkSession.builder
    .appName("PurchaseEventStreaming")
    .config(
        "spark.jars.packages",
        "org.apache.spark:spark-sql-kafka-0-10_2.12:3.1.2,org.apache.kafka:kafka-clients:2.8.0",
    )
    .config("spark.streaming.kafka.consumer.cache.enabled", "false")
    .config("spark.sql.streaming.schemaInference", "true")
    .getOrCreate()
)

In [3]:
# Expected layout of the JSON payload carried in each purchase-event message.
# Every field is nullable (the default), and the timestamp arrives as a plain string.
schema = (
    StructType()
    .add("timestamp", StringType())
    .add("product_id", IntegerType())
    .add("quantity", IntegerType())
    .add("price", DoubleType())
)

In [4]:
# Kafka connection settings.
# Each value can be overridden through an environment variable; the fallback
# is the value this notebook was originally written against.
kafka_broker = os.environ.get("KAFKA_BROKER", 'kafka-1:29092,kafka-2:29093,kafka-3:29094')
kafka_topic = os.environ.get("KAFKA_TOPIC", "purchase_event")

# PostgreSQL connection details.
# Read from the environment so real credentials never have to be committed
# with the notebook. The fallbacks are local-development defaults only:
# set PG_PASSWORD (and friends) in any shared or production environment.
pg_host = os.environ.get("PG_HOST", "postgres_container")
pg_db = os.environ.get("PG_DB", "mydb")
pg_user = os.environ.get("PG_USER", "postgres")
pg_password = os.environ.get("PG_PASSWORD", "secretpassword")

In [None]:
# Open a streaming DataFrame over the purchase-event topic using Spark's
# "kafka" source, starting from the earliest available offsets and
# connecting to the brokers over PLAINTEXT.
df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", kafka_broker)
    .option("subscribe", kafka_topic)
    .option("startingOffsets", "earliest")
    .option("kafka.security.protocol", "PLAINTEXT")
    .load()
)

In [None]:
# Decode each Kafka message value as a string, parse it as JSON against
# `schema`, and flatten the parsed struct into top-level columns.
parsed_df = df.select(
    from_json(col("value").cast("string"), schema).alias("data")
).select("data.*")

# Total sales (quantity * price) per calendar day.
# The string timestamp is cast to a date so that all events from the same day
# land in one group. The cast is equivalent to to_date() without a format and
# only needs `col`, which is already imported (to_date itself is not).
# Note: `sum` here is pyspark.sql.functions.sum from the import cell, not the
# Python builtin.
sales_data = (
    parsed_df
    .withColumn("timestamp", col("timestamp").cast("date"))
    .groupBy("timestamp")
    .agg(sum(col("quantity") * col("price")).alias("total_sales"))
)

In [None]:
# Start the streaming query: every 5 seconds the full aggregate result
# ("complete" output mode) is handed to save_to_postgres as a micro-batch.
# foreachBatch is the sink here; a separate .format("console") would be
# overridden by it, so none is declared.
# NOTE(review): save_to_postgres is not defined in the cells visible here; it
# must be defined (taking the batch DataFrame and the batch id) before this
# cell runs, otherwise this raises NameError.
query = (
    sales_data.writeStream
    .outputMode("complete")
    .trigger(processingTime='5 seconds')
    .foreachBatch(save_to_postgres)
    .start()
)

# Block until the query ends. The finally clause stops the query if the cell
# is interrupted, so re-running the notebook does not leave a second copy of
# the stream running in the background.
try:
    query.awaitTermination()
finally:
    query.stop()