In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col, sum, to_timestamp, window
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType
from datetime import datetime
import psycopg2
import os

In [2]:
# Build (or reuse) the Spark session used by every cell below.
# `spark.jars.packages` asks Spark to resolve the Kafka source connector and
# the Kafka client library as Maven coordinates when the session starts.
spark = (
    SparkSession.builder
    .appName("PurchaseEventStreaming")
    .config(
        "spark.jars.packages",
        ",".join((
            "org.apache.spark:spark-sql-kafka-0-10_2.12:3.1.2",
            "org.apache.kafka:kafka-clients:2.8.0",
        )),
    )
    .config("spark.streaming.kafka.consumer.cache.enabled", "false")
    .config("spark.sql.streaming.schemaInference", "true")
    .getOrCreate()
)

:: loading settings :: url = jar:file:/opt/spark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/jovyan/.ivy2/cache
The jars for the packages stored in: /home/jovyan/.ivy2/jars
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
org.apache.kafka#kafka-clients added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-64ecf6b3-c469-4ae9-a78d-de134af9cd81;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.1.2 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.1.2 in central
	found org.spark-project.spark#unused;1.0.0 in central
	found org.apache.commons#commons-pool2;2.6.2 in central
	found org.apache.kafka#kafka-clients;2.8.0 in central
	found com.github.luben#zstd-jni;1.4.9-1 in central
	found org.lz4#lz4-java;1.7.1 in central
	found org.xerial.snappy#snappy-java;1.1.8.1 in central
	found org.slf4j#slf4j-api;1.7.30 in central
downloading https://repo1.maven.org/maven2/org/apache/spark/spark-sql-kafka-0-10_2.12/3.1.2/spark-sql-kafka-0-10_2.12-3.1.2.jar ...
	[SUCCE

In [3]:
# Layout of the JSON document expected in each Kafka message value.
# `timestamp` is read as a plain string here and converted to a real
# timestamp later, when the sales aggregation is defined.
schema = (
    StructType()
    .add("timestamp", StringType())
    .add("product_id", IntegerType())
    .add("quantity", IntegerType())
    .add("price", DoubleType())
)

In [4]:
# Kafka source settings: the broker address comes from the environment,
# the topic name is fixed.
kafka_topic = "purchase_event"
kafka_broker = os.getenv("KAFKA_BROKER")

# PostgreSQL sink settings, all taken from the environment.
# Any variable that is not set is left as None.
pg_host = os.getenv("POSTGRES_HOST")
pg_db = os.getenv("POSTGRES_DB")
pg_user = os.getenv("POSTGRES_USER")
pg_password = os.getenv("POSTGRES_PASSWORD")

In [8]:
def _upsert_running_totals(cur, rows):
    """Ensure the ``running_total`` table exists and upsert ``rows`` into it.

    Each row must expose ``window_end`` and ``running_total`` attributes.
    Rows where either value is None are reported on stdout and skipped.
    Nothing is committed here; committing is left to the caller.

    Args:
        cur: An open DB-API cursor on the target PostgreSQL database.
        rows: Iterable of row objects to write.
    """
    # Create table if not exists
    cur.execute("""
    CREATE TABLE IF NOT EXISTS running_total (
        timestamp TIMESTAMP PRIMARY KEY,
        running_total DOUBLE PRECISION
    )
    """)

    # Insert or update data, echoing every written row to stdout
    print(f"\n--- Running Total Updated at {datetime.now()} ---")
    print("Timestamp | Running Total")
    print("-----------+---------------")
    for row in rows:
        window_end = row.window_end
        running_total = row.running_total

        if window_end is not None and running_total is not None:
            # The window end is the primary key, so a window that is seen
            # again overwrites its previous total instead of duplicating it.
            cur.execute("""
            INSERT INTO running_total (timestamp, running_total)
            VALUES (%s, %s)
            ON CONFLICT (timestamp) DO UPDATE
            SET running_total = EXCLUDED.running_total
            """, (window_end, running_total))

            print(f"{window_end} | {running_total:.2f}")
        else:
            print(f"Skipping row due to None values: window_end={window_end}, running_total={running_total}")


def save_to_postgres(df, epoch_id):
    """Write one micro-batch of windowed totals to PostgreSQL.

    Has the ``(df, epoch_id)`` signature expected by ``foreachBatch``. The
    batch is collected to the driver, upserted into the ``running_total``
    table in a single transaction, and echoed to stdout.

    The cursor and connection are always closed, including when collecting,
    executing or committing raises. In that case nothing is committed and the
    exception propagates to the caller.

    Args:
        df: Batch DataFrame whose rows expose ``window_end`` and
            ``running_total``.
        epoch_id: Micro-batch identifier supplied by the caller; unused.

    Raises:
        Whatever ``df.collect()`` or the psycopg2 calls raise.
    """
    # Materialise the batch before connecting, so no database connection or
    # transaction is held open while the Spark job behind collect() runs.
    rows = df.collect()

    conn = psycopg2.connect(host=pg_host, database=pg_db, user=pg_user, password=pg_password)
    try:
        cur = conn.cursor()
        try:
            _upsert_running_totals(cur, rows)
            # Commit only after every row was written without error
            conn.commit()
        finally:
            cur.close()
    finally:
        # Closing without a prior commit leaves the batch unwritten
        # (psycopg2 relies on the server to discard uncommitted changes).
        conn.close()

    print("\n")

In [9]:
# Subscribe to the purchase topic as a streaming source, starting from the
# earliest available offsets and talking to the broker over PLAINTEXT.
df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", kafka_broker)
    .option("subscribe", kafka_topic)
    .option("startingOffsets", "earliest")
    .option("kafka.security.protocol", "PLAINTEXT")
    .load()
)

# Decode the message value as a JSON string, parse it with `schema`,
# and flatten the resulting struct into top-level columns.
event_struct = from_json(col("value").cast("string"), schema).alias("data")
parsed_df = df.select(event_struct).select("data.*")

In [10]:
# Turn the string timestamp into a real timestamp, price each event as
# quantity * price, and total the sales per 1-day window.
# Change the "1 day" duration below to aggregate over a different window size.
daily_window = window("timestamp", "1 day")

sales_df = (
    parsed_df
    .withColumn("timestamp", to_timestamp("timestamp"))
    .withColumn("sales", col("quantity") * col("price"))
    .groupBy(daily_window)
    .agg(sum("sales").alias("running_total"))
    # Keep only the end of each window together with its total
    .select(col("window.end").alias("window_end"), col("running_total"))
)

In [None]:
# Launch the streaming job: every 10 seconds the complete aggregate is handed
# to `save_to_postgres`.
query = (
    sales_df.writeStream
    .foreachBatch(save_to_postgres)
    .outputMode("complete")
    .trigger(processingTime="10 seconds")
    .start()
)

# Keep this cell running until the query terminates
query.awaitTermination()

24/09/08 12:54:21 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-23602c73-8676-4564-91d0-fd999a751137. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
24/09/08 12:54:21 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
24/09/08 12:54:21 WARN AdminClientConfig: The configuration 'key.deserializer' was supplied but isn't a known config.
24/09/08 12:54:21 WARN AdminClientConfig: The configuration 'value.deserializer' was supplied but isn't a known config.
24/09/08 12:54:21 WARN AdminClientConfig: The configuration 'enable.auto.commit' was supplied but isn't a known config.
24/09/08 12:54:21 WARN AdminClientConfig: The configuration 'max.poll.records' was supplied but isn't a known con


--- Running Total Updated at 2024-09-08 12:54:22.866591 ---
Timestamp | Running Total
-----------+---------------


                                                                                

2024-09-09 00:00:00 | 26460.79



--- Running Total Updated at 2024-09-08 12:54:30.112054 ---
Timestamp | Running Total
-----------+---------------


                                                                                

2024-09-09 00:00:00 | 32681.36



--- Running Total Updated at 2024-09-08 12:54:40.086570 ---
Timestamp | Running Total
-----------+---------------


                                                                                

2024-09-09 00:00:00 | 33030.12



--- Running Total Updated at 2024-09-08 12:54:50.085703 ---
Timestamp | Running Total
-----------+---------------


                                                                                

2024-09-09 00:00:00 | 34682.54



--- Running Total Updated at 2024-09-08 12:55:00.083290 ---
Timestamp | Running Total
-----------+---------------


                                                                                

2024-09-09 00:00:00 | 37394.41



--- Running Total Updated at 2024-09-08 12:55:10.092295 ---
Timestamp | Running Total
-----------+---------------


                                                                                

2024-09-09 00:00:00 | 39462.16



--- Running Total Updated at 2024-09-08 12:55:20.088150 ---
Timestamp | Running Total
-----------+---------------


                                                                                

2024-09-09 00:00:00 | 40452.38



--- Running Total Updated at 2024-09-08 12:55:30.069380 ---
Timestamp | Running Total
-----------+---------------


                                                                                

2024-09-09 00:00:00 | 46331.97



--- Running Total Updated at 2024-09-08 12:55:40.075821 ---
Timestamp | Running Total
-----------+---------------


                                                                                

2024-09-09 00:00:00 | 49176.69



--- Running Total Updated at 2024-09-08 12:55:50.073820 ---
Timestamp | Running Total
-----------+---------------


                                                                                

2024-09-09 00:00:00 | 51398.69



--- Running Total Updated at 2024-09-08 12:56:00.065750 ---
Timestamp | Running Total
-----------+---------------


                                                                                

2024-09-09 00:00:00 | 60196.93



--- Running Total Updated at 2024-09-08 12:56:10.080801 ---
Timestamp | Running Total
-----------+---------------


                                                                                

2024-09-09 00:00:00 | 66645.25



--- Running Total Updated at 2024-09-08 12:56:20.063057 ---
Timestamp | Running Total
-----------+---------------


                                                                                

2024-09-09 00:00:00 | 68033.95


