In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col, sum, to_timestamp, window
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType
from datetime import datetime
import psycopg2
import os

In [2]:
# Build (or reuse) the SparkSession for the purchase-event streaming job.
# The Kafka connector and Kafka client jars are requested through
# spark.jars.packages so they are resolved when the session starts.
spark = (
    SparkSession.builder
    .appName("PurchaseEventStreaming")
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.1.2,org.apache.kafka:kafka-clients:2.8.0")
    .config("spark.streaming.kafka.consumer.cache.enabled", "false")
    .config("spark.sql.streaming.schemaInference", "true")
    .getOrCreate()
)

:: loading settings :: url = jar:file:/opt/spark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/jovyan/.ivy2/cache
The jars for the packages stored in: /home/jovyan/.ivy2/jars
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
org.apache.kafka#kafka-clients added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-ef693dfe-945e-4c3c-9dbf-d3fd95dce410;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.1.2 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.1.2 in central
	found org.spark-project.spark#unused;1.0.0 in central
	found org.apache.commons#commons-pool2;2.6.2 in central
	found org.apache.kafka#kafka-clients;2.8.0 in central
	found com.github.luben#zstd-jni;1.4.9-1 in central
	found org.lz4#lz4-java;1.7.1 in central
	found org.xerial.snappy#snappy-java;1.1.8.1 in central
	found org.slf4j#slf4j-api;1.7.30 in central
:: resolution report :: resolve 162ms :: artifacts dl 6ms
	:: modules in use:
	com.github.luben#zstd-jni;1.4.9-1 from central in [default]


In [5]:
# Schema of the JSON payload carried by each purchase event.
# "timestamp" is kept as a string at this stage.
schema = (
    StructType()
    .add("timestamp", StringType())
    .add("product_id", IntegerType())
    .add("quantity", IntegerType())
    .add("price", DoubleType())
)

In [10]:
# Kafka source settings: the topic is fixed, the broker address comes from the environment.
kafka_topic = "purchase_event"
kafka_broker = os.getenv('KAFKA_BROKER')

# PostgreSQL connection settings, each read from the environment (None when unset).
pg_host, pg_db, pg_user, pg_password = (
    os.getenv(var_name)
    for var_name in ('POSTGRES_HOST', 'POSTGRES_DB', 'POSTGRES_USER', 'POSTGRES_PASSWORD')
)

In [11]:
def save_to_postgres(df, epoch_id):
    """Upsert one micro-batch of aggregated totals into the ``running_total`` table.

    Used as the ``foreachBatch`` sink of the streaming query. The table is
    created if it does not exist, then every collected row whose
    ``window_end`` and ``running_total`` are both non-null is written with
    ``INSERT ... ON CONFLICT (timestamp) DO UPDATE`` and echoed to stdout.
    Rows containing a null are reported and skipped.

    Args:
        df: Batch DataFrame exposing ``window_end`` and ``running_total`` columns.
        epoch_id: Micro-batch identifier passed by ``foreachBatch``; not used.

    Raises:
        Any exception from psycopg2 or from ``df.collect()`` propagates to the
        caller. The transaction is rolled back and the connection is closed
        before it propagates.
    """
    conn = psycopg2.connect(host=pg_host, database=pg_db, user=pg_user, password=pg_password)
    try:
        # `with conn` commits on success and rolls back on error (it does NOT
        # close the connection); `with conn.cursor()` closes the cursor.
        with conn, conn.cursor() as cur:
            # Create table if not exists
            cur.execute("""
            CREATE TABLE IF NOT EXISTS running_total (
                timestamp TIMESTAMP PRIMARY KEY,
                running_total DOUBLE PRECISION
            )
            """)

            # Insert or update data
            print(f"\n--- Running Total Updated at {datetime.now()} ---")
            print("Timestamp | Running Total")
            print("-----------+---------------")
            for row in df.collect():
                window_end = row.window_end
                running_total = row.running_total

                if window_end is not None and running_total is not None:
                    cur.execute("""
                    INSERT INTO running_total (timestamp, running_total)
                    VALUES (%s, %s)
                    ON CONFLICT (timestamp) DO UPDATE
                    SET running_total = EXCLUDED.running_total
                    """, (window_end, running_total))

                    print(f"{window_end} | {running_total:.2f}")
                else:
                    print(f"Skipping row due to None values: window_end={window_end}, running_total={running_total}")
    finally:
        # Always release the connection, even when a statement or collect() fails;
        # this function runs on every trigger, so a leak here accumulates quickly.
        conn.close()

    print("\n")

In [12]:
# Open a streaming read on the Kafka topic, starting from the earliest
# available offsets and talking to the broker over PLAINTEXT.
df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", kafka_broker)
    .option("subscribe", kafka_topic)
    .option("startingOffsets", "earliest")
    .option("kafka.security.protocol", "PLAINTEXT")
    .load()
)

# Decode the Kafka `value` column as a JSON string, parse it with the
# purchase-event schema, and flatten the parsed struct into top-level columns.
parsed_df = (
    df
    .select(from_json(col("value").cast("string"), schema).alias("data"))
    .select("data.*")
)

In [13]:
# Turn the string timestamp into a real timestamp, compute per-event sales
# (quantity * price), and sum the sales within each 1-day window.
# The output keeps only each window's end time and its summed total.
sales_df = (
    parsed_df
    .withColumn("timestamp", to_timestamp("timestamp"))
    .withColumn("sales", col("quantity") * col("price"))
    .groupBy(window("timestamp", "1 day"))
    .agg(sum("sales").alias("running_total"))
    .select(col("window.end").alias("window_end"), col("running_total"))
)

In [None]:
# Start the streaming query.
# "complete" output mode re-emits the full aggregate on every trigger, and
# save_to_postgres upserts each emitted row, so replays are harmless.
stream_writer = (
    sales_df.writeStream
    .outputMode("complete")
    .foreachBatch(save_to_postgres)
    .trigger(processingTime='10 seconds')
)

# Optional durable checkpoint: without one Spark falls back to a temporary
# directory (see the ResolveWriteToStream warning), so offsets and aggregation
# state are lost on restart. Leaving SPARK_CHECKPOINT_DIR unset keeps the
# previous behaviour unchanged.
checkpoint_location = os.environ.get('SPARK_CHECKPOINT_DIR')
if checkpoint_location:
    stream_writer = stream_writer.option("checkpointLocation", checkpoint_location)

query = stream_writer.start()

try:
    query.awaitTermination()
finally:
    # Interrupting the cell only interrupts the Python wait; stop the query
    # explicitly so it does not keep running (and writing) in the background.
    query.stop()

24/09/08 12:02:36 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-1159a6f1-0bf7-4ffc-b885-5ab2c6cf085a. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
24/09/08 12:02:36 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
24/09/08 12:02:36 WARN AdminClientConfig: The configuration 'key.deserializer' was supplied but isn't a known config.
24/09/08 12:02:36 WARN AdminClientConfig: The configuration 'value.deserializer' was supplied but isn't a known config.
24/09/08 12:02:36 WARN AdminClientConfig: The configuration 'enable.auto.commit' was supplied but isn't a known config.
24/09/08 12:02:36 WARN AdminClientConfig: The configuration 'max.poll.records' was supplied but isn't a known con


--- Running Total Updated at 2024-09-08 12:02:37.598939 ---
Timestamp | Running Total
-----------+---------------


                                                                                

2024-09-09 00:00:00 | 22504.11



--- Running Total Updated at 2024-09-08 12:02:50.106909 ---
Timestamp | Running Total
-----------+---------------


                                                                                

2024-09-09 00:00:00 | 28978.13



--- Running Total Updated at 2024-09-08 12:03:00.097060 ---
Timestamp | Running Total
-----------+---------------


                                                                                

2024-09-09 00:00:00 | 29250.57



--- Running Total Updated at 2024-09-08 12:03:10.090399 ---
Timestamp | Running Total
-----------+---------------


                                                                                

2024-09-09 00:00:00 | 30832.33



--- Running Total Updated at 2024-09-08 12:03:20.082268 ---
Timestamp | Running Total
-----------+---------------


                                                                                

2024-09-09 00:00:00 | 33094.15



--- Running Total Updated at 2024-09-08 12:03:30.077674 ---
Timestamp | Running Total
-----------+---------------


                                                                                

2024-09-09 00:00:00 | 41067.17



--- Running Total Updated at 2024-09-08 12:03:40.075699 ---
Timestamp | Running Total
-----------+---------------


                                                                                

2024-09-09 00:00:00 | 48181.81



--- Running Total Updated at 2024-09-08 12:03:50.078862 ---
Timestamp | Running Total
-----------+---------------


                                                                                

2024-09-09 00:00:00 | 52504.13



--- Running Total Updated at 2024-09-08 12:04:00.084342 ---
Timestamp | Running Total
-----------+---------------


                                                                                

2024-09-09 00:00:00 | 54451.79



--- Running Total Updated at 2024-09-08 12:04:10.076751 ---
Timestamp | Running Total
-----------+---------------


                                                                                

2024-09-09 00:00:00 | 57006.91



--- Running Total Updated at 2024-09-08 12:04:20.069216 ---
Timestamp | Running Total
-----------+---------------


                                                                                

2024-09-09 00:00:00 | 66938.41



--- Running Total Updated at 2024-09-08 12:04:30.085859 ---
Timestamp | Running Total
-----------+---------------


                                                                                

2024-09-09 00:00:00 | 70312.27



--- Running Total Updated at 2024-09-08 12:04:40.070814 ---
Timestamp | Running Total
-----------+---------------


                                                                                

2024-09-09 00:00:00 | 75095.45



--- Running Total Updated at 2024-09-08 12:04:50.067079 ---
Timestamp | Running Total
-----------+---------------


                                                                                

2024-09-09 00:00:00 | 76639.11



--- Running Total Updated at 2024-09-08 12:05:00.101977 ---
Timestamp | Running Total
-----------+---------------


                                                                                

2024-09-09 00:00:00 | 83026.47



--- Running Total Updated at 2024-09-08 12:05:10.064338 ---
Timestamp | Running Total
-----------+---------------


                                                                                

2024-09-09 00:00:00 | 83541.37



--- Running Total Updated at 2024-09-08 12:05:20.068195 ---
Timestamp | Running Total
-----------+---------------


                                                                                

2024-09-09 00:00:00 | 88764.49



--- Running Total Updated at 2024-09-08 12:05:30.065485 ---
Timestamp | Running Total
-----------+---------------


                                                                                

2024-09-09 00:00:00 | 90036.94



--- Running Total Updated at 2024-09-08 12:05:40.070821 ---
Timestamp | Running Total
-----------+---------------


                                                                                

2024-09-09 00:00:00 | 92770.72



--- Running Total Updated at 2024-09-08 12:05:50.069411 ---
Timestamp | Running Total
-----------+---------------


                                                                                

2024-09-09 00:00:00 | 97344.32



--- Running Total Updated at 2024-09-08 12:06:00.070100 ---
Timestamp | Running Total
-----------+---------------


                                                                                

2024-09-09 00:00:00 | 104155.33



--- Running Total Updated at 2024-09-08 12:06:10.071787 ---
Timestamp | Running Total
-----------+---------------


                                                                                

2024-09-09 00:00:00 | 105828.48



--- Running Total Updated at 2024-09-08 12:06:20.062343 ---
Timestamp | Running Total
-----------+---------------


                                                                                

2024-09-09 00:00:00 | 114262.19



--- Running Total Updated at 2024-09-08 12:06:30.063601 ---
Timestamp | Running Total
-----------+---------------


                                                                                

2024-09-09 00:00:00 | 118398.77



--- Running Total Updated at 2024-09-08 12:06:40.060757 ---
Timestamp | Running Total
-----------+---------------


                                                                                

2024-09-09 00:00:00 | 124068.79



--- Running Total Updated at 2024-09-08 12:06:50.064230 ---
Timestamp | Running Total
-----------+---------------


                                                                                

2024-09-09 00:00:00 | 126062.03



--- Running Total Updated at 2024-09-08 12:07:00.064357 ---
Timestamp | Running Total
-----------+---------------


                                                                                

2024-09-09 00:00:00 | 138281.64



--- Running Total Updated at 2024-09-08 12:07:10.061414 ---
Timestamp | Running Total
-----------+---------------


                                                                                

2024-09-09 00:00:00 | 140930.30



--- Running Total Updated at 2024-09-08 12:07:20.062464 ---
Timestamp | Running Total
-----------+---------------


                                                                                

2024-09-09 00:00:00 | 141256.52



--- Running Total Updated at 2024-09-08 12:07:30.062996 ---
Timestamp | Running Total
-----------+---------------


                                                                                

2024-09-09 00:00:00 | 150767.82



--- Running Total Updated at 2024-09-08 12:07:40.078270 ---
Timestamp | Running Total
-----------+---------------


                                                                                

2024-09-09 00:00:00 | 156752.06



--- Running Total Updated at 2024-09-08 12:07:50.062657 ---
Timestamp | Running Total
-----------+---------------


                                                                                

2024-09-09 00:00:00 | 166011.66



--- Running Total Updated at 2024-09-08 12:08:00.068513 ---
Timestamp | Running Total
-----------+---------------


                                                                                

2024-09-09 00:00:00 | 169964.91



--- Running Total Updated at 2024-09-08 12:08:10.068148 ---
Timestamp | Running Total
-----------+---------------


                                                                                

2024-09-09 00:00:00 | 171387.55



--- Running Total Updated at 2024-09-08 12:08:20.064557 ---
Timestamp | Running Total
-----------+---------------


                                                                                

2024-09-09 00:00:00 | 171451.55



--- Running Total Updated at 2024-09-08 12:08:30.060778 ---
Timestamp | Running Total
-----------+---------------


                                                                                

2024-09-09 00:00:00 | 172362.54



--- Running Total Updated at 2024-09-08 12:08:40.077804 ---
Timestamp | Running Total
-----------+---------------


                                                                                

2024-09-09 00:00:00 | 173644.18



--- Running Total Updated at 2024-09-08 12:08:50.056648 ---
Timestamp | Running Total
-----------+---------------


                                                                                

2024-09-09 00:00:00 | 173838.85



--- Running Total Updated at 2024-09-08 12:09:00.065711 ---
Timestamp | Running Total
-----------+---------------


                                                                                

2024-09-09 00:00:00 | 179820.29



--- Running Total Updated at 2024-09-08 12:09:10.063228 ---
Timestamp | Running Total
-----------+---------------


                                                                                

2024-09-09 00:00:00 | 180236.27



--- Running Total Updated at 2024-09-08 12:09:20.061261 ---
Timestamp | Running Total
-----------+---------------


                                                                                

2024-09-09 00:00:00 | 180320.78



--- Running Total Updated at 2024-09-08 12:09:30.064346 ---
Timestamp | Running Total
-----------+---------------


                                                                                

2024-09-09 00:00:00 | 185584.99



--- Running Total Updated at 2024-09-08 12:09:40.061113 ---
Timestamp | Running Total
-----------+---------------


                                                                                

2024-09-09 00:00:00 | 187696.43



--- Running Total Updated at 2024-09-08 12:09:50.057323 ---
Timestamp | Running Total
-----------+---------------


                                                                                

2024-09-09 00:00:00 | 193183.73



--- Running Total Updated at 2024-09-08 12:10:00.062529 ---
Timestamp | Running Total
-----------+---------------


                                                                                

2024-09-09 00:00:00 | 197767.26



--- Running Total Updated at 2024-09-08 12:10:10.063871 ---
Timestamp | Running Total
-----------+---------------


                                                                                

2024-09-09 00:00:00 | 201302.28



--- Running Total Updated at 2024-09-08 12:10:20.059329 ---
Timestamp | Running Total
-----------+---------------


                                                                                

2024-09-09 00:00:00 | 204972.83



--- Running Total Updated at 2024-09-08 12:10:30.057202 ---
Timestamp | Running Total
-----------+---------------


                                                                                

2024-09-09 00:00:00 | 210668.13



--- Running Total Updated at 2024-09-08 12:10:40.069974 ---
Timestamp | Running Total
-----------+---------------


                                                                                

2024-09-09 00:00:00 | 214678.64



--- Running Total Updated at 2024-09-08 12:10:50.058237 ---
Timestamp | Running Total
-----------+---------------


                                                                                

2024-09-09 00:00:00 | 217376.24



--- Running Total Updated at 2024-09-08 12:11:00.064855 ---
Timestamp | Running Total
-----------+---------------


                                                                                

2024-09-09 00:00:00 | 217873.16



--- Running Total Updated at 2024-09-08 12:11:10.059693 ---
Timestamp | Running Total
-----------+---------------


                                                                                

2024-09-09 00:00:00 | 219048.48



--- Running Total Updated at 2024-09-08 12:11:20.061516 ---
Timestamp | Running Total
-----------+---------------


                                                                                

2024-09-09 00:00:00 | 219226.18



--- Running Total Updated at 2024-09-08 12:11:30.061529 ---
Timestamp | Running Total
-----------+---------------


                                                                                

2024-09-09 00:00:00 | 220402.42



--- Running Total Updated at 2024-09-08 12:11:40.071346 ---
Timestamp | Running Total
-----------+---------------


                                                                                

2024-09-09 00:00:00 | 220419.22



--- Running Total Updated at 2024-09-08 12:11:50.060234 ---
Timestamp | Running Total
-----------+---------------


                                                                                

2024-09-09 00:00:00 | 227316.91



--- Running Total Updated at 2024-09-08 12:12:00.060636 ---
Timestamp | Running Total
-----------+---------------


                                                                                

2024-09-09 00:00:00 | 229724.23



--- Running Total Updated at 2024-09-08 12:12:10.066607 ---
Timestamp | Running Total
-----------+---------------


                                                                                

2024-09-09 00:00:00 | 238584.82



--- Running Total Updated at 2024-09-08 12:12:20.058638 ---
Timestamp | Running Total
-----------+---------------


                                                                                

2024-09-09 00:00:00 | 248875.30



--- Running Total Updated at 2024-09-08 12:12:30.066240 ---
Timestamp | Running Total
-----------+---------------


                                                                                

2024-09-09 00:00:00 | 251107.60



--- Running Total Updated at 2024-09-08 12:12:40.073911 ---
Timestamp | Running Total
-----------+---------------


                                                                                

2024-09-09 00:00:00 | 252097.15



--- Running Total Updated at 2024-09-08 12:12:50.058785 ---
Timestamp | Running Total
-----------+---------------


                                                                                

2024-09-09 00:00:00 | 253088.88



--- Running Total Updated at 2024-09-08 12:13:00.058598 ---
Timestamp | Running Total
-----------+---------------


                                                                                

2024-09-09 00:00:00 | 255109.65



--- Running Total Updated at 2024-09-08 12:13:10.064578 ---
Timestamp | Running Total
-----------+---------------


                                                                                

2024-09-09 00:00:00 | 261683.93



--- Running Total Updated at 2024-09-08 12:13:20.053787 ---
Timestamp | Running Total
-----------+---------------


                                                                                

2024-09-09 00:00:00 | 263624.45



--- Running Total Updated at 2024-09-08 12:13:30.054058 ---
Timestamp | Running Total
-----------+---------------


                                                                                

2024-09-09 00:00:00 | 265820.79



--- Running Total Updated at 2024-09-08 12:13:40.069797 ---
Timestamp | Running Total
-----------+---------------


                                                                                

2024-09-09 00:00:00 | 266737.27



--- Running Total Updated at 2024-09-08 12:13:50.058562 ---
Timestamp | Running Total
-----------+---------------


                                                                                

2024-09-09 00:00:00 | 271876.36



--- Running Total Updated at 2024-09-08 12:14:00.060879 ---
Timestamp | Running Total
-----------+---------------


                                                                                

2024-09-09 00:00:00 | 274130.66



--- Running Total Updated at 2024-09-08 12:14:10.070894 ---
Timestamp | Running Total
-----------+---------------


                                                                                

2024-09-09 00:00:00 | 276283.58


