In [52]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StructType, StructField, StringType, FloatType, LongType
import json

In [53]:
def create_spark_connection():
    s_conn = None

    try:
        s_conn = SparkSession.builder \
            .appName('SparkDataStreaming') \
            .config('spark.jars.packages', "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.0") \
            .getOrCreate()

        s_conn.sparkContext.setLogLevel("ERROR")
        print("Spark connection created successfully!")
    except Exception as e:
        print("Error creating Spark connection: {0}".format(e))
    return s_conn

In [54]:
spark_conn = create_spark_connection()

Spark connection created successfully!


In [55]:
def connect_to_kafka(spark_conn):
    spark_df = None
    try:
        spark_df = spark_conn.readStream \
            .format('kafka') \
            .option('kafka.bootstrap.servers', 'localhost:9092') \
            .option('subscribe', 'trades_topic') \
            .option('startingOffsets', 'earliest') \
            .load()
        print("kafka dataframe created successfully")
    except Exception as e:
        print(f"kafka dataframe could not be created because: {e}")

    return spark_df

In [56]:
spark_df = connect_to_kafka(spark_conn)

kafka dataframe created successfully


In [57]:
# message example
data = '{"data":[{"c":null,"p":34851.9,"s":"BINANCE:BTCUSDT","t":1699351036478,"v":0.00014},{"c":null,"p":34851.89,"s":"BINANCE:BTCUSDT","t":1699351036955,"v":0.01},{"c":null,"p":34851.89,"s":"BINANCE:BTCUSDT","t":1699351037011,"v":0.00029}],"type":"trade"}'
print(json.dumps(json.loads(data), indent=4))

{
    "data": [
        {
            "c": null,
            "p": 34851.9,
            "s": "BINANCE:BTCUSDT",
            "t": 1699351036478,
            "v": 0.00014
        },
        {
            "c": null,
            "p": 34851.89,
            "s": "BINANCE:BTCUSDT",
            "t": 1699351036955,
            "v": 0.01
        },
        {
            "c": null,
            "p": 34851.89,
            "s": "BINANCE:BTCUSDT",
            "t": 1699351037011,
            "v": 0.00029
        }
    ],
    "type": "trade"
}


s: Symbol

p: Last price

t: UNIX milliseconds timestamp

v: Volume

c: List of trade conditions.

In [58]:
from pyspark.sql.types import FloatType, LongType

def create_selection_df_from_kafka(spark_df):
    schema = StructType([
        StructField("s", StringType(), True),
        StructField("p", FloatType(), True),
        StructField("t", LongType(), True),
        StructField("v", FloatType(), True),
        StructField("c", StringType(), True)
    ])

    sel = spark_df.selectExpr("CAST(value AS STRING)") \
        .select(from_json(col('value'), schema).alias('data')) \
        .select("data.*")
    
    return sel

In [59]:
selection_df = create_selection_df_from_kafka(spark_df)

In [60]:
selection_df

DataFrame[s: string, p: float, t: bigint, v: float, c: string]

In [61]:
streaming_query = selection_df \
    .writeStream \
    .format("console") \
    .start()

                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+----+----+----+----+----+
|   s|   p|   t|   v|   c|
+----+----+----+----+----+
|NULL|NULL|NULL|NULL|NULL|
|NULL|NULL|NULL|NULL|NULL|
|NULL|NULL|NULL|NULL|NULL|
|NULL|NULL|NULL|NULL|NULL|
|NULL|NULL|NULL|NULL|NULL|
|NULL|NULL|NULL|NULL|NULL|
+----+----+----+----+----+

