In [21]:
# Import the necessary libraries from PySpark
from pyspark.sql import SparkSession

# Build (or reuse) the SparkSession for this notebook. The Kafka source is
# not bundled with PySpark, so the connector is requested through
# `spark.jars.packages`.
# NOTE(review): the connector coordinates pin Scala 2.12 / Spark 3.5.0 --
# confirm they match the installed Spark build.
# NOTE(review): getOrCreate() presumably hands back an already-running
# session if one exists in this kernel, in which case the package setting
# may not be applied -- restart the kernel after changing it.
spark = (
    SparkSession.builder
    .appName("FlightPulse-Streaming")
    .config(
        "spark.jars.packages",
        "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.0",
    )
    .getOrCreate()
)

# Status message only; it is printed unconditionally once the call above returns.
print("SparkSession created successfully with Kafka package!")

SparkSession created successfully with Kafka package!


In [22]:
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType

# Schema applied to the JSON payload of each Kafka message.
# All 17 fields are nullable. The flight attributes (including FLIGHT_DATE
# and FLIGHT_TIME) are declared as strings; the seat and eligible-passenger
# counts are declared as integers. Field order matches the original layout.
schema = (
    StructType()
    # Flight identification and routing attributes (strings).
    .add("FLIGHT_DATE", StringType(), True)
    .add("FLIGHT_TIME", StringType(), True)
    .add("TIME_OF_DAY", StringType(), True)
    .add("AIRLINE_CD", StringType(), True)
    .add("FLIGHT_NO", StringType(), True)
    .add("DEPARTURE_STATION_CD", StringType(), True)
    .add("ARRIVAL_STATION_CD", StringType(), True)
    .add("ARRIVAL_COUNTRY", StringType(), True)
    .add("ARRIVAL_REGION", StringType(), True)
    .add("HAUL", StringType(), True)
    .add("AIRCRAFT_TYPE", StringType(), True)
    # Seat counts per cabin (integers).
    .add("FIRST_CLASS_SEATS", IntegerType(), True)
    .add("BUSINESS_CLASS_SEATS", IntegerType(), True)
    .add("ECONOMY_SEATS", IntegerType(), True)
    # Eligible passenger counts per tier (integers).
    .add("TIER1_ELIGIBLE_PAX", IntegerType(), True)
    .add("TIER2_ELIGIBLE_PAX", IntegerType(), True)
    .add("TIER3_ELIGIBLE_PAX", IntegerType(), True)
)

# --- Create a Streaming DataFrame ---
# Open a streaming read against the Kafka source, subscribed to the
# "flight_events" topic on the broker at host.docker.internal:9092.
# No starting-offset option is set, so the source's default applies.
kafka_df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "host.docker.internal:9092")
    .option("subscribe", "flight_events")
    .load()
)

# --- Parse the Data ---
# Cast the Kafka `value` column to a string, parse it as JSON using
# `schema`, then flatten the resulting struct so each schema field becomes
# its own top-level column. Only the parsed payload columns are kept.
parsed_df = (
    kafka_df
    .select(
        from_json(col("value").cast("string"), schema).alias("data")
    )
    .select("data.*")
)

# --- Start the Streaming Query ---
# Make this cell safe to re-run: without this guard every execution would
# start another console query while the earlier ones keep running (and the
# old handle is lost when `query` is rebound). The query is given a name so
# a previous run can be found among the session's active queries and
# stopped before a fresh one is started.
for active_query in spark.streams.active:
    if active_query.name == "flight_events_console":
        active_query.stop()

# Write each micro-batch of newly arrived rows (append mode) to the console
# sink. start() returns a handle immediately; the query keeps running in
# the background until `query.stop()` is called.
query = (
    parsed_df
    .writeStream
    .queryName("flight_events_console")
    .outputMode("append")
    .format("console")
    .start()
)