In [1]:
!pip install pyspark

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, window, count, avg, to_timestamp
import pandas as pd
import time
import requests

airport_url = "https://raw.githubusercontent.com/databricks/LearningSparkV2/master/databricks-datasets/learning-spark-v2/flights/airport-codes-na.txt"
delays_url = "https://raw.githubusercontent.com/databricks/LearningSparkV2/master/databricks-datasets/learning-spark-v2/flights/departuredelays.csv"

# Download both datasets to the working directory so pandas and Spark can
# read them from local paths.
for url, local_path in (
    (airport_url, "airport-codes-na.txt"),
    (delays_url, "departuredelays.csv"),
):
    # A timeout keeps a stalled connection from hanging the cell forever.
    response = requests.get(url, timeout=60)
    # Fail loudly on a 4xx/5xx reply instead of silently saving an error
    # page that would later be parsed as flight data.
    response.raise_for_status()
    # Open the file only after the download succeeded, so a failed request
    # does not leave an empty file behind.
    with open(local_path, "wb") as f:
        f.write(response.content)

# Create the Spark session, or reuse one that is already running.
session_builder = SparkSession.builder.appName(
    "Simulated Real-Time Flight Delay Processing"
)
spark = session_builder.getOrCreate()

# Airport lookup table: tab-separated text whose first line holds the
# column names.
airport_reader = spark.read.option("header", "true").option("sep", "\t")
airport_df = airport_reader.csv("airport-codes-na.txt")

# Load the departure-delay records into pandas.
delays_pd = pd.read_csv("departuredelays.csv")

# Left-pad `date` to 8 characters so it lines up with the month/day/hour/
# minute pattern (presumably the CSV parse drops leading zeros - confirm
# against the raw file).
date_text = delays_pd["date"].astype(str).str.zfill(8)

# The pattern carries no year, so parsed timestamps land in pandas' default
# year (1900); values that do not match the pattern become NaT.
delays_pd["timestamp"] = pd.to_datetime(
    date_text,
    format="%m%d%H%M",
    errors="coerce",
)

# Simulate a stream: feed the delay records to Spark in small micro-batches,
# enrich each batch with the origin airport's city, and print per-city
# hourly metrics.
chunk_size = 50
total_rows = len(delays_pd)

# Loop-invariant: only these two airport columns are needed for enrichment,
# so build the projection once instead of on every iteration.
airport_lookup = airport_df.select("IATA", "City")

try:
    for chunk_no, start in enumerate(range(0, total_rows, chunk_size), start=1):
        end = min(start + chunk_size, total_rows)
        chunk = delays_pd.iloc[start:end]

        spark_chunk = spark.createDataFrame(chunk)

        # Left join keeps flights whose origin has no airport match
        # (their City is null).
        enriched_chunk = spark_chunk.join(
            airport_lookup,
            spark_chunk["origin"] == airport_lookup["IATA"],
            "left",
        ).drop("IATA")

        # Flight count and mean delay per (1-hour window, City).
        metrics = (
            enriched_chunk
            .withColumn("timestamp", to_timestamp(col("timestamp")))
            .groupBy(window("timestamp", "1 hour"), "City")
            .agg(
                count("delay").alias("flight_count"),
                avg("delay").alias("avg_delay"),
            )
        )

        print(f"Metrics for chunk {chunk_no} (rows {start}-{end-1}):")
        metrics.show(truncate=False)

        # Pause between batches to mimic data arriving over time.
        time.sleep(2)
finally:
    # Always release the Spark session, even when a batch fails or the cell
    # is interrupted mid-run; otherwise the JVM session is left dangling.
    spark.stop()

Streaming output truncated to the last 5000 lines.
|{1900-01-01 14:00:00, 1900-01-01 15:00:00}|Austin|1           |7.0               |
|{1900-01-03 14:00:00, 1900-01-03 15:00:00}|Austin|1           |11.0              |
|{1900-01-04 06:00:00, 1900-01-04 07:00:00}|Austin|1           |5.0               |
|{1900-01-03 07:00:00, 1900-01-03 08:00:00}|Austin|3           |12.666666666666666|
|{1900-01-02 19:00:00, 1900-01-02 20:00:00}|Austin|2           |25.0              |
|{1900-01-01 07:00:00, 1900-01-01 08:00:00}|Austin|2           |-0.5              |
|{1900-01-02 07:00:00, 1900-01-02 08:00:00}|Austin|3           |17.333333333333332|
|{1900-01-01 19:00:00, 1900-01-01 20:00:00}|Austin|2           |46.0              |
|{1900-01-03 06:00:00, 1900-01-03 07:00:00}|Austin|1           |9.0               |
|{1900-01-06 14:00:00, 1900-01-06 15:00:00}|Austin|1           |68.0              |
|{1900-01-07 07:00:00, 1900-01-07 08:00:00}|Austin|2           |19.0              |
|{1900-01-0

ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: reentrant call inside <_io.BufferedReader name=48>

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/py4j/clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving
ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/py4j/clientserver.py", line 511, in sen

Metrics for chunk 960 (rows 47950-47999):


Py4JError: An error occurred while calling o49950.showString