In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col, to_timestamp, window, avg
from pyspark.sql.types import StructType, StructField, StringType, DoubleType

# ----------------------------------------------------------------------------
# 1) Create the SparkSession (including the Kafka connector)
# ----------------------------------------------------------------------------
# "spark.jars.packages" asks Spark to fetch the Kafka source connector
# (Scala 2.12 build, version 3.5.1) as a Maven dependency.
spark = (
    SparkSession.builder
    .appName("OpenWeatherKafkaStream")
    .master("spark://172.29.16.102:7077")
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.1")
    # The "timestamp_utc" strings parsed further down carry no zone designator,
    # and to_timestamp() interprets such strings in the session time zone.
    # Pin the session to UTC so the parsed instants do not depend on the
    # cluster's default zone (and are not mangled around DST changes).
    # NOTE(review): assumes the producer really emits UTC, as the field name
    # suggests -- confirm against the Kafka producer.
    .config("spark.sql.session.timeZone", "UTC")
    .getOrCreate()
)

# ----------------------------------------------------------------------------
# 2) Schema of the JSON messages published to the Kafka topic
# ----------------------------------------------------------------------------
# Each entry is (field name, Spark type, nullable), listed in message order.
# Identifiers and the timestamp are declared as required strings; all
# measurements are optional doubles.
# NOTE(review): from_json presumably does not enforce nullable=False (it yields
# nulls for unparsable input) -- confirm before relying on these flags.
schema = StructType([
    StructField(field_name, field_type(), nullable=is_nullable)
    for field_name, field_type, is_nullable in (
        ("city_id",        StringType, False),
        ("city_name",      StringType, False),
        ("latitude",       DoubleType, True),
        ("longitude",      DoubleType, True),
        ("timestamp_utc",  StringType, False),
        ("temp_celsius",   DoubleType, True),
        ("temp_min_c",     DoubleType, True),
        ("temp_max_c",     DoubleType, True),
        ("pressure_hpa",   DoubleType, True),
        ("humidity_pct",   DoubleType, True),
        ("wind_speed_kph", DoubleType, True),
        ("wind_direction", DoubleType, True),
    )
])

# ----------------------------------------------------------------------------
# 3) Streaming DataFrame: read from the Kafka topic "current-weather-api"
# ----------------------------------------------------------------------------
# "earliest" makes a fresh query start from the oldest offsets still available
# in the topic rather than only from newly arriving messages.
df_raw = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "172.29.16.101:9092")
    .option("subscribe", "current-weather-api")
    .option("startingOffsets", "earliest")
    .load()
)

# Kafka delivers "value" as binary: cast it to a string first ...
df_str = df_raw.select(col("value").cast("string").alias("json_str"))

# ... then parse the JSON with the schema and flatten the resulting struct
# so that every schema field becomes a top-level column.
df_json = (
    df_str
    .select(from_json(df_str["json_str"], schema).alias("data"))
    .select("data.*")
)

# ----------------------------------------------------------------------------
# 4) Convert "timestamp_utc" (string) into a proper timestamp column "ts"
# ----------------------------------------------------------------------------
# The raw string column is dropped once the typed "ts" column exists.
# NOTE(review): the pattern has no zone designator, so the value is presumably
# interpreted in the Spark session time zone -- confirm that zone is UTC.
df = (
    df_json
    .withColumn("ts", to_timestamp(df_json["timestamp_utc"], format="yyyy-MM-dd HH:mm:ss"))
    .drop("timestamp_utc")
)

# ----------------------------------------------------------------------------
# 5) Example aggregation: 10-minute average temperature per city
# ----------------------------------------------------------------------------
# The 30-minute watermark on "ts" bounds how long Spark keeps window state
# around for late-arriving events before finalising a window.
agg = (
    df
    .withWatermark("ts", "30 minutes")
    .groupBy(window("ts", "10 minutes"), "city_name")
    .agg(avg(col("temp_celsius")).alias("avg_temp_10min"))
)

# ----------------------------------------------------------------------------
# 6) Write the result to the console (for testing)
# ----------------------------------------------------------------------------
# "update" mode prints only the windows whose aggregate changed in a batch;
# truncation is disabled and up to 50 rows are shown per batch.
query = (
    agg.writeStream
    .format("console")
    .outputMode("update")
    .option("truncate", "false")
    .option("numRows", 50)
    .start()
)

try:
    # Block this cell/script until the streaming query stops or fails.
    query.awaitTermination()
finally:
    # awaitTermination() only waits. If it is interrupted (e.g. the notebook
    # cell is cancelled) the query would otherwise keep running on the
    # cluster, so stop it explicitly on every exit path.
    query.stop()


:: loading settings :: url = jar:file:/opt/spark-3.5.1/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/bdeng-g10/.ivy2/cache
The jars for the packages stored in: /home/bdeng-g10/.ivy2/jars
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-d5fe2bcd-baf9-4001-b657-103ff3735633;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.5.1 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.5.1 in central
	found org.apache.kafka#kafka-clients;3.4.1 in central
	found org.lz4#lz4-java;1.8.0 in central
	found org.xerial.snappy#snappy-java;1.1.10.3 in central
	found org.slf4j#slf4j-api;2.0.7 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.4 in central
	found org.apache.hadoop#hadoop-client-api;3.3.4 in central
	found commons-logging#commons-logging;1.1.3 in central
	found com.google.code.findbugs#jsr305;3.0.0 in central
	found org.apache.commons#commons-pool2;2.11.1 in central
:: resolution report :: resolve 558ms :: artifacts dl 28

-------------------------------------------
Batch: 0
-------------------------------------------
+------+---------+--------------+
|window|city_name|avg_temp_10min|
+------+---------+--------------+
+------+---------+--------------+

