# Analyse ZAMG Daten mit Spark

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [2]:
# Start (or reuse) the Spark session used throughout this notebook.
spark = SparkSession.builder.appName("Read Data").getOrCreate()

# The CSV is semicolon-delimited and is read without a header or schema option,
# so the columns are named positionally here and every column arrives as a string.
column_names = [
    "location",
    "alt",
    "deg",
    "hum",
    "wind_speed",
    "wind_speed_max",
    "rain",
    "sun",
    "press",
    "timestamp",
]
df_raw = (
    spark.read
    .option("delimiter", ";")
    .csv("../data/zamg/zamg_burgenland.csv")
    .toDF(*column_names)
)

# Inspect the raw schema and the first rows.
df_raw.printSchema()
df_raw.show()

root
 |-- location: string (nullable = true)
 |-- alt: string (nullable = true)
 |-- deg: string (nullable = true)
 |-- hum: string (nullable = true)
 |-- wind_speed: string (nullable = true)
 |-- wind_speed_max: string (nullable = true)
 |-- rain: string (nullable = true)
 |-- sun: string (nullable = true)
 |-- press: string (nullable = true)
 |-- timestamp: string (nullable = true)

+-----------------+----+-----+----+---------------+--------------+------+-----+----------+-------------------+
|         location| alt|  deg| hum|     wind_speed|wind_speed_max|  rain|  sun|     press|          timestamp|
+-----------------+----+-----+----+---------------+--------------+------+-----+----------+-------------------+
|     Bruckneudorf|166m|21.1°|46 %| Südost, 9 km/h|       25 km/h|0.0 mm| 28 %|1007.0 hPa|2022-05-23 19:50:03|
|  Neusiedl am See|119m|21.4°|49 %|   Süd, 11 km/h|       27 km/h|0.0 mm| 32 %|1007.3 hPa|2022-05-23 19:50:03|
|       Podersdorf|116m|21.8°|44 %|   Süd, 10 km/h|      

In [3]:
# Convert the string columns needed for the analysis into proper types:
#   deg       -> text before the "°" sign, cast to double
#   rain      -> text before " mm", cast to double
#   timestamp -> parsed with the pattern "yyyy-MM-dd HH:mm:ss"
df = df_raw.select(
    col("location"),
    substring_index(col("deg"), "°", 1).cast("double").alias("deg"),
    substring_index(col("rain"), " mm", 1).cast("double").alias("rain"),
    to_timestamp(col("timestamp"), "yyyy-MM-dd HH:mm:ss").alias("timestamp"),
)

# Verify the resulting schema and preview the converted rows.
df.printSchema()
df.show()

root
 |-- location: string (nullable = true)
 |-- deg: double (nullable = true)
 |-- rain: double (nullable = true)
 |-- timestamp: timestamp (nullable = true)

+-----------------+----+----+-------------------+
|         location| deg|rain|          timestamp|
+-----------------+----+----+-------------------+
|     Bruckneudorf|21.1| 0.0|2022-05-23 19:50:03|
|  Neusiedl am See|21.4| 0.0|2022-05-23 19:50:03|
|       Podersdorf|21.8| 0.0|2022-05-23 19:50:03|
|       Eisenstadt|21.6| 0.0|2022-05-23 19:50:03|
|            Andau|21.0| 0.0|2022-05-23 19:50:03|
|      Mattersburg|20.4| 0.0|2022-05-23 19:50:03|
|  Neudorf/Landsee|18.9| 0.0|2022-05-23 19:50:03|
|    Lutzmannsburg|21.2| 0.0|2022-05-23 19:50:03|
|        Bernstein|17.9| 0.0|2022-05-23 19:50:03|
|         Rechnitz|20.6| 0.0|2022-05-23 19:50:03|
|Bad Tatzmannsdorf|20.1| 0.0|2022-05-23 19:50:03|
|         Kroisegg|19.7| 0.0|2022-05-23 19:50:03|
|      Kleinzicken|20.8| 0.0|2022-05-23 19:50:03|
|       Wörterberg|19.6| 0.0|2022-05-23

In [4]:
# Per-station summary over all loaded observations: total rain, mean temperature,
# and the first/last observation timestamp.
# NOTE: sum/avg/min/max are the pyspark.sql.functions versions from the star
# import, not the Python builtins.
(
    df.groupBy("location")
    .agg(
        sum("rain"),
        avg("deg"),
        min("timestamp"),
        max("timestamp"),
    )
    .show()
)

+-----------------+------------------+------------------+-------------------+-------------------+
|         location|         sum(rain)|          avg(deg)|     min(timestamp)|     max(timestamp)|
+-----------------+------------------+------------------+-------------------+-------------------+
|  Neudorf/Landsee| 53.50000000000001| 15.66851063829787|2022-05-23 19:50:03|2022-06-02 13:50:03|
|          Güssing| 49.00000000000001| 16.75531914893617|2022-05-23 19:50:03|2022-06-02 13:50:03|
|       Wörterberg|56.800000000000004|15.670212765957451|2022-05-23 19:50:03|2022-06-02 13:50:03|
|Bad Tatzmannsdorf| 60.40000000000001| 16.31914893617022|2022-05-23 19:50:03|2022-06-02 13:50:03|
|        Bernstein| 76.50000000000001|14.379999999999999|2022-05-23 19:50:03|2022-06-02 13:50:03|
|    Lutzmannsburg| 70.00000000000001|16.628510638297872|2022-05-23 19:50:03|2022-06-02 13:50:03|
|      Mattersburg| 40.20000000000001|16.528936170212763|2022-05-23 19:50:03|2022-06-02 13:50:03|
|         Rechnitz| 

In [5]:
# Derive the calendar day from each observation's timestamp so that readings can
# be grouped per station and day.
df = df.withColumn("date", to_date(col("timestamp")))

# Daily rain total and mean temperature per station, each rounded to one decimal.
# The aggregates are aliased where they are defined: without an alias Spark
# generates names such as "round(sum(rain), 1)", and the frame would only become
# usable after a positional rename that can silently mislabel columns if the
# aggregate order ever changes.
# NOTE: round/sum/avg are the pyspark.sql.functions versions from the star
# import, not the Python builtins.
dfDay = df.groupBy("location", "date").agg(
    round(sum("rain"), 1).alias("rain"),
    round(avg("deg"), 1).alias("deg"),
)

In [6]:
# Assign short column names to the daily aggregate. toDF renames by position,
# so the order must match: location, date, rain, deg.
column_names = ["location", "date", "rain", "deg"]
dfDay = dfDay.toDF(*column_names)

# Confirm the names and types of the renamed frame.
dfDay.printSchema()

root
 |-- location: string (nullable = true)
 |-- date: date (nullable = true)
 |-- rain: double (nullable = true)
 |-- deg: double (nullable = true)



In [7]:
# Preview the daily aggregate (the show() defaults spelled out: 20 rows, long
# values truncated).
dfDay.show(n=20, truncate=True)

+-----------------+----------+----+----+
|         location|      date|rain| deg|
+-----------------+----------+----+----+
|  Neudorf/Landsee|2022-05-25| 6.7|15.2|
|       Eisenstadt|2022-05-26| 0.0|19.0|
|     Bruckneudorf|2022-05-29| 0.0|12.9|
|Bad Tatzmannsdorf|2022-05-29| 6.9|10.4|
|    Lutzmannsburg|2022-06-01| 1.4|19.6|
|Bad Tatzmannsdorf|2022-06-02|10.3|18.0|
|  Neusiedl am See|2022-05-26| 0.2|18.4|
|      Mattersburg|2022-05-29| 2.9|10.9|
|  Neusiedl am See|2022-05-27| 0.0|21.7|
|  Neusiedl am See|2022-05-28| 0.0|16.3|
|    Lutzmannsburg|2022-05-29| 7.0|11.2|
|     Bruckneudorf|2022-05-23| 0.0|19.5|
|            Andau|2022-05-23| 0.0|19.1|
|  Neusiedl am See|2022-05-24| 2.1|18.5|
|         Rechnitz|2022-06-01| 0.7|19.3|
|          Güssing|2022-05-30| 0.0|11.3|
|          Güssing|2022-05-24|15.1|19.7|
|         Kroisegg|2022-05-28| 0.0|14.3|
|       Podersdorf|2022-05-30| 0.0|14.6|
|Bad Tatzmannsdorf|2022-05-27| 0.0|19.8|
+-----------------+----------+----+----+
only showing top