In [69]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode, struct, collect_list, element_at
import requests
import json

In [70]:
spark = SparkSession.builder \
    .appName("mini-project") \
    .getOrCreate()

In [71]:
api = "https://www.pegelonline.wsv.de/webservices/rest-api/v2/stations.json?includeTimeseries=true&hasTimeseries=WV&includeForecastTimeseries=true"
responses = requests.get(api).json()
print(responses[0])

{'uuid': 'e6d68ab7-5c27-4f25-896f-11dbf04056cd', 'number': '10089006', 'shortname': 'VILSHOFEN', 'longname': 'VILSHOFEN', 'km': 2249.5, 'agency': 'STANDORT REGENSBURG', 'longitude': 13.182352727075328, 'latitude': 48.63719446076585, 'water': {'shortname': 'DONAU', 'longname': 'DONAU'}, 'timeseries': [{'shortname': 'W', 'longname': 'WASSERSTAND ROHDATEN', 'unit': 'cm', 'equidistance': 15, 'gaugeZero': {'unit': 'm. ü. NHN', 'value': 297.043, 'validFrom': '2019-01-01'}}, {'shortname': 'WV', 'longname': 'WASSERSTANDVORHERSAGE', 'unit': 'cm', 'equidistance': 120, 'start': '2024-07-12T17:00:00+02:00', 'end': '2024-07-16T05:00:00+02:00', 'comment': {'shortDescription': 'nwv-bfg', 'longDescription': 'Vorhersagen und Abschätzungen vom: 12.07.2024 um 05:00 Uhr, Quelle: Bundesanstalt für GewässerkundeWeitere Informationen zur Unterscheidung von Vorhersage und Abschätzung finden Sie auf den Seiten der Bundesanstalt für Gewässerkunde'}}, {'shortname': 'LT', 'longname': 'LUFTTEMPERATUR', 'unit': '°C

In [72]:
timeseries_data = [{'uuid': item['uuid'], 'timeseries_data': item['timeseries']} for item in responses]
[item.pop('timeseries') for item in responses]

[[{'shortname': 'W',
   'longname': 'WASSERSTAND ROHDATEN',
   'unit': 'cm',
   'equidistance': 15,
   'gaugeZero': {'unit': 'm. ü. NHN',
    'value': 297.043,
    'validFrom': '2019-01-01'}},
  {'shortname': 'WV',
   'longname': 'WASSERSTANDVORHERSAGE',
   'unit': 'cm',
   'equidistance': 120,
   'start': '2024-07-12T17:00:00+02:00',
   'end': '2024-07-16T05:00:00+02:00',
   'comment': {'shortDescription': 'nwv-bfg',
    'longDescription': 'Vorhersagen und Abschätzungen vom: 12.07.2024 um 05:00 Uhr, Quelle: Bundesanstalt für GewässerkundeWeitere Informationen zur Unterscheidung von Vorhersage und Abschätzung finden Sie auf den Seiten der Bundesanstalt für Gewässerkunde'}},
  {'shortname': 'LT',
   'longname': 'LUFTTEMPERATUR',
   'unit': '°C',
   'equidistance': 60},
  {'shortname': 'WT',
   'longname': 'WASSERTEMPERATUR',
   'unit': '°C',
   'equidistance': 60,
   'comment': {'shortDescription': 'Wassertemperatursonde ausgefallen.',
    'longDescription': 'Wassertemperatursonde ausge

In [73]:
response_str = json.dumps(responses)

# Load the JSON string into a Spark DataFrame
response_df = spark.read.json(spark.sparkContext.parallelize([response_str]))

# Flatten the dataframe
df_flattened = response_df.select(
    col("agency"),
    col("km").cast("double"),
    col("latitude").cast("double"),
    col("longitude").cast("double"),
    col("longname").cast("double"),
    col("number"),
    col("shortname"),
    col("uuid"),
    col("water.longname").alias("water_longname"),
    col("water.shortname").alias("water_shortname")
)

In [74]:
df_flattened.printSchema()
df_flattened.show()

root
 |-- agency: string (nullable = true)
 |-- km: double (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- longname: double (nullable = true)
 |-- number: string (nullable = true)
 |-- shortname: string (nullable = true)
 |-- uuid: string (nullable = true)
 |-- water_longname: string (nullable = true)
 |-- water_shortname: string (nullable = true)

+-------------------+------+------------------+------------------+--------+--------+--------------------+--------------------+--------------+---------------+
|             agency|    km|          latitude|         longitude|longname|  number|           shortname|                uuid|water_longname|water_shortname|
+-------------------+------+------------------+------------------+--------+--------+--------------------+--------------------+--------------+---------------+
|STANDORT REGENSBURG|2249.5| 48.63719446076585|13.182352727075328|    NULL|10089006|           VILSHOFEN|e6d68ab7-5c27-

In [75]:
"""
    Basically, there are five types of data the timeseries contains:
    - WASSERSTAND ROHDATEN: water level raw data
    - WASSERSTANDVORHERSAGE: water level forecast
    - WASSERTEMPERATUR: water temperature
    - LUFTTEMPERATUR: air temperature 
    - ABFLUSS: drain
    We will divide it into five dataframes, each correspond to a table in the db
"""

timeseries_df = spark.createDataFrame(timeseries_data)
timeseries_df = timeseries_df.withColumn("timeseries_data", explode("timeseries_data")).select("*")
timeseries_df = timeseries_df.withColumn("category",element_at(col("timeseries_data"),"longname"))
timeseries_grouped = timeseries_df.groupBy("category").agg(collect_list(struct(col("uuid"), col("timeseries_data"))).alias("timeseries_data"))
timeseries_grouped.show(truncate=True)

+--------------------+--------------------+
|            category|     timeseries_data|
+--------------------+--------------------+
|WASSERSTANDVORHER...|[{e6d68ab7-5c27-4...|
|      LUFTTEMPERATUR|[{e6d68ab7-5c27-4...|
|    WASSERTEMPERATUR|[{e6d68ab7-5c27-4...|
|WASSERSTAND ROHDATEN|[{e6d68ab7-5c27-4...|
|             ABFLUSS|[{616dd98e-816d-4...|
|    ABFLUSS_ROHDATEN|[{7cb7461b-3530-4...|
|WASSERTEMPERATUR ...|[{a6ee8177-107b-4...|
+--------------------+--------------------+



In [76]:
water_level_raw_df = timeseries_grouped.filter(timeseries_grouped["category"] == "ASSERSTAND ROHDATEN") \
                                            .select("category", explode("timeseries_data").alias("timeseries_data"))
water_level_forecast_df = timeseries_grouped.filter(timeseries_grouped["category"] == "WASSERSTANDVORHERSAGE") \
                                            .select("category", explode("timeseries_data").alias("timeseries_data"))
water_temperature_df = timeseries_grouped.filter(timeseries_grouped["category"] == "WASSERTEMPERATUR") \
                                            .select("category", explode("timeseries_data").alias("timeseries_data"))
air_temperature_df = timeseries_grouped.filter(timeseries_grouped["category"] == "LUFTTEMPERATUR") \
                                            .select("category", explode("timeseries_data").alias("timeseries_data"))
drain_df = timeseries_grouped.filter(timeseries_grouped["category"] == "ABFLUSS") \
                                            .select("category", explode("timeseries_data").alias("timeseries_data"))

water_temperature_df.show(truncate=False)

+----------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|category        |timeseries_data                                                                                                                                                                                                                                                                                   |
+----------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|WASSERTEMPERATUR|{e6d68ab7-5c27-4f25-896f-11dbf04056cd, {longname -> 

In [77]:
!pip install psycopg2-binary



In [78]:
import psycopg2
from psycopg2.extras import execute_values

conn_params = {
    "dbname": "db",
    "user": "postgres",
    "password": "12345",
    "host": "pg_container",
    "port": 5432
}

conn = psycopg2.connect(**conn_params)
cursor = conn.cursor()

cursor.execute("""
CREATE TABLE IF NOT EXISTS station_info (
    agency VARCHAR,
    km DOUBLE PRECISION,
    latitude DOUBLE PRECISION,
    longitude DOUBLE PRECISION,
    longname VARCHAR,
    number VARCHAR,
    shortname VARCHAR,
    uuid UUID,
    water_longname VARCHAR,
    water_shortname VARCHAR
);

CREATE TABLE IF NOT EXISTS timeseries_info (
    category VARCHAR,
    timeseries_data JSONB
);

""")
conn.commit()

In [79]:
# Function to insert data into PostgreSQL
def insert_data(table, data, columns):
    conn = psycopg2.connect(**conn_params)
    cursor = conn.cursor()
    insert_query = f"INSERT INTO {table} ({', '.join(columns)}) VALUES %s"
    execute_values(cursor, insert_query, data)
    conn.commit()

# Convert Spark DataFrame to list of tuples for insertion into PostgreSQL
flattened_data = [tuple(row) for row in df_flattened.collect()]

# Convert the timeseries data to JSON strings
timeseries_data_list = [(row['category'], json.dumps(row['timeseries_data'])) for row in timeseries_grouped.collect()]

# Insert flattened data into PostgreSQL
flattened_columns = ["agency", "km", "latitude", "longitude", "longname", "number", "shortname", "uuid", "water_longname", "water_shortname"]
insert_data("station_info", flattened_data, flattened_columns)

# Insert timeseries data into PostgreSQL
timeseries_columns = ["category", "timeseries_data"]
insert_data("timeseries_info", timeseries_data_list, timeseries_columns)
cursor.execute("SELECT * FROM station_info")
rows = cursor.fetchall()
for row in rows:
    print(row)
    
cursor.close()
conn.close()

('STANDORT REGENSBURG', 2249.5, 48.63719446076585, 13.182352727075328, None, '10089006', 'VILSHOFEN', 'e6d68ab7-5c27-4f25-896f-11dbf04056cd', 'DONAU', 'DONAU')
('STANDORT REGENSBURG', 2256.9, 48.676629660669875, 13.115159742918724, None, '10088003', 'HOFKIRCHEN', '616dd98e-816d-4e17-b8cd-57b163dbc8a2', 'DONAU', 'DONAU')
('STANDORT REGENSBURG', 2284.4, 48.824943730974475, 12.962349954256105, None, '10081004', 'DEGGENDORF', '53d40547-8a09-4b25-988c-2e6d8d8d98ee', 'DONAU', 'DONAU')
('STANDORT REGENSBURG', 2305.5, 48.87976613950202, 12.74721651873093, None, '10078000', 'PFELLING', '7fe63a95-8ff6-4cff-9a29-a124136516b5', 'DONAU', 'DONAU')
('STANDORT DRESDEN', 2.05, 50.87576232293828, 14.235228651495222, None, '501010', 'SCHÖNA', '7cb7461b-3530-4c01-8978-7f676b8f71ed', 'ELBE', 'ELBE')
('STANDORT DRESDEN', 55.63, 51.054459765598125, 13.738831783620384, None, '501060', 'DRESDEN', '70272185-b2b3-4178-96b8-43bea330dcae', 'ELBE', 'ELBE')
('STANDORT DRESDEN', 108.4, 51.31148159700113, 13.293340521