In [63]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType
from pyspark.sql.functions import col, explode, struct,collect_list, from_json, element_at
import requests
import json

In [2]:
spark = SparkSession.builder \
    .appName("example-app") \
    .getOrCreate()

In [19]:
api = "https://www.pegelonline.wsv.de/webservices/rest-api/v2/stations.json?includeTimeseries=true&hasTimeseries=WV&includeForecastTimeseries=true"
responses = requests.get(api).json()
print(responses[0])

{'uuid': 'e6d68ab7-5c27-4f25-896f-11dbf04056cd', 'number': '10089006', 'shortname': 'VILSHOFEN', 'longname': 'VILSHOFEN', 'km': 2249.5, 'agency': 'STANDORT REGENSBURG', 'longitude': 13.182352727075328, 'latitude': 48.63719446076585, 'water': {'shortname': 'DONAU', 'longname': 'DONAU'}, 'timeseries': [{'shortname': 'W', 'longname': 'WASSERSTAND ROHDATEN', 'unit': 'cm', 'equidistance': 15, 'gaugeZero': {'unit': 'm. ü. NHN', 'value': 297.043, 'validFrom': '2019-01-01'}}, {'shortname': 'WV', 'longname': 'WASSERSTANDVORHERSAGE', 'unit': 'cm', 'equidistance': 120, 'start': '2024-07-11T19:00:00+02:00', 'end': '2024-07-15T05:00:00+02:00', 'comment': {'shortDescription': 'nwv-bfg', 'longDescription': 'Vorhersagen und Abschätzungen vom: 11.07.2024 um 05:00 Uhr, Quelle: Bundesanstalt für GewässerkundeWeitere Informationen zur Unterscheidung von Vorhersage und Abschätzung finden Sie auf den Seiten der Bundesanstalt für Gewässerkunde'}}, {'shortname': 'LT', 'longname': 'LUFTTEMPERATUR', 'unit': '°C

In [21]:
timeseries_data = [{'uuid': item['uuid'], 'timeseries_data': item['timeseries']} for item in responses]
[item.pop('timeseries') for item in responses]

[[{'shortname': 'W',
   'longname': 'WASSERSTAND ROHDATEN',
   'unit': 'cm',
   'equidistance': 15,
   'gaugeZero': {'unit': 'm. ü. NHN',
    'value': 297.043,
    'validFrom': '2019-01-01'}},
  {'shortname': 'WV',
   'longname': 'WASSERSTANDVORHERSAGE',
   'unit': 'cm',
   'equidistance': 120,
   'start': '2024-07-11T19:00:00+02:00',
   'end': '2024-07-15T05:00:00+02:00',
   'comment': {'shortDescription': 'nwv-bfg',
    'longDescription': 'Vorhersagen und Abschätzungen vom: 11.07.2024 um 05:00 Uhr, Quelle: Bundesanstalt für GewässerkundeWeitere Informationen zur Unterscheidung von Vorhersage und Abschätzung finden Sie auf den Seiten der Bundesanstalt für Gewässerkunde'}},
  {'shortname': 'LT',
   'longname': 'LUFTTEMPERATUR',
   'unit': '°C',
   'equidistance': 60},
  {'shortname': 'WT',
   'longname': 'WASSERTEMPERATUR',
   'unit': '°C',
   'equidistance': 60,
   'comment': {'shortDescription': 'Wassertemperatursonde ausgefallen.',
    'longDescription': 'Wassertemperatursonde ausge

In [30]:
response_str = json.dumps(responses)

# Load the JSON string into a Spark DataFrame
response_df = spark.read.json(spark.sparkContext.parallelize([response_str]))

# Flatten the dataframe
df_flattened = response_df.select(
    col("agency"),
    col("km").cast("double"),
    col("latitude").cast("double"),
    col("longitude").cast("double"),
    col("longname").cast("double"),
    col("number"),
    col("shortname"),
    col("uuid"),
    col("water.longname").alias("water_longname"),
    col("water.shortname").alias("water_shortname")
)

In [31]:
df_flattened.printSchema()
df_flattened.show()

root
 |-- agency: string (nullable = true)
 |-- km: double (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- longname: double (nullable = true)
 |-- number: string (nullable = true)
 |-- shortname: string (nullable = true)
 |-- uuid: string (nullable = true)
 |-- water_longname: string (nullable = true)
 |-- water_shortname: string (nullable = true)

+-------------------+------+------------------+------------------+--------+--------+--------------------+--------------------+--------------+---------------+
|             agency|    km|          latitude|         longitude|longname|  number|           shortname|                uuid|water_longname|water_shortname|
+-------------------+------+------------------+------------------+--------+--------+--------------------+--------------------+--------------+---------------+
|STANDORT REGENSBURG|2249.5| 48.63719446076585|13.182352727075328|    NULL|10089006|           VILSHOFEN|e6d68ab7-5c27-

In [83]:
"""
    Basically, there are five types of data the timeseries contains:
    - WASSERSTAND ROHDATEN: water level raw data
    - WASSERSTANDVORHERSAGE: water level forecast
    - WASSERTEMPERATUR: water temperature
    - LUFTTEMPERATUR: air temperature 
    - ABFLUSS: drain
    We will divide it into five dataframes, each correspond to a table in the db
"""

timeseries_df = spark.createDataFrame(timeseries_data)
timeseries_df = timeseries_df.withColumn("timeseries_data", explode("timeseries_data")).select("*")
timeseries_df = timeseries_df.withColumn("category",element_at(col("timeseries_data"),"longname"))
timeseries_grouped = timeseries_df.groupBy("category").agg(collect_list(struct(col("uuid"), col("timeseries_data"))).alias("timeseries_data"))
timeseries_grouped.show(truncate=True)

+--------------------+--------------------+
|            category|     timeseries_data|
+--------------------+--------------------+
|WASSERSTANDVORHER...|[{e6d68ab7-5c27-4...|
|      LUFTTEMPERATUR|[{e6d68ab7-5c27-4...|
|    WASSERTEMPERATUR|[{e6d68ab7-5c27-4...|
|WASSERSTAND ROHDATEN|[{e6d68ab7-5c27-4...|
|             ABFLUSS|[{616dd98e-816d-4...|
|    ABFLUSS_ROHDATEN|[{7cb7461b-3530-4...|
|WASSERTEMPERATUR ...|[{a6ee8177-107b-4...|
+--------------------+--------------------+



In [79]:
water_level_raw_df = timeseries_grouped.filter(timeseries_grouped["category"] == "ASSERSTAND ROHDATEN") \
                                            .select("category", explode("timeseries_data").alias("timeseries_data"))
water_level_forecast_df = timeseries_grouped.filter(timeseries_grouped["category"] == "WASSERSTANDVORHERSAGE") \
                                            .select("category", explode("timeseries_data").alias("timeseries_data"))
water_temperature_df = timeseries_grouped.filter(timeseries_grouped["category"] == "WASSERTEMPERATUR") \
                                            .select("category", explode("timeseries_data").alias("timeseries_data"))
air_temperature_df = timeseries_grouped.filter(timeseries_grouped["category"] == "LUFTTEMPERATUR") \
                                            .select("category", explode("timeseries_data").alias("timeseries_data"))
drain_df = timeseries_grouped.filter(timeseries_grouped["category"] == "ABFLUSS") \
                                            .select("category", explode("timeseries_data").alias("timeseries_data"))

water_temperature_df.show(truncate=False)

+----------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|category        |timeseries_data                                                                                                                                                                                                                                                                                   |
+----------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|WASSERTEMPERATUR|{e6d68ab7-5c27-4f25-896f-11dbf04056cd, {longname -> 

In [7]:
!pip install psycopg2-binary



In [8]:
import psycopg2

conn_params = {
    "dbname": "db",
    "user": "postgres",
    "password": "12345",
    "host": "pg_container",
    "port": 5432
}

conn = psycopg2.connect(**conn_params)
cursor = conn.cursor()

cursor.execute("""
CREATE TABLE IF NOT EXISTS test_table (
    id SERIAL PRIMARY KEY,
    first_name VARCHAR(50),
    last_name VARCHAR(50)
)
""")
conn.commit()

cursor.execute("""
INSERT INTO test_table (first_name, last_name)
VALUES (%s, %s), (%s, %s), (%s, %s)
""", ('James', 'Smith', 'Anna', 'Rose', 'Robert', 'Williams'))
conn.commit()

cursor.execute("SELECT * FROM test_table")
rows = cursor.fetchall()
for row in rows:
    print(row)

cursor.close()
conn.close()


(1, 'James', 'Smith')
(2, 'Anna', 'Rose')
(3, 'Robert', 'Williams')
