In [252]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode, struct, element_at, to_timestamp, collect_list, from_json, first
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType, TimestampType, DateType
import requests
import json
!pip install psycopg2-binary
import psycopg2
from psycopg2.extras import execute_values



In [253]:
spark = SparkSession.builder \
    .appName("mini-project") \
    .getOrCreate()

In [254]:
api = "https://www.pegelonline.wsv.de/webservices/rest-api/v2/stations.json?includeTimeseries=true&hasTimeseries=WV&includeForecastTimeseries=true"
responses = requests.get(api).json()
print(responses[0])

{'uuid': 'e6d68ab7-5c27-4f25-896f-11dbf04056cd', 'number': '10089006', 'shortname': 'VILSHOFEN', 'longname': 'VILSHOFEN', 'km': 2249.5, 'agency': 'STANDORT REGENSBURG', 'longitude': 13.182352727075328, 'latitude': 48.63719446076585, 'water': {'shortname': 'DONAU', 'longname': 'DONAU'}, 'timeseries': [{'shortname': 'W', 'longname': 'WASSERSTAND ROHDATEN', 'unit': 'cm', 'equidistance': 15, 'gaugeZero': {'unit': 'm. ü. NHN', 'value': 297.043, 'validFrom': '2019-01-01'}}, {'shortname': 'WV', 'longname': 'WASSERSTANDVORHERSAGE', 'unit': 'cm', 'equidistance': 120, 'start': '2024-07-13T15:00:00+02:00', 'end': '2024-07-17T05:00:00+02:00', 'comment': {'shortDescription': 'nwv-bfg', 'longDescription': 'Vorhersagen und Abschätzungen vom: 13.07.2024 um 05:00 Uhr, Quelle: Bundesanstalt für GewässerkundeWeitere Informationen zur Unterscheidung von Vorhersage und Abschätzung finden Sie auf den Seiten der Bundesanstalt für Gewässerkunde'}}, {'shortname': 'LT', 'longname': 'LUFTTEMPERATUR', 'unit': '°C

In [255]:
timeseries_data = [{'uuid': item['uuid'], 'timeseries_data': item['timeseries']} for item in responses]
[item.pop('timeseries') for item in responses]

[[{'shortname': 'W',
   'longname': 'WASSERSTAND ROHDATEN',
   'unit': 'cm',
   'equidistance': 15,
   'gaugeZero': {'unit': 'm. ü. NHN',
    'value': 297.043,
    'validFrom': '2019-01-01'}},
  {'shortname': 'WV',
   'longname': 'WASSERSTANDVORHERSAGE',
   'unit': 'cm',
   'equidistance': 120,
   'start': '2024-07-13T15:00:00+02:00',
   'end': '2024-07-17T05:00:00+02:00',
   'comment': {'shortDescription': 'nwv-bfg',
    'longDescription': 'Vorhersagen und Abschätzungen vom: 13.07.2024 um 05:00 Uhr, Quelle: Bundesanstalt für GewässerkundeWeitere Informationen zur Unterscheidung von Vorhersage und Abschätzung finden Sie auf den Seiten der Bundesanstalt für Gewässerkunde'}},
  {'shortname': 'LT',
   'longname': 'LUFTTEMPERATUR',
   'unit': '°C',
   'equidistance': 60},
  {'shortname': 'WT',
   'longname': 'WASSERTEMPERATUR',
   'unit': '°C',
   'equidistance': 60,
   'comment': {'shortDescription': 'Wassertemperatursonde ausgefallen.',
    'longDescription': 'Wassertemperatursonde ausge

In [256]:
station_schema = StructType([
    StructField("uuid", StringType(), True),
    StructField("number", StringType(), True),
    StructField("shortname", StringType(), True),
    StructField("longname", StringType(), True),
    StructField("km", DoubleType(), True),
    StructField("agency", StringType(), True),
    StructField("longitude", DoubleType(), True),
    StructField("latitude", DoubleType(), True),
    StructField("water_shortname", StringType(), True),
    StructField("water_longname", StringType(), True)
])

In [257]:
response_str = json.dumps(responses)

# Load the JSON string into a Spark DataFrame
response_df = spark.read.json(spark.sparkContext.parallelize([response_str]), schema=station_schema)

# Flatten the dataframe
df_flattened = response_df.select(
    col("uuid"),
    col("number"),
    col("shortname"),
    col("longname").cast("double"),
    col("km").cast("double"),
    col("agency"),
    col("longitude").cast("double"),
    col("latitude").cast("double"),
    col("water_shortname"),
    col("water_longname")
)

In [258]:
df_flattened.printSchema()
df_flattened.show()

root
 |-- uuid: string (nullable = true)
 |-- number: string (nullable = true)
 |-- shortname: string (nullable = true)
 |-- longname: double (nullable = true)
 |-- km: double (nullable = true)
 |-- agency: string (nullable = true)
 |-- longitude: double (nullable = true)
 |-- latitude: double (nullable = true)
 |-- water_shortname: string (nullable = true)
 |-- water_longname: string (nullable = true)

+--------------------+--------+--------------------+--------+------+-------------------+------------------+------------------+---------------+--------------+
|                uuid|  number|           shortname|longname|    km|             agency|         longitude|          latitude|water_shortname|water_longname|
+--------------------+--------+--------------------+--------+------+-------------------+------------------+------------------+---------------+--------------+
|e6d68ab7-5c27-4f2...|10089006|           VILSHOFEN|    NULL|2249.5|STANDORT REGENSBURG|13.182352727075328| 48.63719446

In [259]:
"""
    Basically, there are five types of data the timeseries contains:
    - WASSERSTAND ROHDATEN: water level raw data
    - WASSERSTANDVORHERSAGE: water level forecast
    - WASSERTEMPERATUR: water temperature
    - LUFTTEMPERATUR: air temperature 
    - ABFLUSS: drain
    We will divide it into five dataframes, each correspond to a table in the db
"""

# Explode the timeseries_data array into rows
timeseries_df = spark.createDataFrame(timeseries_data)
timeseries_df = timeseries_df.withColumn("timeseries_data", explode("timeseries_data")).select("*")
timeseries_df = timeseries_df.withColumn("category",element_at(col("timeseries_data"),"longname"))
timeseries_grouped = timeseries_df.groupBy("category").agg(collect_list(struct(col("uuid"), col("timeseries_data"))).alias("timeseries_data"))
timeseries_grouped.show(truncate=True)

+--------------------+--------------------+
|            category|     timeseries_data|
+--------------------+--------------------+
|WASSERSTANDVORHER...|[{e6d68ab7-5c27-4...|
|      LUFTTEMPERATUR|[{e6d68ab7-5c27-4...|
|    WASSERTEMPERATUR|[{e6d68ab7-5c27-4...|
|WASSERSTAND ROHDATEN|[{e6d68ab7-5c27-4...|
|             ABFLUSS|[{616dd98e-816d-4...|
|    ABFLUSS_ROHDATEN|[{7cb7461b-3530-4...|
|WASSERTEMPERATUR ...|[{a6ee8177-107b-4...|
+--------------------+--------------------+



In [260]:
water_level_raw_df = timeseries_grouped.filter(timeseries_grouped["category"] == "WASSERSTAND ROHDATEN") \
                                            .select("category", explode("timeseries_data").alias("timeseries_data"))
water_level_forecast_df = timeseries_grouped.filter(timeseries_grouped["category"] == "WASSERSTANDVORHERSAGE") \
                                            .select("category", explode("timeseries_data").alias("timeseries_data"))
water_temperature_df = timeseries_grouped.filter(timeseries_grouped["category"].isin("WASSERTEMPERATUR","WASSERTEMPERATUR ROHDATEN")) \
                                            .select("category", explode("timeseries_data").alias("timeseries_data"))
air_temperature_df = timeseries_grouped.filter(timeseries_grouped["category"] == "LUFTTEMPERATUR") \
                                            .select("category", explode("timeseries_data").alias("timeseries_data"))
drain_df = timeseries_grouped.filter(timeseries_grouped["category"].isin("ABFLUSS","ABFLUSS_ROHDATEN")) \
                                            .select("category", explode("timeseries_data").alias("timeseries_data"))

water_temperature_df.show(truncate=False)

+-------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|category                 |timeseries_data                                                                                                                                                                                                                                                                                   |
+-------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|WASSERTEMPERATUR         |{e6d68ab7-5c27-4

In [261]:
water_level_raw_df = water_level_raw_df.select(col("timeseries_data.uuid"), col("timeseries_data.timeseries_data"))
water_level_raw_df = water_level_raw_df.select("uuid",explode("timeseries_data")).groupBy("uuid").pivot("key").agg(first("value"))
water_level_forecast_df = water_level_forecast_df.select(col("timeseries_data.uuid"), col("timeseries_data.timeseries_data"))
water_level_forecast_df = water_level_forecast_df.select("uuid",explode("timeseries_data")).groupBy("uuid").pivot("key").agg(first("value"))
water_temperature_df = water_temperature_df.select(col("timeseries_data.uuid"), col("timeseries_data.timeseries_data"))
water_temperature_df = water_temperature_df.select("uuid",explode("timeseries_data")).groupBy("uuid").pivot("key").agg(first("value"))
air_temperature_df = air_temperature_df.select(col("timeseries_data.uuid"), col("timeseries_data.timeseries_data"))
air_temperature_df = air_temperature_df.select("uuid",explode("timeseries_data")).groupBy("uuid").pivot("key").agg(first("value"))
drain_df = drain_df.select(col("timeseries_data.uuid"), col("timeseries_data.timeseries_data"))
drain_df = drain_df.select("uuid",explode("timeseries_data")).groupBy("uuid").pivot("key").agg(first("value"))

water_level_raw_df.show(truncate=False)

+------------------------------------+------------+-----------------------------------------------------+--------------------+---------+----+
|uuid                                |equidistance|gaugeZero                                            |longname            |shortname|unit|
+------------------------------------+------------+-----------------------------------------------------+--------------------+---------+----+
|070b1eb4-3872-4e07-b2e5-e25fd9251b93|15          |{unit=m. ü. NHN, validFrom=2019-11-01, value=62.45}  |WASSERSTAND ROHDATEN|W        |cm  |
|094b96e5-caeb-46d3-a8ee-d44182add069|15          |{unit=m. ü. NHN, validFrom=2019-11-01, value=50.217} |WASSERSTAND ROHDATEN|W        |cm  |
|13e91b77-90f3-41a5-a320-641748e9c311|15          |{unit=m. ü. NHN, validFrom=2019-11-01, value=27.556} |WASSERSTAND ROHDATEN|W        |cm  |
|1d26e504-7f9e-480a-b52c-5932be6549ab|15          |{unit=m. ü. NHN, validFrom=2019-11-01, value=67.669} |WASSERSTAND ROHDATEN|W        |cm  |
|1edc5

In [262]:
# Define the PostgreSQL connection parameters
conn_params = {
    "dbname": "db",
    "user": "postgres",
    "password": "12345",
    "host": "pg_container",
    "port": 5432
}

# Create table queries
create_table_queries = {
    "stations": """
        CREATE TABLE IF NOT EXISTS stations (
            uuid UUID PRIMARY KEY,
            number VARCHAR(50),
            shortname VARCHAR(50),
            longname VARCHAR(255),
            km DOUBLE PRECISION,
            agency VARCHAR(255),
            longitude DOUBLE PRECISION,
            latitude DOUBLE PRECISION,
            water_shortname VARCHAR(50),
            water_longname VARCHAR(255)
        );
    """,
    "water_level_raw": """
        CREATE TABLE IF NOT EXISTS water_level_raw (
            -- id SERIAL PRIMARY KEY,
            station_uuid UUID,
            timeseries_end TIMESTAMPTZ,
            timeseries_unit VARCHAR(50),
            timeseries_start TIMESTAMPTZ,
            timeseries_comment TEXT,
            timeseries_longname VARCHAR(255),
            timeseries_shortname VARCHAR(50),
            timeseries_equidistance VARCHAR(50),
            FOREIGN KEY (station_uuid) REFERENCES stations (uuid)
        );
    """,
    "water_level_forecast": """
        CREATE TABLE IF NOT EXISTS water_level_forecast (
            -- id SERIAL PRIMARY KEY,
            station_uuid UUID,
            timeseries_end TIMESTAMPTZ,
            timeseries_unit VARCHAR(50),
            timeseries_start TIMESTAMPTZ,
            timeseries_comment TEXT,
            timeseries_longname VARCHAR(255),
            timeseries_shortname VARCHAR(50),
            timeseries_equidistance VARCHAR(50),
            FOREIGN KEY (station_uuid) REFERENCES stations (uuid)
        );
    """,
    "water_temperature": """
        CREATE TABLE IF NOT EXISTS water_temperature (
            -- id SERIAL PRIMARY KEY,
            station_uuid UUID,
            timeseries_end TIMESTAMPTZ,
            timeseries_unit VARCHAR(50),
            timeseries_start TIMESTAMPTZ,
            timeseries_comment TEXT,
            timeseries_longname VARCHAR(255),
            timeseries_shortname VARCHAR(50),
            timeseries_equidistance VARCHAR(50),
            FOREIGN KEY (station_uuid) REFERENCES stations (uuid)
        );
    """,
    "air_temperature": """
        CREATE TABLE IF NOT EXISTS air_temperature (
            -- id SERIAL PRIMARY KEY,
            station_uuid UUID,
            timeseries_end TIMESTAMPTZ,
            timeseries_unit VARCHAR(50),
            timeseries_start TIMESTAMPTZ,
            timeseries_comment TEXT,
            timeseries_longname VARCHAR(255),
            timeseries_shortname VARCHAR(50),
            timeseries_equidistance VARCHAR(50),
            FOREIGN KEY (station_uuid) REFERENCES stations (uuid)
        );
    """,
    "drain": """
        CREATE TABLE IF NOT EXISTS drain (
            -- id SERIAL PRIMARY KEY,
            station_uuid UUID,
            timeseries_end TIMESTAMPTZ,
            timeseries_unit VARCHAR(50),
            timeseries_start TIMESTAMPTZ,
            timeseries_comment TEXT,
            timeseries_longname VARCHAR(255),
            timeseries_shortname VARCHAR(50),
            timeseries_equidistance VARCHAR(50),
            FOREIGN KEY (station_uuid) REFERENCES stations (uuid)
        );
    """
}

In [263]:
def create_tables():
    conn = psycopg2.connect(**conn_params)
    cursor = conn.cursor()
    for query in create_table_queries.values():
        cursor.execute(query)
    conn.commit()
    cursor.close()
    conn.close()

create_tables()

# Function to insert data into PostgreSQL
def insert_data(table, data):
    conn = psycopg2.connect(**conn_params)
    cursor = conn.cursor()
    insert_query = f"INSERT INTO {table} VALUES %s"
    execute_values(cursor, insert_query, data)
    conn.commit()
    cursor.close()
    conn.close()

# Insert data into the stations table
# stations_data = [(row["uuid"], row["number"], row["shortname"], row["longname"], row["km"], row["agency"], row["longitude"], row["latitude"], row["water_shortname"], row["water_longname"]) for row in df_flattened.collect()]
# insert_data("stations", stations_data, ["uuid", "number", "shortname", "longname", "km", "agency", "longitude", "latitude", "water_shortname", "water_longname"])

# Prepare data for insertion into each table
def prepare_data_for_insertion(df):
    columns = df.columns
    data = [tuple(row[col] for col in columns) for row in df.collect()]
    return data

# Insert data into PostgreSQL tables
def insert_data_into_postgres():
    # Insert into stations table
    stations_data = prepare_data_for_insertion(df_flattened)
    insert_data("stations", stations_data)

    # Insert into water_level_raw table
    water_level_raw_data = prepare_data_for_insertion(water_level_raw_df)
    insert_data("water_level_raw", water_level_raw_data)

    # Insert into water_level_forecast table
    water_level_forecast_data = prepare_data_for_insertion(water_level_forecast_df)
    insert_data("water_level_forecast", water_level_forecast_data)

    # Insert into water_temperature table
    water_temperature_data = prepare_data_for_insertion(water_temperature_df)
    insert_data("water_temperature", water_temperature_data)

    # Insert into air_temperature table
    air_temperature_data = prepare_data_for_insertion(air_temperature_df)
    insert_data("air_temperature", air_temperature_data)

    # Insert into drain table
    drain_data = prepare_data_for_insertion(drain_df)
    insert_data("drain", drain_data)

# Execute table creation and data insertion
create_tables()
insert_data_into_postgres()


DatetimeFieldOverflow: date/time field value out of range: "15"
LINE 1: ...aw VALUES ('070b1eb4-3872-4e07-b2e5-e25fd9251b93','15','{uni...
                                                             ^
HINT:  Perhaps you need a different "datestyle" setting.


In [None]:
# Verification
conn = psycopg2.connect(**conn_params)
cursor = conn.cursor()

# Print sample rows from each table
def print_sample_rows(table):
    cursor.execute(f"SELECT * FROM {table} LIMIT 5")
    print(f"\nSample rows from {table}:")
    rows = cursor.fetchall()
    for row in rows:
        print(row)

for table in ["stations", "water_level_raw", "water_level_forecast", "water_temperature", "air_temperature", "drain"]:
    print_sample_rows(table)

cursor.close()
conn.close()