In [140]:
import json
import os

import psycopg2
import requests
from psycopg2.extras import execute_values
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode, struct, collect_list, element_at, lit
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType, TimestampType, DateType

In [141]:
# Obtain the Spark session used by every DataFrame in this notebook
# (getOrCreate hands back an already running session when there is one).
spark = (
    SparkSession.builder
    .appName("mini-project")
    .getOrCreate()
)

In [142]:
# PEGELONLINE stations endpoint. Judging by the query string it returns the stations
# that have a "WV" timeseries, with their (forecast) timeseries metadata embedded.
api = "https://www.pegelonline.wsv.de/webservices/rest-api/v2/stations.json?includeTimeseries=true&hasTimeseries=WV&includeForecastTimeseries=true"

# Bound the wait so an unresponsive server cannot hang the kernel, and stop on a
# 4xx/5xx answer instead of parsing an error page as if it were station data.
api_response = requests.get(api, timeout=30)
api_response.raise_for_status()
responses = api_response.json()

In [143]:
# Two independent accumulators: flattened station records and flattened timeseries records.
stations_data, timeseries_data = [], []

In [144]:
# Flatten the nested API payload into two flat record lists:
#   stations_data   - one dict per distinct station uuid
#   timeseries_data - one dict per (station, timeseries) pair
#
# Both lists are rebuilt from scratch so that re-running this cell does not
# append a second copy of every record.
stations_data = []
timeseries_data = []
seen_station_uuids = set()

for station in responses:
    # The uuid is the only field that must be present; it keys both record lists.
    station_uuid = station["uuid"]

    # Keep one row per uuid (stations.uuid is the primary key of the target table).
    # A set lookup avoids rescanning the whole list for every station.
    if station_uuid not in seen_station_uuids:
        seen_station_uuids.add(station_uuid)
        water = station.get("water") or {}
        # Every schema column is nullable, so a missing optional field becomes
        # None here instead of aborting the whole load with a KeyError.
        stations_data.append({
            "uuid": station_uuid,
            "number": station.get("number"),
            "shortname": station.get("shortname"),
            "longname": station.get("longname"),
            "km": station.get("km"),
            "agency": station.get("agency"),
            "longitude": station.get("longitude"),
            "latitude": station.get("latitude"),
            "water_shortname": water.get("shortname"),
            "water_longname": water.get("longname"),
        })

    # Timeseries metadata for the current station.
    for ts in station.get("timeseries") or []:
        # `or {}` also covers a key that is present but null, which the
        # `.get(key, {})` form would pass through as None and then crash on.
        comment = ts.get("comment") or {}
        gauge_zero = ts.get("gaugeZero") or {}
        timeseries_data.append({
            "station_uuid": station_uuid,
            "shortname": ts.get("shortname"),
            "longname": ts.get("longname"),
            "unit": ts.get("unit"),
            "equidistance": ts.get("equidistance"),
            "timeseries_start": ts.get("start"),
            "timeseries_end": ts.get("end"),
            "comment_shortDescription": comment.get("shortDescription"),
            "comment_longDescription": comment.get("longDescription"),
            "gaugeZero_unit": gauge_zero.get("unit"),
            "gaugeZero_value": gauge_zero.get("value"),
            "gaugeZero_validFrom": gauge_zero.get("validFrom"),
        })

In [145]:
# Explicit schema for the flattened station records.
# (column name, Spark type) pairs in output order; every column is nullable.
_STATION_FIELDS = [
    ("uuid", StringType()),
    ("number", StringType()),
    ("shortname", StringType()),
    ("longname", StringType()),
    ("km", DoubleType()),
    ("agency", StringType()),
    ("longitude", DoubleType()),
    ("latitude", DoubleType()),
    ("water_shortname", StringType()),
    ("water_longname", StringType()),
]

station_schema = StructType(
    [StructField(field_name, field_type, True) for field_name, field_type in _STATION_FIELDS]
)

In [146]:
# Explicit schema for the flattened timeseries records.
# (column name, Spark type) pairs in output order; every column is nullable.
_TIMESERIES_FIELDS = [
    ("station_uuid", StringType()),
    ("shortname", StringType()),
    ("longname", StringType()),
    ("unit", StringType()),
    ("equidistance", IntegerType()),
    ("timeseries_start", TimestampType()),
    ("timeseries_end", TimestampType()),
    ("comment_shortDescription", StringType()),
    ("comment_longDescription", StringType()),
    ("gaugeZero_unit", StringType()),
    ("gaugeZero_value", DoubleType()),
    ("gaugeZero_validFrom", DateType()),
]

timeseries_schema = StructType(
    [StructField(field_name, field_type, True) for field_name, field_type in _TIMESERIES_FIELDS]
)

In [147]:
def _json_to_df(json_text, schema):
    """Parse one JSON document (an array of records) into a DataFrame using `schema`."""
    single_document_rdd = spark.sparkContext.parallelize([json_text])
    return spark.read.json(single_document_rdd, schema=schema)


# Serialise the record lists first ...
stations_str = json.dumps(stations_data)
timeseries_str = json.dumps(timeseries_data)

# ... then let Spark's JSON reader build the typed DataFrames.
# NOTE(review): presumably the JSON round trip is what turns the string start/end/validFrom
# values into timestamp/date columns - confirm before swapping this for createDataFrame.
stations_df = _json_to_df(stations_str, station_schema)
timeseries_df = _json_to_df(timeseries_str, timeseries_schema)

In [148]:
# Columns that every per-category DataFrame carries, in output order.
_TS_BASE_COLUMNS = ["station_uuid", "shortname", "longname", "unit", "equidistance"]


def _timeseries_subset(longnames, extra_columns=()):
    """Rows of timeseries_df whose longname is in `longnames`, projected to base + extra columns."""
    wanted = [col(name) for name in [*_TS_BASE_COLUMNS, *extra_columns]]
    return timeseries_df.filter(col("longname").isin(longnames)).select(*wanted)


# One DataFrame per timeseries category, selected by the API's longname values.
water_level_raw_df = _timeseries_subset(
    ["WASSERSTAND ROHDATEN"],
    ["gaugeZero_unit", "gaugeZero_value", "gaugeZero_validFrom"],
)

water_level_forecast_df = _timeseries_subset(
    ["WASSERSTANDVORHERSAGE"],
    ["timeseries_start", "timeseries_end", "comment_shortDescription", "comment_longDescription"],
)

water_temperature_df = _timeseries_subset(
    ["WASSERTEMPERATUR", "WASSERTEMPERATUR ROHDATEN"],
    ["comment_shortDescription", "comment_longDescription"],
)

air_temperature_df = _timeseries_subset(["LUFTTEMPERATUR"])

drain_df = _timeseries_subset(["ABFLUSS", "ABFLUSS_ROHDATEN"])

In [149]:
# Show the schemas of the two base DataFrames and of one derived subset as a sanity check.
for frame_to_check in (stations_df, timeseries_df, water_level_raw_df):
    frame_to_check.printSchema()

root
 |-- uuid: string (nullable = true)
 |-- number: string (nullable = true)
 |-- shortname: string (nullable = true)
 |-- longname: string (nullable = true)
 |-- km: double (nullable = true)
 |-- agency: string (nullable = true)
 |-- longitude: double (nullable = true)
 |-- latitude: double (nullable = true)
 |-- water_shortname: string (nullable = true)
 |-- water_longname: string (nullable = true)

root
 |-- station_uuid: string (nullable = true)
 |-- shortname: string (nullable = true)
 |-- longname: string (nullable = true)
 |-- unit: string (nullable = true)
 |-- equidistance: integer (nullable = true)
 |-- timeseries_start: timestamp (nullable = true)
 |-- timeseries_end: timestamp (nullable = true)
 |-- comment_shortDescription: string (nullable = true)
 |-- comment_longDescription: string (nullable = true)
 |-- gaugeZero_unit: string (nullable = true)
 |-- gaugeZero_value: double (nullable = true)
 |-- gaugeZero_validFrom: date (nullable = true)

root
 |-- station_uuid: stri

In [150]:
# Spot-check the raw water-level subset by printing its first five rows.
sample_rows = water_level_raw_df.head(5)
print("Sample of water_df:", sample_rows, sep="\n")

Sample of water_df:
[Row(station_uuid='e6d68ab7-5c27-4f25-896f-11dbf04056cd', shortname='W', longname='WASSERSTAND ROHDATEN', unit='cm', equidistance=15, gaugeZero_unit='m. ü. NHN', gaugeZero_value=297.043, gaugeZero_validFrom=datetime.date(2019, 1, 1)), Row(station_uuid='616dd98e-816d-4e17-b8cd-57b163dbc8a2', shortname='W', longname='WASSERSTAND ROHDATEN', unit='cm', equidistance=15, gaugeZero_unit='m. ü. NHN', gaugeZero_value=299.575, gaugeZero_validFrom=datetime.date(2019, 11, 1)), Row(station_uuid='53d40547-8a09-4b25-988c-2e6d8d8d98ee', shortname='W', longname='WASSERSTAND ROHDATEN', unit='cm', equidistance=15, gaugeZero_unit='m. ü. NHN', gaugeZero_value=306.972, gaugeZero_validFrom=datetime.date(2019, 11, 1)), Row(station_uuid='7fe63a95-8ff6-4cff-9a29-a124136516b5', shortname='W', longname='WASSERSTAND ROHDATEN', unit='cm', equidistance=15, gaugeZero_unit='m. ü. NHN', gaugeZero_value=308.123, gaugeZero_validFrom=datetime.date(2019, 11, 1)), Row(station_uuid='7cb7461b-3530-4c01-897

In [151]:
# Database connection parameters.
# Each value is read from the libpq-style environment variable when it is set, so
# real credentials never have to be typed into (or committed with) the notebook.
# The fallbacks are the values this notebook originally hard-coded and are meant
# for a throwaway local/dev database only.
conn_params = {
    "dbname": os.environ.get("PGDATABASE", "db"),
    "user": os.environ.get("PGUSER", "postgres"),
    "password": os.environ.get("PGPASSWORD", "12345"),
    "host": os.environ.get("PGHOST", "pg_container"),
    # Environment values are strings; keep the port an int as before.
    "port": int(os.environ.get("PGPORT", "5432")),
}

# PostgreSQL DDL for the six target tables. Every statement uses
# CREATE TABLE IF NOT EXISTS, so executing it against an existing table is a no-op.
# NOTE(review): the column identifiers are unquoted, so PostgreSQL is expected to store
# mixed-case names such as gaugeZero_unit in lowercase - confirm before quoting them anywhere.

# Parent table: one row per station, keyed by the station uuid.
create_stations_table_query = """
CREATE TABLE IF NOT EXISTS stations (
    uuid UUID PRIMARY KEY,
    number VARCHAR(255),
    shortname VARCHAR(255),
    longname VARCHAR(255),
    km DOUBLE PRECISION,
    agency VARCHAR(255),
    longitude DOUBLE PRECISION,
    latitude DOUBLE PRECISION,
    water_shortname VARCHAR(255),
    water_longname VARCHAR(255)
);
"""

# Raw water-level timeseries metadata, including the gauge-zero reference fields.
# Like the four tables below it has a serial surrogate key and a foreign key to stations(uuid).
create_water_level_raw_table_query = """
CREATE TABLE IF NOT EXISTS water_level_raw (
    id SERIAL PRIMARY KEY,
    station_uuid UUID REFERENCES stations(uuid),
    shortname VARCHAR(255),
    longname VARCHAR(255),
    unit VARCHAR(50),
    equidistance INTEGER,
    gaugeZero_unit VARCHAR(50),
    gaugeZero_value DOUBLE PRECISION,
    gaugeZero_validFrom DATE
);
"""

# Water-level forecast timeseries metadata: start/end timestamps plus the comment texts.
create_water_level_forecast_table_query = """
CREATE TABLE IF NOT EXISTS water_level_forecast (
    id SERIAL PRIMARY KEY,
    station_uuid UUID REFERENCES stations(uuid),
    shortname VARCHAR(255),
    longname VARCHAR(255),
    unit VARCHAR(50),
    equidistance INTEGER,
    timeseries_start TIMESTAMPTZ,
    timeseries_end TIMESTAMPTZ,
    comment_shortDescription TEXT,
    comment_longDescription TEXT
);
"""

# Water-temperature timeseries metadata with the comment texts.
create_water_temperature_table_query = """
CREATE TABLE IF NOT EXISTS water_temperature (
    id SERIAL PRIMARY KEY,
    station_uuid UUID REFERENCES stations(uuid),
    shortname VARCHAR(255),
    longname VARCHAR(255),
    unit VARCHAR(50),
    equidistance INTEGER,
    comment_shortDescription TEXT,
    comment_longDescription TEXT
);
"""

# Air-temperature timeseries metadata (base columns only).
create_air_temperature_table_query = """
CREATE TABLE IF NOT EXISTS air_temperature (
    id SERIAL PRIMARY KEY,
    station_uuid UUID REFERENCES stations(uuid),
    shortname VARCHAR(255),
    longname VARCHAR(255),
    unit VARCHAR(50),
    equidistance INTEGER
);
"""

# Discharge ("drain") timeseries metadata (base columns only).
create_drain_table_query = """
CREATE TABLE IF NOT EXISTS drain (
    id SERIAL PRIMARY KEY,
    station_uuid UUID REFERENCES stations(uuid),
    shortname VARCHAR(255),
    longname VARCHAR(255),
    unit VARCHAR(50),
    equidistance INTEGER
);
"""

In [152]:
# Connection settings: `conn_params` is defined once, in the previous cell next to the
# CREATE TABLE statements, and is reused by the functions below. It is deliberately not
# redefined here, so there is a single place to change the database target.

# Function to create the tables
def create_tables():
    """Create the stations table and the five timeseries tables if they are missing.

    All statements run in a single transaction: it is committed when every
    statement succeeds and rolled back if any of them raises. The connection is
    closed in both cases.

    Raises:
        psycopg2.Error: if connecting or executing any statement fails.
    """
    ddl_statements = (
        # stations must come first: every other table references stations(uuid).
        create_stations_table_query,
        create_water_level_raw_table_query,
        create_water_level_forecast_table_query,
        create_water_temperature_table_query,
        create_air_temperature_table_query,
        create_drain_table_query,
    )

    conn = psycopg2.connect(**conn_params)
    try:
        # `with conn` commits or rolls back the transaction but does not close the
        # connection, hence the explicit close() in `finally`.
        with conn:
            with conn.cursor() as cursor:
                for ddl in ddl_statements:
                    cursor.execute(ddl)
    finally:
        conn.close()

# Run the DDL now so every target table exists before rows are inserted further down;
# the statements use CREATE TABLE IF NOT EXISTS, so existing tables are left as they are.
create_tables()

# Function to insert data into PostgreSQL
def insert_data(table, data, columns):
    """Bulk-insert `data` into `table` using a multi-row INSERT.

    Args:
        table: Name of the target table. It is interpolated into the SQL text
            as-is, so it must come from trusted code, never from user input.
        data: Sequence of row tuples whose values follow the order of `columns`.
        columns: Column names to insert into; interpolated as-is like `table`.

    The insert runs in one transaction: committed on success, rolled back if it
    raises. The connection is closed in both cases.

    Raises:
        psycopg2.Error: if connecting or inserting fails.
    """
    insert_query = f"INSERT INTO {table} ({', '.join(columns)}) VALUES %s"

    conn = psycopg2.connect(**conn_params)
    try:
        # `with conn` commits or rolls back the transaction but does not close the
        # connection, hence the explicit close() in `finally`.
        with conn:
            with conn.cursor() as cursor:
                execute_values(cursor, insert_query, data)
    finally:
        conn.close()

In [153]:
# Bring every DataFrame back to the driver as a list of Row tuples, in load order.
(
    stations_data_list,
    water_level_raw_data_list,
    water_level_forecast_data_list,
    water_temperature_data_list,
    air_temperature_data_list,
    drain_data_list,
) = [
    frame.collect()
    for frame in (
        stations_df,
        water_level_raw_df,
        water_level_forecast_df,
        water_temperature_df,
        air_temperature_df,
        drain_df,
    )
]

# Insert column lists; each one follows the column order of the matching DataFrame.
stations_columns = [
    "uuid", "number", "shortname", "longname", "km", "agency",
    "longitude", "latitude", "water_shortname", "water_longname",
]

# Columns shared by all five timeseries tables.
_timeseries_base_columns = ["station_uuid", "shortname", "longname", "unit", "equidistance"]

water_level_raw_columns = _timeseries_base_columns + [
    "gaugeZero_unit", "gaugeZero_value", "gaugeZero_validFrom",
]
water_level_forecast_columns = _timeseries_base_columns + [
    "timeseries_start", "timeseries_end", "comment_shortDescription", "comment_longDescription",
]
water_temperature_columns = _timeseries_base_columns + [
    "comment_shortDescription", "comment_longDescription",
]
air_temperature_columns = list(_timeseries_base_columns)
drain_columns = list(_timeseries_base_columns)

# Load the tables one after another. stations goes first because every other
# table has a foreign key to stations(uuid).
# NOTE(review): these are plain INSERTs and stations.uuid is a primary key, so running
# this cell again against an already populated database will fail on the stations insert.
for target_table, table_rows, table_columns in [
    ("stations", stations_data_list, stations_columns),
    ("water_level_raw", water_level_raw_data_list, water_level_raw_columns),
    ("water_level_forecast", water_level_forecast_data_list, water_level_forecast_columns),
    ("water_temperature", water_temperature_data_list, water_temperature_columns),
    ("air_temperature", air_temperature_data_list, air_temperature_columns),
    ("drain", drain_data_list, drain_columns),
]:
    insert_data(target_table, table_rows, table_columns)

# Verify the insertion by printing up to five rows from each timeseries table.
def _print_table_sample(cursor, table, heading):
    """Print `heading` followed by up to five rows of `table`; return the fetched rows.

    `table` is interpolated into the SQL text, so only pass trusted, hard-coded names.
    """
    # ORDER BY id keeps the sample stable between runs; LIMIT on its own leaves the
    # choice of rows up to the database. Every table queried here has an `id` column.
    cursor.execute(f"SELECT * FROM {table} ORDER BY id LIMIT 5")
    rows = cursor.fetchall()
    print(heading)
    for row in rows:
        print(row)
    return rows


conn = psycopg2.connect(**conn_params)
try:
    with conn.cursor() as cursor:
        water_level_raw_rows = _print_table_sample(
            cursor, "water_level_raw", "Water Level Raw Data:")
        water_level_forecast_rows = _print_table_sample(
            cursor, "water_level_forecast", "\nWater Level Forecast Data:")
        water_temperature_rows = _print_table_sample(
            cursor, "water_temperature", "\nWater Temperature Data:")
        air_temperature_rows = _print_table_sample(
            cursor, "air_temperature", "\nAir Temperature Data:")
        drain_rows = _print_table_sample(
            cursor, "drain", "\nDrain Data:")
finally:
    # Close the connection even when one of the queries fails.
    conn.close()

Water Level Raw Data:
(1, 'e6d68ab7-5c27-4f25-896f-11dbf04056cd', 'W', 'WASSERSTAND ROHDATEN', 'cm', 15, 'm. ü. NHN', 297.043, datetime.date(2019, 1, 1))
(2, '616dd98e-816d-4e17-b8cd-57b163dbc8a2', 'W', 'WASSERSTAND ROHDATEN', 'cm', 15, 'm. ü. NHN', 299.575, datetime.date(2019, 11, 1))
(3, '53d40547-8a09-4b25-988c-2e6d8d8d98ee', 'W', 'WASSERSTAND ROHDATEN', 'cm', 15, 'm. ü. NHN', 306.972, datetime.date(2019, 11, 1))
(4, '7fe63a95-8ff6-4cff-9a29-a124136516b5', 'W', 'WASSERSTAND ROHDATEN', 'cm', 15, 'm. ü. NHN', 308.123, datetime.date(2019, 11, 1))
(5, '7cb7461b-3530-4c01-8978-7f676b8f71ed', 'W', 'WASSERSTAND ROHDATEN', 'cm', 15, 'm. ü. NHN', 116.2, datetime.date(2019, 11, 1))

Water Level Forecast Data:
(1, 'e6d68ab7-5c27-4f25-896f-11dbf04056cd', 'WV', 'WASSERSTANDVORHERSAGE', 'cm', 120, datetime.datetime(2024, 7, 13, 13, 0, tzinfo=datetime.timezone.utc), datetime.datetime(2024, 7, 17, 3, 0, tzinfo=datetime.timezone.utc), 'nwv-bfg', 'Vorhersagen und Abschätzungen vom: 13.07.2024 um 05:0