In [23]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode, struct, collect_list, element_at
import requests
import json

In [24]:
spark = SparkSession.builder \
    .appName("mini-project") \
    .getOrCreate()

In [25]:
api = "https://www.pegelonline.wsv.de/webservices/rest-api/v2/stations.json?includeTimeseries=true&hasTimeseries=WV&includeForecastTimeseries=true"
responses = requests.get(api).json()
print(responses[0])

{'uuid': 'e6d68ab7-5c27-4f25-896f-11dbf04056cd', 'number': '10089006', 'shortname': 'VILSHOFEN', 'longname': 'VILSHOFEN', 'km': 2249.5, 'agency': 'STANDORT REGENSBURG', 'longitude': 13.182352727075328, 'latitude': 48.63719446076585, 'water': {'shortname': 'DONAU', 'longname': 'DONAU'}, 'timeseries': [{'shortname': 'W', 'longname': 'WASSERSTAND ROHDATEN', 'unit': 'cm', 'equidistance': 15, 'gaugeZero': {'unit': 'm. ü. NHN', 'value': 297.043, 'validFrom': '2019-01-01'}}, {'shortname': 'WV', 'longname': 'WASSERSTANDVORHERSAGE', 'unit': 'cm', 'equidistance': 120, 'start': '2024-07-13T07:00:00+02:00', 'end': '2024-07-16T05:00:00+02:00', 'comment': {'shortDescription': 'nwv-bfg', 'longDescription': 'Vorhersagen und Abschätzungen vom: 12.07.2024 um 05:00 Uhr, Quelle: Bundesanstalt für GewässerkundeWeitere Informationen zur Unterscheidung von Vorhersage und Abschätzung finden Sie auf den Seiten der Bundesanstalt für Gewässerkunde'}}, {'shortname': 'LT', 'longname': 'LUFTTEMPERATUR', 'unit': '°C

In [26]:
timeseries_data = [{'uuid': item['uuid'], 'timeseries_data': item['timeseries']} for item in responses]
[item.pop('timeseries') for item in responses]

[[{'shortname': 'W',
   'longname': 'WASSERSTAND ROHDATEN',
   'unit': 'cm',
   'equidistance': 15,
   'gaugeZero': {'unit': 'm. ü. NHN',
    'value': 297.043,
    'validFrom': '2019-01-01'}},
  {'shortname': 'WV',
   'longname': 'WASSERSTANDVORHERSAGE',
   'unit': 'cm',
   'equidistance': 120,
   'start': '2024-07-13T07:00:00+02:00',
   'end': '2024-07-16T05:00:00+02:00',
   'comment': {'shortDescription': 'nwv-bfg',
    'longDescription': 'Vorhersagen und Abschätzungen vom: 12.07.2024 um 05:00 Uhr, Quelle: Bundesanstalt für GewässerkundeWeitere Informationen zur Unterscheidung von Vorhersage und Abschätzung finden Sie auf den Seiten der Bundesanstalt für Gewässerkunde'}},
  {'shortname': 'LT',
   'longname': 'LUFTTEMPERATUR',
   'unit': '°C',
   'equidistance': 60},
  {'shortname': 'WT',
   'longname': 'WASSERTEMPERATUR',
   'unit': '°C',
   'equidistance': 60,
   'comment': {'shortDescription': 'Wassertemperatursonde ausgefallen.',
    'longDescription': 'Wassertemperatursonde ausge

In [27]:
response_str = json.dumps(responses)

# Load the JSON string into a Spark DataFrame
response_df = spark.read.json(spark.sparkContext.parallelize([response_str]))

# Flatten the dataframe
df_flattened = response_df.select(
    col("agency"),
    col("km").cast("double"),
    col("latitude").cast("double"),
    col("longitude").cast("double"),
    col("longname").cast("double"),
    col("number"),
    col("shortname"),
    col("uuid"),
    col("water.longname").alias("water_longname"),
    col("water.shortname").alias("water_shortname")
)

In [28]:
df_flattened.printSchema()
df_flattened.show()

root
 |-- agency: string (nullable = true)
 |-- km: double (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- longname: double (nullable = true)
 |-- number: string (nullable = true)
 |-- shortname: string (nullable = true)
 |-- uuid: string (nullable = true)
 |-- water_longname: string (nullable = true)
 |-- water_shortname: string (nullable = true)

+-------------------+------+------------------+------------------+--------+--------+--------------------+--------------------+--------------+---------------+
|             agency|    km|          latitude|         longitude|longname|  number|           shortname|                uuid|water_longname|water_shortname|
+-------------------+------+------------------+------------------+--------+--------+--------------------+--------------------+--------------+---------------+
|STANDORT REGENSBURG|2249.5| 48.63719446076585|13.182352727075328|    NULL|10089006|           VILSHOFEN|e6d68ab7-5c27-

In [29]:
"""
    Basically, there are five types of data the timeseries contains:
    - WASSERSTAND ROHDATEN: water level raw data
    - WASSERSTANDVORHERSAGE: water level forecast
    - WASSERTEMPERATUR: water temperature
    - LUFTTEMPERATUR: air temperature 
    - ABFLUSS: drain
    We will divide it into five dataframes, each correspond to a table in the db
"""

timeseries_df = spark.createDataFrame(timeseries_data)
joined_df = df_flattened.join(timeseries_df, df_flattened.uuid == timeseries_df.uuid, 'inner').drop(timeseries_df.uuid)
# timeseries_df = timeseries_df.withColumn("timeseries_data", explode("timeseries_data")).select("*")
# timeseries_df = timeseries_df.withColumn("category",element_at(col("timeseries_data"),"longname"))
# timeseries_grouped = timeseries_df.groupBy("category").agg(collect_list(struct(col("uuid"), col("timeseries_data"))).alias("timeseries_data"))
# timeseries_grouped.show(truncate=True)
joined_df.printSchema()
joined_df.show(truncate=False)
print(joined_df.head(5))

root
 |-- agency: string (nullable = true)
 |-- km: double (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- longname: double (nullable = true)
 |-- number: string (nullable = true)
 |-- shortname: string (nullable = true)
 |-- uuid: string (nullable = true)
 |-- water_longname: string (nullable = true)
 |-- water_shortname: string (nullable = true)
 |-- timeseries_data: array (nullable = true)
 |    |-- element: map (containsNull = true)
 |    |    |-- key: string
 |    |    |-- value: string (valueContainsNull = true)

+-------------------+-------+------------------+------------------+--------+--------+-----------+------------------------------------+--------------+---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [30]:
water_level_raw_df = timeseries_grouped.filter(timeseries_grouped["category"] == "ASSERSTAND ROHDATEN") \
                                            .select("category", explode("timeseries_data").alias("timeseries_data"))
water_level_forecast_df = timeseries_grouped.filter(timeseries_grouped["category"] == "WASSERSTANDVORHERSAGE") \
                                            .select("category", explode("timeseries_data").alias("timeseries_data"))
water_temperature_df = timeseries_grouped.filter(timeseries_grouped["category"] == "WASSERTEMPERATUR") \
                                            .select("category", explode("timeseries_data").alias("timeseries_data"))
air_temperature_df = timeseries_grouped.filter(timeseries_grouped["category"] == "LUFTTEMPERATUR") \
                                            .select("category", explode("timeseries_data").alias("timeseries_data"))
drain_df = timeseries_grouped.filter(timeseries_grouped["category"] == "ABFLUSS") \
                                            .select("category", explode("timeseries_data").alias("timeseries_data"))

water_temperature_df.show(truncate=False)

+----------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|category        |timeseries_data                                                                                                                                                                                                                                                                                   |
+----------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|WASSERTEMPERATUR|{e6d68ab7-5c27-4f25-896f-11dbf04056cd, {longname -> 

In [31]:
!pip install psycopg2-binary



In [32]:
import psycopg2
from psycopg2.extras import execute_values

conn_params = {
    "dbname": "db",
    "user": "postgres",
    "password": "12345",
    "host": "pg_container",
    "port": 5432
}

conn = psycopg2.connect(**conn_params)
cursor = conn.cursor()

cursor.execute("""
CREATE TABLE station_timeseries (
    agency VARCHAR(255),
    km DOUBLE PRECISION,
    latitude DOUBLE PRECISION,
    longitude DOUBLE PRECISION,
    longname VARCHAR(255),
    number VARCHAR(255),
    shortname VARCHAR(255),
    uuid UUID,
    water_longname VARCHAR(255),
    water_shortname VARCHAR(255),
    category VARCHAR(255),
    timeseries_uuid UUID,
    timeseries_end TIMESTAMPTZ,
    timeseries_unit VARCHAR(50),
    timeseries_start TIMESTAMPTZ,
    timeseries_comment TEXT,
    timeseries_longname VARCHAR(255),
    timeseries_shortname VARCHAR(50),
    timeseries_equidistance VARCHAR(50)
);


""")
conn.commit()

In [33]:
import psycopg2
from psycopg2.extras import execute_values

# Function to insert data into PostgreSQL
def insert_data(table, data, columns):
    conn = psycopg2.connect(**conn_params)
    cursor = conn.cursor()
    insert_query = f"INSERT INTO {table} ({', '.join(columns)}) VALUES %s"
    execute_values(cursor, insert_query, data)
    conn.commit()
    cursor.close()
    conn.close()

# Convert Spark DataFrame to list of tuples for insertion into PostgreSQL
joined_data = [tuple(row) for row in joined_df.collect()]

# Define the columns
columns = [
    "agency", "km", "latitude", "longitude", "longname", "number", "shortname", 
    "uuid", "water_longname", "water_shortname", "timeseries_uuid", "category", 
    "unit", "start", "end", "comment", "equidistance"
]

# Insert joined data into PostgreSQL
insert_data("station_timeseries", joined_data, columns)

conn = psycopg2.connect(**conn_params)
cursor = conn.cursor()
cursor.execute("SELECT * FROM station_timeseries")
rows = cursor.fetchall()
for row in rows:
    print(row)

cursor.close()
conn.close()


ProgrammingError: can't adapt type 'dict'