In [245]:
import json
import os

import psycopg2
import requests
from psycopg2.extras import execute_values
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode, struct, collect_list, element_at, lit
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType, TimestampType, DateType

In [246]:
# Obtain the Spark session used by the rest of the notebook
# (getOrCreate hands back an already-running session when there is one).
spark = (
    SparkSession.builder
    .appName("mini-project")
    .getOrCreate()
)

In [247]:
# Pegelonline stations endpoint. The query string asks for the timeseries
# (including forecast timeseries) to be embedded in each station and filters
# on hasTimeseries=WV.
api = "https://www.pegelonline.wsv.de/webservices/rest-api/v2/stations.json?includeTimeseries=true&hasTimeseries=WV&includeForecastTimeseries=true"

# A timeout stops the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() turns an HTTP error page into an exception here
# instead of a confusing failure further down when the payload is iterated.
api_response = requests.get(api, timeout=30)
api_response.raise_for_status()
responses = api_response.json()

In [248]:
# Accumulators for the flattened API payload: one dict per station and one
# dict per timeseries. They are filled in by the loop in the next cell.
stations_data, timeseries_data = [], []

In [249]:
# Flatten the nested API payload into two flat record lists:
#   stations_data   - one dict per station (water sub-object inlined)
#   timeseries_data - one dict per timeseries, tagged with its station's uuid
#
# Stations are de-duplicated on their uuid with a set lookup. Seeding the set
# from stations_data makes the cell safe to re-run: a station that is already
# stored is skipped together with its timeseries, so neither list accumulates
# duplicates.
seen_station_uuids = {entry["uuid"] for entry in stations_data}

for station in responses:
    # The uuid is the key of the record, so it is the only field that is
    # required; a payload without it is a genuine error and raises KeyError.
    station_uuid = station["uuid"]
    if station_uuid in seen_station_uuids:
        continue
    seen_station_uuids.add(station_uuid)

    # `or {}` also covers a key that is present but explicitly null.
    water = station.get("water") or {}

    # Station entry. Non-key fields use .get() so that a station lacking one
    # of them is stored with None rather than aborting the whole load.
    station_entry = {
        "uuid": station_uuid,
        "number": station.get("number"),
        "shortname": station.get("shortname"),
        "longname": station.get("longname"),
        "km": station.get("km"),
        "agency": station.get("agency"),
        "longitude": station.get("longitude"),
        "latitude": station.get("latitude"),
        "water_shortname": water.get("shortname"),
        "water_longname": water.get("longname")
    }
    stations_data.append(station_entry)

    # Timeseries data for the current station
    for ts in station.get("timeseries") or []:
        # Optional nested objects; fall back to an empty dict when the key is
        # missing or null so the chained lookups below cannot fail.
        comment = ts.get("comment") or {}
        gauge_zero = ts.get("gaugeZero") or {}

        timeseries_entry = {
            "station_uuid": station_uuid,
            "shortname": ts.get("shortname"),
            "longname": ts.get("longname"),
            "unit": ts.get("unit"),
            "equidistance": ts.get("equidistance"),
            "timeseries_start": ts.get("start"),
            "timeseries_end": ts.get("end"),
            "comment_shortDescription": comment.get("shortDescription"),
            "comment_longDescription": comment.get("longDescription"),
            "gaugeZero_unit": gauge_zero.get("unit"),
            "gaugeZero_value": gauge_zero.get("value"),
            "gaugeZero_validFrom": gauge_zero.get("validFrom")
        }
        timeseries_data.append(timeseries_entry)

In [250]:
# Explicit Spark schema for the flattened station records.
# Every column is declared nullable.
station_schema = StructType([
    StructField(field_name, field_type(), True)
    for field_name, field_type in (
        ("uuid", StringType),
        ("number", StringType),
        ("shortname", StringType),
        ("longname", StringType),
        ("km", DoubleType),
        ("agency", StringType),
        ("longitude", DoubleType),
        ("latitude", DoubleType),
        ("water_shortname", StringType),
        ("water_longname", StringType),
    )
])

In [251]:
# Explicit Spark schema for the flattened timeseries records.
# Every column is declared nullable; start/end are timestamps and
# gaugeZero_validFrom is a date.
timeseries_schema = StructType([
    StructField(field_name, field_type(), True)
    for field_name, field_type in (
        ("station_uuid", StringType),
        ("shortname", StringType),
        ("longname", StringType),
        ("unit", StringType),
        ("equidistance", IntegerType),
        ("timeseries_start", TimestampType),
        ("timeseries_end", TimestampType),
        ("comment_shortDescription", StringType),
        ("comment_longDescription", StringType),
        ("gaugeZero_unit", StringType),
        ("gaugeZero_value", DoubleType),
        ("gaugeZero_validFrom", DateType),
    )
])

In [252]:
# Serialise each record list into one JSON document (a top-level array).
stations_str = json.dumps(stations_data)
timeseries_str = json.dumps(timeseries_data)

# Wrap each document in a single-element RDD and hand it to Spark's JSON
# reader, which applies the explicit schema instead of inferring one.
stations_rdd = spark.sparkContext.parallelize([stations_str])
timeseries_rdd = spark.sparkContext.parallelize([timeseries_str])

stations_df = spark.read.json(stations_rdd, schema=station_schema)
timeseries_df = spark.read.json(timeseries_rdd, schema=timeseries_schema)

In [253]:
# Show the column names and types of both DataFrames (stations first).
for frame in (stations_df, timeseries_df):
    frame.printSchema()

root
 |-- uuid: string (nullable = true)
 |-- number: string (nullable = true)
 |-- shortname: string (nullable = true)
 |-- longname: string (nullable = true)
 |-- km: double (nullable = true)
 |-- agency: string (nullable = true)
 |-- longitude: double (nullable = true)
 |-- latitude: double (nullable = true)
 |-- water_shortname: string (nullable = true)
 |-- water_longname: string (nullable = true)

root
 |-- station_uuid: string (nullable = true)
 |-- shortname: string (nullable = true)
 |-- longname: string (nullable = true)
 |-- unit: string (nullable = true)
 |-- equidistance: integer (nullable = true)
 |-- timeseries_start: timestamp (nullable = true)
 |-- timeseries_end: timestamp (nullable = true)
 |-- comment_shortDescription: string (nullable = true)
 |-- comment_longDescription: string (nullable = true)
 |-- gaugeZero_unit: string (nullable = true)
 |-- gaugeZero_value: double (nullable = true)
 |-- gaugeZero_validFrom: date (nullable = true)



In [254]:
# Print the first five rows of each DataFrame as a quick sanity check.
for frame_name, frame in (("stations_df", stations_df), ("timeseries_df", timeseries_df)):
    print(f"Sample of {frame_name}:")
    print(frame.head(5))

Sample of stations_df:
[Row(uuid='e6d68ab7-5c27-4f25-896f-11dbf04056cd', number='10089006', shortname='VILSHOFEN', longname='VILSHOFEN', km=2249.5, agency='STANDORT REGENSBURG', longitude=13.182352727075328, latitude=48.63719446076585, water_shortname='DONAU', water_longname='DONAU'), Row(uuid='616dd98e-816d-4e17-b8cd-57b163dbc8a2', number='10088003', shortname='HOFKIRCHEN', longname='HOFKIRCHEN', km=2256.9, agency='STANDORT REGENSBURG', longitude=13.115159742918724, latitude=48.676629660669875, water_shortname='DONAU', water_longname='DONAU'), Row(uuid='53d40547-8a09-4b25-988c-2e6d8d8d98ee', number='10081004', shortname='DEGGENDORF', longname='DEGGENDORF', km=2284.4, agency='STANDORT REGENSBURG', longitude=12.962349954256105, latitude=48.824943730974475, water_shortname='DONAU', water_longname='DONAU'), Row(uuid='7fe63a95-8ff6-4cff-9a29-a124136516b5', number='10078000', shortname='PFELLING', longname='PFELLING', km=2305.5, agency='STANDORT REGENSBURG', longitude=12.74721651873093, lat

In [255]:
# Collect both DataFrames and turn each Row into a plain tuple, the row shape
# handed to the PostgreSQL insert later on.
# NOTE(review): the timestamp columns become Python datetime values here;
# confirm their timezone handling matches the TIMESTAMPTZ target columns.
stations_data_list = list(map(tuple, stations_df.collect()))
timeseries_data_list = list(map(tuple, timeseries_df.collect()))

In [256]:
# Target column names for the INSERT statements. The order must match the
# field order of the corresponding Spark schema, because rows are inserted
# positionally as tuples.
stations_columns = [
    "uuid",
    "number",
    "shortname",
    "longname",
    "km",
    "agency",
    "longitude",
    "latitude",
    "water_shortname",
    "water_longname",
]

timeseries_columns = [
    "station_uuid",
    "shortname",
    "longname",
    "unit",
    "equidistance",
    "timeseries_start",
    "timeseries_end",
    "comment_shortDescription",
    "comment_longDescription",
    "gaugeZero_unit",
    "gaugeZero_value",
    "gaugeZero_validFrom",
]

In [257]:
# DDL for the two target PostgreSQL tables. Both statements use
# CREATE TABLE IF NOT EXISTS, so executing them against a database that
# already has the tables leaves the existing tables untouched.

# stations: one row per station, keyed by the station uuid.
create_stations_table_query = """
CREATE TABLE IF NOT EXISTS stations (
    uuid UUID PRIMARY KEY,
    number VARCHAR(255),
    shortname VARCHAR(255),
    longname VARCHAR(255),
    km DOUBLE PRECISION,
    agency VARCHAR(255),
    longitude DOUBLE PRECISION,
    latitude DOUBLE PRECISION,
    water_shortname VARCHAR(255),
    water_longname VARCHAR(255)
);
"""

# timeseries: one row per timeseries with a generated SERIAL id;
# station_uuid is a foreign key to stations(uuid), so the stations table
# must be created (and populated) before this one.
create_timeseries_table_query = """
CREATE TABLE IF NOT EXISTS timeseries (
    id SERIAL PRIMARY KEY,
    station_uuid UUID REFERENCES stations(uuid),
    shortname VARCHAR(255),
    longname VARCHAR(255),
    unit VARCHAR(50),
    equidistance INTEGER,
    timeseries_start TIMESTAMPTZ,
    timeseries_end TIMESTAMPTZ,
    comment_shortDescription TEXT,
    comment_longDescription TEXT,
    gaugeZero_unit VARCHAR(50),
    gaugeZero_value DOUBLE PRECISION,
    gaugeZero_validFrom DATE
);
"""

In [258]:
# Database connection parameters.
# Each value can be overridden through the standard libpq environment
# variables (PGDATABASE, PGUSER, PGPASSWORD, PGHOST, PGPORT), so real
# credentials do not have to be written into the notebook. The defaults are
# the values this notebook used before, which keeps the local setup working.
conn_params = {
    "dbname": os.environ.get("PGDATABASE", "db"),
    "user": os.environ.get("PGUSER", "postgres"),
    # NOTE: the fallback password is kept only for backward compatibility
    # with the existing local container; set PGPASSWORD anywhere else.
    "password": os.environ.get("PGPASSWORD", "12345"),
    "host": os.environ.get("PGHOST", "pg_container"),
    # Environment values are strings; keep the port an int as before.
    "port": int(os.environ.get("PGPORT", "5432")),
}

# Function to create the tables
def create_tables():
    """Create the ``stations`` and ``timeseries`` tables if they are missing.

    Opens a connection using ``conn_params``, executes
    ``create_stations_table_query`` followed by
    ``create_timeseries_table_query`` (stations first, because timeseries
    references it) and commits.

    Raises:
        psycopg2.Error: if connecting or executing either statement fails.
            The transaction is rolled back and the connection is still closed.
    """
    conn = psycopg2.connect(**conn_params)
    try:
        # `with conn` commits on success and rolls back on an exception;
        # `with conn.cursor()` closes the cursor. Neither closes the
        # connection itself, hence the surrounding try/finally.
        with conn, conn.cursor() as cursor:
            cursor.execute(create_stations_table_query)
            cursor.execute(create_timeseries_table_query)
    finally:
        conn.close()

create_tables()

# Function to insert data into PostgreSQL
def insert_data(table, data, columns):
    """Bulk-insert rows into a PostgreSQL table in a single transaction.

    Args:
        table: Name of the target table. It is interpolated into the SQL text
            as-is, so it must be a trusted identifier, never user input.
        data: Sequence of row tuples whose values are ordered like ``columns``.
            An empty sequence is a no-op and no connection is opened.
        columns: Column names to insert into, interpolated as-is (trusted).

    Raises:
        psycopg2.Error: if connecting or inserting fails. The transaction is
            rolled back and the connection is still closed.
    """
    if not data:
        return

    # Identifiers are deliberately left unquoted: the tables in this notebook
    # are created with unquoted mixed-case column names, which PostgreSQL
    # folds to lower case, so quoting them here would no longer match.
    insert_query = f"INSERT INTO {table} ({', '.join(columns)}) VALUES %s"

    conn = psycopg2.connect(**conn_params)
    try:
        # `with conn` commits on success and rolls back on an exception;
        # `with conn.cursor()` closes the cursor. The connection itself is
        # closed in the finally clause.
        with conn, conn.cursor() as cursor:
            execute_values(cursor, insert_query, data)
    finally:
        conn.close()

In [259]:
# Insert station data into PostgreSQL.
# Stations go first because timeseries.station_uuid references stations(uuid).
# NOTE: insert_data issues a plain INSERT and stations.uuid is a primary key,
# so running this cell again against an already populated table raises a
# unique-violation error.
insert_data("stations", stations_data_list, stations_columns)

# Insert timeseries data into PostgreSQL
insert_data("timeseries", timeseries_data_list, timeseries_columns)

# Verify the insertion by reading both tables back and printing every row.
# try/finally and the cursor context manager make sure the cursor and the
# connection are released even if one of the SELECTs fails.
conn = psycopg2.connect(**conn_params)
try:
    with conn.cursor() as cursor:
        cursor.execute("SELECT * FROM stations")
        stations_rows = cursor.fetchall()
        for row in stations_rows:
            print(row)

        cursor.execute("SELECT * FROM timeseries")
        timeseries_rows = cursor.fetchall()
        for row in timeseries_rows:
            print(row)
finally:
    conn.close()

('e6d68ab7-5c27-4f25-896f-11dbf04056cd', '10089006', 'VILSHOFEN', 'VILSHOFEN', 2249.5, 'STANDORT REGENSBURG', 13.182352727075328, 48.63719446076585, 'DONAU', 'DONAU')
('616dd98e-816d-4e17-b8cd-57b163dbc8a2', '10088003', 'HOFKIRCHEN', 'HOFKIRCHEN', 2256.9, 'STANDORT REGENSBURG', 13.115159742918724, 48.676629660669875, 'DONAU', 'DONAU')
('53d40547-8a09-4b25-988c-2e6d8d8d98ee', '10081004', 'DEGGENDORF', 'DEGGENDORF', 2284.4, 'STANDORT REGENSBURG', 12.962349954256105, 48.824943730974475, 'DONAU', 'DONAU')
('7fe63a95-8ff6-4cff-9a29-a124136516b5', '10078000', 'PFELLING', 'PFELLING', 2305.5, 'STANDORT REGENSBURG', 12.74721651873093, 48.87976613950202, 'DONAU', 'DONAU')
('7cb7461b-3530-4c01-8978-7f676b8f71ed', '501010', 'SCHÖNA', 'SCHÖNA', 2.05, 'STANDORT DRESDEN', 14.235228651495222, 50.87576232293828, 'ELBE', 'ELBE')
('70272185-b2b3-4178-96b8-43bea330dcae', '501060', 'DRESDEN', 'DRESDEN', 55.63, 'STANDORT DRESDEN', 13.738831783620384, 51.054459765598125, 'ELBE', 'ELBE')
('b04b739d-7ffa-41ee-