#### Data description:
Aruba:
- 1920-1938
- Daily Pressure & Rainfall

ArubaS:
- 1936-1940
- Daily measurements of various variables measured (sometimes whole table, sometimes only wind)

Bonaire:
- 1920-1941
- Daily rain, max and min pressure and temperature (**not always**)

Curacao:
- 1898-1903
- Hourly wind speed

CuracaoM:
- 1902 - 1919
- Various subdaily

Saba:
- 1920 - 1945
- Subdaily pressure, min and max temperature and daily rain

Statia:
- 1943 - 1946
- Subdaily pressure, daily rain

Willemstad:
- 1912 - 1946
- Various subdaily measurements
- TXT format

#### Notes:
- Lot of variability in time scale (hourly, daily, subdaily)
- Lot of variability in recorded variables (between 1 and 12 variables) -> sparse data if all data is put in one table with all columns


In [1]:
import duckdb
import pandas as pd 
import os
from tqdm.auto import tqdm
import time
start_time = time.time()

#### Convert to CSV

In [2]:
dirs = ['Aruba', 'Bonaire', 'Curacao', 'Saba', 'Statia']
for dir in tqdm(dirs):
    if not os.path.exists(f'data/CSVs/{dir}'):
        os.makedirs(f'data/CSVs/{dir}')
    for file in os.listdir(f'data/digitized/{dir}'):
        for sheet in ['Regen', 'Overig']:
            try:
                name, ext = file.split('.')
                df = pd.DataFrame(pd.read_excel(f"data/digitized/{dir}/{file}", sheet_name=sheet)) 
                df.to_csv(f'data/CSVs/{dir}/{name}_{sheet}.csv')
            except ValueError:
                # Catch sheet not found error from files that don't have all sheets
                print(f"Error processing {file} in {sheet} sheet")
                continue

  0%|          | 0/5 [00:00<?, ?it/s]

Error processing Curacao1901.xlsm in Regen sheet
Error processing Curacao1899.xlsm in Regen sheet
Error processing Curacao1903.xlsm in Regen sheet
Error processing Curacao1902.xlsm in Regen sheet
Error processing Curacao1898.xlsm in Regen sheet
Error processing Curacao1900.xlsm in Regen sheet


In [3]:
# Special treatment for Suriname (xls files, and no separate sheets for rain and other 
# resulting in a two level header)
if not os.path.exists('data/CSVs/Suriname'):
    os.makedirs('data/CSVs/Suriname')
for file in os.listdir('data/digitized/Suriname'):
    name, ext = file.split('.')
    df = pd.DataFrame(pd.read_excel(f"data/digitized/Suriname/{file}", header=[0,1])) 
    df.to_csv(f'data/CSVs/Suriname/{name}.csv')

In [4]:
# Special treatment for Willemstad (latin1 encoded TXT files following CSV format)
import csv
if not os.path.exists('data/CSVs/Willemstad'):
    os.makedirs('data/CSVs/Willemstad')
for file in os.listdir('data/digitized/Willemstad'):
    name, ext = file.split('.')
    df = pd.read_csv(f"data/digitized/Willemstad/{file}", delimiter=',', encoding='latin1')
    # use QUOTE_NONE to remove double quotes from the TXT file
    df.to_csv(f'data/CSVs/Willemstad/{name}.csv', quoting=csv.QUOTE_NONE) 

#### DuckDB Config

In [5]:
%load_ext sql
conn = duckdb.connect('dbs/knmi_database.db')
%sql conn --alias duckdb

%config SqlMagic.autopandas = True
%config SqlMagic.feedback = False
%config SqlMagic.displaycon = False

#### Aruba

In [6]:
%%sql
# Create Aruba_Temp table from CSV files
DROP TABLE IF EXISTS Aruba_Temp;
CREATE TABLE IF NOT EXISTS Aruba_Temp AS
SELECT Stn, Datum AS Date, Tijd AS Time, Rd AS Precipitation, qRd AS qPrecipitation, P AS Pressure, 
qP AS qPressure, T AS dryBulbTemperature, qT AS qDryBulbTemperature, Tw AS wetBulbTemperature,
qTw AS qWetBulbTemperature, Tn AS minTemperature, 
qTn AS qMinTemperature, Tx AS maxTemperature, qTx AS qMaxTemperature,
U AS Humidity, qU AS qHumidity, Ed AS VapourPressure, qEd AS qVaporPressure, 
D AS windDirection, qD AS qWindDirection, F AS windSpeed, qF AS qWindSpeed,
N AS cloudCover, qN AS qCloudCover
FROM
(SELECT * FROM read_csv('data/CSVs/Aruba/*Regen.csv', union_by_name = true, dateformat = '%Y%m%d'))
FULL JOIN 
(SELECT * FROM read_csv('data/CSVs/Aruba/*Overig.csv', union_by_name = true, dateformat = '%Y%m%d'))
USING (Stn, Datum)
;
SELECT * FROM Aruba_Temp;

Unnamed: 0,Stn,Date,Time,Precipitation,qPrecipitation,Pressure,qPressure,dryBulbTemperature,qDryBulbTemperature,wetBulbTemperature,...,Humidity,qHumidity,VapourPressure,qVaporPressure,windDirection,qWindDirection,windSpeed,qWindSpeed,cloudCover,qCloudCover
0,32,1920-01-01,,0,0,764.0,0,,,,...,,,,,,,,,,
1,32,1920-01-02,,0,0,764.0,0,,,,...,,,,,,,,,,
2,32,1920-01-03,,0,0,764.0,0,,,,...,,,,,,,,,,
3,32,1920-01-04,,0,0,764.0,0,,,,...,,,,,,,,,,
4,32,1920-01-05,,0,0,764.5,0,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17532,32,1940-12-30,24.0,—,7,—,7,—,7,—,...,—,7,—,7,—,7,—,7,—,7
17533,32,1940-12-31,7.0,—,7,—,7,—,7,—,...,—,7,—,7,—,7,—,7,—,7
17534,32,1940-12-31,12.0,—,7,—,7,—,7,—,...,—,7,—,7,—,7,—,7,—,7
17535,32,1940-12-31,17.0,—,7,—,7,—,7,—,...,—,7,—,7,—,7,—,7,—,7


In [7]:
# Update Aruba_Temp table to set '—' values to NULL
cursor = conn.execute("PRAGMA table_info('Aruba_Temp');")
columns = [row[1] for row in cursor.fetchall()]
failed_columns = []

for column in columns:
    update_query = f"""
    UPDATE Aruba_Temp
    SET {column} = NULL
    WHERE TRIM({column}) = '—';
    """
    try:
        conn.execute(update_query)
    except Exception:
        failed_columns.append(column)
        continue
conn.commit()
print(f"Failed to update columns: {failed_columns}")

Failed to update columns: ['Stn', 'Date', 'Time', 'qPrecipitation', 'qPressure', 'qDryBulbTemperature', 'qWetBulbTemperature', 'qMinTemperature', 'qMaxTemperature', 'qHumidity', 'qVaporPressure', 'qWindDirection', 'qWindSpeed', 'qCloudCover']


In [8]:
%%sql
df << SELECT column_name, column_type, min, max, approx_unique, count, null_percentage
FROM (SUMMARIZE Aruba_Temp)

In [9]:
df.to_latex('Aruba_summary.tex', index=False, float_format="%.2f", escape=False, column_format='lcccccc')

In [10]:
%%sql
# Set the timestamps, as the range is 1920 to 1940 we can use the default timezone of Aruba (America/Aruba)
# Use 24:00 UTC for 24:00 and NULL as specified by SEF
ALTER TABLE Aruba_Temp DROP COLUMN IF EXISTS timestamp;
ALTER TABLE Aruba_Temp ADD COLUMN timestamp TIMESTAMP WITH TIME ZONE;

UPDATE Aruba_Temp AS a
SET timestamp = (
    CASE 
        -- When Time is NULL or 24, set to 24:00 UTC
        WHEN a.Time IS NULL OR CAST(a.Time AS INTEGER) = 24 THEN 
            (a.Date + INTERVAL '24 hours') AT TIME ZONE 'UTC'
        -- For other times, convert to UTC
        ELSE 
            ((a.Date + to_hours(CAST(a.Time AS INTEGER))) AT TIME ZONE 'America/Aruba')
    END
);

Unnamed: 0,Count
0,17537


In [11]:
%%sql
# Normalize column names and types
DROP TABLE IF EXISTS Aruba;
CREATE TABLE Aruba AS
    SELECT 
        Stn, 
        Timestamp,
        # Date, 
        # Time, 
        CAST(Precipitation AS FLOAT)/10 AS Precipitation, 
        qPrecipitation,
        CAST(Pressure AS FLOAT) AS Pressure, 
        qPressure, 
        CAST(dryBulbTemperature AS FLOAT) AS dryBulbTemperature, 
        qDryBulbTemperature,
        CAST(wetBulbTemperature AS FLOAT) AS wetBulbTemperature, 
        qWetBulbTemperature,
        CAST(minTemperature AS FLOAT) AS minTemperature, 
        qMinTemperature,
        CAST(maxTemperature AS FLOAT) AS maxTemperature, 
        qMaxTemperature,
        CAST(Humidity AS INTEGER) AS Humidity, 
        qHumidity,
        CAST(VapourPressure AS FLOAT) AS VapourPressure, 
        qVaporPressure,
        CAST(windDirection AS INTEGER) AS windDirection, 
        qWindDirection,
        CAST(windSpeed AS FLOAT) AS windSpeed, 
        qWindSpeed,
        CAST(cloudCover AS INTEGER) AS cloudCover, 
        qCloudCover
    FROM Aruba_Temp

Unnamed: 0,Count
0,17537


In [12]:
%%sql
FROM (SUMMARIZE Aruba)

Unnamed: 0,column_name,column_type,min,max,approx_unique,avg,std,q25,q50,q75,count,null_percentage
0,Stn,BIGINT,32,32,1,32.0,0.0,32,32,32,17537,0.0
1,timestamp,TIMESTAMP WITH TIME ZONE,1920-01-02 00:00:00+00,1941-01-01 02:00:00+02,14070,,,1936-03-14 04:12:27.771477+00,1937-05-27 02:01:05.860587+01,1938-08-07 23:49:10.670611+01,17537,0.0
2,Precipitation,FLOAT,0.0,413.0,175,1.3320365699529593,10.385958765918335,0.0,0.0,0.0,17537,38.88
3,qPrecipitation,BIGINT,0,7,2,2.721446085419399,3.4124064528172235,0,0,7,17537,0.0
4,Pressure,FLOAT,752.8,769.5,70,762.1354909699613,2.9613626858759625,759.6532912865285,761.9210126582277,765.0,17537,59.53
5,qPressure,BIGINT,0,7,2,4.166790214974055,3.4359953460516395,0,7,7,17537,0.0
6,dryBulbTemperature,FLOAT,22.3,32.4,83,27.48115851513373,1.6792784072518914,26.49758124549733,27.5014005655668,28.60200783815222,17537,82.08
7,qDryBulbTemperature,BIGINT,0,7,2,5.118884707492303,3.103229921713479,0,7,7,17537,33.33
8,wetBulbTemperature,FLOAT,19.0,31.0,79,24.652291539518185,1.6274545004103582,23.418539198079674,24.568138535645343,25.737704895868266,17537,82.08
9,qWetBulbTemperature,BIGINT,0,7,2,5.118884707492303,3.103229921713479,0,7,7,17537,33.33


#### Bonaire

In [13]:
%%sql
# Load the Bonaire_Temp table from CSV files
CREATE TABLE IF NOT EXISTS Bonaire_Temp AS
(SELECT *  FROM
  (SELECT * FROM read_csv('data/CSVs/Bonaire/*Regen.csv', union_by_name = true, dateformat = '%Y%m%d')) t1
FULL JOIN 
  (SELECT * FROM read_csv('data/CSVs/Bonaire/*Overig.csv', union_by_name = true, dateformat = '%Y%m%d')) t2
USING (Stn, Datum))
;

Unnamed: 0,Count


In [14]:
# Update Bonaire_Temp table to set '—' values to NULL
cursor = conn.execute("PRAGMA table_info('Bonaire_Temp');")
columns = [row[1] for row in cursor.fetchall()]
failed_columns = []

for column in columns:
    update_query = f"""
    UPDATE Bonaire_Temp
    SET {column} = NULL
    WHERE TRIM({column}) = '—';
    """
    try:
        conn.execute(update_query)
    except Exception:
        failed_columns.append(column)
        continue
conn.commit()
print(f"Failed to update columns: {failed_columns}")

Failed to update columns: ['column0', 'Stn', 'Datum', 'Niveau', 'qRd', 'column00', 'Tijd', 'Px', 'qPx', 'Pn', 'qPn', 'Tx', 'qTx', 'Tn', 'qTn', 'column0_1', 'qP', 'qT', 'timestamp']


In [15]:
%%sql
# Set the timestamps with the default timezone of Bonaire (America/Aruba)
# Use 24:00 UTC for times that are NULL or 24 as specified by SEF
ALTER TABLE Bonaire_Temp DROP COLUMN IF EXISTS timestamp;
ALTER TABLE Bonaire_Temp ADD COLUMN timestamp TIMESTAMP WITH TIME ZONE;

UPDATE Bonaire_Temp AS a
SET timestamp = (
    CASE 
        -- When Time is NULL or 24, set to 24:00 UTC
        WHEN a.Tijd IS NULL OR CAST(a.Tijd AS INTEGER) = 24 THEN 
            (a.Datum + INTERVAL '24 hours') AT TIME ZONE 'UTC'
        -- For other times, convert to UTC
        ELSE 
            ((a.Datum + to_hours(CAST(a.Tijd AS INTEGER))) AT TIME ZONE 'America/Aruba')
    END
);

Unnamed: 0,Count
0,20091


In [16]:
%%sql
# Normalize column names and types for Bonaire
DROP TABLE IF EXISTS Bonaire;
CREATE TABLE Bonaire AS
    SELECT 
        Stn, 
        Datum AS Date, 
        Tijd AS Time,
        Timestamp, 
        CAST(Rd AS FLOAT)/10 AS Precipitation,
        qRd AS qPrecipitation, 
        CAST(Px AS FLOAT) AS maxPressure, 
        qP AS qPressure,
        CAST(Pn AS FLOAT) AS minPressure,
        qPn AS qMinPressure,
        CAST(Tx AS FLOAT) AS maxTemperature,
        qTx AS qMaxTemperature,
        CAST(Tn AS FLOAT) AS minTemperature,
        qTn AS qMinTemperature,
        CAST(P AS FLOAT) AS Pressure,
        qP AS qPressure,
        CAST(T AS FLOAT) AS dryBulbTemperature,
        qT AS qDryBulbTemperature
    FROM Bonaire_Temp;

Unnamed: 0,Count
0,20091


In [17]:
%%sql
FROM(SUMMARIZE Bonaire)

Unnamed: 0,column_name,column_type,min,max,approx_unique,avg,std,q25,q50,q75,count,null_percentage
0,Stn,BIGINT,42,49,3,44.14533870887462,2.1860411835670304,42,44,44,20091,0.0
1,Date,DATE,1920-01-01,1941-12-31,5459,,,1930-12-01,1935-02-15,1938-07-26,20091,0.0
2,Time,DOUBLE,8.0,24.0,3,11.90400918403391,4.091752787139371,8.0,14.976716315699367,15.0,20091,43.64
3,timestamp,TIMESTAMP WITH TIME ZONE,1920-01-02 00:00:00+00,1942-01-01 02:00:00+02,19531,,,1930-12-01 16:05:33.06737+00,1935-02-16 04:15:42.867098+00,1938-07-26 21:05:23.533745+01,20091,0.0
4,Precipitation,FLOAT,0.0,168.0,356,1.4914091171749198,6.435168753628951,0.0,0.0,0.0,20091,6.43
5,qPrecipitation,BIGINT,0,7,2,0.4501518092678314,1.717140872020826,0,0,0,20091,0.0
6,maxPressure,FLOAT,753.0,760.5,13,757.0765027322404,1.2201032108792575,756.3888888888889,757.0,758.0,20091,98.18
7,qPressure,BIGINT,0,7,2,0.2734075561233802,1.356196537477642,0,0,0,20091,45.46
8,minPressure,FLOAT,751.5,759.0,15,755.389344262295,1.341914847984338,754.5,755.09,756.265625,20091,98.18
9,qMinPressure,BIGINT,0,0,1,0.0,0.0,0,0,0,20091,98.18


#### Curacao

In [18]:
%%sql
# Load the Curacao_Temp table from CSV files
CREATE TABLE IF NOT EXISTS Curacao_Temp AS
SELECT * FROM
  (SELECT * FROM read_csv('data/CSVs/Curacao/*Regen.csv', union_by_name = true, dateformat = '%Y%m%d')) t1
FULL JOIN 
  (SELECT * FROM read_csv('data/CSVs/Curacao/*Overig.csv', union_by_name = true, dateformat = '%Y%m%d')) t2
USING (Stn, Datum)
;

Unnamed: 0,Count


In [19]:
# Update Curacao_Temp table to set '—' values to NULL
cursor = conn.execute("PRAGMA table_info('Curacao_Temp');")
columns = [row[1] for row in cursor.fetchall()]
failed_columns = []

for column in columns:
    update_query = f"""
    UPDATE Curacao_Temp
    SET {column} = NULL
    WHERE TRIM({column}) = '—';
    """
    try:
        conn.execute(update_query)
    except Exception:
        failed_columns.append(column)
        continue
conn.commit()
print(f"Failed to update columns: {failed_columns}")

Failed to update columns: ['column0', 'Stn', 'Datum', 'Niveau', 'qRd', 'column0_1', 'Tijd', 'qF', 'column00', 'qTp', 'qPt', 'qPc', 'qP0', 'qP', 'qT', 'qTw', 'qTd', 'qU', 'qEd', 'qTx', 'qTn', 'qD', 'qFx', 'qDFx', 'qN', 'qC', 'qDn', 'timestamp']


In [20]:
%%sql
# Set the timestamps manually due to historical timezone changes not considered by DuckDB
# 24:00 UTC is used for NULL values as specified by SEF
ALTER TABLE Curacao_Temp DROP COLUMN IF EXISTS timestamp;
ALTER TABLE Curacao_Temp ADD COLUMN timestamp TIMESTAMP WITH TIME ZONE;

UPDATE Curacao_Temp AS a
SET timestamp = (
    CASE 
        -- When Time is NULL or 24, set to 24:00 UTC
        WHEN a.Tijd IS NULL OR CAST(a.Tijd AS INTEGER) = 24 THEN 
            (a.Datum + INTERVAL '24 hours') AT TIME ZONE 'UTC'
            
        -- For times >= 100, treat as HHMM format with historical timezone adjustments
        WHEN CAST(a.Tijd AS INTEGER) >= 100 THEN
            CASE
                -- Before Feb 12, 1912: UTC-04:35:24
                WHEN a.Datum < '1912-02-12' THEN 
                    (a.Datum + 
                    to_hours(CAST(a.Tijd/100 AS INTEGER)) + 
                    to_minutes(CAST(a.Tijd%100 AS INTEGER)) + 
                    INTERVAL '4 hours 35 minutes 24 seconds') AT TIME ZONE 'UTC'
                    
                -- After Feb 12, 1912: UTC-04:35:24
                ELSE 
                    (a.Datum + 
                    to_hours(CAST(a.Tijd/100 AS INTEGER)) + 
                    to_minutes(CAST(a.Tijd%100 AS INTEGER)) + 
                    INTERVAL '4 hours 30 minutes') AT TIME ZONE 'UTC'
            END
            
        -- For regular hour values, also apply historical timezone adjustments
        ELSE 
            CASE
                -- Before Feb 12, 1912: UTC-04:35:24
                WHEN a.Datum < '1912-02-12' THEN 
                    (a.Datum + 
                    to_hours(CAST(a.Tijd AS INTEGER)) + 
                    INTERVAL '4 hours 35 minutes 24 seconds') AT TIME ZONE 'UTC'
                    
                -- After Feb 12, 1912: UTC-04:30:00
                ELSE 
                    (a.Datum + 
                    to_hours(CAST(a.Tijd AS INTEGER)) + 
                    INTERVAL '4 hours 30 minutes') AT TIME ZONE 'UTC'
            END
    END
);

Unnamed: 0,Count
0,62419


In [21]:
%%sql
# Normalize column names and types for Curacao
DROP TABLE IF EXISTS Curacao;
CREATE TABLE Curacao AS
    SELECT 
        Stn, 
        Datum AS Date, 
        Tijd AS Time,
        Timestamp, 
        TRY_CAST(Rd AS INTEGER) AS Precipitation,
        qRd AS qPrecipitation, 
        CAST(F AS FLOAT) AS windSpeed,
        qF AS qWindSpeed,
        CAST(Tp AS FLOAT) AS Temperature_reading_idk,
        qTp AS qTemperature_reading_idk,
        CAST(Pt AS FLOAT) AS Pressure_reading_idk,
        qPt AS qPressure_reading_idk,
        CAST(Pc AS FLOAT) AS Pressure_correction_idk,
        qPc AS qPressure_correction_idk,
        CAST(P0 AS FLOAT) AS Pressure_station_idk,
        qP0 AS qPressure_station_idk,
        CAST(P AS FLOAT) AS Pressure,
        qP AS qPressure,
        CAST(Td AS FLOAT) AS dewPointTemperature,
        qTd AS qDewPointTemperature,
        CAST(U AS FLOAT) AS Humidity,
        qU AS qHumidity,
        CAST(Ed AS FLOAT) AS VapourPressure,
        qEd AS qVaporPressure,
        CAST(Tx AS FLOAT) AS maxTemperature,
        qTx AS qMaxTemperature,
        CAST(Tn AS FLOAT) AS minTemperature,
        qTn AS qMinTemperature,
        CAST(D AS INTEGER) AS windDirection,
        qD AS qWindDirection,
        CAST(Dn AS INTEGER) AS cloudDirection_idk,
        qDn AS qCloudDirection
    FROM Curacao_Temp;

Unnamed: 0,Count
0,62419


In [22]:
%%sql
FROM(SUMMARIZE Curacao)

Unnamed: 0,column_name,column_type,min,max,approx_unique,avg,std,q25,q50,q75,count,null_percentage
0,Stn,BIGINT,78989,78989,1,78989.0,0.0,78989,78989,78989,62419,0.0
1,Date,DATE,1898-01-01,1919-12-31,7321,,,1899-10-14,1901-07-28,1903-04-11,62419,0.0
2,Time,DOUBLE,1.0,800.0,32,126.70981592143418,265.6118209818312,7.813393298163388,14.960846093690865,21.98686014625229,62419,0.0
3,timestamp,TIMESTAMP WITH TIME ZONE,1898-01-01 05:35:24+00,1919-12-31 11:30:00+00,56029,,,1899-10-14 16:38:54.25951+00,1901-07-28 17:31:48.679486+00,1903-04-11 23:09:03.974464+00,62419,0.0
4,Precipitation,INTEGER,0,114,40,1.028664072632944,4.084759413741208,0,0,0,62419,87.65
5,qPrecipitation,BIGINT,0,7,2,4.871288213594361,3.2202402691068706,0,7,7,62419,56.14
6,windSpeed,FLOAT,0.0,27.0,56,10.873874289181902,4.292810269060038,7.975645344434127,11.0,14.0,62419,32.58
7,qWindSpeed,BIGINT,0,7,3,2.2830548390714367,3.281238405705362,0,0,7,62419,0.0
8,Temperature_reading_idk,FLOAT,23.9,31.4,30,27.9395871797601,1.4361914370718196,26.899999618530273,27.799999237060547,28.899999618530273,62419,96.58
9,qTemperature_reading_idk,BIGINT,0,7,2,5.486256212597627,2.881948767624004,7,7,7,62419,84.21


#### Saba

In [23]:
%%sql
# Load the Saba_Temp table from CSV files
DROP TABLE IF EXISTS Saba_Temp;
CREATE TABLE IF NOT EXISTS Saba_Temp AS
(SELECT *  FROM
  (SELECT * FROM read_csv('data/CSVs/Saba/*Regen.csv', union_by_name = true, dateformat = '%Y%m%d')) t1
FULL JOIN 
  (SELECT * FROM read_csv('data/CSVs/Saba/*Overig.csv', union_by_name = true, dateformat = '%Y%m%d')) t2
USING (Stn, Datum))
;

Unnamed: 0,Count
0,38726


In [24]:
# Update Saba_Temp table to set '—' values to NULL
cursor = conn.execute("PRAGMA table_info('Saba_Temp');")
columns = [row[1] for row in cursor.fetchall()]
failed_columns = []

for column in columns:
    update_query = f"""
    UPDATE Saba_Temp
    SET {column} = NULL
    WHERE TRIM({column}) = '—';
    """
    try:
        conn.execute(update_query)
    except Exception:
        failed_columns.append(column)
        continue
conn.commit()
print(f"Failed to update columns: {failed_columns}")

Failed to update columns: ['column0', 'Stn', 'Datum', 'Niveau', 'qRd', 'column0_1', 'Tijd', 'qP', 'qTn', 'qTx', 'qT']


In [25]:
%%sql
# Set the timestamps manually due to historical timezone changes not considered by DuckDB
# Use 24:00 UTC for NULL values as specified by SEF
ALTER TABLE Saba_Temp DROP COLUMN IF EXISTS timestamp;
ALTER TABLE Saba_Temp ADD COLUMN timestamp TIMESTAMP WITH TIME ZONE;

UPDATE Saba_Temp AS a
SET timestamp = (
    CASE 
        -- When Time is NULL or 24, set to 24:00 UTC
        WHEN a.Tijd IS NULL OR CAST(a.Tijd AS INTEGER) = 24 THEN 
            (a.Datum + INTERVAL '24 hours') AT TIME ZONE 'UTC'

        -- For regular hour values, also apply historical timezone adjustments
        ELSE 
            CASE
                -- Before Feb 12, 1912: UTC-04:35:24
                WHEN a.Datum < '1912-02-12' THEN 
                    (a.Datum + 
                    to_hours(CAST(FLOOR(CAST(a.Tijd AS FLOAT)) AS INTEGER)) + 
                    to_minutes(CAST(ROUND((CAST(a.Tijd AS FLOAT) - FLOOR(CAST(a.Tijd AS FLOAT))) * 100) AS INTEGER)) + 
                    INTERVAL '4 hours 35 minutes 47 seconds') AT TIME ZONE 'UTC'
                    
                -- After Feb 12, 1912: UTC-04:30:00
                ELSE 
                    (a.Datum + 
                    to_hours(CAST(FLOOR(CAST(a.Tijd AS FLOAT)) AS INTEGER)) + 
                    to_minutes(CAST(ROUND((CAST(a.Tijd AS FLOAT) - FLOOR(CAST(a.Tijd AS FLOAT))) * 100) AS INTEGER)) + 
                    INTERVAL '4 hours 30 minutes') AT TIME ZONE 'UTC'
            END
    END
);

Unnamed: 0,Count
0,38726


In [26]:
%%sql
# Normalize column names and types for Saba
DROP TABLE IF EXISTS Saba;
CREATE TABLE Saba AS
    SELECT 
        Stn, 
        Datum AS Date, 
        Tijd AS Time,
        Timestamp, 
        CAST(Rd AS INTEGER) AS Precipitation,
        qRd AS qPrecipitation,
        CAST(P AS FLOAT) AS Pressure,
        qP AS qPressure,
        CAST(Tn AS FLOAT) AS minTemperature,
        qTn AS qMinTemperature,
        CAST(Tx AS FLOAT) AS maxTemperature,
        qTx AS qMaxTemperature,
        CAST(T AS FLOAT) AS dryBulbTemperature,
        qT AS qDryBulbTemperature
    FROM Saba_Temp;

Unnamed: 0,Count
0,38726


In [27]:
%sql FROM(SUMMARIZE Saba)

Unnamed: 0,column_name,column_type,min,max,approx_unique,avg,std,q25,q50,q75,count,null_percentage
0,Stn,BIGINT,81,83,3,81.5659763466405,0.7892913106056159,81,81,82,38726,0.0
1,Date,DATE,1920-01-01,1945-12-31,7080,,,1929-02-09,1934-05-24,1939-09-12,38726,0.0
2,Time,DOUBLE,7.45,24.0,5,13.535755162982028,5.0045606534180855,7.45,13.45,19.45,38726,37.73
3,timestamp,TIMESTAMP WITH TIME ZONE,1920-01-01 12:30:00+00,1946-01-01 01:15:00+01,39806,,,1929-02-09 21:38:43.93848+00,1934-05-25 06:43:36.399554+01,1939-09-12 11:26:43.408964+01,38726,0.0
4,Precipitation,INTEGER,0,2300,381,30.76948390460055,91.2521256996843,0,0,25,38726,20.75
5,qPrecipitation,BIGINT,0,7,3,1.4528223932241904,2.838558206621826,0,0,0,38726,0.0
6,Pressure,FLOAT,458.0,762.8,147,751.7242197617294,6.0678143898389125,747.221051685782,751.3480181866283,756.8001833904622,38726,51.49
7,qPressure,BIGINT,0,7,2,1.5466533963672555,2.90427044781216,0,0,0,38726,37.73
8,minTemperature,FLOAT,22.3,29.3,30,25.76448079927372,1.432536066271169,24.955555386013454,26.0,27.0,38726,99.05
9,qMinTemperature,BIGINT,0,7,2,4.666666666666667,3.301335328297214,0,7,7,38726,97.16


#### Statia

In [28]:
%%sql
# Load the Statia_Temp table from CSV files
DROP TABLE IF EXISTS Statia_Temp;
CREATE TABLE IF NOT EXISTS Statia_Temp AS
(SELECT *  FROM
  (SELECT * FROM read_csv('data/CSVs/Statia/*Regen.csv', union_by_name = true, dateformat = '%Y%m%d')) t1
FULL JOIN 
  (SELECT * FROM read_csv('data/CSVs/Statia/*Overig.csv', union_by_name = true, dateformat = '%Y%m%d')) t2
USING (Stn, Datum))
;

Unnamed: 0,Count
0,4383


In [29]:
# Update Statia_Temp table to set '—' values to NULL
cursor = conn.execute("PRAGMA table_info('Statia_Temp');")
columns = [row[1] for row in cursor.fetchall()]
failed_columns = []

for column in columns:
    update_query = f"""
    UPDATE Statia_Temp
    SET {column} = NULL
    WHERE TRIM({column}) = '—';
    """
    try:
        conn.execute(update_query)
    except Exception:
        failed_columns.append(column)
        continue
conn.commit()
print(f"Failed to update columns: {failed_columns}")

Failed to update columns: ['column0', 'Stn', 'Datum', 'Niveau', 'qRd', 'column0_1', 'Tijd', 'qP']


In [30]:
%%sql
# Set the timestamps manually due to historical timezone changes not considered by DuckDB
# Use 24:00 UTC for NULL values as specified by SEF
ALTER TABLE Statia_Temp DROP COLUMN IF EXISTS timestamp;
ALTER TABLE Statia_Temp ADD COLUMN timestamp TIMESTAMP WITH TIME ZONE;

UPDATE Statia_Temp AS a
SET timestamp = (
    CASE 
        -- When Time is NULL or 24, set to 24:00 UTC
        WHEN a.Tijd IS NULL OR CAST(a.Tijd AS INTEGER) = 24 THEN 
            (a.Datum + INTERVAL '24 hours') AT TIME ZONE 'UTC'

        ELSE 
            (a.Datum + 
            to_hours(CAST(a.Tijd AS INTEGER)) + 
            INTERVAL '4 hours 30 minutes') AT TIME ZONE 'UTC'
    END
);
            

Unnamed: 0,Count
0,4383


In [31]:
%%sql
# Normalize column names and types for Statia
DROP TABLE IF EXISTS Statia;
CREATE TABLE Statia AS
    SELECT 
        Stn, 
        Datum AS Date, 
        Tijd AS Time,
        Timestamp, 
        CAST(Rd AS INTEGER) AS Precipitation,
        qRd AS qPrecipitation,
        CAST(P AS FLOAT) AS Pressure,
        qP AS qPressure
    FROM Statia_Temp;

Unnamed: 0,Count
0,4383


In [32]:
%sql FROM(SUMMARIZE Statia)

Unnamed: 0,column_name,column_type,min,max,approx_unique,avg,std,q25,q50,q75,count,null_percentage
0,Stn,BIGINT,73,73,1,73.0,0.0,73,73,73,4383,0.0
1,Date,DATE,1943-01-01,1946-12-31,1732,,,1944-01-01,1944-12-31,1945-12-31,4383,0.0
2,Time,DOUBLE,8.0,16.0,3,12.0,3.2663589617774624,8.0,12.0,16.0,4383,0.0
3,timestamp,TIMESTAMP WITH TIME ZONE,1943-01-01 13:30:00+01,1946-12-31 21:30:00+01,3738,,,1944-01-01 17:17:16.86636+01,1945-01-01 05:38:34.285714+01,1946-01-01 04:01:35.032145+01,4383,0.0
4,Precipitation,INTEGER,0,915,202,26.069335239456755,81.87703646299815,0,0,13,4383,4.24
5,qPrecipitation,BIGINT,0,7,2,0.2970568104038329,1.4112438904436446,0,0,0,4383,0.0
6,Pressure,FLOAT,755.0,766.0,13,760.980177360459,1.8670931740301857,760.0,761.0,762.0,4383,56.26
7,qPressure,BIGINT,0,7,2,3.938398357289528,3.4728314885410096,0,7,7,4383,0.0


#### Suriname

For Suriname, columns are split according to measurement time. Measurement time varies across the years. We create one table per set of measurement times, and join them later on.

In [33]:
%%sql
# Load the Suriname_1896 table from CSV files
# Rename column names due to two-level header in the CSV files
DROP TABLE IF EXISTS Suriname_1896;
CREATE TABLE IF NOT EXISTS Suriname_1896 AS
SELECT
    column00 AS "column00",
    column01 AS "Datum",
    column02 AS "Luchtdruk_8",
    column03 AS "Luchtdruk_12",
    column04 AS "Luchtdruk_18",
    column05 AS "Temp_droog_8",
    column06 AS "Temp_nat_8",
    column07 AS "Temp_verschil_8",
    column08 AS "Temp_droog_12",
    column09 AS "Temp_nat_12",
    column10 AS "Temp_verschil_12",
    column11 AS "Temp_droog_18",
    column12 AS "Temp_nat_18",
    column13 AS "Temp_verschil_18",
    column14 AS "Temp_min",
    column15 AS "Temp_max",
    column16 AS "Dampdruk_8",
    column17 AS "Dampdruk_12",
    column18 AS "Dampdruk_18",
    column19 AS "Rel_vocht_8",
    column20 AS "Rel_vocht_12",
    column21 AS "Rel_vocht_18",
    column22 AS "Wind_richt_8",
    column23 AS "Wind_kracht_8",
    column24 AS "Wind_richt_12",
    column25 AS "Wind_kracht_12",
    column26 AS "Wind_richt_18",
    column27 AS "Wind_kracht_18",
    column28 AS "Bewolking_8",
    column29 AS "Bewolking_12",
    column30 AS "Bewolking_18",
    column31 AS "Regen",
    column32 AS "Regen_wrong",
FROM read_csv('data/CSVs/Suriname/1896-1898/*.csv', union_by_name = true, dateformat = '%Y%m%d', skip = 2, null_padding = true, ignore_errors = true);
SELECT * FROM Suriname_1896 LIMIT 5;


Unnamed: 0,column00,Datum,Luchtdruk_8,Luchtdruk_12,Luchtdruk_18,Temp_droog_8,Temp_nat_8,Temp_verschil_8,Temp_droog_12,Temp_nat_12,...,Wind_kracht_8,Wind_richt_12,Wind_kracht_12,Wind_richt_18,Wind_kracht_18,Bewolking_8,Bewolking_12,Bewolking_18,Regen,Regen_wrong
0,0,18960101.0,,,,,,0.0,,,...,,,,,,,,,,
1,1,18960102.0,,,,,,0.0,,,...,,,,,,,,,,
2,2,18960103.0,,,,,,0.0,,,...,,,,,,,,,,
3,3,18960104.0,,,,,,0.0,,,...,,,,,,,,,,
4,4,18960105.0,,,,,,0.0,,,...,,,,,,,,,,


In [34]:
%%sql 
# Load the Suriname_1899 table from CSV files
DROP TABLE IF EXISTS Suriname_1899;
CREATE TABLE IF NOT EXISTS Suriname_1899 AS
SELECT
    column00 AS "column00",
    column01 AS "Datum",
    column02 AS "Luchtdruk_8",
    column03 AS "Luchtdruk_14",
    column04 AS "Luchtdruk_19",
    column05 AS "Temp_droog_8",
    column06 AS "Temp_nat_8",
    column07 AS "Temp_verschil_8",
    column08 AS "Temp_droog_14",
    column09 AS "Temp_nat_14",
    column10 AS "Temp_verschil_14",
    column11 AS "Temp_droog_19",
    column14 AS "Temp_nat_19",
    column13 AS "Temp_verschil_19",
    column14 AS "Temp_min",
    column15 AS "Temp_max",
    column16 AS "Dampdruk_8",
    column17 AS "Dampdruk_14",
    column18 AS "Dampdruk_19",
    column19 AS "Rel_vocht_8",
    column20 AS "Rel_vocht_14",
    column21 AS "Rel_vocht_19",
    column22 AS "Wind_richt_8",
    column23 AS "Wind_kracht_8",
    column24 AS "Wind_richt_14",
    column25 AS "Wind_kracht_14",
    column26 AS "Wind_richt_19",
    column27 AS "Wind_kracht_19",
    column28 AS "Bewolking_8",
    column29 AS "Bewolking_14",
    column30 AS "Bewolking_19",
    column31 AS "Regen",
    column32 AS "Regen_wrong",
FROM read_csv('data/CSVs/Suriname/1899-1904/*.csv', union_by_name = true, dateformat = '%Y%m%d', skip = 2, null_padding = true, ignore_errors = true)
WHERE column01 IS NOT NULL;
SELECT * FROM Suriname_1899 LIMIT 5;

Unnamed: 0,column00,Datum,Luchtdruk_8,Luchtdruk_14,Luchtdruk_19,Temp_droog_8,Temp_nat_8,Temp_verschil_8,Temp_droog_14,Temp_nat_14,...,Wind_kracht_8,Wind_richt_14,Wind_kracht_14,Wind_richt_19,Wind_kracht_19,Bewolking_8,Bewolking_14,Bewolking_19,Regen,Regen_wrong
0,0,18990101.0,620.0,605.0,609.0,239.0,,239.0,262.0,,...,2.0,NE,6.0,N,4.0,9.0,9.0,10.0,127.0,
1,1,18990102.0,625.0,608.0,615.0,241.0,,241.0,275.0,,...,2.0,NE,5.0,N,4.0,9.0,9.0,10.0,93.0,
2,2,18990103.0,619.0,604.0,609.0,245.0,,245.0,272.0,,...,6.0,ENE,7.0,NE,2.0,8.0,9.0,10.0,259.0,
3,3,18990104.0,616.0,605.0,601.0,246.0,,246.0,247.0,,...,1.0,SE,1.0,NNW,2.0,9.0,10.0,3.0,177.0,
4,4,18990105.0,605.0,596.0,598.0,220.0,,220.0,267.0,,...,1.0,NE,5.0,NNE,4.0,7.0,9.0,4.0,40.0,


In [35]:
%%sql 
# Load the Suriname_1905 table from CSV files
DROP TABLE IF EXISTS Suriname_1905;
CREATE TABLE IF NOT EXISTS Suriname_1905 AS
SELECT
    column00 AS "column00",
    column01 AS "Datum",
    column02 AS "Luchtdruk_8",
    column03 AS "Luchtdruk_14",
    column04 AS "Luchtdruk_18",
    column05 AS "Luchtdruk_19",
    column06 AS "Temp_droog_8",
    column07 AS "Temp_droog_14",
    column08 AS "Temp_droog_18",
    column09 AS "Temp_droog_19",
    column10 AS "Temp_max",
    column11 AS "Temp_min",
    column12 AS "Dampdruk_8",
    column13 AS "Dampdruk_14",
    column14 AS "Dampdruk_18",
    column15 AS "Dampdruk_19",
    column16 AS "Rel_vocht_8",
    column17 AS "Rel_vocht_14",
    column18 AS "Rel_vocht_18",
    column19 AS "Rel_vocht_19",
    column20 AS "Wind_richt_8",
    column21 AS "Wind_kracht_8",
    column22 AS "Wind_richt_14",
    column23 AS "Wind_kracht_14",
    column24 AS "Wind_richt_18",
    column25 AS "Wind_kracht_18",
    column26 AS "Wind_richt_19",
    column27 AS "Wind_kracht_19",
    column28 AS "Bewolking_8",
    column29 AS "Bewolking_14",
    column30 AS "Bewolking_18",
    column31 AS "Bewolking_19",
    column32 AS "Regen",
    column33 AS "Regen_wrong",
FROM read_csv('data/CSVs/Suriname/1905/*.csv', union_by_name = true, dateformat = '%Y%m%d', skip = 2, null_padding = true, ignore_errors = true)
WHERE column01 IS NOT NULL;
SELECT * FROM Suriname_1905 WHERE Temp_droog_8 > 40;


Unnamed: 0,column00,Datum,Luchtdruk_8,Luchtdruk_14,Luchtdruk_18,Luchtdruk_19,Temp_droog_8,Temp_droog_14,Temp_droog_18,Temp_droog_19,...,Wind_richt_18,Wind_kracht_18,Wind_richt_19,Wind_kracht_19,Bewolking_8,Bewolking_14,Bewolking_18,Bewolking_19,Regen,Regen_wrong
0,0,19050101.0,624.0,617.0,,612.0,242.0,267.0,,248.0,...,,,S,3.0,10.0,10.0,,7.0,175.0,
1,1,19050102.0,617.0,614.0,,604.0,241.0,266.0,,251.0,...,,,ESE,3.0,10.0,10.0,,8.0,50.0,
2,2,19050103.0,616.0,604.0,,599.0,242.0,310.0,,257.0,...,,,E,6.0,6.0,5.0,,6.0,7.0,
3,3,19050104.0,623.0,613.0,,605.0,252.0,312.0,,262.0,...,,,E,6.0,7.0,6.0,,7.0,5.0,
4,4,19050105.0,617.0,605.0,,607.0,266.0,281.0,,256.0,...,,,ESE,4.0,7.0,8.0,,5.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
329,371,19051227.0,638.0,624.0,618.0,,227.0,258.0,230.0,,...,N,1.0,,,10.0,10.0,8.0,,20.0,
330,372,19051228.0,624.0,618.0,616.0,,242.0,254.0,240.0,,...,NE,3.0,,,5.0,8.0,10.0,,105.0,
331,373,19051229.0,614.0,608.0,610.0,,240.0,243.0,241.0,,...,NNE,0.0,,,8.0,10.0,9.0,,60.0,
332,374,19051230.0,610.0,596.0,597.0,,251.0,268.0,258.0,,...,NNE,5.0,,,8.0,7.0,6.0,,115.0,


In [36]:
%%sql 
# Load the Suriname_Temp table from CSV files (after 1905)
DROP TABLE IF EXISTS Suriname_Temp;
CREATE TABLE IF NOT EXISTS Suriname_Temp AS
SELECT
    column00 AS "column00",
    column01 AS "Datum",
    column02 AS "Luchtdruk_8",
    column03 AS "Luchtdruk_14",
    column04 AS "Luchtdruk_18",
    column05 AS "Temp_droog_8",
    column06 AS "Temp_droog_14",
    column07 AS "Temp_droog_18",
    column08 AS "Temp_max",
    column09 AS "Temp_min",
    column10 AS "Dampdruk_8",
    column11 AS "Dampdruk_14",
    column12 AS "Dampdruk_18",
    column13 AS "Rel_vocht_8",
    column14 AS "Rel_vocht_14",
    column15 AS "Rel_vocht_18",
    column16 AS "Wind_richt_8",
    column17 AS "Wind_kracht_8",
    column18 AS "Wind_richt_14",
    column19 AS "Wind_kracht_14",
    column20 AS "Wind_richt_18",
    column21 AS "Wind_kracht_18",
    column22 AS "Bewolking_8",
    column23 AS "Bewolking_14",
    column24 AS "Bewolking_18",
    column25 AS "Regen",
    column26 AS "Regen_wrong",
FROM read_csv('data/CSVs/Suriname/*.csv', union_by_name = true, dateformat = '%Y%m%d', skip = 2, null_padding = true, ignore_errors = true);
SELECT * FROM Suriname_Temp LIMIT 5;

Unnamed: 0,column00,Datum,Luchtdruk_8,Luchtdruk_14,Luchtdruk_18,Temp_droog_8,Temp_droog_14,Temp_droog_18,Temp_max,Temp_min,...,Wind_kracht_8,Wind_richt_14,Wind_kracht_14,Wind_richt_18,Wind_kracht_18,Bewolking_8,Bewolking_14,Bewolking_18,Regen,Regen_wrong
0,0,18960101.0,,,,,,0.0,,,...,,,,,,,,,,
1,1,18960102.0,,,,,,0.0,,,...,,,,,,,,,,
2,2,18960103.0,,,,,,0.0,,,...,,,,,,,,,,
3,3,18960104.0,,,,,,0.0,,,...,,,,,,,,,,
4,4,18960105.0,,,,,,0.0,,,...,,,,,,,,,,


In [37]:
%%sql
# Compare table columns to identify columns present for join
WITH union_tables AS (
    -- Get columns from first table 
    SELECT 'Suriname_1896' AS table_name, 
           column_name
    FROM information_schema.columns
    WHERE table_name = 'Suriname_1896'
    
    UNION ALL
    
    -- Get columns from second table
    SELECT 'Suriname_1899' AS table_name, 
           column_name
    FROM information_schema.columns
    WHERE table_name = 'Suriname_1899'
    
    UNION ALL
    

    SELECT 'Suriname_1905' AS table_name, 
           column_name
    FROM information_schema.columns
    WHERE table_name = 'Suriname_1905'
    
    UNION ALL
    
    SELECT 'Suriname_Temp' AS table_name, 
           column_name
    FROM information_schema.columns
    WHERE table_name = 'Suriname_Temp'
)

SELECT column_name,
       MAX(CASE WHEN table_name = 'Suriname_1896' THEN 1 ELSE 0 END) AS in_1896,
       MAX(CASE WHEN table_name = 'Suriname_1899' THEN 1 ELSE 0 END) AS in_1899,
       MAX(CASE WHEN table_name = 'Suriname_1905' THEN 1 ELSE 0 END) AS in_1905,
       MAX(CASE WHEN table_name = 'Suriname_Temp' THEN 1 ELSE 0 END) AS in_temp
FROM union_tables
GROUP BY column_name
ORDER BY column_name;

Unnamed: 0,column_name,in_1896,in_1899,in_1905,in_temp
0,Bewolking_12,1,0,0,0
1,Bewolking_14,0,1,1,1
2,Bewolking_18,1,0,1,1
3,Bewolking_19,0,1,1,0
4,Bewolking_8,1,1,1,1
5,Dampdruk_12,1,0,0,0
6,Dampdruk_14,0,1,1,1
7,Dampdruk_18,1,0,1,1
8,Dampdruk_19,0,1,1,0
9,Dampdruk_8,1,1,1,1


In [38]:
%%sql
# Create Suriname_Combined table by combining the Suriname tables
# Use NULL for missing columns in each table
DROP TABLE IF EXISTS Suriname_Combined;
CREATE TABLE IF NOT EXISTS Suriname_Combined AS
SELECT * FROM (
SELECT 
    Bewolking_12,
    NULL AS Bewolking_14,
    Bewolking_18,
    NULL AS Bewolking_19,
    Bewolking_8,
    Dampdruk_12,
    NULL AS Dampdruk_14,
    Dampdruk_18,
    NULL AS Dampdruk_19,
    Dampdruk_8,
    Datum,
    Luchtdruk_12,
    NULL AS Luchtdruk_14,
    Luchtdruk_18,
    NULL AS Luchtdruk_19,
    Luchtdruk_8,
    Regen,
    Regen_wrong,
    Rel_vocht_12,
    NULL AS Rel_vocht_14,
    Rel_vocht_18,
    NULL AS Rel_vocht_19,
    Rel_vocht_8,
    Temp_droog_12,
    NULL AS Temp_droog_14,
    Temp_droog_18,
    NULL AS Temp_droog_19,
    Temp_droog_8,
    Temp_max,
    Temp_min,
    Temp_nat_12,
    NULL AS Temp_nat_14,
    Temp_nat_18,
    NULL AS Temp_nat_19,
    Temp_nat_8,
    Temp_verschil_12,
    NULL AS Temp_verschil_14,
    Temp_verschil_18,
    NULL AS Temp_verschil_19,
    Temp_verschil_8,
    Wind_kracht_12,
    NULL AS Wind_kracht_14,
    Wind_kracht_18,
    NULL AS Wind_kracht_19,
    Wind_kracht_8,
    Wind_richt_12,
    NULL AS Wind_richt_14,
    Wind_richt_18,
    NULL AS Wind_richt_19,
    Wind_richt_8,
FROM Suriname_1896

UNION ALL

SELECT 
    NULL AS Bewolking_12,
    Bewolking_14,
    NULL AS Bewolking_18,
    Bewolking_19,
    Bewolking_8,
    NULL AS Dampdruk_12,
    Dampdruk_14,
    NULL AS Dampdruk_18,
    Dampdruk_19,
    Dampdruk_8,
    Datum,
    NULL AS Luchtdruk_12,
    Luchtdruk_14,
    NULL AS Luchtdruk_18,
    Luchtdruk_19,
    Luchtdruk_8,
    Regen,
    Regen_wrong,
    NULL AS Rel_vocht_12,
    Rel_vocht_14,
    NULL AS Rel_vocht_18,
    Rel_vocht_19,
    Rel_vocht_8,
    NULL AS Temp_droog_12,
    Temp_droog_14,
    NULL AS Temp_droog_18,
    Temp_droog_19,
    Temp_droog_8,
    Temp_max,
    Temp_min,
    NULL AS Temp_nat_12,
    Temp_nat_14,
    NULL AS Temp_nat_18,
    Temp_nat_19,
    Temp_nat_8,
    NULL AS Temp_verschil_12,
    Temp_verschil_14,
    NULL AS Temp_verschil_18,
    Temp_verschil_19,
    Temp_verschil_8,
    NULL AS Wind_kracht_12,
    Wind_kracht_14,
    NULL AS Wind_kracht_18,
    Wind_kracht_19,
    Wind_kracht_8,
    NULL AS Wind_richt_12,
    Wind_richt_14,
    NULL AS Wind_richt_18,
    Wind_richt_19,
    Wind_richt_8,
FROM Suriname_1899

UNION ALL

SELECT
    NULL AS Bewolking_12,
    Bewolking_14,
    Bewolking_18,
    Bewolking_19,
    Bewolking_8,
    NULL AS Dampdruk_12,
    Dampdruk_14,
    Dampdruk_18,
    Dampdruk_19,
    Dampdruk_8,
    Datum,
    NULL AS Luchtdruk_12,
    Luchtdruk_14,
    Luchtdruk_18,
    Luchtdruk_19,
    Luchtdruk_8,
    Regen,
    Regen_wrong,
    NULL AS Rel_vocht_12,
    Rel_vocht_14,
    Rel_vocht_18,
    Rel_vocht_19,
    Rel_vocht_8,
    NULL AS Temp_droog_12,
    Temp_droog_14,
    Temp_droog_18,
    Temp_droog_19,
    Temp_droog_8,
    Temp_max,
    Temp_min,
    NULL AS Temp_nat_12,
    NULL AS Temp_nat_14,
    NULL AS Temp_nat_18,
    NULL AS Temp_nat_19,
    NULL AS Temp_nat_8,
    NULL AS Temp_verschil_12,
    NULL AS Temp_verschil_14,
    NULL AS Temp_verschil_18,
    NULL AS Temp_verschil_19,
    NULL AS Temp_verschil_8,
    NULL AS Wind_kracht_12,
    Wind_kracht_14,
    Wind_kracht_18,
    Wind_kracht_19,
    Wind_kracht_8,
    NULL AS Wind_richt_12,
    Wind_richt_14,
    Wind_richt_18,
    Wind_richt_19,
    Wind_richt_8,
FROM Suriname_1905

UNION ALL

SELECT 
    NULL AS Bewolking_12,
    Bewolking_14,
    Bewolking_18,
    NULL AS Bewolking_19,
    Bewolking_8,
    NULL AS Dampdruk_12,
    Dampdruk_14,
    Dampdruk_18,
    NULL AS Dampdruk_19,
    Dampdruk_8,
    Datum,
    NULL AS Luchtdruk_12,
    Luchtdruk_14,
    Luchtdruk_18,
    NULL AS Luchtdruk_19,
    Luchtdruk_8,
    Regen,
    Regen_wrong,
    NULL AS Rel_vocht_12,
    Rel_vocht_14,
    Rel_vocht_18,
    NULL AS Rel_vocht_19,
    Rel_vocht_8,
    NULL AS Temp_droog_12,
    Temp_droog_14,
    Temp_droog_18,
    NULL AS Temp_droog_19,
    Temp_droog_8,
    Temp_max,
    Temp_min,
    NULL AS Temp_nat_12,
    NULL AS Temp_nat_14,
    NULL AS Temp_nat_18,
    NULL AS Temp_nat_19,
    NULL AS Temp_nat_8,
    NULL AS Temp_verschil_12,
    NULL AS Temp_verschil_14,
    NULL AS Temp_verschil_18,
    NULL AS Temp_verschil_19,
    NULL AS Temp_verschil_8,
    NULL AS Wind_kracht_12,
    Wind_kracht_14,
    Wind_kracht_18,
    NULL AS Wind_kracht_19,
    Wind_kracht_8,
    NULL AS Wind_richt_12,
    Wind_richt_14,
    Wind_richt_18,
    NULL AS Wind_richt_19,
    Wind_richt_8,
FROM Suriname_Temp)
WHERE Datum IS NOT NULL;

Unnamed: 0,Count
0,28123


In [39]:
# Update Suriname_Combined table to set '-' values to NULL
cursor = conn.execute("PRAGMA table_info('Suriname_Combined');")
columns = [row[1] for row in cursor.fetchall()]
failed_columns = []

for column in columns:
    update_query = f"""
    UPDATE Suriname_Combined
    SET {column} = NULL
    WHERE TRIM({column}) = '-';
    """
    try:
        conn.execute(update_query)
    except Exception:
        failed_columns.append(column)
        continue
conn.commit()
print(f"Failed to update columns: {failed_columns}")

Failed to update columns: ['Bewolking_12', 'Bewolking_19', 'Dampdruk_12', 'Dampdruk_19', 'Datum', 'Luchtdruk_12', 'Luchtdruk_14', 'Luchtdruk_18', 'Luchtdruk_19', 'Luchtdruk_8', 'Regen', 'Rel_vocht_12', 'Rel_vocht_19', 'Temp_droog_12', 'Temp_droog_18', 'Temp_droog_19', 'Temp_droog_8', 'Temp_nat_19', 'Temp_verschil_12', 'Temp_verschil_14', 'Temp_verschil_18', 'Temp_verschil_19', 'Temp_verschil_8', 'Wind_kracht_12', 'Wind_kracht_19']


In [40]:
%%sql
-- First, add a new column to store the formatted date
ALTER TABLE Suriname_Combined ADD COLUMN Date DATE;

-- Then update the column with the properly formatted date
UPDATE Suriname_Combined
SET Date = make_date(
    CAST(SUBSTRING(CAST(Datum AS VARCHAR), 1, 4) AS INTEGER),  -- Year: first 4 chars
    CAST(SUBSTRING(CAST(Datum AS VARCHAR), 5, 2) AS INTEGER),  -- Month: chars 5-6
    CAST(SUBSTRING(CAST(Datum AS VARCHAR), 7, 2) AS INTEGER)   -- Day: chars 7-8
);

Unnamed: 0,Count
0,28123


In [41]:
%%sql
# Take the measurements at different times and convert them to a single column and a timestamp
DROP TABLE IF EXISTS Suriname_Long;
CREATE TABLE Suriname_Long AS
SELECT * FROM 
    (
    -- 8:00 measurements
    SELECT
        datum,
        '08:00:00' AS time_of_day,
        (Date + INTERVAL '8 hours') AT TIME ZONE 'America/Paramaribo' AS Timestamp,
        TRY_CAST(Luchtdruk_8 AS FLOAT) AS pressure,
        TRY_CAST(Temp_droog_8 AS FLOAT) AS dry_temperature,
        TRY_CAST(Temp_nat_8 AS FLOAT) AS wet_temperature,
        TRY_CAST(Temp_verschil_8 AS FLOAT) AS temp_difference,
        TRY_CAST(Dampdruk_8 AS FLOAT) AS vapor_pressure,
        TRY_CAST(Rel_vocht_8 AS INTEGER) AS relative_humidity,
        TRY_CAST(Wind_richt_8 AS VARCHAR) AS wind_direction,
        TRY_CAST(Wind_kracht_8 AS FLOAT) AS wind_force,
        TRY_CAST(Bewolking_8 AS INTEGER) AS cloud_cover,
        NULL AS minTemperature,
        NULL AS maxTemperature,
        NULL AS precipitation
    FROM Suriname_Combined

    UNION ALL

    -- 12:00 measurements
    SELECT
        datum,
        '12:00:00' AS time_of_day,
        (Date + INTERVAL '12 hours') AT TIME ZONE 'America/Paramaribo' AS Timestamp,
        TRY_CAST(Luchtdruk_12 AS FLOAT) AS pressure,
        TRY_CAST(Temp_droog_12 AS FLOAT) AS dry_temperature,
        TRY_CAST(Temp_nat_12 AS FLOAT) AS wet_temperature,
        TRY_CAST(Temp_verschil_12 AS FLOAT) AS temp_difference,
        TRY_CAST(Dampdruk_12 AS FLOAT) AS vapor_pressure,
        TRY_CAST(Rel_vocht_12 AS INTEGER) AS relative_humidity,
        TRY_CAST(Wind_richt_12 AS VARCHAR) AS wind_direction,
        TRY_CAST(Wind_kracht_12 AS FLOAT) AS wind_force,
        TRY_CAST(Bewolking_12 AS INTEGER) AS cloud_cover,
        NULL AS minTemperature,
        NULL AS maxTemperature,
        NULL AS precipitation
    FROM Suriname_Combined


    UNION ALL

    -- 14:00 measurements
    SELECT
        datum,
        '14:00:00' AS time_of_day,
        (Date + INTERVAL '14 hours') AT TIME ZONE 'America/Paramaribo' AS Timestamp,
        TRY_CAST(Luchtdruk_14 AS FLOAT) AS pressure,
        TRY_CAST(Temp_droog_14 AS FLOAT) AS dry_temperature,
        TRY_CAST(Temp_nat_14 AS FLOAT) AS wet_temperature,
        TRY_CAST(Temp_verschil_14 AS FLOAT) AS temp_difference,
        TRY_CAST(Dampdruk_14 AS FLOAT) AS vapor_pressure,
        TRY_CAST(Rel_vocht_14 AS INTEGER) AS relative_humidity,
        TRY_CAST(Wind_richt_14 AS VARCHAR) AS wind_direction,
        TRY_CAST(Wind_kracht_14 AS FLOAT) AS wind_force,
        TRY_CAST(Bewolking_14 AS INTEGER) AS cloud_cover,
        NULL AS minTemperature,
        NULL AS maxTemperature,
        NULL AS precipitation
    FROM Suriname_Combined


    UNION ALL

    -- 18:00 measurements
    SELECT
        datum,
        '18:00:00' AS time_of_day,
        (Date + INTERVAL '18 hours') AT TIME ZONE 'America/Paramaribo' AS Timestamp,
        TRY_CAST(Luchtdruk_18 AS FLOAT) AS pressure,
        TRY_CAST(Temp_droog_18 AS FLOAT) AS dry_temperature,
        TRY_CAST(Temp_nat_18 AS FLOAT) AS wet_temperature,
        TRY_CAST(Temp_verschil_18 AS FLOAT) AS temp_difference,
        TRY_CAST(Dampdruk_18 AS FLOAT) AS vapor_pressure,
        TRY_CAST(Rel_vocht_18 AS INTEGER) AS relative_humidity,
        TRY_CAST(Wind_richt_18 AS VARCHAR) AS wind_direction,
        TRY_CAST(Wind_kracht_18 AS FLOAT) AS wind_force,
        TRY_CAST(Bewolking_18 AS INTEGER) AS cloud_cover,
        NULL AS minTemperature,
        NULL AS maxTemperature,
        NULL AS precipitation
    FROM Suriname_Combined

    UNION ALL

    -- 19:00 measurements
    SELECT
        datum,
        '19:00:00' AS time_of_day,
        (Date + INTERVAL '19 hours') AT TIME ZONE 'America/Paramaribo' AS Timestamp,
        TRY_CAST(Luchtdruk_19 AS FLOAT) AS pressure,
        TRY_CAST(Temp_droog_19 AS FLOAT) AS dry_temperature,
        TRY_CAST(Temp_nat_19 AS FLOAT) AS wet_temperature,
        TRY_CAST(Temp_verschil_19 AS FLOAT) AS temp_difference,
        TRY_CAST(Dampdruk_19 AS FLOAT) AS vapor_pressure,
        TRY_CAST(Rel_vocht_19 AS INTEGER) AS relative_humidity,
        TRY_CAST(Wind_richt_19 AS VARCHAR) AS wind_direction,
        TRY_CAST(Wind_kracht_19 AS FLOAT) AS wind_force,
        TRY_CAST(Bewolking_19 AS INTEGER) AS cloud_cover,
        NULL AS minTemperature,
        NULL AS maxTemperature,
        NULL AS precipitation
    FROM Suriname_Combined

    UNION ALL

    SELECT  
        datum,
        NULL AS time_of_day,
        (Date + INTERVAL '24 hours') AT TIME ZONE 'UTC' AS Timestamp,
        NULL AS pressure,
        NULL AS dry_temperature,
        NULL AS wet_temperature,
        NULL AS temp_difference,
        NULL AS vapor_pressure,
        NULL AS relative_humidity,
        NULL AS wind_direction,
        NULL AS wind_force,
        NULL AS cloud_cover,
        TRY_CAST(Temp_min AS FLOAT) AS minTemperature,
        TRY_CAST(Temp_max AS FLOAT) AS maxTemperature,
        TRY_CAST(Regen AS INTEGER) AS precipitation
    FROM Suriname_Combined
    )
    WHERE Datum IS NOT NULL;

Unnamed: 0,Count
0,168738


In [42]:
%%sql
FROM (SUMMARIZE Suriname_Long)

Unnamed: 0,column_name,column_type,min,max,approx_unique,avg,std,q25,q50,q75,count,null_percentage
0,Datum,DOUBLE,18960101.0,19621231.0,22883,19253662.388365395,204519.8554319957,19050603.632235393,19240280.041555826,19434623.581241433,168738,0.0
1,time_of_day,VARCHAR,08:00:00,19:00:00,5,,,,,,168738,16.67
2,Timestamp,TIMESTAMP WITH TIME ZONE,1896-01-01 11:40:40+00,1963-01-01 01:00:00+01,173019,,,1905-08-19 21:02:53.209925+00,1924-07-01 05:38:14.920184+01,1943-10-06 03:07:24.741008+01,168738,0.0
3,pressure,FLOAT,10.0,898.0,222,530.1347839188409,181.6803518349892,591.9383542838567,606.964696788004,617.0101398831054,168738,52.8
4,dry_temperature,FLOAT,-37.0,631.0,350,260.7193863113473,52.06433396736903,251.01106561247792,263.00915242355177,280.3840609915563,168738,51.83
5,wet_temperature,FLOAT,192.0,374.0,178,266.5476904619076,28.82095319030654,243.90644932671864,258.6622276132108,290.1984410135647,168738,96.05
6,temp_difference,FLOAT,-42.0,399.0,275,92.0057803468208,116.08537577846204,10.0,20.52753444669419,246.5944181055247,168738,94.16
7,vapor_pressure,FLOAT,-42.0,3684.0,376,215.1256961605134,42.22089773667206,207.0278476534242,217.9003266928308,228.3614768077054,168738,51.8
8,relative_humidity,INTEGER,0,374,292,98.43247085932808,58.11899601565138,75,86,93,168738,51.65
9,wind_direction,VARCHAR,Calm,WSW,226,,,,,,168738,52.67


#### Willemstad

In [43]:
%%sql
# Load the Willemstad_Temp table from CSV files
DROP TABLE IF EXISTS Willemstad_Temp;
CREATE TABLE IF NOT EXISTS Willemstad_Temp AS
SELECT * FROM read_csv('data/CSVs/Willemstad/*.csv', union_by_name = true, 
dateformat = '%Y%m%d', skip=1,  columns=
{
    'column00': 'BIGINT',
    'Stn': 'BIGINT',
    'Datum': 'DATE',
    'Tijd': 'BIGINT',
    'DD': 'VARCHAR',
    'qDD': 'BIGINT',
    'FK': 'VARCHAR',
    'FF': 'VARCHAR',
    'FF1': 'VARCHAR',
    'FF2': 'VARCHAR',
    'qFF': 'BIGINT',
    'WW': 'VARCHAR',
    'qWW': 'BIGINT',
    'N': 'VARCHAR',
    'qN': 'BIGINT',
    'T': 'VARCHAR',
    'qT': 'BIGINT',
    'U': 'VARCHAR',
    'qU': 'BIGINT',
    'EE': 'VARCHAR',
    'qEE': 'BIGINT',
    'Tw': 'VARCHAR',
    'qTw': 'BIGINT',
    'P': 'VARCHAR',
    'Pt': 'VARCHAR',
    'Pt1': 'VARCHAR',
    'qPt': 'BIGINT',
    'Rd': 'VARCHAR',
    'Dr': 'VARCHAR',
    'qRd': 'BIGINT',
    'EV': 'VARCHAR',
    'qEV': 'BIGINT',
    'Trn': 'VARCHAR',
    'Tin': 'VARCHAR',
    'Tn': 'VARCHAR',
    'qTn': 'BIGINT',
    'Trx': 'VARCHAR',
    'Tix': 'VARCHAR',
    'Tx': 'VARCHAR',
    'qTx': 'BIGINT'
});

Unnamed: 0,Count
0,58803


In [44]:
# Update Willemstad_Temp table to set '//////' (and similar) values to NULL
cursor = conn.execute("PRAGMA table_info('Willemstad_Temp');")
columns = [row[1] for row in cursor.fetchall()]
failed_columns = []

for column in columns:
    update_query = f"""
    UPDATE Willemstad_Temp
    SET {column} = NULL
    WHERE TRIM({column}) = '//////'
    OR TRIM({column}) = '//////////////' 
    OR TRIM({column}) = '////////////////////////';
    """
    try:
        conn.execute(update_query)
    except Exception:
        failed_columns.append(column)
        continue
conn.commit()
print(f"Failed to update columns: {failed_columns}")

Failed to update columns: ['column00', 'Stn', 'Datum', 'Tijd', 'qDD', 'qFF', 'qWW', 'qN', 'qT', 'qU', 'qEE', 'qTw', 'qPt', 'qRd', 'qEV', 'qTn', 'qTx']


In [45]:
# Some columns contain all NULL values, so we drop them
def drop_all_null_columns(table_name):
    """Drops all columns from a table that contain only NULL values"""
    
    # Get summary statistics for the table
    query = f"SELECT column_name, null_percentage FROM (SUMMARIZE {table_name})"
    result = conn.execute(query).fetchall()
    
    # Find columns with 100% NULL values
    all_null_columns = [row[0] for row in result if row[1] == 100.0]
    
    if not all_null_columns:
        print(f"No columns with 100% NULL values found in {table_name}")
        return
    
    # Drop each column that's 100% NULL
    for column in all_null_columns:
        drop_query = f"ALTER TABLE {table_name} DROP COLUMN {column}"
        conn.execute(drop_query)
    
    conn.commit()
    print(f"Dropped {len(all_null_columns)} columns from {table_name}: {', '.join(all_null_columns)}")

# Apply to Willemstad_Temp table
drop_all_null_columns('Willemstad_Temp')

Dropped 8 columns from Willemstad_Temp: WW, P, Dr, EV, Trn, Tin, Trx, Tix


In [46]:
%%sql
# Set the timestamps manually due to historical timezone changes not considered by DuckDB
ALTER TABLE Willemstad_Temp DROP COLUMN IF EXISTS timestamp;
ALTER TABLE Willemstad_Temp ADD COLUMN timestamp TIMESTAMP WITH TIME ZONE;

UPDATE Willemstad_Temp AS a
SET timestamp = (
    CASE 
        -- When Time is NULL or 24, set to 24:00 UTC
        WHEN a.Tijd IS NULL OR CAST(a.Tijd AS INTEGER) = 2400 THEN 
            (a.Datum + INTERVAL '24 hours') AT TIME ZONE 'UTC'

        ELSE 
            CASE
                -- Before Feb 12, 1912: UTC-04:35:24
                WHEN a.Datum < '1912-02-12' THEN 
                    (a.Datum + 
                    to_hours(CAST(a.Tijd/100 AS INTEGER)) + 
                    to_minutes(CAST(a.Tijd%100 AS INTEGER)) + 
                    INTERVAL '4 hours 35 minutes 24 seconds') AT TIME ZONE 'UTC'
                    
                -- After Feb 12, 1912: UTC-04:35:24
                ELSE 
                    (a.Datum + 
                    to_hours(CAST(a.Tijd/100 AS INTEGER)) + 
                    to_minutes(CAST(a.Tijd%100 AS INTEGER)) + 
                    INTERVAL '4 hours 30 minutes') AT TIME ZONE 'UTC'
            END
    END
);
            

Unnamed: 0,Count
0,58803


In [47]:
%%sql
# Normalize column names and types for Willemstad
DROP TABLE IF EXISTS Willemstad;
CREATE TABLE Willemstad AS
    SELECT 
        Stn, 
        Datum AS Date, 
        Tijd AS Time,
        Timestamp, 
        TRY_CAST(DD AS STRING) AS windDirection,
        qDD AS qWindDirection,
        TRY_CAST(FK AS INTEGER) AS windForce, # niet in de KNMI lijst
        TRY_CAST(FF AS INTEGER) AS windForce0,
        TRY_CAST(FF1 AS INTEGER) AS windForce1,
        TRY_CAST(FF2 AS INTEGER) AS windForce2,
        qFF AS qWindForce,
        TRY_CAST(N AS INTEGER) AS cloudCover,
        qN AS qCloudCover,
        TRY_CAST(T AS FLOAT)/10.0 AS dryBulbTemperature,
        qT AS qDryBulbTemperature,
        TRY_CAST(U AS INTEGER) AS relativeHumidity,
        qU AS qRelativeHumidity,
        TRY_CAST(EE AS FLOAT)/10.0 AS vapourPressure,
        qEE AS qVapourPressure,
        TRY_CAST(Tw AS FLOAT)/10.0 AS wetBulbTemperature,
        qTw AS qWetBulbTemperature,
        TRY_CAST(Pt AS FLOAT)/100.0 AS pressure,
        qPt AS qPressure,
        TRY_CAST(Rd AS FLOAT)/10.0 AS precipitation,
        qRd AS qPrecipitation,
        TRY_CAST(Tn AS FLOAT)/10.0 AS minTemperature,
        qTn AS qMinTemperature,
        TRY_CAST(Tx AS FLOAT)/10.0 AS maxTemperature,
        qTx AS qMaxTemperature,
    FROM Willemstad_Temp;

Unnamed: 0,Count
0,58803


In [48]:
%%sql 
# Filter extreme values in vapourPressure
UPDATE Willemstad
SET vapourPressure = NULL
WHERE vapourPressure > 50;

Unnamed: 0,Count
0,1


In [49]:
%%sql 
SELECT * FROM Willemstad LIMIT 10;

Unnamed: 0,Stn,Date,Time,timestamp,windDirection,qWindDirection,windForce,windForce0,windForce1,windForce2,...,wetBulbTemperature,qWetBulbTemperature,pressure,qPressure,precipitation,qPrecipitation,minTemperature,qMinTemperature,maxTemperature,qMaxTemperature
0,78989,1910-01-01,800,1910-01-01 12:35:24+00:00,ONO,0,0.0,,,,...,20.799999,0,764.5,0,5.0,0,,7,,7
1,78989,1910-01-01,1400,1910-01-01 18:35:24+00:00,NO,0,0.0,,,,...,23.200001,0,764.0,0,,7,,7,,7
2,78989,1910-01-01,1800,1910-01-01 22:35:24+00:00,,7,,,,,...,,7,,7,,7,,7,,7
3,78989,1910-01-01,2400,1910-01-02 00:00:00+00:00,,7,,,,,...,,7,,7,,7,24.200001,0,27.799999,0
4,78989,1910-01-02,800,1910-01-02 12:35:24+00:00,ONO,0,1.0,,,,...,22.0,0,764.0,0,12.6,0,,7,,7
5,78989,1910-01-02,1400,1910-01-02 18:35:24+00:00,O,0,0.0,,,,...,24.4,0,763.5,0,,7,,7,,7
6,78989,1910-01-02,1800,1910-01-02 22:35:24+00:00,,7,,,,,...,,7,,7,,7,,7,,7
7,78989,1910-01-02,2400,1910-01-03 00:00:00+00:00,,7,,,,,...,,7,,7,,7,25.0,0,28.0,0
8,78989,1910-01-03,800,1910-01-03 12:35:24+00:00,O,0,1.0,,,,...,21.6,0,764.5,0,17.0,0,,7,,7
9,78989,1910-01-03,1400,1910-01-03 18:35:24+00:00,O,0,1.0,,,,...,24.0,0,764.0,0,,7,,7,,7


In [210]:
%%sql
# Save all tables to Parquet files
COPY
    (SELECT * FROM Aruba)
    TO 'dbs/aruba.parquet'
    (FORMAT parquet);
COPY
    (SELECT * FROM Bonaire)
    TO 'dbs/bonaire.parquet'
    (FORMAT parquet);
COPY
    (SELECT * FROM Curacao)
    TO 'dbs/curacao.parquet'
    (FORMAT parquet);
COPY
    (SELECT * FROM Saba)
    TO 'dbs/saba.parquet'
    (FORMAT parquet);
COPY
    (SELECT * FROM Statia)
    TO 'dbs/statia.parquet'
    (FORMAT parquet);
COPY
    (SELECT * FROM Suriname_Long)
    TO 'dbs/suriname.parquet'
    (FORMAT parquet);

Unnamed: 0,Count
0,168738


In [211]:
%%sql
FROM (SHOW ALL TABLES);

Unnamed: 0,database,schema,name,column_names,column_types,temporary
0,knmi_database,main,Aggregated,"[Stn, Datum, Niveau, Rd, qRd, Tijd, Tx, qTx, T...","[BIGINT, DATE, BIGINT, VARCHAR, BIGINT, DOUBLE...",False
1,knmi_database,main,Aruba,"[Stn, timestamp, Precipitation, qPrecipitation...","[BIGINT, TIMESTAMP WITH TIME ZONE, FLOAT, BIGI...",False
2,knmi_database,main,Aruba_Temp,"[Stn, Date, Time, Precipitation, qPrecipitatio...","[BIGINT, DATE, DOUBLE, VARCHAR, BIGINT, VARCHA...",False
3,knmi_database,main,Bonaire,"[Stn, Date, Time, timestamp, Precipitation, qP...","[BIGINT, DATE, DOUBLE, TIMESTAMP WITH TIME ZON...",False
4,knmi_database,main,Bonaire_Temp,"[column0, Stn, Datum, Niveau, Rd, qRd, column0...","[BIGINT, BIGINT, DATE, BIGINT, VARCHAR, BIGINT...",False
5,knmi_database,main,Curacao,"[Stn, Date, Time, timestamp, Precipitation, qP...","[BIGINT, DATE, DOUBLE, TIMESTAMP WITH TIME ZON...",False
6,knmi_database,main,Curacao_Temp,"[column0, Stn, Datum, Niveau, Rd, qRd, column0...","[BIGINT, BIGINT, DATE, BIGINT, VARCHAR, BIGINT...",False
7,knmi_database,main,Saba,"[Stn, Date, Time, timestamp, Precipitation, qP...","[BIGINT, DATE, DOUBLE, TIMESTAMP WITH TIME ZON...",False
8,knmi_database,main,Saba_Temp,"[column0, Stn, Datum, Niveau, Rd, qRd, column0...","[BIGINT, BIGINT, DATE, BIGINT, VARCHAR, BIGINT...",False
9,knmi_database,main,Statia,"[Stn, Date, Time, timestamp, Precipitation, qP...","[BIGINT, DATE, DOUBLE, TIMESTAMP WITH TIME ZON...",False


In [212]:
end_time = time.time()
print(f"Script executed in {end_time - start_time:.2f} seconds.")

Script executed in 33.05 seconds.
