In [25]:
import duckdb
import pandas as pd
from datetime import datetime
import time
start_time = time.time()

In [None]:
# Convert Holyoke pressure data from .dat to .csv format
def pressure_txt_dat_to_csv(file_path):
    with open(file_path, 'r') as file:
        lines = file.readlines()

    datalist = []

    offset = 0
    # The file has 140 groups of 33 lines each
    # The first line of this group is the year
    # The second line contains the month names
    # The next 31 lines contain the data for each day of the month
    for offset in range(140):
        year = lines[offset*33]
        months = lines[offset*33+1].split()
        data = lines[offset*33+2].split()
        print(year, months, data)
        
        for i in range(3): # for each month, iterate over the data, while getting the date and entries
            curr_month = months[i]
            for line in range(31):
                datestring = f'{year.strip()}_{curr_month}_{line+1}'
                try:
                    date = datetime.strptime(datestring, '%Y_%B_%d')
                except ValueError:
                    date = None
                data = lines[offset*33+line+2].split()
                curr_data = (date, data[2*i+1], data[2*i+2], datestring)
                datalist.append(curr_data)

    # Create a DataFrame and save it to CSV
    df = pd.DataFrame(datalist, columns=['date', 'p0', 'p1', 'datestring'])
    df.to_csv(file_path + '.csv', index=False)

pressure_txt_dat_to_csv('data/Holyoke/holyoke.pressure.dat.txt')

1786
 ['January', 'February', 'March'] ['1', '1021.8', '1015.6', '1027.3', '1029.0', '1007.8', '1008.8']
1786
 ['April', 'May', 'June'] ['1', '1020.1', '1014.7', '997.8', '998.1', '1016.5', '1015.0']
1786
 ['July', 'August', 'September'] ['1', '1017.3', '1013.2', '1016.4', '1014.6', '1012.6', '1015.4']
1786
 ['October', 'November', 'December'] ['1', '1013.5', '1010.5', '1011.1', '1016.7', '1007.5', '1006.5']
1787
 ['January', 'February', 'March'] ['1', '1015.2', '1015.6', '1028.7', '1025.5', '995.4', '999.7']
1787
 ['April', 'May', 'June'] ['1', '1019.4', '1008.9', '996.2', '1004.0', '995.9', '997.6']
1787
 ['July', 'August', 'September'] ['1', '1011.6', '1009.8', '1014.9', '1012.5', '1015.9', '1008.5']
1787
 ['October', 'November', 'December'] ['1', '1026.9', '1021.8', '1008.2', '1016.8', '1001.4', '998.4']
1788
 ['January', 'February', 'March'] ['1', '1017.3', '1008.3', '1019.1', '1021.5', '1005.9', '1003.7']
1788
 ['April', 'May', 'June'] ['1', '1010.1', '1008.9', '1012.9', '1007.7'

In [27]:
def temperature_txt_dat_to_csv(file_path):
    with open(file_path, 'r') as file:
        lines = file.readlines()

    datalist = []

    offset = 0
    for offset in range(176):
        year = lines[offset*33]
        months = lines[offset*33+1].split()
        data = lines[offset*33+2].split()
        
        for i in range(3):
            curr_month = months[i]
            for line in range(31):
                datestring = f'{year.strip()}_{curr_month}_{line+1}'
                try:
                    date = datetime.strptime(datestring, '%Y_%B_%d')
                except ValueError:
                    date = None
                data = lines[offset*33+line+2].split()
                curr_data = (date, data[4*i+1], data[4*i+2], 
                             data[4*i+3], data[4*i+4],
                             datestring)
                datalist.append(curr_data)

    df = pd.DataFrame(datalist, columns=['date', 't0', 't1', 't2', 't3', 'datestring'])
    df.to_csv(file_path + '.csv', index=False)

temperature_txt_dat_to_csv('data/Holyoke/holyoke.temperature.dat.txt')

In [28]:
%load_ext sql
conn = duckdb.connect('dbs/holyoke_database.db')
%sql conn --alias duckdb

%config SqlMagic.autopandas = True
%config SqlMagic.feedback = False
%config SqlMagic.displaycon = False

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


Month
pe      pe    te      te      te      te
08:00   12:00 08:00   12:00   sunset  22:00

The observation time for the first measurement was generally noted down as `eight o'clock', but sometimes `from eight to nine'.  The second observation (for temperature and the state of the weather) is generally made around noon, but sometimes `at one P.M.', `at two P.M.' or `between one and two P.M.'. The third measurement (for temperature and the state of the weather) is at sunset, and the fourth is uniformly at ten in the evening.
In contrast to the data in partI, the first of the five thermometer readings denote the measurement taken at night. The second to fifth readings correspond to similar observing times as in PartI.

In [None]:
%%sql
# Read the pressure CSVs
DROP TABLE IF EXISTS Pressure;
CREATE TABLE Pressure AS
SELECT
    date,
    p0,
    p1,
    # CASE
    #     WHEN TRY_CAST(STRPTIME(date, '%Y_%B_%d') AS TIMESTAMP) IS NULL THEN NULL
    #     ELSE STRPTIME(date, '%Y_%B_%d')
    # END AS datetime_column
FROM read_csv_auto('data/Holyoke/holyoke.pressure.dat.txt.csv')

Unnamed: 0,Count
0,13020


In [None]:
%%sql
# Set the NULL values
UPDATE Pressure
SET p0 = NULL
WHERE p0 = 9999.9;

Unnamed: 0,Count
0,570


In [32]:
%%sql
UPDATE Pressure
SET p1 = NULL
WHERE p1 = 9999.9;

Unnamed: 0,Count
0,798


#### Temperature

In [None]:
%%sql
# Read the temperature CSVs
DROP TABLE IF EXISTS Temperature;
CREATE TABLE Temperature AS
SELECT *
FROM read_csv_auto('data/Holyoke/holyoke.temperature.dat.txt.csv')

Unnamed: 0,Count
0,16368


In [None]:
%%sql
# Set the NULL values
UPDATE Temperature
SET t0 = NULL
WHERE t0 = -99.0;

Unnamed: 0,Count
0,803


In [37]:
%%sql
UPDATE Temperature
SET t1 = NULL
WHERE t1 = -99.0;

Unnamed: 0,Count
0,729


In [38]:
%%sql
UPDATE Temperature
SET t2 = NULL
WHERE t2 = -99.0;

Unnamed: 0,Count
0,1459


In [39]:
%%sql
UPDATE Temperature
SET t3 = NULL
WHERE t3 = -99.0;

Unnamed: 0,Count
0,761


#### Combined data

In [None]:
%%sql
# Create the Holyoke table by joining Temperature and Pressure on date
DROP TABLE IF EXISTS Holyoke;
CREATE TABLE Holyoke AS
SELECT * FROM Temperature
FULL JOIN Pressure
USING (date);

Unnamed: 0,Count
0,16605


In [None]:
%%sql
# Combine the columns at different times in single columns with timestamp
DROP TABLE IF EXISTS Holyoke_Long;
CREATE TABLE Holyoke_Long AS 
SELECT * FROM
    (
    SELECT
        date,
        '08:00:00' AS time_of_day,
        (Date + INTERVAL '8 hours') AT TIME ZONE 'America/New_York' AS Timestamp,
        t0 AS temperature,
        p0 AS pressure
    FROM Holyoke

    UNION ALL

    SELECT
        date,
        '12:00:00' AS time_of_day,
        (Date + INTERVAL '12 hours') AT TIME ZONE 'America/New_York' AS Timestamp,
        t1 AS temperature,
        p1 AS pressure
    FROM Holyoke

    UNION ALL

    -- SUNSET
    SELECT
        date,
        '19:00:00' AS time_of_day, 
        (Date + INTERVAL '19 hours') AT TIME ZONE 'America/New_York' AS Timestamp,
        t2 AS temperature,
        NULL AS pressure
    FROM Holyoke

    UNION ALL

    SELECT
        date,
        '22:00:00' AS time_of_day,
        (Date + INTERVAL '22 hours') AT TIME ZONE 'America/New_York' AS Timestamp,
        t3 AS temperature,
        NULL AS pressure
    FROM Holyoke
    )
    WHERE Date IS NOT NULL;

Unnamed: 0,Count
0,64280


In [48]:
%%sql
COPY
    (SELECT * FROM Holyoke_Long)
    TO 'dbs/holyoke.parquet'
    (FORMAT parquet);

Unnamed: 0,Count
0,64280


In [49]:
end_time = time.time()
print(f"Script executed in {end_time - start_time:.2f} seconds.")

Script executed in 1.82 seconds.
