In [26]:
import duckdb
import pandas as pd
from datetime import datetime
import time
# Wall-clock start of the run; the last cell subtracts this from its own
# time.time() reading to print the total execution time.
start_time = time.time()

In [None]:
# Convert Wigglesworth pressure data from the .dat text layout to .csv format
def pressure_txt_dat_to_csv(file_path, n_groups=48):
    """Flatten the grouped pressure listing into a CSV with one row per day slot.

    The input is read as consecutive groups of 33 lines each:

    * line 0 of a group holds the year,
    * the first whitespace-separated token of line 1 is the month name,
    * the next 31 lines hold one day slot each; fields 1-3 of every such
      line are copied to the columns ``p0``, ``p1`` and ``p2`` (field 0 is
      not used).

    Every group contributes 31 rows regardless of the real month length.
    Day slots that do not form a valid calendar date (for example the 30th
    of February) are kept with an empty ``date``; the ``datestring`` column
    always records the ``<year>_<month>_<day>`` text that was parsed.

    Parameters
    ----------
    file_path : str
        Path of the text file to read. The result is written to
        ``file_path + '.csv'``.
    n_groups : int, default 48
        Number of 33-line groups to read from the top of the file.

    Raises
    ------
    IndexError
        If the file has fewer than ``n_groups * 33`` lines, a month line is
        blank, or a day line has fewer than four fields.
    """
    lines_per_group = 33  # year line + month line + 31 day lines
    days_per_group = 31

    with open(file_path, 'r') as file:
        lines = file.readlines()

    records = []
    for group in range(n_groups):
        base = group * lines_per_group
        year = lines[base].strip()
        month = lines[base + 1].split()[0]

        for day in range(1, days_per_group + 1):
            datestring = f'{year}_{month}_{day}'
            try:
                date = datetime.strptime(datestring, '%Y_%B_%d')
            except ValueError:
                # strptime rejects day slots that are not real dates; keep the
                # row but leave its date empty. NOTE: an unparseable year or
                # month name ends up here as well.
                date = None
            # Day lines start two lines below the group's year line.
            fields = lines[base + 1 + day].split()
            records.append((date, fields[1], fields[2], fields[3], datestring))

    df = pd.DataFrame(records, columns=['date', 'p0', 'p1', 'p2', 'datestring'])
    df.to_csv(file_path + '.csv', index=False)

# Writes data/Wigglesworth/wigglesworth.pressure.dat.txt.csv, the file the
# Pressure table is loaded from further down.
pressure_txt_dat_to_csv('data/Wigglesworth/wigglesworth.pressure.dat.txt')

In [28]:
def temperature_txt_dat_to_csv(file_path, n_groups=48):
    """Flatten the grouped temperature listing into a CSV with one row per day slot.

    The input is read as consecutive groups of 33 lines each:

    * line 0 of a group holds the year,
    * the first whitespace-separated token of line 1 is the month name,
    * the next 31 lines hold one day slot each; fields 1-6 of every such
      line are copied to the columns ``it8``, ``it13``, ``it21``, ``ot8``,
      ``ot13`` and ``ot21`` (field 0 is not used). Later cells of this
      notebook map ``it*``/``ot*`` to indoor/outdoor temperature at 08:00,
      13:00 and 21:00.

    Every group contributes 31 rows regardless of the real month length.
    Day slots that do not form a valid calendar date (for example the 30th
    of February) are kept with an empty ``date``; the ``datestring`` column
    always records the ``<year>_<month>_<day>`` text that was parsed.

    Parameters
    ----------
    file_path : str
        Path of the text file to read. The result is written to
        ``file_path + '.csv'``.
    n_groups : int, default 48
        Number of 33-line groups to read from the top of the file.

    Raises
    ------
    IndexError
        If the file has fewer than ``n_groups * 33`` lines, a month line is
        blank, or a day line has fewer than seven fields.
    """
    lines_per_group = 33  # year line + month line + 31 day lines
    days_per_group = 31

    with open(file_path, 'r') as file:
        lines = file.readlines()

    records = []
    for group in range(n_groups):
        base = group * lines_per_group
        year = lines[base].strip()
        month = lines[base + 1].split()[0]

        for day in range(1, days_per_group + 1):
            datestring = f'{year}_{month}_{day}'
            try:
                date = datetime.strptime(datestring, '%Y_%B_%d')
            except ValueError:
                # strptime rejects day slots that are not real dates; keep the
                # row but leave its date empty. NOTE: an unparseable year or
                # month name ends up here as well.
                date = None
            # Day lines start two lines below the group's year line.
            fields = lines[base + 1 + day].split()
            records.append((date, fields[1], fields[2], fields[3],
                            fields[4], fields[5], fields[6], datestring))

    df = pd.DataFrame(records, columns=['date', 'it8', 'it13', 'it21',
                                        'ot8', 'ot13', 'ot21', 'datestring'])
    df.to_csv(file_path + '.csv', index=False)

# Writes data/Wigglesworth/wigglesworth.temperature.dat.txt.csv, the file the
# Temperature table is loaded from further down.
temperature_txt_dat_to_csv('data/Wigglesworth/wigglesworth.temperature.dat.txt')

In [29]:
# Load the SQL cell magic and register an on-disk DuckDB connection for it
# under the alias "duckdb".
%load_ext sql
conn = duckdb.connect('dbs/wigglesworth_database.db')
%sql conn --alias duckdb

# SqlMagic options: hand results back as pandas DataFrames and turn off the
# feedback / connection-display settings.
%config SqlMagic.autopandas = True
%config SqlMagic.feedback = False
%config SqlMagic.displaycon = False

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


#### Pressure

In [None]:
%%sql
-- Load the pressure CSV written by pressure_txt_dat_to_csv into a fresh
-- Pressure table (column types are inferred by read_csv_auto).
DROP TABLE IF EXISTS Pressure;
CREATE TABLE Pressure AS
SELECT
    *
FROM read_csv_auto('data/Wigglesworth/wigglesworth.pressure.dat.txt.csv')

Unnamed: 0,Count
0,1488


In [None]:
%%sql
-- Set the NULL values: p0 readings equal to 100 are replaced with NULL
-- (100 is presumably the source file's missing-value marker; confirm).
UPDATE Pressure
SET p0 = NULL
WHERE p0 = 100;

Unnamed: 0,Count
0,27


In [33]:
%%sql
-- Replace p1 readings equal to 100 with NULL
-- (100 is presumably the source file's missing-value marker; confirm).
UPDATE Pressure
SET p1 = NULL
WHERE p1 = 100;

Unnamed: 0,Count
0,27


In [34]:
%%sql
-- Replace p2 readings equal to 100 with NULL
-- (100 is presumably the source file's missing-value marker; confirm).
UPDATE Pressure
SET p2 = NULL
WHERE p2 = 100;

Unnamed: 0,Count
0,27


#### Temperature

In [None]:
%%sql
-- Load the temperature CSV written by temperature_txt_dat_to_csv into a
-- fresh Temperature table (column types are inferred by read_csv_auto).
DROP TABLE IF EXISTS Temperature;
CREATE TABLE Temperature AS
SELECT *
FROM read_csv_auto('data/Wigglesworth/wigglesworth.temperature.dat.txt.csv')

Unnamed: 0,Count
0,1488


In [None]:
%%sql
-- Set the NULL values: it8 readings equal to -99.9 are replaced with NULL
-- (-99.9 is presumably the source file's missing-value marker; confirm).
UPDATE Temperature
SET it8 = NULL
WHERE it8 = -99.9;

Unnamed: 0,Count
0,29


In [40]:
%%sql
-- Replace it13 readings equal to -99.9 with NULL
-- (-99.9 is presumably the source file's missing-value marker; confirm).
UPDATE Temperature
SET it13 = NULL
WHERE it13 = -99.9;

Unnamed: 0,Count
0,28


In [41]:
%%sql
-- Replace it21 readings equal to -99.9 with NULL
-- (-99.9 is presumably the source file's missing-value marker; confirm).
UPDATE Temperature
SET it21 = NULL
WHERE it21 = -99.9;

Unnamed: 0,Count
0,27


In [42]:
%%sql
-- Replace ot8 readings equal to -99.9 with NULL
-- (-99.9 is presumably the source file's missing-value marker; confirm).
UPDATE Temperature
SET ot8 = NULL
WHERE ot8 = -99.9;

Unnamed: 0,Count
0,28


In [43]:
%%sql
-- Replace ot13 readings equal to -99.9 with NULL
-- (-99.9 is presumably the source file's missing-value marker; confirm).
UPDATE Temperature
SET ot13 = NULL
WHERE ot13 = -99.9;

Unnamed: 0,Count
0,27


In [44]:
%%sql
-- Replace ot21 readings equal to -99.9 with NULL
-- (-99.9 is presumably the source file's missing-value marker; confirm).
UPDATE Temperature
SET ot21 = NULL
WHERE ot21 = -99.9;

Unnamed: 0,Count
0,27


In [None]:
%%sql
-- Create the Wigglesworth table by joining Temperature and Pressure on date.
-- FULL JOIN keeps unmatched rows from both sides; rows whose date is NULL
-- never match each other, so they are carried over from each table separately.
DROP TABLE IF EXISTS Wigglesworth;
CREATE TABLE Wigglesworth AS
SELECT * FROM Temperature
FULL JOIN Pressure
USING (date);

Unnamed: 0,Count
0,1515


In [None]:
%%sql
-- Combine the per-time-of-day columns into single columns with a timestamp:
-- each Wigglesworth row becomes three rows (08:00, 13:00, 21:00), pairing
-- it8/ot8/p0, it13/ot13/p1 and it21/ot21/p2 respectively.
-- The timestamp is the date plus the hour offset, read as America/New_York
-- local time. Rows with a NULL date are dropped at the end.
DROP TABLE IF EXISTS Wigglesworth_Long;
CREATE TABLE Wigglesworth_Long AS
SELECT * FROM
    (
    SELECT
        date,
        '08:00:00' AS time_of_day,
        (Date + INTERVAL '8 hours') AT TIME ZONE 'America/New_York' AS Timestamp,
        it8 AS indoorTemperature,
        ot8 AS outdoorTemperature,
        p0 AS pressure
    FROM Wigglesworth

    UNION ALL

    SELECT
        date,
        '13:00:00' AS time_of_day,
        (Date + INTERVAL '13 hours') AT TIME ZONE 'America/New_York' AS Timestamp,
        it13 AS indoorTemperature,
        ot13 AS outdoorTemperature,
        p1 AS pressure
    FROM Wigglesworth

    UNION ALL

    -- SUNSET
    SELECT
        date,
        '21:00:00' AS time_of_day,
        (Date + INTERVAL '21 hours') AT TIME ZONE 'America/New_York' AS Timestamp,
        it21 AS indoorTemperature,
        ot21 AS outdoorTemperature,
        p2 AS pressure
    FROM Wigglesworth
    )
    WHERE Date IS NOT NULL;

Unnamed: 0,Count
0,4383


In [50]:
%%sql
-- Per-column summary (type, min/max, quantiles, null percentage) of the
-- long-format table, as a quick sanity check before exporting it.
FROM (SUMMARIZE Wigglesworth_Long)

Unnamed: 0,column_name,column_type,min,max,approx_unique,avg,std,q25,q50,q75,count,null_percentage
0,date,DATE,1786-01-01,1789-12-31,1574,,,1787-01-01,1788-01-01,1788-12-31,4383,0.0
1,time_of_day,VARCHAR,08:00:00,21:00:00,3,,,,,,4383,0.0
2,Timestamp,TIMESTAMP WITH TIME ZONE,1786-01-01 13:13:32+00:17,1790-01-01 02:13:32+00:17,4180,,,1787-01-01 20:30:22.922941+00:17,1788-01-02 01:54:15.636364+00:17,1788-12-31 22:01:01.999999+00:17,4383,0.0
3,indoorTemperature,DOUBLE,-17.22,35.83,247,10.034449771689523,9.920908330367464,2.2448307692307696,10.346439628482972,18.83393006993007,4383,0.07
4,outdoorTemperature,DOUBLE,-25.56,35.0,470,8.766485623003193,10.91282016204802,0.6620761524948963,8.856606833910032,17.449978012710464,4383,0.02
5,pressure,DOUBLE,968.6,1030.7,438,1005.7615788272892,8.389815959727084,1000.5467634893483,1005.9617142472696,1011.5336801426873,4383,0.0


In [51]:
%%sql
-- Export every row of the long-format table to a Parquet file.
COPY Wigglesworth_Long
TO 'dbs/wigglesworth.parquet'
(FORMAT parquet);

Unnamed: 0,Count
0,4383


In [52]:
# Report total wall-clock time since start_time was recorded in the first cell.
end_time = time.time()
elapsed_seconds = end_time - start_time
print(f"Script executed in {elapsed_seconds:.2f} seconds.")

Script executed in 1.34 seconds.
