# Data Model: `wx_data`

In [1]:
import os
from typing import Optional
from pathlib import Path
from datetime import datetime
from joblib import cpu_count, Parallel, delayed
import pandas as pd
from sqlmodel import SQLModel, Field, create_engine

# functions
def read_wxdata(FILE: Path) -> pd.DataFrame:
    """
    Read one wx_data file into a dataframe.

    Input:
    - FILE: Path (from pathlib) object of the path of a wx_data file.
            The file is read as tab-delimited text without a header row and
            with four columns: date, tmax, tmin, precip.

    Output:
    - DF: dataframe with the columns
          date    -> datetime64 (parsed from the string in the file)
          tmax    -> nullable Int64
          tmin    -> nullable Int64
          precip  -> nullable Int64
          site_id -> str, the file name without its extension
          Every -9999 in tmax/tmin/precip is returned as <NA>.
    """
    cols = ["date", "tmax", "tmin", "precip"]
    value_cols = cols[1:]
    missing = -9999  # sentinel used in the files for "no observation"
    dtypes = {"date": str, **{c: int for c in value_cols}}
    DF = pd.read_table(FILE,
                       delimiter = "\t",
                       header = None,
                       names  = cols,
                       dtype  = dtypes)
    # Make time datetime obj
    DF["date"] = pd.to_datetime(DF["date"])
    # Sanitize the null values.
    # Cast to the nullable "Int64" dtype *before* masking so every file yields
    # the same column dtypes whether or not it contains the sentinel; a plain
    # replace() on int64 columns silently degrades them to object dtype.
    DF = DF.assign(**{c: DF[c].astype("Int64").mask(DF[c] == missing)
                      for c in value_cols})
    # Assign a column for the site ID from the filename
    DF = DF.assign(site_id = FILE.stem)
    return DF

In [2]:
dir_wxdata = Path("/code/wx_data/")
# Let's work on one file first.
# Sort and filter with the same "USC*.txt" pattern used for the bulk load so the
# chosen file is reproducible and is always a station data file
# (a bare glob("*") returns entries in arbitrary order and may match non-data files).
f_wxdata      = sorted(dir_wxdata.glob("USC*.txt"))[0]
df = read_wxdata(f_wxdata)
print(df.head(3))

        date  tmax  tmin precip      site_id
0 1985-01-01   -22  -128     94  USC00110072
1 1985-01-02  -122  -217      0  USC00110072
2 1985-01-03  -106  -244      0  USC00110072


In [3]:
%%time
dfall = Parallel(n_jobs = cpu_count() - 1)(delayed(read_wxdata)(f) for f in dir_wxdata.glob("USC*.txt"))
dfall = pd.concat(dfall)

CPU times: user 598 ms, sys: 208 ms, total: 806 ms
Wall time: 3.46 s


## Create Data Model

In [4]:
# models.py
class WxTable(SQLModel, table = True):
    """
    Table model for one weather observation: one row per site and date.

    The measurement fields (tmax, tmin, precip) are optional and default to
    None so a row with a missing observation can be built or validated without
    passing None explicitly.
    # NOTE(review): units of tmax/tmin/precip are not stated in this notebook
    # -- confirm against the wx_data description before documenting them.
    """
    # Surrogate primary key; None until a value is assigned
    # (presumably generated by the database on insert -- confirm).
    id: Optional[int] = Field(default = None, primary_key = True)
    # Station identifier (read_wxdata takes it from the file name).
    site_id: str
    # Observation date, stored as a datetime.
    date:    datetime
    # Nullable measurements; same Optional[...] style as `id` for consistency.
    tmax:    Optional[int] = None
    tmin:    Optional[int] = None
    precip:  Optional[int] = None

# app.py
# Build the database engine from the DATABASE_URL environment variable
# (echo=True makes SQLAlchemy log every statement it sends), then create
# the tables registered on SQLModel.metadata.
engine = create_engine(
    os.environ["DATABASE_URL"],
    echo=True,
)
SQLModel.metadata.create_all(bind=engine)
# Troubleshooting: a collation-mismatch error might be due to a volume that was
# created by a different postgres version and never deleted.
# In that case, recreate the volume:
# > podman-compose down
# > podman volume ls # to find the name of the volume
# > podman volume rm <name-of-volume>
# > podman-compose up -d

2026-01-19 02:52:15,276 INFO sqlalchemy.engine.Engine select pg_catalog.version()
2026-01-19 02:52:15,278 INFO sqlalchemy.engine.Engine [raw sql] {}
2026-01-19 02:52:15,281 INFO sqlalchemy.engine.Engine select current_schema()
2026-01-19 02:52:15,283 INFO sqlalchemy.engine.Engine [raw sql] {}
2026-01-19 02:52:15,287 INFO sqlalchemy.engine.Engine show standard_conforming_strings
2026-01-19 02:52:15,288 INFO sqlalchemy.engine.Engine [raw sql] {}
2026-01-19 02:52:15,291 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2026-01-19 02:52:15,297 INFO sqlalchemy.engine.Engine SELECT pg_catalog.pg_class.relname 
FROM pg_catalog.pg_class JOIN pg_catalog.pg_namespace ON pg_catalog.pg_namespace.oid = pg_catalog.pg_class.relnamespace 
WHERE pg_catalog.pg_class.relname = %(table_name)s AND pg_catalog.pg_class.relkind = ANY (ARRAY[%(param_1)s, %(param_2)s, %(param_3)s, %(param_4)s, %(param_5)s]) AND pg_catalog.pg_table_is_visible(pg_catalog.pg_class.oid) AND pg_catalog.pg_namespace.nspname != %(nspname

In [17]:
# Verify database: ask the live database whether the model's table exists.
from sqlalchemy import inspect
inspector = inspect(engine)
# Take the table name from the model instead of repeating the literal
# "wxtable", so this check cannot drift from the class definition.
inspector.has_table(WxTable.__tablename__)

2026-01-19 02:54:58,387 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2026-01-19 02:54:58,390 INFO sqlalchemy.engine.Engine SELECT pg_catalog.pg_class.relname 
FROM pg_catalog.pg_class JOIN pg_catalog.pg_namespace ON pg_catalog.pg_namespace.oid = pg_catalog.pg_class.relnamespace 
WHERE pg_catalog.pg_class.relname = %(table_name)s AND pg_catalog.pg_class.relkind = ANY (ARRAY[%(param_1)s, %(param_2)s, %(param_3)s, %(param_4)s, %(param_5)s]) AND pg_catalog.pg_table_is_visible(pg_catalog.pg_class.oid) AND pg_catalog.pg_namespace.nspname != %(nspname_1)s
2026-01-19 02:54:58,394 INFO sqlalchemy.engine.Engine [cached since 163.1s ago] {'table_name': 'wxtable', 'param_1': 'r', 'param_2': 'p', 'param_3': 'f', 'param_4': 'v', 'param_5': 'm', 'nspname_1': 'pg_catalog'}
2026-01-19 02:54:58,398 INFO sqlalchemy.engine.Engine ROLLBACK


True

## Using scripts

In [1]:
# Optional reset, left disabled on purpose. Uncomment to start from a clean slate:
# SQLModel.metadata.clear()
# SQLModel.metadata.drop_all(engine)
# NOTE(review): if both lines are enabled in this order, clear() would empty the
# metadata first, so drop_all() would presumably have nothing left to drop --
# confirm the intended order. `engine` is also not defined in this cell.
#
# Run the script version of the setup from the local models.py module.
# The log below shows the same connection and table-existence queries as the
# inline version; main() presumably builds the engine and creates the tables
# -- see models.py to confirm.
from models import main
main()

2026-01-19 04:30:44,232 INFO sqlalchemy.engine.Engine select pg_catalog.version()
2026-01-19 04:30:44,233 INFO sqlalchemy.engine.Engine [raw sql] {}
2026-01-19 04:30:44,234 INFO sqlalchemy.engine.Engine select current_schema()
2026-01-19 04:30:44,235 INFO sqlalchemy.engine.Engine [raw sql] {}
2026-01-19 04:30:44,237 INFO sqlalchemy.engine.Engine show standard_conforming_strings
2026-01-19 04:30:44,238 INFO sqlalchemy.engine.Engine [raw sql] {}
2026-01-19 04:30:44,240 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2026-01-19 04:30:44,245 INFO sqlalchemy.engine.Engine SELECT pg_catalog.pg_class.relname 
FROM pg_catalog.pg_class JOIN pg_catalog.pg_namespace ON pg_catalog.pg_namespace.oid = pg_catalog.pg_class.relnamespace 
WHERE pg_catalog.pg_class.relname = %(table_name)s AND pg_catalog.pg_class.relkind = ANY (ARRAY[%(param_1)s, %(param_2)s, %(param_3)s, %(param_4)s, %(param_5)s]) AND pg_catalog.pg_table_is_visible(pg_catalog.pg_class.oid) AND pg_catalog.pg_namespace.nspname != %(nspname