# Scrape NOAA

- Pull Climate Data Online (CDO) daily temperatures for Washington, DC (the request defaults to `datatypeid="TMIN"`, i.e. the daily minimum rather than the mean)
- Put the results in a pyarrow dataset at `data/cdo`

In [None]:
import requests
import yaml
import json
import polars as pl
import altair as alt
import pyarrow as pa
import pyarrow.dataset as ds

# Load local credentials; `get_noaa` reads secrets["ncdc_key"] as the API token.
# The encoding is pinned so parsing does not depend on the platform default.
with open("secrets.yaml", encoding="utf-8") as f:
    secrets = yaml.safe_load(f)

In [None]:
def get_noaa(
    startdate,
    enddate,
    limit,
    datasetid="GHCND",
    datatypeid="TMIN",
    stationid="GHCND:USW00013743",
    timeout=60,
):
    """Query the NOAA Climate Data Online (CDO) v2 ``data`` endpoint.

    Args:
        startdate: Start of the requested range, sent as the ``startdate``
            query parameter (``get_temp_year`` passes ``YYYY-MM-DD`` strings).
        enddate: End of the requested range, sent as ``enddate``.
        limit: Maximum number of records to request, sent as ``limit``.
        datasetid: CDO dataset identifier.
        datatypeid: CDO data type identifier.
        stationid: CDO station identifier.
        timeout: Seconds to wait for the server before ``requests`` gives up,
            so a stalled connection cannot hang the notebook indefinitely.

    Returns:
        The ``requests.Response`` as received. The status code is not checked
        here; callers are expected to validate it.
    """
    # Let requests build and percent-encode the query string instead of
    # concatenating raw values into the URL.
    params = {
        "datasetid": datasetid,
        "datatypeid": datatypeid,
        "stationid": stationid,
        "startdate": startdate,
        "enddate": enddate,
        "units": "metric",
        "limit": limit,
    }

    return requests.get(
        "https://www.ncdc.noaa.gov/cdo-web/api/v2/data",
        params=params,
        # The API token is sent in the "token" request header.
        headers={"token": secrets["ncdc_key"]},
        timeout=timeout,
    )


def get_temp_year(year):
    """Fetch one calendar year of daily observations as a polars DataFrame.

    Requests January 1 through December 31 of ``year`` from ``get_noaa`` using
    its default dataset, data type and station.

    Args:
        year: Calendar year to fetch (interpolated into ``YYYY-01-01`` and
            ``YYYY-12-31``).

    Returns:
        A ``pl.DataFrame`` with columns ``year``, ``date`` (``pl.Date``) and
        ``value``.

    Raises:
        requests.HTTPError: If the response status is anything other than 200.
        KeyError: If the response body has no ``"results"`` entry.
    """
    startdate = f"{year}-01-01"
    enddate = f"{year}-12-31"

    # 366 records is enough for one value per day, even in a leap year.
    r = get_noaa(startdate, enddate, 366)

    # Raise explicitly rather than assert: asserts are stripped under
    # `python -O` and give no hint about what went wrong.
    if r.status_code != 200:
        raise requests.HTTPError(
            f"NOAA CDO request for {year} failed with status {r.status_code}",
            response=r,
        )

    payload = json.loads(r.content)
    if "results" not in payload:
        # Same exception type as the bare lookup would raise, but with context.
        raise KeyError(f"NOAA CDO response for {year} contains no 'results'")
    content = payload["results"]

    return (
        pl.from_dicts(content)
        # The API's "date" strings are parsed as datetimes, then reduced to dates.
        .with_columns(pl.col("date").str.to_datetime().cast(pl.Date))
        # `year` is what `write_data` partitions the dataset on.
        .with_columns(year=pl.col("date").dt.year())
        .select(["year", "date", "value"])
    )

In [None]:
# Example: fetch a single year (2010) and draw it as a line chart
data2010 = get_temp_year(2010)

(
    alt.Chart(data2010.to_pandas())
    .mark_line()
    .encode(x="date", y="value")
)

In [None]:
def read_data(path="data/cdo") -> pl.LazyFrame:
    """Lazily scan the hive-partitioned Parquet dataset stored under ``path``.

    Nothing is loaded until the returned ``pl.LazyFrame`` is collected.
    """
    dataset = ds.dataset(path, format="parquet", partitioning="hive")
    return pl.scan_pyarrow_dataset(dataset)


def write_data(df: pl.DataFrame, path="data/cdo"):
    """Write ``df`` into the Parquet dataset at ``path``, partitioned by ``year``.

    Args:
        df: Frame to persist; it must contain a ``year`` column, which becomes
            the hive-style partition key.
        path: Root directory of the dataset.
    """
    # Convert once and reuse the table for both the data and the partition
    # schema, rather than converting the frame to Arrow twice.
    table = df.to_arrow()
    year_partitioning = ds.partitioning(
        pa.schema([table.schema.field("year")]), flavor="hive"
    )

    ds.write_dataset(
        table,
        path,
        format="parquet",
        partitioning=year_partitioning,
        # Existing files in a partition being written are replaced, so
        # re-writing a year overwrites it while other partitions stay put.
        existing_data_behavior="delete_matching",
    )


# bootstrap the dataset: write the 2010 example frame to the default path
# (`data/cdo`) so there is something on disk to read, then read back the
# first 5 rows as a quick check
write_data(data2010)
read_data().head(5).collect()

In [None]:
# Years the dataset should cover: 2010 through 2023 (the range end is exclusive).
needed_years = range(2010, 2024)

# Years already present on disk, taken from the dataset's `year` column.
known_years = read_data().select(["year"]).unique().collect()["year"].to_list()

missing_years = set(needed_years) - set(known_years)

if missing_years:
    print(missing_years)

    # Set iteration order is not guaranteed; sort so years are fetched and
    # logged chronologically on every run.
    for year in sorted(missing_years):
        print(year)
        df = get_temp_year(year)
        write_data(df)
        print("done")

# Plot every 10th row of the full dataset.
# NOTE(review): newer polars releases rename `take_every` to `gather_every`;
# confirm against the installed version before changing it.
(
    alt.Chart(read_data().take_every(10).collect().to_pandas())
    .encode(x="date", y="value")
    .mark_line()
)