In [None]:
import urllib.request
import pandas as pd
import polars as pl

In [None]:
# Download the raw HTML of the NPS bloom-watch page.
url = "https://www.nps.gov/subjects/cherryblossom/bloom-watch.htm"
# Use the response as a context manager so the HTTP connection is closed as
# soon as the body has been read, instead of staying open in the kernel.
with urllib.request.urlopen(url) as page:
    html = page.read()

In [None]:
# Parse every <table> found in the downloaded markup; row 0 of each table
# supplies the column names.
tables = pd.read_html(io=html, header=0)

# The rest of the notebook works on a single table, so stop here if the
# page yields any other number of tables.
assert len(tables) == 1
(pd_df,) = tables[:1]
pd_df

In [None]:
# Every column after the first one is treated as a bloom stage; the
# stage number is simply its position in that column order.
stages = pd_df.columns[1:].tolist()
stages_df = pl.DataFrame(
    {
        "stage_number": range(len(stages)),
        "stage": stages,
    }
)
stages_df

In [None]:
def fix_2017_stars(df: pl.DataFrame) -> pl.DataFrame:
    """Strip the asterisk from the starred "day" values.

    Before cleaning, the function checks that exactly two rows carry a
    literal ``*`` in "day" and that both of them belong to ``Year`` 2017;
    either check failing raises ``AssertionError``.

    Args:
        df: frame with a string "day" column and a "Year" column.

    Returns:
        A new frame in which the first ``*`` of each "day" value is removed.
    """
    star_pattern = r"\*"
    starred = df.filter(pl.col("day").str.contains(star_pattern))
    assert starred.height == 2
    assert (starred["Year"] == 2017).all()
    cleaned_day = pl.col("day").str.replace(star_pattern, "")
    return df.with_columns(cleaned_day)


# Reshape the wide scraped table (one column per bloom stage) into a long
# frame with columns year / stage / date.
# NOTE(review): newer polars releases deprecate `melt` in favour of
# `unpivot` -- confirm against the polars version this notebook targets.
df = (
    pl.from_pandas(pd_df)
    # one row per (Year, stage); the cell text ends up in the "value" column
    .melt(id_vars="Year", variable_name="stage")
    .with_columns(
        # split the cell text on spaces into a two-field struct
        pl.col("value")
        .str.split_exact(by=" ", n=1)
        .struct.rename_fields(["month", "day"])
    )
    # expand the struct into separate "month" and "day" columns
    .unnest("value")
    # remove '*' from the two 2017 dates
    .pipe(fix_2017_stars)
    .with_columns(
        # rebuild "Year-month-day" text and parse it as a date
        pl.concat_str([pl.col("Year"), pl.col("month"), pl.col("day")], separator="-")
        .alias("date")
        .str.strptime(pl.Date, "%Y-%b-%d")
    )
    .rename({"Year": "year"})
    .select(["year", "stage", "date"])
)

df

In [None]:
# manually coded values
df_manual = pl.DataFrame(
    {
        "year": [1990, 1958],
        "stage": "Peak Bloom",
        "date": ["1990 March 15", "1958 April 18"],
    }
).with_columns(pl.col("date").str.strptime(pl.Date, "%Y %B %d"))

df_manual

In [None]:
# Stack the hand-entered rows on top of the scraped ones, then order the
# combined frame by year and, within a year, by date.
df_out = pl.concat([df_manual, df], how="vertical").sort(["year", "date"])

df_out

In [None]:
# Persist both result frames as CSV files under data/.
stages_df.write_csv(file="data/nps_stages.csv")  # stage_number -> stage lookup
df_out.write_csv(file="data/nps.csv")  # long year / stage / date table