# EDA: Temperature timing

- Read in data
- Group into seasons
- Count: on what day were there at least $D$ days with average temperature $T$ or lower?
- Compare that across seasons
- Reasonable values look like temps of -5 to 0 C and 10-30 days

Compare this to what I have read elsewhere: that cherries require something like 860 hours at <=7C. The comparison gets a little confusing, because I started with *daily* average temperature rather than hourly readings, which might matter.

In [None]:
import polars as pl
import altair as alt
import pyarrow.dataset as ds
import itertools
from collections import OrderedDict

In [None]:
# Open the hive-partitioned parquet files under data/cdo as a pyarrow dataset,
# then materialise the whole thing as an in-memory polars DataFrame.
cdo_dataset = ds.dataset("data/cdo", partitioning="hive", format="parquet")
data = pl.scan_pyarrow_dataset(cdo_dataset).collect()

# Quick look at a handful of random rows.
data.sample(5)

In [None]:
def first_after_below(date, x, x0, n):
    """Build an expression for the date on which the n-th qualifying row occurs.

    A row qualifies when ``x <= x0``. The running count of qualifying rows is
    taken in the frame's current row order, so the caller is expected to have
    sorted the rows (e.g. by date) beforehand.

    Args:
        date: Expression yielding the date of each row.
        x: Expression yielding the value compared against the threshold.
        x0: Threshold; rows with ``x <= x0`` are counted.
        n: Number of qualifying rows to wait for.

    Returns:
        An expression that evaluates to the ``date`` of the first row whose
        running count equals ``n``, or to null when the count never reaches
        ``n``.
    """
    running_count = (x <= x0).cum_sum()
    never_reached = running_count.max() < n
    nth_qualifying_date = date.filter(running_count == n).first()

    return pl.when(never_reached).then(None).otherwise(nth_qualifying_date)


# A "season" is labelled by the calendar year in which it ends: rows dated
# before 1 June keep their own year, every other row rolls over to year + 1.
june_first = pl.date(pl.col("year"), 6, 1)
season_label = (
    pl.when(pl.col("date") < june_first)
    .then(pl.col("year"))
    .otherwise(pl.col("year") + 1)
)

# data to be used for analysis: seasons 2011 through 2022, in date order
# (the date ordering matters for the cumulative counts computed later)
data2 = (
    data.with_columns(season=season_label)
    .filter(pl.col("season").is_between(2011, 2022))
    .sort("date")
)


def f(season, x0, n):
    """Return the date within ``season`` of the n-th row with ``value <= x0``.

    Restricts ``data2`` to the requested season and evaluates
    ``first_after_below`` on its ``date`` and ``value`` columns
    (``value`` is presumably the daily average temperature -- confirm against
    the dataset schema).

    Args:
        season: Season label to select from ``data2``.
        x0: Threshold passed through to ``first_after_below``.
        n: Required number of rows at or below the threshold.

    Returns:
        The single scalar produced by the expression: a date, or ``None`` when
        the season never accumulates ``n`` qualifying rows.
    """
    season_rows = data2.filter(pl.col("season") == season)
    nth_cold_date = first_after_below(pl.col("date"), pl.col("value"), x0, n)
    return season_rows.select(nth_cold_date).item()


# Parameter grid: every season present in data2, crossed with the temperature
# thresholds and the required day counts.
seasons = data2["season"].unique().to_list()
temp_thresholds = [-5, 0]
day_counts = [10, 20, 30, 40]

# One record per (season, threshold, day count) combination; itertools.product
# varies the last argument fastest, matching a season > temp > days loop nest.
out = []
for season, max_temp, n_days in itertools.product(
    seasons, temp_thresholds, day_counts
):
    first_day = f(season, max_temp, n_days)
    record = {
        "season": season,
        "max_temp": max_temp,
        "n_days": n_days,
        "first_day": first_day,
    }
    out.append(record)

results = pl.from_dicts(out)
results

In [None]:
# Length of the calendar year that each first_day falls in. Needed so that
# autumn dates are shifted by the right amount in leap years: with a fixed 365,
# Dec 31 of a leap year (ordinal day 366) would map to 1 and collide with Jan 1.
days_in_year = (
    pl.when(pl.col("first_day").dt.is_leap_year()).then(366).otherwise(365)
)

chart_data = (
    results.with_columns(
        # season as a string so it is plotted as a discrete label
        pl.col("season").cast(str),
        # one line per (threshold, day count) combination
        label=pl.format("{}-{}", pl.col("max_temp"), pl.col("n_days")),
        y=pl.col("first_day").dt.ordinal_day().cast(pl.Int64),
    )
    .with_columns(
        # Re-centre around New Year: days late in the calendar year (> 200)
        # become zero or negative offsets counted back from Dec 31, so a
        # season that straddles Jan 1 is plotted on one continuous axis.
        pl.when(pl.col("y") > 200)
        .then(pl.col("y") - days_in_year)
        .otherwise(pl.col("y"))
        .alias("y")
    )
    .to_pandas()
)

alt.Chart(chart_data).encode(x="season", y="y", color="label").mark_line()