In [1]:
import numpy as np
import pandas as pd

from typing import Sequence, Iterable

from nepal.datasets import NYTimes

In [2]:
df_covid = NYTimes().load()


def fill_index(df: pd.DataFrame, names: Sequence[str]) -> pd.DataFrame:
    return df.pipe(_complete_index, names=names).pipe(_fill_na)


def _complete_index(df: pd.DataFrame, names: Sequence[str]) -> pd.DataFrame:
    dates: pd.Index = df.index.get_level_values("date")

    labels = {
        "fips": df.index.get_level_values("fips").unique(),
        "date": pd.date_range(start=dates.min(), end=dates.max(), freq="D"),
    }

    complete: pd.MultiIndex = pd.MultiIndex.from_product(
        [labels[names[0]], labels[names[1]]], names=names
    )

    return df.reindex(complete)


def _fill_na(df: pd.DataFrame) -> pd.DataFrame:
    return df.assign(
        county=df.county.bfill(),
        state=df.state.bfill(),
        cases=df.cases.fillna(0),
        deaths=df.deaths.fillna(0),
    )


# Important: date index must be last
index = ["fips", "date"]
df = (
    df_covid.dropna(subset=index)
    .set_index(index)
    .pipe(fill_index, names=index)
    .sort_index(level=index)
)

df

Unnamed: 0_level_0,Unnamed: 1_level_0,county,state,cases,deaths
fips,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
01001,2020-01-21,Autauga,Alabama,0,0
01001,2020-01-22,Autauga,Alabama,0,0
01001,2020-01-23,Autauga,Alabama,0,0
01001,2020-01-24,Autauga,Alabama,0,0
01001,2020-01-25,Autauga,Alabama,0,0
...,...,...,...,...,...
78030,2022-04-07,St. Thomas,Virgin Islands,7206,59
78030,2022-04-08,St. Thomas,Virgin Islands,7219,59
78030,2022-04-09,St. Thomas,Virgin Islands,7219,59
78030,2022-04-10,St. Thomas,Virgin Islands,7219,59


In [3]:
def cast_types_as_signed(df: pd.DataFrame, cols: Iterable[str]) -> pd.DataFrame:
    return df.astype({col: "int64" for col in cols})


def calculate_new(df: pd.DataFrame, cols: Iterable[str]) -> pd.DataFrame:
    for col in cols:
        df[f"new_{col}"] = df[col].diff().fillna(0).clip(lower=0).astype("int64")
    return df


targets = ["cases", "deaths"]
df = df.pipe(cast_types_as_signed, cols=targets).pipe(calculate_new, cols=targets)

df

Unnamed: 0_level_0,Unnamed: 1_level_0,county,state,cases,deaths,new_cases,new_deaths
fips,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
01001,2020-01-21,Autauga,Alabama,0,0,0,0
01001,2020-01-22,Autauga,Alabama,0,0,0,0
01001,2020-01-23,Autauga,Alabama,0,0,0,0
01001,2020-01-24,Autauga,Alabama,0,0,0,0
01001,2020-01-25,Autauga,Alabama,0,0,0,0
...,...,...,...,...,...,...,...
78030,2022-04-07,St. Thomas,Virgin Islands,7206,59,7,0
78030,2022-04-08,St. Thomas,Virgin Islands,7219,59,13,0
78030,2022-04-09,St. Thomas,Virgin Islands,7219,59,0,0
78030,2022-04-10,St. Thomas,Virgin Islands,7219,59,0,0


In [4]:
# Test how we can correctly index a multiindex
df.loc[pd.IndexSlice[:, "2020-01-21":"2020-02-01"], :]

Unnamed: 0_level_0,Unnamed: 1_level_0,county,state,cases,deaths,new_cases,new_deaths
fips,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
01001,2020-01-21,Autauga,Alabama,0,0,0,0
01001,2020-01-22,Autauga,Alabama,0,0,0,0
01001,2020-01-23,Autauga,Alabama,0,0,0,0
01001,2020-01-24,Autauga,Alabama,0,0,0,0
01001,2020-01-25,Autauga,Alabama,0,0,0,0
...,...,...,...,...,...,...,...
78030,2020-01-28,St. Thomas,Virgin Islands,0,0,0,0
78030,2020-01-29,St. Thomas,Virgin Islands,0,0,0,0
78030,2020-01-30,St. Thomas,Virgin Islands,0,0,0,0
78030,2020-01-31,St. Thomas,Virgin Islands,0,0,0,0


In [208]:
import lightgbm as lgb
from sktime.forecasting.compose import RecursiveTabularRegressionForecaster

forecaster = RecursiveTabularRegressionForecaster(
    estimator=lgb.LGBMRegressor(),
    window_length=30,
)

In [209]:
from typing import Tuple

from sktime.forecasting.base import ForecastingHorizon
from sktime.forecasting.model_selection import SlidingWindowSplitter

fh = ForecastingHorizon(list(range(1, 15)))
cv = SlidingWindowSplitter(window_length=60, step_length=60, fh=fh)


def get_windows(df: pd.DataFrame) -> Tuple[pd.DataFrame]:
    for train, test in cv.split(df):
        yield df.iloc[train], df.iloc[test]

In [210]:
from sktime.performance_metrics.forecasting import mean_absolute_percentage_error as meap


def sample_fips(df: pd.DataFrame, amount: int) -> pd.DataFrame:
    subindex = df.index.get_level_values("fips")
    sample_ids = np.random.choice(subindex, amount, replace=False)
    return df.loc[sample_ids].sort_index(level=["fips", "date"])


def date_index(df: pd.DataFrame) -> pd.Index:
    idx: pd.Index = df.index.unique("date")
    return pd.DatetimeIndex(idx, freq="D")


y = sample_fips(df, 5)
n_splits = cv.get_n_splits(y)
print(f"Number of Folds = {n_splits}")

errors = []
for y_train, y_test in get_windows(y[["new_cases"]]):
    y_pred = forecaster.fit_predict(y=y_train, fh=fh)
    errors.append(meap(y_test, y_pred))

print(np.mean(errors))

Number of Folds = 67


  if not hasattr(x, "freq") or x.freq is None:
  by *= x.freq


TypeError: Level type mismatch: 2020-02-20 00:00:00