In [30]:
import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from numba import njit
import seaborn as sns

from window_ops.rolling import rolling_mean


sns.set_context("talk")

In [8]:
df = pd.DataFrame({"y":np.random.rand(100000)})

In [36]:
%%timeit
result = df["y"].rolling(window=12).mean()

1.8 ms ± 5.8 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [33]:
@njit
def custom_mean(x):
    return rolling_mean(x, window_size=12)

In [39]:
1.8e-3 / (415e-6)

4.337349397590361

In [35]:
%%timeit
custom_mean(df["y"].values)

415 µs ± 21.2 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [32]:
%%timeit
result = df["y"].rolling(window=12).apply(custom_mean, engine="numba", raw=True)

TypingError: Failed in nopython mode pipeline (step: nopython frontend)
No implementation of function Function(<built-in function setitem>) found for signature:
 
 >>> setitem(array(float64, 1d, C), int64, array(float64, 1d, C))
 
There are 16 candidate implementations:
  - Of which 16 did not match due to:
  Overload of function 'setitem': File: <numerous>: Line N/A.
    With argument(s): '(array(float64, 1d, C), int64, array(float64, 1d, C))':
   No match.

During: typing of setitem at /Users/kishan_manani/.pyenv/versions/3.8.7/envs/udemy-ts/lib/python3.8/site-packages/pandas/core/window/numba_.py (70)

File "../../../.pyenv/versions/3.8.7/envs/udemy-ts/lib/python3.8/site-packages/pandas/core/window/numba_.py", line 70:
    def roll_apply(
        <source elided>
            if len(window) - count_nan >= minimum_periods:
                result[i] = numba_func(window, *args)
                ^


In [2]:
# Specify dtypes when loading data
# for memory efficiency

# Sales & static features data
sales_dtypes = {
    'id': 'category',
    'item_id': 'category',
    'dept_id': 'category',
    'cat_id': 'category',
    'store_id': 'category',
    'state_id': 'category',
    **{f'd_{i}': np.uint64 for i in range(1942)}
}
df = pd.read_csv("m5-forecasting-accuracy/sales_train_evaluation.csv", 
                 dtype=sales_dtypes)

# Calendar & promos
cal_dtypes = {
    'd': 'category',
    'wm_yr_wk': np.uint16,
    'event_name_1': 'category',
    'event_type_1': 'category',
    'event_name_2': 'category',
    'event_type_2': 'category',
    'snap_CA': np.uint8,
    'snap_TX': np.uint8,
    'snap_WI': np.uint8,
}
df_cal = pd.read_csv("m5-forecasting-accuracy/calendar.csv", dtype=cal_dtypes, parse_dates=["date"])


static_features = ["id", "item_id", "dept_id", "cat_id", "store_id", "state_id"]
df_static = df[static_features]

In [3]:
# Create panel view of time series
df_sales = df.melt(
    id_vars=['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'],
    var_name="d",
    value_name="y"
)

In [4]:
# df_sales["id"] = df_sales["id"].cat.codes

In [5]:
# Add date column to sales dataframe
df_sales = df_sales.merge(right=df_cal[["d", "date"]], on="d")

In [6]:
df_sales = df_sales.sort_values(by=["id", "date"])

In [7]:
# Set index to id and then date 
# so sktime can transform each time series
# separately
df_sales = df_sales.set_index(["id", "date"]) 

In [8]:
from sktime.transformations.series.summarize import WindowSummarizer
from sktime.transformations.series.time_since import TimeSince
from sktime.transformations.series.date import DateTimeFeatures

In [None]:
date_time_trasformer = DateTimeFeatures(manual_selection=[
    "day_of_month",
    "month_of_year",
    "day_of_week",
])

d_feats = date_time_trasformer.fit_transform(df_sales[["y"]])

CPU times: user 12 µs, sys: 1 µs, total: 13 µs
Wall time: 15.7 µs


In [35]:
time_since_transformer = TimeSince()
time_since_transformer.fit_transform(df_sales.head())

TypeError: X must be in an sktime compatible format, of scitype Series, Panel or Hierarchical, for instance a pandas.DataFrame with sktime compatible time indices, or with MultiIndex and last(-1) level an sktime compatible time index. Allowed compatible mtype format specifications are: ['pd.Series', 'pd.DataFrame', 'np.ndarray', 'nested_univ', 'numpy3D', 'pd-multiindex', 'df-list', 'pd_multiindex_hier'] . See the data format tutorial examples/AA_datatypes_and_datasets.ipynb. If you think the data is already in an sktime supported input format, run sktime.datatypes.check_raise(data, mtype) to diagnose the error, where mtype is the string of the type specification you want. Error message for checked mtypes, in format [mtype: message], as follows: [pd.DataFrame: <class 'pandas.core.indexes.multi.MultiIndex'> is not supported for X, use one of (<class 'pandas.core.indexes.range.RangeIndex'>, <class 'pandas.core.indexes.period.PeriodIndex'>, <class 'pandas.core.indexes.datetimes.DatetimeIndex'>) or integer index instead.]  [pd.Series: X must be a pandas.Series, found <class 'pandas.core.frame.DataFrame'>]  [np.ndarray: X must be a numpy.ndarray, found <class 'pandas.core.frame.DataFrame'>]  [df-list: X must be list of pd.DataFrame, found <class 'pandas.core.frame.DataFrame'>]  [numpy3D: X must be a numpy.ndarray, found <class 'pandas.core.frame.DataFrame'>]  [pd-multiindex: X.loc[i] must be Series of mtype pd.DataFrame, not at i=[0]]  [nested_univ: X entries must be pd.Series]  [pd_multiindex_hier: X must have a MultiIndex with 3 or more levels, found 2] 

In [34]:
df

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1932,d_1933,d_1934,d_1935,d_1936,d_1937,d_1938,d_1939,d_1940,d_1941
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,4,0,0,0,0,3,3,0,1
1,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,1,2,1,1,0,0,0,0,0
2,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,0,2,0,0,0,2,3,0,1
3,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,1,0,4,0,1,3,0,2,6
4,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,2,1,0,0,2,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30485,FOODS_3_823_WI_3_evaluation,FOODS_3_823,FOODS_3,FOODS,WI_3,WI,0,0,2,2,...,1,0,3,0,1,1,0,0,1,1
30486,FOODS_3_824_WI_3_evaluation,FOODS_3_824,FOODS_3,FOODS,WI_3,WI,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
30487,FOODS_3_825_WI_3_evaluation,FOODS_3_825,FOODS_3,FOODS,WI_3,WI,0,6,0,2,...,0,0,1,2,0,1,0,1,0,2
30488,FOODS_3_826_WI_3_evaluation,FOODS_3_826,FOODS_3,FOODS,WI_3,WI,0,0,0,0,...,1,1,1,4,6,0,1,1,1,0


In [None]:
# Features computed from the target.
# Compute lag and window features.
lag_window_transformer = WindowSummarizer(
    lag_feature={
        "lag": [1, 2],  # Lag features.
        "mean": [[1, 2], [1, 3]]  # [[lag, window size]]
    },
    target_cols=["y"],
#    truncate="bfill", # Backfill missing values from lagging and windowing.
    )

In [None]:
lag_window_transformer.fit_transform(df)

In [None]:
get_examples(mtype="pd-multiindex", as_scitype="Panel")[0]
