In [None]:
import json
import os
from pathlib import Path
from typing import List, NamedTuple, Optional, Any, Dict

import pandas as pd
import numpy as np



In [None]:
# export 
# from gluon-ts/src/gluonts/dataset/repository/_util.py
def to_dict(
    target_values: np.ndarray,
    start: str,
    cat: Optional[List[int]] = None,
    item_id: Optional[Any] = None,
):
    """Build a JSON-serialisable record for one time series.

    Args:
        target_values: Observations of the series; iterated element by element.
        start: Start of the series; stored as ``str(start)``.
        cat: Optional static categorical features, stored under
            ``"feat_static_cat"`` when given.
        item_id: Optional identifier, stored under ``"item_id"`` when given.

    Returns:
        A dict with ``"start"`` and ``"target"`` plus the optional keys above.
        NaN observations become the string ``"NaN"``; all other values are
        rounded to six decimal places.
    """
    def _encode(value):
        # Round-trip through a fixed-precision string to cap the number of decimals.
        return "NaN" if np.isnan(value) else float("{0:.6f}".format(float(value)))

    record = {
        "start": str(start),
        "target": list(map(_encode, target_values)),
    }
    if cat is not None:
        record["feat_static_cat"] = cat
    if item_id is not None:
        record["item_id"] = item_id
    return record


def save_to_file(path: Path, data: List[Dict]):
    """Write ``data`` to ``path`` as JSON Lines.

    Each element of ``data`` is serialised with ``json.dumps`` and written as
    one UTF-8 encoded line. Missing parent directories are created first.

    Args:
        path: Destination file. May be a bare file name, in which case the
            file is written to the current working directory.
        data: JSON-serialisable dictionaries, one per output line.
    """
    print(f"saving time-series into {path}")
    # Path(...).parent is "." for a bare file name, whereas os.path.dirname
    # returns "" and os.makedirs("") raises FileNotFoundError.
    Path(path).parent.mkdir(parents=True, exist_ok=True)
    with open(path, "wb") as fp:
        for d in data:
            fp.write(json.dumps(d).encode("utf-8"))
            fp.write(b"\n")


def metadata(cardinality: int, freq: str, prediction_length: int):
    """Assemble the dataset-level metadata dictionary.

    Args:
        cardinality: Number of distinct values of the single static
            categorical feature; stored as a string.
        freq: Frequency string of the series.
        prediction_length: Forecast horizon in time steps.

    Returns:
        A dict with the keys ``"freq"``, ``"prediction_length"`` and
        ``"feat_static_cat"``.
    """
    static_cat_spec = {"name": "feat_static_cat", "cardinality": str(cardinality)}
    return dict(
        freq=freq,
        prediction_length=prediction_length,
        feat_static_cat=[static_cat_spec],
    )

In [None]:
# export
# from gluon-ts/src/gluonts/support/pandas.py

def frequency_add(
    ts: pd.Timestamp, amount: int, freq: Optional[Any] = None
) -> pd.Timestamp:
    """Shift ``ts`` forward by ``amount`` steps of a given frequency.

    Args:
        ts: Timestamp to shift.
        amount: Number of frequency steps to add.
        freq: Frequency string or pandas offset that defines one step. When
            omitted, ``ts.freq`` is used if the timestamp carries one.
            ``Timestamp.freq`` no longer exists in pandas >= 2.0, so pass
            ``freq`` explicitly there.

    Returns:
        The shifted timestamp.

    Raises:
        ValueError: If no frequency is given and ``ts`` does not carry one.
    """
    from pandas.tseries.frequencies import to_offset

    # getattr keeps the old behaviour on pandas versions that still expose
    # Timestamp.freq, without raising AttributeError on newer ones.
    step = freq if freq is not None else getattr(ts, "freq", None)
    if step is None:
        raise ValueError(
            "frequency_add needs a frequency: pass `freq` or a timestamp that carries one"
        )
    return ts + to_offset(step) * amount

def forecast_start(entry):
    """Return the timestamp one step past the last observation of ``entry``.

    ``entry`` is indexed with ``"start"`` and ``"target"``; the start is
    advanced by ``len(entry["target"])`` frequency steps via ``frequency_add``.
    """
    series_start = entry["start"]
    n_observed = len(entry["target"])
    return frequency_add(series_start, n_observed)

In [None]:
# export
# from gluon-ts/src/gluonts/dataset/repository/_lstnet.py
def load_from_pandas(
    df: pd.DataFrame,
    time_index: pd.DatetimeIndex,
    agg_freq: Optional[str] = None,
) -> List[pd.Series]:
    """Split a wide frame into one trimmed ``pd.Series`` per column.

    Args:
        df: Frame with one row per time step and one column per series.
        time_index: Index assigned to the rows of ``df``.
        agg_freq: When given, each series is resampled to this frequency and
            summed before trimming.

    Returns:
        One series per column of ``df``, each cut down to the span between its
        first and last non-null value (both inclusive).
    """
    # Transposing turns every original column into a row that iterrows() yields.
    per_series = df.set_index(time_index).transpose()

    def _trim(values) -> pd.Series:
        series = pd.Series(values, index=time_index)
        if agg_freq is not None:
            series = series.resample(agg_freq).sum()
        observed = series[series.notnull()].index
        return series.loc[observed[0]:observed[-1]]

    return [_trim(row.values) for _, row in per_series.iterrows()]

In [None]:
# export
# from gluon-ts/src/gluonts/dataset/repository/_lstnet.py
class LstnetDataset(NamedTuple):
    """Static description of one downloadable dataset in ``datasets_info``.

    ``generate_lstnet_dataset`` reads these fields to download the raw table,
    validate its shape, rebuild its time index and split it into train/test.
    """

    # Dataset identifier; equals the entry's key in ``datasets_info``.
    name: str
    # Location of the raw file handed to ``pd.read_csv``.
    url: str
    # Expected number of columns (one per series) in the raw table.
    num_series: int
    # Expected number of rows in the raw table; also the length of the time index.
    num_time_steps: int
    # Forecast horizon, in steps, used for the metadata and the test windows.
    prediction_length: int
    # Number of successive test windows generated per series.
    rolling_evaluations: int
    # Frequency string of the raw data, used to build the time index.
    freq: str
    # First timestamp of the time index.
    start_date: str
    # Optional resampling frequency; when set, series are resampled and summed.
    agg_freq: Optional[str] = None


# Base URL of the repository hosting the raw LSTNet benchmark files.
root = "https://raw.githubusercontent.com/laiguokun/multivariate-time-series-data/master/"

# Registry of the supported datasets, keyed by dataset name.
datasets_info = {
    spec.name: spec
    for spec in (
        LstnetDataset(
            name="exchange_rate",
            url=root + "exchange_rate/exchange_rate.txt.gz",
            num_series=8,
            num_time_steps=7588,
            prediction_length=30,
            rolling_evaluations=5,
            freq="1B",
            start_date="1990-01-01",
            agg_freq=None,
        ),
        # The original dataset is available at
        # https://archive.ics.uci.edu/ml/datasets/ElectricityLoadDiagrams20112014#
        # The aggregated version used by LSTNet drops, from the initial 370
        # series, those with no data in 2011.
        LstnetDataset(
            name="electricity",
            url=root + "electricity/electricity.txt.gz",
            num_series=321,
            num_time_steps=26304,
            prediction_length=24,
            rolling_evaluations=7,
            freq="1H",
            start_date="2012-01-01",
            agg_freq=None,
        ),
        # The original dataset at https://archive.ics.uci.edu/ml/datasets/PEMS-SF
        # has 963 series, but LSTNet uses only 862 of them.
        LstnetDataset(
            name="traffic",
            url=root + "traffic/traffic.txt.gz",
            num_series=862,
            num_time_steps=17544,
            prediction_length=24,
            rolling_evaluations=7,
            freq="H",
            start_date="2015-01-01",
            agg_freq=None,
        ),
        LstnetDataset(
            name="solar-energy",
            url=root + "solar-energy/solar_AL.txt.gz",
            num_series=137,
            num_time_steps=52560,
            prediction_length=24,
            rolling_evaluations=7,
            freq="10min",
            start_date="2006-01-01",
            agg_freq="1H",
        ),
    )
}

In [None]:


def generate_lstnet_dataset(dataset_path: Path, dataset_name: str):
    """Download one registered dataset and write it as train/test JSON Lines.

    Three files are produced under ``dataset_path``: ``metadata.json``,
    ``train/data.json`` and ``test/data.json``. The training file holds each
    series up to a cut-off placed at 80% of the first series' index. The test
    file holds, for each of ``rolling_evaluations`` windows, every series up to
    the end of that window.

    Args:
        dataset_path: Output directory (created if missing).
        dataset_name: Key into ``datasets_info``.

    Raises:
        KeyError: If ``dataset_name`` is not registered.
        AssertionError: If the downloaded table does not have the expected
            shape, or the number of generated records is not the expected one.
    """
    from pandas.tseries.frequencies import to_offset

    ds_info = datasets_info[dataset_name]

    # Frequency of the series actually written: when the raw data is
    # resampled, the stored series follow ``agg_freq`` rather than ``freq``.
    series_freq = ds_info.freq if ds_info.agg_freq is None else ds_info.agg_freq
    # One step of the stored series. Computed here rather than read from
    # ``Timestamp.freq``, which pandas >= 2.0 no longer provides.
    step = to_offset(series_freq)

    os.makedirs(dataset_path, exist_ok=True)

    with open(dataset_path / "metadata.json", "w") as f:
        json.dump(
            metadata(
                cardinality=ds_info.num_series,
                freq=series_freq,
                prediction_length=ds_info.prediction_length,
            ),
            f,
        )

    train_file = dataset_path / "train" / "data.json"
    test_file = dataset_path / "test" / "data.json"

    # The raw file carries no timestamps, so the index is rebuilt from the
    # registered start date, raw frequency and number of rows.
    time_index = pd.date_range(
        start=ds_info.start_date,
        freq=ds_info.freq,
        periods=ds_info.num_time_steps,
    )

    df = pd.read_csv(ds_info.url, header=None)

    assert df.shape == (
        ds_info.num_time_steps,
        ds_info.num_series,
    ), f"expected num_time_steps/num_series {(ds_info.num_time_steps, ds_info.num_series)} but got {df.shape}"

    timeseries = load_from_pandas(
        df=df, time_index=time_index, agg_freq=ds_info.agg_freq
    )

    # the last date seen during training: the timestamp at 80% of the first series
    ts_index = timeseries[0].index
    training_end = ts_index[int(len(ts_index) * (8 / 10))]

    train_ts = []
    for cat, ts in enumerate(timeseries):
        sliced_ts = ts[:training_end]
        if len(sliced_ts) > 0:
            train_ts.append(
                to_dict(
                    target_values=sliced_ts.values,
                    start=sliced_ts.index[0],
                    cat=[cat],
                    item_id=cat,
                )
            )

    assert len(train_ts) == ds_info.num_series

    save_to_file(train_file, train_ts)
    print('saved train file')

    # time of the first prediction of every rolling window
    prediction_dates = [
        training_end + step * (i * ds_info.prediction_length)
        for i in range(ds_info.rolling_evaluations)
    ]

    test_ts = []
    for prediction_start_date in prediction_dates:
        # The window end is the same for every series, so compute it once per window.
        prediction_end_date = prediction_start_date + step * ds_info.prediction_length
        for cat, ts in enumerate(timeseries):
            sliced_ts = ts[:prediction_end_date]
            test_ts.append(
                to_dict(
                    target_values=sliced_ts.values,
                    start=sliced_ts.index[0],
                    cat=[cat],
                    item_id=cat,
                )
            )

    assert len(test_ts) == ds_info.num_series * ds_info.rolling_evaluations

    save_to_file(test_file, test_ts)

In [None]:
# Build the solar-energy dataset files under ../data/solar-energy.
generate_lstnet_dataset(
    dataset_path=Path('../data/solar-energy'),
    dataset_name='solar-energy',
)

saving time-series into ../data/solar-energy/train/data.json
saved train file
saving time-series into ../data/solar-energy/test/data.json


In [None]:
# Scratch check: scaling an array of ones by the first element gives a constant
# array, even though `a` itself contains NaN in the other positions.
a = np.array([1, np.nan, np.nan])
a[0] * np.ones_like(a)

array([1., 1., 1.])

In [None]:
# Copy a subset of the M5 row files (names matching FOODS_2_1 / CA) into a small
# working directory.
import sys
# Put the parent directory on the import path; presumably so the local
# `fastseq` package resolves — TODO confirm.
sys.path.append('..')
from fastseq.core import *
path = Path('../data/m5/rows')
# NOTE(review): `i` is never updated or read in this cell; looks like a leftover counter.
i = 0
for f in path.glob('*FOODS_2_1*_CA_*.json'):
    # NOTE(review): the standard library's pathlib.Path only gained a `copy` method
    # in Python 3.14; on older versions this presumably relies on a method added
    # through the star import above — confirm. The target '../data/m5_tiny/mini'
    # also differs from the '../data/m5_tiny/m5_mini' directory used later in
    # this notebook — verify which one is intended.
    f.copy('../data/m5_tiny/mini')
# i

In [None]:
import orjson

In [None]:
# Directory holding the tiny M5 subset, and the feature schema describing its records.
p = Path('../data/m5_tiny/m5_mini')
meta = {
    "freq": "1D",
    "prediction_length": 28,
    "feat_static_cat": [{"name": "store_id"}, {"name": "state_id"}],
    "feat_static_real": [{"name": "item_id"}, {"name": "random"}],
    "feat_dynamic_cat": [{"name": "weekday"}, {"name": "month"}],
    # prices / dayofyear are the real-valued dynamic features. They used to sit
    # under a second 'feat_dynamic_cat' key, which silently replaced the
    # weekday / month entry above.
    "feat_dynamic_real": [{"name": "prices"}, {"name": "dayofyear"}],
}
# Make sure the target directory exists, then write the file through a context
# manager so it is flushed and closed before the next cell reads it.
p.mkdir(parents=True, exist_ok=True)
with open(p / 'metadata.json', 'w') as fp:
    json.dump(meta, fp)

In [None]:
# json.load expects an open file object rather than a path, so read the file's
# text and parse that; this works without any extra methods patched onto Path.
json.loads((p / 'metadata.json').read_text())

{'freq': '1D',
 'prediction_length': 28,
 'feat_static_cat': [{'name': 'store_id'}, {'name': 'state_id'}],
 'feat_static_real': [{'name': 'item_id'}, {'name': 'random'}],
 'feat_dynamic_cat': [{'name': 'prices'}, {'name': 'dayofyear'}]}

In [None]:
# Rewrite every raw row file under `p` in place. Files that already contain a
# 'start' key are left untouched, so re-running the cell is harmless.
series_start = pd.Timestamp('2011-01-29 00:00:00')
for i, f in enumerate(p.glob('*.json')):
    if f.name == 'metadata.json':
        # The schema file lives in the same directory and matches the glob,
        # but it is not a time-series record.
        continue
    try:
        # json.loads accepts bytes, so no file object (or patched Path) is needed.
        ts = json.loads(f.read_bytes())
    except Exception:
        print('could not load', f)
        raise  # keep the original traceback instead of a bare AssertionError
    if 'start' not in ts:
        # One calendar day per observation, starting at the fixed series start.
        days = pd.date_range(series_start, periods=ts['_length'], freq='D')
        dct = {
            'start': '2011-01-29 00:00:00',
            'target': ts['ts_con']['sales'],
            'feat_dynamic_real': [ts['ts_con']['prices'], days.dayofyear.tolist()],
            'feat_dynamic_cat': [days.weekday.tolist(), days.month.tolist()],
            'feat_static_cat': [ts['cat']['store_id'], ts['cat']['state_id']],
            # Second static real is unseeded random noise, so it differs on every run.
            'feat_static_real': [int(ts['cat']['item_id'].split('_')[-1]), np.random.randn()],
        }
        with open(f, 'wb') as fp:
            fp.write(orjson.dumps(dct))
        if i % 10 == 0:
            print(i, '/400')
    
# Print every field of the first record: its first ten elements, followed by the
# shape for numpy arrays or the value itself otherwise.
# NOTE(review): `l` is not defined in the visible part of this notebook; it is
# presumably a loaded dataset left over from another cell — confirm.
for k, v in l[0].items():
    is_array = type(v) is np.ndarray
    print(k, v[:10], v.shape if is_array else v)

280 /400
290 /400
300 /400
310 /400
320 /400
330 /400
340 /400
350 /400
360 /400
370 /400
380 /400
390 /400
start 2011-01-29 2011-01-29 00:00:00
target [2. 0. 0. 0. 0. 0. 0. 0. 0. 0.] (1969,)
feat_dynamic_real [[  2.88   2.88   2.88 ...   2.98   2.98   2.98]
 [ 29.    30.    31.   ... 169.   170.   171.  ]] (2, 1969)
feat_dynamic_cat [[5 6 0 ... 4 5 6]
 [1 1 1 ... 6 6 6]] (2, 1969)
feat_static_cat ['CA_1' 'CA'] (2,)
feat_static_real [ 1.76000000e+02 -1.01814516e-01] (2,)


In [None]:
# Load the AMZN Twitter-volume CSV straight from the NAB repository; the first
# row is the header and the first column becomes the index.
url = "https://raw.githubusercontent.com/numenta/NAB/master/data/realTweets/Twitter_volume_AMZN.csv"
df = pd.read_csv(url, header=0, index_col=0)
df.head()

Unnamed: 0_level_0,value
timestamp,Unnamed: 1_level_1
2015-02-26 21:42:53,57
2015-02-26 21:47:53,43
2015-02-26 21:52:53,55
2015-02-26 21:57:53,64
2015-02-26 22:02:53,93
