In [1]:
from datetime import datetime, timedelta

import pandas as pd 
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

from time_series_experiments.pipeline.dataset import DatasetConfig, read_dataset, VarType
from time_series_experiments.pipeline.validation import BacktestingCrossVal
from time_series_experiments.pipeline.data import to_task_data
from time_series_experiments.pipeline import Pipeline, ColumnsProcessor, Step
from time_series_experiments.pipeline.tasks import Wrap, OrdCat, DateFeatures

In [2]:
# Location of the NYC energy demand CSV and the variable type declared for
# each non-target column.
nyc_energy_url = 'https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/nyc_energy.csv'
column_types = {
    'precip': VarType.NUM,
    'temp': VarType.NUM,
    'timeStamp': VarType.DATE,
}

# NOTE(review): `feature_tyspes` looks like a misspelling of `feature_types`.
# It is kept verbatim because it has to match the keyword DatasetConfig
# accepts -- confirm against the library before renaming.
config = DatasetConfig(
    path=nyc_energy_url,
    date_col='timeStamp',
    target_col='demand',
    series_id_col=None,
    feature_tyspes=column_types,
)

In [3]:
def create_pipeline():
    """Build a fresh, unfitted preprocessing pipeline.

    The returned ``Pipeline`` has a single ``"prep"`` step: a
    ``ColumnsProcessor`` with one branch registered per variable type.

    * ``VarType.NUM``  -- mean imputation followed by standard scaling.
    * ``VarType.CAT``  -- constant ``"missing"`` imputation (with missing
      indicator) followed by ``OrdCat``.
    * ``VarType.DATE`` -- ``DateFeatures``, whose output goes through a
      nested ``ColumnsProcessor`` that scales NUM columns and applies
      ``OrdCat`` to CAT columns.

    Returns:
        Pipeline: a new pipeline instance on every call.
    """
    # Numeric branch.
    impute_mean = Step("impute", Wrap(SimpleImputer(strategy="mean")))
    standardize = Step("scale", Wrap(StandardScaler()))
    numeric_pipeline = Pipeline(steps=[impute_mean, standardize])

    # Categorical branch.
    fill_missing = SimpleImputer(
        strategy="constant", fill_value="missing", add_indicator=True
    )
    categorical_pipeline = Pipeline(
        steps=[Step("impute", Wrap(fill_missing)), Step("encode", OrdCat())]
    )

    # Date branch: derive features first, then encode them by type.
    derive_step = Step("derive", DateFeatures())
    derived_encoder = ColumnsProcessor(
        branches=[
            Step("num", Wrap(StandardScaler()), types=[VarType.NUM]),
            Step("cat", OrdCat(), types=[VarType.CAT]),
        ]
    )
    date_pipeline = Pipeline(steps=[derive_step, Step("encode", derived_encoder)])

    typed_branches = [
        Step("num", numeric_pipeline, types=[VarType.NUM]),
        Step("cat", categorical_pipeline, types=[VarType.CAT]),
        Step("dat", date_pipeline, types=[VarType.DATE]),
    ]
    return Pipeline(steps=[Step("prep", ColumnsProcessor(branches=typed_branches))])

In [4]:
# Load the dataset described by `config`. The result is used below as a
# pandas-style frame (`.pop`, `.values`, `.columns`).
nyc = read_dataset(config)

In [5]:
# Split the target column off the feature frame and keep it as an array.
# `pop` mutates `nyc` in place, so guard on the column still being present:
# re-running this cell without reloading `nyc` keeps the existing `y`
# instead of raising KeyError.
if config.target_col in nyc.columns:
    y = nyc.pop(config.target_col).values
elif "y" not in globals():
    # The target was never extracted and is not in the frame: fail fast,
    # as the unguarded `pop` would have.
    raise KeyError(config.target_col)

In [6]:
# Backtesting splitter over the feature frame (target already removed).
# NOTE(review): k=1 / validation_size=0.2 presumably mean a single fold that
# holds out 20% of the rows -- confirm against BacktestingCrossVal.
backtest_params = {"k": 1, "validation_size": 0.2}
cross_val = BacktestingCrossVal(data=nyc, config=config, **backtest_params)

In [18]:
for backtest in cross_val:
    # Row selectors for this fold.
    train_rows, test_rows = backtest.train_index, backtest.test_index
    feature_names = nyc.columns

    # Each frame is rebuilt from a plain array, so it gets a fresh default
    # index rather than the index of `nyc`.
    X_train = pd.DataFrame(nyc.values[train_rows], columns=feature_names)
    y_train = y[train_rows]
    X_test = pd.DataFrame(nyc.values[test_rows], columns=feature_names)
    y_test = y[test_rows]

    train = to_task_data(X_train, y_train, config)
    test = to_task_data(X_test, y_test, config)

    # Fit the preprocessing on the training fold only, then apply the fitted
    # pipeline to the test fold. `train` / `test` are rebound to the
    # transformed data and remain available after the loop.
    pipeline = create_pipeline()
    train, test = pipeline.fit_transform(train), pipeline.transform(test)

In [19]:
# Show the `X` array of the transformed test data left over from the last
# fold of the loop above.
test.X

array([[-0.16092953,  0.88611196,  1.69736034, ...,  5.        ,
         0.        ,  0.        ],
       [-0.16092953,  0.93340299,  1.69736034, ...,  6.        ,
         0.        ,  0.        ],
       [-0.16092953,  0.92143779,  1.69736034, ...,  7.        ,
         0.        ,  0.        ],
       ...,
       [-0.16092953,  0.86560018,  2.45925247, ...,  4.        ,
         0.        ,  0.        ],
       [-0.16092953,  0.79152989,  2.45925247, ...,  5.        ,
         0.        ,  0.        ],
       [-0.02356444,  0.71973868,  2.45925247, ...,  6.        ,
         0.        ,  0.        ]])