I came across the [TensorFlow time series tutorial](https://www.tensorflow.org/tutorials/structured_data/time_series) and wanted to see if I could apply it to [project 4](./project-4-stock-predictor.ipynb). This notebook relies on [Python 3.10](https://www.python.org/downloads/), [the project 4 datasets](./data/README.md), and a CUDA environment set up via [gpu-venv.sh](./gpu-venv.sh).

In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # https://stackoverflow.com/q/40426502/3178898
%load_ext cudf.pandas
import pandas as pd
import numpy as np
import tensorflow as tf # cuda warnings are a known bug https://github.com/tensorflow/tensorflow/issues/62075
keras = tf.keras
from keras import utils
from sklearn.model_selection import TimeSeriesSplit

In [2]:
def load_time_series(path:str, index:str, target:str, **kwargs) -> pd.DataFrame:
    """Load a single value column of a CSV file as a daily time series.

    Parameters
    ----------
    path : location of the CSV file (anything `pd.read_csv` accepts).
    index : name of the column holding the dates; it becomes the index.
    target : name of the one value column to keep.
    **kwargs : extra parsing options. Each keyword is forwarded to the
        call(s) that understand it: date-parsing options such as
        `dayfirst` reach `pd.to_datetime`, file-parsing options such as
        `sep` reach `pd.read_csv`.

    Returns
    -------
    A DataFrame containing only `target`, with rows that have missing
    values dropped, indexed by a daily `PeriodIndex` in ascending order.
    """
    # keywords only `pd.to_datetime` accepts; `pd.read_csv` would raise TypeError on them
    datetime_only = {'errors', 'yearfirst', 'utc', 'format', 'exact', 'unit', 'origin', 'cache'}
    # keywords both functions accept; these keep going to both calls
    shared = {'dayfirst', 'infer_datetime_format'}
    csv_kwargs = {key: value for key, value in kwargs.items() if key not in datetime_only}
    datetime_kwargs = {key: value for key, value in kwargs.items() if key in datetime_only | shared}

    data = pd.read_csv(path, index_col=index, usecols=[index, target], **csv_kwargs).dropna()
    # the index is parsed here rather than by read_csv, then reduced to day resolution
    data.index = pd.to_datetime(data.index, **datetime_kwargs).to_period('D')
    return data.sort_index()

# daily BTC closing prices
btc_data = load_time_series(path='./data/BTC-USD.csv', index='Date', target='Close')
# daily fear-and-greed values; this file's dates are parsed day-first
fng_data = load_time_series(path='./data/crypto-FNG.csv', index='date', target='fng_value', dayfirst=True)
# keep only the days present in both series and give the two columns short names
data = (
    btc_data
    .join(fng_data, how='inner')
    .set_axis(['close', 'fng'], axis='columns')
)
with pd.option_context('display.max_rows', 4):
    display(data)

Unnamed: 0,close,fng
2018-02-01,9170.540039,30
2018-02-02,8830.750000,15
...,...,...
2023-07-30,29275.308594,52
2023-08-01,29315.177734,53


In [3]:
# https://www.tensorflow.org/tutorials/structured_data/time_series#time
# Encode each calendar attribute as a (sin, cos) pair so that the end of a
# cycle sits next to its start. Keys are attributes of the index; values are
# the cycle length used to turn the attribute into an angle.
cycle_lengths = {'dayofweek': 7, 'dayofyear': 365.2425, 'week': 52.143}

features = pd.DataFrame(index=data.index)
for feature, period in cycle_lengths.items():
    # position within the cycle, expressed in radians
    angle = pd.Series(getattr(data.index, feature), index=data.index) * (2 * np.pi / period)
    features[f'{feature}_sin'] = np.sin(angle)
    features[f'{feature}_cos'] = np.cos(angle)

features

Unnamed: 0,dayofweek_sin,dayofweek_cos,dayofyear_sin,dayofyear_cos,week_sin,week_cos
2018-02-01,0.433884,-0.900969,0.523104,0.852269,0.566700,0.823924
2018-02-02,-0.433884,-0.900969,0.537687,0.843144,0.566700,0.823924
2018-02-03,-0.974928,-0.222521,0.552111,0.833770,0.566700,0.823924
2018-02-04,-0.781831,0.623490,0.566372,0.824150,0.566700,0.823924
2018-02-05,0.000000,1.000000,0.580465,0.814285,0.661633,0.749828
...,...,...,...,...,...,...
2023-07-27,0.433884,-0.900969,-0.422847,-0.906201,-0.455898,-0.890032
2023-07-28,-0.433884,-0.900969,-0.438373,-0.898793,-0.455898,-0.890032
2023-07-29,-0.974928,-0.222521,-0.453769,-0.891119,-0.455898,-0.890032
2023-07-30,-0.781831,0.623490,-0.469031,-0.883182,-0.455898,-0.890032


In [4]:
# normalization: z-score every column (subtract its mean, divide by its standard deviation)
# NOTE(review): the mean and std are computed over every row of `data`, which
# includes the rows later used as validation folds - confirm this is intended.
normalized = data.sub(data.mean(), axis='columns').div(data.std(), axis='columns')
normalized

Unnamed: 0,close,fng
2018-02-01,-0.730132,-0.608339
2018-02-02,-0.750958,-1.299943
2018-02-03,-0.729864,-0.147271
2018-02-04,-0.784898,-0.884981
2018-02-05,-0.865909,-1.484370
...,...,...
2023-07-27,0.498161,0.359905
2023-07-28,0.504814,0.406012
2023-07-29,0.507123,0.406012
2023-07-30,0.502121,0.406012


In [5]:
# place the z-scored values and the cyclical date features side by side;
# the sin/cos columns are left unscaled because they are already in [-1, 1]
processed = pd.concat(objs=[normalized, features], axis='columns')
processed

Unnamed: 0,close,fng,dayofweek_sin,dayofweek_cos,dayofyear_sin,dayofyear_cos,week_sin,week_cos
2018-02-01,-0.730132,-0.608339,0.433884,-0.900969,0.523104,0.852269,0.566700,0.823924
2018-02-02,-0.750958,-1.299943,-0.433884,-0.900969,0.537687,0.843144,0.566700,0.823924
2018-02-03,-0.729864,-0.147271,-0.974928,-0.222521,0.552111,0.833770,0.566700,0.823924
2018-02-04,-0.784898,-0.884981,-0.781831,0.623490,0.566372,0.824150,0.566700,0.823924
2018-02-05,-0.865909,-1.484370,0.000000,1.000000,0.580465,0.814285,0.661633,0.749828
...,...,...,...,...,...,...,...,...
2023-07-27,0.498161,0.359905,0.433884,-0.900969,-0.422847,-0.906201,-0.455898,-0.890032
2023-07-28,0.504814,0.406012,-0.433884,-0.900969,-0.438373,-0.898793,-0.455898,-0.890032
2023-07-29,0.507123,0.406012,-0.974928,-0.222521,-0.453769,-0.891119,-0.455898,-0.890032
2023-07-30,0.502121,0.406012,-0.781831,0.623490,-0.469031,-0.883182,-0.455898,-0.890032


In [8]:
# https://www.tensorflow.org/tutorials/structured_data/time_series#data_windowing
class WindowGenerator:
    """Slice train/validation frames into (inputs, labels) windows.

    A window is `input_width + shift` consecutive rows. The first
    `input_width` rows of a window are the inputs and the last `label_width`
    rows are the labels. With `shift=0` the window is exactly the input rows,
    so the label rows overlap the inputs.
    """

    def __init__(self, input_width:int, label_width:int, shift:int,
                 data:pd.DataFrame, train_idx:np.ndarray, valid_idx:np.ndarray,
                 label_cols:list[str]|None=None) -> None:
        """Store the two data splits and pre-compute the window geometry.

        Parameters
        ----------
        input_width : number of rows at the start of each window used as inputs.
        label_width : number of rows at the end of each window used as labels.
        shift : number of rows the window extends past the inputs.
        data : frame holding every feature column.
        train_idx, valid_idx : integer row positions (used with `.iloc`)
            selecting the training and validation rows of `data`.
        label_cols : columns to keep in the labels; `None` keeps every column.
        """
        # store raw data (TODO: a test split is not wired in yet)
        self._train_data = data.iloc[train_idx, :]
        self._valid_data = data.iloc[valid_idx, :]
        # get label columns; the lookup always exists so attribute access never fails
        self.label_cols = label_cols
        self.label_cols_idxs = {name: i for i, name in enumerate(label_cols or [])}
        self.col_idxs = {name: i for i, name in enumerate(data.columns)}
        # get window params
        self.input_width = input_width
        self.label_width = label_width
        self.shift = shift
        self.window_size = input_width + shift
        self.input_slice = slice(0, input_width)
        self.label_start = self.window_size - self.label_width
        self.label_slice = slice(self.label_start, None)
        # row positions inside one window covered by each slice
        idxs = np.arange(self.window_size)
        self.input_idxs = idxs[self.input_slice]
        self.label_idxs = idxs[self.label_slice]

    def __repr__(self) -> str:
        """Summarise the window size, input/label positions and label columns."""
        return '\n'.join([
            f'Window size: {self.window_size}',
            f'Input indices: {self.input_idxs}',
            f'Label indices: {self.label_idxs}',
            f'Label column(s): {self.label_cols}',
        ])

    def split_window(self, tensor:tf.Tensor) -> tuple[tf.Tensor, tf.Tensor]:
        """Split a batch of windows into inputs and labels.

        `tensor` is indexed as (batch, time, feature). Returns `(inputs, labels)`
        where the labels keep only `label_cols` when those were given.
        """
        inputs = tensor[:, self.input_slice, :]
        labels = tensor[:, self.label_slice, :]
        if self.label_cols is not None:
            labels = tf.stack([labels[:, :, self.col_idxs[name]] for name in self.label_cols], axis=-1)
        # Slicing does not preserve static shape information, so restore the time
        # dimension by hand; the lengths come from the same slices used above.
        inputs.set_shape([None, len(self.input_idxs), None])
        labels.set_shape([None, len(self.label_idxs), None])
        return inputs, labels

    def make_dataset(self, data:pd.DataFrame) -> tf.data.Dataset:
        """Turn a frame into a shuffled dataset of (inputs, labels) batches of 32 windows."""
        data = np.array(data, dtype=np.float32)
        # every run of `window_size` consecutive rows becomes one window (stride 1)
        dataset = utils.timeseries_dataset_from_array(data=data, targets=None, sequence_length=self.window_size,
                                                      sequence_stride=1, shuffle=True, batch_size=32)
        dataset = dataset.map(self.split_window)
        return dataset

    @property
    def train_data(self) -> tf.data.Dataset:
        """Windowed dataset built from the training rows (rebuilt on every access)."""
        return self.make_dataset(self._train_data)

    @property
    def valid_data(self) -> tf.data.Dataset:
        """Windowed dataset built from the validation rows (rebuilt on every access)."""
        return self.make_dataset(self._valid_data)

In [15]:
# Build one window generator per time-series fold and peek at a single batch.
# The split is computed on `processed` itself so the row positions always
# refer to the frame that is actually indexed.
for train_idx, valid_idx in TimeSeriesSplit().split(processed):
    # shift=1: the label is the row right after the 6 input rows. With shift=0
    # the label row is the last input row, so the `close` value to predict
    # would already be part of the inputs.
    w1 = WindowGenerator(input_width=6, label_width=1, shift=1,
                         data=processed, train_idx=train_idx, valid_idx=valid_idx, label_cols=['close'])
    for ei, el in w1.train_data.take(1):
        display(ei.shape)
        display(el.shape)

TensorShape([32, 6, 8])

TensorShape([32, 1, 1])

TensorShape([32, 6, 8])

TensorShape([32, 1, 1])

TensorShape([32, 6, 8])

TensorShape([32, 1, 1])

TensorShape([32, 6, 8])

TensorShape([32, 1, 1])

TensorShape([32, 6, 8])

TensorShape([32, 1, 1])

In [None]:
# checkpoint: https://www.tensorflow.org/tutorials/structured_data/time_series#single_step_models