# Stock Forecasting using Transformers

In this notebook we implement a Transformer model to forecast stock data.

In [41]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' # https://stackoverflow.com/a/64438413

In [42]:
import glob
import logging
import pandas as pd
from pathlib import Path
import seaborn as sns
import sys
import tensorflow as tf
import tensorflow.keras as keras

## Environment Setup

This section contains code that is modifies output path locations, random seed, and logging.

In [43]:
DATASET_ROOT = Path('~/ml/datasets').expanduser()
if not DATASET_ROOT.exists(): raise ValueError(f"Dataset root directory does not exist at {DATASET_ROOT}")

In [44]:
# Set random seeds.
SEED = 0
tf.random.set_seed(SEED) # Only this works on ARC (since tensorflow==2.4).

In [45]:
# Setup logging (useful for ARC systems).
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG) # Must be lowest of all handlers listed below.
while logger.hasHandlers(): logger.removeHandler(logger.handlers[0]) # Clear all existing handlers.

# Custom log formatting.
formatter = logging.Formatter('%(asctime)s | %(levelname)s | %(message)s')

# Log to STDOUT (uses default formatting).
sh = logging.StreamHandler(stream=sys.stdout)
sh.setLevel(logging.INFO)
logger.addHandler(sh)

# Set Tensorflow logging level.
tf.get_logger().setLevel('ERROR') # 'INFO'

In [60]:
# Request user for Kaggle login if JSON file does not exist.
kaggle_config_file = Path("~/.kaggle/kaggle.json").expanduser()
if not kaggle_config_file.exists() and os.environ.get("KAGGLE_USERNAME", None) is None and os.environ.get("KAGGLE_KEY", None) is None:
    import json
    import getpass
    entry = getpass.getpass(prompt="Please enter your Kaggle username or JSON blob: ")
    try:
        blob = json.loads(entry)
        os.environ["KAGGLE_USERNAME"] = blob['username']
        os.environ["KAGGLE_KEY"] = blob['key']
    except:
        api_key = getpass.getpass(prompt="Please enter your Kaggle API KEY: ")
        os.environ["KAGGLE_USERNAME"] = entry
        os.environ["KAGGLE_KEY"] = api_key
else:
    logger.info('Kaggle API configured')

Kaggle API configured


In [46]:
# List all GPUs visible to TensorFlow.
gpus = tf.config.list_physical_devices('GPU')
logger.info(f"Num GPUs Available: {len(gpus)}")
for gpu in gpus:
    logger.info(f"Name: {gpu.name}, Type: {gpu.device_type}")

Num GPUs Available: 0


## Huge Stock Market Dataset from Kaggle

https://www.kaggle.com/datasets/borismarjanovic/price-volume-data-for-all-us-stocks-etfs

In [54]:
class HugeStockMarketDataset:
    """Wrapper for Huge Stock Market Dataset by Boris Marjanovic on Kaggle.

    Source URL is: https://www.kaggle.com/borismarjanovic/price-volume-data-for-all-us-stocks-etfs/version/3

    This class can be used like a Python dictionary, where keys are the stock/etf names, and values are
    `pandas.DataFrame` objects corresponding to that stock/etf.
    """
    root = 'HugeStockMarketDataset'

    def __init__(self, 
        path: str,
        files: list = None,
        quiet: bool = False,
        exclude_stocks: bool = False,
        exclude_etfs: bool = False,
        usecols: list[str] = ['Date','Open','High','Low','Close','Volume','OpenInt'],
        ):
        self.exclude_stocks = exclude_stocks
        self.exclude_etfs = exclude_etfs
        self.usecols = usecols
        self._index = {}

        # Download the dataset if necessary.
        newpath = Path(path).expanduser()/self.root
        if not newpath.exists():
            self.download(newpath, files, quiet=quiet)
        else:
            self.path = newpath
            self._build_index()

    def _build_index(self):
        """Creates an internal index of stocks and ETFs for lookup."""

        # Helper function to index a folder of files.
        def _index_folder(dir: Path):
            for file in glob.iglob(str(dir/'*.txt'), recursive=True):
                filename = Path(file).name
                product_name = filename.split('.', maxsplit=1)[0]
                self._index[product_name] = file

        # Index all stocks.
        if not self.exclude_stocks:
            _index_folder(self.path/'Stocks')
        
        # Index all ETFs.
        if not self.exclude_etfs:
            _index_folder(self.path/'ETFs')

    def download(self, path: str, files: list = None, quiet: bool = True):
        """Downloads the dataset from Kaggle.

        Args:
            path (str): The path to place the download.
            files (list, optional): Subset list of files to download instead of entire dataset. Defaults to None.
            quiet (bool, optional): Suppress verbose output. Defaults to True.
        """
        import kaggle
        kaggle_dataset = 'borismarjanovic/price-volume-data-for-all-us-stocks-etfs'
        kaggle.api.authenticate()

        # Save the new downloaded path.
        self.path = Path(path).expanduser()

        # Specific file list was given.
        if files is not None:
            for f in files:
                kaggle.api.dataset_download_file(
                    dataset=kaggle_dataset,
                    file_name=f,
                    path=path/f,
                    quiet=quiet,
                )
        # Download all files.
        else:
            kaggle.api.dataset_download_files(
                dataset=kaggle_dataset,
                path=path,
                unzip=True,
                quiet=quiet,
            )

        # Force rebuild the index after downloading.
        logger.info("Building file index")
        self._build_index()

    def get_dataframe(self, 
        key: str,
        **kwargs,
        ) -> pd.DataFrame:
        """Obtain historical data for stock or ETF in a pandas dataframe.

        Optional keyword arguments are passed directly to `pandas.read_csv` function.

        Args:
            key (str): The identifier for the stock or ETF.

        Returns:
            pd.DataFrame: Historical data.
        """
        return pd.read_csv(self._index[key], **kwargs)

    #### Dictionary Override ######

    def __getitem__(self, key):
        if isinstance(key, str):
            return self.get_dataframe(key)
        elif isinstance(key, list):
            return [self.get_dataframe(asset) for asset in key]

    def __delitem__(self, key):
        del self._index[key]

    def __iter__(self):
        return iter(self._index)

    def items(self):
        for key in self:
            yield key, self.get_dataframe(key)

    def __len__(self):
        return len(self._index)

    def keys(self):
        """Returns a list of all downloaded stocks and ETFs."""
        return self._index.keys()

    ###############################

Now lets actually load the dataset. In this case, we're only looking at the `aapl` stock.

In [None]:
dataset = HugeStockMarketDataset(DATASET_ROOT)
df = dataset['aapl']

## Time2Vec Embedding

https://arxiv.org/abs/1907.05321

In [48]:
class Time2Vec(keras.layers.Layer):
    def __init__(self, embed_dim: int, activation: str = 'sin', **kwargs):
        """Vector embedding representation of time.

        Based on the original concept proposed by Kazemi et al., 2019 (https://arxiv.org/abs/1907.05321).

        Args:
            embed_dim (int): Length of the time embedding vector.
            activation (str, optional): Periodic activation function. Possible values are ['sin', 'cos']. Defaults to 'sin'.
        """
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.activation = activation.lower() # Convert to lower-case.

        # Set periodic activation function.
        if self.activation.startswith('sin'):
            self.activation_func = tf.sin
        elif self.activation.startswith('cos'):
            self.activation_func = tf.cos
        else:
            raise ValueError(f'Unsupported periodic activation function "{activation}"')

    def build(self, input_shape: list[int]):

        # Weight and bias term for linear portion (i = 0)
        # of embedding.
        self.w_linear = self.add_weight(
            name='w_linear',
            shape=(input_shape[1], 1,),
            initializer='uniform',
            trainable=True,
        )
        self.b_linear = self.add_weight(
            name='b_linear',
            shape=(input_shape[1], 1,),
            initializer='uniform',
            trainable=True,
        )

        # Weight and bias terms for the periodic
        # portion (1 <= i <= k) of embedding.
        self.w_periodic = self.add_weight(
            name='w_periodic',
            shape=(input_shape[-1], self.embed_dim,),
            initializer='uniform',
            trainable=True,
        )
        self.b_periodic = self.add_weight(
            name='b_periodic',
            shape=(input_shape[1], self.embed_dim,),
            initializer='uniform',
            trainable=True,
        )

    def call(self, x: tf.Tensor) -> tf.Tensor:
        """Embed input into linear and periodic feature components.

        Args:
            x (tf.Tensor): Input tensor with shape (batch_size, sequence_length, feature_size)

        Returns:
            tf.Tensor: Output tensor with shape (batch_size, sequence_length, embed_dim + 1)
        """

        # Linear term (i = 0).
        embed_linear = tf.tensordot(x, self.w_linear, axes=1) + self.b_linear

        # Periodic terms (1 <= i <= k).
        inner = tf.tensordot(x, self.w_periodic, axes=1) + self.b_periodic
        embed_periodic = self.activation_func(inner)

        # Return concatenated linear and periodic features.
        return tf.concat([embed_linear, embed_periodic], axis=-1)

    def get_config(self) -> dict:
        """Retreive custom layer configuration for future loading.

        Returns:
            dict: Configuration dictionary.
        """
        config = super().get_config().copy()
        config.update({
            'embed_dim': self.embed_dim,
            'activation': self.activation,
        })
        return config

stock_feat = 5
seq_len = 128
embed_dim = 32
inp = keras.Input(shape=(seq_len, stock_feat))
logger.info(f"{inp.shape=}")
x = Time2Vec(embed_dim)(inp)
logger.info(f"{x.shape=}")
x = keras.layers.Concatenate(axis=-1)([inp, x])
logger.info(f"{x.shape=}")

inp.shape=TensorShape([None, 128, 5])
x.shape=TensorShape([None, 128, 33])
x.shape=TensorShape([None, 128, 38])


## Transformer Architecture

https://arxiv.org/abs/1706.03762

### Attention Layers

Currently uses attention layers provided by TensorFlow. See https://www.tensorflow.org/api_docs/python/tf/keras/layers/MultiHeadAttention.

In [49]:
# class MultiHeadAttention(keras.layers.Layer):
#     def __init__(self, d_k: int, d_v: int, n_heads: int):
#         """Single-head attention layer.

#         Based on the original concept proposed by Vaswani et al., 2017 (https://arxiv.org/abs/1706.03762).

#         Args:
#             d_k (int): Key dimension (also used for Query dimension).
#             d_v (int): Value dimension.
#             n_heads (int): Number of attention heads.
#         """
#         self.d_k = d_k # Query and Key have same dimension.
#         self.d_v = d_v
#         self.n_heads = n_heads # Number of attention heads.
#         self.heads = [] # List of attention layers as heads.

#     def build(self, input_shape: list[int]):

#         # Build attention heads.
#         self.heads = [
#             keras.layers.Attention()
#             for i in range(self.n_heads)
#         ]

#         # Build linear relationship between 

In [50]:
# class Attention(keras.layers.Layer):
#     def __init__(self, d_k: int, d_v: int):
#         """Single-head attention layer.

#         Based on the original concept proposed by Vaswani et al., 2017 (https://arxiv.org/abs/1706.03762).

#         Args:
#             d_k (int): Key dimension (also used for Query dimension).
#             d_v (int): Value dimension.
#         """
#         self.d_k = d_k # Query and Key have same dimension.
#         self.d_v = d_v

#     def build(self, input_shape: list[int]):
#         self.query = keras.layers.Dense(
#             units=self.d_k,
#             input_shape=input_shape,
#             kernel_initializer='glorot_uniform', 
#             bias_initializer='glorot_uniform',
#         )
#         self.key = keras.layers.Dense(
#             units=self.d_k,
#             input_shape=input_shape,
#             kernel_initializer='glorot_uniform', 
#             bias_initializer='glorot_uniform',
#         )
#         self.value = keras.layers.Dense(
#             units=self.d_v,
#             input_shape=input_shape,
#             kernel_initializer='glorot_uniform', 
#             bias_initializer='glorot_uniform',
#         )

#     def call(self, x: tf.Tensor) -> tf.Tensor:
        


#     def get_config(self) -> dict:
#         """Retreive custom layer configuration for future loading.

#         Returns:
#             dict: Configuration dictionary.
#         """
#         config = super().get_config().copy()
#         config.update({
#             'd_k': self.d_k,
#             'd_v': self.d_v,
#         })
#         return config

### Transformer Encoder Layer

In [51]:
class TransformerEncoder(keras.layers.Layer):
    def __init__(self,
        d_k: int,
        d_v: int,
        n_heads: int,
        d_model: int,
        dropout: float = 0.0,
        **kwargs,
        ):
        """Transformer encoder layer.

        Based on the original concept proposed by Vaswani et al., 2017 (https://arxiv.org/abs/1706.03762).

        Args:
            d_k (int): Key dimension (also used for Query dimension).
            d_v (int): Value dimension.
            n_heads (int): Number of attention heads.
            d_model (int): Dimension of the feed forward sublayer.
            dropout (float, optional): Dropout rate. Defaults to 0.0.
        """
        super().__init__(**kwargs)
        self.d_k = d_k # Query and Key have same dimension.
        self.d_v = d_v
        self.n_heads = n_heads # Number of attention heads.
        self.d_model = d_model
        self.dropout = dropout

    def build(self, input_shape: tuple[tf.TensorShape,tf.TensorShape,tf.TensorShape]):

        # First sublayer.
        # Multi-head attention with add and norm.
        self.attn_multi = keras.layers.MultiHeadAttention(
            num_heads=self.n_heads,
            key_dim=self.d_k,
            value_dim=self.d_v,
        )
        self.attn_multi._build_from_signature(*input_shape)
        self.attn_dropout = keras.layers.Dropout(rate=self.dropout)
        self.attn_add = keras.layers.Add()
        self.attn_norm = keras.layers.LayerNormalization(epsilon=1e-6)

        # Second sublayer.
        # Feed forward with add and norm.
        d_query_feat = input_shape[0][-1] # Query feature size.
        self.ff_dense_1 = keras.layers.Dense(units=self.d_model, activation='relu')
        self.ff_dense_2 = keras.layers.Dense(units=d_query_feat)
        self.ff_dropout = keras.layers.Dropout(rate=self.dropout)
        self.ff_add = keras.layers.Add()
        self.ff_norm = keras.layers.LayerNormalization(epsilon=1e-6)

    def call(self, x: tuple[tf.Tensor,tf.Tensor,tf.Tensor]) -> tf.Tensor:
        """Encode input using multi-head self-attention mechanisms.

        Args:
            x (tf.Tensor): Tuple of Query, Value, and Key tensors. Note that the Key tensor is optional, if omitted the Value tensor will be used for both Key and Value.

        Returns:
            tf.Tensor: Output tensor with shape (batch_size, sequence_length, embed_dim + 1)
        """
        # x = (query, value, key)
        # note that "key" is optional.

        # First, do the attention sublayer.
        x_attn = self.attn_multi(*x) # Unpack input as Query, Value, and optional Key.
        x_attn = self.attn_dropout(x_attn)
        x_attn = self.attn_add([x[0], x_attn]) # (residual) Add Query matrix with result of attention layer.
        x_attn = self.attn_norm(x_attn) # Normalize the residual.

        # Second, do the feed forward sublayer.
        x_ff = self.ff_dense_1(x_attn)
        x_ff = self.ff_dense_2(x_ff)
        x_ff = self.ff_dropout(x_ff)
        x_ff = self.ff_add([x_attn, x_ff])
        x_ff = self.ff_norm(x_ff)

        # Return output of feed forward sublayer.
        return x_ff

    def get_config(self) -> dict:
        """Retreive custom layer configuration for future loading.

        Returns:
            dict: Configuration dictionary.
        """
        config = super().get_config().copy()
        config.update({
            'n_heads': self.n_heads,
            'd_k': self.d_k,
            'd_v': self.d_v,
            'd_model': self.d_model,
            'dropout': self.dropout,
        })
        return config


stock_feat = 5
seq_len = 128
embed_dim = 32
d_k = 512
d_v = 256
n_heads = 8
d_model = 512
inp = keras.Input(shape=(seq_len, stock_feat))
logger.info(f"{inp.shape=}")
x = Time2Vec(embed_dim)(inp)
logger.info(f"Time2Vec {x.shape=}")
x = keras.layers.Concatenate(axis=-1)([inp, x])
logger.info(f"Concatenate {x.shape=}")
x = TransformerEncoder(d_k, d_v, n_heads, d_model)([x, x, x])
logger.info(f"TransformerEncoder {x.shape=}")
x = TransformerEncoder(d_k, d_v, n_heads, d_model)([x, x, x])
logger.info(f"TransformerEncoder {x.shape=}")
x = TransformerEncoder(d_k, d_v, n_heads, d_model)([x, x, x])
logger.info(f"TransformerEncoder {x.shape=}")
x = keras.layers.GlobalAvgPool1D(data_format='channels_first')(x)
logger.info(f"GlobalAvgPool1D {x.shape=}")

inp.shape=TensorShape([None, 128, 5])
Time2Vec x.shape=TensorShape([None, 128, 33])
Concatenate x.shape=TensorShape([None, 128, 38])
TransformerEncoder x.shape=TensorShape([None, 128, 38])
TransformerEncoder x.shape=TensorShape([None, 128, 38])
TransformerEncoder x.shape=TensorShape([None, 128, 38])
GlobalAvgPool1D x.shape=TensorShape([None, 128])


### Model Definition

In [52]:
def build_model(
    in_seq_len: int,
    in_feat: int,
    out_feat: int,
    fc_units: list[int], # list of fully-connected dimensions before classifier.
    embed_dim: int,
    d_k: int,
    d_v: int,
    n_heads: int,
    d_model: int,
    dropout: float = 0.0,
    n_encoders: int = 3,
    ):

    # Input sequence of features.
    inp = keras.Input(shape=(in_seq_len, in_feat))
    # Time embedding.
    x = Time2Vec(embed_dim)(inp)
    # Combine input with embedding to form attention input features.
    x = keras.layers.Concatenate(axis=-1)([inp, x])
    # Pass combined featured through cascaded self-attention encoder sublayers.
    for _ in range(n_encoders):
        x = TransformerEncoder(
            d_k=d_k,
            d_v=d_v,
            n_heads=n_heads,
            d_model=d_model,
            dropout=dropout,
        )((x, x, x)) # (query, value, key)
    # Downsample to the original sequence dimension.
    x = keras.layers.GlobalAvgPool1D(data_format='channels_first')(x) # shape=(in_seq_len,)
    x = keras.layers.Dropout(rate=dropout)(x)
    # Fully-connected network before classifier.
    for units in fc_units: 
        x = keras.layers.Dense(units=units, activation='relu')(x)
        x = keras.layers.Dropout(rate=dropout)(x)
    # Classifier.
    x = keras.layers.Dense(units=out_feat, activation='linear')(x)

    # Construct model class and return.
    return keras.Model(inputs=inp, outputs=x)


kwargs = dict(
    in_seq_len=128, # Number of days in the past.
    in_feat=5, # Number of features for each day in the past.
    out_feat=3, # Number of features on 1-day horizon.
    fc_units=[64,64],
    embed_dim=32,
    d_k=512,
    d_v=256,
    n_heads=8,
    d_model=512,
)
model = build_model(**kwargs)
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_4 (InputLayer)           [(None, 128, 5)]     0           []                               
                                                                                                  
 time2_vec_2 (Time2Vec)         (None, 128, 33)      4512        ['input_4[0][0]']                
                                                                                                  
 concatenate_2 (Concatenate)    (None, 128, 38)      0           ['input_4[0][0]',                
                                                                  'time2_vec_2[0][0]']            
                                                                                                  
 transformer_encoder_3 (Transfo  (None, 128, 38)     516836      ['concatenate_2[0][0]',      