In [1]:
#| default_exp models.timellm

In [2]:
#| hide

%load_ext autoreload
%autoreload 2

# Time-LLM
Time-LLM is a reprogramming framework to repurpose LLMs for general time series forecasting with the backbone language models kept intact. In other words, it transforms a forecasting task into a "language task" that can be tackled by an off-the-shelf LLM.

In [3]:
#| hide
from fastcore.test import test_eq
from nbdev.showdoc import show_doc

In [4]:
#| export
import warnings
import torch
import torch.nn as nn
import pytorch_lightning as pl
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import optuna
import math
from optuna.trial import Trial
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger
from transformers import AutoConfig, AutoModel, AutoTokenizer
from gen_time_llm.common._base_model import BaseModel
from gen_time_llm.common._modules import RevIN

## 1. Auxiliary Functions

In [5]:
#| export

class ReplicationPad1d(nn.Module):
    """
    ReplicationPad1d

    Extends the last dimension of a 3-D tensor on the right by repeating
    its final element. Only the last entry of `padding` is consulted; it is
    the number of copies appended. Any other entries of `padding` are stored
    but not used by `forward`.
    """
    def __init__(self, padding):
        super().__init__()
        self.padding = padding

    def forward(self, input):
        # Number of replicated values to append on the right-hand side.
        pad_right = self.padding[-1]
        # Final value along the last axis, kept as a trailing singleton axis.
        last_value = input[:, :, -1]
        last_value = last_value.unsqueeze(-1)
        tail = last_value.repeat(1, 1, pad_right)
        return torch.cat((input, tail), dim=-1)
    
class TokenEmbedding(nn.Module):
    """
    TokenEmbedding

    Projects each position of a sequence from `c_in` features to `d_model`
    features with a bias-free, kernel-size-3 `Conv1d` using circular padding.
    The convolution weight is initialised with Kaiming-normal (fan-in,
    leaky-ReLU gain).

    `forward` takes a tensor laid out as (batch, length, c_in) and returns
    (batch, length_out, d_model); the feature axis is moved to the channel
    position for the convolution and moved back afterwards.
    """
    def __init__(self, c_in, d_model):
        super().__init__()
        # Padding width depends on the installed torch version.
        if torch.__version__ >= '1.5.0':
            padding = 1
        else:
            padding = 2
        self.tokenConv = nn.Conv1d(
            in_channels=c_in,
            out_channels=d_model,
            kernel_size=3,
            padding=padding,
            padding_mode='circular',
            bias=False,
        )
        # tokenConv is the only Conv1d in this module, so initialise it directly.
        nn.init.kaiming_normal_(
            self.tokenConv.weight, mode='fan_in', nonlinearity='leaky_relu')

    def forward(self, x):
        channels_first = x.permute(0, 2, 1)
        embedded = self.tokenConv(channels_first)
        return embedded.transpose(1, 2)
    
class PatchEmbedding(nn.Module):
    """
    PatchEmbedding

    Cuts each series into overlapping patches and embeds every patch as a
    `d_model`-dimensional token.

    `forward` right-pads the last axis by `stride` replicated values, slides
    a window of `patch_len` with step `stride` over it, merges the first two
    axes, embeds the patches with `TokenEmbedding` and applies dropout.
    It returns the embedded patches together with the size of the input's
    second axis (`n_vars`). No positional embedding is added.
    """
    def __init__(self, d_model, patch_len, stride, dropout):
        super().__init__()
        self.patch_len = patch_len
        self.stride = stride

        # Patching: replicate the last value `stride` times on the right.
        self.padding_patch_layer = ReplicationPad1d((0, stride))
        # Input encoding: map each patch (length patch_len) to d_model features.
        self.value_embedding = TokenEmbedding(patch_len, d_model)
        # Residual dropout applied to the embedded patches.
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        n_vars = x.shape[1]

        padded = self.padding_patch_layer(x)
        patches = padded.unfold(dimension=-1, size=self.patch_len, step=self.stride)

        # Fold the first two axes together so every series is embedded independently.
        merged_shape = (patches.shape[0] * patches.shape[1], patches.shape[2], patches.shape[3])
        patches = torch.reshape(patches, merged_shape)

        embedded = self.value_embedding(patches)
        return self.dropout(embedded), n_vars
    
class FlattenHead(nn.Module):
    """
    FlattenHead

    Output head that merges the last two axes of its input, applies a linear
    map from `nf` to `target_window` features, and then dropout with
    probability `head_dropout`. `n_vars` is stored on the module but is not
    used by `forward`.
    """
    def __init__(self, n_vars, nf, target_window, head_dropout=0):
        super().__init__()
        self.n_vars = n_vars
        self.flatten = nn.Flatten(start_dim=-2)
        self.linear = nn.Linear(nf, target_window)
        self.dropout = nn.Dropout(head_dropout)

    def forward(self, x):
        # flatten -> linear projection -> dropout, applied in that order.
        flattened = self.flatten(x)
        return self.dropout(self.linear(flattened))
    
class ReprogrammingLayer(nn.Module):
    """
    ReprogrammingLayer

    Multi-head cross-attention in which a batch of target embeddings
    (queries) attends over a single, batch-independent set of source
    embeddings (keys and values). The attended result is projected to
    `d_llm` features.

    `d_keys` is the per-head width; when it is falsy it defaults to
    `d_model // n_heads`. `d_llm` is the feature width of the source/value
    embeddings and of the output.
    """
    def __init__(self, d_model, n_heads, d_keys=None, d_llm=None, attention_dropout=0.1):
        super().__init__()

        if not d_keys:
            d_keys = d_model // n_heads
        inner_dim = d_keys * n_heads

        self.query_projection = nn.Linear(d_model, inner_dim)
        self.key_projection = nn.Linear(d_llm, inner_dim)
        self.value_projection = nn.Linear(d_llm, inner_dim)
        self.out_projection = nn.Linear(inner_dim, d_llm)
        self.n_heads = n_heads
        self.dropout = nn.Dropout(attention_dropout)

    def forward(self, target_embedding, source_embedding, value_embedding):
        # target: (batch, length, d_model); source/value: (num_sources, d_llm).
        batch, length, _ = target_embedding.shape
        num_sources, _ = source_embedding.shape
        heads = self.n_heads

        # Project and split the feature axis into `heads` chunks.
        queries = self.query_projection(target_embedding).view(batch, length, heads, -1)
        keys = self.key_projection(source_embedding).view(num_sources, heads, -1)
        values = self.value_projection(value_embedding).view(num_sources, heads, -1)

        attended = self.reprogramming(queries, keys, values)

        # Merge the heads back together before the output projection.
        merged = attended.reshape(batch, length, -1)
        return self.out_projection(merged)

    def reprogramming(self, target_embedding, source_embedding, value_embedding):
        # Scaled dot-product attention; the source set is shared by the whole batch.
        _, _, _, head_dim = target_embedding.shape
        scale = 1. / math.sqrt(head_dim)

        scores = torch.einsum("blhe,she->bhls", target_embedding, source_embedding)
        weights = torch.softmax(scale * scores, dim=-1)
        weights = self.dropout(weights)

        return torch.einsum("bhls,she->blhe", weights, value_embedding)

## 2. TimeLLM

In [6]:
class TimeLLM(BaseModel):

    """ TimeLLM

    Time-LLM is a reprogramming framework to repurpose an off-the-shelf LLM for time series tasks.

    It trains a reprogramming layer that translates the observed series into the embedding space of the LLM.
    In this implementation the reprogrammed patches are concatenated after the embeddings of a textual prompt
    (country, sectors and per-feature statistics) and passed to a GPT-2 language-model head; `forward` returns a
    token-level cross-entropy loss between the resulting logits and the `target` token ids.

    Parameters
    ----------
    random_seed, max_length, num_beams, **kwargs :
        Forwarded unchanged to `BaseModel.__init__`.
    input_size : int
        Length of the input window; only used to derive `self.patch_nums`.
    patch_len, stride : int
        Patch length and step used by `PatchEmbedding`.
    d_ff : int
        Passed to `ReprogrammingLayer` as its per-head width (`d_keys`).
    top_k : int
        Stored on the instance; not consulted by the code in this class.
    d_llm : int
        Output width of the reprogramming layer. It has to equal the LLM's embedding width, because the
        reprogrammed patches are concatenated with the prompt embeddings in `encode`.
    d_model : int
        Width of the patch embeddings.
    n_heads : int
        Number of attention heads of the reprogramming layer.
    enc_in : int
        Passed to `RevIN` as its first argument.
    dec_in : int
        Stored on the instance; not consulted by the code in this class.
    llm : str, optional
        Name or path given to the `from_pretrained` loaders. Defaults to "openai-community/gpt2".
    llm_config, llm_tokenizer :
        Deprecated and ignored; a `DeprecationWarning` is emitted when either is given.
    llm_num_hidden_layers, llm_output_attention, llm_output_hidden_states :
        Stored on the instance; not consulted by the code in this class.
    dropout : float
        Dropout probability of the patch embedding.
    base_lr : float
        Learning rate of the AdamW optimizer.
    """

    def __init__(
        self,
        random_seed,
        input_size,
        patch_len: int = 4,
        stride: int = 2,
        d_ff: int = 128,
        top_k: int = 5,
        d_llm: int = 768,
        d_model: int = 32,
        n_heads: int = 8,
        enc_in: int = 7,
        dec_in: int  = 7,
        llm = None,
        llm_config = None,
        llm_tokenizer = None,
        llm_num_hidden_layers = 32,
        llm_output_attention: bool = True,
        llm_output_hidden_states: bool = True,
        dropout=0.1,
        base_lr=1e-5,  # Learning rate
        max_length=512,  # Maximum length of generated sequences
        num_beams=3,  # Number of beams for beam search
        **kwargs
    ):
        super().__init__(
            random_seed=random_seed,
            max_length=max_length,
            num_beams=num_beams,
            **kwargs
        )

        self.base_lr = base_lr

        self.patch_len = patch_len
        self.stride = stride
        self.d_ff = d_ff
        self.top_k = top_k
        self.d_llm = d_llm
        self.d_model = d_model
        self.dropout = dropout
        self.n_heads = n_heads
        self.enc_in = enc_in
        self.dec_in = dec_in

        DEFAULT_MODEL = "openai-community/gpt2"

        if llm is None:
            print(f"Using {DEFAULT_MODEL} as default.")
            model_name = DEFAULT_MODEL
        else:
            model_name = llm

        if llm_config is not None or llm_tokenizer is not None:
            warnings.warn("'llm_config' and 'llm_tokenizer' parameters are deprecated and will be ignored. "
                        "The config and tokenizer will be automatically loaded from the specified model.", 
                        DeprecationWarning,
                        stacklevel=2)

        try:
            self.llm_config = AutoConfig.from_pretrained(model_name)
            self.llm = AutoModel.from_pretrained(model_name, config=self.llm_config)
            self.llm_tokenizer = AutoTokenizer.from_pretrained(model_name)
            # NOTE(review): the LM head is always loaded as GPT-2, whatever `model_name` is.
            self.llm_head = GPT2LMHeadModel.from_pretrained(model_name)
            print(f"Successfully loaded model: {model_name}")
        except EnvironmentError:
            print(f"Failed to load {model_name}. Loading the default model ({DEFAULT_MODEL})...")
            self.llm_config = AutoConfig.from_pretrained(DEFAULT_MODEL)
            self.llm = AutoModel.from_pretrained(DEFAULT_MODEL, config=self.llm_config)
            self.llm_tokenizer = AutoTokenizer.from_pretrained(DEFAULT_MODEL)
            # The head must be loaded on the fallback path too: `forward` depends on it.
            self.llm_head = GPT2LMHeadModel.from_pretrained(DEFAULT_MODEL)

        self.llm_num_hidden_layers = llm_num_hidden_layers
        self.llm_output_attention = llm_output_attention
        self.llm_output_hidden_states = llm_output_hidden_states

        # Reuse EOS as the padding token when the tokenizer has one; otherwise register '[PAD]'.
        if self.llm_tokenizer.eos_token:
            self.llm_tokenizer.pad_token = self.llm_tokenizer.eos_token
        else:
            pad_token = '[PAD]'
            self.llm_tokenizer.add_special_tokens({'pad_token': pad_token})
            self.llm_tokenizer.pad_token = pad_token

        # Freeze the backbone used for the input embeddings (`self.llm_head` is left untouched here).
        for param in self.llm.parameters():
            param.requires_grad = False

        self.patch_embedding = PatchEmbedding(
            self.d_model, self.patch_len, self.stride, self.dropout)
        
        # Word-embedding matrix of the backbone; its first axis is the vocabulary.
        self.word_embeddings = self.llm.get_input_embeddings().weight
        self.vocab_size = self.word_embeddings.shape[0]
        # Number of learned "prototype" rows the vocabulary is compressed to.
        self.num_tokens = 1024
        self.mapping_layer = nn.Linear(self.vocab_size, self.num_tokens)

        # Note: `d_ff` is passed in the `d_keys` position of ReprogrammingLayer.
        self.reprogramming_layer = ReprogrammingLayer(self.d_model, self.n_heads, self.d_ff, self.d_llm)

        self.patch_nums = int((input_size - self.patch_len) / self.stride + 2)
        self.normalize_layers = RevIN(self.enc_in, affine=False)

    def select_top_features_by_variance(self, time_series, top_k=20):
        """Return the indices of the `top_k` features with the largest variance.

        `time_series` is assumed to be of shape (B, T, N). The variance is taken over the
        time axis (dim=1) and averaged over the batch axis. `top_k` is clamped to the
        number of features so that inputs with fewer features do not make `torch.topk` fail.
        """
        # Mean variance per feature across the batch.
        feature_variances = torch.var(time_series, dim=1).mean(dim=0)

        k = min(top_k, feature_variances.shape[0])
        return torch.topk(feature_variances, k).indices

    def encode(self, time_series, country, sector, columns):
        """Build the LLM input embeddings for a batch.

        Parameters
        ----------
        time_series :
            Tensor assumed to be of shape (B, T, N).
        country :
            Indexable by sample position; `country[b]` is interpolated into the prompt.
        sector :
            Indexable by sample position; `sector[b]` is an iterable of strings joined with ", ".
        columns :
            Indexable by feature position; the entries of the selected features are interpolated
            into the prompt (assumed to be feature names -- TODO confirm the layout after collation).

        Returns
        -------
        Tensor with the prompt embeddings followed, along dim=1, by the reprogrammed patch embeddings.
        """
        x_enc = self.normalize_layers(time_series, 'norm')

        # Keep the (at most) 10 features with the highest variance.
        # NOTE(review): the 10 is hard-coded here; `self.top_k` is not consulted.
        selected_features = self.select_top_features_by_variance(x_enc, top_k=10)

        # Column entries aligned with the selected features (position n <-> selected feature n).
        selected_columns = [columns[i] for i in selected_features.tolist()]

        x_enc = x_enc[:, :, selected_features]  # (B, T, number of selected features)

        B, _, N = x_enc.size()

        # Per-sample, per-feature statistics over the time axis, moved to Python lists once
        # instead of one `.item()` call per element inside the loops below.
        min_values = torch.min(x_enc, dim=1)[0].tolist()
        max_values = torch.max(x_enc, dim=1)[0].tolist()
        medians = torch.median(x_enc, dim=1).values.tolist()
        # Sum of first differences over time, i.e. last value minus first value.
        trends = x_enc.diff(dim=1).sum(dim=1).tolist()

        prompt = []
        for b in range(B):
            feature_prompts = []
            for n in range(N):
                min_values_str = str(min_values[b][n])
                max_values_str = str(max_values[b][n])
                median_values_str = str(medians[b][n])
                trend_str = 'upward' if trends[b][n] > 0 else 'downward'

                # Label the statistics with the column of the *selected* feature, not with
                # the n-th column of the unfiltered list.
                feature_prompt = (
                    f"Feature {selected_columns[n]} statistics: "
                    f"min value {min_values_str}, "
                    f"max value {max_values_str}, "
                    f"median value {median_values_str}, "
                    f"the trend is {trend_str}"
                )

                feature_prompts.append(feature_prompt)

            prompt_ = (
                f"<|start_prompt|>Focused country: {country[b]}, sectors: {', '.join(sector[b])} "
                f"Task description: generate climate policy summary according to the given information; "
                f"{' '.join(feature_prompts)}<||>"
            )
            
            prompt.append(prompt_)

        # NOTE(review): max_length=2048 may exceed the backbone's context once the patch
        # embeddings are appended -- confirm against the model config.
        prompt = self.llm_tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=2048).input_ids
        prompt_embeddings = self.llm.get_input_embeddings()(prompt.to(x_enc.device))  # (batch, prompt_token, dim)

        # Compress the vocabulary axis of the word embeddings to `num_tokens` rows:
        # (vocab, dim) -> (dim, vocab) -> (dim, num_tokens) -> (num_tokens, dim).
        source_embeddings = self.mapping_layer(self.word_embeddings.permute(1, 0)).permute(1, 0)

        x_enc = x_enc.permute(0, 2, 1).contiguous()  # (B, N, T)
        enc_out, n_vars = self.patch_embedding(x_enc.to(torch.float32))
        enc_out = self.reprogramming_layer(enc_out, source_embeddings, source_embeddings)
        H_enc = enc_out.size(2)
        # Regroup the per-feature patch sequences by sample: (B * N, patches, H) -> (B, N * patches, H).
        enc_out = enc_out.view(B, -1, H_enc)
        llm_enc_out = torch.cat([prompt_embeddings, enc_out], dim=1)

        return llm_enc_out


    def forward(self, batch, target):
        """Compute the cross-entropy loss between the LM-head logits and `target`.

        `batch` must provide the keys 'temporal_series', 'country', 'sector' and 'temporal_cols'.
        `target` is indexed as (batch, sequence) and holds token ids. The logits are cut or
        zero-padded along the sequence axis to the target length; target positions equal to the
        tokenizer's EOS id are ignored by the loss.
        """
        output = self.encode(batch['temporal_series'], batch['country'], batch['sector'], batch['temporal_cols'])
        dec_out = self.llm_head(inputs_embeds=output).logits

        loss_fn = torch.nn.CrossEntropyLoss(ignore_index=self.llm_tokenizer.eos_token_id)

        # Check the size of the generated output and target
        output_length = dec_out.size(1)
        target_length = target.size(1)

        # Case 1: If generated output is longer, trim the output
        if output_length > target_length:
            dec_out = dec_out[:, :target_length, :]
        # Case 2: If target is longer, pad the output to match the target length
        elif output_length < target_length:
            padding_size = target_length - output_length
            # new_zeros keeps the dtype and device of the logits.
            padding = dec_out.new_zeros(dec_out.size(0), padding_size, dec_out.size(2))
            dec_out = torch.cat([dec_out, padding], dim=1)

        # Reshape output and target for CrossEntropyLoss
        dec_out = dec_out.reshape(-1, dec_out.size(-1))  # Reshape to (batch_size * seq_len, vocab_size)
        target = target.reshape(-1)  # Reshape to (batch_size * seq_len)

        # Compute loss
        loss = loss_fn(dec_out, target)

        return loss


    def configure_optimizers(self):
        """
        Configure the optimizer: AdamW over all parameters of the module with learning rate `base_lr`.
        """
        optimizer = torch.optim.AdamW(self.parameters(), lr=self.base_lr)
        return optimizer

In [None]:
# Smoke-test training run: load the processed dataset, build a default TimeLLM
# and fit it with PyTorch Lightning on the CPU.
from gen_time_llm.tsdataset import TimeSeriesDataset, TimeSeriesDataModule
from torch.utils.data import random_split

import os
# Ask PyTorch to fall back to the CPU for operators missing on the MPS backend.
# NOTE(review): torch is already imported by the time this line runs -- confirm
# the variable is still honoured when set this late.
os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1'

# Tokenizer handed to the dataset and the data module below.
tokenizer = AutoTokenizer.from_pretrained('openai-community/gpt2')
dataset = TimeSeriesDataset.from_jsonl('data/processed_train_data.jsonl', tokenizer=tokenizer)

# Instantiate the model with fixed hyperparameters. No `llm` is given, so the
# default backbone is used; `input_keys` is not a TimeLLM parameter and is
# forwarded to the base class through **kwargs.
model = TimeLLM(
    input_keys=['temporal_series', 'sector', 'country'],
    random_seed=42,
    input_size=218,
    base_lr=1e-3
)

# overfit_batches=1 restricts the run to a single batch (an overfitting sanity
# check); training is pinned to the CPU.
trainer = Trainer(
    max_epochs=100,
    overfit_batches=1,
    accelerator='cpu'
)

# Number of samples in the loaded dataset
total_size = len(dataset)

# Define the lengths for training and validation sets (80% / 20%)
train_size = int(0.8 * total_size)
val_size = total_size - train_size

# Split the dataset at random into the two subsets
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

# Create a TimeSeriesDataModule using the given datasets and tokenizer
datamodule = TimeSeriesDataModule(
    train_dataset=train_dataset,
    val_dataset=val_dataset,
    tokenizer=tokenizer,  # Use the same tokenizer
    batch_size=4,
    valid_batch_size=4
)

# Train the model
trainer.fit(model, datamodule=datamodule)

Seed set to 42


Using openai-community/gpt2 as default.
Successfully loaded model: openai-community/gpt2


GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/Users/thamolwanp/anaconda3/lib/python3.11/site-packages/pytorch_lightning/trainer/setup.py:177: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.
`Trainer(overfit_batches=1)` was configured so 1 batch will be used.

  | Name                | Type               | Params | Mode 
-------------------------------------------------------------------
0 | llm                 | GPT2Model          | 124 M  | eval 
1 | llm_head            | GPT2LMHeadModel    | 124 M  | eval 
2 | patch_embedding     | PatchEmbedding     | 384    | train
3 | mapping_layer       | Linear             | 51.5 M | train
4 | reprogramming_layer | ReprogrammingLayer | 2.4 M  | train
5 | normalize_layers    | RevIN              | 0      | train
-------------------------------------------------------------------
178 M     Trainable params
124 M     Non-trainable params
302 M  

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/Users/thamolwanp/anaconda3/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.
/Users/thamolwanp/anaconda3/lib/python3.11/site-packages/pytorch_lightning/utilities/data.py:78: Trying to infer the `batch_size` from an ambiguous collection. The batch size we found is 4. To avoid any miscalculations, use `self.log(..., batch_size=batch_size)`.
/Users/thamolwanp/anaconda3/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:251: You requested to overfit but enabled train dataloader shuffling. We are turning off the train dataloader shuffling for you.
/Users/thamolwanp/anaconda3/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be

Training: |          | 0/? [00:00<?, ?it/s]