In [49]:
#| default_exp models.gru

In [1]:
#| hide
%load_ext autoreload
%autoreload 2

In [2]:
#| hide
from nbdev.showdoc import show_doc
from gen_time_llm.tsdataset import TimeSeriesDataset, TimeSeriesDataModule
from torch.utils.data import random_split

In [3]:
#| export
import torch
import torch.nn as nn
import pytorch_lightning as pl
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import optuna
from optuna.trial import Trial
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger

from gen_time_llm.common._base_model import BaseModel

In [15]:
#| export

class GRUGPTModel(BaseModel):
    """
    Model combining a GRU encoder for time series and a GPT-based decoder for text generation,
    with temporal normalization/scaling.
    """

    def __init__(
        self,
        random_seed,
        loss,  # Loss function for training
        tokenizer,  # Tokenizer for decoding generated text
        hidden_size=256,  # Hidden size of the GRU encoder
        num_layers=4,  # Number of GRU layers
        base_lr=1e-5,  # Learning rate
        max_length=512,  # Maximum length of generated sequences
        num_beams=3,  # Number of beams for beam search
        gru_input_size=128,  # Size of the input for the GRU (e.g., number of features in the time series)
        **kwargs
    ):
        super().__init__(
            random_seed=random_seed,
            loss=loss,
            tokenizer=tokenizer,
            max_length=max_length,
            num_beams=num_beams,
            **kwargs
        )

        self.tokenizer = tokenizer
        self.max_length = max_length

        # GRU Encoder
        self.gru = nn.GRU(
            input_size=gru_input_size,  # Number of input features in the time series
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True
        )

        # GPT Decoder
        self.gpt = GPT2LMHeadModel.from_pretrained("gpt2")
        # Freeze the GPT model parameters
        for param in self.gpt.parameters():
            param.requires_grad = False

        # Put the GPT model in evaluation mode
        self.gpt.eval()  # Ensure GPT is in evaluation mode

        # Mapping GRU hidden state to the GPT's embedding size
        self.hidden_to_gpt = nn.Linear(hidden_size, self.gpt.config.n_embd)

        # Learning rate
        self.base_lr = base_lr

    def forward(self, batch, targets=None, use_teacher_forcing=False):
        """
        Forward pass of the model.
        - time_series: Time series input (batch_size, seq_length, num_features)
        - targets: Target text used for teacher forcing (optional)
        - use_teacher_forcing: Boolean flag for using teacher forcing
        Returns:
        - gpt_output.loss if using teacher forcing
        - gpt_output logits if autoregressive generation
        """
        inputs = {key: batch[key] for key in self.input_keys}
        time_series = inputs['temporal_series']

        # GRU encoding
        _, hidden_state = self.gru(time_series)
        hidden_state = hidden_state[-1]

        # Map hidden state to GPT's input size (this is the time series representation)
        gpt_input = self.hidden_to_gpt(hidden_state).unsqueeze(1)  # (batch_size, 1, gpt_hidden_size)

        if use_teacher_forcing and targets is not None:
            # Teacher forcing: pass inputs and labels to GPT for loss computation
            gpt_input_ids = targets[:, :-1]  # Input part of the target sequence (ignore last token)
            token_embeddings = self.gpt.transformer.wte(gpt_input_ids)
            gpt_input_combined = torch.cat([gpt_input, token_embeddings], dim=1)
            
            # Pass to GPT and compute loss directly
            gpt_output = self.gpt(inputs_embeds=gpt_input_combined, labels=targets)
            return gpt_output.loss

        else:
            # Autoregressive generation with past_key_values management
            outputs = []

            for _ in range(self.max_length):
                # Generate the next token with time series embedding (gpt_input) and past_key_values
                gpt_output = self.gpt(inputs_embeds=gpt_input)
                logits = gpt_output.logits[:, -1, :]
                outputs.append(logits.unsqueeze(1))

                next_token = torch.argmax(logits, dim=-1)
                gpt_input = self.gpt.transformer.wte(next_token).unsqueeze(1)

            outputs = torch.cat(outputs, dim=1)  # Concatenate the outputs along sequence dimension

        return outputs

    def generate(self, time_series, max_length=None, num_beams=3):
      """
      Generate text from time series data using autoregressive generation and beam search.
      """
      max_length = max_length if max_length is not None else self.max_length

      # GRU encoding
      _, hidden_state = self.gru(time_series)
      hidden_state = hidden_state[-1]  # Get the last layer's hidden state (batch_size, hidden_size)

      # Map hidden state to GPT's input size
      gpt_input = self.hidden_to_gpt(hidden_state)

      # Start token (e.g., <BOS> or a special token depending on your tokenizer)
      start_token = self.tokenizer.bos_token_id if self.tokenizer.bos_token_id else self.tokenizer.eos_token_id
      input_ids = torch.full((time_series.size(0), 1), start_token, dtype=torch.long, device=self.device)

      # Pass the hidden state to GPT as embeddings (initial hidden state for generation)
      inputs_embeds = gpt_input.unsqueeze(1)  # (batch_size, 1, hidden_dim)

      # Generate text using the built-in generate() function from transformers
      generated_ids = self.gpt.generate(
          inputs_embeds=inputs_embeds,  # Use the hidden state as input embeddings
          max_length=max_length,
          num_beams=num_beams,
          early_stopping=True,  # Stop when all beams generate the <EOS> token
          no_repeat_ngram_size=2,  # Optionally prevent repetition
      )

      # Decode the generated token IDs back into text
      generated_text = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

      return generated_text


    def configure_optimizers(self):
        """
        Configure optimizers and learning rate scheduler.
        """
        optimizer = torch.optim.Adam(self.parameters(), lr=self.base_lr)
        return optimizer

In [16]:
show_doc(GRUGPTModel)

---

### GRUGPTModel

>      GRUGPTModel (random_seed, loss, tokenizer, hidden_size=256, num_layers=4,
>                   base_lr=1e-05, max_length=512, num_beams=3,
>                   gru_input_size=128, **kwargs)

*Model combining a GRU encoder for time series and a GPT-based decoder for text generation,
with temporal normalization/scaling.*

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| random_seed |  |  |  |
| loss |  |  | Loss function for training |
| tokenizer |  |  | Tokenizer for decoding generated text |
| hidden_size | int | 256 | Hidden size of the GRU encoder |
| num_layers | int | 4 | Number of GRU layers |
| base_lr | float | 1e-05 | Learning rate |
| max_length | int | 512 | Maximum length of generated sequences |
| num_beams | int | 3 | Number of beams for beam search |
| gru_input_size | int | 128 | Size of the input for the GRU (e.g., number of features in the time series) |
| kwargs |  |  |  |

In [17]:
#| hide
def objective(trial: Trial):
    """
    Optuna objective function to tune the hyperparameters for the GRUGPTModel.
    """
    # Hyperparameter search space
    hidden_size = trial.suggest_int("hidden_size", 64, 256, step=64)
    num_layers = trial.suggest_int("num_layers", 1, 4)
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-2, log=True)

    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

    # Instantiate the model with the trial's hyperparameters
    model = GRUGPTModel(
        input_keys=['temporal_series'],
        gru_input_size=218,
        random_seed=42,
        loss=torch.nn.CrossEntropyLoss(ignore_index=tokenizer.eos_token_id),
        tokenizer=tokenizer,  # Assume you have a tokenizer loaded
        hidden_size=hidden_size,
        num_layers=num_layers,
        base_lr=learning_rate
    )

    # Define a PyTorch Lightning Trainer
    trainer = Trainer(
        max_epochs=5,
        logger=TensorBoardLogger("lightning_logs/", name="optuna_gru_gpt"),
        callbacks=[EarlyStopping(monitor="val_loss", patience=3, mode="min")],
    )

    dataset = TimeSeriesDataset.from_jsonl('data/processed_train_data.jsonl', tokenizer=tokenizer)

    # Assuming dataset is an instance of your TimeSeriesDataset
    total_size = len(dataset)

    # Define the lengths for training and validation sets
    train_size = int(0.8 * total_size)
    val_size = total_size - train_size

    # Split the dataset
    train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

    # Create a TimeSeriesDataModule using the given datasets and tokenizer
    datamodule = TimeSeriesDataModule(
        train_dataset=train_dataset,
        val_dataset=val_dataset,
        tokenizer=tokenizer,  # Use the same tokenizer
        batch_size=4,
        valid_batch_size=4
    )

    # Train the model
    trainer.fit(model, datamodule=datamodule)

    # Return validation loss for Optuna to optimize
    return trainer.callback_metrics["val_loss"].item()


# # Running the Optuna study
# study = optuna.create_study(direction="minimize")
# study.optimize(objective, n_trials=20)

# # Print the best hyperparameters
# print(study.best_params)

In [19]:
#| hide

# Step 1: Load the tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Step 2: Instantiate the model
model = GRUGPTModel(
    random_seed=42,
    loss=torch.nn.CrossEntropyLoss(ignore_index=tokenizer.eos_token_id),  # Loss is not needed for inference but required for initialization
    tokenizer=tokenizer,
    hidden_size=256,  # Use the same hidden_size as in your trained model
    num_layers=4,  # Same number of layers as in your trained model
    gru_input_size=218,  # Adjust this based on your input features
)

# Step 3: Load pre-trained weights if you have them
model.eval()  # Set the model to evaluation mode

dataset = TimeSeriesDataset.from_jsonl('data/processed_train_data.jsonl', tokenizer=tokenizer)

# Step 5: Use the `generate` method to generate text
generated_text = model.generate(time_series=dataset[0]['temporal_series'].unsqueeze(0), max_length=50, num_beams=3)

# Step 6: Print the generated text
print("Generated Text: ", generated_text)


/Users/thamolwanp/anaconda3/lib/python3.11/site-packages/pytorch_lightning/utilities/parsing.py:208: Attribute 'loss' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['loss'])`.
Seed set to 42
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


torch.Size([1, 1, 768])
Generated Text:  ['.\n\n"I\'m not going to say it\'s a bad thing," he said. "It\'s just that I think it would be nice to have a little bit more freedom to do what I want to. I don\'t know if']
