In [None]:
import math
import os
import tempfile
import warnings

import numpy as np
import pandas as pd
import torch
from torch.optim import AdamW
from torch.optim.lr_scheduler import OneCycleLR
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import EarlyStoppingCallback, Trainer, TrainingArguments, set_seed


warnings.filterwarnings("ignore")

In [2]:
from utils import mask_generate, mse

from tsfm_public import TimeSeriesPreprocessor, get_datasets
from tsfm_public.models.tspulse import TSPulseForReconstruction
from tsfm_public.toolkit.lr_finder import optimal_lr_finder

In [3]:
# Seed value shared by the notebook; it is handed to the `set_seed` helper
# imported from transformers before any data or model is created.
seed = 42
set_seed(seed)

In [4]:
# Prefer the GPU when PyTorch can see one; otherwise select the CPU so that
# cells placing tensors on `device` do not fail on CUDA-less machines.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Context length handed to the preprocessor when building the datasets.
CONTEXT_LEN = 512
# Prediction length handed to the preprocessor; 0 means no forecast horizon is requested.
FORECAST_LEN = 0

## Dataset, mask ratio, and mask type used for TSPulse fine-tuned imputation

In [5]:
# Name of the dataset to load. The four ETT names are downloaded from the
# ETDataset GitHub repository; any other name is read from
# datasets/<name>/<name>.csv.
DATASET = "ETTh1"
# Mask ratio passed to the model configuration and to `mask_generate` at
# evaluation time.
mask_ratio = 0.375
# Mask type passed to the model configuration for fine-tuning and to
# `mask_generate` when building the evaluation masks.
mask_type = "hybrid"

In [6]:
# Dataset
# The ETT benchmarks are read directly from the ETDataset GitHub repository;
# every other dataset is expected as a local CSV under datasets/<name>/<name>.csv.
ETT_H_DATASETS = ("ETTh1", "ETTh2")
ETT_M_DATASETS = ("ETTm1", "ETTm2")

if DATASET in ETT_H_DATASETS + ETT_M_DATASETS:
    dataset_path = f"https://raw.githubusercontent.com/zhouhaoyi/ETDataset/main/ETT-small/{DATASET}.csv"
else:
    dataset_path = f"datasets/{DATASET}/{DATASET}.csv"

timestamp_column = "date"
id_columns = []  # mention the ids that uniquely identify a time-series.

# Split definition handed to `get_datasets`: explicit row-index ranges for the
# ETT benchmarks, fractional train/test shares for everything else.
if DATASET in ETT_H_DATASETS:
    split_config = {
        "train": [0, 8640],
        "valid": [8640, 11520],
        "test": [11520, 14400],
    }
elif DATASET in ETT_M_DATASETS:
    split_config = {
        "train": [0, 34560],
        "valid": [34560, 46080],
        "test": [46080, 57600],
    }
else:
    split_config = {
        "train": 0.7,
        "test": 0.2,
    }


data = pd.read_csv(
    dataset_path,
    parse_dates=[timestamp_column],
)

# Every column that is neither the timestamp nor an id column is a target.
# Selecting by name (rather than slicing off the first column) stays correct
# when the timestamp is not the first column or when id columns are configured.
non_target_columns = {timestamp_column, *id_columns}
target_columns = [col for col in data.columns if col not in non_target_columns]

column_specifiers = {
    "timestamp_column": timestamp_column,
    "id_columns": id_columns,
    "target_columns": target_columns,
    "control_columns": [],
}

# Preprocessor configured with standard scaling and no categorical encoding.
tsp = TimeSeriesPreprocessor(
    **column_specifiers,
    context_length=CONTEXT_LEN,
    prediction_length=FORECAST_LEN,
    scaling=True,
    encode_categorical=False,
    scaler_type="standard",
)

train_dataset, valid_dataset, test_dataset = get_datasets(tsp, data, split_config)

In [7]:
# Keyword overrides forwarded to `from_pretrained` together with the checkpoint
# name and revision. The number of input channels comes from the preprocessor.
model_dict = dict(
    mask_ratio=mask_ratio,
    mask_type=mask_type,
    prediction_length=0,
    fft_time_add_forecasting_pt_loss=False,
    enable_fft_prob_loss=False,
    fft_time_consistent_masking=True,
    fft_original_signal_loss_weight=0,
    loss_apply_mode="mask",
    fft_weight=0,
    num_full_patches_for_hybrid_mask=int((mask_ratio / 0.125) * 4),
    decoder_mode="mix_channel",
    channel_consistent_masking=False,
    dropout=0,
    head_dropout=0,
    num_input_channels=tsp.num_input_channels,
)

# Load the pretrained checkpoint, then move it to the configured device.
model = TSPulseForReconstruction.from_pretrained(
    "ibm-granite/granite-timeseries-tspulse-r1",
    revision="tspulse-hybrid-dualhead-512-p8-r1",
    **model_dict,
)
model = model.to(device)

INFO:p-3063973:t-22956491035648:modeling_tspulse.py:_init_weights:Initializing Linear layers with method: pytorch
INFO:p-3063973:t-22956491035648:modeling_tspulse.py:_init_weights:Initializing Linear layers with method: pytorch
INFO:p-3063973:t-22956491035648:modeling_tspulse.py:_init_weights:Initializing Linear layers with method: pytorch
INFO:p-3063973:t-22956491035648:modeling_tspulse.py:_init_weights:Identity Init in Module: , TSPulseChannelFeatureMixerBlock
INFO:p-3063973:t-22956491035648:modeling_tspulse.py:_init_identity_weights:Init identity weights for channel mixing
INFO:p-3063973:t-22956491035648:modeling_tspulse.py:_init_identity_weights:Try identity init in Gated Attention.
INFO:p-3063973:t-22956491035648:modeling_tspulse.py:_init_weights:Initializing Linear layers with method: pytorch
INFO:p-3063973:t-22956491035648:modeling_tspulse.py:_init_weights:Initializing Linear layers with method: pytorch
INFO:p-3063973:t-22956491035648:modeling_tspulse.py:_init_weights:Initializi

In [8]:
# Root directory for fine-tuning artefacts; the training cell writes its
# TensorBoard logs to OUT_DIR/output.
OUT_DIR = "tspulse_finetuned_models/"

In [9]:
# Place the model on the notebook's configured `device` (rather than a
# hard-coded "cuda" string) and cast its floating-point parameters to float32.
model = model.to(device).float()

In [10]:
# Full fine-tuning: mark every parameter of the model as trainable.
for param in model.parameters():
    param.requires_grad_(True)

## Fine-tuning the full model

In [11]:
# Trainer checkpoints are written to a temporary directory (not removed
# automatically); the final model is saved explicitly after training.
temp_dir = tempfile.mkdtemp()

# Leave as None to let the LR finder choose a value; set a float to skip the search.
suggested_lr = None

# Training hyper-parameters. The values below are read from this dict when
# building `TrainingArguments`, so editing an entry here actually takes effect.
# `learning_rate` and `ddp_find_unused_parameters` are kept for reference only:
# the learning rate used is `suggested_lr`, and the DDP flag is not forwarded.
train_dict = {
    "overwrite_output_dir": True,
    "learning_rate": 0.0001,
    "num_train_epochs": 100,
    "evaluation_strategy": "epoch",
    "per_device_train_batch_size": 8,
    "per_device_eval_batch_size": 8,
    "dataloader_num_workers": 1,
    "eval_accumulation_steps": 50,
    "ddp_find_unused_parameters": False,
    "report_to": "tensorboard",
    "save_strategy": "epoch",
    "logging_strategy": "epoch",
    "save_total_limit": 1,
    "load_best_model_at_end": True,
    "metric_for_best_model": "eval_loss",
    "greater_is_better": False,
    "seed": seed,
}

EPOCHS = train_dict["num_train_epochs"]
BATCH_SIZE = train_dict["per_device_train_batch_size"]
eval_accumulation_steps = train_dict["eval_accumulation_steps"]
NUM_WORKERS = train_dict["dataloader_num_workers"]
NUM_GPUS = 1

# Re-seed right before the LR search so its result does not depend on how many
# random draws earlier cells consumed.
set_seed(train_dict["seed"])
if suggested_lr is None:
    # The finder returns the suggested learning rate together with the model.
    lr, model = optimal_lr_finder(
        model,
        train_dataset,
        batch_size=BATCH_SIZE,
    )
    suggested_lr = lr

finetune_args = TrainingArguments(
    output_dir=temp_dir,
    overwrite_output_dir=train_dict["overwrite_output_dir"],
    learning_rate=suggested_lr,
    num_train_epochs=EPOCHS,
    do_eval=True,
    eval_strategy=train_dict["evaluation_strategy"],
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=train_dict["per_device_eval_batch_size"],
    eval_accumulation_steps=eval_accumulation_steps,
    dataloader_num_workers=NUM_WORKERS,
    report_to=train_dict["report_to"],
    save_strategy=train_dict["save_strategy"],
    logging_strategy=train_dict["logging_strategy"],
    save_total_limit=train_dict["save_total_limit"],
    logging_dir=os.path.join(OUT_DIR, "output"),  # Make sure to specify a logging directory
    load_best_model_at_end=train_dict["load_best_model_at_end"],  # Load the best model when training ends
    metric_for_best_model=train_dict["metric_for_best_model"],  # Metric to monitor for early stopping
    greater_is_better=train_dict["greater_is_better"],  # For loss
    seed=train_dict["seed"],
)

early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=10,  # Number of epochs with no improvement after which to stop
    early_stopping_threshold=0.0001,  # Minimum improvement required to consider as improvement
)

# Optimizer and scheduler
# OneCycleLR is sized up front as EPOCHS x optimizer steps per epoch; early
# stopping may end training before the full cycle has been traversed.
optimizer = AdamW(model.parameters(), lr=suggested_lr)
scheduler = OneCycleLR(
    optimizer,
    suggested_lr,
    epochs=EPOCHS,
    steps_per_epoch=math.ceil(len(train_dataset) / (BATCH_SIZE * NUM_GPUS)),
)

finetune_trainer = Trainer(
    model=model,
    args=finetune_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    callbacks=[early_stopping_callback],
    optimizers=(optimizer, scheduler),
)

# Fine tune
finetune_trainer.train()

INFO:p-3063973:t-22956491035648:lr_finder.py:optimal_lr_finder:LR Finder: Running learning rate (LR) finder algorithm. If the suggested LR is very low, we suggest setting the LR manually.
INFO:p-3063973:t-22956491035648:lr_finder.py:optimal_lr_finder:LR Finder: Using cuda:0.
INFO:p-3063973:t-22956491035648:lr_finder.py:optimal_lr_finder:LR Finder: Suggested learning rate = 0.00020565123083486514


Epoch,Training Loss,Validation Loss
1,0.1245,0.147419
2,0.1139,0.142947
3,0.1085,0.140042
4,0.1047,0.138109
5,0.1003,0.13623
6,0.0941,0.134347
7,0.0863,0.137088
8,0.0794,0.134114
9,0.0747,0.130689
10,0.0716,0.128349


TrainOutput(global_step=45765, training_loss=0.06296408124065368, metrics={'train_runtime': 1498.6298, 'train_samples_per_second': 542.429, 'train_steps_per_second': 67.862, 'total_flos': 7837779087452160.0, 'train_loss': 0.06296408124065368, 'epoch': 45.0})

In [12]:
# Persist the fine-tuned model under a name that encodes the dataset, mask
# ratio and mask type of this run.
save_root = "finetuned_models"
os.makedirs(save_root, exist_ok=True)
path_to_save_model = f"{save_root}/finetuned_model_{DATASET}_{mask_ratio}_{mask_type}"
finetune_trainer.save_model(path_to_save_model)

In [15]:
# Evaluation batch size: 64 for the four ETT benchmarks, 4 for any other dataset.
batch_size = 64 if DATASET in ("ETTh1", "ETTh2", "ETTm1", "ETTm2") else 4


def collate_only_past_values(batch):
    """Collate dataset items into a single tensor of their ``past_values``.

    Args:
        batch: iterable of mapping-like items, each holding a tensor under the
            ``"past_values"`` key; all other keys are ignored.

    Returns:
        A tensor with the per-item ``past_values`` stacked along a new leading
        dimension.
    """
    windows = [sample["past_values"] for sample in batch]
    return torch.stack(windows, dim=0)


# Iterate the test split in its original order, keeping only `past_values`.
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_only_past_values)


model_path = path_to_save_model

# load the finetuned model
# It is reloaded with mask_type="user"; the masks themselves are generated in
# the loop below and passed in through `past_observed_mask`.
model = TSPulseForReconstruction.from_pretrained(
    model_path, fft_time_add_forecasting_pt_loss=False, num_input_channels=tsp.num_input_channels, mask_type="user"
).to(device)
# Make inference mode explicit instead of relying on the loader's default.
model.eval()


# Dedicated generator so the evaluation masks are reproducible across runs.
seed = 42
g = torch.Generator(device=device)
g.manual_seed(seed)

trues, preds, masks = [], [], []
with torch.no_grad():
    for batch in tqdm(test_dataloader):
        batch_x = batch.to(device)  # b l c

        # NOTE(review): 8 is presumably the patch length (the pretrained
        # revision name contains "p8") - confirm against utils.mask_generate.
        mask = mask_generate(g, batch_x, 8, mask_ratio, mask_type)

        # The model receives the complement of the mask as its observed mask.
        output = model(past_values=batch_x, past_observed_mask=~mask)

        reconstructed_output = output.reconstruction_outputs

        trues.append(batch_x.detach().cpu().numpy())
        preds.append(reconstructed_output.detach().cpu().numpy())
        masks.append(mask.detach().cpu().numpy())

# Aggregation is pure NumPy, so it does not need to run under torch.no_grad().
preds = np.concatenate(preds)
trues = np.concatenate(trues)
masks = np.concatenate(masks)

# Score only the positions selected by the generated masks.
MSE = mse(y=trues[masks == 1], y_hat=preds[masks == 1], reduction="mean")
print(f"Dataset = {DATASET}  : Mask Type = {mask_type}  : Mask Ratio = {mask_ratio}")
print(f"Mean Squared Error (MSE)={MSE:.3f}")

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 46/46 [00:00<00:00, 50.19it/s]


Dataset = ETTh1  : Mask Type = hybrid  : Mask Ratio = 0.375
Mean Squarred Error (MSE)=0.066
