<a href="https://colab.research.google.com/github/swaraj0009/AI_Models/blob/master/TFT/notebooks/4_ttf_resource_forecast.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Drive Loading

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Install Libraries

In [3]:
# ⚡ Quick Setup - Run after runtime reset (CPU/GPU Switch)
# Installs essential packages silently to save output clutter

!pip install dask pytz torch pytorch-forecasting pytorch-lightning \
    rich colorama matplotlib seaborn pandas numpy tensorboard \
    'lightning[extra]' pyarrow fastparquet --quiet > /dev/null

print("\033[92m✅ All required packages installed successfully.\033[0m")

[92m✅ All required packages installed successfully.[0m


## Import Libraries

In [21]:
# Standard Library
import os
import datetime
import glob
import json
import shutil
import math
import pytz
from concurrent.futures import ThreadPoolExecutor

# Third-Party Libraries
import numpy as np
import pandas as pd
import dask.dataframe as dd
import matplotlib.pyplot as plt
import seaborn as sns

import torch

# PyTorch Lightning
# from datetime import datetime
import pytorch_forecasting
from pytorch_lightning import Trainer
from pytorch_lightning.loggers import CSVLogger
from pytorch_lightning.callbacks import ModelCheckpoint

# PyTorch Forecasting
from pytorch_forecasting import TimeSeriesDataSet, TemporalFusionTransformer
from pytorch_forecasting.data import GroupNormalizer
from pytorch_forecasting.metrics import RMSE
from pytorch_forecasting.data import NaNLabelEncoder
from torch.utils.data import DataLoader
from pytorch_forecasting.data.encoders import GroupNormalizer


ist = pytz.timezone('Asia/Kolkata')
now_ist = datetime.datetime.now(ist)
timestamp = now_ist.strftime("%Y%m%d-%H%M%S")
print(f"All Libraries are loaded : {timestamp}")

import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"⚡ Using device: {device.upper()}")

All Libraries are loaded : 20250707-031228
⚡ Using device: CUDA


## User Configurable Parameters

In [65]:

# Data path & VM selection
parquet_path = "/content/drive/MyDrive/datasets/processed/FeatureEng_full_streak.parquet"

# Model training parameters
train_config = {
    # 🎯 Prediction Targets

    # [
    #     "cpu_utilization_ratio",
    #     "memory_utilization_ratio",
    #     "disk_total_throughput",
    #     "network_total_throughput"
    # ],

    "targets": ["cpu_utilization_ratio"],

    # 📅 Known time-dependent features (known at prediction time)
    "time_varying_known_reals": [
        "time_idx",
        "hour", "day", "dayofweek", "month", "is_weekend",
        "hour_sin", "hour_cos",
        "dayofweek_sin", "dayofweek_cos",
        "month_sin", "month_cos"
    ],

    # 📈 Features only known up to current timestep (future unknown)
    "time_varying_unknown_reals": [
        "cpu_util_prev", "cpu_util_diff",
        "memory_util_prev", "memory_util_diff",
        "network_total_prev", "network_total_diff",
        "disk_write_prev", "disk_write_diff",
        "disk_rolling_mean", "network_rolling_mean"
    ],

    # 🔐 Grouping feature
    "group_ids": ["vm_id"],

    # 🧠 Sequence lengths (adjust based on resources)
    "max_encoder_length": 5000,      # input length
    "max_prediction_length": 3000,     # forecast horizon

    # ⚙️ Model Hyperparameters (tune later)
    "hidden_size": 16,
    "dropout": 0.2,
    "learning_rate": 0.001,
    "batch_size": 4,
    "num_workers": 2,

    # 🛑 Early stopping
    "early_stopping_patience": 5,
    "epochs": 10,

    # 🧮 Loss function
    "loss_fn": RMSE(),

    # 💾 Output paths
    "output_base_dir": "/content/drive/MyDrive/output",
    "log_dir": "/content/drive/MyDrive/output/logs"
}

In [6]:
df5 = pd.read_parquet(parquet_path)

print(f"✅ Loaded data shape: {df5.shape}")
print(f"🔢 Unique VMs: {df5['vm_id'].nunique()}")

✅ Loaded data shape: (6487138, 38)
🔢 Unique VMs: 751


In [7]:
# Group by VM and count how many time steps each VM has
vm_streaks = df5.groupby("vm_id").agg(
    total_points=("time_idx", "count"),
    max_time_idx=("time_idx", "max")
).reset_index()

# Sort by total_points (or max_time_idx) descending
top_200_vms = vm_streaks.sort_values(by="total_points", ascending=False).head(200)["vm_id"]

print(f"✅ Selected top 200 VMs with longest data streaks.")

✅ Selected top 200 VMs with longest data streaks.


## VM Configure

In [8]:
# Filter original DataFrame for these 200 VMs
df6 = df5[df5["vm_id"].isin(top_200_vms)].copy()
print(f"✅ Filtered data shape (top 200 VMs): {df6.shape}")

✅ Filtered data shape (top 200 VMs): (1727600, 38)


In [9]:
print(f"🎯 VMs in final dataset: {df6['vm_id'].nunique()}")  # Should be 200

🎯 VMs in final dataset: 200


## Columns Filter

In [10]:
# ✅ Drop unused columns based on train_config
columns_to_keep = (
    train_config["time_varying_known_reals"]
    + train_config["time_varying_unknown_reals"]
    + train_config["targets"]
    + train_config["group_ids"]
    + ['time_idx', 'timestamp']
)

# 🔁 Remove duplicates in case of overlaps
columns_to_keep = list(set(columns_to_keep))

# 📉 Filter DataFrame
df6 = df6[columns_to_keep]

print(f"✅ Columns after filtering: {len(df6.columns)}")

# 🧼 Optimize category column
if "vm_id" in df6.columns:
    df6["vm_id"] = df6["vm_id"].astype("category")
    df6["vm_id"] = df6["vm_id"].cat.remove_unused_categories()

print(f"\033[94mℹ️ Clean DataFrame → Columns: {len(df6.columns)} | Shape: {df6.shape}\033[0m")

✅ Columns after filtering: 25
[94mℹ️ Clean DataFrame → Columns: 25 | Shape: (1727600, 25)[0m


## Split Logic with Reset Index

In [63]:
train_ratio = 0.51

train_df_list = []
val_df_list = []

for vm_id, group in df6.groupby("vm_id",observed=False):
    group = group.sort_values("time_idx")
    split_idx = int(len(group) * train_ratio)

    train_df_list.append(group.iloc[:split_idx])
    val_df_list.append(group.iloc[split_idx:])

# Combine all
train_df = pd.concat(train_df_list).reset_index(drop=True)
val_df = pd.concat(val_df_list).reset_index(drop=True)

print(f"✅ Train shape: {train_df.shape}")
print(f"✅ Val shape: {val_df.shape}")

✅ Train shape: (881000, 25)
✅ Val shape: (846600, 25)


## Pre validation check for split for Encoder & prediction

In [35]:
# Step 1: Define required steps
required_train_steps = train_config["max_encoder_length"] + train_config["max_prediction_length"]
required_val_steps = train_config["max_prediction_length"]

# Step 2: Get the split point
max_time_idx = df6["time_idx"].max()
split_point = int(max_time_idx * train_ratio)

# Step 3: Split data
train_df = df6[df6["time_idx"] <= split_point].copy()
val_df = df6[df6["time_idx"] > split_point].copy()

# Step 4: Validate VMs having enough time points
vm_train_counts = train_df.groupby("vm_id",observed=False)["time_idx"].nunique()
vm_val_counts = val_df.groupby("vm_id",observed=False)["time_idx"].nunique()

# Step 5: Filter valid VMs
valid_train_vms = vm_train_counts[vm_train_counts >= required_train_steps].index
valid_val_vms = vm_val_counts[vm_val_counts >= required_val_steps].index

# Step 6: Filter DataFrames
train_df = train_df[train_df["vm_id"].isin(valid_train_vms)].copy()
val_df = val_df[val_df["vm_id"].isin(valid_val_vms)].copy()

# Step 7: Summary
print("\n📊 VM-Level Split Window Check\n")
print(f"✅ VMs valid for training   : {len(valid_train_vms)} / {vm_train_counts.shape[0]}")
print(f"✅ VMs valid for validation : {len(valid_val_vms)} / {vm_val_counts.shape[0]}")


📊 VM-Level Split Window Check

✅ VMs valid for training   : 200 / 200
✅ VMs valid for validation : 200 / 200


## Set Configuration

In [31]:
max_encoder_length = train_config["max_encoder_length"]
max_prediction_length = train_config["max_prediction_length"]
group_ids = train_config["group_ids"]
targets = train_config["targets"]  # e.g., ["cpu_utilization_ratio"]
time_varying_known_reals = train_config["time_varying_known_reals"]
time_varying_unknown_reals = train_config["time_varying_unknown_reals"]
batchsize = train_config["batch_size"]
numworkers = train_config["num_workers"]
learningrate = train_config["learning_rate"]
hiddensize = train_config["hidden_size"]
lossfn = train_config["loss_fn"]
dropout = train_config["dropout"]

## TimeSeriesDataSet

In [66]:
# Training Dataset
tft_dataset = TimeSeriesDataSet(
    train_df,
    time_idx="time_idx",
    target=targets[0],  # Start with first target (e.g., "cpu_utilization_ratio")
    group_ids=group_ids,
    max_encoder_length=max_encoder_length,
    max_prediction_length=max_prediction_length,
    time_varying_known_reals=time_varying_known_reals,
    time_varying_unknown_reals=time_varying_unknown_reals,
    static_categoricals=[],
    static_reals=[],
    target_normalizer=GroupNormalizer(groups=group_ids),
    add_relative_time_idx=True,
    add_target_scales=True,
    add_encoder_length=True,
)

val_dataset = tft_dataset.from_dataset(
    tft_dataset,
    val_df,
    predict=True,
    stop_randomization=True
)

from torch.utils.data import DataLoader

train_dataloader = tft_dataset.to_dataloader(train=True, batch_size=batchsize, num_workers=numworkers)
val_dataloader = val_dataset.to_dataloader(train=False, batch_size=batchsize, num_workers=numworkers)

## Output & Log Folder Creation

In [15]:
# Step 1: Short name for target
def get_short_target_name(targets):
    short_map = {
        "cpu_utilization_ratio": "cpu",
        "memory_utilization_ratio": "mem",
        "disk_total_throughput": "disk",
        "network_total_throughput": "net"
    }
    if isinstance(targets, list) and targets:
        return short_map.get(targets[0], targets[0][:3])
    return "unknown"

# Step 2: Folder naming using extracted vars
def get_run_folder_name(vm_count):
    return "_".join([
        get_short_target_name(targets),
        f"{vm_count}vms",
        f"past{max_encoder_length}",
        f"fut{max_prediction_length}",
        f"bs{batchsize}",
        f"lr{learningrate:.0e}".replace('+0', ''),
        f"hid{hiddensize}",
        timestamp
    ])

# Step 3: Build folder name
vm_count = df6["vm_id"].nunique() # Calculate vm_count from df6
folder_name = get_run_folder_name(vm_count)
train_config["output_base_dir"] = os.path.join(train_config["output_base_dir"], folder_name)
train_config["log_dir"] = os.path.join(train_config["log_dir"], folder_name)

# Step 4: Create folders
os.makedirs(train_config["output_base_dir"], exist_ok=True)
os.makedirs(train_config["log_dir"], exist_ok=True)

# Step 5: Print summary
print("✅ Output directory:", train_config["output_base_dir"])
print("✅ Log directory   :", train_config["log_dir"])

✅ Output directory: /content/drive/MyDrive/output/cpu_200vms_past2100_fut2100_bs4_lr1e-03_hid16_20250707-030926
✅ Log directory   : /content/drive/MyDrive/output/logs/cpu_200vms_past2100_fut2100_bs4_lr1e-03_hid16_20250707-030926


In [16]:
tft_df = df6.copy()

## Logging & Callbacks

In [17]:
from pytorch_lightning.loggers import CSVLogger
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping

# Use your existing function
def get_short_target_name_single(target):
    short_map = {
        "cpu_utilization_ratio": "cpu",
        "memory_utilization_ratio": "mem",
        "disk_total_throughput": "disk",
        "network_total_throughput": "net"
    }
    if isinstance(targets, list) and targets:
        return short_map.get(targets[0], targets[0][:3])
    return "unknown"

for target in targets:
    short_target = get_short_target_name_single(target)  # ✅ Now uses each target in loop

    print(f"\n🔁 Training for target: {target}")

    run_dir = os.path.join(train_config["output_base_dir"], f"{short_target}_run_{timestamp}")
    os.makedirs(run_dir, exist_ok=True)

    tft_df.to_csv(os.path.join(run_dir, "tft_df.csv"), index=False)
    tft_dataset.save(os.path.join(run_dir, "tft_df_metadata"))

    meta_cols = ['vm_id', 'timestamp', 'time_idx']
    if all(col in val_df.columns for col in meta_cols):
        meta_df = val_df[meta_cols].reset_index(drop=True)
        meta_df.to_csv(os.path.join(run_dir, "forecast_metadata.csv"), index=False)
        print(f"✅ Metadata saved to: {run_dir}/forecast_metadata.csv")
    else:
        print(f"⚠️ Skipping metadata save — missing columns: {meta_cols}")

    logger = CSVLogger(
        save_dir=train_config["log_dir"],
        name=f"{short_target}_log"  # ✅ Each log is now uniquely named per target
    )

    checkpoint_callback = ModelCheckpoint(
        monitor="val_loss",
        dirpath=run_dir,
        filename="tft-{epoch:02d}-{val_loss:.2f}",
        save_top_k=1,
        save_last=True,
        mode="min"
    )

    early_stopping = EarlyStopping(
        monitor="val_loss",
        patience=train_config["early_stopping_patience"],
        mode="min"
    )


🔁 Training for target: cpu_utilization_ratio
✅ Metadata saved to: /content/drive/MyDrive/output/cpu_200vms_past2100_fut2100_bs4_lr1e-03_hid16_20250707-030926/cpu_run_20250707-030926/forecast_metadata.csv


## Model, Lightning, Trainer

In [20]:
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"⚡ Using device: {device.upper()}")


import pytorch_lightning as pl
import torch
from pytorch_forecasting.models import TemporalFusionTransformer

class TFTLightningModule(pl.LightningModule):
    def __init__(self, tft_model: TemporalFusionTransformer, learning_rate: float, loss_fn: torch.nn.Module):
        super().__init__()
        self.tft_model = tft_model
        self.learning_rate = learning_rate
        self.loss_fn = loss_fn

    def forward(self, x):
        return self.tft_model(x)

    def training_step(self, batch, batch_idx):
        x, y = batch
        x = x.to(self.device)
        y = y.to(self.device)
        y_hat = self(x)
        loss = self.loss_fn(y_hat.prediction, y)
        self.log("train_loss", loss, on_step=False, on_epoch=True, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        x = x.to(self.device)
        y = y.to(self.device)
        y_hat = self(x)
        loss = self.loss_fn(y_hat.prediction, y)
        self.log("val_loss", loss, on_step=False, on_epoch=True, prog_bar=True)
        return loss

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)
        return optimizer

# Create the TFT model
tft_model = TemporalFusionTransformer.from_dataset(
    tft_dataset,
    learning_rate=train_config["learning_rate"],
    hidden_size=train_config["hidden_size"],
    dropout=train_config["dropout"],
    loss=train_config["loss_fn"],
    log_interval=10,
    reduce_on_plateau_patience=4
)

# Wrap the TFT model in a LightningModule
model = TFTLightningModule(
    tft_model=tft_model,
    learning_rate=train_config["learning_rate"],
    loss_fn=train_config["loss_fn"]
)

# Setup Trainer
if torch.cuda.is_available():
    accelerator = "gpu"
    devices = 1
else:
    accelerator = "cpu"
    devices = 1

trainer = Trainer(
    max_epochs=train_config["epochs"],
    accelerator=accelerator,
    devices=devices,
    logger=logger,
    callbacks=[checkpoint_callback, early_stopping],
    enable_checkpointing=True
)

# Fit the model
trainer.fit(model, train_dataloaders=train_dataloader, val_dataloaders=val_dataloader)

⚡ Using device: CUDA


INFO: GPU available: True (cuda), used: True
INFO:lightning.pytorch.utilities.rank_zero:GPU available: True (cuda), used: True
INFO: TPU available: False, using: 0 TPU cores
INFO:lightning.pytorch.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO: HPU available: False, using: 0 HPUs
INFO:lightning.pytorch.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name      | Type                      | Params | Mode 
----------------------------------------------------------------
0 | tft_model | TemporalFusionTransformer | 40.8 K | train
1 | loss_fn   | RMSE                      | 0      | eval 
----------------------------------------------------------------
40.8 K    Trainable params
0         Non-trainable params
40.8 K    Total params
0.163     Total estimated model params size (MB)
733       Modules in train mode
1         Modules in 

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

AttributeError: 'dict' object has no attribute 'to'

# Actual vs Prediction Graphs

In [None]:
# 🔮 Step 1: Make raw predictions on validation set
prediction_output = model.tft_model.predict(
    val_dataloader, mode='raw', return_x=True
)

# ✅ Step 2: Extract input and output
x = prediction_output["x"]
predictions = prediction_output["output"]

# ✅ Step 3: Extract forecast values as numpy array (for CSV export)
forecast = predictions["prediction"].detach().cpu().numpy()

# ✅ Step 4: Plot forecast using built-in TFT visualization
fig = model.tft_model.plot_prediction(
    x, predictions, idx=0, show_future_observed=True
)
plt.title(f"Prediction Plot for {target}")

# ✅ Reduce legend size and move it neatly outside
plt.legend(
    loc='upper left',
    bbox_to_anchor=(1, 1),
    fontsize='small',
    frameon=True
)

plt.tight_layout()

# ✅ Step 5: Save the plot as PNG
plt.savefig(f"{run_dir}/plot.png", bbox_inches='tight')
plt.close()
print(f"✅ Prediction plot saved at: {run_dir}/plot.png")

# ✅ Step 6: Save forecast to CSV
pd.DataFrame(forecast, columns=[f'{target}_forecast']).to_csv(
    f"{run_dir}/predictions.csv", index=False
)
print(f"✅ Forecast values saved to: {run_dir}/predictions.csv")

## Spike Detection & Save Metadata

In [None]:
import os
import json
import numpy as np

# Ensure forecast is a NumPy array
if isinstance(forecast, torch.Tensor):
    forecast = forecast.detach().cpu().numpy()
elif isinstance(forecast, pd.Series):
    forecast = forecast.values
else:
    forecast = np.array(forecast)

# Detect spikes above the 95th percentile
spikes = forecast > np.percentile(forecast, 95)
spike_count = int(spikes.sum())

# 🔹 Save notes.txt with target name and spike info
notes_path = os.path.join(run_dir, "notes.txt")
with open(notes_path, "w") as f:
    f.write(f"Target: {target}\n")
    f.write(f"Spikes > 95th percentile: {spike_count}\n")
    f.write("Review plot.png and predictions.csv for further insights.\n")

print(f"📄 Notes saved at: {notes_path}")

# 🔹 Prepare train_config for JSON (remove non-serializable objects)
serializable_train_config = train_config.copy()
serializable_train_config["loss_fn"] = serializable_train_config["loss_fn"].__class__.__name__

# 🔹 Save model config as JSON
config_path = os.path.join(run_dir, "modelconfig.json")
with open(config_path, "w") as f:
    json.dump(serializable_train_config, f, indent=2)

print(f"✅ Config saved at: {config_path}")
print(f"✅ Run complete. Outputs saved at: {run_dir}")

forecast_path = os.path.join(run_dir, "predictions.csv")
forecast_df.to_csv(forecast_path, index=False)
print(f"✅ Forecast values saved to: {forecast_path}")