In [1]:
# prompt: write code for google drive

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [18]:
# ⚡ Quick Setup - Run after runtime reset (CPU/GPU Switch)
# Installs essential packages silently to save output clutter

!pip install torch pytorch-forecasting pytorch-lightning \
    rich colorama matplotlib seaborn pandas numpy tensorboard \
    'lightning[extra]' pyarrow fastparquet --quiet

print("\033[92m✅ All required packages installed successfully.\033[0m")

[92m✅ All required packages installed successfully.[0m


In [13]:
# Standard Library
import os
import glob
import json
import shutil
from concurrent.futures import ThreadPoolExecutor

# Third-Party Libraries
import numpy as np
import pandas as pd
import dask.dataframe as dd
import matplotlib.pyplot as plt
import seaborn as sns

import torch

# PyTorch Lightning
from pytorch_lightning import Trainer
from pytorch_lightning.loggers import CSVLogger
from pytorch_lightning.callbacks import ModelCheckpoint

# PyTorch Forecasting
from pytorch_forecasting import TimeSeriesDataSet, TemporalFusionTransformer
from pytorch_forecasting.data import GroupNormalizer
from pytorch_forecasting.metrics import RMSE

In [14]:
import os

parquet_path = "/content/drive/MyDrive/datasets/processed/FeatureEngcolab"

# List all VM partitions (folder names)
vm_folders = sorted([
    name.split('=')[1] for name in os.listdir(parquet_path) if name.startswith("VM=")
])

print(f"Available VMs: {vm_folders[:10]} ... Total: {len(vm_folders)}")

Available VMs: ['1', '10', '100', '1000', '1001', '1002', '1003', '1004', '1005', '1006'] ... Total: 1250


In [5]:
# Load First N VMs Dynamically [100, 250, 500, 750, 1000, 1250]

for N in [30]:
    selected_vms = vm_folders[:N]

    df3 = dd.read_parquet(
        parquet_path,
        filters=[("VM", "in", selected_vms)]
    ).compute()

    print(f"✅ Loaded {N} VMs → Shape: {df3.shape}")

    # Optionally: Run model here

✅ Loaded 30 VMs → Shape: (264533, 50)


In [6]:
print(df3.columns.tolist())

['Timestamp [s]', 'CPU cores', 'CPU capacity provisioned [MHZ]', 'CPU usage [MHZ]', 'CPU usage [%]', 'Memory capacity provisioned [KB]', 'Memory usage [KB]', 'Disk read throughput [KB/s]', 'Disk write throughput [KB/s]', 'Network received throughput [KB/s]', 'Network transmitted throughput [KB/s]', 'Timestamp', 'time_idx', 'time_diff', 'hour', 'dayofweek', 'is_weekend', 'month', 'day', 'hour_sin', 'hour_cos', 'dayofweek_sin', 'dayofweek_cos', 'month_sin', 'month_cos', 'cpu_utilization_ratio', 'memory_utilization_ratio', 'cpu_util_percent', 'memory_util_percent', 'cpu_util_prev', 'cpu_util_diff', 'memory_util_prev', 'memory_util_diff', 'disk_total_throughput', 'disk_rolling_mean', 'disk_rolling_std', 'network_total_throughput', 'network_rolling_mean', 'network_rolling_std', 'disk_read_prev', 'disk_read_diff', 'disk_write_prev', 'disk_write_diff', 'network_received_prev', 'network_received_diff', 'network_transmitted_prev', 'network_transmitted_diff', 'network_total_prev', 'network_total

In [7]:
df3 = df3.rename(columns={'VM': 'vm_id'})

tft_df = df3.dropna(subset=[
    'cpu_utilization_ratio',
    'memory_utilization_ratio',
    'disk_total_throughput',
    'network_total_throughput'
])

tft_df = tft_df.compute() if 'dask' in str(type(tft_df)) else tft_df

In [8]:
# Define target variables
# targets = ['cpu_utilization_ratio', 'memory_utilization_ratio', 'disk_total_throughput', 'network_total_throughput']

In [9]:
# Unified config — modify only here
train_config = {
    "targets": ['cpu_utilization_ratio'],
    "time_varying_known_reals": ['time_idx', 'hour_sin', 'hour_cos', 'dayofweek_sin', 'dayofweek_cos', 'month_sin', 'month_cos'],
    "group_ids": ['vm_id'],
    "max_encoder_length": 30,
    "max_prediction_length": 6,
    "hidden_size": 8,
    "dropout": 0.1,
    "learning_rate": 0.01,
    "batch_size": 64,
    "num_workers": 2,
    "epochs": 2,
    "loss_fn": RMSE(),
    "output_base_dir": "/content/drive/MyDrive/output",
    "log_dir": "/content/drive/MyDrive/output/logs"
}

# Ensure output folders exist
os.makedirs(train_config["output_base_dir"], exist_ok=True)
os.makedirs(train_config["log_dir"], exist_ok=True)

In [10]:
val_df = tft_df[tft_df.time_idx > tft_df['time_idx'].max() * 0.8]
print(f"Validation data points: {len(val_df)}")
print(f"Minimum required: {train_config['max_encoder_length'] + train_config['max_prediction_length']}")

Validation data points: 42999
Minimum required: 36


In [11]:
# Reset index (important for unique indexing)
tft_df = tft_df.reset_index(drop=True)

# Prepare TimeSeriesDataSet for training portion (80%)
dataset = TimeSeriesDataSet(
    tft_df[tft_df.time_idx <= tft_df['time_idx'].max() * 0.8],
    time_idx='time_idx',
    target=train_config["targets"][0],  # 'cpu_utilization_ratio' here
    group_ids=train_config["group_ids"],
    max_encoder_length=train_config["max_encoder_length"],
    max_prediction_length=train_config["max_prediction_length"],
    time_varying_known_reals=train_config["time_varying_known_reals"],
    time_varying_unknown_reals=train_config["targets"],
    target_normalizer=GroupNormalizer(groups=train_config["group_ids"]),
    add_relative_time_idx=True,
    add_target_scales=True,
    add_encoder_length=True,
    allow_missing_timesteps=True
)

# Validation dataset for prediction (no randomization, full data)
val_dataset = TimeSeriesDataSet.from_dataset(
    dataset, tft_df, predict=True, stop_randomization=True
)

# Create dataloaders

train_dataloader = dataset.to_dataloader(
    train=True,
    batch_size=train_config["batch_size"],
    num_workers=train_config["num_workers"]
)

val_dataloader = val_dataset.to_dataloader(
    train=False,
    batch_size=train_config["batch_size"],
    num_workers=train_config["num_workers"]
)


print(f"✅ Dataset and dataloaders ready. Train batches: {len(train_dataloader)}, Val batches: {len(val_dataloader)}")

✅ Dataset and dataloaders ready. Train batches: 2898, Val batches: 1


In [12]:
from pytorch_lightning.callbacks import EarlyStopping

for target in train_config["targets"]:
    print(f"\n🔁 Training for target: {target}")

    run_dir = os.path.join(train_config["output_base_dir"], f"{target}_run")
    os.makedirs(run_dir, exist_ok=True)

    # Save cleaned dataset snapshot for debugging
    tft_df.to_csv(f"{run_dir}/tft_df.csv", index=False)

    # Setup logging & checkpointing
    logger = CSVLogger(save_dir=train_config["log_dir"], name=f"{target}_log")

    checkpoint_callback = ModelCheckpoint(
        monitor="val_loss",
        dirpath=run_dir,
        filename="tft-{epoch:02d}-{val_loss:.2f}",
        save_top_k=1,
        save_last=True,
        mode="min"
    )

    early_stopping = EarlyStopping(monitor="val_loss", patience=3, mode="min")


🔁 Training for target: cpu_utilization_ratio


Unnamed: 0,Timestamp [s],CPU cores,CPU capacity provisioned [MHZ],CPU usage [MHZ],CPU usage [%],Memory capacity provisioned [KB],Memory usage [KB],Disk read throughput [KB/s],Disk write throughput [KB/s],Network received throughput [KB/s],...,disk_read_diff,disk_write_prev,disk_write_diff,network_received_prev,network_received_diff,network_transmitted_prev,network_transmitted_diff,network_total_prev,network_total_diff,vm_id
0,1376314846,4,11703.99824,10912.027692,93.233333,67108864.0,6129274.0,0.133333,15981.6,0.0,...,,,,,,,,,,1
1,1376315146,4,11703.99824,10890.570362,93.05,67108864.0,6755624.0,1.333333,19137.333333,0.0,...,1.2,15981.6,3155.733333,0.0,0.0,2.133333,0.466667,2.133333,0.466667,1
2,1376315446,4,11703.99824,10434.114431,89.15,67108864.0,8947846.0,2.533333,19974.933333,535.666667,...,1.2,19137.333333,837.6,0.0,535.666667,2.6,21.333333,2.6,557.0,1
3,1376315746,4,11703.99824,10539.450415,90.05,67108864.0,18790480.0,5.466667,8791.8,349.666667,...,2.933333,19974.933333,-11183.133333,535.666667,-186.0,23.933333,-18.466667,559.6,-204.466667,1
4,1376316046,4,11703.99824,10951.04102,93.566667,67108864.0,9305761.0,5.4,15679.533333,0.0,...,-0.066667,8791.8,6887.733333,349.666667,-349.666667,5.466667,-3.4,355.133333,-353.066667,1


In [None]:
# ckpt_path = os.path.join(run_dir, "tft-last.ckpt")

# if os.path.exists(ckpt_path):
#     print(f"📦 Resuming from checkpoint: {ckpt_path}")
#     model = TemporalFusionTransformer.load_from_checkpoint(
#         checkpoint_path=ckpt_path,
#         dataset=dataset,
#         loss=train_config["loss_fn"]
#     )
# else:
#     print("🆕 Starting new model")
#     model = TemporalFusionTransformer.from_dataset(
#         dataset,
#         learning_rate=train_config["learning_rate"],
#         hidden_size=train_config["hidden_size"],
#         dropout=train_config["dropout"],
#         loss=train_config["loss_fn"],
#         log_interval=10,
#         reduce_on_plateau_patience=4
#     )

# # ✅ Fixed indentation below:
# if torch.cuda.is_available():
#     accelerator = "gpu"
#     devices = 1
# else:
#     accelerator = "cpu"
#     devices = 1

# trainer = Trainer(
#     max_epochs=train_config["epochs"],
#     accelerator=accelerator,
#     devices=devices,
#     logger=logger,
#     callbacks=[checkpoint_callback, early_stopping],
#     enable_checkpointing=True
# )

# trainer.fit(model, train_dataloaders=train_dataloader, val_dataloaders=val_dataloader)

import pytorch_lightning as pl

class TFTLightningModule(pl.LightningModule):
    def __init__(self, tft_model: TemporalFusionTransformer, learning_rate: float, loss_fn: torch.nn.Module):
        super().__init__()
        self.tft_model = tft_model
        self.learning_rate = learning_rate
        self.loss_fn = loss_fn

    def forward(self, x):
        return self.tft_model(x)

    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = self.loss_fn(y_hat.prediction, y) # Extract prediction from output
        self.log("train_loss", loss)
        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = self.loss_fn(y_hat.prediction, y) # Extract prediction from output
        self.log("val_loss", loss)
        return loss

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)
        return optimizer

# Create the TFT model
tft_model = TemporalFusionTransformer.from_dataset(
    dataset,
    learning_rate=train_config["learning_rate"],
    hidden_size=train_config["hidden_size"],
    dropout=train_config["dropout"],
    loss=train_config["loss_fn"],
    log_interval=10,
    reduce_on_plateau_patience=4
)

# Wrap the TFT model in a LightningModule
model = TFTLightningModule(
    tft_model=tft_model,
    learning_rate=train_config["learning_rate"],
    loss_fn=train_config["loss_fn"]
)

# Setup Trainer
if torch.cuda.is_available():
    accelerator = "gpu"
    devices = 1
else:
    accelerator = "cpu"
    devices = 1

trainer = Trainer(
    max_epochs=train_config["epochs"],
    accelerator=accelerator,
    devices=devices,
    logger=logger,
    callbacks=[checkpoint_callback, early_stopping],
    enable_checkpointing=True
)

# Fit the model
trainer.fit(model, train_dataloaders=train_dataloader, val_dataloaders=val_dataloader)

/usr/local/lib/python3.11/dist-packages/lightning/pytorch/utilities/parsing.py:209: Attribute 'loss' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['loss'])`.
/usr/local/lib/python3.11/dist-packages/lightning/pytorch/utilities/parsing.py:209: Attribute 'logging_metrics' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['logging_metrics'])`.
INFO: GPU available: False, used: False
INFO:lightning.pytorch.utilities.rank_zero:GPU available: False, used: False
INFO: TPU available: False, using: 0 TPU cores
INFO:lightning.pytorch.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO: HPU available: False, using: 0 HPUs
INFO:lightning.pytorch.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.callbacks.model_summary:
  | Name      | Type                      |

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/usr/local/lib/python3.11/dist-packages/pytorch_lightning/utilities/data.py:79: Trying to infer the `batch_size` from an ambiguous collection. The batch size we found is 30. To avoid any miscalculations, use `self.log(..., batch_size=batch_size)`.


Training: |          | 0/? [00:00<?, ?it/s]