In [6]:
# Mount Google Drive into the Colab filesystem.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
# Install required packages (uncomment the line below on a fresh runtime)
# !pip install torch pytorch-forecasting pytorch-lightning rich colorama matplotlib seaborn pandas numpy tensorboard lightning[extra] pyarrow fastparquet

In [8]:
# Standard Library
import os
import glob
import json
import shutil
from concurrent.futures import ThreadPoolExecutor

# Third-Party Libraries
import numpy as np
import pandas as pd
import dask.dataframe as dd
import matplotlib.pyplot as plt
import seaborn as sns

import torch

# PyTorch Lightning
from pytorch_lightning import Trainer
from pytorch_lightning.loggers import CSVLogger
from pytorch_lightning.callbacks import ModelCheckpoint

# PyTorch Forecasting
from pytorch_forecasting import TimeSeriesDataSet, TemporalFusionTransformer
from pytorch_forecasting.data import GroupNormalizer
from pytorch_forecasting.metrics import RMSE

In [9]:
import os

# Root directory that holds one "VM=<id>" sub-folder per VM.
parquet_path = "/content/drive/MyDrive/datasets/processed/FeatureEngcolab"

# Collect the id part of every "VM=<id>" entry found under the root.
vm_folders = []
for entry in os.listdir(parquet_path):
    if entry.startswith("VM="):
        vm_folders.append(entry.split('=')[1])

# Ids are strings, so this is a lexicographic sort ('1', '10', '100', ...).
vm_folders.sort()

preview = vm_folders[:10]
print(f"Available VMs: {preview} ... Total: {len(vm_folders)}")

Available VMs: ['1', '10', '100', '1000', '1001', '1002', '1003', '1004', '1005', '1006'] ... Total: 1250


In [10]:
# Load the first N VMs of `vm_folders` into memory.
# Other sizes to try: [100, 250, 500, 750, 1000, 1250]
vm_counts = [50]

for N in vm_counts:
    selected_vms = vm_folders[:N]

    # Restrict the read to the selected VM values through the `filters` argument.
    vm_filter = [("VM", "in", selected_vms)]
    lazy_frame = dd.read_parquet(parquet_path, filters=vm_filter)

    # .compute() materialises the dask collection; df3 is an in-memory frame from here on.
    df3 = lazy_frame.compute()

    print(f"✅ Loaded {N} VMs → Shape: {df3.shape}")

    # Optionally: Run model here

✅ Loaded 50 VMs → Shape: (441151, 50)


In [12]:
# Show every column name of the loaded frame as a plain Python list.
column_names = list(df3.columns)
print(column_names)

['Timestamp [s]', 'CPU cores', 'CPU capacity provisioned [MHZ]', 'CPU usage [MHZ]', 'CPU usage [%]', 'Memory capacity provisioned [KB]', 'Memory usage [KB]', 'Disk read throughput [KB/s]', 'Disk write throughput [KB/s]', 'Network received throughput [KB/s]', 'Network transmitted throughput [KB/s]', 'VM', 'Timestamp', 'time_idx', 'time_diff', 'hour', 'dayofweek', 'is_weekend', 'month', 'day', 'hour_sin', 'hour_cos', 'dayofweek_sin', 'dayofweek_cos', 'month_sin', 'month_cos', 'cpu_utilization_ratio', 'memory_utilization_ratio', 'cpu_util_percent', 'memory_util_percent', 'cpu_util_prev', 'cpu_util_diff', 'memory_util_prev', 'memory_util_diff', 'disk_total_throughput', 'disk_rolling_mean', 'disk_rolling_std', 'network_total_throughput', 'network_rolling_mean', 'network_rolling_std', 'disk_read_prev', 'disk_read_diff', 'disk_write_prev', 'disk_write_diff', 'network_received_prev', 'network_received_diff', 'network_transmitted_prev', 'network_transmitted_diff', 'network_total_prev', 'network

In [13]:
# Rename the 'VM' column to 'vm_id' (the name used as the group id in the training config).
df3 = df3.rename({'VM': 'vm_id'}, axis='columns')

In [14]:
# Drop rows that are missing any of the candidate target columns.
required_columns = [
    'cpu_utilization_ratio',
    'memory_utilization_ratio',
    'disk_total_throughput',
    'network_total_throughput'
]
tft_df = df3.dropna(subset=required_columns)

# df3 was already materialised with .compute() when it was loaded, so dropna()
# normally returns an in-memory frame that has no .compute() method. Only
# compute when the result is still a lazy collection.
if hasattr(tft_df, "compute"):
    tft_df = tft_df.compute()

In [15]:
# Forecast target(s). Other candidate targets, currently disabled:
# 'memory_utilization_ratio', 'disk_total_throughput', 'network_total_throughput'
targets = ['cpu_utilization_ratio']

# Real-valued inputs listed as "known": the time index plus the
# <period>_sin / <period>_cos column pairs for hour, day of week and month.
time_varying_known_reals = ['time_idx'] + [
    f'{period}_{wave}'
    for period in ('hour', 'dayofweek', 'month')
    for wave in ('sin', 'cos')
]

In [None]:
# 🔧 Step 4: Unified config — update only here
train_config = {
    "targets": ['cpu_utilization_ratio'],
    "time_varying_known_reals": ['time_idx', 'hour_sin', 'hour_cos', 'dayofweek_sin', 'dayofweek_cos', 'month_sin', 'month_cos'],
    "group_ids": ['vm_id'],
    "max_encoder_length": 12,
    "max_prediction_length": 3,
    "hidden_size": 8,
    "dropout": 0.1,
    "learning_rate": 0.03,
    "batch_size": 16,
    "epochs": 2,
    "loss_fn": RMSE(),
    "output_base_dir": "/home/output",
    "log_dir": "/home/output/logs"
}

# 🚀 Step 5: Train for each target
# For every target this loop builds the datasets, trains a Temporal Fusion
# Transformer, and writes the artefacts (data snapshot, checkpoints, plot,
# predictions, metrics, params, notes) into "<output_base_dir>/<target>_run".
for target in train_config["targets"]:
    print(f"\n🔁 Training for target: {target}")

    run_dir = os.path.join(train_config["output_base_dir"], f"{target}_run")
    os.makedirs(run_dir, exist_ok=True)

    # 💾 Optional: Save cleaned df snapshot
    tft_df.to_csv(f"{run_dir}/tft_df.csv", index=False)

    # 🧪 Dataset preparation
    # Training uses rows whose time_idx lies in the first 80% of the index range.
    training_cutoff = tft_df['time_idx'].max() * 0.8
    dataset = TimeSeriesDataSet(
        tft_df[tft_df.time_idx <= training_cutoff],
        time_idx='time_idx',
        target=target,
        group_ids=train_config["group_ids"],
        max_encoder_length=train_config["max_encoder_length"],
        max_prediction_length=train_config["max_prediction_length"],
        time_varying_known_reals=train_config["time_varying_known_reals"],
        time_varying_unknown_reals=train_config["targets"],
        target_normalizer=GroupNormalizer(groups=train_config["group_ids"]),
        add_relative_time_idx=True,
        add_target_scales=True,
        add_encoder_length=True,
        allow_missing_timesteps=True
    )

    # Validation set reuses the training dataset's parameters on the full frame.
    val_dataset = TimeSeriesDataSet.from_dataset(dataset, tft_df, predict=True, stop_randomization=True)

    train_dataloader = dataset.to_dataloader(train=True, batch_size=train_config["batch_size"], num_workers=0)
    val_dataloader = val_dataset.to_dataloader(train=False, batch_size=train_config["batch_size"], num_workers=0)

    # 📊 Logging and checkpoints
    logger = CSVLogger(save_dir=train_config["log_dir"], name=f"{target}_log")
    checkpoint_callback = ModelCheckpoint(
        monitor="val_loss",
        dirpath=run_dir,
        filename="tft-{epoch:02d}-{val_loss:.2f}",
        save_top_k=1,
        save_last=True,
        mode="min"
    )

    # 🔁 Resume or initialize model
    # save_last=True writes the most recent checkpoint as "last.ckpt" inside
    # dirpath; `filename` only applies to the top-k checkpoints. Looking for
    # "tft-last.ckpt" therefore never matched and resume never happened.
    ckpt_path = os.path.join(run_dir, "last.ckpt")
    if os.path.exists(ckpt_path):
        print(f"📦 Resuming from checkpoint: {ckpt_path}")
        # The checkpoint carries the hyper-parameters it was trained with, so
        # only the loss is overridden here. The earlier `dataset=` keyword is
        # not a model constructor argument and has been dropped.
        model = TemporalFusionTransformer.load_from_checkpoint(
            checkpoint_path=ckpt_path,
            loss=train_config["loss_fn"]
        )
    else:
        print("🆕 Starting new model")
        model = TemporalFusionTransformer.from_dataset(
            dataset,
            learning_rate=train_config["learning_rate"],
            hidden_size=train_config["hidden_size"],
            dropout=train_config["dropout"],
            loss=train_config["loss_fn"],
            log_interval=10,
            reduce_on_plateau_patience=4,
        )

    # 🏋️ Trainer
    # devices=None is rejected on CPU by current Lightning releases; "auto"
    # lets the selected accelerator choose its own device count.
    trainer = Trainer(
        max_epochs=train_config["epochs"],
        accelerator='auto',
        devices=1 if torch.cuda.is_available() else "auto",
        logger=logger,
        callbacks=[checkpoint_callback],
        enable_checkpointing=True
    )

    trainer.fit(model, train_dataloaders=train_dataloader, val_dataloaders=val_dataloader)

    # 🔍 Predict
    # NOTE(review): the two-value unpacking assumes predict(..., return_x=True)
    # returns (output, x) — confirm against the installed pytorch-forecasting version.
    predictions, x = model.predict(val_dataloader, mode='raw', return_x=True)
    forecast = predictions['prediction'][0].detach().cpu().numpy()

    # 📈 Plot forecast
    # Draw onto an explicit axes so the figure that gets titled, saved and
    # closed is the one holding the plot (no stray empty figure is left open).
    fig, ax = plt.subplots(figsize=(10, 6))
    model.plot_prediction(x, predictions, idx=0, show_future_observed=True, ax=ax)
    ax.set_title(f"Prediction Plot for {target}")
    fig.savefig(f"{run_dir}/plot.png")
    plt.close(fig)

    # 💾 Save forecast data
    pd.DataFrame(forecast, columns=[f'{target}_forecast']).to_csv(f"{run_dir}/predictions.csv", index=False)

    # 💾 Save training log
    log_csv_path = os.path.join(logger.log_dir, "metrics.csv")
    if os.path.exists(log_csv_path):
        shutil.copy(log_csv_path, f"{run_dir}/loss_log.csv")

    # 💾 Save training config
    # "loss_fn" holds a metric object that json cannot encode; anything that is
    # not JSON-native is written as its class name instead of raising TypeError.
    with open(f"{run_dir}/params.json", "w") as f:
        json.dump(train_config, f, indent=2, default=lambda obj: type(obj).__name__)

    # 📌 Save spike info
    # A "spike" is any forecast value above the 95th percentile of this forecast.
    spikes = forecast > np.percentile(forecast, 95)
    with open(f"{run_dir}/notes.txt", "w") as f:
        f.write(f"Target: {target}\n")
        f.write(f"Spikes > 95th percentile: {int(spikes.sum())}\n")
        f.write("Review plot.png and predictions.csv for further insights.\n")

    print(f"✅ Run complete — outputs saved at: {run_dir}")



🔁 Training for target: cpu_utilization_ratio
