<a href="https://colab.research.google.com/github/swaraj0009/AI_Models/blob/master/model_results_inspection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive.
# The google.colab package is only importable inside a Colab runtime; guard the
# import so a Restart & Run All on a local Jupyter kernel does not stop here.
try:
    from google.colab import drive
except ImportError:
    drive = None
    print("google.colab is not available → skipping Google Drive mount.")
else:
    drive.mount('/content/drive')

MessageError: Error: credential propagation was unsuccessful

## Setup paths and imports

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pytorch_forecasting import TimeSeriesDataSet

# Folder holding the artifacts of one training run. The cells below read
# metrics.csv, tft_df.csv, saved_dataset and a .ckpt checkpoint from it.
# This value is a placeholder: replace it with a real path before running.
run_dir = "/path/to/your/output_folder"  # e.g. "cpu_utilization_ratio_run_20250703-163744"

## Load and plot training metrics

In [None]:
# Load the training log of this run; `metrics` is reused by later cells.
metrics = pd.read_csv(os.path.join(run_dir, "metrics.csv"))
print(metrics.head())

# Plot train and validation loss against the epoch column.
# If the logger wrote train and validation values on separate rows, each loss
# column contains NaN gaps, and matplotlib does not connect points across NaN
# (the curve would come out broken or empty). Drop the missing rows per series
# before plotting; a log without gaps is plotted exactly as before.
fig, ax = plt.subplots(figsize=(10, 6))
for loss_column, curve_label in (
    ('train_loss', 'Train Loss'),
    ('val_loss', 'Validation Loss'),
):
    recorded = metrics[['epoch', loss_column]].dropna()
    ax.plot(recorded['epoch'], recorded[loss_column], label=curve_label)
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.set_title('Train & Validation Loss')
ax.legend()
ax.grid(True)
plt.show()

## Load cleaned data snapshot and inspect

In [None]:
# Read the data snapshot that was saved alongside this run.
snapshot_path = os.path.join(run_dir, "tft_df.csv")
df = pd.read_csv(snapshot_path)

# Text previews: summary statistics first, then the first rows.
for preview in (df.describe(), df.head()):
    print(preview)

# Histogram (50 bins, KDE overlay) of the cpu_utilization_ratio column.
fig, ax = plt.subplots(figsize=(8, 4))
cpu_ratio = df['cpu_utilization_ratio']
sns.histplot(cpu_ratio, bins=50, kde=True, ax=ax)
ax.set_title("CPU Utilization Distribution")
plt.show()

## Load saved dataset for inference consistency

In [None]:
# Restore the TimeSeriesDataSet stored in the run folder and print its repr.
# NOTE(review): load() presumably unpickles the file, so only point it at
# artifacts you produced yourself; confirm against the pytorch_forecasting docs.
saved_dataset_path = os.path.join(run_dir, "saved_dataset")
dataset = TimeSeriesDataSet.load(saved_dataset_path)
print(dataset)

## Simple tuning suggestions based on metrics

In [None]:
def analyze_metrics(metrics_df, high_loss_threshold=0.5, overfit_gap_threshold=0.1):
    """Print the final train/validation losses and a rule-of-thumb tuning hint.

    Args:
        metrics_df: DataFrame with 'train_loss' and 'val_loss' columns.
        high_loss_threshold: A final loss above this value is reported as high
            (applies to validation loss first, then to training loss).
        overfit_gap_threshold: A (val - train) gap above this value is reported
            as possible overfitting.

    Returns:
        None. All results are printed.

    Raises:
        KeyError: If either loss column is missing from ``metrics_df``.
    """
    # Use the last *recorded* value of each column. A log that stores train and
    # validation losses on separate rows has NaN in the final row of one column;
    # NaN makes every comparison below False, which would wrongly fall through
    # to the "Training looks good" branch.
    train_losses = metrics_df['train_loss'].dropna()
    val_losses = metrics_df['val_loss'].dropna()
    if train_losses.empty or val_losses.empty:
        print("No recorded train/validation loss values → nothing to analyze.")
        return

    train_last = train_losses.iloc[-1]
    val_last = val_losses.iloc[-1]
    diff = val_last - train_last

    print(f"Last Train Loss: {train_last:.4f}")
    print(f"Last Val Loss: {val_last:.4f}")
    print(f"Difference (Val - Train): {diff:.4f}")

    # Checks are ordered by priority; only the first matching hint is printed.
    if val_last > high_loss_threshold:
        print("High validation loss → try increasing model size or training longer.")
    elif diff > overfit_gap_threshold:
        print("Possible overfitting → try increasing dropout or early stopping patience.")
    elif train_last > high_loss_threshold:
        print("High training loss → try reducing dropout or increasing hidden size.")
    else:
        print("Training looks good. Consider lowering learning rate for fine tuning.")

# Print the heuristic summary for the metrics frame loaded from metrics.csv above.
analyze_metrics(metrics)

## Inspect checkpoint metadata

In [None]:
import torch

# Placeholder file name: substitute the epoch / val_loss values of the
# checkpoint you want to inspect before running this cell.
ckpt_path = os.path.join(run_dir, "tft-epoch=XX-val_loss=YY.ckpt")

# The checkpoint is read as a dict that holds more than tensors (see the
# 'hyper_parameters' entry printed below). torch >= 2.6 defaults to
# weights_only=True, which refuses to unpickle arbitrary Python objects, so
# request full unpickling explicitly.
# SECURITY: full unpickling can execute arbitrary code. Only load checkpoints
# that you produced yourself or otherwise trust.
ckpt = torch.load(ckpt_path, map_location="cpu", weights_only=False)

# Top-level entries of the checkpoint, followed by the training position and
# the stored hyperparameters.
print("Checkpoint keys:", ckpt.keys())
print("Epoch:", ckpt['epoch'])
print("Global step:", ckpt['global_step'])
print("Hyperparameters:", ckpt['hyper_parameters'])