In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

import tensorflow as tf
import keras_tuner as kt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

from sklearn.metrics import mean_squared_error, mean_absolute_error, accuracy_score, r2_score, mean_absolute_percentage_error
from sklearn.preprocessing import MinMaxScaler

import joblib

In [None]:
# Directory prefix for the input CSV; a relative path, so it resolves against the working directory.
PROJECT_PATH: str = '../'

In [None]:
# Sliding-window settings handed to multivariate_data further down.
WINDOW_SIZE: int = 30       # rows per input window (passed as history_size)
FORECAST_HORIZON: int = 5   # label offset (passed as target_size)

In [None]:
# Build the path with os.path.join instead of string concatenation: the old
# PROJECT_PATH + '/...' form produced '..//LSTM_Ready_Dataset_Old.csv' and would
# have pointed at the filesystem root if PROJECT_PATH were ever set to ''.
df = pd.read_csv(os.path.join(PROJECT_PATH, 'LSTM_Ready_Dataset_Old.csv'))
df

In [None]:
# Parse the 'timestamp' column with an explicit format and use the result as the row index.
timestamp_index = pd.to_datetime(df['timestamp'], format = '%Y-%m-%d %H:%M:%S')
df.index = timestamp_index
df

In [None]:
# =====================================================================
# FEATURE ENGINEERING (CLUSTER-LEVEL TOTALS)
# =====================================================================

# Each cluster-level quantity is the element-wise sum of the two worker-node
# columns, wrapped in a single-column DataFrame.
cluster_cpu_req = (
    df['node_cpu_req_aj-aung-k8s-worker1']
    .add(df['node_cpu_req_aj-aung-k8s-worker2'])
    .to_frame('cluster_cpu_req')
)

cluster_cpu_cap = (
    df['node_cpu_cap_aj-aung-k8s-worker1']
    .add(df['node_cpu_cap_aj-aung-k8s-worker2'])
    .to_frame('cluster_cpu_cap')
)

cluster_mem_req = (
    df['node_mem_req_aj-aung-k8s-worker1']
    .add(df['node_mem_req_aj-aung-k8s-worker2'])
    .to_frame('cluster_mem_req')
)

cluster_mem_cap = (
    df['node_mem_cap_aj-aung-k8s-worker1']
    .add(df['node_mem_cap_aj-aung-k8s-worker2'])
    .to_frame('cluster_mem_cap')
)

# Model inputs: the four cluster totals plus the pending-pods column, side by side.
feature_blocks = [cluster_cpu_req, cluster_cpu_cap, cluster_mem_req, cluster_mem_cap, df['cluster_pods_pending']]
features = pd.concat(feature_blocks, axis = 1)

# The prediction target is the summed CPU request. This is the same object as
# the first feature block, not a copy.
target = cluster_cpu_req

In [None]:
# Print the feature frame's column summary, then render the frame itself.
features.info()
features

In [None]:
# Per-column dtypes of the model inputs.
features.dtypes

In [None]:
# Print the target frame's column summary, then render the frame itself.
target.info()
target

In [None]:
# Dtype of the single target column.
target.dtypes

In [None]:
# =====================================================================
# CHRONOLOGICAL SPLIT (70/15/15)
# =====================================================================

def _split_in_order(frame, first_cut, second_cut):
    """Cut `frame` by row position into (train, validation, test), keeping row order."""
    return frame.iloc[:first_cut], frame.iloc[first_cut:second_cut], frame.iloc[second_cut:]

n = len(features)

# Row positions at which the validation and test partitions begin.
train_idx = int(n * 0.7)
val_idx   = int(n * 0.85)

# Features (X) and target (y) are cut at the same positions so their rows stay aligned.
X_train_raw, X_val_raw, X_test_raw = _split_in_order(features, train_idx, val_idx)
y_train_raw, y_val_raw, y_test_raw = _split_in_order(target, train_idx, val_idx)

tuple(
    part.shape
    for part in (X_train_raw, X_val_raw, X_test_raw, y_train_raw, y_val_raw, y_test_raw)
)

In [None]:
# =====================================================================
# DUAL SCALING (CRITICAL STEP)
# =====================================================================

# Each scaler learns its min/max from the training partition only and is then
# applied unchanged to the training, validation and test partitions.
scaler_inputs = MinMaxScaler(feature_range = (0, 1))
scaler_inputs.fit(X_train_raw)
X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler_inputs.transform(split) for split in (X_train_raw, X_val_raw, X_test_raw)
)

# The target gets its own scaler so predictions can be inverse-transformed on their own.
scaler_target = MinMaxScaler(feature_range = (0, 1))
scaler_target.fit(y_train_raw)
y_train_scaled, y_val_scaled, y_test_scaled = (
    scaler_target.transform(split) for split in (y_train_raw, y_val_raw, y_test_raw)
)

# Persist both fitted scalers into the current working directory.
for fitted_scaler, pkl_name in (
    (scaler_inputs, 'Multi-Var_Scaler_Inputs.pkl'),
    (scaler_target, 'Multi-Var_Scaler_Target.pkl'),
):
    joblib.dump(fitted_scaler, pkl_name)
print("Success: 'Multi-Var_Scaler_Inputs.pkl' & 'Multi-Var_Scaler_Target.pkl' saved.")

In [None]:
# =====================================================================
# SLIDING WINDOW (MULTIVARIATE)
# =====================================================================

def multivariate_data(dataset, target, start_index, end_index, history_size, target_size):
	"""Build sliding-window samples and single-value labels from two aligned arrays.

	For every position ``i`` in ``range(start_index + history_size, end_index)`` one
	sample is produced: rows ``i - history_size`` .. ``i - 1`` of ``dataset`` paired
	with the label ``target[i + target_size]``. The label therefore sits
	``target_size + 1`` rows after the last row of its window.

	Args:
		dataset: Array of input rows (a NumPy array in this notebook).
		target: Array of label rows, indexed with the same positions as ``dataset``.
			The upper bound is derived from ``len(dataset)``, so both are assumed to
			have the same number of rows.
		start_index: Position of the first row that may appear in a window.
		end_index: Exclusive upper bound for ``i``. ``None`` means "as far as labels
			exist". A value past ``len(dataset) - target_size`` is clamped to that
			limit so the label lookup cannot run off the end of ``target``.
		history_size: Number of rows in each window.
		target_size: Offset added to ``i`` to locate the label.

	Returns:
		``(data, labels)`` as NumPy arrays with one entry per window.
	"""
	data   = []
	labels = []

	# Largest exclusive bound for which target[i + target_size] still exists.
	last_labelable = len(dataset) - target_size
	if end_index is None:
		end_index = last_labelable
	else:
		# Previously an oversized explicit end_index raised IndexError on the label lookup.
		end_index = min(end_index, last_labelable)

	start_index = start_index + history_size

	for i in range(start_index, end_index):
		indices = range(i - history_size, i)
		data.append(dataset[indices])
		labels.append(target[i + target_size])

	return np.array(data), np.array(labels)

# Window each partition on its own so no window straddles a partition boundary.
(X_train, y_train), (X_val, y_val), (X_test, y_test) = (
    multivariate_data(inputs_scaled, labels_scaled, 0, None, WINDOW_SIZE, FORECAST_HORIZON)
    for inputs_scaled, labels_scaled in (
        (X_train_scaled, y_train_scaled),
        (X_val_scaled, y_val_scaled),
        (X_test_scaled, y_test_scaled),
    )
)

print(f"Train Shape: {X_train.shape}")
print(f"Target Shape: {y_train.shape}")

In [None]:
# (window length, number of features): the per-sample shape given to the first LSTM layer.
X_train.shape[1], X_train.shape[2]

In [None]:
# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation, dropout
# masks and batch shuffling are as repeatable as the backend allows.
tf.keras.utils.set_random_seed(42)

# Two stacked 256-unit LSTM layers, each followed by dropout, then a single-value head.
model = Sequential()
model.add(LSTM(units = 256, return_sequences = True, input_shape = (X_train.shape[1], X_train.shape[2])))
model.add(Dropout(0.2))
model.add(LSTM(units = 256))
model.add(Dropout(0.2))
model.add(Dense(1))
model.compile(loss = 'mae', optimizer = 'adam')
model.summary()

# Stop once validation loss stalls and roll back to the best epoch, so the model
# that is evaluated and saved later is not simply whatever the last epoch produced.
# The learning rate is halved first to give a stalled run a chance to recover.
training_callbacks = [
    EarlyStopping(monitor = 'val_loss', patience = 10, restore_best_weights = True),
    ReduceLROnPlateau(monitor = 'val_loss', factor = 0.5, patience = 5, min_lr = 1e-6),
]

history = model.fit(
    X_train, y_train,
    epochs = 100,
    batch_size = 32,
    validation_data = (X_val, y_val),
    callbacks = training_callbacks,
    verbose = 1,
)

In [None]:
# Quick look at the per-epoch training and validation loss recorded by fit().
fig, ax = plt.subplots(figsize = (25, 10))
ax.plot(history.history['loss'], label = 'Train Loss')
ax.plot(history.history['val_loss'], label = 'Validation Loss')
ax.legend()
plt.show()

In [None]:
# =====================================================================
# EVALUATION & PLOTTING
# =====================================================================

# Predict on the held-out windows; the outputs are still in the target scaler's 0-1 space.
y_scaled_predicted = model.predict(X_test)

# Undo the target scaling so every metric below is in the target's original units.
y_actuals                  = scaler_target.inverse_transform(y_test.reshape(-1, 1))
y_predicted_converted_back = scaler_target.inverse_transform(y_scaled_predicted)

mae  = mean_absolute_error(y_actuals, y_predicted_converted_back)
rmse = np.sqrt(mean_squared_error(y_actuals, y_predicted_converted_back))
mape = mean_absolute_percentage_error(y_actuals, y_predicted_converted_back) * 100
r2   = r2_score(y_actuals, y_predicted_converted_back)

# Assemble the report once and emit it with a single print call.
report_lines = [
    "\n--- OFFICIAL AUTOSCALER PERFORMANCE ---",
    f"Test RMSE: {rmse:.2f} vCores",
    f"Test MAE:  {mae:.2f} vCores",
    f"On average, the model's 5-minute forecast is off by {mae:.2f} vCores.",
    f"R-squared Score: {r2:.4f} (Model explains {r2 * 100:.2f}% of the variance)",
    f"MAPE: {mape:.2f}% (Predictions are off by an average of {mape:.2f}%)",
]
print("\n".join(report_lines))

In [None]:
# Learning curve. The model is compiled with loss='mae', so both curves are mean
# absolute error; the legend and axis previously mislabelled them as MSE.
plt.figure(figsize = (25, 10))
plt.plot(history.history['loss'], label = 'Training Loss (MAE)', color = 'blue')
plt.plot(history.history['val_loss'], label = 'Validation Loss (MAE)', color = 'orange')
plt.title('LSTM Learning Curve: Training Loss vs. Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss (Mean Absolute Error)')
plt.legend(loc = 'upper right')
plt.grid(True, linestyle = '--', alpha = 0.7)
plt.tight_layout()
plt.show()

In [None]:
# Overlay ground truth and forecast on the test windows. The two series were
# previously swapped, so predictions were drawn under the 'Actual' label and
# actuals under the 'Predicted' label.
plt.figure(figsize = (25, 10))
plt.plot(y_actuals, label = 'Actual vCores', color = 'blue', alpha = 0.6)
plt.plot(y_predicted_converted_back, label = 'Predicted vCores (5m ahead)', color='red', linestyle = '--', alpha = 0.8)
plt.title('Test Data: Actual CPU Requests vs Predicted CPU Requests')
plt.xlabel('Time Steps')
plt.ylabel('vCores')
plt.legend(loc = 'upper right')
plt.grid(True, linestyle = '--', alpha = 0.5)
plt.tight_layout()
plt.show()

In [None]:
# Keep the filename in one place so the success message cannot drift from the path
# actually written (the old f-string had no placeholder and repeated the name by hand).
MODEL_FILENAME = 'Multi-Variable_LSTM_Model.keras'
model.save(MODEL_FILENAME)
print(f"Success: Model fully saved to {MODEL_FILENAME}")

In [None]:
# Share of test predictions whose absolute error is at most 10% of the actual value.
absolute_errors = np.abs(y_actuals - y_predicted_converted_back)
allowed_error = y_actuals * 0.10  # 10% margin of error
within_margin = absolute_errors <= allowed_error
correct_predictions = within_margin.sum()

custom_accuracy = (correct_predictions / len(y_actuals)) * 100
print(f"Threshold Accuracy: {custom_accuracy:.2f}% of predictions were within a 10% margin of error.")