In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor



In [9]:
# Load the processed train and test CSVs (in that order) into two DataFrames.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../Data/Processed_Train_Resource_utilization.csv',
        '../Data/Processed_Test_Resource_utilization.csv',
    )
)

# Quick sanity check on the (rows, columns) of each split.
print(f"Training set shape: {train_df.shape}")
print(f"Testing set shape: {test_df.shape}")

Training set shape: (21044, 20)
Testing set shape: (5261, 20)


In [10]:
# Predictor columns, in the exact order they are passed to the scaler and model.
features = [
    'cpu_utilization', 'memory_usage', 'storage_usage', 'hour_of_day', 'day_index',
    'workload_lag_1', 'workload_lag_2', 'workload_lag_3',
    'workload_rolling_mean_3', 'workload_rolling_std_3', 'workload_expanding_mean',
    'cpu_utilization_lag_1', 'cpu_utilization_lag_2', 'cpu_utilization_lag_3',
    'cpu_utilization_expanding_mean',
]

# Inputs are the feature columns; the regression target is the 'workload' column.
X_train, y_train = train_df[features], train_df['workload']
X_test, y_test = test_df[features], test_df['workload']

# Learn the scaling statistics from the training rows only, then apply that
# same fitted transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [11]:
# Use the same train/test split and scaling as above: X_train_scaled,
# X_test_scaled, y_train and y_test come from the preceding cell, so they are
# reused here instead of re-slicing the frames and refitting a second scaler.

# Timestamps of the test rows, kept for the predictions export; None when the
# processed test file has no 'timestamp' column.
test_timestamps = test_df['timestamp'] if 'timestamp' in test_df.columns else None

model = XGBRegressor(
    n_estimators=100,
    max_depth=8,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,  # fixed seed so the row/column subsampling is repeatable
    objective='reg:squarederror',
)
model.fit(X_train_scaled, y_train)

# Predictions on the held-out test rows (used by the metric and export cells).
y_pred = model.predict(X_test_scaled)

# One importance value per entry of `features`, highest first.
feature_importance = pd.DataFrame({
    'Feature': features,
    'Importance': model.feature_importances_
}).sort_values(by='Importance', ascending=False)
print("\nFeature Importance:")
print(feature_importance)


Feature Importance:
                           Feature  Importance
0                  cpu_utilization    0.517359
3                      hour_of_day    0.180240
1                     memory_usage    0.118703
5                   workload_lag_1    0.029104
6                   workload_lag_2    0.029017
12           cpu_utilization_lag_2    0.023319
7                   workload_lag_3    0.022435
13           cpu_utilization_lag_3    0.022287
11           cpu_utilization_lag_1    0.022109
2                    storage_usage    0.019485
4                        day_index    0.015943
10         workload_expanding_mean    0.000000
9           workload_rolling_std_3    0.000000
8          workload_rolling_mean_3    0.000000
14  cpu_utilization_expanding_mean    0.000000


In [12]:
# Test-set error metrics for the predictions in y_pred.
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"R² Score: {r2:.4f}")

Mean Squared Error (MSE): 28.2251
R² Score: 0.7720


In [13]:
# Score the fitted model on the scaled test set.
# NOTE(review): in the recorded run this value (0.7720) is identical to the R²
# printed by the previous cell, i.e. `model.score` reports R² for this
# regressor rather than a classification-style accuracy. The "Accuracy" label
# and the `accuracy` name are kept because the metrics export reads them.
accuracy = model.score(X_test_scaled, y_test)
print(f"Accuracy: {accuracy:.4f}")


Accuracy: 0.7720


In [14]:
# Attach the test-set predictions to the corresponding rows of the raw data
# file and write the result next to it.
df_save = pd.read_csv('../Data/Resource_utilization.csv')
train_size = len(train_df)

# Keep only the rows after the training portion. `.copy()` makes the slice an
# independent frame, so adding a column below does not write into a slice of
# the full frame (the SettingWithCopy pattern).
df_save = df_save.iloc[train_size:].copy()

# Fail with a clear message if the raw rows do not line up one-to-one with the
# predictions; pandas would otherwise raise a less specific length error.
if len(df_save) != len(y_pred):
    raise ValueError(
        f"Row count mismatch: {len(df_save)} rows remain in the raw file after "
        f"skipping {train_size} training rows, but there are {len(y_pred)} "
        "predictions; the raw file and the processed train/test files are not "
        "row-aligned."
    )

df_save['predicted_workload'] = y_pred
df_save.to_csv('../Data/Predicted_Resource_utilization.csv', index=False)

In [15]:
import os
import joblib
import json

# Every artifact below is written into ../Model; create the folder first so the
# cell also works when that directory does not exist yet.
os.makedirs("../Model", exist_ok=True)

# Save the trained model
joblib.dump(model, "../Model/xgb_model.pkl")

# Save metrics (cast to plain floats so json can serialise them).
# "accuracy" holds the value returned by model.score in the earlier cell.
metrics = {
    "mse": float(mse),
    "r2": float(r2),
    "accuracy": float(accuracy)
}
with open("../Model/model_metrics.json", "w") as f:
    json.dump(metrics, f)

# Save parameters
params = model.get_params()
with open("../Model/model_params.json", "w") as f:
    json.dump(params, f)

# Save actual vs. predicted values for the test rows; the timestamp column is
# empty when test_timestamps is None.
pred_df = pd.DataFrame({
    "timestamp": test_timestamps,
    "Actual": y_test.values,
    "Predicted": y_pred
})
pred_df.to_csv("../Model/predictions.csv", index=False)

# Save feature importance
feature_importance.to_csv("../Model/feature_importance.csv", index=False)