In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor



In [3]:
# Read the processed resource-utilisation table. Later cells pull the model
# feature columns, the 'workload' target and the 'timestamp' column from it.
df = pd.read_csv(filepath_or_buffer='Processed_Resource_utilization.csv')

In [4]:
# Model inputs and prediction target.
# The order of `features` is significant: the feature-importance table built
# after training pairs importances with these names by position.
features = [
    # current resource readings and calendar fields
    'cpu_utilization',
    'memory_usage',
    'storage_usage',
    'hour_of_day',
    'day_index',
    # workload history
    'workload_lag_1',
    'workload_lag_2',
    'workload_lag_3',
    'workload_rolling_mean_3',
    'workload_rolling_std_3',
    'workload_expanding_mean',
    # CPU utilisation history
    'cpu_utilization_lag_1',
    'cpu_utilization_lag_2',
    'cpu_utilization_lag_3',
    'cpu_utilization_expanding_mean',
]

y = df['workload']   # regression target
X = df[features]     # feature matrix, columns in the order listed above

In [5]:
# --- Positional 80/20 split ---------------------------------------------
# The first 80% of rows are used for training and the remaining rows are held
# out; nothing is shuffled.
# NOTE(review): presumably the rows are in time order, which would make this a
# chronological split — confirm against the preprocessing step.
train_size = int(len(df) * 0.8)
X_train, X_test = X.iloc[:train_size], X.iloc[train_size:]
y_train, y_test = y.iloc[:train_size], y.iloc[train_size:]
# Timestamps of the held-out rows (not used again inside this cell).
test_timestamps = df['timestamp'].iloc[train_size:]

# --- Feature standardisation --------------------------------------------
# Scaling statistics are learned from the training rows only and then applied
# unchanged to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- Gradient-boosted regressor -----------------------------------------
model = XGBRegressor(
    objective='reg:squarederror',  # squared-error regression loss
    n_estimators=500,              # number of boosting rounds
    learning_rate=0.05,            # shrinkage applied to each round
    max_depth=6,                   # maximum depth of each tree
    subsample=0.8,                 # fraction of rows sampled per tree
    colsample_bytree=0.8,          # fraction of columns sampled per tree
    random_state=42,               # fixed seed for the row/column sampling
)
model.fit(X_train_scaled, y_train)

# Predictions for the held-out rows; later cells evaluate and export these.
y_pred = model.predict(X_test_scaled)

# --- Feature importances, largest first ---------------------------------
importance_columns = {
    'Feature': features,
    'Importance': model.feature_importances_,
}
feature_importance = pd.DataFrame(importance_columns).sort_values(
    by='Importance', ascending=False
)
print("\nFeature Importance:")
print(feature_importance)



Feature Importance:
                           Feature  Importance
0                  cpu_utilization    0.379023
3                      hour_of_day    0.216572
1                     memory_usage    0.119166
11           cpu_utilization_lag_1    0.050615
5                   workload_lag_1    0.024197
9           workload_rolling_std_3    0.023024
13           cpu_utilization_lag_3    0.022516
8          workload_rolling_mean_3    0.021907
6                   workload_lag_2    0.021558
10         workload_expanding_mean    0.021228
2                    storage_usage    0.020910
12           cpu_utilization_lag_2    0.020854
14  cpu_utilization_expanding_mean    0.020542
7                   workload_lag_3    0.019598
4                        day_index    0.018290


In [6]:
# Evaluate the held-out predictions with mean squared error and the
# coefficient of determination (R²).
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print(
    f"Mean Squared Error (MSE): {mse:.4f}",
    f"R² Score: {r2:.4f}",
    sep="\n",
)

Mean Squared Error (MSE): 34.0071
R² Score: 0.7253


In [7]:
# For a regressor, `model.score` returns the coefficient of determination (R²),
# not a classification accuracy, so the printed label now says so.
# The variable keeps its original name in case a later cell reads it.
accuracy = model.score(X_test_scaled, y_test)
print(f"R² Score (model.score): {accuracy:.4f}")


Accuracy: 0.7253


In [8]:
# Export the held-out rows of the raw dataset with the model's predictions
# attached as a new 'predicted_workload' column.
# NOTE(review): `train_size` was computed from the processed file; this assumes
# 'Resource_utilization.csv' has the same row count and ordering — confirm.
df_save = pd.read_csv('Resource_utilization.csv').iloc[train_size:]

# Fail with an explicit message if the raw slice and the predictions do not
# line up, instead of relying on pandas' generic length error.
if len(df_save) != len(y_pred):
    raise ValueError(
        f"Raw test slice has {len(df_save)} rows but there are "
        f"{len(y_pred)} predictions; the raw and processed files are not row-aligned."
    )

# `.assign` returns a new frame, so the column is never written into a slice
# of another DataFrame (the chained-assignment pattern pandas warns about).
df_save = df_save.assign(predicted_workload=y_pred)
df_save.to_csv('Predicted_Resource_utilization.csv', index=False)

In [9]:
import joblib

# The model was fitted on StandardScaler-transformed features, so the fitted
# scaler has to be saved too: whoever loads the model must apply the same
# transformation before calling `predict`.
joblib.dump(scaler, "workload_scaler.pkl")

# Persist the trained model. Kept as the last expression so the cell still
# displays the model file path.
joblib.dump(model, "workload_model.pkl")

['workload_model.pkl']