In [None]:
# Cell 2: Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
import warnings
warnings.filterwarnings('ignore')
from scipy.stats import zscore


plt.style.use('default')
sns.set_palette("husl")
%matplotlib inline

In [46]:
df=pd.read_csv('Resource_utilization.csv')

In [47]:
df.head()

Unnamed: 0,timestamp,cpu_utilization,memory_usage,storage_usage,workload,Resource Allocation
0,2024-05-15 06:00:00,93.667656,65.739742,81.202149,100.0,76.398051
1,2022-03-20 00:00:00,,45.771232,56.178079,96.20153,52.783747
2,2024-03-31 02:00:00,66.270283,85.537221,61.594411,100.0,71.699889
3,2022-10-05 07:00:00,50.30559,55.492008,100.419747,100.0,69.456805
4,2022-07-18 08:00:00,58.567954,77.339342,60.446444,100.0,66.624977


In [48]:
df['timestamp'] = pd.to_datetime(df['timestamp'])

In [49]:
df.isnull().sum()

timestamp                 0
cpu_utilization        2819
memory_usage           3071
storage_usage          3306
workload                  0
Resource Allocation    2572
dtype: int64

In [50]:
for col in ['cpu_utilization', 'memory_usage', 'storage_usage', 'workload', 'Resource Allocation']:
    if df[col].isnull().sum() > 0:
        df[col] = df[col].ffill()
        if df[col].isnull().sum() > 0:
            df[col] = df[col].fillna(df[col].mean())

In [51]:
df.isnull().sum()

timestamp              0
cpu_utilization        0
memory_usage           0
storage_usage          0
workload               0
Resource Allocation    0
dtype: int64

In [52]:
for col in ['cpu_utilization', 'memory_usage', 'storage_usage', 'Resource Allocation']:
    z = np.abs(zscore(df[col]))
    outliers = (z > 3).sum()
    if outliers > 0:
        print(col)
        df.loc[z > 3, col] = np.nan
        df[col] = df[col].ffill()
        df[col] = df[col].fillna(df[col].mean())

metrics_to_clip = {
    'cpu_utilization': 100,
    'storage_usage': 100,
    'workload': 100
}

for metric, upper_limit in metrics_to_clip.items():
    outlier_count = df[df[metric] > upper_limit].shape[0]
    if outlier_count > 0:
        print(f"Clipping {outlier_count} values in '{metric}' that exceed {upper_limit}.")
        df[metric] = np.clip(df[metric], 0, upper_limit)

df['date_only'] = df['timestamp'].dt.date
df['hour_of_day'] = df['timestamp'].dt.hour
df['day_of_week'] = df['timestamp'].dt.day_name()
df['day_index'] = df['timestamp'].dt.dayofweek

cpu_utilization
Resource Allocation
Clipping 1594 values in 'storage_usage' that exceed 100.
Clipping 263 values in 'workload' that exceed 100.


In [53]:
df.head()

Unnamed: 0,timestamp,cpu_utilization,memory_usage,storage_usage,workload,Resource Allocation,date_only,hour_of_day,day_of_week,day_index
0,2024-05-15 06:00:00,93.667656,65.739742,81.202149,100.0,76.398051,2024-05-15,6,Wednesday,2
1,2022-03-20 00:00:00,93.667656,45.771232,56.178079,96.20153,52.783747,2022-03-20,0,Sunday,6
2,2024-03-31 02:00:00,66.270283,85.537221,61.594411,100.0,71.699889,2024-03-31,2,Sunday,6
3,2022-10-05 07:00:00,50.30559,55.492008,100.0,100.0,69.456805,2022-10-05,7,Wednesday,2
4,2022-07-18 08:00:00,58.567954,77.339342,60.446444,100.0,66.624977,2022-07-18,8,Monday,0


In [54]:
for lag in [1, 2, 3]:
    df[f'workload_lag_{lag}'] = df['workload'].shift(lag)
    df[f'cpu_utilization_lag_{lag}'] = df['cpu_utilization'].shift(lag)

df['workload_rolling_mean_3'] = df['workload'].shift(1).rolling(window=5).mean()
df['workload_rolling_std_3'] = df['workload'].shift(1).rolling(window=5).std()
df['workload_expanding_mean'] = df['workload'].shift(1).expanding().mean()
df['cpu_utilization_expanding_mean'] = df['cpu_utilization'].shift(1).expanding().mean()

df = df.ffill().bfill()

df = df.sort_values('timestamp')

In [55]:
df.to_csv('Processed_Resource_utilization.csv', index=False)