In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Notebook-wide default figure size (width, height) for plots that do not pass their own figsize.
plt.rcParams["figure.figsize"] = (10, 6)

In [None]:
# Source: Kaggle "Cloud Computing Performance Metrics" dataset
# https://www.kaggle.com/datasets/abdurraziq01/cloud-computing-performance-metrics/code
#
# Per the dataset description, the data was collected from a simulated cloud
# computing environment; the values represent a wide range of possible states
# and conditions in a cloud computing system.

# The zipped CSV is handed to read_csv as-is (no manual extraction step).
raw_metrics_path = "../data/raw/cloud_computing_performance_metrics.zip"
df_kaggle = pd.read_csv(raw_metrics_path)

# Row/column counts first, then the first rows transposed so each column reads as a row.
print(df_kaggle.shape)
df_kaggle.head().T

In [None]:
# Summary statistics, transposed so each described column is shown as a row.
# Author's observation: the values almost exactly match a normal distribution.
# NOTE(review): describe() only reports count, mean, std and quantiles - confirm
# the distribution shape against the density plots before relying on this.
df_kaggle.describe().T

In [None]:
# List the column labels of the loaded frame.
df_kaggle.columns

In [None]:
# Occurrences of each vm_id, with missing values counted as their own bucket
# (dropna=False). Author's observation: every vm_id is unique except for null.
#
# value_counts() already returns counts in descending order, so no extra sort
# is needed. Naming the index and the count column explicitly gives the same
# "vm_id" / "count" columns on every pandas version (before pandas 2.0 the
# implicit names were "index" / "vm_id", which made sorting by "count" fail).
(
    df_kaggle["vm_id"]
    .value_counts(dropna=False)
    .rename_axis("vm_id")
    .reset_index(name="count")
)

In [None]:


# Kernel density estimate (KDE) of each performance metric, one panel per column.
cols = ["cpu_usage", "memory_usage", "network_traffic", "power_consumption"]
max_kde_points = 10_000  # columns with more valid rows than this are subsampled

# squeeze=False makes plt.subplots return a 2-D array of Axes even when there
# is a single column, so row 0 is always an iterable of panels and no
# special case for len(cols) == 1 is required.
fig, axes = plt.subplots(1, len(cols), figsize=(5 * len(cols), 4), squeeze=False)
axes = axes[0]

for ax, col in zip(axes, cols):
    # Column absent from the frame: keep the panel slot but show a placeholder.
    if col not in df_kaggle.columns:
        ax.axis("off")
        ax.text(0.5, 0.5, f"Missing: {col}", ha="center", va="center")
        continue

    # Coerce to numeric; anything unparseable becomes NaN and is dropped.
    data = pd.to_numeric(df_kaggle[col], errors="coerce").dropna()
    if data.empty:
        ax.axis("off")
        ax.text(0.5, 0.5, f"No valid values: {col}", ha="center", va="center")
        continue

    # Fixed random_state keeps the subsample (and therefore the plot) reproducible.
    if len(data) > max_kde_points:
        data = data.sample(max_kde_points, random_state=0)

    sns.kdeplot(data, ax=ax, fill=True, bw_method="scott")
    # \u2014 is an em dash; the escape keeps this source line pure ASCII so the
    # title cannot be mangled by a wrong file encoding again.
    ax.set_title(f"{col} \u2014 density (KDE)")
    ax.grid(True)

plt.tight_layout()
plt.show()

**Figure 1**: Kernel density estimates of `cpu_usage`, `memory_usage`, `network_traffic`, and `power_consumption`

***

* `cpu_usage` and `memory_usage` are scaled, with few discernible peaks or correlations.


In [None]:

# Pairwise correlation matrix of the four performance metrics.
metric_columns = [
    "cpu_usage",
    "memory_usage",
    "network_traffic",
    "power_consumption",
]
df_kaggle[metric_columns].corr()