# Synthetic dataset creation and ML model training


In [None]:
import pandas as pd
import numpy as np

# Build a synthetic cloud-workload dataset and save it as a CSV file.
# Every value is a random placeholder. The generator is seeded so that a
# fresh "Restart & Run All" always writes the same file, which keeps the
# model trained from it reproducible.
SEED = 42
N_ROWS = 100
rng = np.random.default_rng(SEED)

# Defining sample data
data = {
    'Timestamp': pd.date_range(start='2023-01-01', periods=N_ROWS, freq='D'),
    'IOPS': rng.integers(10000, 100000, size=N_ROWS),  # upper bound is exclusive
    'Compute': rng.choice(['2 vCPUs', '4 vCPUs', '8 vCPUs', '16 vCPUs'], size=N_ROWS),
    'NetworkReq': rng.choice(['100 Mbps', '1 Gbps', '10 Gbps'], size=N_ROWS),
    'BackupDedupeRatio': rng.uniform(1.0, 4.0, size=N_ROWS),
    'Storage': rng.integers(100, 10000, size=N_ROWS),  # In GB
    'RAM': rng.choice(['8 GB', '16 GB', '32 GB', '64 GB'], size=N_ROWS),
    'Latency': rng.integers(1, 100, size=N_ROWS),  # In milliseconds
    'WorkloadType': rng.choice(['Web Server', 'Database', 'File Storage', 'Application'], size=N_ROWS),
    # Adding 'DeploymentCU' as the target variable with example categories
    'DeploymentCU': rng.choice(['Small', 'Medium', 'Large', 'X-Large'], size=N_ROWS),
}

# Create DataFrame; timestamps are stored as plain 'YYYY-MM-DD' strings
df = pd.DataFrame(data)
df['Timestamp'] = df['Timestamp'].dt.strftime('%Y-%m-%d')

# Save to CSV (relative path, i.e. the current working directory)
csv_file_path = 'cloud_workload_data.csv'
df.to_csv(csv_file_path, index=False)

print(f"Updated CSV file '{csv_file_path}' has been created with DeploymentCU column.")


Updated CSV file 'cloud_workload_data.csv' has been created with DeploymentCU column.


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

# Train a random-forest classifier that predicts 'DeploymentCU' from the
# workload features, with preprocessing and model bundled in one Pipeline.

# Load Data
# Read from the working directory, the same relative path the
# dataset-creation cell writes to, instead of an absolute '/content/...'
# path that only resolves when the working directory is '/content'.
df = pd.read_csv('cloud_workload_data.csv')

# Clean Data: discard rows containing missing values
df = df.dropna()

# Feature Engineering & Preprocessing
categorical_features = ['Compute', 'NetworkReq', 'RAM', 'WorkloadType']  # Assuming these are your features
numerical_features = ['IOPS', 'BackupDedupeRatio', 'Storage', 'Latency']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        # handle_unknown='ignore': a category that never appeared in the
        # training split is encoded as all zeros instead of raising an
        # error at predict time.
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ]
)

# Split Data
# 'DeploymentCU' is the target variable; 'Timestamp' is not used as a feature.
X = df.drop(columns=['DeploymentCU', 'Timestamp'])
y = df['DeploymentCU']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Applying transformations and training a model
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', RandomForestClassifier(random_state=42))])

model.fit(X_train, y_train)
# Last expression of the cell: displays the predicted labels for the test split.
model.predict(X_test)

array(['Large', 'Large', 'Large', 'Small', 'X-Large', 'X-Large', 'Large',
       'Large', 'Small', 'Medium', 'Small', 'Small', 'Large', 'Large',
       'X-Large', 'Small', 'Large', 'Small', 'Small', 'Small'],
      dtype=object)

In [None]:
from joblib import dump, load

# Persist the fitted pipeline object `model` to disk with joblib.
# NOTE(review): '/content' looks like the Colab working directory — confirm
# this location exists when running the notebook elsewhere.
model_filename = '/content/model.pkl'
# dump() returns the list of file names it wrote, which the cell displays.
dump(value=model, filename=model_filename)



['/content/model.pkl']