Create New Columns for AWS Metrics

In [2]:
import pandas as pd

# Load the dataset
df = pd.read_csv('cloud-computing-performance-metrics.csv')

# 1. EC2 Metrics
df['EC2_CPUUtilization'] = df['cpu_usage']
df['EC2_MemoryUtilization'] = df['memory_usage']
df['EC2_DiskReadOps'] = df['num_executed_instructions']  # Simulated
df['EC2_DiskWriteOps'] = df['num_executed_instructions']  # Simulated
df['EC2_NetworkIn'] = df['network_traffic']  # Simulated
df['EC2_NetworkOut'] = df['network_traffic']  # Simulated

# 2. RDS Metrics
df['RDS_CPUUtilization'] = df['cpu_usage']
df['RDS_FreeableMemory'] = 100 - df['memory_usage']  # Simulated freeable memory
df['RDS_DatabaseConnections'] = df['task_status'].apply(lambda x: 1 if x == 'running' else 0)  # Simulated
df['RDS_ReadIOPS'] = df['num_executed_instructions']  # Simulated
df['RDS_WriteIOPS'] = df['num_executed_instructions']  # Simulated

# 3. ECS Metrics
df['ECS_CPUUtilization'] = df['cpu_usage']
df['ECS_MemoryUtilization'] = df['memory_usage']
df['ECS_RunningTaskCount'] = df['task_status'].apply(lambda x: 1 if x == 'running' else 0)  # Simulated

# List of unwanted columns
unwanted_columns = [
    'vm_id', 'power_consumption', 'num_executed_instructions', 
    'execution_time', 'energy_efficiency', 'task_type', 
    'task_priority', 'task_status', 'cpu_usage', 'memory_usage', 'network_traffic'
]

# Drop unwanted columns
df.drop(columns=unwanted_columns, inplace=True)

# Display the remaining columns
print(df.columns)

# Save the mapped dataset
df.to_csv('mapped_cloud_metrics.csv', index=False)

Index(['timestamp', 'EC2_CPUUtilization', 'EC2_MemoryUtilization',
       'EC2_DiskReadOps', 'EC2_DiskWriteOps', 'EC2_NetworkIn',
       'EC2_NetworkOut', 'RDS_CPUUtilization', 'RDS_FreeableMemory',
       'RDS_DatabaseConnections', 'RDS_ReadIOPS', 'RDS_WriteIOPS',
       'ECS_CPUUtilization', 'ECS_MemoryUtilization', 'ECS_RunningTaskCount'],
      dtype='object')


Load the Mapped Dataset

In [3]:
# Load the mapped dataset
df = pd.read_csv('mapped_cloud_metrics.csv')

# Display the first few rows
print(df.head())

             timestamp  EC2_CPUUtilization  EC2_MemoryUtilization  \
0  2023-01-25 09:10:54           54.881350              78.950861   
1  2023-01-26 04:46:34           71.518937              29.901883   
2  2023-01-13 23:39:47                 NaN              92.709195   
3  2023-02-09 11:45:49           54.488318              88.100960   
4  2023-06-14 08:27:26           42.365480                    NaN   

   EC2_DiskReadOps  EC2_DiskWriteOps  EC2_NetworkIn  EC2_NetworkOut  \
0           7527.0            7527.0     164.775973      164.775973   
1           5348.0            5348.0            NaN             NaN   
2           5483.0            5483.0     203.674847      203.674847   
3           5876.0            5876.0            NaN             NaN   
4           3361.0            3361.0            NaN             NaN   

   RDS_CPUUtilization  RDS_FreeableMemory  RDS_DatabaseConnections  \
0           54.881350           21.049139                        0   
1           71.518

Handle missing values

In [4]:
# Convert timestamp to datetime
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Step 4: Handle missing values
print(df.isnull().sum())
df.fillna(df.mean(), inplace=True)

# Step 5: Save the cleaned dataset
df.to_csv('cleaned_cloud_metrics.csv', index=False)

timestamp                  200666
EC2_CPUUtilization         199038
EC2_MemoryUtilization      200510
EC2_DiskReadOps            199686
EC2_DiskWriteOps           199686
EC2_NetworkIn              199481
EC2_NetworkOut             199481
RDS_CPUUtilization         199038
RDS_FreeableMemory         200510
RDS_DatabaseConnections         0
RDS_ReadIOPS               199686
RDS_WriteIOPS              199686
ECS_CPUUtilization         199038
ECS_MemoryUtilization      200510
ECS_RunningTaskCount            0
dtype: int64


Feature Engineering
1. Temporal features
2. Rolling Averages
3. Lagged features
4. Utilization ratios

In [5]:
import numpy as np

# Load the preprocessed dataset
df = pd.read_csv('cleaned_cloud_metrics.csv')

# Ensure timestamp is in datetime format
df['timestamp'] = pd.to_datetime(df['timestamp'])

# 1. Temporal Features
df['hour'] = df['timestamp'].dt.hour
df['day_of_week'] = df['timestamp'].dt.dayofweek  # Monday=0, Sunday=6
df['month'] = df['timestamp'].dt.month

# 2. Rolling Averages (e.g., 5-time-step rolling average for CPU and memory)
window_size = 5
df['cpu_rolling_avg'] = df['EC2_CPUUtilization'].rolling(window=window_size).mean()
df['memory_rolling_avg'] = df['EC2_MemoryUtilization'].rolling(window=window_size).mean()

# Fill NaN values in rolling averages (first few rows)
df.fillna(method='bfill', inplace=True)  # Backward fill

# 3. Lagged Features (e.g., CPU usage at t-1, t-2)
df['cpu_lag_1'] = df['EC2_CPUUtilization'].shift(1)
df['cpu_lag_2'] = df['EC2_CPUUtilization'].shift(2)

# Fill NaN values in lagged features
df.bfill(inplace=True)  # Backward fill

# 4. Utilization Ratios
df['cpu_memory_ratio'] = df['EC2_CPUUtilization'] / df['EC2_MemoryUtilization']
df['network_in_out_ratio'] = df['EC2_NetworkIn'] / df['EC2_NetworkOut']

# Handle division by zero (replace infinite values with NaN and then fill with 0)
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.fillna(0, inplace=True)

# Display the new features
print(df.head())

  df.fillna(method='bfill', inplace=True)  # Backward fill


            timestamp  EC2_CPUUtilization  EC2_MemoryUtilization  \
0 2023-01-25 09:10:54           54.881350              78.950861   
1 2023-01-26 04:46:34           71.518937              29.901883   
2 2023-01-13 23:39:47           50.012232              92.709195   
3 2023-02-09 11:45:49           54.488318              88.100960   
4 2023-06-14 08:27:26           42.365480              49.981898   

   EC2_DiskReadOps  EC2_DiskWriteOps  EC2_NetworkIn  EC2_NetworkOut  \
0           7527.0            7527.0     164.775973      164.775973   
1           5348.0            5348.0     500.007572      500.007572   
2           5483.0            5483.0     203.674847      203.674847   
3           5876.0            5876.0     500.007572      500.007572   
4           3361.0            3361.0     500.007572      500.007572   

   RDS_CPUUtilization  RDS_FreeableMemory  RDS_DatabaseConnections  ...  \
0           54.881350           21.049139                        0  ...   
1           71

Splitting the Data

In [6]:
from sklearn.model_selection import train_test_split

# Sort by timestamp (ensure data is in chronological order)
df.sort_values('timestamp', inplace=True)

# Define features (X) and target (y)
# For LSTM: Target could be future CPU usage (e.g., next time step)
X = df.drop(columns=['timestamp'])  # Drop timestamp (not a feature)
y = df['EC2_CPUUtilization']  # Example target (can be adjusted)

# Time-based split (e.g., 70% train, 15% validation, 15% test)
train_size = int(0.7 * len(df))
val_size = int(0.15 * len(df))

X_train, X_val_test = X[:train_size], X[train_size:]
y_train, y_val_test = y[:train_size], y[train_size:]

X_val, X_test = X_val_test[:val_size], X_val_test[val_size:]
y_val, y_test = y_val_test[:val_size], y_val_test[val_size:]

# Verify the splits
print(f"Training set size: {len(X_train)}")
print(f"Validation set size: {len(X_val)}")
print(f"Testing set size: {len(X_test)}")

Training set size: 1400000
Validation set size: 300000
Testing set size: 300000


Normalize the Data

In [7]:
from sklearn.preprocessing import MinMaxScaler

# Normalize the features
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Normalize the target (if needed)
y_train_scaled = scaler.fit_transform(y_train.values.reshape(-1, 1))
y_val_scaled = scaler.transform(y_val.values.reshape(-1, 1))
y_test_scaled = scaler.transform(y_test.values.reshape(-1, 1))

Reshape Data for LSTM

In [8]:
import numpy as np

# Function to create sequences
def create_sequences(data, targets, time_steps=10):
    X_seq, y_seq = [], []
    for i in range(len(data) - time_steps):
        X_seq.append(data[i:i+time_steps])
        y_seq.append(targets[i+time_steps])  # Predict the next CPU usage
    return np.array(X_seq), np.array(y_seq)

# Define time steps (e.g., 10 time steps per sequence)
time_steps = 10

# Create sequences for training, validation, and testing
X_train_seq, y_train_seq = create_sequences(X_train_scaled, y_train_scaled, time_steps)
X_val_seq, y_val_seq = create_sequences(X_val_scaled, y_val_scaled, time_steps)
X_test_seq, y_test_seq = create_sequences(X_test_scaled, y_test_scaled, time_steps)

# Verify the shapes
print(f"Training sequences: {X_train_seq.shape}, Targets: {y_train_seq.shape}")
print(f"Validation sequences: {X_val_seq.shape}, Targets: {y_val_seq.shape}")
print(f"Testing sequences: {X_test_seq.shape}, Targets: {y_test_seq.shape}")

Training sequences: (1399990, 10, 23), Targets: (1399990, 1)
Validation sequences: (299990, 10, 23), Targets: (299990, 1)
Testing sequences: (299990, 10, 23), Targets: (299990, 1)


Train LSTM model

In [9]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

# Define the LSTM model
model = Sequential()

# Add LSTM layers
model.add(LSTM(50, return_sequences=True, input_shape=(X_train_seq.shape[1], X_train_seq.shape[2])))
model.add(Dropout(0.2))  # Dropout for regularization
model.add(LSTM(50, return_sequences=False))
model.add(Dropout(0.2))

# Add a Dense output layer
model.add(Dense(1))  # Output layer (predicts a single value)

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model
history = model.fit(
    X_train_seq, y_train_seq,
    validation_data=(X_val_seq, y_val_seq),
    epochs=20,  # Number of epochs
    batch_size=32,  # Batch size
    verbose=1
)

ModuleNotFoundError: No module named 'tensorflow'