In [4]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# Load dataset from Google Drive
file_path = "reduced_merged_cloud_metrics.csv"
df = pd.read_csv(file_path)

# Convert timestamp to datetime and set as index
df['Timestamp'] = pd.to_datetime(df['timestamp'])
df.set_index('Timestamp', inplace=True)

# Feature Engineering
# Extract time-based features
df['hour'] = df.index.hour
df['day_of_week'] = df.index.dayofweek

# Create utilization ratios
df['EC2_CPU_Memory_Ratio'] = df['EC2_CPUUtilization'] / (df['EC2_MemoryUtilization'] + 1e-5)
df['RDS_Connections_Per_CPU'] = df['RDS_DatabaseConnections'] / (df['RDS_CPUUtilization'] + 1e-5)

# Compute moving averages
df['EC2_CPU_rolling_mean'] = df['EC2_CPUUtilization'].rolling(window=5).mean()
df['RDS_CPU_rolling_mean'] = df['RDS_CPUUtilization'].rolling(window=5).mean()
df.fillna(method='bfill', inplace=True)  # Fill missing values after rolling operations

# Ensure no NaN values remain
df.dropna(inplace=True)

# Select features and target variables
features = [
    'EC2_CPUUtilization', 'EC2_MemoryUtilization', 'EC2_DiskWriteOps', 'EC2_NetworkIn',
    'RDS_CPUUtilization', 'RDS_FreeableMemory', 'RDS_DatabaseConnections', 'RDS_WriteIOPS',
    'ECS_CPUUtilization', 'ECS_MemoryUtilization', 'ECS_RunningTaskCount',
    'hour', 'day_of_week', 'EC2_CPU_Memory_Ratio', 'RDS_Connections_Per_CPU',
    'EC2_CPU_rolling_mean', 'RDS_CPU_rolling_mean'
]

target_cols = ['EC2_CPUUtilization', 'RDS_CPUUtilization', 'ECS_CPUUtilization']

# Normalize data
scaler = MinMaxScaler()
df[features] = scaler.fit_transform(df[features])

# Convert dataset to sequences for LSTM
SEQ_LENGTH = 30  # Using past 30 timesteps to predict the next step

def create_sequences(data, target_cols, seq_length):
    sequences, labels = [], []
    target_indices = [features.index(col) for col in target_cols]
    for i in range(len(data) - seq_length):
        sequences.append(data[i:i+seq_length])
        labels.append([data[i+seq_length, idx] for idx in target_indices])  # Ensure proper label extraction
    return np.array(sequences), np.array(labels)

# Prepare input-output sequences
data = df[features].values
sequences, labels = create_sequences(data, target_cols, SEQ_LENGTH)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(sequences, labels, test_size=0.2, random_state=42, shuffle=False)

# Convert data to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

# Define PyTorch dataset
class CloudDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

train_dataset = CloudDataset(X_train_tensor, y_train_tensor)
test_dataset = CloudDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# Define LSTM Model
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(LSTMModel, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, dropout=0.2)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        lstm_out, _ = self.lstm(x)
        return self.fc(lstm_out[:, -1, :])  # Taking only last LSTM output

# Model parameters
input_size = len(features)  # Number of input features
hidden_size = 128  # Increased LSTM hidden units
num_layers = 3  # Increased LSTM layers
output_size = len(target_cols)  # Predicting EC2, RDS, ECS CPU utilization

# Initialize model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = LSTMModel(input_size, hidden_size, num_layers, output_size).to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.0005)

# Training loop with validation
EPOCHS = 20

for epoch in range(EPOCHS):
    model.train()
    total_train_loss = 0
    for batch_X, batch_y in train_loader:
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)
        optimizer.zero_grad()
        output = model(batch_X)
        loss = criterion(output, batch_y)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # Gradient clipping
        optimizer.step()
        total_train_loss += loss.item()

    model.eval()
    total_val_loss = 0
    with torch.no_grad():
        for batch_X, batch_y in test_loader:
            batch_X, batch_y = batch_X.to(device), batch_y.to(device)
            output = model(batch_X)
            val_loss = criterion(output, batch_y)
            if torch.isnan(val_loss):  # Check for NaN loss
                print("Warning: NaN detected in validation loss!")
                val_loss = torch.tensor(0.0, device=device)
            total_val_loss += val_loss.item()

    print(f"Epoch {epoch+1}/{EPOCHS}, Train Loss: {total_train_loss/len(train_loader):.6f}, Val Loss: {total_val_loss/len(test_loader):.6f}")

# Save the trained model to Google Drive
model_path = "lstm_cloud_model.pth"
torch.save(model.state_dict(), model_path)

print(f"Model saved at: {model_path}")


  df.fillna(method='bfill', inplace=True)  # Fill missing values after rolling operations


Epoch 1/20, Train Loss: 0.049503, Val Loss: 0.043602


KeyboardInterrupt: 