In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from torch.utils.data import Dataset, DataLoader
import os
import matplotlib.pyplot as plt
import csv

import torch
import captum
from captum.attr import IntegratedGradients
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim


# Set device (GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cpu


In [53]:
def load_sampled_data(raw_data, sample_fraction=0.01):
    # Extract features and target
    target_col = 'target'
    target = raw_data[target_col]

    # Handle missing target values: either fill with mode or drop rows with NaN in target
    target.fillna(target.mode()[0], inplace=True)  # Filling NaN with the mode of the target
    
    # Initialize StratifiedShuffleSplit to split the data
    sss = StratifiedShuffleSplit(n_splits=1, test_size=sample_fraction, random_state=42)

    # Sample sample_fraction% of data maintaining class distribution
    for train_idx, test_idx in sss.split(raw_data, target):
        sampled_data = raw_data.iloc[test_idx]

    return sampled_data

def create_sequences(data, seq_length, horizon, combined_samples_targets):
    X, X_targets, y, y_targets = [], [], [], []
    feature_names = [col for col in data.columns if col != 'target']  # List of feature column names
    
    for i in range(len(data) - seq_length - horizon + 1):
        # X should contain all columns except 'target' (make sure it's a DataFrame)
        X_seq = data[feature_names].iloc[i:i + seq_length]

        # X_targets should contain only the 'target' column
        X_targets_seq = data[combined_samples_targets].iloc[i:i + seq_length]
        
        # y should contain the entire row for each sequence, except 'target'
        y_seq = data[feature_names].iloc[i + seq_length + horizon - 1]
        
        # y_targets should contain only the 'target' column
        y_target = data[combined_samples_targets].iloc[i + seq_length + horizon - 1]
        
        # Append sequences to the respective lists
        X.append(X_seq)
        X_targets.append(X_targets_seq)
        y.append(y_seq)
        y_targets.append(y_target)

    return np.array(X), np.array(X_targets), np.array(y), np.array(y_targets)




# **Define LSTM Model**
class LSTMForecaster(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers, output_dim):
        super(LSTMForecaster, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

    def forward(self, x):
        h_0 = torch.zeros(self.num_layers, x.size(0), self.hidden_dim).to(device)
        c_0 = torch.zeros(self.num_layers, x.size(0), self.hidden_dim).to(device)
        out, _ = self.lstm(x, (h_0, c_0))
        out = self.fc(out[:, -1, :])
        return out


# **Custom Dataset**
class SequenceDataset(Dataset):
    def __init__(self, sequences, targets):
        self.sequences = sequences
        self.targets = targets

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        return self.sequences[idx], self.targets[idx]



In [54]:
# Define the Generator
class Generator(nn.Module):
    def __init__(self, latent_dim, num_features, num_classes):
        super(Generator, self).__init__()
        self.latent_dim = latent_dim
        self.num_features = num_features
        self.num_classes = num_classes
        
        # Adjust the first layer to match the input dimensions
        self.model = nn.Sequential(
            nn.Linear(latent_dim + num_classes, 512),  # Adjust this to match input dimensions
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Linear(256, num_features),
            nn.Tanh()
        )
    
    def forward(self, z, labels):
        # Concatenate latent vector and labels
        inputs = torch.cat([z, labels], dim=1)
        return self.model(inputs)

# Define the Discriminator
class Discriminator(nn.Module):
    def __init__(self, num_features, num_classes):
        super(Discriminator, self).__init__()
        self.num_features = num_features
        self.num_classes = num_classes
        
        self.model = nn.Sequential(
            nn.Linear(num_features + num_classes, 256),
            nn.LeakyReLU(0.2),
            nn.Linear(256, 128),
            nn.LeakyReLU(0.2),
            nn.Linear(128, 1),
            nn.Sigmoid()
        )
    
    def forward(self, x, labels):
        # Concatenate features and labels
        inputs = torch.cat([x, labels], dim=1)
        return self.model(inputs)
# Define the GAN
class GAN(nn.Module):
    def __init__(self, generator, discriminator):
        super(GAN, self).__init__()
        self.generator = generator
        self.discriminator = discriminator
    
    def forward(self, z, labels):
        generated_data = self.generator(z, labels)
        validity = self.discriminator(generated_data, labels)
        return generated_data, validity
# Function to train the GAN
def train_gan(gan, dataloader, num_epochs, latent_dim, num_classes, device):
    generator = gan.generator
    discriminator = gan.discriminator
    
    # Loss function
    adversarial_loss = nn.BCELoss()
    
    # Optimizers
    optimizer_G = optim.Adam(generator.parameters(), lr=0.0002)
    optimizer_D = optim.Adam(discriminator.parameters(), lr=0.0002)
    
    for epoch in range(num_epochs):
        for i, (real_data, labels) in enumerate(dataloader):
            real_data = real_data.to(device)
            labels = labels.to(device)
            
            # Create labels for real and fake data
            valid_labels = torch.ones(real_data.size(0), 1).to(device)
            fake_labels = torch.zeros(real_data.size(0), 1).to(device)
            
            # Train Generator
            optimizer_G.zero_grad()
            
            z = torch.randn(real_data.size(0), latent_dim).to(device)
            gen_labels = torch.randint(0, num_classes, (real_data.size(0),)).to(device)
            gen_labels_one_hot = torch.nn.functional.one_hot(gen_labels, num_classes=num_classes).float().to(device)
            
            generated_data = generator(z, gen_labels_one_hot)
            g_loss = adversarial_loss(discriminator(generated_data, gen_labels_one_hot), valid_labels)
            
            g_loss.backward()
            optimizer_G.step()
            
            # Train Discriminator
            optimizer_D.zero_grad()
            
            real_loss = adversarial_loss(discriminator(real_data, labels.float()), valid_labels)
            fake_loss = adversarial_loss(discriminator(generated_data.detach(), gen_labels_one_hot), fake_labels)
            
            d_loss = (real_loss + fake_loss) / 2
            
            d_loss.backward()
            optimizer_D.step()
        
        print(f"[Epoch {epoch}/{num_epochs}] [D loss: {d_loss.item()}] [G loss: {g_loss.item()}]")
    return generator, discriminator
# Function to generate synthetic data using the trained GAN
def generate_synthetic_data(generator, num_samples, latent_dim, num_classes, device):
    z = torch.randn(num_samples, latent_dim).to(device)
    gen_labels = torch.randint(0, num_classes, (num_samples,)).to(device)
    gen_labels_one_hot = torch.nn.functional.one_hot(gen_labels, num_classes=num_classes).float().to(device)
    
    synthetic_data = generator(z, gen_labels_one_hot)
    return synthetic_data.cpu().detach().numpy(), gen_labels.cpu().detach().numpy()
# Function to evaluate the model
def evaluate_model(model, dataloader, device):
    model.eval()
    all_preds = []
    all_targets = []
    
    with torch.no_grad():
        for sequences, targets in dataloader:
            sequences = sequences.to(device)
            targets = targets.to(device)
            
            outputs = model(sequences)
            _, preds = torch.max(outputs, 1)
            
            all_preds.append(preds.cpu().numpy())
            all_targets.append(targets.cpu().numpy())
    
    all_preds = np.concatenate(all_preds)
    all_targets = np.concatenate(all_targets)
    
    accuracy = accuracy_score(all_targets, all_preds)
    f1 = f1_score(all_targets, all_preds, average='weighted')
    
    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1 Score: {f1:.4f}")
    
    return all_preds, all_targets
# Function to visualize the results
def visualize_results(preds, targets, output_dir):
    # Create confusion matrix
    cm = confusion_matrix(targets, preds)
    
    # Plot confusion matrix
    plt.figure(figsize=(10, 7))
    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title('Confusion Matrix')
    plt.colorbar()
    
    tick_marks = np.arange(len(np.unique(targets)))
    plt.xticks(tick_marks, np.unique(targets))
    plt.yticks(tick_marks, np.unique(targets))
    
    thresh = cm.max() / 2.
    for i, j in np.ndindex(cm.shape):
        plt.text(j, i, format(cm[i, j], 'd'),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()
    
    # Save the plot
    plt.savefig(os.path.join(output_dir, 'confusion_matrix.png'))
    plt.close()
# Function to save classification report
def save_classification_report(preds, targets, output_dir):
    report = classification_report(targets, preds, output_dict=True)
    report_df = pd.DataFrame(report).transpose()
    
    # Save the classification report as a CSV file
    report_df.to_csv(os.path.join(output_dir, 'classification_report.csv'), index=True)
    
    # Print the classification report
    print(report_df)
# Function to save the model
def save_model(model, output_dir, model_name):
    torch.save(model.state_dict(), os.path.join(output_dir, f"{model_name}.pth"))
    print(f"Model saved to {os.path.join(output_dir, f'{model_name}.pth')}")
# Function to load the model
def load_model(model, model_path):
    model.load_state_dict(torch.load(model_path))
    model.eval()
    print(f"Model loaded from {model_path}")
# Function to plot feature importance
def plot_feature_importance(model, feature_names, output_dir):
    # Get the weights of the first layer
    weights = model.lstm.weight_ih_l0.data.cpu().numpy()
    
    # Calculate feature importance
    feature_importance = np.abs(weights).sum(axis=0)
    
    # Normalize feature importance
    feature_importance /= feature_importance.sum()
    
    # Create a DataFrame for better visualization
    importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})
    importance_df = importance_df.sort_values(by='Importance', ascending=False)
    
    # Plot feature importance
    plt.figure(figsize=(10, 6))
    plt.barh(importance_df['Feature'], importance_df['Importance'])
    plt.xlabel('Importance')
    plt.title('Feature Importance')
    
    # Save the plot
    plt.savefig(os.path.join(output_dir, 'feature_importance.png'))
    plt.close()
# Function to plot training history
def plot_training_history(history, output_dir):
    plt.figure(figsize=(10, 5))
    plt.plot(history['loss'], label='Loss')
    plt.plot(history['accuracy'], label='Accuracy')
    plt.title('Training History')
    plt.xlabel('Epochs')
    plt.ylabel('Value')
    plt.legend()
    
    # Save the plot
    plt.savefig(os.path.join(output_dir, 'training_history.png'))
    plt.close()
# Function to plot PCA
def plot_pca(data, labels, output_dir):
    pca = PCA(n_components=2)
    pca_result = pca.fit_transform(data)
    
    plt.figure(figsize=(10, 7))
    scatter = plt.scatter(pca_result[:, 0], pca_result[:, 1], c=labels, cmap='viridis', alpha=0.5)
    plt.title('PCA of Data')
    plt.xlabel('PCA Component 1')
    plt.ylabel('PCA Component 2')
    
    # Create a legend
    legend1 = plt.legend(*scatter.legend_elements(), title="Classes")
    plt.gca().add_artist(legend1)
    
    # Save the plot
    plt.savefig(os.path.join(output_dir, 'pca_plot.png'))
    plt.close()
# Function to plot SHAP values
def plot_shap_values(model, data, feature_names, output_dir):
    # Use Integrated Gradients for SHAP values
    ig = IntegratedGradients(model)
    
    # Calculate SHAP values
    shap_values = ig.attribute(data, target=0)
    
    # Convert to numpy array
    shap_values = shap_values.cpu().detach().numpy()
    
    # Create a DataFrame for better visualization
    shap_df = pd.DataFrame(shap_values, columns=feature_names)
    
    # Plot SHAP values
    plt.figure(figsize=(10, 6))
    plt.barh(shap_df.columns, shap_df.mean(axis=0))
    plt.xlabel('SHAP Value')
    plt.title('SHAP Values')
    
    # Save the plot
    plt.savefig(os.path.join(output_dir, 'shap_values.png'))
    plt.close()
# Function to create output directory
def create_output_directory(output_dir):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    print(f"Output directory created at: {output_dir}")
# Function to save synthetic data
def save_synthetic_data(data, labels, output_dir):
    synthetic_data_df = pd.DataFrame(data)
    synthetic_data_df['target'] = labels
    synthetic_data_df.to_csv(os.path.join(output_dir, 'synthetic_data.csv'), index=False)
    print(f"Synthetic data saved to {os.path.join(output_dir, 'synthetic_data.csv')}")
# Function to save the generator model
def save_generator_model(generator, output_dir):
    torch.save(generator.state_dict(), os.path.join(output_dir, 'generator.pth'))
    print(f"Generator model saved to {os.path.join(output_dir, 'generator.pth')}")
# Function to save the discriminator model
def save_discriminator_model(discriminator, output_dir):
    torch.save(discriminator.state_dict(), os.path.join(output_dir, 'discriminator.pth'))
    print(f"Discriminator model saved to {os.path.join(output_dir, 'discriminator.pth')}")
# Function to load the generator model
def load_generator_model(generator, model_path):
    generator.load_state_dict(torch.load(model_path))
    generator.eval()
    print(f"Generator model loaded from {model_path}")
# Function to load the discriminator model
def load_discriminator_model(discriminator, model_path):
    discriminator.load_state_dict(torch.load(model_path))
    discriminator.eval()
    print(f"Discriminator model loaded from {model_path}")
# Function to save the GAN model
def save_gan_model(gan, output_dir):
    torch.save(gan.state_dict(), os.path.join(output_dir, 'gan.pth'))
    print(f"GAN model saved to {os.path.join(output_dir, 'gan.pth')}")
# Function to load the GAN model
def load_gan_model(gan, model_path):
    gan.load_state_dict(torch.load(model_path))
    gan.eval()
    print(f"GAN model loaded from {model_path}")
# Function to save the training history
def save_training_history(history, output_dir):
    with open(os.path.join(output_dir, 'training_history.csv'), 'w', newline='') as csvfile:
        fieldnames = ['epoch', 'loss', 'accuracy']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        
        writer.writeheader()
        for epoch, (loss, accuracy) in enumerate(zip(history['loss'], history['accuracy'])):
            writer.writerow({'epoch': epoch, 'loss': loss, 'accuracy': accuracy})
    
    print(f"Training history saved to {os.path.join(output_dir, 'training_history.csv')}")
# Function to save the model architecture
def save_model_architecture(model, output_dir):
    with open(os.path.join(output_dir, 'model_architecture.txt'), 'w') as f:
        f.write(str(model))
    print(f"Model architecture saved to {os.path.join(output_dir, 'model_architecture.txt')}")
# Function to save the training configuration
def save_training_configuration(config, output_dir):
    with open(os.path.join(output_dir, 'training_configuration.txt'), 'w') as f:
        for key, value in config.items():
            f.write(f"{key}: {value}\n")
    print(f"Training configuration saved to {os.path.join(output_dir, 'training_configuration.txt')}")
# Function to save the training and validation loss
def save_training_validation_loss(train_loss, val_loss, output_dir):
    plt.figure(figsize=(10, 5))
    plt.plot(train_loss, label='Training Loss')
    plt.plot(val_loss, label='Validation Loss')
    plt.title('Training and Validation Loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    
    # Save the plot
    plt.savefig(os.path.join(output_dir, 'training_validation_loss.png'))
    plt.close()
# Function to save the training and validation accuracy
def save_training_validation_accuracy(train_accuracy, val_accuracy, output_dir):
    plt.figure(figsize=(10, 5))
    plt.plot(train_accuracy, label='Training Accuracy')
    plt.plot(val_accuracy, label='Validation Accuracy')
    plt.title('Training and Validation Accuracy')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.legend()
    
    # Save the plot
    plt.savefig(os.path.join(output_dir, 'training_validation_accuracy.png'))
    plt.close()
# Function to save the training and validation F1 score
def save_training_validation_f1(train_f1, val_f1, output_dir):
    plt.figure(figsize=(10, 5))
    plt.plot(train_f1, label='Training F1 Score')
    plt.plot(val_f1, label='Validation F1 Score')
    plt.title('Training and Validation F1 Score')
    plt.xlabel('Epochs')
    plt.ylabel('F1 Score')
    plt.legend()
    
    # Save the plot
    plt.savefig(os.path.join(output_dir, 'training_validation_f1.png'))
    plt.close()
# Function to save the training and validation metrics
def save_training_validation_metrics(train_metrics, val_metrics, output_dir):
    metrics_df = pd.DataFrame({'Epoch': range(len(train_metrics)), 'Train Metrics': train_metrics, 'Validation Metrics': val_metrics})
    metrics_df.to_csv(os.path.join(output_dir, 'training_validation_metrics.csv'), index=False)
    print(f"Training and validation metrics saved to {os.path.join(output_dir, 'training_validation_metrics.csv')}")
# Function to save the training and validation ROC curve
def save_training_validation_roc_curve(train_fpr, train_tpr, val_fpr, val_tpr, output_dir):
    plt.figure(figsize=(10, 5))
    plt.plot(train_fpr, train_tpr, label='Training ROC Curve')
    plt.plot(val_fpr, val_tpr, label='Validation ROC Curve')
    plt.title('Training and Validation ROC Curve')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend()
    
    # Save the plot
    plt.savefig(os.path.join(output_dir, 'training_validation_roc_curve.png'))
    plt.close()
# Function to save the training and validation precision-recall curve
def save_training_validation_precision_recall_curve(train_precision, train_recall, val_precision, val_recall, output_dir):
    plt.figure(figsize=(10, 5))
    plt.plot(train_recall, train_precision, label='Training Precision-Recall Curve')
    plt.plot(val_recall, val_precision, label='Validation Precision-Recall Curve')
    plt.title('Training and Validation Precision-Recall Curve')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.legend()
    
    # Save the plot
    plt.savefig(os.path.join(output_dir, 'training_validation_precision_recall_curve.png'))
    plt.close()
# Function to save the training and validation confusion matrix
def save_training_validation_confusion_matrix(train_confusion_matrix, val_confusion_matrix, output_dir):
    plt.figure(figsize=(10, 5))
    plt.subplot(1, 2, 1)
    plt.imshow(train_confusion_matrix, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title('Training Confusion Matrix')
    plt.colorbar()
    
    plt.subplot(1, 2, 2)
    plt.imshow(val_confusion_matrix, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title('Validation Confusion Matrix')
    plt.colorbar()
    
    # Save the plot
    plt.savefig(os.path.join(output_dir, 'training_validation_confusion_matrix.png'))
    plt.close()
# Function to save the training and validation classification report
def save_training_validation_classification_report(train_report, val_report, output_dir):
    train_report_df = pd.DataFrame(train_report).transpose()
    val_report_df = pd.DataFrame(val_report).transpose()
    
    # Save the classification reports as CSV files
    train_report_df.to_csv(os.path.join(output_dir, 'training_classification_report.csv'), index=True)
    val_report_df.to_csv(os.path.join(output_dir, 'validation_classification_report.csv'), index=True)
    
    # Print the classification reports
    print(train_report_df)
    print(val_report_df)
# Function to save the training and validation metrics history
def save_training_validation_metrics_history(train_metrics_history, val_metrics_history, output_dir):
    metrics_df = pd.DataFrame({'Epoch': range(len(train_metrics_history)), 'Train Metrics': train_metrics_history, 'Validation Metrics': val_metrics_history})
    metrics_df.to_csv(os.path.join(output_dir, 'training_validation_metrics_history.csv'), index=False)
    print(f"Training and validation metrics history saved to {os.path.join(output_dir, 'training_validation_metrics_history.csv')}")
# Function to save the training and validation metrics history plot
def save_training_validation_metrics_history_plot(train_metrics_history, val_metrics_history, output_dir):
    plt.figure(figsize=(10, 5))
    plt.plot(train_metrics_history, label='Training Metrics History')
    plt.plot(val_metrics_history, label='Validation Metrics History')
    plt.title('Training and Validation Metrics History')
    plt.xlabel('Epochs')
    plt.ylabel('Metrics')
    plt.legend()
    
    # Save the plot
    plt.savefig(os.path.join(output_dir, 'training_validation_metrics_history_plot.png'))
    plt.close()

In [55]:
pca_dim=10
lookforward=5
lookback=60
PCA_DIMENSION = 10

# Model parameters
input_dim = PCA_DIMENSION
hidden_dim = 32
num_layers = 2
output_dim = PCA_DIMENSION
epochs = 500
learning_rate = 0.005
batch_size = 128

In [None]:
# # Load dataset
# dataset = pd.read_csv('small_sample.csv')
# # dataset = dataset[:int(0.01*len(dataset))]

# dataset.index = dataset['Timestamp']
# dataset = dataset.drop(columns=['Timestamp'])
# print(dataset.head())

# # **Data Preprocessing**
# # Handle missing values
# dataset = dataset.apply(lambda x: x.fillna(0) if x.isna().all() else x)
# threshold = 0.6 * len(dataset)
# for col in dataset.columns:
#     if dataset[col].isna().sum() > threshold:
#         mode_value = dataset[col].mode().iloc[0] if not dataset[col].mode().empty else 0
#         dataset.fillna({col: mode_value}, inplace=True)
# #            dataset[col].fillna(mode_value, inplace=True)
# # dataset = dataset.dropna(subset=['target'])
# numeric_cols = dataset.select_dtypes(include=[np.number]).columns
# dataset[numeric_cols] = dataset[numeric_cols].fillna(dataset[numeric_cols].mean())



# dataset.shape

import pandas as pd
import numpy as np

# Set random seed for reproducibility
np.random.seed(42)

# Create dummy dataset with 4 features and 2 targets
num_samples = 1000
data = {
    'host': np.random.randn(num_samples),
    'srscu0': np.random.randn(num_samples),
    'srscu1': np.random.randn(num_samples),
    'srscu2': np.random.randn(num_samples),
    'srscu3': np.random.randn(num_samples),
    'srsdu0': np.random.randn(num_samples),
    'srsdu1': np.random.randn(num_samples),
    'srsdu2': np.random.randn(num_samples),
    'srsdu3': np.random.randn(num_samples),
    'srscu0_stressType': np.random.randint(0, 5, num_samples),  # Binary target
    'srscu1_stressType': np.random.randint(0, 5, num_samples),  # Binary target
    'srscu2_stressType': np.random.randint(0, 5, num_samples),  # Binary target
    'srscu3_stressType': np.random.randint(0, 5, num_samples),  # Binary target
    'srsdu0_stressType': np.random.randint(0, 5, num_samples),  # Binary target
    'srsdu1_stressType': np.random.randint(0, 5, num_samples),  # Binary target
    'srsdu2_stressType': np.random.randint(0, 5, num_samples),  # Binary target
    'srsdu3_stressType': np.random.randint(0, 5, num_samples),  # Binary target
}

for i in range(20):
    data[f'application_feature_{i}'] = np.random.randn(num_samples)  # Additional features


# Create a DataFrame
dataset = pd.DataFrame(data)

# Create a timestamp index (assuming the start date is '2025-04-01')
dataset.index = pd.date_range(start='2025-04-01', periods=num_samples, freq='D')


# Show the first few rows of the dataset
print(dataset.head())

                host    srscu0    srscu1    srscu2    srscu3    srsdu0  \
2025-04-01  0.496714  1.399355 -0.675178 -1.907808 -0.863494 -0.423760   
2025-04-02 -0.138264  0.924634 -0.144519 -0.860385 -0.031203 -0.453414   
2025-04-03  0.647689  0.059630 -0.792420 -0.413606  0.018017 -1.795643   
2025-04-04  1.523030 -0.646937 -0.307962  1.887688  0.472630 -0.330090   
2025-04-05 -0.234153  0.698223 -1.893615  0.556553 -1.366858  0.732829   

              srsdu1    srsdu2    srsdu3  srscu0_stressType  ...  \
2025-04-01 -1.114081  0.785185 -0.033025                  1  ...   
2025-04-02 -0.630931 -1.777681 -0.503650                  2  ...   
2025-04-03 -0.942060  0.714746 -0.172375                  3  ...   
2025-04-04 -0.547996 -0.233724  0.714732                  4  ...   
2025-04-05 -0.214150  0.707458  1.277857                  4  ...   

            application_feature_10  application_feature_11  \
2025-04-01               -0.404063               -0.809919   
2025-04-02            

In [57]:
NoOfCUs = 4
NoOfDUs = 4

# Creating Topology
topology = {}

# Form the graph where srscu0 connects to srsdu0, srscu1 to srsdu1, and so on
for i in range(min(NoOfCUs, NoOfDUs)):  # Prevent index errors
    topology[f"srscu{i}"] = [f"srsdu{i}"]

# Display the graph
print(topology)


common_features = dataset.columns.tolist()
container_specific_features = {}

# Loop through and remove columns containing specific substrings
for i in range(NoOfDUs+1):
    common_features = [col for col in common_features if f"srscu{i}" not in col and f"srsdu{i}" not in col]

# Store container-specific dataframes instead of lists
for i in range(NoOfCUs+1):
    container_specific_features[f'srscu{i}'] = dataset[[col for col in dataset.columns.tolist() if f"srscu{i}" in col]]

for i in range(NoOfDUs+1):
    container_specific_features[f'srsdu{i}'] = dataset[[col for col in dataset.columns.tolist() if f"srsdu{i}" in col]]

# # Print the remaining features
# print(len(common_features), common_features)

# print("Before:")

# # Print container-specific features (as dataframes now)
# for i in range(NoOfCUs):
#     print(f"srscu{i}:")
#     print(container_specific_features[f'srscu{i}'].shape)
#     print(container_specific_features[f'srscu{i}'].head())

# for i in range(NoOfDUs):
#     print(f"srsdu{i}:")
#     print(container_specific_features[f'srsdu{i}'].shape)
#     print(container_specific_features[f'srsdu{i}'].head())

# Filter out columns containing 'stepStress' from the container-specific dataframes
for i in range(NoOfCUs):
    container_specific_features[f'srscu{i}'] = container_specific_features[f'srscu{i}'].loc[:, ~container_specific_features[f'srscu{i}'].columns.str.contains('stepStress')]

for i in range(NoOfDUs):
    container_specific_features[f'srsdu{i}'] = container_specific_features[f'srsdu{i}'].loc[:, ~container_specific_features[f'srsdu{i}'].columns.str.contains('stepStress')]

# print("After:")

# # Print container-specific features (after filtering)
# for i in range(NoOfCUs):
#     print(f"srscu{i}:")
#     print(container_specific_features[f'srscu{i}'].shape)
#     print(container_specific_features[f'srscu{i}'].head())

# for i in range(NoOfDUs):
#     print(f"srsdu{i}:")
#     print(container_specific_features[f'srsdu{i}'].shape)
#     print(container_specific_features[f'srsdu{i}'].head())


# Iterate through the topology and combine features
combined_samples = {}

for CU in topology.keys():
    # The CU container-specific features
    CU_features = container_specific_features[CU]
    
    # The connected DUs (from topology)
    connected_DUs = topology[CU]
    
    # Add CU-specific features to the combined list
    CU_features_list = CU_features.columns.tolist()
    
    # Extract the CU stress type column (if exists)
    CU_stressType = f'{CU}_stressType' if f'{CU}_stressType' in CU_features.columns else None
    
    # Add DU-specific features to the combined list for each connected DU
    for DU in connected_DUs:
        # Ensure DU exists in container_specific_features
        if DU in container_specific_features:
            DU_features = container_specific_features[DU]
            DU_features_list = DU_features.columns.tolist()

            # Combine CU and DU features (remove the stress type columns from features)
            combined_features = common_features.copy()  # Start with the common features
            
            # Modify these lines:
            combined_features.extend(CU_features_list)  # Keep all CU features
            combined_features.extend(DU_features_list)  # Keep all DU features

            

            # Extract targets and remove them from features
            targets = [col for col in combined_features if '_stressType' in col]

            # To keep stressType columns temporarily:
            combined_samples[(CU, DU)] = {
                'features': list(set(combined_features) - set(targets)),  # Include targets in features temporarily
                'targets': list(set(targets))
            }

        else:
            print(f"Error: {DU} not found in container_specific_features!")
            continue  # Skip this DU if not found in container_specific_features
    
print(combined_samples)
# Print the results for each CU-DU pair and its combined features
for (CU, DU), sample in combined_samples.items():
    print(f"Host and CU: {CU}, DU: {DU} - Combined Features:")
    print(f"Number of Features: {len(sample['features'])}")
    print(f"Number of Targets: {len(sample['targets'])}")
    print(sample['features'][:10])  # Print first 10 features as a preview
    print("----" * 10)


{'srscu0': ['srsdu0'], 'srscu1': ['srsdu1'], 'srscu2': ['srsdu2'], 'srscu3': ['srsdu3']}
{('srscu0', 'srsdu0'): {'features': ['application_feature_12', 'application_feature_13', 'application_feature_8', 'srsdu0', 'application_feature_10', 'host', 'application_feature_0', 'application_feature_16', 'application_feature_14', 'application_feature_4', 'application_feature_18', 'application_feature_9', 'application_feature_11', 'srscu0', 'application_feature_17', 'application_feature_7', 'application_feature_6', 'application_feature_1', 'application_feature_2', 'application_feature_5', 'application_feature_3', 'application_feature_15', 'application_feature_19'], 'targets': ['srscu0_stressType', 'srsdu0_stressType']}, ('srscu1', 'srsdu1'): {'features': ['application_feature_12', 'application_feature_13', 'application_feature_8', 'application_feature_10', 'host', 'application_feature_0', 'application_feature_16', 'application_feature_14', 'application_feature_4', 'application_feature_18', 'srs

In [None]:
# AUC_ROC = []
# PRECISION = []
# RECALL = []
# F1_SCORE = []

# for (CU, DU), sample in combined_samples.items():
#     print(f"Training on data of Host, {CU} and {DU}")


#     # Load data with duplicate handling
#     raw_data = dataset[sample['features'] + sample['targets']].copy()
#     raw_data = raw_data.loc[:, ~raw_data.columns.duplicated()]  # KEY FIX


#     # Filter using targets
#     for target_col in sample['targets']:
#         raw_data = raw_data[raw_data[target_col].isin([0, 1, 2, 3])]


#     # **Data Preprocessing**
#     # Handle missing values
#     raw_data = raw_data.apply(lambda x: x.fillna(0) if x.isna().all() else x)


#     threshold = 0.6 * len(raw_data)
#     raw_data = raw_data.loc[:, ~raw_data.columns.duplicated()]  # Remove duplicates

#     for col in raw_data.columns:
#         nan_count = raw_data[col].isna().sum()
#         if int(nan_count) > threshold:  # Explicit scalar conversion
#             mode_value = raw_data[col].mode().iloc[0] if not raw_data[col].mode().empty else 0
#             raw_data[col].fillna(mode_value, inplace=True)

#     numeric_cols = raw_data.select_dtypes(include=[np.number]).columns
#     raw_data[numeric_cols] = raw_data[numeric_cols].fillna(raw_data[numeric_cols].mean())


#     # **Convert target columns to binary (0 or 1)**
#     # Instead of using a loop over rows, we can do it in a vectorized way
#     for target_col in sample['targets']:
#         raw_data[target_col] = raw_data[target_col].apply(lambda x: 1 if x != 0 else 0)


#     # Create unified target column
#     raw_data['target'] = 0
#     for idx in raw_data.index:
#         if any(raw_data.loc[idx, sample['targets']] == 1):
#             raw_data.at[idx, 'target'] = 1

    
#     X = raw_data.drop(columns=sample['targets']+['target'])
#     Y = raw_data['target']

#     # To avoid division by zero:
#     X = (X - X.mean()) / (X.std() + 1e-8)

#     train_idx = int(0.8 * len(X))

#     # Concatenate X and Y into raw_data (pd.concat is used to join the features and targets)
#     raw_data = pd.concat([X, Y], axis=1)


#     raw_data_training = raw_data[:train_idx]
#     raw_data_testing = raw_data[train_idx:]


#     # Convert all columns to float16
#     raw_data_training = raw_data_training.astype(np.float16)
#     raw_data_testing = raw_data_testing.astype(np.float16)


#     # Convert to PyTorch tensors
#     features = torch.FloatTensor(raw_data_training.drop(columns=['target']).values).to(device)
#     labels = torch.LongTensor(raw_data_training['target'].values).to(device)

#     # Create dataset and dataloader
#     trainingDataset = TensorDataset(features, labels)
#     trainingDataloader = DataLoader(trainingDataset, batch_size=32, shuffle=True)


#     # Hyperparameters
#     latent_dim = raw_data_training.shape[1] - 1
#     # latent_dim = 100
#     num_features = raw_data_training.shape[1] - 1
#     num_classes = 2
#     lr = 0.0002
#     num_epochs = 100

#     print(f'\n\nJVGAN parameters: Latent Dimension = {latent_dim}, num_features = {num_features}, lr={lr}', end="\n\n")


#     # # Check if the models exist
#     # generator_model_path = 'generator.pth'
#     # discriminator_model_path = 'discriminator.pth'

#     # # Initialize the models if they don't exist, otherwise load the saved models
#     # if os.path.exists(generator_model_path) and os.path.exists(discriminator_model_path):
#     #     # Load pre-trained models
#     #     generator = Generator(latent_dim, num_features, num_classes).to(device)
#     #     discriminator = Discriminator(num_features, num_classes).to(device)
        
#     #     # Load the state dicts for both generator and discriminator
#     #     generator.load_state_dict(torch.load(generator_model_path))
#     #     discriminator.load_state_dict(torch.load(discriminator_model_path))
        
#     #     print("Loaded pre-trained generator and discriminator models.")
#     # else:
#     #     # Initialize models if they don't exist
#     #     generator = Generator(latent_dim, num_features, num_classes).to(device)
#     #     discriminator = Discriminator(num_features, num_classes).to(device)
        
#     #     print("Initialized new generator and discriminator models.")

#     # Initialize models
#     generator = Generator(latent_dim, num_features, num_classes).to(device)
#     discriminator = Discriminator(num_features, num_classes).to(device)

#     # Optimizers
#     g_optimizer = optim.Adam(generator.parameters(), lr=lr, betas=(0.5, 0.999))
#     d_optimizer = optim.Adam(discriminator.parameters(), lr=lr, betas=(0.5, 0.999))

#     # Loss function
#     criterion = nn.BCELoss()

#     # Training loop
#     for epoch in range(num_epochs):
#         for i, (real_data, real_labels) in enumerate(trainingDataloader):
#             batch_size = real_data.size(0)
            
#             # Train Discriminator
#             d_optimizer.zero_grad()
            
#             # Real data
#             real_labels_onehot = nn.functional.one_hot(real_labels, num_classes).float().to(device)
#             real_validity = discriminator(real_data, real_labels_onehot)
#             d_real_loss = criterion(real_validity, torch.ones_like(real_validity).to(device))
            
#             # Fake data
#             z = torch.randn(batch_size, latent_dim).to(device)
#             fake_labels = torch.randint(0, num_classes, (batch_size,)).to(device)
#             fake_labels_onehot = nn.functional.one_hot(fake_labels, num_classes).float().to(device)
#             fake_data = generator(z, fake_labels_onehot)
#             fake_validity = discriminator(fake_data.detach(), fake_labels_onehot)
#             d_fake_loss = criterion(fake_validity, torch.zeros_like(fake_validity).to(device))
            
#             d_loss = d_real_loss + d_fake_loss
#             d_loss.backward()
#             d_optimizer.step()
            
#             # Train Generator
#             g_optimizer.zero_grad()
            
#             z = torch.randn(batch_size, latent_dim).to(device)
#             fake_labels = torch.randint(0, num_classes, (batch_size,)).to(device)
#             fake_labels_onehot = nn.functional.one_hot(fake_labels, num_classes).float().to(device)
#             fake_data = generator(z, fake_labels_onehot)
#             fake_validity = discriminator(fake_data, fake_labels_onehot)
#             g_loss = criterion(fake_validity, torch.ones_like(fake_validity).to(device))
            
#             g_loss.backward()
#             g_optimizer.step()
            
#             if i % 100 == 0:
#                 print(f"Epoch [{epoch}/{num_epochs}] Batch [{i}/{len(trainingDataloader)}] "
#                     f"D Loss: {d_loss.item():.4f}, G Loss: {g_loss.item():.4f}")


#     # Testing
#     # Convert to PyTorch tensors
#     test_z = torch.FloatTensor(raw_data_testing.values[:, :-1]).to(device)
#     test_labels = torch.LongTensor(raw_data_testing['target'].values).to(device)
#     test_labels_onehot = nn.functional.one_hot(test_labels, num_classes).float().to(device)
#     test_data = generator(test_z, test_labels_onehot)

#     torch.save(generator.state_dict(), 'generator.pth')
#     torch.save(discriminator.state_dict(), 'discriminator.pth')

#     # Evaluate the generated data for anomaly detection
#     def evaluate_anomaly_detection(generator, real_data, labels, num_samples=1000):
#         # Generate synthetic data
#         z = torch.randn(num_samples, latent_dim).to(device)
#         synthetic_labels = torch.randint(0, num_classes, (num_samples,)).to(device)
#         synthetic_labels_onehot = nn.functional.one_hot(synthetic_labels, num_classes).float().to(device)
#         synthetic_data = generator(z, synthetic_labels_onehot)
        
#         # Combine real and synthetic data
#         all_data = torch.cat([real_data, synthetic_data], dim=0)
#         all_labels = torch.cat([labels, synthetic_labels], dim=0)
        
#         # Use discriminator to classify real vs synthetic
#         with torch.no_grad():
#             predictions = discriminator(all_data, nn.functional.one_hot(all_labels, num_classes).float().to(device))
        
#         # Convert predictions to binary (0 for synthetic, 1 for real)
#         predictions = (predictions > 0.5).float()
        
#         # Calculate anomaly detection metrics
#         real_labels = torch.ones(real_data.size(0)).to(device)
#         synthetic_labels = torch.zeros(synthetic_data.size(0)).to(device)
#         true_labels = torch.cat([real_labels, synthetic_labels], dim=0)
        
#         auc_roc = roc_auc_score(true_labels.cpu().numpy(), predictions.cpu().numpy())
#         precision, recall, f1, _ = precision_recall_fscore_support(true_labels.cpu().numpy(), predictions.cpu().numpy(), average='binary')
        
#         return auc_roc, precision, recall, f1


#     # Evaluate anomaly detection performance
#     auc_roc, precision, recall, f1 = evaluate_anomaly_detection(generator, features, labels)
    
#     print(f"AUC-ROC: {auc_roc:.4f}")
#     print(f"Precision: {precision:.4f}")
#     print(f"Recall: {recall:.4f}")
#     print(f"F1 Score: {f1:.4f}")


#     AUC_ROC.append(auc_roc)
#     PRECISION.append(precision)
#     RECALL.append(recall)
#     F1_SCORE.append(f1)

# print(f"AUC-ROC: {AUC_ROC}")
# print(f"Precision: {PRECISION}")
# print(f"Recall: {RECALL}")
# print(f"F1-Score: {F1_SCORE}")

In [59]:
CU="srscu0"
DU="srsdu0"

raw_data = combined_samples[(CU, DU)]['features']
raw_data = dataset[raw_data + combined_samples[(CU, DU)]['targets']].copy()
raw_data = raw_data.loc[:, ~raw_data.columns.duplicated()]  # KEY FIX
# Filter using targets
for target_col in combined_samples[(CU, DU)]['targets']:
    raw_data = raw_data[raw_data[target_col].isin([0, 1, 2, 3])]

sorted_feature_names = sorted(raw_data.columns.tolist())
raw_data = raw_data[sorted_feature_names]

print(raw_data.head())

# Scale features and apply PCA
scaler = MinMaxScaler(feature_range=(-1, 1))
features_scaled = scaler.fit_transform(raw_data.drop(columns=combined_samples[(CU, DU)]['targets']))
features_scaled = pd.DataFrame(features_scaled, columns=raw_data.drop(columns=combined_samples[(CU, DU)]['targets']).columns.tolist())

pca = PCA(n_components=PCA_DIMENSION)
print(features_scaled.shape)
# Apply PCA
features_pca = pca.fit_transform(features_scaled)
# Convert to PyTorch tensors
features = torch.FloatTensor(features_pca).to(device)
labels = torch.LongTensor(raw_data[combined_samples[(CU, DU)]['targets']].values).to(device)

features_pca = pd.DataFrame(features_pca, columns=[f'pca_{i}' for i in range(PCA_DIMENSION)])

#combine the target column with the pca features
for target in combined_samples[(CU, DU)]['targets']:
    features_pca[target] = raw_data[target].values

features_pca = features_pca.dropna()
features_pca = features_pca.astype(np.float16)

X_features, X_targets, y_features, y_targets = create_sequences(features_pca, lookback, lookforward, combined_samples[(CU, DU)]['targets'])

# print(f"X_features shape: {X_features.shape}")
# print(f"y_features shape: {y_features.shape}")
# print(f"y_targets shape: {y_targets.shape}")
X_original_features, X_targets, y_original_features, y_targets = create_sequences(features_scaled, lookback, lookforward, combined_samples[(CU, DU)]['targets'])
# print(f"X_original_features shape: {X_original_features.shape}")
# print(f"y_original_features shape: {y_original_features.shape}")
# print(f"y_targets shape: {y_targets.shape}")

# Convert to PyTorch tensors
X_features = torch.FloatTensor(X_features).to(device)
X_targets = torch.FloatTensor(X_targets).to(device)
y_features = torch.FloatTensor(y_features).to(device)
y_targets = torch.FloatTensor(y_targets).to(device)

X_original_features = torch.FloatTensor(X_original_features).to(device)
X_original_features = X_original_features.view(X_original_features.size(0), -1)
y_original_features = torch.FloatTensor(y_original_features).to(device)
y_original_features = y_original_features.view(y_original_features.size(0), -1)

            application_feature_0  application_feature_1  \
2025-04-01               1.372963               0.502080   
2025-04-02              -0.139897               0.775624   
2025-04-07              -0.891497              -0.171421   
2025-04-08              -0.006526               0.807843   
2025-04-09               0.165892               0.026686   

            application_feature_10  application_feature_11  \
2025-04-01               -0.404063               -0.809919   
2025-04-02                1.445440               -0.133404   
2025-04-07                0.680433               -0.598605   
2025-04-08                0.460853               -1.269932   
2025-04-09               -0.892247                0.482648   

            application_feature_12  application_feature_13  \
2025-04-01                0.551614                0.773489   
2025-04-02                0.017327               -0.588120   
2025-04-07               -0.330304                0.134943   
2025-04-08        

KeyError: "None of [Index(['srscu0_stressType', 'srsdu0_stressType'], dtype='object')] are in the [columns]"

In [None]:
feature_names = dataset.columns
timestamps = dataset.index


# # Filter out samples where the target is not in {0, 1, 2, 3}
# dataset = dataset[dataset['target'].isin([0, 1, 2, 3])]


# # Separate features and target
# target_col = 'target'
# target = dataset[target_col]
# original_features = dataset.drop(columns=['Timestamp'], errors='ignore')

print(f"Number of columns in dataset: {len(dataset.columns)}")
print(dataset.head())

# # Binarize the target column
# target = dataset[target_col].apply(lambda x: 0 if x == 0 else 1)

# # print(f"Number of columns in dataset: {len(original_features.columns)}")

Number of columns in dataset: 17
                host    srscu0    srscu1    srscu2    srscu3    srsdu0  \
2025-04-01  0.496714 -1.415371  0.357787 -0.828995 -1.594428  0.926178   
2025-04-02 -0.138264 -0.420645  0.560785 -0.560181 -0.599375  1.909417   
2025-04-03  0.647689 -0.342715  1.083051  0.747294  0.005244 -1.398568   
2025-04-04  1.523030 -0.802277  1.053802  0.610370  0.046981  0.562969   
2025-04-05 -0.234153 -0.161286 -1.377669 -0.020902 -0.450065 -0.650643   

              srsdu1    srsdu2    srsdu3  srscu0_stressType  \
2025-04-01  0.756989 -0.522723  0.938284                  1   
2025-04-02 -0.922165  1.049009 -0.516045                  3   
2025-04-03  0.869606 -0.704344  0.096121                  1   
2025-04-04  1.355638 -1.408461 -0.462275                  4   
2025-04-05  0.413435 -1.556629 -0.434496                  4   

            srscu1_stressType  srscu2_stressType  srscu3_stressType  \
2025-04-01                  4                  4                  4   
2

In [None]:
# Scale features and apply PCA
scaler = MinMaxScaler(feature_range=(-1, 1))
features_scaled = scaler.fit_transform(original_features)
pca = PCA(n_components=PCA_DIMENSION)
features_pca = pca.fit_transform(features_scaled)

# print(f"Number of columns in dataset: {len(features.columns)}")
# print(f"Number of rows in dataset: {len(features)}")

# print(f"Number of columns in reduced dataset: {len(features_pca[0])}")
# print(f"Number of rows in reduced dataset: {len(features_pca)}")

features_pca = pd.DataFrame(features_pca, columns=[f'pca_{i}' for i in range(PCA_DIMENSION)])

#combine the target column with the pca features
for i in range(len(features_pca)):
    features_pca.at[i, 'target'] = target[i]

# print(features_pca.head())

NameError: name 'target' is not defined

In [None]:
# **Prepare Sequences**
# Combine the target column with the features
original_features['target'] = target

X_features, X_targets, y_features, y_targets = create_sequences(features_pca, lookback, lookforward)

# print(f"X_features shape: {X_features.shape}")
# print(f"y_features shape: {y_features.shape}")
# print(f"y_targets shape: {y_targets.shape}")

X_original_features, X_targets, y_original_features, y_targets = create_sequences(original_features, lookback, lookforward)


# print(f"X_original_features shape: {X_original_features.shape}")
# print(f"y_original_features shape: {y_original_features.shape}")
# print(f"y_targets shape: {y_targets.shape}")

In [None]:
# **Stratified K-Fold Cross-Validation**
n_splits = 2
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Initialize metric accumulators
accuracy_scores_actual = []
f1_scores_actual = []
classification_reports_actual = []
accuracy_scores_forecasted = []
f1_scores_forecasted = []
classification_reports_forecasted = []
total_conf_matrix_actual = None
total_conf_matrix_forecasted = None
RMSE_PCA = []

print("Unique values in target:", target.unique())
print("\nOverall class distribution:", dict(zip(*np.unique(y_targets, return_counts=True))))

Unique values in target: [1]

Overall class distribution: {1: 435}


In [None]:
class SequentialGenerator(nn.Module):
    def __init__(self, latent_dim, sequence_length, hidden_dim, output_dim):
        super(SequentialGenerator, self).__init__()
        self.latent_dim = latent_dim
        self.sequence_length = sequence_length
        self.hidden_dim = hidden_dim
        
        # LSTM to process the sequence data
        self.lstm = nn.LSTM(input_size=latent_dim, hidden_size=hidden_dim, num_layers=2, batch_first=True)
        
        # Linear layer to map LSTM output to the desired output dimension
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, z, previous_sequence):
        # z shape: (batch_size, latent_dim)
        # previous_sequence shape: (batch_size, sequence_length)
        
        # Repeat z across the sequence length dimension
        z_repeated = z.unsqueeze(1).repeat(1, self.sequence_length, 1)
        
        # LSTM to generate new sequence
        lstm_out, _ = self.lstm(z_repeated)
        
        # Output layer to generate the next time step
        generated_output = self.fc(lstm_out[:, -1, :])
        
        return generated_output

class SequentialDiscriminator(nn.Module):
    def __init__(self, sequence_length, hidden_dim, input_size):  # Changed parameter name
        super(SequentialDiscriminator, self).__init__()
        self.sequence_length = sequence_length
        self.hidden_dim = hidden_dim
        
        # Match input_size to actual feature dimension (100)
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_dim,
            num_layers=2,
            batch_first=True
        )

        self.fc = nn.Linear(hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, sequence):
        lstm_out, _ = self.lstm(sequence)  # lstm_out shape: [batch_size, hidden_size]
        return self.sigmoid(self.fc(lstm_out))

In [None]:
# **Cross-Validation Loop**
for fold_idx, (train_idx, test_idx) in enumerate(skf.split(X_features, y_targets), start=1):
    # Split data into train and test folds
    X_train_fold, X_test_fold = X_features[train_idx], X_features[test_idx]
    y_train_fold, y_test_fold = y_features[train_idx], y_features[test_idx]

    X_original_train_fold, X_original_test_fold = X_original_features[train_idx], X_original_features[test_idx]
    y_original_train_fold, y_original_test_fold =y_original_features[train_idx], y_original_features[test_idx]
    
    y_train_target, y_test_target = y_targets[train_idx], y_targets[test_idx]
    
    
    # Train LSTM model
    train_dataset = SequenceDataset(X_train_fold, y_train_fold)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    lstm_model = LSTMForecaster(input_dim, hidden_dim, num_layers, output_dim).to(device)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(lstm_model.parameters(), lr=learning_rate)

    if(not os.path.exists('models')):
        os.makedirs('models')
    if os.path.exists(f'models/lstm_model_fold_{fold_idx}.pt'):
        lstm_model.load_state_dict(torch.load(f"models/lstm_model_fold_{fold_idx}.pt"))
        lstm_model.eval()
    else:
        for epoch in range(epochs):
            lstm_model.train()
            epoch_loss = 0
            for batch_sequences, batch_targets in train_loader:
                batch_sequences = batch_sequences.to(device, dtype=torch.float32)
                batch_targets = batch_targets.to(device, dtype=torch.float32)
                optimizer.zero_grad()
                predictions = lstm_model(batch_sequences)
                d_loss = criterion(predictions, batch_targets)
                predictions_inverse = pca.inverse_transform(predictions.cpu().detach().numpy())
                predictions_inverse_tensor = torch.tensor(predictions_inverse, dtype=torch.float32).to(device)
                batch_inverse_targets = pca.inverse_transform(batch_targets.cpu().detach().numpy())
                batch_inverse_targets_tensor = torch.tensor(batch_inverse_targets, dtype=torch.float32).to(device)
                o_loss = criterion(predictions_inverse_tensor, batch_inverse_targets_tensor)
                final_loss = d_loss + o_loss / 30
                final_loss.backward()
                optimizer.step()
                epoch_loss += final_loss.item()
        torch.save(lstm_model.state_dict(), f"models/lstm_model_fold_{fold_idx}.pt")
    
    # Forecasting with LSTM
    with torch.no_grad():
        X_test_tensor = torch.tensor(X_test_fold, dtype=torch.float32).to(device)
        y_pred_test = lstm_model(X_test_tensor)
        y_pred_pca_inverse = pca.inverse_transform(y_pred_test.cpu().numpy())
        y_true_pca_inverse = pca.inverse_transform(y_test_fold)
        rmse_pca = np.sqrt(np.mean((y_true_pca_inverse - y_pred_pca_inverse) ** 2))
        RMSE_PCA.append(rmse_pca)
        y_pred_test_original = scaler.inverse_transform(y_pred_pca_inverse)


    JVGAN_features = []
    # Filter normal data where target == 0
    original_features['Timestamp'] =  timestamps

    JVGAN_features = original_features[original_features['target'] == 0]
    
    # Ensure the data is sorted by timestamp if it's not already
    JVGAN_features = JVGAN_features.sort_values(by='Timestamp')

    # Drop the target column as it is not needed for JVGAN training
    JVGAN_features = JVGAN_features.drop(columns=['target'])

    # anomaly detection using JVGAN
    JVGAN_features = pd.DataFrame(JVGAN_features)
    # print(f"JVGAN Features:\n{JVGAN_features.head()}")
    
    # Scale features and apply PCA
    JVGAN_scaler = MinMaxScaler(feature_range=(-1, 1))
    JVGAN_features_scaled = scaler.fit_transform(JVGAN_features)

    # Train JVGAN
    real_sequences = torch.tensor(JVGAN_features_scaled, dtype=torch.float32).to(device)

    # Hyperparameters
    JVGAN_latent_dim = 100  # Latent dimension for noise vector
    JVGAN_sequence_length = lookback  # Number of previous time steps to condition on
    JVGAN_LSTM_hidden_dim = 128  # Hidden dimension for LSTM layers
    JVGAN_output_dim = JVGAN_features_scaled.shape[1]


    # Initialize models
    generator = SequentialGenerator(JVGAN_latent_dim, JVGAN_sequence_length, JVGAN_LSTM_hidden_dim, JVGAN_output_dim).to(device)
    # When creating discriminator:
    discriminator = SequentialDiscriminator(
        sequence_length=JVGAN_sequence_length,
        hidden_dim=JVGAN_LSTM_hidden_dim,
        input_size=JVGAN_features_scaled.shape[1]  # Should be 100 for your data
    ).to(device)    

    # Loss and optimizers
    criterion = nn.BCELoss()  # Binary cross-entropy for GAN
    optimizer_g = torch.optim.Adam(generator.parameters(), lr=0.0002, betas=(0.5, 0.999))
    optimizer_d = torch.optim.Adam(discriminator.parameters(), lr=0.0002, betas=(0.5, 0.999))

    num_epochs = 200
    # Training loop
    for epoch in range(num_epochs):
        for i in range(0, len(real_sequences) - JVGAN_sequence_length, batch_size):
            batch = real_sequences[i:i+batch_size]
            
            # Train Discriminator
            optimizer_d.zero_grad()
            
            real_labels = torch.ones(batch_size, 1).to(device)
            fake_labels = torch.zeros(batch_size, 1).to(device)
            
            real_output = discriminator(batch)
            d_loss_real = criterion(real_output, real_labels)
            
            z = torch.randn(batch_size, JVGAN_latent_dim).to(device)
            fake_sequence = generator(z, batch)
            fake_output = discriminator(fake_sequence.detach())
            d_loss_fake = criterion(fake_output, fake_labels)
            
            d_loss = d_loss_real + d_loss_fake
            d_loss.backward()
            optimizer_d.step()
            
            # Train Generator
            optimizer_g.zero_grad()
            
            fake_output = discriminator(fake_sequence)
            g_loss = criterion(fake_output, real_labels)
            
            g_loss.backward()
            optimizer_g.step()
        if(epoch % 10 == 0):
            print(f"Epoch [{epoch}/{num_epochs}], D Loss: {d_loss.item():.4f}, G Loss: {g_loss.item():.4f}")


    # **Classification on Actual Future Features**
    # Anomaly Detection using Discriminator
    # with torch.no_grad():
    #     test_scores = discriminator(X_test_tensor).numpy().flatten()
    #     anomaly_preds = (test_scores < 0.5).astype(int)  # Lower scores indicate anomalies

    with torch.no_grad():
        # print(f"X_test_fold shape: {X_original_test_fold.shape}")
        # X_test_tensor is 3D: (batch_size, seq_length, input_dim)
        JVGAN_X_test_tensor = torch.tensor(X_original_test_fold, dtype=torch.float32).to(device)
        
        # Select the last timestep
        JVGAN_X_last_timestep = JVGAN_X_test_tensor[:, -1, :]  # Shape: (batch_size, input_dim)

        # Pass the last timestep to the discriminator
        # print(f"X_last_timestep shape: {JVGAN_X_last_timestep.shape}")
        test_scores = discriminator(JVGAN_X_last_timestep).cpu().numpy().flatten()
        anomaly_preds = (test_scores < 0.5).astype(int)  # Lower scores indicate anomalies
    
    torch.cuda.empty_cache()
    lstm_model = LSTMForecaster(input_dim, hidden_dim, num_layers, output_dim).to(device)
    lstm_model.load_state_dict(torch.load(f"models/lstm_model_fold_{fold_idx}.pt", map_location=device, weights_only=False))
    lstm_model.eval()

    # **Classification on Actual Future Features**
    print(f"\nClassification Report for Actual Future Features (Fold {fold_idx}):")
    print(classification_report(y_test_target, anomaly_preds))
    conf_matrix_actual = confusion_matrix(y_test_target, anomaly_preds)
    print(f"Confusion Matrix for Actual Features (Fold {fold_idx}):")
    print(conf_matrix_actual)
    accuracy_actual = accuracy_score(y_test_target, anomaly_preds)
    f1_actual = f1_score(y_test_target, anomaly_preds, average='weighted')
    accuracy_scores_actual.append(accuracy_actual)
    f1_scores_actual.append(f1_actual)
    classification_reports_actual.append(classification_report(y_test_target, anomaly_preds, output_dict=True))

    # **Classification on Forecasted Features**
    y_jvgan_pred_forecasted = discriminator(torch.tensor(y_pred_test_original, dtype=torch.float32).to(device)).cpu().detach().numpy().flatten()
    y_jvgan_pred_forecasted = (y_jvgan_pred_forecasted < 0.5).astype(int)  # Lower scores indicate anomalies
    print(f"\nClassification Report for Forecasted Features (Fold {fold_idx}):")
    print(classification_report(y_test_target, y_jvgan_pred_forecasted))
    conf_matrix_forecasted = confusion_matrix(y_test_target, y_jvgan_pred_forecasted)
    print(f"Confusion Matrix for Forecasted Features (Fold {fold_idx}):")
    print(conf_matrix_forecasted)
    accuracy_forecasted = accuracy_score(y_test_target, y_jvgan_pred_forecasted)
    f1_forecasted = f1_score(y_test_target, y_jvgan_pred_forecasted, average='weighted')
    accuracy_scores_forecasted.append(accuracy_forecasted)
    f1_scores_forecasted.append(f1_forecasted)
    classification_reports_forecasted.append(classification_report(y_test_target, y_jvgan_pred_forecasted, output_dict=True))
            
    for i in range(len(anomaly_preds)):
        if anomaly_preds[i] != 0:
            # here add the code for the integrated gradients
            # Initialize the IntegratedGradients object
            ig = IntegratedGradients(discriminator) # discriminator is the model used for anomaly detection
            # Get the input tensor
            input_tensor = torch.tensor(y_pred_test_original, dtype=torch.float32).to(device)
            # Get the baseline tensor
            baseline_tensor = torch.zeros_like(input_tensor)
            # Get the attributions
            attributions, delta = ig.attribute(input_tensor, baseline_tensor, target=0, return_convergence_delta=True)
            
            
            # Get the attributions as numpy array
            attributions = attributions.cpu().detach().numpy()
            # Get the delta as numpy array
            delta = delta.cpu().detach().numpy()

            if not os.path.exists('RCA.csv'):
                with open('RCA.csv', mode='w', newline='') as f:
                    writer = csv.writer(f)
                    # create a dictionary with feature_names and write attributions sequentially
                    attributions_dict = dict(zip(feature_names, attributions[i]))
                    attributions_dict['predicted_target'] = anomaly_preds[i]
                    writer.writerow(attributions_dict.keys())  # Write the column names (keys)

            with open('RCA.csv', mode='a', newline='') as f:
                writer = csv.writer(f)
                # create a dictionary with feature_names and write attributions sequentially
                attributions_dict = dict(zip(feature_names, attributions[i]))
                attributions_dict['predicted_target'] = anomaly_preds[i]
                writer.writerow(attributions_dict.values())

    # Accumulate confusion matrices
    if total_conf_matrix_actual is None:
        total_conf_matrix_actual = conf_matrix_actual
    else:
        total_conf_matrix_actual += conf_matrix_actual

    if total_conf_matrix_forecasted is None:
        total_conf_matrix_forecasted = conf_matrix_forecasted
    else:
        total_conf_matrix_forecasted += conf_matrix_forecasted

    torch.cuda.empty_cache()

ValueError: Found array with 0 sample(s) (shape=(0, 397)) while a minimum of 1 is required by MinMaxScaler.

In [None]:
# **Compute Average Metrics**
avg_accuracy_actual = np.mean(accuracy_scores_actual)
avg_f1_actual = np.mean(f1_scores_actual)
avg_accuracy_forecasted = np.mean(accuracy_scores_forecasted)
avg_f1_forecasted = np.mean(f1_scores_forecasted)
avg_rmse_pca = np.mean(RMSE_PCA)
avg_conf_matrix_actual = total_conf_matrix_actual / n_splits
avg_conf_matrix_forecasted = total_conf_matrix_forecasted / n_splits

# Average classification report for actual features
avg_report_actual = {}
for key in classification_reports_actual[0].keys():
    if key not in ['accuracy', 'macro avg', 'weighted avg']:
        avg_report_actual[key] = {metric: np.mean([r[key][metric] for r in classification_reports_actual])
                                    for metric in ['precision', 'recall', 'f1-score']}
avg_report_actual['macro avg'] = {metric: np.mean([r['macro avg'][metric] for r in classification_reports_actual])
                                    for metric in ['precision', 'recall', 'f1-score']}
avg_report_actual['weighted avg'] = {metric: np.mean([r['weighted avg'][metric] for r in classification_reports_actual])
                                    for metric in ['precision', 'recall', 'f1-score']}

# Average classification report for forecasted features
avg_report_forecasted = {}
for key in classification_reports_forecasted[0].keys():
    if key not in ['accuracy', 'macro avg', 'weighted avg']:
        avg_report_forecasted[key] = {metric: np.mean([r[key][metric] for r in classification_reports_forecasted])
                                        for metric in ['precision', 'recall', 'f1-score']}
avg_report_forecasted['macro avg'] = {metric: np.mean([r['macro avg'][metric] for r in classification_reports_forecasted])
                                        for metric in ['precision', 'recall', 'f1-score']}
avg_report_forecasted['weighted avg'] = {metric: np.mean([r['weighted avg'][metric] for r in classification_reports_forecasted])
                                        for metric in ['precision', 'recall', 'f1-score']}

# **Display Results**
print("\n\nAverage Metrics for Classification on Actual Future Features:")
print(f"Average Accuracy: {avg_accuracy_actual}")
print(f"Average F1-Score: {avg_f1_actual}")
print("Average Classification Report:")
for key, metrics in avg_report_actual.items():
    if key not in ['macro avg', 'weighted avg']:
        print(f"Class {key}:")
        for metric, value in metrics.items():
            print(f"  {metric}: {value}")
    else:
        print(f"{key}:")
        for metric, value in metrics.items():
            print(f"  {metric}: {value}")
print("Average Confusion Matrix (Actual Features):")
print(avg_conf_matrix_actual)

print("\n\nAverage Metrics for Classification on Forecasted Features:")
print(f"Average Accuracy: {avg_accuracy_forecasted}")
print(f"Average F1-Score: {avg_f1_forecasted}")
print("Average Classification Report:")
for key, metrics in avg_report_forecasted.items():
    if key not in ['macro avg', 'weighted avg']:
        print(f"Class {key}:")
        for metric, value in metrics.items():
            print(f"  {metric}: {value}")
    else:
        print(f"{key}:")
        for metric, value in metrics.items():
            print(f"  {metric}: {value}")
print(f"\nAverage RMSE (PCA Inverse): {avg_rmse_pca}")
print("Average Confusion Matrix (Forecasted Features):")
print(avg_conf_matrix_forecasted)