In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models

# Load the data
data_clean = pd.read_csv('cleansample_insdn.csv')

# Sample 45000 rows from the data
df = data_clean.sample(n=45000, random_state=42)

# Separate features and labels from the DataFrame
features = df.drop(columns=["Label"]).values  # 30 features
labels = df["Label"].values  # Label column (last column)

# No scaling is applied, keeping the features in their original scale
features_scaled = features  

# Parameters for the diffusion model
timesteps = 1000
embedding_dim = 128  # Adjust based on your data
input_dim = features_scaled.shape[1]  
num_classes = len(np.unique(labels))

# Noise schedule (betas for the diffusion process)
def get_noise_schedule(timesteps):
    beta_start = 0.0001
    beta_end = 0.02
    return np.linspace(beta_start, beta_end, timesteps)

betas = get_noise_schedule(timesteps)

# Forward noise process (adding noise to data)
def forward_noise(x, t):
    noise = np.random.normal(size=x.shape)
    return np.sqrt(1 - betas[t]) * x + np.sqrt(betas[t]) * noise

# Build conditional reverse model (MLP-based), with labels
def build_conditional_reverse_model(input_dim, embedding_dim, num_classes):
    input_data = layers.Input(shape=(input_dim,))
    input_label = layers.Input(shape=(num_classes,))  # Labels one-hot encoded

    # Concatenate data and label
    x = layers.concatenate([input_data, input_label])
    x = layers.Dense(embedding_dim, activation='relu')(x)
    x = layers.Dense(embedding_dim, activation='relu')(x)
    output = layers.Dense(input_dim)(x)  # Output is the denoised (or OOD) data

    model = models.Model(inputs=[input_data, input_label], outputs=output)
    return model

# Loss function for reverse diffusion
def diffusion_loss(y_true, y_pred):
    return tf.reduce_mean(tf.square(y_true - y_pred))

# Training the reverse diffusion model with label conditioning
def train_conditional_reverse_diffusion_model(model, data, labels, timesteps, epochs=50, batch_size=32):
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)  # Lower learning rate
    
    for epoch in range(epochs):
        for step in range(0, len(data), batch_size):
            x_batch = data[step:step+batch_size]
            y_batch = labels[step:step+batch_size]
            
            t = np.random.randint(0, timesteps)  # Randomly choose a timestep
            noisy_data = forward_noise(x_batch, t)  # Add noise to data

            # Train model to predict the clean data from noisy data
            with tf.GradientTape() as tape:
                predictions = model([noisy_data, y_batch], training=True)
                loss = diffusion_loss(x_batch, predictions)
            
            gradients = tape.gradient(loss, model.trainable_weights)
            optimizer.apply_gradients(zip(gradients, model.trainable_weights))
        
        print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.numpy()}")

# Convert labels to one-hot encoding
labels_one_hot = tf.keras.utils.to_categorical(labels, num_classes=num_classes)

# Build the conditional reverse model
reverse_model = build_conditional_reverse_model(input_dim, embedding_dim, num_classes)

# Train reverse model with labeled data
train_conditional_reverse_diffusion_model(reverse_model, features_scaled, labels_one_hot, timesteps, epochs=50)

# Generate OOD samples without scaling, with labels in (10000, 1) format
def generate_conditional_ood_samples_with_labels(model, num_samples, input_dim, num_classes, timesteps):
    # Randomly sample classes as integers
    random_classes = np.random.randint(0, num_classes, num_samples)  # Labels as integers, not one-hot
    
    # Start with random noise
    noise = np.random.normal(size=(num_samples, input_dim))  # Keep the noise in the original scale
    
    # Iteratively apply reverse diffusion
    for t in reversed(range(timesteps)):
        noise = model([noise, tf.keras.utils.to_categorical(random_classes, num_classes=num_classes)], training=False)  # Predict the clean data
        # Optionally, can add noise back to keep it stochastic
        noise = np.sqrt(1 - betas[t]) * noise + np.sqrt(betas[t]) * np.random.normal(size=noise.shape)
    
    # Return OOD samples and labels in shape (10000, 1)
    return noise, random_classes.reshape(-1, 1)  # Labels as (10000, 1) with values 0 or 1

# Generate OOD samples with labels in (10000, 1) format
ood_samples, ood_labels = generate_conditional_ood_samples_with_labels(reverse_model, num_samples=10000, input_dim=input_dim, num_classes=num_classes, timesteps=timesteps)

# Check the output shape
print("OOD Samples Shape:", ood_samples.shape)  # Should be (10000, input_dim)
print("OOD Labels Shape:", ood_labels.shape)    # Should be (10000, 1)
print("Sample OOD Labels:", ood_labels[:10])

# Create a Pandas DataFrame for OOD samples and labels
ood_samples_np = ood_samples.numpy() if isinstance(ood_samples, tf.Tensor) else ood_samples
ood_labels_np = ood_labels.numpy() if isinstance(ood_labels, tf.Tensor) else ood_labels

# Define column names for OOD samples based on your original dataset
column_names = df.drop(columns=["Label"]).columns.tolist()

# Create a Pandas DataFrame for the OOD samples
ood_samples_df = pd.DataFrame(ood_samples_np, columns=column_names)
ood_labels_df = pd.DataFrame(ood_labels_np, columns=['Label'])

# Concatenate the OOD samples and labels
ood_combined_df = pd.concat([ood_samples_df, ood_labels_df], axis=1)

# Save the OOD samples to CSV
ood_combined_df.to_csv('ood_samples_SDN.csv', index=False)


Epoch 1/50, Loss: 17625802752.0
Epoch 2/50, Loss: 2111551744.0
Epoch 3/50, Loss: 81706960.0
Epoch 4/50, Loss: 37312496.0
Epoch 5/50, Loss: 20806314.0
Epoch 6/50, Loss: 22675716.0
Epoch 7/50, Loss: 21469414.0
Epoch 8/50, Loss: 15504832.0
Epoch 9/50, Loss: 21020394.0
Epoch 10/50, Loss: 36449032.0
Epoch 11/50, Loss: 13996612.0
Epoch 12/50, Loss: 15704698.0
Epoch 13/50, Loss: 15011846.0
Epoch 14/50, Loss: 16277535.0
Epoch 15/50, Loss: 15697503.0
Epoch 16/50, Loss: 12515750.0
Epoch 17/50, Loss: 12072567.0
Epoch 18/50, Loss: 16311497.0
Epoch 19/50, Loss: 23723580.0
Epoch 20/50, Loss: 10800391.0
Epoch 21/50, Loss: 10407817.0
Epoch 22/50, Loss: 12170982.0
Epoch 23/50, Loss: 16641034.0
Epoch 24/50, Loss: 9647277.0
Epoch 25/50, Loss: 11903851.0
Epoch 26/50, Loss: 13505783.0
Epoch 27/50, Loss: 28423062.0
Epoch 28/50, Loss: 11431113.0
Epoch 29/50, Loss: 8240112.0
Epoch 30/50, Loss: 9163342.0
Epoch 31/50, Loss: 19432244.0
Epoch 32/50, Loss: 14819814.0
Epoch 33/50, Loss: 11171642.0
Epoch 34/50, Loss

In [19]:
newdf=pd.read_csv('ood_samples_SDN.csv')
newdf1=pd.read_csv('ood_samples_SDN1.csv')

In [20]:
newdf.head()

Unnamed: 0,Protocol,Bwd Pkts/s,Init Bwd Win Byts,Flow Pkts/s,SYN Flag Cnt,FIN Flag Cnt,ACK Flag Cnt,Down/Up Ratio,Bwd Header Len,Bwd Pkt Len Std,...,Bwd Pkt Len Min,Pkt Len Min,Fwd Pkt Len Std,Bwd PSH Flags,Bwd IAT Tot,Flow IAT Max,Bwd IAT Mean,Flow IAT Min,Subflow Fwd Pkts,Label
0,,,,,,,,,,,...,,,,,,,,,,1
1,,,,,,,,,,,...,,,,,,,,,,1
2,,,,,,,,,,,...,,,,,,,,,,1
3,-284707400000.0,-245805800000.0,-155400300000.0,80227700000.0,-22156600000.0,-110214900000.0,2666426000.0,178829500000.0,-294175200000.0,385623400000.0,...,-137716600000.0,116002600000.0,-221740700000.0,-229176300000.0,-1015367000000.0,192053900000.0,-131970600000.0,1311052000000.0,-75802760000.0,0
4,145922000.0,-114594600.0,45359130.0,347001200.0,-222365700.0,-188594700.0,-17925720.0,91622700.0,-9286026.0,-391413800.0,...,-211852100.0,-6652255.0,190381900.0,175120700.0,5581097000.0,4584634000.0,464799900.0,-352118500.0,-66954600.0,0


In [21]:
ood_combined_df1 = pd.concat([newdf, newdf1])

In [22]:
ood_combined_df1.head()

Unnamed: 0,Protocol,Bwd Pkts/s,Init Bwd Win Byts,Flow Pkts/s,SYN Flag Cnt,FIN Flag Cnt,ACK Flag Cnt,Down/Up Ratio,Bwd Header Len,Bwd Pkt Len Std,...,Bwd Pkt Len Min,Pkt Len Min,Fwd Pkt Len Std,Bwd PSH Flags,Bwd IAT Tot,Flow IAT Max,Bwd IAT Mean,Flow IAT Min,Subflow Fwd Pkts,Label
0,,,,,,,,,,,...,,,,,,,,,,1
1,,,,,,,,,,,...,,,,,,,,,,1
2,,,,,,,,,,,...,,,,,,,,,,1
3,-284707400000.0,-245805800000.0,-155400300000.0,80227700000.0,-22156600000.0,-110214900000.0,2666426000.0,178829500000.0,-294175200000.0,385623400000.0,...,-137716600000.0,116002600000.0,-221740700000.0,-229176300000.0,-1015367000000.0,192053900000.0,-131970600000.0,1311052000000.0,-75802760000.0,0
4,145922000.0,-114594600.0,45359130.0,347001200.0,-222365700.0,-188594700.0,-17925720.0,91622700.0,-9286026.0,-391413800.0,...,-211852100.0,-6652255.0,190381900.0,175120700.0,5581097000.0,4584634000.0,464799900.0,-352118500.0,-66954600.0,0


In [26]:
ood_combined_df1 =ood_combined_df1.dropna()

In [27]:
# Save the OOD samples to CSV
ood_combined_df1.to_csv('ood_samples_SDNnn.csv', index=False)