In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models

# Load the data
data_clean = pd.read_csv('cleansample_cicids2017.csv')

# Sample 45000 rows from the data
df = data_clean.sample(n=45000, random_state=42)

# Separate features and labels from the DataFrame
features = df.drop(columns=["Label"]).values  # 30 features
labels = df["Label"].values  # Label column (last column)

# No scaling is applied, keeping the features in their original scale
features_scaled = features  

# Parameters for the diffusion model
timesteps = 1000
embedding_dim = 128  # Adjust based on your data
input_dim = features_scaled.shape[1] 
num_classes = len(np.unique(labels))

# Noise schedule (betas for the diffusion process)
def get_noise_schedule(timesteps):
    beta_start = 0.0001
    beta_end = 0.02
    return np.linspace(beta_start, beta_end, timesteps)

betas = get_noise_schedule(timesteps)

# Forward noise process (adding noise to data)
def forward_noise(x, t):
    noise = np.random.normal(size=x.shape)
    return np.sqrt(1 - betas[t]) * x + np.sqrt(betas[t]) * noise

# Build conditional reverse model (MLP-based), with labels
def build_conditional_reverse_model(input_dim, embedding_dim, num_classes):
    input_data = layers.Input(shape=(input_dim,))
    input_label = layers.Input(shape=(num_classes,))  # Labels one-hot encoded

    # Concatenate data and label
    x = layers.concatenate([input_data, input_label])
    x = layers.Dense(embedding_dim, activation='relu')(x)
    x = layers.Dense(embedding_dim, activation='relu')(x)
    output = layers.Dense(input_dim)(x)  # Output is the denoised (or OOD) data

    model = models.Model(inputs=[input_data, input_label], outputs=output)
    return model

# Loss function for reverse diffusion
def diffusion_loss(y_true, y_pred):
    return tf.reduce_mean(tf.square(y_true - y_pred))

# Training the reverse diffusion model with label conditioning
def train_conditional_reverse_diffusion_model(model, data, labels, timesteps, epochs=50, batch_size=32):
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)  # Lower learning rate
    
    for epoch in range(epochs):
        for step in range(0, len(data), batch_size):
            x_batch = data[step:step+batch_size]
            y_batch = labels[step:step+batch_size]
            
            t = np.random.randint(0, timesteps)  # Randomly choose a timestep
            noisy_data = forward_noise(x_batch, t)  # Add noise to data

            # Train model to predict the clean data from noisy data
            with tf.GradientTape() as tape:
                predictions = model([noisy_data, y_batch], training=True)
                loss = diffusion_loss(x_batch, predictions)
            
            gradients = tape.gradient(loss, model.trainable_weights)
            optimizer.apply_gradients(zip(gradients, model.trainable_weights))
        
        print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.numpy()}")

# Convert labels to one-hot encoding
labels_one_hot = tf.keras.utils.to_categorical(labels, num_classes=num_classes)

# Build the conditional reverse model
reverse_model = build_conditional_reverse_model(input_dim, embedding_dim, num_classes)

# Train reverse model with labeled data
train_conditional_reverse_diffusion_model(reverse_model, features_scaled, labels_one_hot, timesteps, epochs=50)

# Generate OOD samples without scaling, with labels in (10000, 1) format
def generate_conditional_ood_samples_with_labels(model, num_samples, input_dim, num_classes, timesteps):
    # Randomly sample classes as integers
    random_classes = np.random.randint(0, num_classes, num_samples)  # Labels as integers, not one-hot
    
    # Start with random noise
    noise = np.random.normal(size=(num_samples, input_dim))  # Keep the noise in the original scale
    
    # Iteratively apply reverse diffusion
    for t in reversed(range(timesteps)):
        noise = model([noise, tf.keras.utils.to_categorical(random_classes, num_classes=num_classes)], training=False)  # Predict the clean data
        # Optionally, can add noise back to keep it stochastic
        noise = np.sqrt(1 - betas[t]) * noise + np.sqrt(betas[t]) * np.random.normal(size=noise.shape)
    
    # Return OOD samples and labels in shape (10000, 1)
    return noise, random_classes.reshape(-1, 1)  # Labels as (10000, 1) with values 0 or 1

# Generate OOD samples with labels in (10000, 1) format
ood_samples, ood_labels = generate_conditional_ood_samples_with_labels(reverse_model, num_samples=10000, input_dim=input_dim, num_classes=num_classes, timesteps=timesteps)

# Check the output shape
print("OOD Samples Shape:", ood_samples.shape)  # Should be (10000, input_dim)
print("OOD Labels Shape:", ood_labels.shape)    # Should be (10000, 1)
print("Sample OOD Labels:", ood_labels[:10])

# Create a Pandas DataFrame for OOD samples and labels
ood_samples_np = ood_samples.numpy() if isinstance(ood_samples, tf.Tensor) else ood_samples
ood_labels_np = ood_labels.numpy() if isinstance(ood_labels, tf.Tensor) else ood_labels

# Define column names for OOD samples based on your original dataset
column_names = df.drop(columns=["Label"]).columns.tolist()

# Create a Pandas DataFrame for the OOD samples
ood_samples_df = pd.DataFrame(ood_samples_np, columns=column_names)
ood_labels_df = pd.DataFrame(ood_labels_np, columns=['Label'])

# Concatenate the OOD samples and labels
ood_combined_df = pd.concat([ood_samples_df, ood_labels_df], axis=1)

# Save the OOD samples to CSV
ood_combined_df.to_csv('ood_samples_CICIDS2017.csv', index=False)




Epoch 1/50, Loss: 73679806464.0
Epoch 2/50, Loss: 32025475072.0
Epoch 3/50, Loss: 17187530752.0
Epoch 4/50, Loss: 9119132672.0
Epoch 5/50, Loss: 5388014080.0
Epoch 6/50, Loss: 4026478848.0
Epoch 7/50, Loss: 3655920384.0
Epoch 8/50, Loss: 3081977344.0
Epoch 9/50, Loss: 2680680192.0
Epoch 10/50, Loss: 2824721664.0
Epoch 11/50, Loss: 4238984192.0
Epoch 12/50, Loss: 3036802816.0
Epoch 13/50, Loss: 2288489984.0
Epoch 14/50, Loss: 2044369536.0
Epoch 15/50, Loss: 2845809408.0
Epoch 16/50, Loss: 2145673472.0
Epoch 17/50, Loss: 1774596608.0
Epoch 18/50, Loss: 1700304000.0
Epoch 19/50, Loss: 4041637376.0
Epoch 20/50, Loss: 3025621760.0
Epoch 21/50, Loss: 2300027136.0
Epoch 22/50, Loss: 3869168896.0
Epoch 23/50, Loss: 2644785664.0
Epoch 24/50, Loss: 1414704384.0
Epoch 25/50, Loss: 1870813440.0
Epoch 26/50, Loss: 1332093184.0
Epoch 27/50, Loss: 2069321600.0
Epoch 28/50, Loss: 1748551680.0
Epoch 29/50, Loss: 1713818240.0
Epoch 30/50, Loss: 1315737600.0
Epoch 31/50, Loss: 1318360832.0
Epoch 32/50,

In [2]:
newdf=pd.read_csv('ood_samples_CICIDS2017.csv')

In [3]:
newdf.head()

Unnamed: 0,Bwd Packet Length Std,Packet Length Std,Bwd Packet Length Max,Avg Bwd Segment Size,Bwd Packet Length Mean,Average Packet Size,Packet Length Variance,ACK Flag Count,Fwd IAT Std,Packet Length Mean,...,Idle Max,Flow Duration,Bwd IAT Total,act_data_pkt_fwd,Min Packet Length,Fwd IAT Mean,Bwd IAT Max,URG Flag Count,Fwd Packet Length Std,Label
0,-1.0949179999999998e+29,5.350024e+29,-1.9627379999999998e+30,3.810242e+29,-1.87774e+29,2.089048e+30,2.8319110000000003e+31,5.854916e+29,3.209356e+31,1.1872999999999999e+30,...,1.2998910000000001e+31,3.6248170000000004e+31,2.0258510000000002e+31,1.288902e+30,5.2199129999999995e+29,2.5207850000000002e+31,3.2075000000000004e+31,1.040691e+30,-3.5883e+29,0
1,-8.381894e+27,4.0953e+28,-1.502367e+29,2.916673e+28,-1.437232e+28,1.5990469999999998e+29,2.167653e+30,4.481571e+28,2.456566e+30,9.087922e+28,...,9.949804999999999e+29,2.774563e+30,1.550658e+30,9.865701e+28,3.995619e+28,1.929511e+30,2.455145e+30,7.966081e+28,-2.746534e+28,1
2,-2.134693e+28,1.0428869999999999e+29,-3.82588e+29,7.427595e+28,-3.659997e+28,4.072041e+29,5.520051e+30,1.1412479999999999e+29,6.255781e+30,2.314299e+29,...,2.533771e+30,7.065574e+30,3.948836e+30,2.5123529999999997e+29,1.01751e+29,4.913614e+30,6.252162e+30,2.028612e+29,-6.994174e+28,1
3,-2.310333e+28,1.1287509999999999e+29,-4.1409269999999995e+29,8.038998e+28,-3.961325e+28,4.407414e+29,5.974637e+30,1.2352499999999998e+29,6.770955e+30,2.504889e+29,...,2.742434e+30,7.647443999999999e+30,4.274031e+30,2.719265e+29,1.101292e+29,5.318248e+30,6.76704e+30,2.19566e+29,-7.570301e+28,1
4,-2.590786e+28,1.265932e+29,-4.644165e+29,9.016211e+28,-4.442834e+28,4.942979999999999e+29,6.700713e+30,1.385351e+29,7.5938e+30,2.8093019999999997e+29,...,3.075716e+30,8.576814999999999e+30,4.793439e+30,3.0497229999999996e+29,1.235155e+29,5.964552e+30,7.589411e+30,2.4624719999999998e+29,-8.490107e+28,1
