In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models

# Load the data
data_clean = pd.read_csv('cleansample_ciciot23.csv')

# Sample 45000 rows from the data
df = data_clean.sample(n=50000, random_state=42)

In [13]:
df.head()

Unnamed: 0,Header_Length,LLC,TCP,UDP,DHCP,ARP,ICMP,IGMP,IPv,Tot sum,...,ece_flag_number,Time_To_Live,Rate,fin_flag_number,syn_flag_number,rst_flag_number,psh_flag_number,ack_flag_number,cwr_flag_number,Label
636342,25.2,1.0,0.7,0.3,0.0,0.0,0.0,0.0,1.0,1128,...,0.0,85.7,45.588031,0.0,0.2,0.0,0.2,0.6,0.0,0
791919,20.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,6000,...,0.0,64.0,21335.286637,0.0,0.82,0.18,0.0,0.0,0.0,1
554820,21.28,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,6000,...,0.0,64.0,19542.021153,0.0,1.0,0.0,0.0,0.32,0.0,1
416904,19.88,1.0,0.99,0.01,0.0,0.0,0.0,0.0,1.0,6093,...,0.0,64.64,19084.102284,0.0,0.99,0.0,0.0,0.0,0.0,1
273907,32.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,9359,...,0.0,130.8,725.118683,0.0,0.0,0.0,0.1,1.0,0.0,0


In [None]:


# Separate features and labels from the DataFrame
features = df.drop(columns=["Label"]).values  # 30 features
labels = df["Label"].values  # Label column (last column)

# No scaling is applied, keeping the features in their original scale
features_scaled = features  

# Parameters for the diffusion model
timesteps = 1000
embedding_dim = 128  # Adjust based on your data
input_dim = features_scaled.shape[1]  
num_classes = len(np.unique(labels))

# Noise schedule (betas for the diffusion process)
def get_noise_schedule(timesteps):
    beta_start = 0.0001
    beta_end = 0.02
    return np.linspace(beta_start, beta_end, timesteps)

betas = get_noise_schedule(timesteps)

# Forward noise process (adding noise to data)
def forward_noise(x, t):
    noise = np.random.normal(size=x.shape)
    return np.sqrt(1 - betas[t]) * x + np.sqrt(betas[t]) * noise

# Build conditional reverse model (MLP-based), with labels
def build_conditional_reverse_model(input_dim, embedding_dim, num_classes):
    input_data = layers.Input(shape=(input_dim,))
    input_label = layers.Input(shape=(num_classes,))  # Labels one-hot encoded

    # Concatenate data and label
    x = layers.concatenate([input_data, input_label])
    x = layers.Dense(embedding_dim, activation='relu')(x)
    x = layers.Dense(embedding_dim, activation='relu')(x)
    output = layers.Dense(input_dim)(x)  # Output is the denoised (or OOD) data

    model = models.Model(inputs=[input_data, input_label], outputs=output)
    return model

# Loss function for reverse diffusion
def diffusion_loss(y_true, y_pred):
    return tf.reduce_mean(tf.square(y_true - y_pred))

# Training the reverse diffusion model with label conditioning
def train_conditional_reverse_diffusion_model(model, data, labels, timesteps, epochs=50, batch_size=32):
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)  # Lower learning rate
    
    for epoch in range(epochs):
        for step in range(0, len(data), batch_size):
            x_batch = data[step:step+batch_size]
            y_batch = labels[step:step+batch_size]
            
            t = np.random.randint(0, timesteps)  # Randomly choose a timestep
            noisy_data = forward_noise(x_batch, t)  # Add noise to data

            # Train model to predict the clean data from noisy data
            with tf.GradientTape() as tape:
                predictions = model([noisy_data, y_batch], training=True)
                loss = diffusion_loss(x_batch, predictions)
            
            gradients = tape.gradient(loss, model.trainable_weights)
            optimizer.apply_gradients(zip(gradients, model.trainable_weights))
        
        print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.numpy()}")

# Convert labels to one-hot encoding
labels_one_hot = tf.keras.utils.to_categorical(labels, num_classes=num_classes)

# Build the conditional reverse model
reverse_model = build_conditional_reverse_model(input_dim, embedding_dim, num_classes)

# Train reverse model with labeled data
train_conditional_reverse_diffusion_model(reverse_model, features_scaled, labels_one_hot, timesteps, epochs=50)

# Generate OOD samples without scaling, with labels in (10000, 1) format
def generate_conditional_ood_samples_with_labels(model, num_samples, input_dim, num_classes, timesteps):
    # Randomly sample classes as integers
    random_classes = np.random.randint(0, num_classes, num_samples)  # Labels as integers, not one-hot
    
    # Start with random noise
    noise = np.random.normal(size=(num_samples, input_dim))  # Keep the noise in the original scale
    
    # Iteratively apply reverse diffusion
    for t in reversed(range(timesteps)):
        noise = model([noise, tf.keras.utils.to_categorical(random_classes, num_classes=num_classes)], training=False)  # Predict the clean data
        # Optionally, can add noise back to keep it stochastic
        noise = np.sqrt(1 - betas[t]) * noise + np.sqrt(betas[t]) * np.random.normal(size=noise.shape)
    
    # Return OOD samples and labels in shape (10000, 1)
    return noise, random_classes.reshape(-1, 1)  # Labels as (10000, 1) with values 0 or 1

# Generate OOD samples with labels in (10000, 1) format
ood_samples, ood_labels = generate_conditional_ood_samples_with_labels(reverse_model, num_samples=20000, input_dim=input_dim, num_classes=num_classes, timesteps=timesteps)

# Check the output shape
print("OOD Samples Shape:", ood_samples.shape)  # Should be (10000, input_dim)
print("OOD Labels Shape:", ood_labels.shape)    # Should be (10000, 1)
print("Sample OOD Labels:", ood_labels[:10])

# Create a Pandas DataFrame for OOD samples and labels
ood_samples_np = ood_samples.numpy() if isinstance(ood_samples, tf.Tensor) else ood_samples
ood_labels_np = ood_labels.numpy() if isinstance(ood_labels, tf.Tensor) else ood_labels

# Define column names for OOD samples based on your original dataset
column_names = df.drop(columns=["Label"]).columns.tolist()

# Create a Pandas DataFrame for the OOD samples
ood_samples_df = pd.DataFrame(ood_samples_np, columns=column_names)
ood_labels_df = pd.DataFrame(ood_labels_np, columns=['Label'])

# Concatenate the OOD samples and labels
ood_combined_df = pd.concat([ood_samples_df, ood_labels_df], axis=1)

# Save the OOD samples to CSV
ood_combined_df.to_csv('ood_samples_IoT.csv', index=False)


Epoch 1/50, Loss: 15198.248046875
Epoch 2/50, Loss: 4794.9453125
Epoch 3/50, Loss: 2073.04052734375
Epoch 4/50, Loss: 1166.8824462890625
Epoch 5/50, Loss: 1103.0015869140625
Epoch 6/50, Loss: 813.5145874023438
Epoch 7/50, Loss: 818.8939819335938
Epoch 8/50, Loss: 427.78118896484375
Epoch 9/50, Loss: 435.6746520996094
Epoch 10/50, Loss: 313.1453857421875
Epoch 11/50, Loss: 231.5169219970703
Epoch 12/50, Loss: 429.9721374511719
Epoch 13/50, Loss: 364.1856689453125
Epoch 14/50, Loss: 193.2771759033203
Epoch 15/50, Loss: 198.85357666015625
Epoch 16/50, Loss: 400.94342041015625
Epoch 17/50, Loss: 136.79483032226562
Epoch 18/50, Loss: 128.87208557128906
Epoch 19/50, Loss: 118.69204711914062
Epoch 20/50, Loss: 191.47357177734375
Epoch 21/50, Loss: 107.09317016601562
Epoch 22/50, Loss: 293.2994689941406
Epoch 23/50, Loss: 240.05628967285156
Epoch 24/50, Loss: 80.39372253417969
Epoch 25/50, Loss: 138.4593048095703
Epoch 26/50, Loss: 425.3372497558594
Epoch 27/50, Loss: 95.53495025634766
Epoch 2

In [15]:
newdf=pd.read_csv('ood_samples_IoT.csv')

In [16]:
newdf.head()

Unnamed: 0,Header_Length,LLC,TCP,UDP,DHCP,ARP,ICMP,IGMP,IPv,Tot sum,...,ece_flag_number,Time_To_Live,Rate,fin_flag_number,syn_flag_number,rst_flag_number,psh_flag_number,ack_flag_number,cwr_flag_number,Label
0,109.77064,1.748207,7.294745,-6.981845,-1.471898,0.035954,4.584979,11.591014,-0.087753,1692.4153,...,9.731266,608.89246,882.9111,-3.454185,8.889511,7.784659,3.099734,6.60099,-13.527213,0
1,81.16375,1.488363,5.336557,-5.054549,-0.907123,-0.507603,2.840215,8.99997,0.457124,1251.7667,...,7.142943,450.5136,634.31287,-3.228183,6.522609,5.677407,2.075346,5.244236,-9.897901,1
2,81.31266,1.472496,5.33175,-5.082406,-0.91854,-0.499593,2.864492,9.031106,0.45167,1253.8086,...,7.135342,451.23407,635.2308,-3.234776,6.537467,5.675081,2.065433,5.280328,-9.949425,1
3,80.74574,1.477105,5.278973,-5.059063,-0.91555,-0.503525,2.815097,8.97814,0.457655,1244.9521,...,7.094887,448.06384,630.49664,-3.196458,6.476322,5.64611,2.052216,5.240566,-9.860132,1
4,80.89248,1.475915,5.312867,-5.073564,-0.917409,-0.495399,2.824968,8.977908,0.467393,1247.0093,...,7.098178,448.83362,631.6133,-3.215761,6.514331,5.666192,2.092775,5.216328,-9.863365,1
