In [1]:
import pandas as pd

In [3]:
# Load your dataset
df = pd.read_csv('X:\PROJECT\DDoS-Detection-main\dataset_sdn.csv')  # Replace with actual filename

In [4]:
# Check the shape and structure
print("Dataset Shape:", df.shape)
print("\nColumn Names:\n", df.columns)
print("\nFirst few rows:\n", df.head())
print("\nData Types:\n", df.dtypes)
print("\nMissing values:\n", df.isnull().sum())

Dataset Shape: (104345, 23)

Column Names:
 Index(['dt', 'switch', 'src', 'dst', 'pktcount', 'bytecount', 'dur',
       'dur_nsec', 'tot_dur', 'flows', 'packetins', 'pktperflow',
       'byteperflow', 'pktrate', 'Pairflow', 'Protocol', 'port_no', 'tx_bytes',
       'rx_bytes', 'tx_kbps', 'rx_kbps', 'tot_kbps', 'label'],
      dtype='object')

First few rows:
       dt  switch       src       dst  pktcount  bytecount  dur   dur_nsec  \
0  11425       1  10.0.0.1  10.0.0.8     45304   48294064  100  716000000   
1  11605       1  10.0.0.1  10.0.0.8    126395  134737070  280  734000000   
2  11425       1  10.0.0.2  10.0.0.8     90333   96294978  200  744000000   
3  11425       1  10.0.0.2  10.0.0.8     90333   96294978  200  744000000   
4  11425       1  10.0.0.2  10.0.0.8     90333   96294978  200  744000000   

        tot_dur  flows  ...  pktrate  Pairflow  Protocol  port_no   tx_bytes  \
0  1.010000e+11      3  ...      451         0       UDP        3  143928631   
1  2.810000e+11

In [5]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Load dataset
df = pd.read_csv('X:\PROJECT\DDoS-Detection-main\dataset_sdn.csv')  # Replace with your actual file

# Drop rows with missing values (or you could impute them)
df = df.dropna()

# Encode 'src', 'dst', and 'Protocol' categorical columns
label_encoders = {}
for col in ['src', 'dst', 'Protocol']:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

In [6]:
# Features and label
X = df.drop(columns=['label']).values  # shape: (num_samples, num_features)
y = df['label'].values                 # shape: (num_samples,)

In [7]:
def reshape_2D_to_3D(X, y, timesteps=5):
    Xs, ys = [], []
    for i in range(len(X) - timesteps):
        Xs.append(X[i:i+timesteps])
        ys.append(y[i+timesteps])
    return np.array(Xs), np.array(ys)

X_3D, y_3D = reshape_2D_to_3D(X, y, timesteps=5)

print("3D Features shape:", X_3D.shape)  # (samples, timesteps, features)
print("Labels shape:", y_3D.shape)

3D Features shape: (103834, 5, 22)
Labels shape: (103834,)


In [12]:
# LSTM

import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

# Assuming X_3D and y_3D are already created using the reshape_2D_to_3D function

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_3D, y_3D, test_size=0.2, random_state=42)

# Define the LSTM model
model = Sequential()
model.add(LSTM(units=64, input_shape=(X_train.shape[1], X_train.shape[2]), return_sequences=False))
model.add(Dropout(0.3))
model.add(Dense(units=32, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))  # Binary classification (0 = benign, 1 = DDoS)

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_test, y_test))

Classifier_accuracy=[]

# Evaluate on test data
loss, accuracy = model.evaluate(X_test, y_test)
print(f"LSTM Accuracy: {accuracy*100:.2f}%")

# Predictions
y_pred = (model.predict(X_test) > 0.5).astype("int32")

# Metrics
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
LSTM Accuracy: 60.67%
Classification Report:
               precision    recall  f1-score   support

           0       0.61      1.00      0.75     12584
           1       0.76      0.00      0.01      8183

    accuracy                           0.61     20767
   macro avg       0.68      0.50      0.38     20767
weighted avg       0.67      0.61      0.46     20767

Confusion Matrix:
 [[12577     7]
 [ 8161    22]]


In [None]:
# CNN-LSTM Hybrid

import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Split your 3D data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_3D, y_3D, test_size=0.2, random_state=42)

# Define the CNN-LSTM model
model = Sequential()

# CNN layers
model.add(Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.3))

# LSTM layer
model.add(LSTM(units=64, return_sequences=False))
model.add(Dropout(0.3))

# Dense layers
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))  # Binary classification

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_test, y_test))

# Evaluate the model
y_pred = (model.predict(X_test) > 0.5).astype("int32")
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of CNN-LSTM Hybrid Model: {accuracy*100:.2f}%")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy of CNN-LSTM Hybrid Model: 60.60%


In [None]:
# GRU

import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, GRU, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score

# Split your 3D data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_3D, y_3D, test_size=0.2, random_state=42)

# Define the CNN-GRU model
model = Sequential()

# CNN layers
model.add(Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.3))

# GRU layer
model.add(GRU(units=64, return_sequences=False))
model.add(Dropout(0.3))

# Dense layers
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))  # Binary classification

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_test, y_test))

# Predict and evaluate
y_pred_probs = model.predict(X_test)
y_pred = (y_pred_probs > 0.5).astype("int32")

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"\nAccuracy of CNN-GRU Hybrid Model: {accuracy*100:.2f}%")

# Confusion Matrix & Additional Metrics
cm = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = cm.ravel()
specificity = tn / (tn + fp)
f1 = f1_score(y_test, y_pred)

print("\nConfusion Matrix:\n", cm)
print(f"F1 Score: {f1:.4f}")
print(f"Specificity: {specificity:.4f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

Accuracy of CNN-GRU Hybrid Model: 60.60%

Confusion Matrix:
 [[12584     0]
 [ 8183     0]]
F1 Score: 0.0000
Specificity: 1.0000

Classification Report:
               precision    recall  f1-score   support

           0       0.61      1.00      0.75     12584
           1       0.00      0.00      0.00      8183

    accuracy                           0.61     20767
   macro avg       0.30      0.50      0.38     20767
weighted avg       0.37      0.61      0.46     20767



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# LSTM Autoencoder

import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, RepeatVector
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_3D, y_3D, test_size=0.2, random_state=42)

# Shape parameters
timesteps = X_train.shape[1]
n_features = X_train.shape[2]

# Define LSTM Autoencoder architecture
input_layer = Input(shape=(timesteps, n_features))
encoded = LSTM(64, return_sequences=False)(input_layer)
decoded = RepeatVector(timesteps)(encoded)
decoded = LSTM(n_features, return_sequences=True)(decoded)

autoencoder = Model(inputs=input_layer, outputs=decoded)
autoencoder.compile(optimizer='adam', loss='mse')

# Train the autoencoder
history = autoencoder.fit(X_train, X_train, epochs=10, batch_size=64, validation_data=(X_test, X_test))

# Predict reconstruction
reconstructions = autoencoder.predict(X_test)
mse = np.mean(np.power(X_test - reconstructions, 2), axis=(1, 2))

# Set anomaly threshold (e.g., 95th percentile)
threshold = np.percentile(mse, 95)
y_pred = (mse > threshold).astype(int)

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"\nAccuracy of LSTM Autoencoder: {accuracy*100:.2f}%")

# Confusion Matrix & Additional Metrics
cm = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = cm.ravel()
specificity = tn / (tn + fp)
f1 = f1_score(y_test, y_pred)

print("\nConfusion Matrix:\n", cm)
print(f"F1 Score: {f1:.4f}")
print(f"Specificity: {specificity:.4f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

Accuracy of LSTM Autoencoder: 58.48%

Confusion Matrix:
 [[11845   739]
 [ 7883   300]]
F1 Score: 0.0651
Specificity: 0.9413

Classification Report:
               precision    recall  f1-score   support

           0       0.60      0.94      0.73     12584
           1       0.29      0.04      0.07      8183

    accuracy                           0.58     20767
   macro avg       0.44      0.49      0.40     20767
weighted avg       0.48      0.58      0.47     20767



In [None]:
# Variational Autoencoder (VAE)

import numpy as np
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import Input, Dense, Flatten, Reshape, Lambda
from tensorflow.keras.losses import mse
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score

# Split
X_train, X_test, y_train, y_test = train_test_split(X_3D, y_3D, test_size=0.2, random_state=42)

# Parameters
timesteps, n_features = X_train.shape[1], X_train.shape[2]
original_dim = timesteps * n_features
latent_dim = 32

# Encoder
inputs = Input(shape=(timesteps, n_features))
x = Flatten()(inputs)
h = Dense(64, activation='relu')(x)
z_mean = Dense(latent_dim)(h)
z_log_var = Dense(latent_dim)(h)

def sampling(args):
    z_mean, z_log_var = args
    epsilon = tf.random.normal(shape=(tf.shape(z_mean)[0], latent_dim))
    return z_mean + tf.exp(0.5 * z_log_var) * epsilon

z = Lambda(sampling)([z_mean, z_log_var])

# Decoder
decoder_h = Dense(64, activation='relu')
decoder_output = Dense(original_dim, activation='sigmoid')
h_decoded = decoder_h(z)
x_decoded = decoder_output(h_decoded)
outputs = Reshape((timesteps, n_features))(x_decoded)

# VAE Model
vae = Model(inputs, outputs)

# VAE Loss
reconstruction_loss = mse(tf.reshape(inputs, (-1, original_dim)), x_decoded)
reconstruction_loss *= original_dim
kl_loss = 1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var)
kl_loss = tf.reduce_mean(kl_loss) * -0.5 * latent_dim
vae_loss = tf.reduce_mean(reconstruction_loss + kl_loss)
vae.add_loss(vae_loss)
vae.compile(optimizer='adam')

# Train
vae.fit(X_train, X_train, epochs=10, batch_size=64, validation_data=(X_test, X_test))

# Evaluate
reconstructions = vae.predict(X_test)
mse_score = np.mean(np.power(X_test - reconstructions, 2), axis=(1, 2))
threshold = np.percentile(mse_score, 95)
y_pred = (mse_score > threshold).astype(int)

print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Confusion Matrix:
 [[12584     0]
 [ 8183     0]]
F1 Score: 0.0
Classification Report:
               precision    recall  f1-score   support

           0       0.61      1.00      0.75     12584
           1       0.00      0.00      0.00      8183

    accuracy                           0.61     20767
   macro avg       0.30      0.50      0.38     20767
weighted avg       0.37      0.61      0.46     20767



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [23]:
pip install ydata-synthetic


^C
Note: you may need to restart the kernel to use updated packages.


In [None]:
# TimeGAN – Time-Series Generative Adversarial Network

from ydata_synthetic.synthesizers.timeseries import TimeGAN
from ydata_synthetic.preprocessing.timeseries.utils import real_data_loading

# Use your own data here (already shaped as (samples, 5, 22))
data = X_3D.astype(np.float32)

# TimeGAN config
gan_args = {
    'batch_size': 64,
    'learning_rate': 5e-4,
    'noise_dim': 32,
    'layers_dim': 128,
    'iterations': 10000
}

# Initialize & train
synthesizer = TimeGAN(model_parameters=gan_args)
synthesizer.train(data, train_steps=gan_args['iterations'])

# Generate synthetic samples
synthetic_data = synthesizer.sample(n_samples=1000)
print("Synthetic Data Shape:", synthetic_data.shape)
