In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input, Lambda
import tensorflow.keras.backend as K
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import classification_report

In [None]:
# Read 'UNSW_NB15_training-set.csv' (path relative to the working directory) into a DataFrame
data = pd.read_csv(filepath_or_buffer='UNSW_NB15_training-set.csv')

In [None]:
# Preview the first rows. A bare last expression is rendered with the notebook's rich
# table display; print() falls back to the plain-text repr, which elides columns ("...").
data.head()

   id       dur proto service state  spkts  dpkts  sbytes  dbytes  \
0   1  0.000011   udp       -   INT      2      0     496       0   
1   2  0.000008   udp       -   INT      2      0    1762       0   
2   3  0.000005   udp       -   INT      2      0    1068       0   
3   4  0.000006   udp       -   INT      2      0     900       0   
4   5  0.000010   udp       -   INT      2      0    2126       0   

          rate  ...  ct_dst_sport_ltm  ct_dst_src_ltm  is_ftp_login  \
0   90909.0902  ...                 1               2             0   
1  125000.0003  ...                 1               2             0   
2  200000.0051  ...                 1               3             0   
3  166666.6608  ...                 1               3             0   
4  100000.0025  ...                 1               3             0   

   ct_ftp_cmd  ct_flw_http_mthd  ct_src_ltm  ct_srv_dst  is_sm_ips_ports  \
0           0                 0           1           2                0   
1     

Step 1: Data Preprocessing

In [None]:
# 1.1: Handling categorical features
# Convert 'proto', 'service', and 'state' columns to integer codes using LabelEncoder.
# Every column gets its own encoder, stored in `label_encoders`, so the fitted
# category -> code mapping of each column stays available afterwards. Refitting one
# shared encoder keeps only the classes of the last column it was fitted on.
categorical_columns = ['proto', 'service', 'state']
label_encoders = {}
for column in categorical_columns:
    label_encoders[column] = LabelEncoder()
    data[column] = label_encoders[column].fit_transform(data[column])

# Backward-compatible alias: as before, `label_enc` ends up fitted on 'state'
label_enc = label_encoders['state']

In [None]:
# 1.2: Remove columns that are not used as model inputs for now ('id', 'attack_cat')
unused_columns = ['id', 'attack_cat']
data = data.drop(labels=unused_columns, axis=1)

In [None]:
# 1.3: Separate the target column from the feature columns
target_column = 'label'
y = data[target_column]                 # Labels (Normal = 0, Attack = 1)
X = data.drop(columns=[target_column])  # Features: every remaining column

In [None]:
# 1.4: Standardize the features with StandardScaler (fit, then transform, on X)
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

In [None]:
# 1.5: Train-test split (70% training, 30% testing)
# Split the unscaled features first, then fit the scaler on the training rows only, so
# that statistics of the test rows (mean / variance) do not leak into the training data.
# The test rows are transformed with the scaler fitted on the training rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

Step 2: Contrastive Learning Model

In [None]:
# MLP encoder that maps a feature vector to a 16-dimensional latent vector
def create_encoder(input_shape):
    """Build the encoder network.

    Args:
        input_shape: number of input features (used as the single input dimension).

    Returns:
        A Keras Model made of three ReLU Dense layers (128, 64 and 32 units) followed
        by a 16-unit Dense layer without activation, the latent representation.
    """
    inputs = Input(shape=(input_shape,))
    hidden = inputs
    for units in (128, 64, 32):
        hidden = Dense(units, activation='relu')(hidden)
    latent = Dense(16)(hidden)  # Latent space representation
    return Model(inputs, latent)

In [None]:
# Contrastive Loss Function
def contrastive_loss(y_true, y_pred, margin=1.0):
    """Contrastive loss on pair distances.

    Args:
        y_true: pair labels, 1 for a similar (same-class) pair and 0 for a dissimilar pair.
        y_pred: predicted distance of each pair.
        margin: dissimilar pairs that are closer than this distance are penalised.

    Returns:
        Scalar tensor: mean over the batch of
        y * d^2 + (1 - y) * max(margin - d, 0)^2.
    """
    # Flatten both tensors and cast the labels to the distance dtype so the products
    # below are element-wise. Labels shaped (batch, 1) against distances shaped (batch,)
    # would otherwise broadcast to a (batch, batch) matrix and silently change the loss.
    y_pred = K.reshape(y_pred, (-1,))
    y_true = K.cast(K.reshape(y_true, (-1,)), y_pred.dtype)

    square_pred = K.square(y_pred)
    margin_square = K.square(K.maximum(margin - y_pred, 0))
    return K.mean(y_true * square_pred + (1 - y_true) * margin_square)

In [None]:
# Build the siamese network for contrastive learning
def _euclidean_distance(embeddings):
    """Euclidean distance between the two embedding batches in `embeddings`.

    The summed squares are clamped to K.epsilon() before the square root. The
    derivative of sqrt is unbounded at 0, so a pair with identical embeddings (for
    example a sample paired with itself) would otherwise yield NaN gradients.
    """
    sum_square = K.sum(K.square(embeddings[0] - embeddings[1]), axis=-1)
    return K.sqrt(K.maximum(sum_square, K.epsilon()))


def create_siamese_network(input_shape):
    """Build a siamese model that outputs the distance between two encoded inputs.

    Args:
        input_shape: number of input features of each of the two inputs.

    Returns:
        A Keras Model taking [input_a, input_b] and returning one Euclidean distance
        per pair, computed between the encoder outputs of the two inputs.
    """
    # A single encoder is applied to both inputs, so both branches share weights
    encoder = create_encoder(input_shape)

    input_a = Input(shape=(input_shape,))
    input_b = Input(shape=(input_shape,))

    encoded_a = encoder(input_a)
    encoded_b = encoder(input_b)
    # Euclidean distance between the two encodings
    distance = Lambda(_euclidean_distance)([encoded_a, encoded_b])

    siamese_network = Model(inputs=[input_a, input_b], outputs=distance)
    return siamese_network

In [None]:
# Build the siamese model for the width of the training features and compile it
input_shape = X_train.shape[1]  # number of feature columns
siamese_model = create_siamese_network(input_shape)
siamese_model.compile(
    loss=contrastive_loss,
    optimizer=Adam(learning_rate=0.001),
)

In [None]:
# Import the necessary libraries
from sklearn.model_selection import train_test_split
import numpy as np
import random  # This is the missing import

# Create pairs for contrastive learning by sampling
def create_pairs(X, y, num_pairs):
    """Randomly sample labelled pairs of samples for contrastive learning.

    Args:
        X: array-like of samples; row i is one sample.
        y: class labels aligned with X by position (pandas Series, list or array).
        num_pairs: total number of pairs to return.

    Returns:
        (pairs, labels_pairs): `pairs` is an array of [X[i], X[j]] pairs and
        `labels_pairs` holds 1 for a same-class pair and 0 for a different-class pair.
        num_pairs // 2 pairs are positive and the remaining ones negative, so an odd
        num_pairs gets one extra negative pair.

    Raises:
        ValueError: if X and y differ in length, if pairs are requested from empty
            data, or if negative pairs are requested but y holds fewer than two classes.
    """
    samples = np.asarray(X)
    # Positional label array: independent of a Series index and also valid for lists/arrays
    class_labels = np.asarray(y)
    n = len(samples)
    if len(class_labels) != n:
        raise ValueError("X and y must have the same number of samples")

    # Quotas add up to exactly num_pairs, so the sampling loop ends for odd values too
    positive_target = max(num_pairs, 0) // 2
    negative_target = max(num_pairs, 0) - positive_target

    if positive_target + negative_target > 0 and n == 0:
        raise ValueError("cannot create pairs from an empty dataset")
    if negative_target > 0 and len(np.unique(class_labels)) < 2:
        # Rejection sampling below could never find a different-class pair
        raise ValueError("negative pairs require at least two distinct classes in y")

    pairs = []
    labels_pairs = []
    positive_pairs = 0
    negative_pairs = 0

    # Rejection sampling: draw two random rows and keep the pair while its quota is open
    while positive_pairs < positive_target or negative_pairs < negative_target:
        i = random.randint(0, n - 1)
        j = random.randint(0, n - 1)

        if class_labels[i] == class_labels[j]:
            if positive_pairs < positive_target:
                # Positive pair (same class)
                pairs.append([samples[i], samples[j]])
                labels_pairs.append(1)
                positive_pairs += 1
        elif negative_pairs < negative_target:
            # Negative pair (different classes)
            pairs.append([samples[i], samples[j]])
            labels_pairs.append(0)
            negative_pairs += 1

    return np.array(pairs), np.array(labels_pairs)

# X_train and y_train hold the training split (features and labels).
# Keep 30% of it (test_size=0.7 sets the other 70% aside) as the pool for pair sampling
X_subset, _, y_subset, _ = train_test_split(X_train, y_train, test_size=0.7, random_state=42)

# Seed Python's `random` module, which create_pairs draws from, so the sampled pairs
# (and with them the inputs of the training run) are the same on every execution
random.seed(42)

# Now create pairs from the 30% subset
num_pairs = 5000  # You can change this number based on your memory constraints
pairs_train, labels_train = create_pairs(X_subset, y_subset, num_pairs)

In [None]:
# Fit the siamese model on the sampled pairs: the first and second member of every
# pair feed the two model inputs, `labels_train` is the similarity target
left_inputs = pairs_train[:, 0]
right_inputs = pairs_train[:, 1]
siamese_model.fit([left_inputs, right_inputs], labels_train, batch_size=32, epochs=10)

Epoch 1/10
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 0.5502
Epoch 2/10
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.1599
Epoch 3/10
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.1320
Epoch 4/10
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.1205
Epoch 5/10
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.1093
Epoch 6/10
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.1033
Epoch 7/10
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 0.1009
Epoch 8/10
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 0.0965
Epoch 9/10
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 0.0869
Epoch 10/10
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - lo

<keras.src.callbacks.history.History at 0x7f4418743e20>

Step 3: Fine-tuning with Supervised Learning

In [None]:
# Use the trained encoder to extract features.
# The encoder that was trained contrastively is the only nested Model among the layers
# of `siamese_model`. Calling create_encoder() again would build a new, randomly
# initialised network and throw the contrastive pre-training away.
encoder = next(layer for layer in siamese_model.layers if isinstance(layer, Model))
X_train_encoded = encoder.predict(X_train)
X_test_encoded = encoder.predict(X_test)

[1m1801/1801[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step
[1m772/772[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step


In [None]:
# Build a simple classifier on top of the encoder
# The input width is declared with an explicit Input object as the first layer. Passing
# `input_shape` to the first Dense layer of a Sequential model makes Keras emit a
# warning (the truncated output below this cell appears to be that warning).
classifier = tf.keras.Sequential([
    Input(shape=(X_train_encoded.shape[1],)),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')  # Binary classification
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy as metric
classifier.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Fit the classifier on the encoded training features; the encoded test split is
# passed as validation data and reported after every epoch
held_out = (X_test_encoded, y_test)
classifier.fit(X_train_encoded, y_train, validation_data=held_out, batch_size=32, epochs=10)

Epoch 1/10
[1m1801/1801[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.7661 - loss: 0.4774 - val_accuracy: 0.8542 - val_loss: 0.3239
Epoch 2/10
[1m1801/1801[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.8585 - loss: 0.3176 - val_accuracy: 0.8700 - val_loss: 0.2863
Epoch 3/10
[1m1801/1801[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.8718 - loss: 0.2877 - val_accuracy: 0.8796 - val_loss: 0.2706
Epoch 4/10
[1m1801/1801[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.8800 - loss: 0.2689 - val_accuracy: 0.8823 - val_loss: 0.2599
Epoch 5/10
[1m1801/1801[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.8862 - loss: 0.2521 - val_accuracy: 0.8781 - val_loss: 0.2654
Epoch 6/10
[1m1801/1801[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.8899 - loss: 0.2438 - val_accuracy: 0.8925 - val_loss: 0.2358
Epoch 7/10
[1m1

<keras.src.callbacks.history.History at 0x7f44181abf40>

In [None]:
# Predict on the encoded test set and turn the sigmoid outputs into 0/1 labels at 0.5
y_pred = classifier.predict(X_test_encoded)
y_pred_labels = np.where(y_pred > 0.5, 1, 0)

[1m772/772[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step


In [None]:
# Per-class precision / recall / F1 of the classifier on the test set
report = classification_report(y_test, y_pred_labels)
print(report)

              precision    recall  f1-score   support

           0       0.90      0.88      0.89     11147
           1       0.90      0.92      0.91     13553

    accuracy                           0.90     24700
   macro avg       0.90      0.90      0.90     24700
weighted avg       0.90      0.90      0.90     24700



In [None]:
import numpy as np
from sklearn.metrics import accuracy_score
import random

# Function to create pairs for testing
def create_pairs(X, y, num_pairs):
    """Randomly sample labelled pairs of samples for the siamese model.

    Args:
        X: array-like of samples; row i is one sample.
        y: class labels aligned with X by position (pandas Series, list or array).
        num_pairs: total number of pairs to return.

    Returns:
        (pairs, labels_pairs): `pairs` is an array of [X[i], X[j]] pairs and
        `labels_pairs` holds 1 for a same-class pair and 0 for a different-class pair.
        num_pairs // 2 pairs are positive and the remaining ones negative, so an odd
        num_pairs gets one extra negative pair.

    Raises:
        ValueError: if X and y differ in length, if pairs are requested from empty
            data, or if negative pairs are requested but y holds fewer than two classes.
    """
    samples = np.asarray(X)
    # Positional label array: independent of a Series index and also valid for lists/arrays
    class_labels = np.asarray(y)
    n = len(samples)
    if len(class_labels) != n:
        raise ValueError("X and y must have the same number of samples")

    # Quotas add up to exactly num_pairs, so the sampling loop ends for odd values too
    positive_target = max(num_pairs, 0) // 2
    negative_target = max(num_pairs, 0) - positive_target

    if positive_target + negative_target > 0 and n == 0:
        raise ValueError("cannot create pairs from an empty dataset")
    if negative_target > 0 and len(np.unique(class_labels)) < 2:
        # Rejection sampling below could never find a different-class pair
        raise ValueError("negative pairs require at least two distinct classes in y")

    pairs = []
    labels_pairs = []
    positive_pairs = 0
    negative_pairs = 0

    # Rejection sampling: draw two random rows and keep the pair while its quota is open
    while positive_pairs < positive_target or negative_pairs < negative_target:
        i = random.randint(0, n - 1)
        j = random.randint(0, n - 1)

        if class_labels[i] == class_labels[j]:
            if positive_pairs < positive_target:
                # Positive pair (same class)
                pairs.append([samples[i], samples[j]])
                labels_pairs.append(1)
                positive_pairs += 1
        elif negative_pairs < negative_target:
            # Negative pair (different classes)
            pairs.append([samples[i], samples[j]])
            labels_pairs.append(0)
            negative_pairs += 1

    return np.array(pairs), np.array(labels_pairs)

# Function to test the Siamese model on test data (no change)
def test_siamese_model(siamese_model, X_test, y_test, num_pairs):
    # Create pairs from the test set
    pairs_test, labels_test = create_pairs(X_test, y_test, num_pairs)

    # Predict the distances using the model
    distances = siamese_model.predict([pairs_test[:, 0], pairs_test[:, 1]])

    # Convert distances into binary predictions (1 for similar, 0 for different)
    threshold = 0.5  # You can tune this threshold
    predictions = (distances < threshold).astype(int)

    # Calculate accuracy by comparing with true labels
    accuracy = accuracy_score(labels_test, predictions)

    return accuracy

# Seed Python's `random` module (used by create_pairs) so the sampled test pairs, and
# with them the reported accuracy, are the same on every execution
random.seed(42)

# X_test and y_test are the test features and labels
num_test_pairs = 1000  # Specify the number of pairs to create for testing
test_accuracy = test_siamese_model(siamese_model, X_test, y_test, num_test_pairs)

print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
Test Accuracy: 89.00%


In [None]:
import numpy as np
from sklearn.metrics import accuracy_score
import random

# Function to create pairs for testing
def create_pairs(X, y, num_pairs):
    """Randomly sample labelled pairs of samples for the siamese model.

    Args:
        X: array-like of samples; row i is one sample.
        y: class labels aligned with X by position (pandas Series, list or array).
        num_pairs: total number of pairs to return.

    Returns:
        (pairs, labels_pairs): `pairs` is an array of [X[i], X[j]] pairs and
        `labels_pairs` holds 1 for a same-class pair and 0 for a different-class pair.
        num_pairs // 2 pairs are positive and the remaining ones negative, so an odd
        num_pairs gets one extra negative pair.

    Raises:
        ValueError: if X and y differ in length, if pairs are requested from empty
            data, or if negative pairs are requested but y holds fewer than two classes.
    """
    samples = np.asarray(X)
    # Positional label array: independent of a Series index and also valid for lists/arrays
    class_labels = np.asarray(y)
    n = len(samples)
    if len(class_labels) != n:
        raise ValueError("X and y must have the same number of samples")

    # Quotas add up to exactly num_pairs, so the sampling loop ends for odd values too
    positive_target = max(num_pairs, 0) // 2
    negative_target = max(num_pairs, 0) - positive_target

    if positive_target + negative_target > 0 and n == 0:
        raise ValueError("cannot create pairs from an empty dataset")
    if negative_target > 0 and len(np.unique(class_labels)) < 2:
        # Rejection sampling below could never find a different-class pair
        raise ValueError("negative pairs require at least two distinct classes in y")

    pairs = []
    labels_pairs = []
    positive_pairs = 0
    negative_pairs = 0

    # Rejection sampling: draw two random rows and keep the pair while its quota is open
    while positive_pairs < positive_target or negative_pairs < negative_target:
        i = random.randint(0, n - 1)
        j = random.randint(0, n - 1)

        if class_labels[i] == class_labels[j]:
            if positive_pairs < positive_target:
                # Positive pair (same class)
                pairs.append([samples[i], samples[j]])
                labels_pairs.append(1)
                positive_pairs += 1
        elif negative_pairs < negative_target:
            # Negative pair (different classes)
            pairs.append([samples[i], samples[j]])
            labels_pairs.append(0)
            negative_pairs += 1

    return np.array(pairs), np.array(labels_pairs)

# Function to test the Siamese model on test data and print predictions
def test_siamese_model(siamese_model, X_test, y_test, num_pairs):
    # Create pairs from the test set
    pairs_test, labels_test = create_pairs(X_test, y_test, num_pairs)

    # Predict the distances using the model
    distances = siamese_model.predict([pairs_test[:, 0], pairs_test[:, 1]])

    # Convert distances into binary predictions (1 for similar, 0 for different)
    threshold = 0.5  # You can tune this threshold
    predictions = (distances < threshold).astype(int)

    # Calculate accuracy by comparing with true labels
    accuracy = accuracy_score(labels_test, predictions)

    # Print predictions along with actual labels and vulnerability status
    print("\nPredictions:")
    for i in range(len(pairs_test)):
        # Determine vulnerability status based on predicted label
        if predictions[i] == 1 and labels_test[i] == 1:
            vulnerability_status = "Vulnerable"
        elif predictions[i] == 0 and labels_test[i] == 0:
            vulnerability_status = "Not Vulnerable"
        else:
            vulnerability_status = "Uncertain"

        print(f"Pair: {pairs_test[i]}, True Label: {labels_test[i]}, Predicted: {predictions[i]}, Distance: {distances[i]:.4f}, Status: {vulnerability_status}")

    return accuracy

# Seed Python's `random` module (used by create_pairs) so the sampled test pairs, and
# with them the reported accuracy, are the same on every execution
random.seed(42)

# X_test and y_test are the test features and labels
num_test_pairs = 1000  # Specify the number of pairs to create for testing
test_accuracy = test_siamese_model(siamese_model, X_test, y_test, num_test_pairs)

print(f"\nTest Accuracy: {test_accuracy * 100:.2f}%")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Pair: [[-0.21372915 -1.36009246 -0.6744059   0.93269533 -0.1244551  -0.15181641
  -0.04540814 -0.08736871  1.68835513  0.71944006 -0.82039474  1.12374517
  -0.26349797 -0.07353054 -0.11324391 -0.12218063 -0.09416902 -0.11217671
  -0.14721836 -1.04791956 -0.77984004 -0.77675409 -1.00624379 -0.48202491
  -0.41290971 -0.48407269 -0.18961219 -0.47537059 -0.17364821 -0.04190986
  -0.50013466  0.59102107 -0.44486766 -0.34911492 -0.28113479 -0.30278779
  -0.09085748 -0.09061736 -0.20314282  0.53039637 -0.46435583 -0.10607007]
 [-0.21351706  0.41056274 -0.6744059  -2.05074116 -0.1244551  -0.13451139
  -0.04581597 -0.0862992  -0.53436389 -1.47732852 -0.57182405 -0.35599795
   0.00675293 -0.07353054 -0.11324391 -0.1221795  -0.09416825 -0.11217671
  -0.14721836 -1.04791956 -0.77984004 -0.77675409 -1.00624379 -0.48202491
  -0.41290971 -0.48407269 -0.35750141 -0.14421604 -0.17364821 -0.04190986
   0.22122156 -1.2830742  -0.44486766 -0