## Load and preprocess data

In [2]:
import pandas as pd

# Read each capture CSV, in the order listed, and bind every frame to its own name.
(
    monday_data,
    tuesday_data,
    wednesday_data,
    thursday_web_attacks_data,
    thursday_infiltration_data,
    friday_morning_data,
    friday_port_scan_data,
    friday_ddos_data,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/Monday-WorkingHours.pcap_ISCX.csv',
        'data/Tuesday-WorkingHours.pcap_ISCX.csv',
        'data/Wednesday-workingHours.pcap_ISCX.csv',
        'data/Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv',
        'data/Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv',
        'data/Friday-WorkingHours-Morning.pcap_ISCX.csv',
        'data/Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv',
        'data/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv',
    )
)

In [3]:
# Stack the per-capture frames, in the order listed, under a fresh 0..n-1 index.
capture_frames = [
    monday_data,
    tuesday_data,
    wednesday_data,
    thursday_web_attacks_data,
    thursday_infiltration_data,
    friday_morning_data,
    friday_port_scan_data,
    friday_ddos_data,
]
full_data = pd.concat(capture_frames, ignore_index=True)

print(full_data.shape)
full_data.head()

(2830743, 79)


Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,49188,4,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
1,49188,1,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
2,49188,1,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
3,49188,1,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
4,49486,3,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN


In [4]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
import numpy as np

# Turn +/-inf into NaN and then drop every row that has a missing value.
# np.nan is used as the replacement (not None): depending on the pandas version,
# `value=None` is either read as "no value given" (falling back to forward-fill)
# or inserts Python None objects, which can turn numeric columns into object dtype.
full_data = full_data.replace([np.inf, -np.inf], np.nan).dropna()

# Strip stray whitespace from the header names before 'Label' is looked up below.
full_data.columns = full_data.columns.str.strip()

label_encoder = LabelEncoder()

# Encode the class labels as integers; the fitted encoder keeps the mapping
# (label_encoder.classes_) so predictions can be translated back later.
full_data['Label'] = label_encoder.fit_transform(full_data['Label'])

In [5]:
# Every column except the encoded target is a model input. Index.difference
# returns the names sorted, so this also fixes the feature order.
feature_columns = full_data.columns.difference(['Label'])

# NOTE(review): the scaler is fitted on all rows, before the train/test split
# made later in the notebook, so test-set statistics influence the scaling.
# Consider fitting on the training rows only.
scaler = StandardScaler().fit(full_data[feature_columns])

# Overwrite the feature columns with their standardised values.
full_data[feature_columns] = scaler.transform(full_data[feature_columns])

full_data.head()

Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,2.250506,-0.439612,-0.009825,-0.010426,-0.053793,-0.007145,-0.281253,-0.210906,-0.280686,-0.245193,...,0.002547,-0.125799,-0.104619,-0.149404,-0.101068,-0.352126,-0.109516,-0.357072,-0.339184,0
1,2.250506,-0.439612,-0.009825,-0.010426,-0.053793,-0.007145,-0.281253,-0.210906,-0.280686,-0.245193,...,0.002547,-0.125799,-0.104619,-0.149404,-0.101068,-0.352126,-0.109516,-0.357072,-0.339184,0
2,2.250506,-0.439612,-0.009825,-0.010426,-0.053793,-0.007145,-0.281253,-0.210906,-0.280686,-0.245193,...,0.002547,-0.125799,-0.104619,-0.149404,-0.101068,-0.352126,-0.109516,-0.357072,-0.339184,0
3,2.250506,-0.439612,-0.009825,-0.010426,-0.053793,-0.007145,-0.281253,-0.210906,-0.280686,-0.245193,...,0.002547,-0.125799,-0.104619,-0.149404,-0.101068,-0.352126,-0.109516,-0.357072,-0.339184,0
4,2.266813,-0.439612,-0.009825,-0.010426,-0.053793,-0.007145,-0.281253,-0.210906,-0.280686,-0.245193,...,0.002547,-0.125799,-0.104619,-0.149404,-0.101068,-0.352126,-0.109516,-0.357072,-0.339184,0


In [6]:
from sklearn.model_selection import train_test_split

# Model inputs and integer-encoded target.
X, y = full_data[feature_columns], full_data['Label']

# Hold out 20% of the rows; stratifying on y keeps the class proportions the
# same in both parts, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print(f"Training set size: {X_train.shape}")
print(f"Test set size: {X_test.shape}")

Training set size: (2262300, 78)
Test set size: (565576, 78)


#### Hybrid Autoencoder

In [34]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

from qiskit import QuantumCircuit
from qiskit.circuit.library import ZZFeatureMap, RealAmplitudes
from qiskit_aer import AerSimulator  # GPU-enabled simulator
from qiskit.primitives import StatevectorSampler
from qiskit_machine_learning.neural_networks import SamplerQNN
from qiskit_machine_learning.connectors import TorchConnector
import numpy as np

# Prefer the CUDA device when PyTorch can see one; otherwise fall back to the CPU
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
print(f'Using device: {device}')
from qiskit.primitives import Sampler  # Import the Sampler class

class HybridAutoencoder(nn.Module):
    """Autoencoder with a classical encoder/decoder around a quantum bottleneck.

    Data flow: ``input_dim`` features -> encoder -> ``encoding_dim`` values ->
    quantum layer -> decoder -> ``input_dim`` reconstructed features.

    Args:
        input_dim: Number of features per sample.
        encoding_dim: Width of the encoder output; also used as the qubit
            count of the bottleneck circuit.
    """

    def __init__(self, input_dim, encoding_dim=4):
        super().__init__()
        self.input_dim = input_dim
        self.encoding_dim = encoding_dim

        # Classical Encoder
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, encoding_dim)
        )

        # Quantum Layer
        self.quantum_layer = self.create_quantum_bottleneck()

        # Classical Decoder. Its input width is 2 ** encoding_dim to match the
        # quantum layer's output (presumably one value per basis state of the
        # encoding_dim-qubit circuit; confirm against SamplerQNN's output shape).
        self.decoder = nn.Sequential(
            nn.Linear(2 ** encoding_dim, 32),
            nn.ReLU(),
            nn.Linear(32, 64),
            nn.ReLU(),
            nn.Linear(64, input_dim)
        )

    def forward(self, x):
        """Encode ``x``, pass the code through the quantum layer and decode it.

        Args:
            x: Float tensor whose last dimension is ``input_dim``.

        Returns:
            tuple: ``(encoded, decoded)`` - the encoder output and the
            reconstruction produced by the decoder.
        """
        # Follow the module's own placement rather than a notebook-level global,
        # so the model works on whichever device it has been moved to.
        x = x.to(next(self.encoder.parameters()).device)
        encoded = self.encoder(x)
        # Hand the encoder output to the quantum layer directly. Detaching it
        # (for example through a NumPy round-trip) would cut the autograd graph,
        # and the reconstruction loss could then never update the encoder.
        quantum_encoded = self.quantum_layer(encoded)
        decoded = self.decoder(quantum_encoded)
        return encoded, decoded

    def create_quantum_bottleneck(self):
        """Build the quantum bottleneck as a torch module.

        The circuit is a ZZFeatureMap (bound to the layer inputs) followed by a
        single-repetition RealAmplitudes ansatz (the trainable weights), wrapped
        in a SamplerQNN and exposed to PyTorch through TorchConnector.

        Returns:
            TorchConnector: the layer; it is placed on a device together with
            the rest of the model when the model itself is moved.
        """
        # Use Sampler instead of StatevectorSampler
        # NOTE(review): this line triggers a warning in the recorded notebook
        # output, and it is unclear whether the sampler honours a "backend"
        # entry in its options. Confirm the simulator is actually used.
        sampler = Sampler(options={"backend": AerSimulator(method='statevector')})

        feature_map = ZZFeatureMap(self.encoding_dim)
        ansatz = RealAmplitudes(self.encoding_dim, reps=1)

        circuit = QuantumCircuit(self.encoding_dim)
        circuit.compose(feature_map, inplace=True)
        circuit.compose(ansatz, inplace=True)

        qnn = SamplerQNN(
            circuit=circuit,
            input_params=feature_map.parameters,
            weight_params=ansatz.parameters,
            sampler=sampler,
            # Needed so gradients w.r.t. the circuit inputs are produced and
            # backpropagation can continue into the encoder.
            input_gradients=True
        )

        return TorchConnector(qnn)

    def print_model_structure(self):
        """Print the encoder, quantum layer and decoder modules."""
        print("Model Structure:")
        print("\nEncoder:")
        print(self.encoder)
        print("\nQuantum Circuit:")
        print(self.quantum_layer)
        print("\nDecoder:")
        print(self.decoder)



Using device: cuda


#### Train the Autoencoder

In [None]:
# Training features as a plain NumPy matrix
X_train_numpy = X_train.to_numpy()

# Build the hybrid autoencoder; encoding_dim is also the qubit count of the
# quantum bottleneck, so the two must stay in sync
input_dim = X_train_numpy.shape[1]
encoding_dim = 4
hybrid_autoencoder = HybridAutoencoder(input_dim=input_dim, encoding_dim=encoding_dim)
hybrid_autoencoder = hybrid_autoencoder.to(device)

# Show the three stages of the model
hybrid_autoencoder.print_model_structure()

# Reconstruction objective and optimiser
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(hybrid_autoencoder.parameters(), lr=1e-3)

# Shuffled mini-batches over the training features (inputs double as targets)
batch_size = 32
train_dataset = TensorDataset(torch.tensor(X_train_numpy, dtype=torch.float32))
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# NOTE(review): every batch is pushed through the simulated quantum layer, so a
# full pass over the training set is likely to be very slow. Consider
# training on a subsample first.
num_epochs = 50
for epoch in range(num_epochs):
    total_loss = 0
    for batch in train_loader:
        # Each batch is a one-element sequence holding the feature tensor
        inputs = batch[0].to(device)

        # Clear gradients left over from the previous step
        optimizer.zero_grad()

        # Reconstruct the inputs and score the reconstruction
        encoded, decoded = hybrid_autoencoder(inputs)
        loss = criterion(decoded, inputs)

        # Backpropagate and apply the update
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean batch loss for this epoch; reported on every tenth epoch only
    avg_loss = total_loss / len(train_loader)
    if epoch % 10 != 0:
        continue
    print(f'Epoch {epoch}/{num_epochs}, Average Loss: {avg_loss:.4f}')

  sampler = Sampler(options={"backend": AerSimulator(method='statevector')})


Model Structure:

Encoder:
Sequential(
  (0): Linear(in_features=78, out_features=64, bias=True)
  (1): ReLU()
  (2): Linear(in_features=64, out_features=32, bias=True)
  (3): ReLU()
  (4): Linear(in_features=32, out_features=4, bias=True)
)

Quantum Circuit:
TorchConnector()

Decoder:
Sequential(
  (0): Linear(in_features=16, out_features=32, bias=True)
  (1): ReLU()
  (2): Linear(in_features=32, out_features=64, bias=True)
  (3): ReLU()
  (4): Linear(in_features=64, out_features=78, bias=True)
)


In [1]:
# Extract latent space representation
def encode_latent(model, features, batch_size=65536):
    """Run ``features`` through ``model.encoder`` and return a NumPy array.

    Args:
        model: Object exposing an ``encoder`` torch module.
        features: DataFrame or array-like of shape (n_samples, n_features).
        batch_size: Rows encoded per forward pass; bounds peak device memory.

    Returns:
        numpy.ndarray: encoder outputs, one row per input row.
    """
    encoder_device = next(model.encoder.parameters()).device
    # torch.tensor() cannot consume a DataFrame, so convert through NumPy first.
    values = torch.from_numpy(np.ascontiguousarray(features, dtype=np.float32))
    codes = []
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        # max(..., 1) keeps one (empty) pass so an empty input still returns
        # an array with the right number of columns.
        for start in range(0, max(len(values), 1), batch_size):
            # Inputs must live on the same device as the encoder weights.
            chunk = values[start:start + batch_size].to(encoder_device)
            # Move results back to the CPU before the NumPy conversion.
            codes.append(model.encoder(chunk).cpu())
    return torch.cat(codes).numpy()


latent_train = encode_latent(hybrid_autoencoder, X_train)
latent_test = encode_latent(hybrid_autoencoder, X_test)

NameError: name 'hybrid_autoencoder' is not defined

#### QSVM for Anomaly Detection

In [None]:
from qiskit_machine_learning.kernels import FidelityQuantumKernel
from qiskit.circuit.library import ZZFeatureMap
from qiskit_machine_learning.algorithms import QSVC

# Create a ZZ Feature Map with one qubit per latent feature and a depth of 2.
# The width is read from the data so it cannot drift from the encoder's output.
feature_map = ZZFeatureMap(feature_dimension=latent_train.shape[1], reps=2, entanglement='linear')

# Define the quantum kernel using the feature map. FidelityQuantumKernel replaces
# the legacy QuantumKernel/quantum_instance API (and the never-imported `Aer`),
# which is not available alongside the primitives-based Qiskit used above.
quantum_kernel = FidelityQuantumKernel(feature_map=feature_map)

# Initialize QSVC with the quantum kernel
qsvc = QSVC(quantum_kernel=quantum_kernel)

# Fit the QSVC model on the latent space from the hybrid autoencoder
# NOTE(review): a kernel SVM needs kernel values between pairs of samples, so the
# cost grows roughly quadratically with the number of rows. Fitting on the
# full training set is presumably infeasible; consider a stratified subsample.
qsvc.fit(latent_train, y_train)

# Predict on the test set using the QSVC
y_pred_svm = qsvc.predict(latent_test)

#### Evaluation

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Score the QSVC predictions. Precision, recall and F1 are averaged over the
# classes with weights proportional to each class's support.
averaging = {'average': 'weighted'}
accuracy_svm = accuracy_score(y_test, y_pred_svm)
precision_svm, recall_svm, f1_svm = (
    metric(y_test, y_pred_svm, **averaging)
    for metric in (precision_score, recall_score, f1_score)
)

print(f"SVM - Accuracy: {accuracy_svm}, Precision: {precision_svm}, Recall: {recall_svm}, F1: {f1_svm}")