In [1]:
import torch
import numpy as np
import pandas as pd
import torch.nn.functional as F
import os

In [2]:
def load_csv_data(input_folder: str,
                  file_name: str):
    """
    Load a single CSV file from disk into a DataFrame.

    Parameters:
      input_folder: directory that contains the CSV file.
      file_name: name of the CSV file inside ``input_folder``.

    Returns:
      pandas.DataFrame holding the parsed contents of the file.
    """
    # Join folder and file name in one step and hand the path straight to pandas.
    return pd.read_csv(os.path.join(input_folder, file_name))

def extract_features_labels(df: pd.DataFrame):
    """
    Separate a DataFrame into features and labels.

    Every column except the last one is treated as a feature; the last
    column is treated as the label.

    Returns:
      (X, y): the feature columns and the label column, each taken via
      ``.values``.
    """
    label_column = df.iloc[:, -1]
    feature_columns = df.iloc[:, :-1]
    return feature_columns.values, label_column.values


# Load data
# The input folder can be overridden with the LOGIKNET_INPUT_DIR environment
# variable so the notebook is not tied to one machine's directory layout;
# when the variable is unset, the original location is used unchanged.
input_folder = os.environ.get(
    'LOGIKNET_INPUT_DIR',
    '/home/zyang44/Github/baseline_cicIOT/P1_structurelevel/efficiency/input_files',
)
test_fname = 'logiKNet_test_3994.csv'

test_df = load_csv_data(input_folder, test_fname)

In [None]:
# label_L1_mapping = {"MQTT": 0, "Benign": 1}
# label_L2_mapping = {"MQTT-DDoS-Connect_Flood": 0, "MQTT-DDoS-Publish_Flood": 1,
#                     "MQTT-DoS-Connect_Flood": 2, "MQTT-DoS-Publish_Flood": 3,
#                     "MQTT-Malformed_Data": 4, "Benign": 5}

BENIGN_LABEL = 5          # label_L2 code of the benign class (see mapping above)
ANOMALIES_PER_CLASS = 5   # rows to draw from every non-benign class
SAMPLING_SEED = 42        # fixed seed so the stratified draw is repeatable

# The label is the last column of the frame.
test_labels = test_df.iloc[:, -1]

# Count how many rows in each class
class_counts = test_labels.value_counts()
print("Class counts in the test set:")
print(class_counts)

# Get whole indices of the benign class
benign_indices = test_labels[test_labels == BENIGN_LABEL].index.tolist()

# Stratified draw: ANOMALIES_PER_CLASS index labels from each other class
# (or every row of the class when it has fewer than that).
other_classes = test_labels[test_labels != BENIGN_LABEL].unique()
sampling_rng = np.random.RandomState(SAMPLING_SEED)
stratified_indices = []
for cls in other_classes:
    cls_indices = test_labels[test_labels == cls].index.tolist()
    if len(cls_indices) >= ANOMALIES_PER_CLASS:
        drawn = sampling_rng.choice(cls_indices, ANOMALIES_PER_CLASS, replace=False)
        stratified_indices.extend(drawn.tolist())
    else:
        stratified_indices.extend(cls_indices)

print(len(benign_indices), "benign indices")
print(len(stratified_indices), "stratified indices from other classes")

# The collected values are index *labels*, so look the row up with .loc;
# skip the preview when there is no non-benign class at all.
if stratified_indices:
    print(test_df.loc[stratified_indices[0], :])


Class counts in the test set:
label_L2
5    703
0    684
1    673
3    650
4    645
2    639
Name: count, dtype: int64
703 benign indices
25 stratified indices from other classes
Header_Length   -0.307076
Protocol Type   -0.139918
Duration        -0.234570
Rate            -0.090659
Srate           -0.090659
IPv              0.103078
LLC              0.103078
Tot sum         -0.527501
Min             -0.338572
Max             -0.752721
AVG             -0.574588
Std             -0.650680
Tot size        -0.572762
IAT              0.002025
Number           0.003347
Magnitue        -0.664116
Radius          -0.649943
Covariance      -0.543391
label_L2         2.000000
Name: 1379, dtype: float64


In [None]:
def generate_poisson_stream(benign_samples, anomaly_samples, seed=42):
    """
    Interleave anomaly_samples into benign_samples according to a Poisson-like process.

    Both pools are shuffled, then anomalies are placed one at a time; before
    each anomaly, ``gap - 1`` benign samples are emitted, where ``gap`` is
    drawn from Geometric(p).

    Parameters
    ----------
    benign_samples : array-like
        Collection of benign items (e.g. feature vectors, indices, etc.).
    anomaly_samples : array-like
        Collection of anomaly items (intended to be "rare", i.e. much shorter
        than benign_samples).
    seed : int, optional
        Random seed for reproducibility (default=42).

    Returns
    -------
    stream : list
        A single list containing every benign and every anomaly sample
        exactly once, all converted to plain Python objects via ``tolist()``.
        Gaps between anomalies follow Geometric(p) with
        p = min(1, len(anomaly_samples) / len(benign_samples)).
        If the benign pool is exhausted before all anomalies are placed, the
        remaining anomalies are appended consecutively at the end of the
        stream. If the benign pool is empty, the stream is just the shuffled
        anomalies.
    """
    rng = np.random.RandomState(seed)

    B = np.array(benign_samples)
    A = np.array(anomaly_samples)
    n_benign = len(B)
    n_anom = len(A)

    # shuffle each pool (benign first, then anomalies, then the gap draws)
    benign_shuf = rng.permutation(B)
    anom_shuf = rng.permutation(A)

    # overall "success" probability per benign sample; clamp to 1 because
    # rng.geometric rejects p > 1, and avoid dividing by an empty benign pool
    p = min(1.0, n_anom / n_benign) if n_benign else 1.0

    stream = []
    b_ptr = 0
    a_ptr = 0

    # place anomalies one by one while benign samples remain
    while a_ptr < n_anom and b_ptr < n_benign:
        # draw gap ~ Geometric(p): number of trials until success
        gap = rng.geometric(p)
        # take (gap - 1) benign samples before the next anomaly
        take = min(gap - 1, n_benign - b_ptr)
        if take > 0:
            stream.extend(benign_shuf[b_ptr:b_ptr + take].tolist())
            b_ptr += take
        # then place one anomaly; slicing + tolist() yields the same plain
        # Python representation that is used for the benign samples
        stream.extend(anom_shuf[a_ptr:a_ptr + 1].tolist())
        a_ptr += 1

    # Flush whatever is left. At most one of these is non-empty: leftover
    # benigns when all anomalies were placed, or leftover anomalies when the
    # benign pool ran out first (these would otherwise be lost).
    stream.extend(benign_shuf[b_ptr:].tolist())
    stream.extend(anom_shuf[a_ptr:].tolist())

    return stream

# ── Example usage ──
# benign = list(range(703))         # e.g. your 703 benign indices
# anomalies = list(range(703,728))  # e.g. your 25 anomaly indices
# stream = generate_poisson_stream(benign, anomalies, seed=0)
# print(len(stream))  # → 728


In [None]:
# IF THE TEST SET IS READ SEQUENTIALLY,
# shuffling it scatters the benign class (label_L2 = 5) randomly through the dataset.
#
# NOTE: this cell rebinds `test_df`, and reset_index(drop=True) discards the
# previous index. Index lists collected from `test_df` before this point
# (e.g. `benign_indices`, `stratified_indices`) therefore no longer refer to
# the same rows, and re-running the cell shuffles the already-shuffled frame.

# Shuffle the test set (fixed random_state so the permutation is repeatable)
test_df = test_df.sample(frac=1, random_state=42).reset_index(drop=True)


Number of benign samples in the test set: 0
Percentage of benign samples in the test set: 0.00%
Distribution of benign samples in the test set: []


In [5]:
class DataLoader(object):
    """
    Minimal mini-batch iterator over two aligned, index-able containers.

    Parameters
    ----------
    data : object exposing ``.shape[0]`` and list-based indexing
        Samples; the first axis is the sample axis.
    labels : object supporting ``len()`` and list-based indexing
        One label per sample, aligned with ``data``.
    batch_size : int, optional
        Number of samples per batch (default=1). The last batch may be smaller.
    shuffle : bool, optional
        If True (default), the sample order is re-drawn with
        ``np.random.shuffle`` each time iteration starts.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1, or if ``data`` and ``labels``
        do not contain the same number of samples.
    """

    def __init__(self,
                 data,
                 labels,
                 batch_size=1,
                 shuffle=True):
        # Fail fast here instead of with an obscure error (or silently
        # misaligned batches) once iteration starts.
        if batch_size < 1:
            raise ValueError(f"batch_size must be >= 1, got {batch_size}")
        if len(labels) != data.shape[0]:
            raise ValueError(
                f"data has {data.shape[0]} samples but labels has {len(labels)}"
            )
        self.data = data
        self.labels = labels
        self.batch_size = batch_size
        self.shuffle = shuffle

    def __len__(self):
        """Return the number of batches, counting a final partial batch."""
        return int(np.ceil(self.data.shape[0] / self.batch_size))

    def __iter__(self):
        """Yield ``(data_batch, label_batch)`` pairs covering every sample once."""
        n = self.data.shape[0]
        idxlist = list(range(n))
        if self.shuffle:
            np.random.shuffle(idxlist)

        for start_idx in range(0, n, self.batch_size):
            # List slicing clamps at the end, so the last batch is simply shorter.
            batch_idx = idxlist[start_idx:start_idx + self.batch_size]
            yield self.data[batch_idx], self.labels[batch_idx]

# Split the test DataFrame into a feature array and a label vector
X_test, y_test = extract_features_labels(test_df)

# Convert both to tensors: float32 features, integer (long) labels
test_features = torch.tensor(X_test, dtype=torch.float32)
test_targets = torch.tensor(y_test, dtype=torch.long)

# Wrap them in a DataLoader; shuffle=False keeps the rows in DataFrame order
test_loader = DataLoader(data=test_features, labels=test_targets, shuffle=False)


In [6]:
# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(f"Using device: {device}")

Using device: cpu


  return torch._C._cuda_getDeviceCount() > 0
