In [1]:
import torch
import numpy as np
import pandas as pd
import torch.nn.functional as F
import os

In [3]:
def load_csv_data(input_folder: str,
                  file_name: str) -> pd.DataFrame:
    """
    Load a single CSV file from disk into a DataFrame.

    Args:
      input_folder: directory that contains the file.
      file_name: name of the CSV file inside `input_folder`.

    Returns:
      pandas.DataFrame with the parsed contents of the file.
    """
    return pd.read_csv(os.path.join(input_folder, file_name))

def extract_features_labels(df: pd.DataFrame):
    """
    Separate a DataFrame into features and labels.

    Every column except the last is treated as a feature; the last
    column is treated as the label.

    Returns:
      (X, y): the feature columns and the label column, each converted
      with `.values`.
    """
    feature_columns = df.iloc[:, :-1]
    label_column = df.iloc[:, -1]
    return feature_columns.values, label_column.values


# Load data
# The default below is the original author's local directory. Set the
# LOGIKNET_INPUT_DIR environment variable to point at a different location
# without editing the notebook.
input_folder = os.environ.get(
    'LOGIKNET_INPUT_DIR',
    '/home/zyang44/Github/baseline_cicIOT/P1_structurelevel/efficiency/input_files',
)
test_fname = 'logiKNet_test_3994.csv'

test_df = load_csv_data(input_folder, test_fname)

In [None]:
# Label encodings, kept for reference only (nothing below reads them):
#   label_L1_mapping = {"MQTT": 0, "Benign": 1}
#   label_L2_mapping = {"MQTT-DDoS-Connect_Flood": 0, "MQTT-DDoS-Publish_Flood": 1,
#                       "MQTT-DoS-Connect_Flood": 2, "MQTT-DoS-Publish_Flood": 3,
#                       "MQTT-Malformed_Data": 4, "Benign": 5}

# Tally the rows per class; the last column of the frame holds the label.
class_counts = test_df.iloc[:, -1].value_counts()
print("Class counts in the test set:", class_counts, sep="\n")

Class counts in the test set:
label_L2
5    703
0    684
1    673
3    650
4    645
2    639
Name: count, dtype: int64


In [None]:
# If the test set is read sequentially (no shuffling at load time), shuffling
# the rows here scatters the benign class (label_L2 = 5) randomly among the
# other classes.

# Shuffle every row with a fixed seed, then renumber the index from 0.
test_df = test_df.sample(frac=1, random_state=42)
test_df = test_df.reset_index(drop=True)


Number of benign samples in the test set: 0
Percentage of benign samples in the test set: 0.00%
Distribution of benign samples in the test set: []


In [None]:
class DataLoader(object):
    """
    Minimal mini-batch iterator over two index-aligned containers.

    Args:
        data: samples; must expose `.shape[0]` and accept indexing with a
            list of row indices (e.g. a torch tensor or numpy array).
        labels: one label per sample, indexed the same way as `data`.
        batch_size: number of samples per batch; the final batch may be
            smaller.
        shuffle: if True, a fresh random order is drawn from numpy's global
            RNG each time the loader is iterated.

    Raises:
        ValueError: if `batch_size` is smaller than 1, or if `data` and
            `labels` do not contain the same number of samples.
    """

    def __init__(self,
                 data,
                 labels,
                 batch_size=1,
                 shuffle=True):
        # Fail fast: batch_size == 0 would divide by zero in __len__, and a
        # negative value would make __iter__ silently yield nothing.
        if batch_size < 1:
            raise ValueError(f"batch_size must be >= 1, got {batch_size}")
        # Mismatched lengths would otherwise pair samples with the wrong
        # labels or fail in the middle of iteration.
        if len(labels) != data.shape[0]:
            raise ValueError(
                f"data and labels must have the same number of samples, "
                f"got {data.shape[0]} and {len(labels)}"
            )
        self.data = data
        self.labels = labels
        self.batch_size = batch_size
        self.shuffle = shuffle

    def __len__(self):
        """Return the number of batches per pass, counting a partial last batch."""
        return int(np.ceil(self.data.shape[0] / self.batch_size))

    def __iter__(self):
        """Yield `(data, labels)` batches covering every sample exactly once."""
        n = self.data.shape[0]
        idxlist = list(range(n))
        if self.shuffle:
            np.random.shuffle(idxlist)

        for start_idx in range(0, n, self.batch_size):
            # Slicing clamps at the end of the list, so the last batch is
            # simply shorter when n is not a multiple of batch_size.
            batch_idx = idxlist[start_idx:start_idx + self.batch_size]
            yield self.data[batch_idx], self.labels[batch_idx]

# Split the test frame into a feature matrix and a label vector.
X_test, y_test = extract_features_labels(test_df)

# Wrap both as tensors (float32 features, int64 labels) and batch them.
test_loader = DataLoader(
    torch.tensor(X_test, dtype=torch.float32),
    torch.tensor(y_test, dtype=torch.long),
    batch_size=64,
    shuffle=False,  # iterate in the row order of test_df
)
