# Structure
1. Dependecies
2. Model
3. Data Preparation

# Dependencies

In [1]:
import os
import glob
import torch
import requests
import pandas as pd

from transformers import RobertaTokenizer, RobertaModel
from config import config

  from .autonotebook import tqdm as notebook_tqdm
2025-07-29 10:06:53.841390: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753801617.603958    2879 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753801617.891324    2879 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1753801623.670846    2879 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1753801623.670904    2879 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1753801623.670908    2879

# RoBERTa

In [2]:
model_name = 'roberta-base'
tokenizer = RobertaTokenizer.from_pretrained(model_name)
model = RobertaModel.from_pretrained(model_name)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Test

In [3]:
text = "Replace me by any text you'd like."
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)

In [4]:
output

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.1146,  0.1103, -0.0149,  ..., -0.0809, -0.0018, -0.0271],
         [-0.0225,  0.1612,  0.0556,  ...,  0.5366,  0.1196,  0.1576],
         [ 0.0532, -0.0020,  0.0370,  ..., -0.4887,  0.1641,  0.2736],
         ...,
         [-0.1586,  0.0837,  0.1302,  ...,  0.3970,  0.1715, -0.0848],
         [-0.1065,  0.1044, -0.0383,  ..., -0.1068, -0.0015, -0.0517],
         [ 0.0059,  0.0758,  0.1228,  ...,  0.1037,  0.0075,  0.0976]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[-6.2804e-02, -8.7860e-02, -3.1384e-01,  3.8667e-02,  1.8623e-01,
          1.0576e-01,  3.2486e-01,  1.1789e-01, -2.6375e-01, -3.5515e-01,
          3.7771e-01, -1.5325e-01, -3.4446e-01, -4.1108e-01, -1.1949e-01,
         -3.5237e-01, -3.8675e-01,  1.8939e-01,  1.5224e-01,  2.3866e-01,
          1.1605e-01, -2.8356e-02,  2.8580e-01,  1.3295e-01,  2.6168e-01,
          5.8053e-01,  1.9444e-01,  4.4226e-02,  2.8255e-01,  9.038

# Load The Data

In [5]:
from data_loader import load_and_prepare_data

In [6]:
DATA_DIR = config['data_dir']
MODEL_NAME = "roberta-base"
OUTPUT_DIR = config['output']
LOGGING_DIR = config['logs']
NUM_EPOCHS = 10 #3
BATCH_SIZE = 16
LEARNING_RATE = 5e-5
MAX_SEQ_LENGTH = 128
CLASS_CONFIG = 19 # Choose 19, 6, or 2 based on your experiment
RANDOM_STATE = 42

In [7]:
ATTACK_CATEGORIES = {
    'Benign': 'Benign',
    'ARP_Spoofing': 'Spoofing',
    'Recon-Ping_Sweep': 'Recon-Ping_Sweep',
    'Recon_VulScan': 'Recon_VulScan',
    'Recon-OS_Scan': 'Recon-OS_Scan',
    'Recon-Port_Scan': 'Recon-Port_Scan',
    'MQTT-Malformed_Data': 'MQTT-Malformed_Data',
    'MQTT-DoS_Connect_Flood': 'MQTT-DoS_Connect_Flood',
    'MQTT-DDoS_Publish_Flood': 'MQTT-DDoS_Publish_Flood',
    'MQTT-DoS_Publish_Flood': 'MQTT-DoS_Publish_Flood',
    'MQTT-DDoS_Connect_Flood': 'MQTT-DDoS_Connect_Flood',
    'DoS_TCP': 'DoS_TCP',
    'DoS_ICMP': 'DoS_ICMP',
    'DoS_SYN': 'DoS_SYN',
    'DoS_UDP': 'DoS_UDP',
    'DDoS_TCP': 'DDoS_TCP',
    'DDoS_ICMP': 'DDoS_ICMP',
    'DDoS_SYN': 'DDoS_SYN',
    'DDoS_UDP': 'DDoS_UDP'
}

# Load Data Together

In [8]:
print(f'Loading and preparing datasets')

Loading and preparing datasets


In [9]:
train_path = os.path.join(DATA_DIR, "train")
test_path = os.path.join(DATA_DIR, "test")

In [10]:
file_list = pd.read_csv(DATA_DIR + 'file_list.csv')

In [11]:
file_list.head()

Unnamed: 0,File,Category,Attack,Class
0,ARP_Spoofing_train.pcap.csv,SPOOFING,ARP Spoofing,1
1,Benign_train.pcap.csv,BENIGN,Benign,0
2,MQTT-DDoS-Connect_Flood_train.pcap.csv,MQTT,DDoS Connect Flood,1
3,MQTT-DDoS-Publish_Flood_train.pcap.csv,MQTT,DDoS Publish Flood,1
4,MQTT-DoS-Connect_Flood_train.pcap.csv,MQTT,DoS Connect Flood,1


In [12]:
if not os.path.exists(train_path) or not os.path.isdir(train_path):
    raise FileNotFoundError(f'Training directory not found or is not a directory: {train_path}.')
if not os.path.exists(test_path) or not os.path.isdir(test_path):
    raise FileNotFoundError(f'Testing directory not found or is not a directory: {test_path}.')

In [13]:
train_files = [os.path.join(train_path, f) for f in os.listdir(train_path) if f.endswith('.csv')]
test_files = [os.path.join(test_path, f) for f in os.listdir(test_path) if f.endswith('.csv')]

In [14]:
if not train_files:
    raise FileNotFoundError(f'No CSV files found in training directory: {train_path}')
if not test_files:
    raise FileNotFoundError(f'No CSV files found in testing directory: {test_path}')

In [15]:
df_list_train = [pd.read_csv(f).assign(filename=os.path.basename(f)) for f in train_files]
df_list_test = [pd.read_csv(f).assign(filename=os.path.basename(f)) for f in test_files]

In [16]:
train_df_full = pd.concat(df_list_train, ignore_index=True)
test_df_full = pd.concat(df_list_test, ignore_index=True)

In [17]:
train_df_full.head()

Unnamed: 0,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,psh_flag_number,ack_flag_number,ece_flag_number,cwr_flag_number,ack_count,syn_count,fin_count,rst_count,HTTP,HTTPS,DNS,Telnet,SMTP,SSH,IRC,TCP,UDP,DHCP,ARP,ICMP,IGMP,IPv,LLC,Tot sum,Min,Max,AVG,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,filename
0,114.6,5.9,51.2,30655.032896,30655.032896,0.0,0.0,0.0,0.0,0.5,0.7,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.7,0.1,0.0,0.2,0.0,0.0,0.8,0.8,568.0,42.0,181.4,95.151429,54.909851,99.6,169410400.0,5.5,13.639579,77.654256,3685.081162,0.9,38.5,Recon-OS_Scan_train.pcap.csv
1,129.0,6.0,64.0,90366.618129,90366.618129,0.0,0.0,0.0,0.0,0.5,1.0,0.0,0.0,0.0,0.0,0.0,1.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1438.2,42.0,214.0,93.121933,49.319359,81.2,169410400.0,13.5,13.647331,69.908824,2457.350159,1.0,244.6,Recon-OS_Scan_train.pcap.csv
2,321.1,7.1,100.6,13324.032213,13324.032213,0.0,0.0,0.0,0.0,0.4,0.9,0.0,0.0,0.0,0.0,0.0,2.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.9,0.1,0.0,0.0,0.0,0.0,1.0,1.0,617.2,63.6,187.5,102.235437,48.063324,119.4,0.0883225,5.5,14.179865,67.971805,3322.481708,0.7,38.5,Recon-OS_Scan_train.pcap.csv
3,292.6,6.0,80.8,3.897624,3.897624,0.0,0.0,0.0,0.0,0.6,1.0,0.0,0.0,0.0,0.0,0.0,3.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1693.4,54.0,246.0,110.225159,59.841524,87.5,169410400.0,13.5,14.849839,84.803609,3609.775298,1.0,244.6,Recon-OS_Scan_train.pcap.csv
4,483.6,7.1,69.9,8.191751,8.191751,0.0,0.0,0.0,0.0,0.4,0.9,0.0,0.0,0.0,0.0,0.0,4.1,0.0,0.3,0.0,0.0,0.0,0.0,0.0,0.9,0.1,0.0,0.0,0.0,0.0,1.0,1.0,913.8,66.0,590.4,144.389087,180.179746,160.9,0.0821888,5.5,16.531551,254.81264,53155.729481,0.9,38.5,Recon-OS_Scan_train.pcap.csv


In [18]:
test_df_full.head()

Unnamed: 0,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,psh_flag_number,ack_flag_number,ece_flag_number,cwr_flag_number,ack_count,syn_count,fin_count,rst_count,HTTP,HTTPS,DNS,Telnet,SMTP,SSH,IRC,TCP,UDP,DHCP,ARP,ICMP,IGMP,IPv,LLC,Tot sum,Min,Max,AVG,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,filename
0,54.0,6.0,64.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,567.0,54.0,54.0,54.0,0.0,54.0,101635400.0,9.5,10.392305,0.0,0.0,0.0,141.55,TCP_IP-DDoS-TCP_test.pcap.csv
1,54.0,6.0,64.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,567.0,54.0,54.0,54.0,0.0,54.0,84696150.0,9.5,10.392305,0.0,0.0,0.0,141.55,TCP_IP-DDoS-TCP_test.pcap.csv
2,54.0,6.0,64.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,567.0,54.0,54.0,54.0,0.0,54.0,84696150.0,9.5,10.392305,0.0,0.0,0.0,141.55,TCP_IP-DDoS-TCP_test.pcap.csv
3,54.0,6.0,64.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,567.0,54.0,54.0,54.0,0.0,54.0,84696150.0,9.5,10.392305,0.0,0.0,0.0,141.55,TCP_IP-DDoS-TCP_test.pcap.csv
4,54.0,6.0,64.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,567.0,54.0,54.0,54.0,0.0,54.0,84696150.0,9.5,10.392305,0.0,0.0,0.0,141.55,TCP_IP-DDoS-TCP_test.pcap.csv


In [19]:
train_df_full.info

<bound method DataFrame.info of          Header_Length  Protocol Type  Duration          Rate         Srate  \
0                114.6            5.9     51.20  30655.032896  30655.032896   
1                129.0            6.0     64.00  90366.618129  90366.618129   
2                321.1            7.1    100.60  13324.032213  13324.032213   
3                292.6            6.0     80.80      3.897624      3.897624   
4                483.6            7.1     69.90      8.191751      8.191751   
...                ...            ...       ...           ...           ...   
7160826            0.0            1.0     64.00      1.332830      1.332830   
7160827            0.0            1.0     64.00      1.332830      1.332830   
7160828            0.0            1.0     64.00      1.332830      1.332830   
7160829            0.0            1.0     65.91      1.332830      1.332830   
7160830            0.0            1.0     64.00      1.332830      1.332830   

         Drate  fin

In [20]:
test_df_full.info

<bound method DataFrame.info of          Header_Length  Protocol Type  Duration          Rate         Srate  \
0            54.000000            6.0      64.0      0.000000      0.000000   
1            54.000000            6.0      64.0      0.000000      0.000000   
2            54.000000            6.0      64.0      0.000000      0.000000   
3            54.000000            6.0      64.0      0.000000      0.000000   
4            54.000000            6.0      64.0      0.000000      0.000000   
...                ...            ...       ...           ...           ...   
1614177   14077.000000           17.0      64.0  25421.093838  25421.093838   
1614178   29153.000000           17.0      64.0  16454.770620  16454.770620   
1614179   24568.500000           17.0      64.0  22708.477712  22708.477712   
1614180   27461.500000           17.0      64.0  18353.623894  18353.623894   
1614181   25628.571429           17.0      64.0  21059.648472  21059.648472   

         Drate  fin

## Textualization

In [21]:
train_df_full.to_csv(DATA_DIR + 'CICIoMT2024_text.txt', sep='\t', index=False)

In [22]:
textualized_ds = pd.read_csv(DATA_DIR + 'CICIoMT2024_text.txt')

In [23]:
textualized_ds.head()

Unnamed: 0,Header_Length\tProtocol Type\tDuration\tRate\tSrate\tDrate\tfin_flag_number\tsyn_flag_number\trst_flag_number\tpsh_flag_number\tack_flag_number\tece_flag_number\tcwr_flag_number\tack_count\tsyn_count\tfin_count\trst_count\tHTTP\tHTTPS\tDNS\tTelnet\tSMTP\tSSH\tIRC\tTCP\tUDP\tDHCP\tARP\tICMP\tIGMP\tIPv\tLLC\tTot sum\tMin\tMax\tAVG\tStd\tTot size\tIAT\tNumber\tMagnitue\tRadius\tCovariance\tVariance\tWeight\tfilename
0,114.6\t5.9\t51.2\t30655.032896090102\t30655.03...
1,129.0\t6.0\t64.0\t90366.61812863432\t90366.618...
2,321.1\t7.1\t100.6\t13324.032212847253\t13324.0...
3,292.6\t6.0\t80.8\t3.897623729966417\t3.8976237...
4,483.6\t7.1\t69.9\t8.191751239408674\t8.1917512...
