# Structure
1. Dependencies
2. Model
3. Data Preparation

# Dependencies

In [1]:
import glob
import json
import logging
import os
import random

import numpy as np
import pandas as pd
import requests
import torch
from datasets import Dataset as HFDataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.utils.class_weight import compute_class_weight
#from torch.utils.data import Dataset, DataLoader
from transformers import (
    DataCollatorWithPadding,
    RobertaForSequenceClassification,
    RobertaTokenizerFast,
    Trainer,
    TrainingArguments,
)

from config import config

  from .autonotebook import tqdm as notebook_tqdm
2025-08-25 09:31:46.509653: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1756132307.046530   34932 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1756132307.141698   34932 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1756132308.120947   34932 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1756132308.121015   34932 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1756132308.121019   34932

## Logging

In [2]:
# Configure the root logger at INFO level, then create the logger used by the
# rest of this notebook (named after the current module).
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [3]:
# Do not truncate columns when pandas renders a DataFrame.
pd.set_option('display.max_columns', None)

# Set before any tokenizer is created further down.
# NOTE(review): presumably silences the tokenizers fork/parallelism warning - confirm.
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

# Load The Data

In [4]:
# Run configuration: paths come from the project-local `config` mapping,
# everything else is a tunable constant for this experiment.
DATA_DIR = config['data_dir']  # passed to load_and_prepare, which reads its 'train' and 'test' sub-directories
MODEL_NAME = "roberta-base"  # checkpoint name used in log messages below
OUTPUT_DIR = config['output']  # NOTE(review): not referenced in the visible cells - presumably the Trainer output dir
LOGGING_DIR = config['logs']  # NOTE(review): not referenced in the visible cells - presumably the Trainer logging dir
NUM_EPOCHS = 3  # previously tried: 10
BATCH_SIZE = 16
LEARNING_RATE = 5e-5
MAX_SEQ_LENGTH = 128  # tokenizer max_length (texts are padded/truncated to this length)
CLASS_CONFIG = 19 # Choose 19, 6, or 2 based on your experiment
RANDOM_STATE = 42  # seed for sampling and the train/validation split
SAVE_EVAL_RESULTS = True  # NOTE(review): not referenced in the visible cells
SAMPLE_SIZE = None # For testing, None=Full Dataset
LABEL_COLUMN = 'Attack_Type'  # NOTE(review): not referenced in the visible cells

In [5]:
# Label mappings used by get_attack_category. Keys are matched as SUBSTRINGS of
# the label passed in (the CSV filename in load_and_prepare); values are the
# class names the model is trained on.
#
# Ordering note: 'DoS-ICMP' is itself a substring of 'DDoS-ICMP' (same for
# SYN/TCP/UDP), so in any mapping keyed by these short names the DDoS entries
# must stay ahead of the DoS entries unless the matcher prefers the longest key.

# 19-class mapping: one class per attack type (plus Benign). The 'TCP_IP-' and
# 'ARP_' prefixes are dropped from the class names.
ATTACK_CATEGORIES_19 = {
    'ARP_Spoofing': 'Spoofing',
    'MQTT-DDoS-Connect_Flood': 'MQTT-DDoS-Connect_Flood',
    'MQTT-DDoS-Publish_Flood': 'MQTT-DDoS-Publish_Flood',
    'MQTT-DoS-Connect_Flood': 'MQTT-DoS-Connect_Flood',
    'MQTT-DoS-Publish_Flood': 'MQTT-DoS-Publish_Flood',
    'MQTT-Malformed_Data': 'MQTT-Malformed_Data',
    'Recon-OS_Scan': 'Recon-OS_Scan',
    'Recon-Ping_Sweep': 'Recon-Ping_Sweep',
    'Recon-Port_Scan': 'Recon-Port_Scan',
    'Recon-VulScan': 'Recon-VulScan',
    'TCP_IP-DDoS-ICMP': 'DDoS-ICMP',
    'TCP_IP-DDoS-SYN': 'DDoS-SYN',
    'TCP_IP-DDoS-TCP': 'DDoS-TCP',
    'TCP_IP-DDoS-UDP': 'DDoS-UDP',
    'TCP_IP-DoS-ICMP': 'DoS-ICMP',
    'TCP_IP-DoS-SYN': 'DoS-SYN',
    'TCP_IP-DoS-TCP': 'DoS-TCP',
    'TCP_IP-DoS-UDP': 'DoS-UDP',
    'Benign': 'Benign'
}

# 6-class mapping: attack families. Unlike the other two mappings, the keys
# here are the short (un-prefixed) names, i.e. the VALUES of the 19-class
# mapping; they still match prefixed labels because matching is by substring.
ATTACK_CATEGORIES_6 = {
    'Spoofing': 'Spoofing',
    'MQTT-DDoS-Connect_Flood': 'MQTT',
    'MQTT-DDoS-Publish_Flood': 'MQTT',
    'MQTT-DoS-Connect_Flood': 'MQTT',
    'MQTT-DoS-Publish_Flood': 'MQTT',
    'MQTT-Malformed_Data': 'MQTT',
    'Recon-OS_Scan': 'Recon',
    'Recon-Ping_Sweep': 'Recon',
    'Recon-Port_Scan': 'Recon',
    'Recon-VulScan': 'Recon',
    'DDoS-ICMP': 'DDoS',
    'DDoS-SYN': 'DDoS',
    'DDoS-TCP': 'DDoS',
    'DDoS-UDP': 'DDoS',
    'DoS-ICMP': 'DoS',
    'DoS-SYN': 'DoS',
    'DoS-TCP': 'DoS',
    'DoS-UDP': 'DoS',
    'Benign': 'Benign'
}

# 2-class mapping: every attack type collapses to 'attack'; 'Benign' stays as is.
ATTACK_CATEGORIES_2 = {  # binary: 'attack' vs 'Benign'
    'ARP_Spoofing': 'attack',
    'MQTT-DDoS-Connect_Flood': 'attack',
    'MQTT-DDoS-Publish_Flood': 'attack',
    'MQTT-DoS-Connect_Flood': 'attack',
    'MQTT-DoS-Publish_Flood': 'attack',
    'MQTT-Malformed_Data': 'attack',
    'Recon-OS_Scan': 'attack',
    'Recon-Ping_Sweep': 'attack',
    'Recon-Port_Scan': 'attack',
    'Recon-VulScan': 'attack',
    'TCP_IP-DDoS-ICMP': 'attack',
    'TCP_IP-DDoS-SYN': 'attack',
    'TCP_IP-DDoS-TCP': 'attack',
    'TCP_IP-DDoS-UDP': 'attack',
    'TCP_IP-DoS-ICMP': 'attack',
    'TCP_IP-DoS-SYN': 'attack',
    'TCP_IP-DoS-TCP': 'attack',
    'TCP_IP-DoS-UDP': 'attack',
    'Benign': 'Benign'
}

# Load Data Together

## Attack Category

In [6]:
def get_attack_category(label, class_config):
    """
    Map a label string (in this notebook: a CSV filename) to its attack category.

    Every key of the selected mapping is tested as a substring of `label`.
    When several keys match, the longest one wins, so a short key such as
    'DoS-ICMP' can never shadow 'DDoS-ICMP' regardless of dictionary order.
    Keys of equal length keep the mapping's insertion order.

    Args:
        label (str): Text to search for known attack-name keys.
        class_config (int): Which mapping to use: 2, 6, or 19.

    Returns:
        str: The mapped category, or 'Unknown_Category_From_Filename' when no
            key occurs in `label`.

    Raises:
        ValueError: If `class_config` is not 2, 6, or 19.
    """
    if class_config == 2:
        categories = ATTACK_CATEGORIES_2
    elif class_config == 6:
        categories = ATTACK_CATEGORIES_6
    elif class_config == 19:
        categories = ATTACK_CATEGORIES_19
    else:
        # Previously this fell through and failed later with an unbound local.
        raise ValueError(f'Unsupported class_config: {class_config!r}. Expected 2, 6, or 19.')

    matching_keys = [key for key in categories if key in label]
    if not matching_keys:
        return 'Unknown_Category_From_Filename'

    # max() returns the first of equally long keys, preserving insertion order for ties.
    return categories[max(matching_keys, key=len)]

## Textualize

In [7]:
def textualize_flow(row, feature_names, sep_token='</s>'):
    """
    Render one record as a sequence of short English statements.

    Each feature present in `row` becomes 'The <name> is <value>', with a
    ' bytes' suffix when the name contains 'bytes' and a ' seconds' suffix when
    it contains 'time' or 'duration' (case-insensitive). Underscores and
    slashes in feature names are replaced by spaces. Features that are not
    present in `row` are skipped.

    Args:
        row: Mapping-like record supporting `name in row` and `row[name]`
            (e.g. a pandas Series or a dict).
        feature_names (iterable of str): Features to include, in output order.
        sep_token (str): Separator placed between statements.

    Returns:
        str: The statements joined with f' {sep_token}' (a space before the
            separator, none after it).
    """
    text_parts = []
    for feature_name in feature_names:
        # Skip absent features; otherwise the value of the previous feature
        # would be reused (or an unbound local raised on the first iteration).
        if feature_name not in row:
            continue

        value = row[feature_name]
        clean_feature_name = feature_name.replace('_', ' ').replace('/', ' ')

        if pd.isnull(value):
            value = 'missing'
        elif isinstance(value, (float, np.floating)):
            # Two decimals normally; four for small magnitudes so they do not collapse to 0.00.
            value = f'{value:.2f}' if abs(value) >= 0.01 else f'{value:.4f}'
        else:
            value = str(value)

        lowered_name = clean_feature_name.lower()
        if 'bytes' in lowered_name:
            text_parts.append(f'The {clean_feature_name} is {value} bytes')
        elif 'time' in lowered_name or 'duration' in lowered_name:
            text_parts.append(f'The {clean_feature_name} is {value} seconds')
        else:
            text_parts.append(f'The {clean_feature_name} is {value}')
    return f' {sep_token}'.join(text_parts)

## Load and Prepare Data

In [8]:
def _list_split_csvs(split_dir, split_name):
    """Return the sorted CSV paths inside `split_dir`; raise FileNotFoundError if it is missing or has none."""
    if not os.path.isdir(split_dir):
        raise FileNotFoundError(f'{split_name} directory not found or is not a directory: {split_dir}.')

    # Sorted so that row order (and therefore seeded sampling) does not depend
    # on the arbitrary order returned by os.listdir.
    csv_paths = sorted(
        os.path.join(split_dir, name) for name in os.listdir(split_dir) if name.endswith('.csv')
    )
    if not csv_paths:
        raise FileNotFoundError(f'No CSV files found in {split_name.lower()} directory: {split_dir}')
    return csv_paths


def load_and_prepare(data_dir, class_config, tokenizer, max_seq_len, text_size_for_val, random_state, sample_size):
    """
    Load the train/test CSVs, textualize and label them, and tokenize.

    Reads every '*.csv' under `<data_dir>/train` and `<data_dir>/test`, derives
    each row's class from its source filename via `get_attack_category`, turns
    each row into text via `textualize_flow`, integer-encodes the labels, carves
    a stratified validation split out of the training data, and tokenizes all
    three splits.

    Args:
        data_dir (str): Directory containing 'train' and 'test' sub-directories.
        class_config (int): Label granularity passed to `get_attack_category`.
        tokenizer: Callable tokenizer applied to batches of text.
        max_seq_len (int): Tokenizer `max_length`; texts are padded/truncated to it.
        text_size_for_val (float | int): `test_size` given to `train_test_split`
            for the validation split.
        random_state (int): Seed for sampling and splitting.
        sample_size (int | None): If truthy, number of training rows to sample
            (capped at the number of available rows).

    Returns:
        tuple: (train_ds, val_ds, test_ds, label_encoder, class_weights, feature_cols)
            where `class_weights` is a dict {label id: weight} covering every
            encoded class, and `feature_cols` lists the columns that were textualized.

    Raises:
        FileNotFoundError: If a split directory is missing or contains no CSV files.
        ValueError: If no rows remain after dropping unrecognised filenames.
    """
    logger.info(f'Loading and preparing datasets for {class_config}-class configuration')

    # Validate both splits up front so a missing test set fails before any CSV is read.
    train_files = _list_split_csvs(os.path.join(data_dir, 'train'), 'Training')
    test_files = _list_split_csvs(os.path.join(data_dir, 'test'), 'Test')

    # Keep the source filename on every row: it is the only carrier of the label.
    train_df = pd.concat(
        [pd.read_csv(f).assign(filename=os.path.basename(f)) for f in train_files],
        ignore_index=True,
    )
    test_df = pd.concat(
        [pd.read_csv(f).assign(filename=os.path.basename(f)) for f in test_files],
        ignore_index=True,
    )

    if sample_size:
        n_samples = min(sample_size, len(train_df))
        if n_samples < sample_size:
            logger.warning(
                f'Requested sample_size={sample_size} exceeds available rows; using {n_samples} instead.'
            )
        logger.info(f'Sampling {n_samples} instances from training data...')
        train_df = train_df.sample(n=n_samples, random_state=random_state)

    train_df['Attack_Type_Str'] = train_df['filename'].apply(lambda x: get_attack_category(x, class_config))
    test_df['Attack_Type_Str'] = test_df['filename'].apply(lambda x: get_attack_category(x, class_config))

    # Drop rows where Attack_Type could not be determined
    train_df = train_df[train_df['Attack_Type_Str'] != 'Unknown_Category_From_Filename'].copy()
    test_df = test_df[test_df['Attack_Type_Str'] != 'Unknown_Category_From_Filename'].copy()

    if train_df.empty or test_df.empty:
        raise ValueError('No data remaining after filtering for unknown categories. Check filename and category mappings.')

    # Feature column definition (taken from the training frame and reused for the test frame)
    feature_cols = [col for col in train_df.columns if col not in ['filename', 'Attack_Type_Str']]

    # Textualize data
    logger.info('Textualizing data...')

    train_df['text'] = train_df.apply(lambda row: textualize_flow(row, feature_cols), axis=1)
    test_df['text'] = test_df.apply(lambda row: textualize_flow(row, feature_cols), axis=1)

    # Encoding labels: fit on the union so train and test share one id space.
    all_labels = pd.concat([train_df['Attack_Type_Str'], test_df['Attack_Type_Str']]).unique()

    label_encoder = LabelEncoder()
    label_encoder.fit(all_labels)
    train_df['label'] = label_encoder.transform(train_df['Attack_Type_Str'])
    test_df['label'] = label_encoder.transform(test_df['Attack_Type_Str'])

    num_classes = len(label_encoder.classes_)
    logger.info(f'Number of classes: {num_classes}, classes: {list(label_encoder.classes_)}')
    logger.info(f'Class mapping: {dict(zip(label_encoder.classes_, range(num_classes)))}')

    logger.info(f'Training samples (before_split): {len(train_df)}')
    logger.info(f'Test samples: {len(test_df)}')

    # %s placeholder: logging treats extra positional args as format arguments.
    logger.info('Textualized Training Dataset\n%s', train_df.head())
    logger.info('Textualized Testing Dataset\n%s', test_df.head())

    # Splitting training data to create a validation set
    train_texts, val_texts, train_labels, val_labels = train_test_split(
        train_df['text'].tolist(),
        train_df['label'].tolist(),
        test_size=text_size_for_val,
        random_state=random_state,
        stratify=train_df['label'].tolist()
    )

    test_texts = test_df['text'].tolist()
    test_labels = test_df['label'].tolist()

    logger.info(f'Training samples: {len(train_texts)}')
    logger.info(f'Validation samples: {len(val_texts)}')
    logger.info(f'Test samples: {len(test_texts)}')

    # Tokenize
    def tokenize_function(examples):
        return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=max_seq_len)

    train_ds = HFDataset.from_dict({'text': train_texts, 'label': train_labels}).map(tokenize_function, batched=True)
    val_ds = HFDataset.from_dict({'text': val_texts, 'label': val_labels}).map(tokenize_function, batched=True)
    test_ds = HFDataset.from_dict({'text': test_texts, 'label': test_labels}).map(tokenize_function, batched=True)

    # Calculating class weights.
    # Start from 1.0 for every encoded class so the result always has one entry
    # per label id, even for classes that appear only in the test split.
    class_weights = {class_id: 1.0 for class_id in range(num_classes)}
    try:
        present_classes = np.unique(train_labels)
        balanced = compute_class_weight(
            class_weight='balanced',
            classes=present_classes,
            y=train_labels
        )
        # Key by the actual label id, not by position in `present_classes`.
        class_weights.update(
            {int(class_id): float(weight) for class_id, weight in zip(present_classes, balanced)}
        )
        logger.info(f'Computed class weights: {class_weights}')
    except Exception as e:
        logger.error(f'Failed to compute class weights: {e}')
        logger.info(f'Using equal class weights as fallback: {class_weights}')

    return train_ds, val_ds, test_ds, label_encoder, class_weights, feature_cols

# RoBERTa

In [9]:
# Take the checkpoint from the MODEL_NAME constant so the tokenizer always
# matches the name reported in the log messages of the run cell.
model_name = MODEL_NAME
tokenizer = RobertaTokenizerFast.from_pretrained(model_name)
# NOTE(review): loaded without num_labels/id2label, so its classification head
# is freshly initialised with the library default; use init_roberta_model to
# build the model that is actually trained.
model = RobertaForSequenceClassification.from_pretrained(model_name)

def init_roberta_model(model_name, num_labels, id2label=None, label2id=None, dropout=None):
    """
    Initialize RoBERTa model for sequence classification

    Args:
        model_name (str): Name or path of the pretrained RoBERTa model.
        num_labels (int): Number of output labels
        id2label (dict, optional): Mapping from label IDs to label names
        label2id (dict, optional): Mapping from label names to label IDs
        dropout (float, optional): Value for the config's `hidden_dropout_prob`.
            When omitted, the checkpoint's own setting is kept.

    Returns:
        RobertaForSequenceClassification: Initialized model
    """

    logger.info(f"Initializing RoBERTa model: {model_name} with {num_labels} labels")

    # Forward only the overrides that were actually supplied. Passing
    # id2label=None / label2id=None explicitly together with num_labels is
    # treated as an (inconsistent) override by the config loader rather than
    # as "not specified".
    config_overrides = {'num_labels': num_labels}
    if id2label is not None:
        config_overrides['id2label'] = id2label
    if label2id is not None:
        config_overrides['label2id'] = label2id
    if dropout is not None:
        config_overrides['hidden_dropout_prob'] = dropout

    model = RobertaForSequenceClassification.from_pretrained(model_name, **config_overrides)

    return model

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Custom Trainer

In [10]:
class CustomTrainerWithWeightedLoss(Trainer):
    """
    Custom Trainer to apply class weights to the loss function.

    Args:
        class_weights (torch.Tensor | dict | sequence | None): Per-class weights
            for imbalanced classification. A dict is read as
            {label id: weight}; ids that are not listed get weight 1.0. Any
            other sequence is converted with `torch.as_tensor`. None disables
            weighting.
    """
    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = self._as_weight_tensor(class_weights)

    @staticmethod
    def _as_weight_tensor(class_weights):
        """Normalise the accepted weight containers to a 1-D tensor (or None)."""
        if class_weights is None or isinstance(class_weights, torch.Tensor):
            return class_weights
        if isinstance(class_weights, dict):
            if not class_weights:
                return None
            # Dense vector indexed by label id; unspecified ids default to 1.0.
            dense = torch.ones(int(max(class_weights)) + 1, dtype=torch.float32)
            for class_id, weight in class_weights.items():
                dense[int(class_id)] = float(weight)
            return dense
        return torch.as_tensor(class_weights, dtype=torch.float32)

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """
        Compute the weighted cross-entropy loss

        Args:
            model: The model being trained
            inputs (dict): Input batch including 'labels'
            return_outputs (bool): Whether to return model outputs
            **kwargs: Extra arguments supplied by the caller; accepted and ignored.

        Returns:
            loss or (loss, outputs)
        """
        # Labels are removed so the model does not compute its own unweighted loss.
        labels = inputs.pop('labels')
        outputs = model(**inputs)
        logits = outputs.logits

        weights_tensor = None
        if self.class_weights is not None:
            # Match both device and dtype: weights built from float64 arrays
            # would otherwise clash with float32 logits.
            weights_tensor = self.class_weights.to(device=logits.device, dtype=logits.dtype)

        loss_fnct = torch.nn.CrossEntropyLoss(weight=weights_tensor)
        loss = loss_fnct(logits.view(-1, self.model.config.num_labels), labels.view(-1))

        return (loss, outputs) if return_outputs else loss

## Test

In [11]:
# text = "Replace me by any text you'd like."
# encoded_input = tokenizer(text, return_tensors='pt')
# output = model(**encoded_input)

In [12]:
# output

### Metrics

In [13]:
def compute_metrics(pred):
    """
    Compute accuracy and weighted precision/recall/F1 for an evaluation pass.

    Args:
        pred: Object exposing `label_ids` (true label ids) and `predictions`
            (class scores with classes on the last axis, or a tuple whose first
            element holds those scores).

    Returns:
        dict: 'accuracy', 'f1', 'precision' and 'recall'. The last three are
            support-weighted averages; classes with no predicted samples
            contribute 0 instead of raising a warning (`zero_division=0`).
    """
    labels = pred.label_ids
    scores = pred.predictions
    # Some models return extra outputs alongside the logits; the logits come first.
    if isinstance(scores, tuple):
        scores = scores[0]
    preds = scores.argmax(-1)

    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='weighted', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Running...

In [None]:
# Run the data pipeline end to end and log a short summary of the result.
if __name__ == '__main__':
    try:
        logger.info(f'Loading tokenizer for {MODEL_NAME}...')

        logger.info(f'Loading and preprocessing data from {DATA_DIR}...')
        train_ds, val_ds, test_ds, label_encoder, class_weights, feature_names = load_and_prepare(
            data_dir=DATA_DIR,
            class_config=CLASS_CONFIG,
            tokenizer=tokenizer,
            max_seq_len=MAX_SEQ_LENGTH,
            text_size_for_val=0.2,
            random_state=RANDOM_STATE,
            sample_size=SAMPLE_SIZE
        )

        logger.info('Sample textualized data:')
        for i in range(min(3, len(train_ds))):
            # Index the row first: fetching a whole column just to read one
            # element would materialise every text in the dataset.
            example = train_ds[i]
            logger.info(f"Text: {example['text']}")
            logger.info(f"Label: {label_encoder.inverse_transform([example['label']])[0]}")

        # Label-id lookups in both directions, derived from the fitted encoder.
        num_labels = len(label_encoder.classes_)
        id2label = {i: label for i, label in enumerate(label_encoder.classes_)}
        label2id = {label: i for i, label in enumerate(label_encoder.classes_)}

        logger.info(f"Number of unique labels: {num_labels}")
        logger.info(f"Training dataset size: {len(train_ds)}")
        logger.info(f"Validation dataset size: {len(val_ds)}")
        logger.info(f"Test dataset size: {len(test_ds)}")
        logger.info(f"Features used for textualization: {feature_names}")

        logger.info('\nScript execution completed successfully!')
    except Exception:
        # Log at ERROR level with the traceback, then re-raise so the cell still fails visibly.
        logger.exception('Error: An exception occurred during execution')
        raise

INFO:__main__:Loading tokenizer for roberta-base...
INFO:__main__:Loading and preprocessing data from /data/user/bsindala/PhD/Research/DataSets/CICIoMT2024/WiFI and MQTT/attacks/CSV/...
INFO:__main__:Loading and preparing datasets for 19-class configuration
INFO:__main__:Textualizing data...
