# Structure
1. Dependencies
2. Model
3. Data Preparation

# Dependencies

In [16]:
import os, glob, torch, requests, logging, json, random, wandb
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split 
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from datasets import load_dataset, Dataset
from peft import LoraConfig, get_peft_model
#from torch.utils.data import Dataset, DataLoader

from transformers import RobertaTokenizerFast, TrainingArguments, Trainer, DataCollatorWithPadding, RobertaForSequenceClassification
from config import config

## Logging

In [2]:
# Configure the root logger so that INFO-level records (and above) are emitted.
logging.basicConfig(level=logging.INFO)
# Notebook-wide logger used by the helper functions defined in the cells below.
logger = logging.getLogger(__name__)

In [3]:
# Do not cap the number of columns pandas shows when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Disable parallelism in the tokenizers library for this process.
# NOTE(review): presumably set to silence the fork/parallelism warning - confirm.
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

# Load The Data

In [4]:
# --- Experiment configuration -------------------------------------------------
# Paths and the W&B key come from the project-local `config` mapping.
DATA_DIR = config['data_dir']  # root folder; load_and_prepare expects `train/` and `test/` sub-folders with CSV files
MODEL_NAME = "roberta-base"  # checkpoint name used for the tokenizer in the main cell
OUTPUT_DIR = config['output']  # NOTE(review): not used in the cells visible here - presumably the training output folder
LOGGING_DIR = config['logs']  # NOTE(review): not used in the cells visible here - presumably the training log folder
WANDB_API_KEY = config['wb_api_key']  # passed to wandb.login()
WANDB_PROJECT = 'llm-anomaly-detection'
WANDB_ENTITY = 'bsindala-university-of-alabama-at-birmigham'
NUM_EPOCHS = 3  # 10 was also tried
BATCH_SIZE = 8
LEARNING_RATE = 1e-5
MAX_SEQ_LENGTH = 256  # tokenizer max_length (padding and truncation) in load_and_prepare
CLASS_CONFIG = 2 # Choose 19, 6, or 2 based on your experiment
RANDOM_STATE = 42  # forwarded to load_and_prepare as random_state
SAMPLE_FRAC=0.2  # fraction of the training rows kept by load_and_prepare; >= 1.0 keeps everything
SAVE_EVAL_RESULTS = True  # NOTE(review): not read by any cell visible here
# SAMPLE_SIZE = None # For testing, None=Full Dataset
# LABEL_COLUMN = 'Attack_Type'

In [5]:
# 19 class mapping
# Keys are searched as substrings of the label handed to get_attack_category
# (load_and_prepare passes the CSV file name); the value is the class name
# assigned on the first match.
ATTACK_CATEGORIES_19 = {
    'ARP_Spoofing': 'Spoofing',
    'MQTT-DDoS-Connect_Flood': 'MQTT-DDoS-Connect_Flood',
    'MQTT-DDoS-Publish_Flood': 'MQTT-DDoS-Publish_Flood',
    'MQTT-DoS-Connect_Flood': 'MQTT-DoS-Connect_Flood',
    'MQTT-DoS-Publish_Flood': 'MQTT-DoS-Publish_Flood',
    'MQTT-Malformed_Data': 'MQTT-Malformed_Data',
    'Recon-OS_Scan': 'Recon-OS_Scan',
    'Recon-Ping_Sweep': 'Recon-Ping_Sweep',
    'Recon-Port_Scan': 'Recon-Port_Scan',
    'Recon-VulScan': 'Recon-VulScan',
    'TCP_IP-DDoS-ICMP': 'DDoS-ICMP',
    'TCP_IP-DDoS-SYN': 'DDoS-SYN',
    'TCP_IP-DDoS-TCP': 'DDoS-TCP',
    'TCP_IP-DDoS-UDP': 'DDoS-UDP',
    'TCP_IP-DoS-ICMP': 'DoS-ICMP',
    'TCP_IP-DoS-SYN': 'DoS-SYN',
    'TCP_IP-DoS-TCP': 'DoS-TCP',
    'TCP_IP-DoS-UDP': 'DoS-UDP',
    'Benign': 'Benign'
}

# 6 Class mapping
# Unlike the 19- and 2-class tables, the keys here are the shortened class names
# (the values of ATTACK_CATEGORIES_19), not the 'ARP_'/'TCP_IP-' prefixed names.
# They are still searched as substrings by get_attack_category, which returns on
# the FIRST key found in the label. Because 'DoS-ICMP' is itself a substring of
# 'DDoS-ICMP' (and likewise for SYN/TCP/UDP), the 'DDoS-*' entries must stay
# listed before the 'DoS-*' entries or DDoS labels would be mapped to 'DoS'.
ATTACK_CATEGORIES_6 = {
    'Spoofing': 'Spoofing',
    'MQTT-DDoS-Connect_Flood': 'MQTT',
    'MQTT-DDoS-Publish_Flood': 'MQTT',
    'MQTT-DoS-Connect_Flood': 'MQTT',
    'MQTT-DoS-Publish_Flood': 'MQTT',
    'MQTT-Malformed_Data': 'MQTT',
    'Recon-OS_Scan': 'Recon',
    'Recon-Ping_Sweep': 'Recon',
    'Recon-Port_Scan': 'Recon',
    'Recon-VulScan': 'Recon',
    'DDoS-ICMP': 'DDoS',
    'DDoS-SYN': 'DDoS',
    'DDoS-TCP': 'DDoS',
    'DDoS-UDP': 'DDoS',
    'DoS-ICMP': 'DoS',
    'DoS-SYN': 'DoS',
    'DoS-TCP': 'DoS',
    'DoS-UDP': 'DoS',
    'Benign': 'Benign'
}

# 2 class mapping
# Binary view of the same keys as the 19-class table: every attack key maps to
# 'attack' and only 'Benign' maps to 'Benign'.
ATTACK_CATEGORIES_2 = {
    'ARP_Spoofing': 'attack',
    'MQTT-DDoS-Connect_Flood': 'attack',
    'MQTT-DDoS-Publish_Flood': 'attack',
    'MQTT-DoS-Connect_Flood': 'attack',
    'MQTT-DoS-Publish_Flood': 'attack',
    'MQTT-Malformed_Data': 'attack',
    'Recon-OS_Scan': 'attack',
    'Recon-Ping_Sweep': 'attack',
    'Recon-Port_Scan': 'attack',
    'Recon-VulScan': 'attack',
    'TCP_IP-DDoS-ICMP': 'attack',
    'TCP_IP-DDoS-SYN': 'attack',
    'TCP_IP-DDoS-TCP': 'attack',
    'TCP_IP-DDoS-UDP': 'attack',
    'TCP_IP-DoS-ICMP': 'attack',
    'TCP_IP-DoS-SYN': 'attack',
    'TCP_IP-DoS-TCP': 'attack',
    'TCP_IP-DoS-UDP': 'attack',
    'Benign': 'Benign'
}

# Load Data Together

## Attack Category

In [6]:
def get_attack_category(label, class_config):
    """Map a raw label (e.g. a CSV file name) to its attack category.

    The mapping table is selected by ``class_config``; the first key of that
    table that occurs as a substring of ``label`` decides the category. Tables
    are scanned in insertion order, so key order matters whenever one key is a
    substring of another.

    Args:
        label (str): Text that is searched for the mapping keys.
        class_config (int): Label granularity; one of 2, 6 or 19.

    Returns:
        str: The mapped category, or 'Unknown Category' when no key matches.

    Raises:
        ValueError: If ``class_config`` is not 2, 6 or 19.
    """
    if class_config == 2:
        categories = ATTACK_CATEGORIES_2
    elif class_config == 6:
        categories = ATTACK_CATEGORIES_6
    elif class_config == 19:
        categories = ATTACK_CATEGORIES_19
    else:
        # An unsupported value used to fall through and crash further down with
        # an UnboundLocalError on `categories`; fail early and say why instead.
        raise ValueError(f"Unsupported class_config: {class_config!r}. Expected one of 2, 6 or 19.")

    for key, category in categories.items():
        if key in label:
            return category
    # The message now names the sentinel that is actually returned.
    logger.warning(f"Could not map label: {label} with class_config: {class_config}. Returning 'Unknown Category'")
    return 'Unknown Category'

## Textualize

In [7]:
def textualize_flow(row, feature_names, sep_token='</s>'):
    """Render one flow record as a single ``name:value`` text string.

    Args:
        row: Mapping-like record (dict or pandas Series) supporting ``in`` and
            item access by feature name.
        feature_names (iterable of str): Features to emit, in output order.
            Names that are not present in ``row`` are skipped.
        sep_token (str): Token placed between consecutive features.

    Returns:
        str: ``"<name>:<value>"`` parts joined by ``" " + sep_token``. In the
        names, '_', '-' and '/' are replaced by spaces. Null values become
        'missing'; floats are printed with 2 decimals, or 4 decimals when their
        magnitude is below 0.01; everything else goes through ``str()``.
    """
    text_parts = []
    for feature_name in feature_names:
        if feature_name not in row:
            # Previously the formatting below also ran for absent features and
            # re-emitted the previous feature's name/value (or raised NameError
            # when the very first feature was absent).
            continue

        value = row[feature_name]
        clean_feature_name = feature_name.replace('_', ' ').replace('-', ' ').replace('/', ' ')

        if pd.isnull(value):
            value = 'missing'
        elif isinstance(value, float):
            # Keep small magnitudes distinguishable from zero.
            value = f'{value:.2f}' if abs(value) >= 0.01 else f'{value:.4f}'
        else:
            value = str(value)

        text_parts.append(f"{clean_feature_name}:{value}")
    return f' {sep_token}'.join(text_parts)

## Load and Prepare Data

In [28]:
def load_and_prepare(data_dir, class_config, tokenizer, max_seq_len=256, test_size_for_val=0.2, random_state=42, sample_frac=0.2):
    """Build tokenized train/validation/test datasets from per-attack CSV files.

    Every ``*.csv`` under ``<data_dir>/train`` and ``<data_dir>/test`` is
    streamed row by row. A row's label is derived from the name of the file it
    was read from (``get_attack_category``) and its text from
    ``textualize_flow`` applied to all of the row's columns. The training rows
    are then optionally subsampled, labels are integer-encoded, a stratified
    validation split is carved out of the training rows and all three splits
    are tokenized.

    Args:
        data_dir (str): Folder holding the ``train`` and ``test`` sub-folders.
        class_config (int): Label granularity forwarded to ``get_attack_category``.
        tokenizer: Tokenizer called on the ``text`` column of each split.
        max_seq_len (int): ``max_length`` used for padding and truncation.
        test_size_for_val: Passed as ``test_size`` to ``train_test_split``.
        random_state (int): Seed for the subsampling and for the
            train/validation split.
        sample_frac (float): Fraction of training rows to keep; values
            >= 1.0 keep everything.

    Returns:
        tuple: ``(train_ds, val_ds, test_ds, label_encoder, class_weights,
        selected_features)``. ``class_weights`` is a dict ``{class_index:
        weight}`` with one entry per encoded class. ``selected_features`` is
        the static list defined below; it is returned for reporting only and
        does not restrict which columns are textualized.

    Raises:
        FileNotFoundError: If a split folder is missing or holds no CSV file.
        ValueError: If no training row could be loaded.
    """
    logger.info(f"Loading and preparing datasets for {class_config}-class configuration...")

    train_path = os.path.join(data_dir, "train")
    test_path = os.path.join(data_dir, "test")

    if not os.path.exists(train_path) or not os.path.isdir(train_path):
        raise FileNotFoundError(f"Training directory not found or is not a directory: {train_path}.")
    if not os.path.exists(test_path) or not os.path.isdir(test_path):
        raise FileNotFoundError(f"Testing directory not found or is not a directory: {test_path}.")

    train_files = [os.path.join(train_path, f) for f in os.listdir(train_path) if f.endswith('.csv')]
    test_files = [os.path.join(test_path, f) for f in os.listdir(test_path) if f.endswith('.csv')]

    if not train_files or not test_files:
        raise FileNotFoundError("No CSV files found in training or test directories.")

    logger.info("Loading datasets with streaming...")

    # Define default features
    # NOTE(review): this list is only returned to the caller; the text is built
    # from every CSV column (see _collect_split), exactly as before.
    selected_features = [
        'Src IP', 'Dst IP', 'Protocol', 'Flow Duration', 'Pkt Len Mean',
        'Fwd Pkt Len Mean', 'Bwd Pkt Len Mean', 'Flow Pkts/s', 'Flow IAT Mean',
        'Fwd IAT Tot', 'Bwd IAT Tot', 'Fwd PSH Flags', 'BWd PSH Flags'
    ]

    # Bookkeeping columns that must never end up in the generated text.
    non_feature_cols = ('filename', 'Attack_Type')

    def _collect_split(files, split_name):
        """Stream every CSV of one split and return its (texts, labels) lists."""
        texts, labels = [], []
        for file_path in files:
            filename = os.path.basename(file_path)
            # The label depends only on the file name, so resolve it once per
            # file instead of once per row.
            attack_type = get_attack_category(filename, class_config)
            if attack_type == 'Unknown Category':
                logger.warning(f'Skipping {filename}: its name matches no known attack category.')
                continue

            stream = load_dataset('csv', data_files={split_name: file_path}, streaming=True)[split_name]
            for example in stream:
                if not isinstance(example, dict):
                    logger.warning(f'Unexcepted data format in {filename}: {type(example)}')
                    continue
                # Textualize straight from the row dict. The earlier version
                # wrapped each row in a one-row DataFrame and mean-imputed it,
                # which cannot fill anything (the mean of a single missing
                # value is itself missing) and was very slow on large files.
                feature_cols = [col for col in example if col not in non_feature_cols]
                texts.append(textualize_flow(example, feature_cols))
                labels.append(attack_type)
        return texts, labels

    logger.info('Processing train data...')
    train_texts, train_labels = _collect_split(train_files, 'train')

    logger.info('Processing test data...')
    test_texts, test_labels = _collect_split(test_files, 'test')

    if not train_texts:
        raise ValueError('No training samples were loaded; check the CSV file names against the class mapping.')

    if sample_frac < 1.0:
        logger.info(f'Subsampling {sample_frac*100:.1f}% of training data...')
        # Seeded generator: the subsample used to come from the global NumPy
        # RNG and therefore ignored `random_state`.
        rng = np.random.default_rng(random_state)
        indices = rng.choice(len(train_texts), size=int(len(train_texts) * sample_frac), replace=False)
        train_texts = [train_texts[i] for i in indices]
        train_labels = [train_labels[i] for i in indices]

    # Fit on the union of both splits so that a class seen only in the test
    # files still gets an id.
    label_encoder = LabelEncoder()
    all_labels = list(set(train_labels + test_labels))
    label_encoder.fit(all_labels)
    train_labels = label_encoder.transform(train_labels)
    test_labels = label_encoder.transform(test_labels)

    num_classes = len(label_encoder.classes_)
    logger.info(f"Number of classes: {num_classes}, classes: {list(label_encoder.classes_)}")

    train_texts, val_texts, train_labels, val_labels = train_test_split(
        train_texts,
        train_labels,
        test_size=test_size_for_val,
        random_state=random_state,
        stratify=train_labels
    )

    logger.info(f'Training samples: {len(train_texts)}, Validation samples: {len(val_texts)}, Test samples: {len(test_texts)}')

    def tokenize_function(examples):
        return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=max_seq_len)

    train_ds = Dataset.from_dict({'text': train_texts, 'label': train_labels}).map(tokenize_function, batched=True, batch_size=1000)
    val_ds = Dataset.from_dict({'text': val_texts, 'label': val_labels}).map(tokenize_function, batched=True, batch_size=1000)
    test_ds = Dataset.from_dict({'text': test_texts, 'label': test_labels}).map(tokenize_function, batched=True, batch_size=1000)

    try:
        present_classes = np.unique(train_labels)
        balanced = compute_class_weight('balanced', classes=present_classes, y=train_labels)
        # Key the weights by the encoded class id (not by position) and keep
        # one entry per class, so a class absent from the training split does
        # not shift the weights of the classes after it.
        class_weights = {i: 1.0 for i in range(num_classes)}
        class_weights.update({int(c): float(w) for c, w in zip(present_classes, balanced)})
        logger.info(f"Class weights: {class_weights}")
    except Exception as e:
        logger.error(f"Failed to compute class weights: {e}")
        class_weights = {i: 1.0 for i in range(num_classes)}
        logger.info(f"Using equal class weights as fallback: {class_weights}")

    return train_ds, val_ds, test_ds, label_encoder, class_weights, selected_features

# RoBERTa

In [9]:
# Module-level tokenizer and classifier loaded with the checkpoint's default settings.
# NOTE(review): among the cells visible here, this `model` is referenced only by the
# commented-out smoke test further down, and the main cell builds its own tokenizer;
# confirm whether this eager load is still needed.
model_name = 'roberta-base'
tokenizer = RobertaTokenizerFast.from_pretrained(model_name)
model = RobertaForSequenceClassification.from_pretrained(model_name)

def init_roberta_model(model_name, num_labels=2, id2label=None, label2id=None, dropout=0.1, use_lora=True, lora_r=16):
    """
    Initialize a RoBERTa sequence-classification model, optionally wrapped with LoRA.

    Args:
        model_name (str): Name or path of the pretrained RoBERTa checkpoint (required, e.g. 'roberta-base')
        num_labels (int): Number of output labels (e.g., 2 for binary anomaly detection)
        id2label (dict, optional): Mapping from label IDs to label names
        label2id (dict, optional): Mapping from label names to label IDs
        dropout (float, optional): Value passed as ``hidden_dropout_prob``; None falls back to 0.1.
            NOTE(review): per the Hugging Face RoBERTa config this is the hidden-layer dropout,
            which the classifier head only inherits when ``classifier_dropout`` is unset - confirm
            this is the intended knob.
        use_lora (bool): Whether to apply LoRA for parameter-efficient fine-tuning (default: True)
        lora_r (int): LoRA rank parameter (default: 16)
    Returns:
        The initialized model, moved to CUDA when available (otherwise CPU); it is the
        PEFT-wrapped model when ``use_lora`` is True.
    Raises:
        Exception: Any error raised during initialization is logged and re-raised unchanged.
    """

    # Use the notebook logger (not the root `logging` module) like every other helper here.
    logger.info(f"Initializing RoBERTa model: {model_name} with {num_labels} labels, LoRA={use_lora}")
    try:
        # NOTE(review): weights are loaded in float16 whenever CUDA is available; confirm the
        # training setup is meant to run with half-precision parameters.
        model = RobertaForSequenceClassification.from_pretrained(
            model_name,
            num_labels=num_labels,
            id2label=id2label,
            label2id=label2id,
            hidden_dropout_prob=dropout if dropout is not None else 0.1,
            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
        )

        model.gradient_checkpointing_enable()

        if use_lora:
            lora_config = LoraConfig(
                r=lora_r,
                lora_alpha=32,
                target_modules=['query', 'value'],
                lora_dropout=0.05,
                bias='none',
                task_type='SEQ_CLS'
            )
            model = get_peft_model(model, lora_config)

            # Only parameters that still require gradients are counted.
            trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
            logger.info(f'Applied LoRA with rank={lora_r}, trainable params: {trainable_params}')
            if wandb.run is not None:
                wandb.log({"trainable_params": trainable_params, "lora_rank": lora_r})

        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        model.to(device)
        logger.info(f'Model moved to device: {device}')
        if wandb.run is not None:
            wandb.log({"device": str(device)})

        return model

    except Exception as e:
        logger.error(f'Failed to initialize model: {e}')
        raise


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Custom Trainer

In [10]:
class CustomTrainerWithWeightedLoss(Trainer):
    """
    Trainer that applies class weights to the cross-entropy loss.

    Args:
        class_weights (dict | sequence | torch.Tensor, optional): Per-class weights.
            A dict is read as ``{class_index: weight}`` and ordered by class index;
            any other value is converted with ``torch.as_tensor`` and must already be
            in class-index order. The weights are rescaled to sum to the number of
            classes. ``None`` gives an unweighted loss.
    """
    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        if class_weights is not None:
            if isinstance(class_weights, dict):
                # Order by class index rather than trusting dict insertion order.
                ordered = [class_weights[idx] for idx in sorted(class_weights)]
            else:
                ordered = class_weights
            weights = torch.as_tensor(ordered, dtype=torch.float32)
            weights = weights / weights.sum() * len(weights)
            self.class_weights = weights.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))
            # The guard used to be `is None`, i.e. it only tried to log when no
            # W&B run existed and stayed silent when one did.
            if wandb.run is not None:
                wandb.log({"class_weights": {str(i): w.item() for i, w in enumerate(self.class_weights)}})
        else:
            self.class_weights = None
        logger.info(f'Class weights initialized: {self.class_weights}')

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """
        Compute the weighted cross-entropy loss

        Args:
            model: The model being trained
            inputs (dict): Input batch including 'labels' (the key is removed from the dict)
            return_outputs (bool): Whether to return model outputs
            num_items_in_batch: Accepted so that Trainer versions which pass this
                argument can call the override; it is not used here.

        Returns:
            loss or (loss, outputs)
        """
        try:
            labels = inputs.pop('labels')
            outputs = model(**inputs)
            logits = outputs.logits
            weight = self.class_weights
            if weight is not None:
                # Keep the weights on whatever device the logits live on.
                weight = weight.to(logits.device)
            loss_fnct = torch.nn.CrossEntropyLoss(weight=weight)
            # Upcast so float32 class weights also work with half-precision logits.
            loss = loss_fnct(logits.float().view(-1, logits.size(-1)), labels.view(-1))
            if wandb.run is not None:
                wandb.log({"batch_loss": loss.item()})

            return (loss, outputs) if return_outputs else loss
        except Exception as e:
            logger.error(f'Error computing loss: {e}')
            raise

## Test

In [11]:
# text = "Replace me by any text you'd like."
# encoded_input = tokenizer(text, return_tensors='pt')
# output = model(**encoded_input)

In [12]:
# output

### Metrics

In [13]:
def compute_metrics(pred):
    """Score a prediction object with accuracy and weighted precision/recall/F1.

    Args:
        pred: Object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores; the arg-max over the last axis is the predicted label).

    Returns:
        dict: Keys 'accuracy', 'f1', 'precision' and 'recall'. When a W&B run
        is active the same values are also logged under an ``eval_`` prefix.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    # Index 0/1/2 of the sklearn result are precision / recall / F1.
    prf = precision_recall_fscore_support(y_true, y_pred, average='weighted', zero_division=0)
    metrics = dict(
        accuracy=accuracy_score(y_true, y_pred),
        f1=prf[2],
        precision=prf[0],
        recall=prf[1],
    )

    if wandb.run is None:
        return metrics
    wandb.log({f"eval_{name}": score for name, score in metrics.items()})
    return metrics

# Running...

In [None]:
if __name__ == '__main__':
    # Driver cell: start a W&B run, build the datasets and sanity-check the
    # resulting label set against CLASS_CONFIG.
    try:
        # Initialization of W&B
        wandb.login(key=WANDB_API_KEY)
        wandb.init(
            entity =WANDB_ENTITY,
            project=WANDB_PROJECT, 
            config={
                "model_name": MODEL_NAME,
                "num_epochs": NUM_EPOCHS,
                "batch_size": BATCH_SIZE,
                "learning_rate": LEARNING_RATE,
                "max_seq_length": MAX_SEQ_LENGTH,
                "class_config": CLASS_CONFIG,
                "sample_frac": SAMPLE_FRAC
            }
        )
        logger.info(f'W&B initialized for project: {WANDB_PROJECT}')

        # Initialization of tokenizer
        logger.info(f"Loading tokenizer for {MODEL_NAME}...")
        tokenizer = RobertaTokenizerFast.from_pretrained(MODEL_NAME)
        data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

        # Loading data
        logger.info(f"Loading and preprocessing data from {DATA_DIR}...")
        train_ds, val_ds, test_ds, label_encoder, class_weights, feature_names = load_and_prepare(
            data_dir=DATA_DIR,
            class_config=CLASS_CONFIG,
            tokenizer=tokenizer,
            max_seq_len=MAX_SEQ_LENGTH,
            random_state=RANDOM_STATE,
            sample_frac=SAMPLE_FRAC
        )

        logger.info("Sample textualized data:")
        for i in range(min(3, len(train_ds))):
            # Fetch a single row; indexing the column first (train_ds['text'][i])
            # materialises the whole column on every iteration.
            sample = train_ds[i]
            logger.info(f"Text: {sample['text']}")
            logger.info(f"Label: {label_encoder.inverse_transform([sample['label']])[0]}")

        if wandb.run is not None:
            wandb.log({
                'training_size': len(train_ds),
                'val_size': len(val_ds),
                'test_size': len(test_ds),
                'num_classes': len(label_encoder.classes_),
                'features_used': len(feature_names)
            })

        num_labels = len(label_encoder.classes_)
        id2label = {i: label for i, label in enumerate(label_encoder.classes_)}
        label2id = {label: i for i, label in enumerate(label_encoder.classes_)}

        logger.info(f"Number of labels: {num_labels}, Classes: {list(label_encoder.classes_)}, Test size: {len(test_ds)}")
        if wandb.run is not None:
            wandb.log({'classes': list(label_encoder.classes_)})

        # Validate class config: the data must contain exactly as many classes
        # as the chosen configuration promises.
        expected_classes = {2: 2, 6: 6, 19: 19}.get(CLASS_CONFIG)
        if num_labels != expected_classes:
            raise ValueError(f"Expected {expected_classes} classes, but found {num_labels}.")

        logger.info('\nScript execution completed successfully!')
    except Exception as e:
        # Failures are errors, not informational messages; log at ERROR level
        # before re-raising so the notebook still stops.
        logger.error(f'Error: An exception occured during execution: {e}')
        raise



INFO:__main__:W&B initialized for project: llm-anomaly-detection
INFO:__main__:Loading tokenizer for roberta-base...
INFO:__main__:Loading and preprocessing data from /data/user/bsindala/PhD/Research/DataSets/CICIoMT2024/WiFI and MQTT/attacks/CSV/...
INFO:__main__:Loading and preparing datasets for 2-class configuration...
INFO:__main__:Loading datasets with streaming...
INFO:__main__:Processing train data...
