# Telco Platform Engineering Security Operations <br> AI Insights Generation with Fine Tuning of Meta's BART Model
Author: Fatih E. NAR <br>
This project aims to deliver security risk factor insights. <br>
Model Card: https://huggingface.co/facebook/bart-large <br>[Work In Progress]

In [None]:
# Install the packages pinned in requirements.txt.
# `%pip` is an IPython magic, so this cell only runs inside Jupyter/IPython.
%pip install -r requirements.txt

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import torch
import os
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.optim as optim
from transformers import BartTokenizer, BartForConditionalGeneration, get_linear_schedule_with_warmup, Seq2SeqTrainer, Seq2SeqTrainingArguments
from peft import get_peft_model, LoraConfig, TaskType

fp16 = False
# Pick the best available accelerator: CUDA first, then Apple MPS, then CPU.
# NOTE(review): the environment variables below are set after `import torch`;
# confirm each one is still honoured when set this late.
if torch.cuda.is_available():
    device = torch.device("cuda")
    fp16 = True  # only allow mixed-precision training on Nvidia GPUs
    # Make kernel launches synchronous so CUDA errors point at the failing call
    os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
    # `max_split_size_mb` stops the caching allocator from splitting blocks
    # larger than this size, which reduces fragmentation-related OOMs. It is
    # not a hard cap on GPU memory. The option takes whole megabytes, so the
    # value is truncated to an int instead of being written as "12083.2".
    max_memory_mb = int(11.8 * 1024)
    os.environ['PYTORCH_CUDA_ALLOC_CONF'] = f'max_split_size_mb:{max_memory_mb}'
elif torch.backends.mps.is_available():
    device = torch.device("mps")
    # A ratio of 0.0 removes the MPS allocator's upper memory limit
    os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0"
    # Let operators that MPS does not implement fall back to the CPU
    os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
else:
    device = torch.device("cpu")
print(f'Using device: {device}')

# Read the 5G SecOps dataset from its Parquet file
data = pd.read_parquet('data/5G_SecOps_Data.parquet')

# Print the leading rows as a quick sanity check of the loaded frame
preview = data.head()
print(preview)

In [None]:
# Preprocess the data
def preprocess_data(data):
    """Return a copy of *data* with categorical columns encoded as integers.

    The 'Cluster' and 'Namespace' columns are dropped, and the three
    categorical columns are replaced by integer codes. The caller's
    DataFrame is left untouched, so the function can be called again on
    the same raw frame (e.g. when a notebook cell is re-run).

    Args:
        data: DataFrame containing at least the columns 'Cluster',
            'Namespace', 'Service Configurations', 'Service Reachability'
            and 'RBAC Configuration'.

    Returns:
        A new DataFrame without 'Cluster'/'Namespace' and with the
        categorical columns mapped to integer codes. Values that are not
        listed in a mapping become NaN (pandas ``Series.map`` behaviour).

    Raises:
        KeyError: if any of the expected columns is missing.
    """
    category_codes = {
        'Service Configurations': {'Poor': 0, 'Medium': 1, 'Good': 2},
        'Service Reachability': {'Internal': 0, 'External': 1},
        'RBAC Configuration': {'Low': 0, 'Medium': 1, 'High': 2},
    }

    # drop() returns a new frame, so all later writes go to the copy and
    # never leak back into the DataFrame the caller passed in.
    processed = data.drop(columns=['Cluster', 'Namespace'])

    # Convert categorical variables to numeric
    for column, codes in category_codes.items():
        processed[column] = processed[column].map(codes)

    return processed

# Encode the categorical columns and drop the identifier columns
data = preprocess_data(data)

# Separate the target column from the feature columns
y = data['Risk Score']
X = data.drop(columns=['Risk Score'])

# Hold out 20% of the rows for evaluation (fixed seed for reproducibility),
# then cast every split to strings because BART consumes text
X_train, X_test, y_train, y_test = (
    part.astype(str)
    for part in train_test_split(X, y, test_size=0.2, random_state=42)
)

In [None]:
# Load the pretrained BART tokenizer from the 'facebook/bart-large' checkpoint.
# The same checkpoint name is used further down when the model is loaded.
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')

class Seq2SeqDataset(Dataset):
    """Map-style dataset that tokenizes (input, target) text pairs on access.

    Each item is a dict with 1-D tensors ``input_ids``, ``attention_mask``
    and ``labels``. Padding positions in ``labels`` are set to -100 so the
    cross-entropy loss skips them; otherwise a short target padded to
    ``max_len`` would make the loss mostly about predicting pad tokens.

    Args:
        inputs: indexable collection of source items; each is passed
            through ``str()`` before tokenization.
        targets: indexable collection of target items, aligned with
            ``inputs``; each is passed through ``str()``.
        tokenizer: callable tokenizer returning ``input_ids`` and
            ``attention_mask`` tensors when called with
            ``return_tensors="pt"``.
        max_len: padded/truncated length of the source sequence.
        max_target_len: padded/truncated length of the target sequence.
            Defaults to ``max_len``, which matches the previous behaviour.
    """

    # Label value ignored by torch.nn.CrossEntropyLoss (its default ignore_index)
    IGNORE_INDEX = -100

    def __init__(self, inputs, targets, tokenizer, max_len=512, max_target_len=None):
        self.inputs = inputs
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.max_target_len = max_len if max_target_len is None else max_target_len

    def __len__(self):
        """Return the number of (input, target) pairs."""
        return len(self.inputs)

    def _encode(self, text, length):
        """Tokenize *text*, padding/truncating it to exactly *length* tokens."""
        return self.tokenizer(
            str(text), max_length=length, padding='max_length', truncation=True, return_tensors="pt"
        )

    def __getitem__(self, idx):
        """Return the tokenized example at position *idx*."""
        input_encoding = self._encode(self.inputs[idx], self.max_len)
        target_encoding = self._encode(self.targets[idx], self.max_target_len)

        # Use the target's attention mask to find padding, then hide those
        # positions from the loss.
        target_is_padding = target_encoding['attention_mask'].flatten() == 0
        labels = target_encoding['input_ids'].flatten().masked_fill(
            target_is_padding, self.IGNORE_INDEX
        )

        return {
            'input_ids': input_encoding['input_ids'].flatten(),
            'attention_mask': input_encoding['attention_mask'].flatten(),
            'labels': labels
        }

# Wrap each split as a dataset: feature rows and risk-score targets are
# handed over as plain Python lists and tokenized lazily on access
train_dataset = Seq2SeqDataset(
    inputs=X_train.values.tolist(),
    targets=y_train.values.tolist(),
    tokenizer=tokenizer,
)
test_dataset = Seq2SeqDataset(
    inputs=X_test.values.tolist(),
    targets=y_test.values.tolist(),
    tokenizer=tokenizer,
)

In [None]:
# Define the model: pretrained BART (large) moved to the selected device
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large').to(device)

# Apply LoRA: train low-rank adapters on the attention query/value
# projections instead of updating the full model
lora_config = LoraConfig(
    # Declare the task so PEFT wraps the model with its seq2seq-specific
    # wrapper rather than the generic one
    task_type=TaskType.SEQ_2_SEQ_LM,
    r=8,  # Rank of the low-rank update matrices
    lora_alpha=32,  # Scaling factor applied to the LoRA update
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    bias="none"  # Leave bias terms frozen
)
model = get_peft_model(model, lora_config)

# Set training arguments
# With batch size 8 and 16 accumulation steps, each optimizer update sees
# 8 * 16 = 128 examples per device.
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",  # Output directory
    overwrite_output_dir=True,  # Overwrite the content of the output directory
    num_train_epochs=3,  # Number of training epochs
    per_device_train_batch_size=8,  # Batch size per device during training
    gradient_accumulation_steps=16,  # Accumulate gradients over multiple steps
    learning_rate=5e-5,  # Learning rate
    save_steps=2000,  # Save checkpoint every 2000 steps
    save_total_limit=2,  # Limit the total amount of checkpoints
    eval_strategy="steps",  # Evaluate periodically during training, every `eval_steps`
    logging_steps=500,  # Log every 500 steps
    eval_steps=2000,  # Evaluate every 2000 steps (same interval as `save_steps`)
    load_best_model_at_end=True,  # Load the best model at the end of training
    metric_for_best_model="loss",  # Use loss to evaluate the best model
    predict_with_generate=True,  # Use generation for evaluation
    fp16=fp16,  # Mixed precision only when CUDA was detected (False on MPS/CPU)
)

# Gather everything the trainer needs: the LoRA-wrapped model, the training
# arguments, both dataset splits and the tokenizer
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
)
trainer = Seq2SeqTrainer(**trainer_kwargs)

# Run the fine-tuning loop
trainer.train()


In [None]:
# Save the model and the tokenizer side by side
model.save_pretrained('bart_risk_score_model')
tokenizer.save_pretrained('bart_risk_score_model')

# Export the model to ONNX
# Token ids must stay integer (long) tensors because they index the embedding
# table, and the attention mask must have the same shape as the ids. Both are
# taken straight from the tokenizer output rather than built by hand.
dummy_encoding = tokenizer("dummy input text", return_tensors="pt")
dummy_input_ids = dummy_encoding["input_ids"].to(device)
dummy_attention_mask = dummy_encoding["attention_mask"].to(device)
torch.onnx.export(
    model,
    (dummy_input_ids, dummy_attention_mask),
    "bart_risk_score_model.onnx",
    input_names=["input_ids", "attention_mask"],
    output_names=["output"],
    # Batch size and sequence length vary at inference time, so neither is
    # frozen to the dummy example's shape
    dynamic_axes={
        "input_ids": {0: "batch_size", 1: "sequence_length"},
        "attention_mask": {0: "batch_size", 1: "sequence_length"},
        "output": {0: "batch_size"},
    },
)
