# Telco Platform Engineering Security Operations <br> AI Insights Generation with Fine-Tuning of Google's T5 Model
Author: Fatih E. NAR <br>
This project aims to deliver security risk factor insights. <br>
Model Card: https://huggingface.co/google-t5/t5-base <br>

In [None]:
# Install the packages listed in requirements.txt (run once per environment,
# before executing the cells below).
%pip install -r requirements.txt

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments
import torch
import os
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt

# Select the compute device: CUDA if available, otherwise Apple MPS, otherwise CPU.
# Author's note: NVIDIA GPUs are recommended, since AI frameworks are generally
# best optimized for them.
# NOTE(review): `device` is assigned here but is not passed to the model or the
# trainer anywhere in this notebook — confirm whether it is still needed.
if torch.cuda.is_available():
    # NOTE(review): CUDA_LAUNCH_BLOCKING=1 is normally a debugging aid and is
    # expected to slow training down — confirm it is wanted for full runs.
    os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
    device = torch.device("cuda")
    torch.cuda.empty_cache()
    # 8 * 1024 = 8192, handed to the allocator as its max_split_size_mb option.
    max_memory_mb = 8 * 1024
    # NOTE(review): this variable is set after `import torch` has already run —
    # confirm the CUDA allocator still picks it up at this point.
    os.environ['PYTORCH_CUDA_ALLOC_CONF'] = f'max_split_size_mb:{max_memory_mb}'
# Check if MPS (Apple Silicon GPU) is available
elif torch.backends.mps.is_available():
    # NOTE(review): a ratio of 0.0 presumably removes the MPS memory ceiling,
    # which can destabilise the machine under memory pressure — confirm.
    os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0"
    # Presumably lets operators that MPS does not implement fall back to the CPU
    # — verify against the PyTorch MPS documentation.
    os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
    device = torch.device("mps")
else:
    # No accelerator detected: run on the CPU.
    device = torch.device("cpu")

# Load the dataset from a Parquet file.
# Parquet is a columnar storage format optimized for reading and writing data,
# which makes it a good choice for storing large datasets.
# The path is relative, so the notebook must be run from the directory that
# contains the `data/` folder.
data = pd.read_parquet('data/5G_SecOps_Data_100K.parquet')

# Preprocess the data
def preprocess_data(data):
    """Encode the categorical columns as integers and drop the identifier columns.

    The input frame is never modified: all work happens on the new frame
    returned by ``DataFrame.drop``, so the function can safely be called again
    on the same raw data (e.g. when a notebook cell is re-run).

    Args:
        data: DataFrame containing at least the columns
            'Service Configurations', 'Service Reachability',
            'RBAC Configuration', 'Cluster' and 'Namespace'.

    Returns:
        A new DataFrame without 'Cluster' and 'Namespace', in which the three
        categorical columns hold integer codes. Values that are not listed in
        a mapping become NaN (the behaviour of ``Series.map`` with a dict).

    Raises:
        KeyError: if one of the required columns is missing.
    """
    category_codes = {
        'Service Configurations': {'Poor': 0, 'Medium': 1, 'Good': 2},
        'Service Reachability': {'Internal': 0, 'External': 1},
        'RBAC Configuration': {'Low': 0, 'Medium': 1, 'High': 2},
    }

    # Drop the non-numeric identifier columns first: this yields a new frame,
    # so the column assignments below cannot leak into the caller's object.
    encoded = data.drop(columns=['Cluster', 'Namespace'])

    # Convert categorical variables to numeric codes.
    for column, codes in category_codes.items():
        encoded[column] = encoded[column].map(codes)

    return encoded

# Encode the categorical columns and drop the identifier columns.
data = preprocess_data(data)

# 'Risk Score' is the value the model learns to predict; every other
# remaining column is used as an input feature.
y = data['Risk Score']
X = data.drop(columns=['Risk Score'])

# Hold out 20% of the rows for testing (fixed seed for reproducibility) and
# cast every split to strings, because T5 is a text-to-text model.
X_train, X_test, y_train, y_test = (
    split.astype(str)
    for split in train_test_split(X, y, test_size=0.2, random_state=42)
)


In [None]:
class T5RegressionDataset(Dataset):
    """Map-style dataset that tokenizes (input, target) text pairs for a seq2seq model.

    Each item is a dict with 'input_ids', 'attention_mask' and 'labels', all
    1-D tensors. Padded positions in 'labels' are set to ``IGNORE_INDEX`` so
    they do not contribute to the training loss; without this the loss is
    dominated by trivially predictable padding tokens.

    Args:
        inputs: indexable sequence of feature records; each record is turned
            into text with ``str()``.
        targets: indexable sequence of target values aligned with ``inputs``;
            each value is turned into text with ``str()``.
        tokenizer: callable tokenizer returning 'input_ids' and
            'attention_mask' tensors when called with ``return_tensors="pt"``.
        max_len: length the input token sequence is padded/truncated to.
        target_max_len: length the target token sequence is padded/truncated
            to. Defaults to ``max_len`` when omitted; a much smaller value is
            usually enough for short numeric targets and saves memory.
    """

    # Label value skipped by the cross-entropy loss of the seq2seq models.
    IGNORE_INDEX = -100

    def __init__(self, inputs, targets, tokenizer, max_len=512, target_max_len=None):
        self.inputs = inputs
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.target_max_len = max_len if target_max_len is None else target_max_len

    def __len__(self):
        """Return the number of examples (the length of ``inputs``)."""
        return len(self.inputs)

    def __getitem__(self, idx):
        """Tokenize and return the example at position ``idx``."""
        input_text = str(self.inputs[idx])
        target_text = str(self.targets[idx])
        input_encoding = self.tokenizer(
            input_text, max_length=self.max_len, padding='max_length', truncation=True, return_tensors="pt"
        )
        target_encoding = self.tokenizer(
            target_text, max_length=self.target_max_len, padding='max_length', truncation=True, return_tensors="pt"
        )

        # Use the target's own attention mask to locate padding, then hide
        # those positions from the loss.
        labels = target_encoding['input_ids'].flatten()
        target_mask = target_encoding['attention_mask'].flatten()
        labels = labels.masked_fill(target_mask == 0, self.IGNORE_INDEX)

        return {
            'input_ids': input_encoding['input_ids'].flatten(),
            'attention_mask': input_encoding['attention_mask'].flatten(),
            'labels': labels,
        }

# Load the pretrained tokenizer that matches the T5 checkpoint used for the model.
tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-base")

# Wrap each split in a dataset; the frames/series are converted to plain
# Python lists so the dataset can index them by position.
train_dataset, test_dataset = (
    T5RegressionDataset(features.values.tolist(), labels.values.tolist(), tokenizer)
    for features, labels in ((X_train, y_train), (X_test, y_test))
)

In [None]:
# Model definition and fine-tuning.
# Per the author, BART and BERT were tried first and performed poorly, which
# is why the notebook uses Google's T5.
model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-base")

# Tuning hints from the author:
#   * On a slow GPU, lower the epoch count to 1; the author reports a loss of
#     roughly 0.05 at that point. Raise the epoch count if the loss is higher.
#   * If training ends with an out-of-memory error, reduce the training batch
#     size to 2 or 1.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the version pinned in requirements.txt.
hyperparameters = dict(
    output_dir="./results",
    num_train_epochs=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    learning_rate=3e-5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    evaluation_strategy="epoch",
    save_total_limit=1,
    predict_with_generate=True,
)
training_args = Seq2SeqTrainingArguments(**hyperparameters)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
)

# Fine-tune, then run one final evaluation pass over the test dataset.
trainer.train()
eval_results = trainer.evaluate()

# Report the evaluation metrics.
print("Evaluation results:")
print(eval_results)

In [None]:
# Visualize model performance: predicted vs. actual risk scores on the test set.
def _parse_risk_score(text):
    """Return ``text`` as a float, or NaN when the generated text is not a number."""
    try:
        return float(text)
    except ValueError:
        return float('nan')


def predict_and_plot(trainer, test_dataset, y_test):
    """Generate predictions for ``test_dataset`` and plot them against the actual values.

    Uses the module-level ``tokenizer`` to decode the generated token ids.

    Args:
        trainer: trainer whose ``predict`` returns generated token ids in its
            ``predictions`` field (i.e. configured with generation enabled).
        test_dataset: dataset to run prediction on.
        y_test: iterable of actual risk scores (numbers or numeric strings),
            in the same order as ``test_dataset``.

    Returns:
        None. The scatter plot is shown as a side effect.
    """
    # Make predictions
    predictions = trainer.predict(test_dataset)
    token_ids = predictions.predictions
    if isinstance(token_ids, tuple):
        # Some models return extra outputs; the generated ids come first.
        token_ids = token_ids[0]

    # Keep a strict (num_examples, seq_len) layout. A plain squeeze() would
    # drop the batch axis when the test set contains a single example.
    token_ids = np.asarray(token_ids)
    token_ids = token_ids.reshape(-1, token_ids.shape[-1])

    # Batches of different lengths are padded with -100 when they are
    # gathered; that value is not a valid token id, so swap it for the
    # tokenizer's pad token before decoding.
    token_ids = np.where(token_ids == -100, tokenizer.pad_token_id, token_ids)

    # Decode the predictions
    decoded = [tokenizer.decode(ids, skip_special_tokens=True) for ids in token_ids]
    predicted_risk_scores = np.array([_parse_risk_score(text) for text in decoded])

    # A single malformed generation should not abort the whole plot; such
    # points are NaN and are simply not drawn by matplotlib.
    unparsed = int(np.isnan(predicted_risk_scores).sum())
    if unparsed:
        print(f"Warning: {unparsed} prediction(s) could not be parsed as a number and are not plotted.")

    # Convert actual values to numpy array
    actual_risk_scores = np.array([float(value) for value in y_test])

    # Plot actual vs. predicted risk factors; the blue points form the
    # identity line that perfect predictions would fall on.
    plt.figure(figsize=(10, 6))
    plt.scatter(actual_risk_scores, actual_risk_scores, alpha=0.5, color='blue', label='Actual Risk Factors')
    plt.scatter(actual_risk_scores, predicted_risk_scores, alpha=0.5, color='red', label='Predicted Risk Factors')
    plt.xlabel("Actual Risk Factors")
    plt.ylabel("Predicted Risk Factors")
    plt.title("Actual vs. Predicted Risk Factors")
    plt.legend()
    plt.show()

# Predict and plot
predict_and_plot(trainer, test_dataset, y_test)


In [None]:
# Finally, persist the fine-tuned weights and the tokenizer.
model_save_path = 'model/t5_risk_score_model.pth'

# torch.save does not create missing parent directories, so make sure the
# target folder exists before writing the state dict.
os.makedirs(os.path.dirname(model_save_path), exist_ok=True)
torch.save(model.state_dict(), model_save_path)

# Save the tokenizer
tokenizer.save_pretrained('model/t5_risk_score_tokenizer')