# Telecom Revenue Assurance AI Model (Fine-Tuned BERT with {PEFT/LoRA})
Author: Fatih E. NAR

This notebook fine-tunes a BERT model with LoRA (Low-Rank Adaptation) to detect fraud in a telecom revenue assurance context. Training is tuned for low-capacity GPUs by using techniques such as gradient accumulation and mixed-precision training.

In [None]:
# Install Necessary Libraries
!pip install -r requirements.txt
!pip install transformers datasets accelerate peft bitsandbytes torch scikit-learn accelerate -U

In [None]:
# Import Libraries
import pandas as pd
import torch
import numpy as np
import lzma
import shutil
import bitsandbytes as bnb
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from peft import LoraConfig, get_peft_model
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from torch.optim import AdamW

# Decompress the dataset archive (.csv.xz -> .csv) next to the original file
xz_file_path = 'data/telecom_revass_data.csv.xz'
csv_file_path = 'data/telecom_revass_data.csv'
# One `with` statement owns both handles, so both are closed even if the copy fails;
# copyfileobj streams the data instead of reading the whole archive at once.
with lzma.open(xz_file_path, 'rb') as compressed, open(csv_file_path, 'wb') as extracted:
    shutil.copyfileobj(compressed, extracted)

model_name = 'bert-base-uncased'

# Check if MPS (Metal Performance Shaders) or CUDA is available
if torch.backends.mps.is_available():
    device = torch.device("mps")
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device)
elif torch.cuda.is_available():
    device = torch.device("cuda")
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device)
    optimizer = bnb.optim.AdamW8bit(model.parameters(), lr=5e-5)
    !nvidia-smi
else:
    device = torch.device("cpu")
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device)

print(f'Using device: {device}')

# Load your dataset
df = pd.read_csv(csv_file_path)


def _record_to_text(record):
    """Render one dataframe row as a single "Label: value, ..." sentence."""
    return (
        f"Call Duration: {record['Call_Duration']}, Data Usage: {record['Data_Usage']}, "
        f"SMS Count: {record['Sms_Count']}, Roaming Indicator: {record['Roaming_Indicator']}, "
        f"Mobile Wallet Use: {record['MobileWallet_Use']}, Plan Type: {record['Plan_Type']}, "
        f"Cost: {record['Cost']}, Cellular Location Distance: {record['Cellular_Location_Distance']}, "
        f"Personal Pin Used: {record['Personal_Pin_Used']}, Avg Call Duration: {record['Avg_Call_Duration']}, "
        f"Avg Data Usage: {record['Avg_Data_Usage']}, Avg Cost: {record['Avg_Cost']}"
    )


# Convert data to text format: one sentence per row, stored in a new 'text' column
df['text'] = df.apply(_record_to_text, axis=1)

# Hold out 30% for evaluation; stratify on 'Fraud' and fix the seed for reproducibility
train_df, test_df = train_test_split(
    df,
    test_size=0.3,
    random_state=42,
    stratify=df['Fraud'],
)

# Wrap only the text and the target column as Hugging Face Datasets
train_dataset = Dataset.from_pandas(train_df[['text', 'Fraud']])
test_dataset = Dataset.from_pandas(test_df[['text', 'Fraud']])

# Rename the target column to 'labels' (only when it is still called 'Fraud')
train_dataset = (
    train_dataset.rename_column('Fraud', 'labels')
    if 'Fraud' in train_dataset.column_names
    else train_dataset
)
test_dataset = (
    test_dataset.rename_column('Fraud', 'labels')
    if 'Fraud' in test_dataset.column_names
    else test_dataset
)

# Load pre-trained tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Apply LoRA to the model.
# task_type="SEQ_CLS" makes get_peft_model build the sequence-classification
# wrapper, which keeps the newly initialised classifier head trainable and
# stores it together with the adapter. Without a task type only the LoRA
# matrices are trainable, so the head would stay at its random initialisation.
config = LoraConfig(
    task_type="SEQ_CLS",
    r=4,
    lora_alpha=8,
    lora_dropout=0.2,
    bias="none",
)
model = get_peft_model(model, config)


In [None]:
# Tokenize the data
def tokenize_function(examples):
    """Tokenize the 'text' entries of a batch.

    Args:
        examples: Mapping with a 'text' entry holding the strings to encode.

    Returns:
        The result of calling the module-level ``tokenizer`` on those strings
        with truncation enabled and ``padding='max_length'``.
    """
    texts = examples['text']
    return tokenizer(texts, truncation=True, padding='max_length')

# Tokenize both splits, one batch of rows per call
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Set format for PyTorch: expose only the model inputs and the labels as tensors.
# set_format works in place, so looping over the two objects is enough.
for _split in (train_dataset, test_dataset):
    _split.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

import dataclasses

# Enable mixed precision (fp16) only when CUDA is available
use_fp16 = torch.cuda.is_available()

# transformers renamed `evaluation_strategy` to `eval_strategy`, and recent
# releases reject the old keyword. Use whichever name the installed version
# declares so the cell runs on both older and newer installs.
_eval_strategy_kwarg = (
    'eval_strategy'
    if any(f.name == 'eval_strategy' for f in dataclasses.fields(TrainingArguments))
    else 'evaluation_strategy'
)

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    learning_rate=2e-5,
    per_device_train_batch_size=8,  # Small per-step batch for low-capacity GPUs
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=8,  # Accumulate 8 batches before each optimizer step
    push_to_hub=False,
    logging_dir='./logs',
    logging_steps=10,
    fp16=use_fp16,  # Mixed precision training when CUDA is available
    **{_eval_strategy_kwarg: 'epoch'},  # Evaluate at the end of every epoch
)

# Define custom compute_metrics function
def compute_metrics(pred):
    """Compute classification metrics for one evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (reference classes) and
            ``predictions`` (per-class scores); the predicted class is the
            arg-max over the last axis.

    Returns:
        Dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
        Precision, recall and F1 use ``average='weighted'``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    # The fourth element (support) is not reported.
    weighted_scores = precision_recall_fscore_support(y_true, y_pred, average='weighted')
    precision, recall, f1 = weighted_scores[:3]
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': f1,
        'precision': precision,
        'recall': recall,
    }

In [None]:
# Initialize the Trainer
import inspect

# Debug-sized subsets. The sizes are clamped to the dataset length so that a
# small dataset does not make select() fail with an out-of-range index.
# To train on everything, pass train_dataset / test_dataset directly instead.
train_subset = train_dataset.select(range(min(1000, len(train_dataset))))
eval_subset = test_dataset.select(range(min(200, len(test_dataset))))

trainer_args = {
    'model': model,
    'args': training_args,
    'compute_metrics': compute_metrics,
    'train_dataset': train_subset,
    'eval_dataset': eval_subset,
}

# Trainer's `tokenizer` argument was renamed to `processing_class` in newer
# transformers releases; use whichever keyword the installed version accepts.
_tokenizer_kwarg = (
    'processing_class'
    if 'processing_class' in inspect.signature(Trainer.__init__).parameters
    else 'tokenizer'
)
trainer_args[_tokenizer_kwarg] = tokenizer

# On NVIDIA GPUs hand the Trainer an explicit AdamW; the `None` scheduler slot
# lets the Trainer create its own learning-rate schedule.
# device.type also matches an indexed device such as "cuda:0".
if device.type == 'cuda':
    trainer_args['optimizers'] = (AdamW(model.parameters(), lr=2e-5), None)

trainer = Trainer(**trainer_args)

# Train the model
trainer.train()

In [None]:
# Run evaluation with the Trainer configured above
results = trainer.evaluate()

# Report every returned metric on its own line, rounded to four decimals
for metric_name, metric_value in results.items():
    print(f"{metric_name}: {metric_value:.4f}")

In [None]:
# Save the fine-tuned model and its tokenizer into the same directory
model_save_path = 'models/fine-tuned-bert-perf-revass'
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)

# Export the model to ONNX format.
# The dummy tensors only provide shapes for the export: a single row whose
# length is taken from the first training sample.
_first_sample = train_dataset[0]
dummy_input = {
    name: torch.zeros(1, _first_sample[name].shape[0], dtype=torch.long).to(device)
    for name in ("input_ids", "attention_mask")
}

onnx_model_path = 'models/fine-tuned-bert-revass.onnx'
# Only the batch dimension (axis 0) is declared dynamic for inputs and output.
batch_axis = {0: "batch_size"}
torch.onnx.export(
    model,
    (dummy_input["input_ids"], dummy_input["attention_mask"]),
    onnx_model_path,
    input_names=["input_ids", "attention_mask"],
    output_names=["logits"],
    dynamic_axes={
        "input_ids": dict(batch_axis),
        "attention_mask": dict(batch_axis),
        "logits": dict(batch_axis),
    },
    opset_version=14,  # Use opset version 14
)

print(f"Model exported to {onnx_model_path}")