## Telecom Revenue Assurance AI Model (Transformer-NN From Dust) 
Author: Fatih E. NAR

In [None]:
## Install dependencies
!pip install -r requirements.txt
!pip install torch transformers

In [None]:
import lzma
import os
import shutil

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from torch.utils.data import DataLoader, TensorDataset

# Path of the plain CSV that the archive is unpacked to (and then read from)
data_path = "data/telecom_revass_data.csv"

# Decompress the .xz archive next to it, streaming bytes from the archive
# into the CSV file
with lzma.open('data/telecom_revass_data.csv.xz', 'rb') as compressed, \
        open(data_path, 'wb') as extracted:
    shutil.copyfileobj(compressed, extracted)

# Load the synthetic telecom data into a DataFrame
df = pd.read_csv(data_path)

# Separate the target column ('Fraud') from the feature columns
X = df.drop('Fraud', axis=1)
y = df['Fraud']

# Identify categorical and numerical columns by dtype
categorical_cols = X.select_dtypes(include=['object', 'category']).columns
numerical_cols = X.select_dtypes(include=(['number'])).columns

# Preprocessing for numerical data: standardize each column
numerical_transformer = Pipeline(steps=[('scaler', StandardScaler())])

# Preprocessing for categorical data: one-hot encode; categories not seen
# during fit are ignored at transform time instead of raising
categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Bundle preprocessing for numerical and categorical data.
# sparse_threshold=0 forces a dense NumPy array as output. With the default
# threshold the stacked result becomes a SciPy sparse matrix whenever the
# one-hot block makes it sparse enough, and torch.tensor(...) cannot consume
# that.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ],
    sparse_threshold=0)

# Split the data into training and testing sets (70/30), stratified on the
# target so both splits keep the same class proportions
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Fit the preprocessor on the training data only, so no statistics from the
# test split leak into the scaler/encoder
preprocessor.fit(X_train)

# Transform the features of both splits with the fitted preprocessor
X_train_transformed = preprocessor.transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

# Convert the transformed feature matrices to float32 tensors
tensor_X_train = torch.tensor(X_train_transformed, dtype=torch.float32)
tensor_X_test = torch.tensor(X_test_transformed, dtype=torch.float32)

# Convert the labels to float32 and add a trailing axis so each label tensor
# has shape (n_samples, 1)
tensor_y_train = torch.unsqueeze(torch.tensor(y_train.values, dtype=torch.float32), 1)
tensor_y_test = torch.unsqueeze(torch.tensor(y_test.values, dtype=torch.float32), 1)

# Pair features with labels as PyTorch datasets
train_dataset = TensorDataset(tensor_X_train, tensor_y_train)
test_dataset = TensorDataset(tensor_X_test, tensor_y_test)

In [None]:
# Pick the compute device by priority: MPS (Metal Performance Shaders) first,
# then CUDA, and CPU as the fallback when neither accelerator is available
if torch.backends.mps.is_available():
    device_name = "mps"
elif torch.cuda.is_available():
    device_name = "cuda"
else:
    device_name = "cpu"
device = torch.device(device_name)
print(f"Using device: {device}")

## Model Definition and Training using Transformer Architecture

In [None]:
# Define model architecture
class SimpleTransformerModel(nn.Module):
    """Transformer-encoder classifier over a single feature vector per sample.

    Each input row is linearly projected to ``d_model`` features, treated as a
    sequence of length one, passed through a two-layer Transformer encoder,
    averaged over the sequence axis and mapped to ``output_dim`` values.

    Args:
        input_dim: Number of features in each input row.
        d_model: Width of the projected representation used by the encoder.
        nhead: Number of attention heads in each encoder layer.
        output_dim: Number of values produced per sample.
    """

    def __init__(self, input_dim, d_model, nhead, output_dim):
        super().__init__()
        self.input_projection = nn.Linear(in_features=input_dim, out_features=d_model)
        # The template layer is kept as an attribute, so its parameters are
        # registered on this module and show up in the state dict. forward()
        # never calls it directly; it only goes through transformer_encoder,
        # which builds its own copies of the layer.
        self.encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            batch_first=True,
        )
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=2)
        self.fc = nn.Linear(in_features=d_model, out_features=output_dim)

    def forward(self, x):
        """Run the model on a batch.

        Args:
            x: Batch of input rows, shape ``(batch_size, input_dim)``.

        Returns:
            Tensor of shape ``(batch_size,)`` when ``output_dim`` is 1; for any
            other ``output_dim`` the squeeze is a no-op and the shape is
            ``(batch_size, output_dim)``.
        """
        projected = self.input_projection(x)
        # Insert a sequence axis of length one: (batch, d_model) -> (batch, 1, d_model)
        as_sequence = projected.unsqueeze(1)
        encoded = self.transformer_encoder(as_sequence)
        # Average over the sequence axis, keeping the batch axis
        pooled = encoded.mean(dim=1)
        scores = self.fc(pooled)
        return scores.squeeze(1)

# Training parameters
input_dim = X_train_transformed.shape[1]  # number of columns after preprocessing
d_model = 32
nhead = 2
output_dim = 1  # Output should be a single value for binary classification
num_epochs = 10
batch_size = 64

# Seed torch's random number generator so weight initialisation and the
# DataLoader shuffling order are repeatable across Restart & Run All.
# (Some accelerator kernels may still be non-deterministic.)
SEED = 42
torch.manual_seed(SEED)

# Initialize the model and move it to the selected device *before* building
# the optimizer, so the optimizer is created over the parameters in their
# final location.
model = SimpleTransformerModel(input_dim, d_model, nhead, output_dim).to(device)

# BCEWithLogitsLoss applies the sigmoid internally, so the model outputs raw
# logits
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Mini-batch iterator over the training set, reshuffled every epoch
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Training loop: one full pass over train_dataloader per epoch
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0  # sum of per-sample losses seen this epoch
    samples_seen = 0
    for X_batch, y_batch in train_dataloader:
        # Move data to the same device as the model
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch.squeeze(1))  # Ensure target is of shape (batch_size)
        loss.backward()
        optimizer.step()
        # criterion returns the batch mean, so weight it by the batch size to
        # get a correct epoch average even when the last batch is smaller
        batch_len = X_batch.size(0)
        running_loss += loss.item() * batch_len
        samples_seen += batch_len
    # Report the mean loss over the whole epoch rather than only the final
    # mini-batch's loss; NaN if the loader produced no batches
    epoch_loss = running_loss / samples_seen if samples_seen else float("nan")
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss}")

## Model Evaluation

In [None]:
# Evaluation on the test set
model.eval()  # switch off dropout for inference
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
test_loss = 0.0  # accumulates the sum of per-sample losses
correct = 0      # accumulates the number of correct predictions
with torch.no_grad():  # no gradients needed while evaluating
    for X_batch, y_batch in test_dataloader:
        # Move data to the same device as the model
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)
        targets = y_batch.squeeze(1)  # shape (batch_size,) to match the model output
        outputs = model(X_batch)
        loss = criterion(outputs, targets)
        # criterion returns the batch mean; weight by batch size so a smaller
        # final batch does not get over-weighted in the overall average
        test_loss += loss.item() * X_batch.size(0)
        # Logits -> probabilities -> hard 0/1 predictions at a 0.5 threshold
        preds = (torch.sigmoid(outputs) > 0.5).float()
        correct += (preds == targets).sum().item()

# Per-sample averages over the whole test set
test_loss /= len(test_dataset)
accuracy = correct / len(test_dataset)
print(f"Test Loss: {test_loss}, Test Accuracy: {accuracy}")


## Saving the Model

In [None]:
# Save the model's learned parameters (state dict only, not the class itself)
model_path = 'models/revass_transformer_model.pth'

# torch.save does not create missing parent directories, so make sure the
# target directory exists before writing
model_dir = os.path.dirname(model_path)
if model_dir:
    os.makedirs(model_dir, exist_ok=True)

torch.save(model.state_dict(), model_path)
print(f"Model saved to {model_path}")