In [1]:
# Imports
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Check for GPU
# Select the CUDA device when PyTorch reports one as available; fall back to the CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print(f"Running on: {device}")
# Value of torch.version.cuda as exposed by the installed torch package.
print("CUDA version:", torch.version.cuda)

# import sys
# print("CUDA available:", torch.cuda.is_available())
# print("CUDA version:", torch.version.cuda)
# print("Torch version:", torch.__version__)
# print("Python executable:", sys.executable)


Running on: cuda
CUDA version: 12.1


In [7]:
# Load Pretrained Model
# Hugging Face Hub identifier of the SQL-injection classifier checkpoint.
MODEL_NAME = "cssupport/mobilebert-sql-injection-detect"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# trust_remote_code is intentionally NOT enabled: the checkpoint resolves to the
# built-in MobileBertForSequenceClassification class, so there is no reason to
# allow Python code shipped in the Hub repository to execute locally.
# use_safetensors=True asks for the safetensors weight format when loading.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, use_safetensors=True)
# Move the weights to the selected device before any inference is run.
model.to(device)
# Switch to evaluation mode; as the last expression of the cell this also echoes the module repr.
model.eval()

MobileBertForSequenceClassification(
  (mobilebert): MobileBertModel(
    (embeddings): MobileBertEmbeddings(
      (word_embeddings): Embedding(30522, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 512)
      (token_type_embeddings): Embedding(2, 512)
      (embedding_transformation): Linear(in_features=384, out_features=512, bias=True)
      (LayerNorm): NoNorm()
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): MobileBertEncoder(
      (layer): ModuleList(
        (0-23): 24 x MobileBertLayer(
          (attention): MobileBertAttention(
            (self): MobileBertSelfAttention(
              (query): Linear(in_features=128, out_features=128, bias=True)
              (key): Linear(in_features=128, out_features=128, bias=True)
              (value): Linear(in_features=512, out_features=128, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): MobileBertSelfOutput(
              (dense): Linear(in_fe

In [11]:
# Batch Prediction Function
def predict_batch(queries, batch_size=32, max_length=512):
    """Run inference on a collection of queries and return predictions (0=benign, 1=SQLi).

    Uses the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        queries: Query strings to classify. Objects exposing ``.tolist()``
            (e.g. a pandas Series or NumPy array) as well as any other
            iterable (list, tuple, generator) are accepted.
        batch_size: Number of queries tokenized and scored per forward pass.
            Must be positive.
        max_length: Token limit handed to the tokenizer; longer inputs are
            truncated to this length.

    Returns:
        list: One predicted class index per query, in input order.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    # range() rejects a zero step but silently yields nothing for a negative
    # one, which would return an empty prediction list; fail loudly for both.
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # Materialize the input once as a plain Python list so that slicing below is
    # purely positional and plain lists/tuples work as well as Series/arrays.
    texts = queries.tolist() if hasattr(queries, "tolist") else list(queries)

    preds = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        # Pad to the longest query in the batch and move the tensors to the model's device.
        inputs = tokenizer(batch, return_tensors='pt', padding=True, truncation=True, max_length=max_length).to(device)
        # Gradients are not needed for inference.
        with torch.no_grad():
            outputs = model(**inputs)
            # The predicted class is the index of the largest logit in each row.
            batch_preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()
        preds.extend(batch_preds)
    return preds

In [12]:
# Evaluation Function
def evaluate_pretrained_model(dataset_path):
    """Score the pretrained model on a labelled CSV file and print the metrics.

    The CSV at ``dataset_path`` is read with pandas; its ``Query`` column is
    passed to ``predict_batch`` and its ``Label`` column is used as ground
    truth. Accuracy, the confusion matrix and a 4-digit classification report
    are printed between two separator lines. Nothing is returned.
    """
    frame = pd.read_csv(dataset_path)
    queries, labels = frame['Query'], frame['Label']

    predicted = predict_batch(queries)

    # Each metric is computed lazily, right when its line is printed.
    sections = (
        ("Accuracy:", lambda: accuracy_score(labels, predicted)),
        ("Confusion Matrix:\n", lambda: confusion_matrix(labels, predicted)),
        ("Classification Report:\n", lambda: classification_report(labels, predicted, digits=4)),
    )

    rule = "=" * 80
    print(rule)
    print(f"Results for Pretrained MobileBERT on {dataset_path}")
    for heading, compute in sections:
        print(heading, compute())
    print(rule)

In [13]:
# Imbalanced: evaluate the pretrained model on the original raw dataset,
# whose two classes are not equally represented.
evaluate_pretrained_model("../Dataset/Raw/SQLi_Original_Raw.csv")

Results for Pretrained MobileBERT on ../Dataset/Raw/SQLi_Original_Raw.csv
Accuracy: 0.9935961706394126
Confusion Matrix:
 [[19482    55]
 [  143 11239]]
Classification Report:
               precision    recall  f1-score   support

           0     0.9927    0.9972    0.9949     19537
           1     0.9951    0.9874    0.9913     11382

    accuracy                         0.9936     30919
   macro avg     0.9939    0.9923    0.9931     30919
weighted avg     0.9936    0.9936    0.9936     30919



In [14]:
# RUS Balanced: evaluate the pretrained model on the RUS-balanced raw dataset.
# NOTE(review): RUS presumably stands for random under-sampling — confirm.
evaluate_pretrained_model("../Dataset/Raw/SQLi_RUS_Raw.csv")

Results for Pretrained MobileBERT on ../Dataset/Raw/SQLi_RUS_Raw.csv
Accuracy: 0.9924442101563873
Confusion Matrix:
 [[11353    29]
 [  143 11239]]
Classification Report:
               precision    recall  f1-score   support

           0     0.9876    0.9975    0.9925     11382
           1     0.9974    0.9874    0.9924     11382

    accuracy                         0.9924     22764
   macro avg     0.9925    0.9924    0.9924     22764
weighted avg     0.9925    0.9924    0.9924     22764



In [15]:
# ROS Balanced: evaluate the pretrained model on the ROS-balanced raw dataset.
# NOTE(review): ROS presumably stands for random over-sampling — confirm.
evaluate_pretrained_model("../Dataset/Raw/SQLi_ROS_Raw.csv")

Results for Pretrained MobileBERT on ../Dataset/Raw/SQLi_ROS_Raw.csv
Accuracy: 0.9923222603265599
Confusion Matrix:
 [[19482    55]
 [  245 19292]]
Classification Report:
               precision    recall  f1-score   support

           0     0.9876    0.9972    0.9924     19537
           1     0.9972    0.9875    0.9923     19537

    accuracy                         0.9923     39074
   macro avg     0.9924    0.9923    0.9923     39074
weighted avg     0.9924    0.9923    0.9923     39074

