In [1]:
# Support Vector Machine (SVM) experiments: generic train/evaluate helper used by every run below
def run_model(dataset_path, model, vectorizer):
    """Fit ``vectorizer`` and ``model`` on one CSV dataset and print test metrics.

    The CSV at ``dataset_path`` is read with pandas; its ``Query`` column is
    used as the input text and its ``Label`` column as the target. The rows
    are split 80/20 (stratified on the label, ``random_state=42``), the
    vectorizer is fitted on the training part only, and the model is then
    fitted on the vectorized training data.

    Args:
        dataset_path: Path of the CSV file to load.
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``; ``fit`` is
            called on this very object.
        vectorizer: Text vectorizer exposing ``fit_transform`` and
            ``transform``; ``fit_transform`` is called on this very object.

    Returns:
        None. Accuracy, the confusion matrix and the classification report
        (4 digits) are printed between two 80-character ``=`` rules.
    """
    import pandas as pd
    from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
    from sklearn.model_selection import train_test_split

    # Read the dataset and pick out the text / target columns.
    frame = pd.read_csv(dataset_path)
    queries = frame['Query']
    labels = frame['Label']

    # Hold out 20% of the rows, preserving the class ratio in both parts.
    train_text, test_text, train_labels, test_labels = train_test_split(
        queries,
        labels,
        test_size=0.2,
        stratify=labels,
        random_state=42,
    )

    # The vocabulary / IDF weights come from the training text only.
    train_features = vectorizer.fit_transform(train_text)
    test_features = vectorizer.transform(test_text)

    model.fit(train_features, train_labels)
    predicted = model.predict(test_features)

    # Each metric is computed right before its line is printed.
    sections = (
        ("Accuracy:", lambda: accuracy_score(test_labels, predicted)),
        ("Confusion Matrix:\n", lambda: confusion_matrix(test_labels, predicted)),
        ("Classification Report:\n", lambda: classification_report(test_labels, predicted, digits=4)),
    )

    rule = "=" * 80
    print(rule)
    print(f"Results for {model.__class__.__name__} on {dataset_path}")
    for heading, compute in sections:
        print(heading, compute())
    print(rule)


In [2]:
# Setup: vectorizer and classifier passed to every run below
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over character n-grams of length 3 to 6 ('char_wb' analyzer), with the
# vocabulary capped at 5000 features; char-level n-grams suit SQLi detection.
vectorizer = TfidfVectorizer(
    analyzer='char_wb',
    ngram_range=(3, 6),
    max_features=5000,
)

# Support Vector Machine classifier; a linear kernel is usually best for text.
model = SVC(
    kernel='linear',
    C=1.0,
    random_state=42,
)

In [3]:
# Imbalanced dataset (original, un-resampled file)
run_model(
    dataset_path="../Dataset/Raw/SQLi_Original_Raw.csv",
    model=model,
    vectorizer=vectorizer,
)

Results for SVC on ../Dataset/Raw/SQLi_Original_Raw.csv
Accuracy: 0.9951487710219923
Confusion Matrix:
 [[3906    2]
 [  28 2248]]
Classification Report:
               precision    recall  f1-score   support

           0     0.9929    0.9995    0.9962      3908
           1     0.9991    0.9877    0.9934      2276

    accuracy                         0.9951      6184
   macro avg     0.9960    0.9936    0.9948      6184
weighted avg     0.9952    0.9951    0.9951      6184



In [4]:
# RUS-balanced dataset (presumably random under-sampling; confirm against the data-prep step)
run_model(
    dataset_path="../Dataset/Raw/SQLi_RUS_Raw.csv",
    model=model,
    vectorizer=vectorizer,
)

Results for SVC on ../Dataset/Raw/SQLi_RUS_Raw.csv
Accuracy: 0.9925323962222711
Confusion Matrix:
 [[2270    7]
 [  27 2249]]
Classification Report:
               precision    recall  f1-score   support

           0     0.9882    0.9969    0.9926      2277
           1     0.9969    0.9881    0.9925      2276

    accuracy                         0.9925      4553
   macro avg     0.9926    0.9925    0.9925      4553
weighted avg     0.9926    0.9925    0.9925      4553



In [5]:
# ROS-balanced dataset (presumably random over-sampling; confirm against the data-prep step)
# NOTE(review): run_model splits train/test after loading this file. If rows were
# duplicated by over-sampling before the file was written, copies of one query can
# land in both splits and inflate the scores; verify.
run_model(
    dataset_path="../Dataset/Raw/SQLi_ROS_Raw.csv",
    model=model,
    vectorizer=vectorizer,
)

Results for SVC on ../Dataset/Raw/SQLi_ROS_Raw.csv
Accuracy: 0.9950095969289827
Confusion Matrix:
 [[3904    4]
 [  35 3872]]
Classification Report:
               precision    recall  f1-score   support

           0     0.9911    0.9990    0.9950      3908
           1     0.9990    0.9910    0.9950      3907

    accuracy                         0.9950      7815
   macro avg     0.9950    0.9950    0.9950      7815
weighted avg     0.9950    0.9950    0.9950      7815

