In [1]:
# imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import (
    accuracy_score, confusion_matrix, classification_report,
    precision_recall_curve, auc
)

In [2]:
# Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
# from sklearn.neural_network import MLPClassifier

In [3]:
# Single seed shared by the train/test split and by every estimator in this
# notebook that accepts a random_state, to keep reruns reproducible.
RANDOM_STATE = 42

In [4]:
def load_and_split(csv_path):
    """Read a labelled query CSV and return a stratified train/test split.

    Parameters
    ----------
    csv_path
        Anything ``pd.read_csv`` accepts; the file must provide a ``Query``
        column (cast to ``str``) and a ``Label`` column (cast to ``int``).

    Returns
    -------
    The four-element result of ``train_test_split``:
    ``X_train, X_test, y_train, y_test`` with 20% of the rows held out.
    """
    frame = pd.read_csv(csv_path)
    queries = frame['Query'].astype(str)
    labels = frame['Label'].astype(int)

    # Stratify on the labels and pin the seed so the split is repeatable.
    split = train_test_split(
        queries,
        labels,
        test_size=0.2,
        stratify=labels,
        random_state=RANDOM_STATE,
    )
    return split

In [5]:
def vectorize(X_train, X_test):
    """Convert raw query strings into character n-gram TF-IDF features.

    The vectorizer is fitted on ``X_train`` only and then applied to
    ``X_test``, so no test-set vocabulary or statistics reach the fit.

    Returns
    -------
    tuple
        ``(train_features, test_features, fitted_vectorizer)``.
    """
    # Word-boundary-aware character n-grams of length 3..6, capped at 5000 features.
    tfidf = TfidfVectorizer(
        analyzer='char_wb',
        ngram_range=(3, 6),
        max_features=5000,
    )
    train_features = tfidf.fit_transform(X_train)
    return train_features, tfidf.transform(X_test), tfidf

In [11]:
def get_models():
    """Build fresh, unfitted base classifiers for the voting ensembles.

    Returns
    -------
    dict
        Maps the short names ``'lr'``, ``'dt'``, ``'rf'``, ``'svc'`` and
        ``'knn'`` to new estimator instances. Estimators that accept a
        ``random_state`` receive ``RANDOM_STATE``.
    """
    seed = RANDOM_STATE

    logistic = LogisticRegression(max_iter=1000)
    tree = DecisionTreeClassifier(random_state=seed)
    forest = RandomForestClassifier(
        n_estimators=100,
        max_depth=None,
        random_state=seed,
        n_jobs=-1,
    )
    # probability=True is needed so the SVC can take part in soft voting.
    svm = SVC(kernel='linear', C=1.0, probability=True, random_state=seed)
    neighbours = KNeighborsClassifier(n_neighbors=5, weights='uniform', n_jobs=-1)

    # An MLP entry is intentionally left disabled:
    # 'mlp': MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, random_state=RANDOM_STATE)
    return {
        'lr': logistic,
        'dt': tree,
        'rf': forest,
        'svc': svm,
        'knn': neighbours,
    }

In [12]:
def print_eval(y_true, y_pred, y_prob=None):
    """Print accuracy, confusion matrix and classification report for predictions.

    Parameters
    ----------
    y_true : true labels.
    y_pred : predicted labels.
    y_prob : optional 2-D array of class probabilities; column 1 is used as
        the positive-class (label=1) score. When given, the PR-AUC is printed
        after the closing rule.

    Returns
    -------
    None. Everything is written to stdout.
    """
    rule = "=" * 80

    print(rule)
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))
    print("Classification Report:\n", classification_report(y_true, y_pred, digits=4))
    print(rule)

    # Nothing more to report when no probabilities were supplied.
    if y_prob is None:
        return

    # Area under the precision-recall curve for the positive class.
    positive_scores = y_prob[:, 1]
    precision, recall, _thresholds = precision_recall_curve(y_true, positive_scores)
    print(f"PR-AUC: {auc(recall, precision):.4f}")

In [17]:
def _fit_and_report(title, estimators, voting, X_train_vec, y_train, X_test_vec, y_test, weights=None):
    """Fit one VotingClassifier configuration and print its test-set evaluation."""
    ensemble = VotingClassifier(estimators=estimators, voting=voting, weights=weights, n_jobs=1)
    ensemble.fit(X_train_vec, y_train)
    y_pred = ensemble.predict(X_test_vec)
    # Hard voting does not expose reliable probabilities, so only soft voting
    # passes them on (which is what enables the PR-AUC line in print_eval).
    y_prob = ensemble.predict_proba(X_test_vec) if voting == 'soft' else None
    print(title)
    print_eval(y_test, y_pred, y_prob)
    print("-" * 70)


def run_ensembles(dataset_path):
    """Train and evaluate hard, soft and weighted-soft voting ensembles on one dataset.

    The CSV at ``dataset_path`` is loaded and split with ``load_and_split``,
    vectorized with ``vectorize``, and the base models from ``get_models`` are
    combined into three ``VotingClassifier`` configurations, in this order:

    1. hard voting (majority),
    2. soft voting (average probabilities),
    3. soft voting with per-model weights.

    Each configuration is fitted on the training split and its report is
    printed for the test split.

    Parameters
    ----------
    dataset_path : path of the CSV file to evaluate.

    Returns
    -------
    None. All results are written to stdout.
    """
    print("\n=== Dataset:", dataset_path, "===\n")
    X_train, X_test, y_train, y_test = load_and_split(dataset_path)
    # The fitted vectorizer itself is not needed here, only the matrices.
    X_train_vec, X_test_vec, _vectorizer = vectorize(X_train, X_test)
    # # Fix for sparse matrix read-only issue on Python 3.12 + SciPy
    # X_train_vec = X_train_vec.copy()

    models = get_models()

    # Estimator list shared by all three ensembles. Reuse is safe because
    # VotingClassifier.fit trains clones and leaves these instances unfitted.
    estimators = [(name, models[name]) for name in ['lr','dt','rf','svc','knn']]

    # Example weights for the weighted ensemble (tunable); RandomForest gets
    # the highest weight. Order must follow the estimator list.
    weight_map = {'lr':3, 'dt':2, 'rf':4, 'svc':2, 'knn':1}
    weights = [weight_map[name] for name, _model in estimators]

    # (title, voting mode, weights) for each ensemble, in reporting order.
    # Soft voting requires every estimator to support predict_proba.
    configurations = [
        (">> Hard Voting (majority):", 'hard', None),
        (">> Soft Voting (average probabilities):", 'soft', None),
        (f">> Weighted Voting (soft) - weights: {weight_map}", 'soft', weights),
    ]
    for title, voting, config_weights in configurations:
        _fit_and_report(
            title, estimators, voting,
            X_train_vec, y_train, X_test_vec, y_test,
            weights=config_weights,
        )

In [18]:
# Raw SQLi datasets to benchmark: the original data plus its RUS and ROS
# variants (presumably random under-/over-sampled, judging by the file names).
datasets = [
    "../Dataset/Raw/SQLi_Original_Raw.csv",
    "../Dataset/Raw/SQLi_RUS_Raw.csv",
    "../Dataset/Raw/SQLi_ROS_Raw.csv",
]

# Train and report every voting ensemble on each dataset in turn.
for path in datasets:
    run_ensembles(path)


=== Dataset: ../Dataset/Raw/SQLi_Original_Raw.csv ===

>> Hard Voting (majority):
Accuracy: 0.9953104786545925
Confusion Matrix:
 [[3907    1]
 [  28 2248]]
Classification Report:
               precision    recall  f1-score   support

           0     0.9929    0.9997    0.9963      3908
           1     0.9996    0.9877    0.9936      2276

    accuracy                         0.9953      6184
   macro avg     0.9962    0.9937    0.9949      6184
weighted avg     0.9953    0.9953    0.9953      6184

----------------------------------------------------------------------
>> Soft Voting (average probabilities):
Accuracy: 0.9957956015523933
Confusion Matrix:
 [[3907    1]
 [  25 2251]]
Classification Report:
               precision    recall  f1-score   support

           0     0.9936    0.9997    0.9967      3908
           1     0.9996    0.9890    0.9943      2276

    accuracy                         0.9958      6184
   macro avg     0.9966    0.9944    0.9955      6184
weighted 