In [1]:
# KNN - KNeighborsClassifier
def run_model(dataset_path, model, vectorizer, text_col='Query', label_col='Label',
              test_size=0.2, random_state=42):
    """Train `model` on one CSV dataset and print its hold-out evaluation.

    The CSV is read with pandas, split into stratified train/test parts,
    vectorized (the vectorizer is fitted on the training part only), and the
    fitted model is scored on the test part. Accuracy, the confusion matrix
    and the classification report are printed; nothing is returned.

    Parameters
    ----------
    dataset_path : str or path-like
        CSV file containing at least `text_col` and `label_col`.
    model : estimator
        Object exposing `fit(X, y)` and `predict(X)`. It is fitted in place.
    vectorizer : transformer
        Object exposing `fit_transform(X)` and `transform(X)`. It is fitted
        in place.
    text_col : str, default 'Query'
        Name of the column holding the raw text.
    label_col : str, default 'Label'
        Name of the column holding the class label.
    test_size : float, default 0.2
        Passed through to `train_test_split`.
    random_state : int, default 42
        Seed passed through to `train_test_split`.

    Raises
    ------
    KeyError
        If `text_col` or `label_col` is not a column of the CSV.
    """
    import pandas as pd
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

    # Load dataset
    df = pd.read_csv(dataset_path)
    absent = [col for col in (text_col, label_col) if col not in df.columns]
    if absent:
        raise KeyError(f"Column(s) {absent} not found in {dataset_path}")

    # Empty CSV cells are read as NaN; a NaN document cannot be vectorized and
    # a NaN label cannot be stratified on, so such rows are dropped. Clean
    # datasets are unaffected (same rows, same order, same split).
    df = df.dropna(subset=[text_col, label_col])
    X = df[text_col].astype(str)  # guard against non-string cells
    y = df[label_col]

    # Train-test split (stratified so both parts keep the class proportions)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=random_state
    )

    # Vectorization: fit on the training texts only, then reuse for the test texts
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

    # Train model
    model.fit(X_train_vec, y_train)

    # Predict and evaluate
    y_pred = model.predict(X_test_vec)
    print("=" * 80)
    print(f"Results for {model.__class__.__name__} on {dataset_path}")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred, digits=4))
    print("=" * 80)


In [2]:
# SetUp
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

# K-Nearest Neighbors classifier: 5 neighbours with equal ('uniform') votes;
# weights='distance' is the alternative worth trying. n_jobs=-1 asks
# scikit-learn to use all available cores for the neighbour search.
model = KNeighborsClassifier(n_neighbors=5, weights='uniform', n_jobs=-1)

# TF-IDF over character n-grams (3 to 6 characters, 'char_wb' analyzer),
# keeping at most 5000 features.
vectorizer = TfidfVectorizer(
    analyzer='char_wb',
    ngram_range=(3, 6),
    max_features=5000,
)

In [3]:
# Original (class-imbalanced) dataset
run_model(
    dataset_path="../Dataset/Raw/SQLi_Original_Raw.csv",
    model=model,
    vectorizer=vectorizer,
)

Results for KNeighborsClassifier on ../Dataset/Raw/SQLi_Original_Raw.csv
Accuracy: 0.9605433376455369
Confusion Matrix:
 [[3866   42]
 [ 202 2074]]
Classification Report:
               precision    recall  f1-score   support

           0     0.9503    0.9893    0.9694      3908
           1     0.9802    0.9112    0.9444      2276

    accuracy                         0.9605      6184
   macro avg     0.9652    0.9503    0.9569      6184
weighted avg     0.9613    0.9605    0.9602      6184



In [4]:
# RUS-balanced dataset (presumably random under-sampling; confirm against the data prep step)
run_model(
    dataset_path="../Dataset/Raw/SQLi_RUS_Raw.csv",
    model=model,
    vectorizer=vectorizer,
)

Results for KNeighborsClassifier on ../Dataset/Raw/SQLi_RUS_Raw.csv
Accuracy: 0.9352075554579398
Confusion Matrix:
 [[2222   55]
 [ 240 2036]]
Classification Report:
               precision    recall  f1-score   support

           0     0.9025    0.9758    0.9378      2277
           1     0.9737    0.8946    0.9324      2276

    accuracy                         0.9352      4553
   macro avg     0.9381    0.9352    0.9351      4553
weighted avg     0.9381    0.9352    0.9351      4553



In [5]:
# ROS-balanced dataset (presumably random over-sampling; confirm against the data prep step)
# NOTE(review): if rows were duplicated before this file was written, copies of the
# same query can land in both the train and the test split and inflate the scores.
run_model(
    dataset_path="../Dataset/Raw/SQLi_ROS_Raw.csv",
    model=model,
    vectorizer=vectorizer,
)

Results for KNeighborsClassifier on ../Dataset/Raw/SQLi_ROS_Raw.csv
Accuracy: 0.9564939219449776
Confusion Matrix:
 [[3800  108]
 [ 232 3675]]
Classification Report:
               precision    recall  f1-score   support

           0     0.9425    0.9724    0.9572      3908
           1     0.9715    0.9406    0.9558      3907

    accuracy                         0.9565      7815
   macro avg     0.9570    0.9565    0.9565      7815
weighted avg     0.9570    0.9565    0.9565      7815

