In [1]:
# Random Forest
def run_model(dataset_path, model, vectorizer):
    """Train `model` on one CSV dataset and print hold-out evaluation metrics.

    The CSV is expected to contain a ``Query`` column (the text to vectorize)
    and a ``Label`` column (the class label). Rows are split 80/20 with
    stratification on the label, the vectorizer is fitted on the training
    queries only, and the model is fitted on the resulting training features.

    Parameters
    ----------
    dataset_path : str or path-like
        CSV file handed to ``pandas.read_csv``.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in
        place, so after the call it holds the fit for this dataset.
    vectorizer : transformer
        Object exposing ``fit_transform(X)`` and ``transform(X)``. It is also
        fitted in place.

    Returns
    -------
    None
        Accuracy, confusion matrix and classification report are printed.

    Raises
    ------
    KeyError
        If the ``Query`` or ``Label`` column is absent from the CSV.
    """
    import pandas as pd
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

    # Load dataset
    df = pd.read_csv(dataset_path)

    required = ['Query', 'Label']
    absent = [col for col in required if col not in df.columns]
    if absent:
        raise KeyError(f"{dataset_path} is missing required column(s): {absent}")

    # read_csv parses empty cells and tokens such as "NULL", "null", "NA" or
    # "None" as NaN. A NaN in Query makes the vectorizer raise, and a NaN in
    # Label breaks stratification/fitting, so drop those rows and say so.
    rows_before = len(df)
    df = df.dropna(subset=required)
    rows_dropped = rows_before - len(df)
    if rows_dropped:
        print(f"Dropped {rows_dropped} row(s) with missing Query/Label from {dataset_path}")

    # Cast to str so a query that was parsed as a number is still valid text input.
    X = df['Query'].astype(str)
    y = df['Label']

    # Train-test split (fixed seed so repeated runs use the same partition)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

    # Vectorization: fit on the training split only, then reuse that fit for the test split
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

    # Train model
    model.fit(X_train_vec, y_train)

    # Predict and evaluate
    y_pred = model.predict(X_test_vec)
    print("=" * 80)
    print(f"Results for {model.__class__.__name__} on {dataset_path}")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred, digits=4))
    print("=" * 80)


In [3]:
# SetUp
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

# Character n-gram TF-IDF features: n-grams of length 3 to 6 built with the
# 'char_wb' analyzer, keeping at most 5000 features.
vectorizer = TfidfVectorizer(
    analyzer='char_wb',
    ngram_range=(3, 6),
    max_features=5000,
)

# Random forest: 100 trees with no depth limit, a fixed seed for repeatable
# results, and n_jobs=-1 to use every available core.
model = RandomForestClassifier(n_estimators=100, max_depth=None, random_state=42, n_jobs=-1)

In [4]:
# Imbalanced (original class distribution)
# Evaluate on the original dataset file; keyword arguments make the role of
# each input explicit.
run_model(
    dataset_path="../Dataset/Raw/SQLi_Original_Raw.csv",
    model=model,
    vectorizer=vectorizer,
)

Results for RandomForestClassifier on ../Dataset/Raw/SQLi_Original_Raw.csv
Accuracy: 0.9961190168175937
Confusion Matrix:
 [[3907    1]
 [  23 2253]]
Classification Report:
               precision    recall  f1-score   support

           0     0.9941    0.9997    0.9969      3908
           1     0.9996    0.9899    0.9947      2276

    accuracy                         0.9961      6184
   macro avg     0.9969    0.9948    0.9958      6184
weighted avg     0.9961    0.9961    0.9961      6184



In [5]:
# RUS Balanced
# Reuses the same estimator and vectorizer objects; run_model refits both on
# this dataset's training split.
run_model(
    dataset_path="../Dataset/Raw/SQLi_RUS_Raw.csv",
    model=model,
    vectorizer=vectorizer,
)

Results for RandomForestClassifier on ../Dataset/Raw/SQLi_RUS_Raw.csv
Accuracy: 0.9934109378431804
Confusion Matrix:
 [[2271    6]
 [  24 2252]]
Classification Report:
               precision    recall  f1-score   support

           0     0.9895    0.9974    0.9934      2277
           1     0.9973    0.9895    0.9934      2276

    accuracy                         0.9934      4553
   macro avg     0.9934    0.9934    0.9934      4553
weighted avg     0.9934    0.9934    0.9934      4553



In [6]:
# ROS Balanced
# NOTE(review): if this CSV was oversampled before the train/test split that
# run_model performs, copies of the same row can land in both splits and
# inflate the metrics — confirm how the file was built.
run_model(
    dataset_path="../Dataset/Raw/SQLi_ROS_Raw.csv",
    model=model,
    vectorizer=vectorizer,
)

Results for RandomForestClassifier on ../Dataset/Raw/SQLi_ROS_Raw.csv
Accuracy: 0.9975687779910428
Confusion Matrix:
 [[3905    3]
 [  16 3891]]
Classification Report:
               precision    recall  f1-score   support

           0     0.9959    0.9992    0.9976      3908
           1     0.9992    0.9959    0.9976      3907

    accuracy                         0.9976      7815
   macro avg     0.9976    0.9976    0.9976      7815
weighted avg     0.9976    0.9976    0.9976      7815

