In [3]:
# Decision Tree experiment: shared train/evaluate helper
def run_model(dataset_path, model, vectorizer):
    """Train `model` on one CSV dataset and print hold-out evaluation metrics.

    The CSV is read with pandas, rows without a ``Query`` or ``Label`` value
    are discarded (a warning reports how many), and the remaining rows are
    split 80/20 with stratification on the label. `vectorizer` is fitted on
    the training queries only and then applied to the test queries; `model`
    is fitted on the vectorized training data and evaluated on the test part.

    Parameters
    ----------
    dataset_path
        Location of the CSV file, passed unchanged to ``pandas.read_csv``.
        The file must contain ``Query`` and ``Label`` columns.
    model
        Estimator exposing ``fit(X, y)`` and ``predict(X)``.
    vectorizer
        Text vectorizer exposing ``fit_transform(docs)`` and
        ``transform(docs)``.

    Returns
    -------
    None
        Accuracy, confusion matrix and classification report are printed.

    Raises
    ------
    KeyError
        If the ``Query`` or ``Label`` column is absent from the CSV.

    Notes
    -----
    `model` and `vectorizer` are fitted in place: ``fit`` / ``fit_transform``
    are called directly on the objects passed in.
    """
    import warnings

    import pandas as pd
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

    # Load dataset
    df = pd.read_csv(dataset_path)

    required = ["Query", "Label"]
    absent = [col for col in required if col not in df.columns]
    if absent:
        raise KeyError(f"{dataset_path} is missing required column(s): {absent}")

    # read_csv parses empty fields (and tokens such as 'NULL' or 'NA') as NaN.
    # Such rows would otherwise reach the vectorizer/estimator as non-string or
    # missing values, so they are removed here and the count is surfaced.
    rows_before = len(df)
    df = df.dropna(subset=required)
    rows_dropped = rows_before - len(df)
    if rows_dropped:
        warnings.warn(
            f"Dropped {rows_dropped} row(s) with missing Query/Label from {dataset_path}",
            stacklevel=2,
        )

    # Guarantee that every document handed to the vectorizer is a string.
    X = df["Query"].astype(str)
    y = df["Label"]

    # Train-test split (stratified so both parts keep the label proportions)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Vectorization: vocabulary is learned from the training queries only
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

    # Train model
    model.fit(X_train_vec, y_train)

    # Predict and evaluate
    y_pred = model.predict(X_test_vec)
    print("="*80)
    print(f"Results for {model.__class__.__name__} on {dataset_path}")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred, digits=4))
    print("="*80)


In [4]:
# Setup: vectorizer and model passed to the run_model calls below
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over character n-grams of length 3 to 6 ('char_wb' analyzer),
# with the vocabulary capped at 5000 features.
vectorizer = TfidfVectorizer(
    max_features=5000,
    ngram_range=(3, 6),
    analyzer='char_wb',
)

# Decision tree left at its default hyper-parameters; only the seed is fixed.
model = DecisionTreeClassifier(random_state=42)

In [5]:
# Original dataset, evaluated with its class imbalance left as-is
run_model(
    dataset_path="../Dataset/Raw/SQLi_Original_Raw.csv",
    model=model,
    vectorizer=vectorizer,
)

Results for DecisionTreeClassifier on ../Dataset/Raw/SQLi_Original_Raw.csv
Accuracy: 0.9945019404915912
Confusion Matrix:
 [[3899    9]
 [  25 2251]]
Classification Report:
               precision    recall  f1-score   support

           0     0.9936    0.9977    0.9957      3908
           1     0.9960    0.9890    0.9925      2276

    accuracy                         0.9945      6184
   macro avg     0.9948    0.9934    0.9941      6184
weighted avg     0.9945    0.9945    0.9945      6184



In [6]:
# Class-balanced dataset, RUS variant (presumably random under-sampling — confirm)
run_model(
    dataset_path="../Dataset/Raw/SQLi_RUS_Raw.csv",
    model=model,
    vectorizer=vectorizer,
)

Results for DecisionTreeClassifier on ../Dataset/Raw/SQLi_RUS_Raw.csv
Accuracy: 0.9912145837909071
Confusion Matrix:
 [[2262   15]
 [  25 2251]]
Classification Report:
               precision    recall  f1-score   support

           0     0.9891    0.9934    0.9912      2277
           1     0.9934    0.9890    0.9912      2276

    accuracy                         0.9912      4553
   macro avg     0.9912    0.9912    0.9912      4553
weighted avg     0.9912    0.9912    0.9912      4553



In [7]:
# Class-balanced dataset, ROS variant (presumably random over-sampling — confirm)
# NOTE(review): if rows were duplicated before this CSV was written, copies of
# the same query can land in both the train and the test part — verify.
run_model(
    dataset_path="../Dataset/Raw/SQLi_ROS_Raw.csv",
    model=model,
    vectorizer=vectorizer,
)

Results for DecisionTreeClassifier on ../Dataset/Raw/SQLi_ROS_Raw.csv
Accuracy: 0.9952655150351888
Confusion Matrix:
 [[3887   21]
 [  16 3891]]
Classification Report:
               precision    recall  f1-score   support

           0     0.9959    0.9946    0.9953      3908
           1     0.9946    0.9959    0.9953      3907

    accuracy                         0.9953      7815
   macro avg     0.9953    0.9953    0.9953      7815
weighted avg     0.9953    0.9953    0.9953      7815

