In [1]:
# Logistic Regression
def run_model(dataset_path, model, vectorizer, *, text_col='Query', label_col='Label',
              test_size=0.2, random_state=42, stratify=False):
    """Train and evaluate a text classifier on a CSV dataset, printing the metrics.

    The CSV is loaded, split into train/test partitions, vectorized (the
    vectorizer is fitted on the training partition only, so no test text
    influences the vocabulary), and the model is fitted and scored on the
    held-out partition.

    Parameters
    ----------
    dataset_path : str or path-like
        CSV file to read with ``pandas.read_csv``.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in
        place, so the caller's object holds the trained state afterwards.
    vectorizer : transformer
        Object exposing ``fit_transform(X)`` and ``transform(X)``. It is also
        fitted in place.
    text_col : str, keyword-only
        Name of the column holding the raw text (default ``'Query'``).
    label_col : str, keyword-only
        Name of the column holding the class label (default ``'Label'``).
    test_size : float or int, keyword-only
        Forwarded to ``train_test_split`` (default ``0.2``).
    random_state : int or None, keyword-only
        Seed forwarded to ``train_test_split`` (default ``42``).
    stratify : bool, keyword-only
        When true, the split is stratified on the label column. Defaults to
        ``False``, which reproduces the plain random split.

    Returns
    -------
    None
        Accuracy, confusion matrix and classification report are printed.

    Raises
    ------
    KeyError
        If ``text_col`` or ``label_col`` is not a column of the CSV.
    """
    import pandas as pd
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

    # Load dataset
    df = pd.read_csv(dataset_path)

    # Fail early, naming the file and the columns that were actually found.
    missing = [col for col in (text_col, label_col) if col not in df.columns]
    if missing:
        raise KeyError(
            f"{dataset_path} is missing required column(s) {missing}; "
            f"available columns: {list(df.columns)}"
        )
    X = df[text_col]
    y = df[label_col]

    # Split
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
        stratify=y if stratify else None,
    )

    # Vectorize: fit on the training partition only, then reuse that vocabulary for the test partition
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

    # Train model
    model.fit(X_train_vec, y_train)

    # Predict & evaluate
    y_pred = model.predict(X_test_vec)
    print("="*80)
    print(f"\nResults for {model.__class__.__name__} on {dataset_path}")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred, digits=4))
    print("="*80)
    

In [2]:
# Setup
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over character n-grams of length 3 to 6 taken inside word boundaries,
# keeping at most 5000 features.
vectorizer = TfidfVectorizer(
    analyzer='char_wb',
    ngram_range=(3, 6),
    max_features=5000,
)

# Logistic regression allowed up to 1000 solver iterations.
model = LogisticRegression(
    max_iter=1000,
)

In [3]:
# Imbalanced case: the original dataset
run_model(
    dataset_path="../Dataset/Raw/SQLi_Original_Raw.csv",
    model=model,
    vectorizer=vectorizer,
)


Results for LogisticRegression on ../Dataset/Raw/SQLi_Original_Raw.csv
Accuracy: 0.9938551099611902
Confusion Matrix:
 [[3889    4]
 [  34 2257]]
Classification Report:
               precision    recall  f1-score   support

           0     0.9913    0.9990    0.9951      3893
           1     0.9982    0.9852    0.9917      2291

    accuracy                         0.9939      6184
   macro avg     0.9948    0.9921    0.9934      6184
weighted avg     0.9939    0.9939    0.9938      6184



In [4]:
# Balanced case: RUS (random under-sampling) dataset
run_model(
    dataset_path="../Dataset/Raw/SQLi_RUS_Raw.csv",
    model=model,
    vectorizer=vectorizer,
)


Results for LogisticRegression on ../Dataset/Raw/SQLi_RUS_Raw.csv
Accuracy: 0.9934109378431804
Confusion Matrix:
 [[2265    5]
 [  25 2258]]
Classification Report:
               precision    recall  f1-score   support

           0     0.9891    0.9978    0.9934      2270
           1     0.9978    0.9890    0.9934      2283

    accuracy                         0.9934      4553
   macro avg     0.9934    0.9934    0.9934      4553
weighted avg     0.9934    0.9934    0.9934      4553



In [5]:
# Balanced case: ROS (random over-sampling) dataset
run_model(
    dataset_path="../Dataset/Raw/SQLi_ROS_Raw.csv",
    model=model,
    vectorizer=vectorizer,
)


Results for LogisticRegression on ../Dataset/Raw/SQLi_ROS_Raw.csv
Accuracy: 0.9933461292386436
Confusion Matrix:
 [[3867    3]
 [  49 3896]]
Classification Report:
               precision    recall  f1-score   support

           0     0.9875    0.9992    0.9933      3870
           1     0.9992    0.9876    0.9934      3945

    accuracy                         0.9933      7815
   macro avg     0.9934    0.9934    0.9933      7815
weighted avg     0.9934    0.9933    0.9933      7815

