In [7]:
#linear regression 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# 1. Load dataset
df = pd.read_csv("/kaggle/input/sql-injection-dataset/Modified_SQL_Dataset.csv")  # ensure columns: 'query', 'label'

# 2. Basic preprocessing
X = df['Query']
y = df['Label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 3. Convert text to TF-IDF features
vectorizer = TfidfVectorizer(analyzer='char_wb', ngram_range=(3,6))  # char-level n-grams effective for SQLi
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# 4. Train Logistic Regression
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train_vec, y_train)

# 5. Evaluate performance
y_pred = lr.predict(X_test_vec)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, digits=4))


Accuracy: 0.9898124191461837
              precision    recall  f1-score   support

           0     0.9851    0.9990    0.9920      3893
           1     0.9982    0.9742    0.9861      2291

    accuracy                         0.9898      6184
   macro avg     0.9916    0.9866    0.9890      6184
weighted avg     0.9899    0.9898    0.9898      6184



In [10]:
#logistic regression 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorization
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Logistic Regression model
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train_tfidf, y_train)

# Predictions
y_pred = log_reg.predict(X_test_tfidf)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9713777490297542

Confusion Matrix:
 [[3863   30]
 [ 147 2144]]

Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.99      0.98      3893
           1       0.99      0.94      0.96      2291

    accuracy                           0.97      6184
   macro avg       0.97      0.96      0.97      6184
weighted avg       0.97      0.97      0.97      6184



In [15]:
#Logistic Regression with TF-IDF
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Convert text data into TF-IDF vectors
vectorizer = TfidfVectorizer(max_features=5000)  
X_vectorized = vectorizer.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X_vectorized, y, test_size=0.2, random_state=42
)

# Logistic Regression model
model = LogisticRegression(max_iter=200)
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.9511642949547219

Classification Report:
               precision    recall  f1-score   support

           0       0.93      1.00      0.96      3893
           1       0.99      0.88      0.93      2291

    accuracy                           0.95      6184
   macro avg       0.96      0.94      0.95      6184
weighted avg       0.95      0.95      0.95      6184



In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Check columns
print(df.head())

# Convert text to numerical features
vectorizer = TfidfVectorizer(max_features=5000)
X_vec = vectorizer.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_vec, y, test_size=0.2, random_state=42)

# Decision Tree model
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)

# Predictions
y_pred = dt.predict(X_test)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


                                               Query  Label
0                  " or pg_sleep  (  __TIME__  )  --      1
1  create user name identified by pass123 tempora...      1
2   AND 1  =  utl_inaddr.get_host_address   (    ...      1
3   select * from users where id  =  '1' or @ @1 ...      1
4   select * from users where id  =  1 or 1#"  ( ...      1
Accuracy: 0.9930465717981889

Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      0.99      3893
           1       0.99      0.99      0.99      2291

    accuracy                           0.99      6184
   macro avg       0.99      0.99      0.99      6184
weighted avg       0.99      0.99      0.99      6184


Confusion Matrix:
 [[3881   12]
 [  31 2260]]


In [18]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score

# Convert text into numeric features
vectorizer = TfidfVectorizer(max_features=5000)
X_vec = vectorizer.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_vec, y, test_size=0.2, random_state=42)

# Decision Tree model
dt = DecisionTreeClassifier(criterion="entropy", max_depth=20, random_state=42)
dt.fit(X_train, y_train)

# Predictions
y_pred = dt.predict(X_test)

# Evaluation
print("Decision Tree Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Decision Tree Accuracy: 0.9925614489003881

Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      0.99      3893
           1       1.00      0.98      0.99      2291

    accuracy                           0.99      6184
   macro avg       0.99      0.99      0.99      6184
weighted avg       0.99      0.99      0.99      6184



In [19]:
# Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Initialize the model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

# Predictions
y_pred_rf = rf_model.predict(X_test)

# Evaluation
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print("\nClassification Report:\n", classification_report(y_test, y_pred_rf))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))


Random Forest Accuracy: 0.9954721862871928

Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      1.00      3893
           1       1.00      0.99      0.99      2291

    accuracy                           1.00      6184
   macro avg       1.00      0.99      1.00      6184
weighted avg       1.00      1.00      1.00      6184


Confusion Matrix:
 [[3890    3]
 [  25 2266]]


In [20]:
# Support Vector Machine (SVM) Classifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Initialize the model
svm_model = SVC(kernel='rbf', random_state=42)  # 'linear', 'poly', 'rbf', 'sigmoid'

# Train the model
svm_model.fit(X_train, y_train)

# Predictions
y_pred_svm = svm_model.predict(X_test)

# Evaluation
print("SVM Accuracy:", accuracy_score(y_test, y_pred_svm))
print("\nClassification Report:\n", classification_report(y_test, y_pred_svm))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_svm))


SVM Accuracy: 0.9859314359637775

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      3893
           1       0.98      0.98      0.98      2291

    accuracy                           0.99      6184
   macro avg       0.99      0.98      0.98      6184
weighted avg       0.99      0.99      0.99      6184


Confusion Matrix:
 [[3854   39]
 [  48 2243]]


In [21]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Create KNN model (you can tune n_neighbors)
knn_model = KNeighborsClassifier(n_neighbors=5)

# Train model
knn_model.fit(X_train, y_train)

# Predict
y_pred_knn = knn_model.predict(X_test)

# Evaluate
print("KNN Accuracy:", accuracy_score(y_test, y_pred_knn))
print("\nClassification Report:\n", classification_report(y_test, y_pred_knn))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_knn))


KNN Accuracy: 0.9094437257438551

Classification Report:
               precision    recall  f1-score   support

           0       0.88      1.00      0.93      3893
           1       1.00      0.76      0.86      2291

    accuracy                           0.91      6184
   macro avg       0.94      0.88      0.90      6184
weighted avg       0.92      0.91      0.91      6184


Confusion Matrix:
 [[3888    5]
 [ 555 1736]]


In [22]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Initialize MLP model
mlp_model = MLPClassifier(hidden_layer_sizes=(100,50), max_iter=500, random_state=42)

# Train the model
mlp_model.fit(X_train, y_train)

# Predict on test data
y_pred_mlp = mlp_model.predict(X_test)

# Evaluate performance
print("Multi-Layer Perceptron (Neural Network) Results:")
print(confusion_matrix(y_test, y_pred_mlp))
print(classification_report(y_test, y_pred_mlp))


Multi-Layer Perceptron (Neural Network) Results:
[[3844   49]
 [  38 2253]]
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      3893
           1       0.98      0.98      0.98      2291

    accuracy                           0.99      6184
   macro avg       0.98      0.99      0.98      6184
weighted avg       0.99      0.99      0.99      6184

