In [None]:
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score


In [43]:
df = pd.read_csv('./dataset/SQLiV3.csv')

In [44]:
df.columns

Index(['Sentence', 'Label', 'Unnamed: 2', 'Unnamed: 3'], dtype='object')

In [45]:
df.head()

Unnamed: 0,Sentence,Label,Unnamed: 2,Unnamed: 3
0,""" or pg_sleep ( __TIME__ ) --",1.0,,
1,create user name identified by pass123 tempora...,,1.0,
2,AND 1 = utl_inaddr.get_host_address ( ...,1.0,,
3,select * from users where id = '1' or @ @1 ...,1.0,,
4,"select * from users where id = 1 or 1#"" ( ...",1.0,,


In [53]:
df = df[df['Label'].isin(['0', '1'])]

In [62]:
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3'])

In [63]:
df.head()

Unnamed: 0,Sentence,Label
0,""" or pg_sleep ( __TIME__ ) --",1
2,AND 1 = utl_inaddr.get_host_address ( ...,1
3,select * from users where id = '1' or @ @1 ...,1
4,"select * from users where id = 1 or 1#"" ( ...",1
5,select name from syscolumns where id = ...,1


In [67]:
df.duplicated().sum()

np.int64(12)

In [68]:
df = df.drop_duplicates()

In [69]:
df.duplicated().sum()

np.int64(0)

In [70]:
df

Unnamed: 0,Sentence,Label
0,""" or pg_sleep ( __TIME__ ) --",1
2,AND 1 = utl_inaddr.get_host_address ( ...,1
3,select * from users where id = '1' or @ @1 ...,1
4,"select * from users where id = 1 or 1#"" ( ...",1
5,select name from syscolumns where id = ...,1
...,...,...
30914,DELETE FROM door WHERE grow = 'small',0
30915,DELETE FROM tomorrow,0
30916,SELECT wide ( s ) FROM west,0
30917,SELECT * FROM ( SELECT slide FROM breath ),0


In [72]:
df.isnull().sum()

Sentence    0
Label       0
dtype: int64

In [73]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 30597 entries, 0 to 30918
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Sentence  30597 non-null  object
 1   Label     30597 non-null  object
dtypes: object(2)
memory usage: 717.1+ KB


In [74]:
df = df[df['Label'].isin(['0', '1'])]
df['Label'] = df['Label'].astype(int)

In [76]:
X = df['Sentence']
y = df['Label']

In [None]:
vectorizer = TfidfVectorizer()
X_vec = vectorizer.fit_transform(X)

In [79]:
X_train, X_test, y_train, y_test = train_test_split(
    X_vec, y, test_size=0.2, random_state=42
)

In [85]:
X_train

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 162459 stored elements and shape (24477, 24660)>

In [80]:
model = MultinomialNB()
model.fit(X_train, y_train)

In [81]:
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.95
              precision    recall  f1-score   support

           0       0.94      0.98      0.96      3832
           1       0.97      0.90      0.93      2288

    accuracy                           0.95      6120
   macro avg       0.95      0.94      0.95      6120
weighted avg       0.95      0.95      0.95      6120



In [82]:
new_queries = [
    "SELECT * FROM users WHERE id=1",
    "DROP TABLE accounts",
    "SELECT * FROM users WHERE username='' OR '1'='1'",
    "INSERT INTO logins VALUES ('user', 'pass')"
]

X_new = vectorizer.transform(new_queries)
predictions = model.predict(X_new)

for q, p in zip(new_queries, predictions):
    print(f"{q} → {'Malicious' if p == 1 else 'Normal'}")

SELECT * FROM users WHERE id=1 → Malicious
DROP TABLE accounts → Normal
SELECT * FROM users WHERE username='' OR '1'='1' → Malicious
INSERT INTO logins VALUES ('user', 'pass') → Normal
