In [108]:
import pandas as pd
from joblib import dump
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

In [109]:
# Load the raw SQL-injection dataset from the working directory.
df = pd.read_csv(filepath_or_buffer="SQLiV3.csv")

In [110]:
# Peek at the first five rows to see which columns the CSV produced.
df.head(n=5)

Unnamed: 0,Sentence,Label,Unnamed: 2,Unnamed: 3
0,""" or pg_sleep ( __TIME__ ) --",1.0,,
1,create user name identified by pass123 tempora...,,1.0,
2,AND 1 = utl_inaddr.get_host_address ( ...,1.0,,
3,select * from users where id = '1' or @ @1 ...,1.0,,
4,"select * from users where id = 1 or 1#"" ( ...",1.0,,


In [111]:
# Rows whose Sentence is exactly a single space character.
df.loc[df['Sentence'].eq(" ")]

Unnamed: 0,Sentence,Label,Unnamed: 2,Unnamed: 3
447,,drop table temp --,1.0,
19323,,1,,


In [112]:
# Frequency of every distinct value found in the Label column.
df['Label'].value_counts()

Label
0                                                                                                                                    19268
1                                                                                                                                    11341
 --                                                                                                                                     11
waitfor delay '0:0:__TIME__'--                                                                                                           5
 DROP TABLE Suppliers                                                                                                                    2
 SELECT * FROM Customers                                                                                                                 1
SELECT SUM ( Quantity )                                                                                                                  1
 SELECT column_name (

In [113]:
# Number of missing values per column in the raw frame.
df.isnull().sum()

Sentence         15
Label           255
Unnamed: 2    30613
Unnamed: 3    30910
dtype: int64

In [114]:
# Label distribution again, immediately before filtering to the valid classes.
df.loc[:, 'Label'].value_counts()

Label
0                                                                                                                                    19268
1                                                                                                                                    11341
 --                                                                                                                                     11
waitfor delay '0:0:__TIME__'--                                                                                                           5
 DROP TABLE Suppliers                                                                                                                    2
 SELECT * FROM Customers                                                                                                                 1
SELECT SUM ( Quantity )                                                                                                                  1
 SELECT column_name (

In [115]:
# Keep only rows whose Label is one of the two string classes "1" / "0";
# every other value (including missing ones) is discarded.
df = df[df['Label'].isin(["1", "0"])]

In [116]:
# Keep only the two modelling columns and drop rows missing either of them:
# a NaN Sentence would make the TF-IDF vectorizer raise when it is fitted.
# .copy() yields an independent frame, so assigning a column to it later
# does not emit SettingWithCopyWarning.
df = df.dropna(subset=['Label', 'Sentence'])[['Label', 'Sentence']].copy()

In [117]:
# Display the reduced frame (Label and Sentence columns only).
df


Unnamed: 0,Label,Sentence
0,1,""" or pg_sleep ( __TIME__ ) --"
2,1,AND 1 = utl_inaddr.get_host_address ( ...
3,1,select * from users where id = '1' or @ @1 ...
4,1,"select * from users where id = 1 or 1#"" ( ..."
5,1,select name from syscolumns where id = ...
...,...,...
30914,0,DELETE FROM door WHERE grow = 'small'
30915,0,DELETE FROM tomorrow
30916,0,SELECT wide ( s ) FROM west
30917,0,SELECT * FROM ( SELECT slide FROM breath )


In [118]:
# Confirm that no missing values remain in the reduced frame.
df.isnull().sum()

Label       0
Sentence    0
dtype: int64

In [119]:
# The surviving labels are the strings "0" / "1"; convert them to integers.
# astype is the vectorised equivalent of apply(int), and assign() builds a
# new frame rather than writing into a frame derived from an earlier slice.
df = df.assign(Label=df['Label'].astype('int64'))

In [120]:
# TF-IDF feature extractor; text is lower-cased before tokenisation.
vector = TfidfVectorizer(
    lowercase=True,
)

In [121]:
# 70/30 split with a fixed seed, stratified so both splits keep the same
# class balance as the full Label column.
X_train, X_test, y_train, y_test = train_test_split(
    df['Sentence'],
    df['Label'],
    test_size=0.3,
    random_state=0,
    stratify=df['Label'],
)

In [122]:
# Learn the vocabulary and IDF weights from the training split only, then
# project both splits through that same fitted vectorizer.
vectorizer = vector.fit(X_train)
X_train_vec = vectorizer.transform(X_train)
# Use the fitted handle here as well, so both splits go through one name.
X_test_vec = vectorizer.transform(X_test)

In [123]:
# Persist the fitted TF-IDF vectorizer to disk (dump is imported in the
# notebook's top import cell).
dump(vectorizer, "tfidfvec")

['tfidfvec']

In [124]:
# Logistic-regression classifier created with the library's default settings.
log_model = LogisticRegression()

In [125]:
# Train the classifier on the TF-IDF features of the training split.
log_model.fit(X=X_train_vec, y=y_train)

In [126]:
# Predicted labels for the held-out split.
y_pred = log_model.predict(X=X_test_vec)

In [127]:
# Per-class precision / recall / F1 on the held-out split.
print(metrics.classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.96      0.99      0.98      5781
           1       0.98      0.93      0.96      3402

    accuracy                           0.97      9183
   macro avg       0.97      0.96      0.97      9183
weighted avg       0.97      0.97      0.97      9183



In [128]:
# Confusion matrix for the held-out split: rows are true labels, columns are predictions.
print(metrics.confusion_matrix(y_true=y_test, y_pred=y_pred))

[[5719   62]
 [ 228 3174]]


In [129]:
# Persist the trained classifier to disk.
dump(value=log_model, filename="sqli_jblib")

['sqli_jblib']

In [130]:
# Chain the already-fitted vectorizer and classifier so raw sentences can be
# passed straight to predict(); the pipeline itself is not refit here.
pipe = make_pipeline(vectorizer, log_model)

In [131]:
# Check that the pipeline predicts directly from raw test sentences.
pipe.predict(X=X_test)

array([1, 0, 0, ..., 0, 0, 0])

In [132]:
# Persist the combined vectorizer + classifier pipeline to disk.
dump(value=pipe, filename="pipeline")

['pipeline']