**all necessary library imports**

In [3]:
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

**import dataset and clean up data**

In [4]:
# Load the labelled query dataset, then discard duplicate rows and any rows
# that contain missing values, all in one non-mutating chain.
query_data = (
    pd.read_csv("./data/sqli.csv")
    .drop_duplicates()
    .dropna()
)

# Show the cleaned frame as the cell output.
query_data

Unnamed: 0,Query,Label
0,""" or pg_sleep ( __TIME__ ) --",1
1,create user name identified by pass123 tempora...,1
2,AND 1 = utl_inaddr.get_host_address ( ...,1
3,select * from users where id = '1' or @ @1 ...,1
4,"select * from users where id = 1 or 1#"" ( ...",1
...,...,...
30914,DELETE FROM door WHERE grow = 'small',0
30915,DELETE FROM tomorrow,0
30916,SELECT wide ( s ) FROM west,0
30917,SELECT * FROM ( SELECT slide FROM breath ),0


In [5]:
# Separate the input column (query text) from the target column (label).
X, y = query_data['Query'], query_data['Label']

In [6]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

In [7]:
# Convert the query strings into TF-IDF feature matrices.
# The vectorizer is fitted on the training split only and then merely applied
# to the test split, so nothing is learned from the held-out queries.
# NOTE(review): X_train / X_test are rebound from text to vectorized output, so
# re-running this cell without re-running the split cell would feed the
# already-vectorized data back into the vectorizer; re-run the split first.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [8]:
# Fit a multinomial naive Bayes classifier on the TF-IDF training features.
n_model = MultinomialNB().fit(X_train, y_train)

# Evaluate on the held-out split.
n_pred = n_model.predict(X_test)
n_model_accuracy = accuracy_score(y_test, n_pred)
n_model_f1_score = f1_score(y_test, n_pred)

print(
    f"Accuracy: {n_model_accuracy}",
    f"F1-Score: {n_model_f1_score}",
    sep="\n",
)

Accuracy: 0.9682950501455839
F1-Score: 0.9572176949941793


In [9]:
# Train a decision tree classifier on the TF-IDF training features.
# random_state is fixed (same seed as the train/test split) because the tree
# builder permutes candidate features at each split; without a seed the fitted
# tree, and therefore the scores printed below, can change between runs.
d_model = DecisionTreeClassifier(random_state=1)
d_model.fit(X_train, y_train)

# Evaluate on the held-out split.
d_pred = d_model.predict(X_test)
d_model_accuracy = accuracy_score(y_test, d_pred)
d_model_f1_score = f1_score(y_test, d_pred)
print(f"Accuracy: {d_model_accuracy}")
print(f"F1-Score: {d_model_f1_score}")

Accuracy: 0.7942413458427693
F1-Score: 0.779064381658175


In [10]:
# Train a random forest classifier on the TF-IDF training features.
# random_state is fixed (same seed as the train/test split) because the forest
# draws bootstrap samples and random feature subsets; without a seed the
# scores printed below are not reproducible from one run to the next.
r_model = RandomForestClassifier(random_state=1)
r_model.fit(X_train, y_train)

# Evaluate on the held-out split.
r_pred = r_model.predict(X_test)
r_model_accuracy = accuracy_score(y_test, r_pred)
r_model_f1_score = f1_score(y_test, r_pred)
print(f"Accuracy: {r_model_accuracy}")
print(f"F1-Score: {r_model_f1_score}")

Accuracy: 0.8009274237032245
F1-Score: 0.7901795862696066


In [11]:
# Fit a Support Vector Machine (SVC with its default settings) on the TF-IDF
# training features.
s_model = SVC().fit(X_train, y_train)

# Evaluate on the held-out split.
s_pred = s_model.predict(X_test)
s_model_accuracy = accuracy_score(y_test, s_pred)
s_model_f1_score = f1_score(y_test, s_pred)

print(
    f"Accuracy: {s_model_accuracy}",
    f"F1-Score: {s_model_f1_score}",
    sep="\n",
)

Accuracy: 0.8208778173190985
F1-Score: 0.8034552124008992
