**all neccessary library imports**

In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from time import time
import pickle

**import dataset and clean up data**

In [13]:
query_data = pd.read_csv("./data/sqli.csv", encoding='utf-8')

# remove duplicates from dataset
query_data.drop_duplicates(inplace=True)

# handle any missing value on dataset
query_data.dropna(inplace=True)

# check if there any null value in the dataset
query_data.isnull().sum()

Query    0
Label    0
dtype: int64

In [14]:
print(f"Safe Queries: {query_data['Label'].value_counts()[0]}")
print(f"Malicious Queries: {query_data['Label'].value_counts()[1]}")
print(f"Data shape: {query_data.shape}")
query_data.head(10)

Safe Queries: 19529
Malicious Queries: 11378
Data shape: (30907, 2)


Unnamed: 0,Query,Label
0,""" or pg_sleep ( __TIME__ ) --",1
1,create user name identified by pass123 tempora...,1
2,AND 1 = utl_inaddr.get_host_address ( ...,1
3,select * from users where id = '1' or @ @1 ...,1
4,"select * from users where id = 1 or 1#"" ( ...",1
5,select name from syscolumns where id = ...,1
6,select * from users where id = 1 +$+ or 1 =...,1
7,"1; ( load_file ( char ( 47,101,116,99,47...",1
8,select * from users where id = '1' or ||/1 ...,1
9,select * from users where id = '1' or \.<\ ...,1


#### data preprocessing

In [15]:
X = query_data['Query']
y = query_data['Label']

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=1) 
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [17]:
vectorizer.get_feature_names_out()

array(['00', '000001', '000003', ..., 'zzip', 'zzkx', 'ã½'], dtype=object)

In [18]:
X_train.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [19]:
X_test.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

> test different model

In [20]:
# training a model using decision tree algorithm
d_model = DecisionTreeClassifier()
start_time = time()
d_model.fit(X_train, y_train)
end_time = time()
d_pred = d_model.predict(X_test)
d_model_accuracy = accuracy_score(y_test, d_pred)
d_model_f1_score = f1_score(y_test, d_pred)
print(f"Accuracy: {d_model_accuracy}")
print(f"F1-Score: {d_model_f1_score}")
print(f"Training Time: {end_time-start_time:.2f} seconds")

Accuracy: 0.8040547827024696
F1-Score: 0.7934523132886212
Training Time: 5.98 seconds


In [21]:
# training a model using random forest tree algorithm
r_model = RandomForestClassifier()
start_time = time()
r_model.fit(X_train, y_train)
end_time = time()
r_pred = r_model.predict(X_test)
r_model_accuracy = accuracy_score(y_test, r_pred)
r_model_f1_score = f1_score(y_test, r_pred)
print(f"Accuracy: {r_model_accuracy}")
print(f"F1-Score: {r_model_f1_score}")
print(f"Training Time: {end_time-start_time:.2f} seconds")

Accuracy: 0.7996333441173299
F1-Score: 0.7896286231884058
Training Time: 14.26 seconds


In [22]:
# training a naive Bayes model
n_model = MultinomialNB()
start_time = time()
n_model.fit(X_train,y_train)
end_time = time()
n_pred = n_model.predict(X_test)
n_model_accuracy = accuracy_score(y_test, n_pred)
n_model_f1_score = f1_score(y_test, n_pred)
print(f"Accuracy: {n_model_accuracy}")
print(f"F1-Score: {n_model_f1_score}")
print(f"Training Time: {end_time-start_time:.2f} seconds")

Accuracy: 0.9716380890758115
F1-Score: 0.9618564176939812
Training Time: 0.01 seconds


> chosen support vector machine [SVM] model

In [23]:
# training a model using Support Vector Machine (SVM) algorithm
s_model = SVC()
start_time = time()
s_model.fit(X_train, y_train)
end_time = time()
s_pred = s_model.predict(X_test)
s_model_accuracy = accuracy_score(y_test, s_pred)
s_model_f1_score = f1_score(y_test, s_pred)
print(f"Accuracy: {s_model_accuracy}")
print(f"F1-Score: {s_model_f1_score}")
print(f"Training Time: {end_time-start_time:.2f} seconds")

Accuracy: 0.9921276825191416
F1-Score: 0.9894767190428139
Training Time: 26.54 seconds


#### model evaluation

#### Formulas

> accuracy = (TP + TN) / (TP + TN + FP + FN)

> Precision = TP / (TP + FP)

> Recall = TP / (TP + FN)

> F1-score = 2 * (Precision * Recall) / (Precision + Recall)

In [24]:
from sklearn.metrics import confusion_matrix

def model_eval(model_pred, model_name:str, y_test=y_test):
    confusion = confusion_matrix(y_test, model_pred)

    TP = confusion[1,1] # true positive
    TN = confusion[0,0] # true negative
    FP = confusion[1,0] # false positive
    FN = confusion[0,1] # false negative
    
    accuracy = (TP + TN) / (TP + TN + FP + FN)
    precision = TP / (TP + FP)
    recall = TP / (TP+FN)
    f1_score = 2 * (precision * recall) / (precision+recall)
    
    print(f"Model evaluation of {model_name}")
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1-score: {f1_score}")
    print("--------------------------------------")


In [25]:
model_eval(d_pred, "decision tree")
model_eval(r_pred, "random forest")
model_eval(s_pred, "support vection machine")
model_eval(n_pred, "naive bayes")

Model evaluation of decision tree
Accuracy: 0.8040547827024696
Precision: 0.9982837528604119
Recall: 0.658366345972458
F1-score: 0.7934523132886212
--------------------------------------
Model evaluation of random forest
Accuracy: 0.7996333441173299
Precision: 0.9974256292906178
Recall: 0.6534857571214393
F1-score: 0.7896286231884058
--------------------------------------
Model evaluation of support vection machine
Accuracy: 0.9921276825191416
Precision: 0.9816933638443935
Recall: 0.997384481255449
F1-score: 0.9894767190428139
--------------------------------------
Model evaluation of naive bayes
Accuracy: 0.9716380890758115
Precision: 0.948512585812357
Recall: 0.9755810532509561
F1-score: 0.9618564176939812
--------------------------------------


In [26]:
# save this training model for futher use
pickle.dump(s_model, open("model_svm.pkl", "wb"))

In [27]:
# load model and test 
model = pickle.load(open("model_svm.pkl", "rb"))

with open("sql_query.txt", "r") as file:
    lines = file.readlines()
    
user_given_query_data = pd.DataFrame(lines, columns=['Query'])

sql_query = vectorizer.transform(user_given_query_data['Query'])
results = model.predict(sql_query)

malicious = 0
safe = 0
for result in results:
    if result == 1:
        malicious = malicious + 1
    else:
        safe = safe + 1
        
print(f"Flag out malicious queries as malicious: {malicious}")
print(f"Flag out malicious queries as safe: {safe}")

Flag out malicious queries as malicious: 2
Flag out malicious queries as safe: 3
