In [233]:
import pandas as pd

from sklearn.metrics import roc_auc_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [187]:
# Load the two raw corpora: SQL-injection samples and XSS samples.
sqli, xss = (
    pd.read_csv(csv_path)
    for csv_path in ("./sqli/SQLiV3.csv", "./xss/XSS_dataset.csv")
)

In [188]:
# (rows, columns) of each raw frame, XSS first.
(xss.shape, sqli.shape)

((13686, 3), (30919, 4))

In [189]:
# Peek at the first rows of the raw SQLi frame to see which columns were parsed.
sqli.head(n=5)

Unnamed: 0,Sentence,Label,Unnamed: 2,Unnamed: 3
0,""" or pg_sleep ( __TIME__ ) --",1.0,,
1,create user name identified by pass123 tempora...,,1.0,
2,AND 1 = utl_inaddr.get_host_address ( ...,1.0,,
3,select * from users where id = '1' or @ @1 ...,1.0,,
4,"select * from users where id = 1 or 1#"" ( ...",1.0,,


In [190]:
# Keep only the text and its label; any other parsed column is discarded.
sqli = sqli[[col for col in sqli.columns if col in ("Sentence", "Label")]]

In [191]:
# Inspect the SQLi frame after restricting it to the Sentence/Label columns.
# (Null check used during exploration: sqli['Sentence'].isna().sum())
sqli


Unnamed: 0,Sentence,Label
0,""" or pg_sleep ( __TIME__ ) --",1
1,create user name identified by pass123 tempora...,
2,AND 1 = utl_inaddr.get_host_address ( ...,1
3,select * from users where id = '1' or @ @1 ...,1
4,"select * from users where id = 1 or 1#"" ( ...",1
...,...,...
30914,DELETE FROM door WHERE grow = 'small',0
30915,DELETE FROM tomorrow,0
30916,SELECT wide ( s ) FROM west,0
30917,SELECT * FROM ( SELECT slide FROM breath ),0


In [192]:
# Discard rows that have no value in the Label column.
sqli = sqli.loc[sqli["Label"].notna()]

In [193]:
# Row/column count after dropping rows with a missing Label.
sqli.shape

(30664, 2)

In [194]:
# Keep only rows whose label is exactly the string "0" or "1".
# NOTE: the comparison is against strings, so this must run while Label is
# still text (i.e. before it is converted to integers).
sqli = sqli.loc[sqli["Label"].isin(["0", "1"])]

In [195]:
# (rows, columns) of the cleaned SQLi frame and the XSS frame.
(sqli.shape, xss.shape)

((30609, 2), (13686, 3))

In [209]:
# Convert the textual "0"/"1" labels to integers.
# `sqli` was produced by boolean filtering, so assigning a column into it in
# place can raise SettingWithCopyWarning; `assign` builds a fresh frame instead.
# `astype` is the vectorized equivalent of calling int() on every element, and
# the explicit 'int64' keeps the dtype platform-independent.
sqli = sqli.assign(Label=sqli['Label'].astype('int64'))

In [210]:
# Sanity check: both Label columns must share a dtype before the frames are combined.
xss["Label"].dtype == sqli["Label"].dtype

True

In [211]:
# Relabel XSS positives from 1 to -1 so they stay distinct from SQLi positives
# (1) once both datasets are combined; every other value becomes 0.
# A value of -1 is treated as "already relabelled", which makes the cell
# idempotent: running it a second time no longer turns every XSS sample into 0.
xss['Label'] = xss['Label'].isin([1, -1]).astype('int64') * -1

In [218]:
# Keep only the text and its label, mirroring the column selection applied to the SQLi frame.
xss = xss[[col for col in xss.columns if col in ('Sentence', 'Label')]]

In [219]:
# Stack the XSS rows on top of the SQLi rows. Each frame keeps its own row
# labels, so index values are not unique in the combined frame.
new_df = pd.concat(objs=[xss, sqli], axis=0)

In [220]:
# Preview the combined XSS + SQLi frame.
new_df

Unnamed: 0,Sentence,Label
0,"<li><a href=""/wiki/File:Socrates.png"" class=""i...",0
1,"<tt onmouseover=""alert(1)"">test</tt>",-1
2,"\t </span> <span class=""reference-text"">Steeri...",0
3,"\t </span> <span class=""reference-text""><cite ...",0
4,"\t </span>. <a href=""/wiki/Digital_object_iden...",0
...,...,...
30914,DELETE FROM door WHERE grow = 'small',0
30915,DELETE FROM tomorrow,0
30916,SELECT wide ( s ) FROM west,0
30917,SELECT * FROM ( SELECT slide FROM breath ),0


In [221]:
# Human-readable class names for the integer codes stored in the `Label` column.
mapper = dict([
    (1, "SQLInjection"),
    (-1, "XSS"),
    (0, "Normal"),
])
# Not applied here: the model is trained on the integer codes directly.
# To decode them for display: new_df['Label'].map(mapper)

In [223]:
# Hold out 15% of the samples for evaluation. Stratifying on the label keeps the
# class proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    new_df["Sentence"],
    new_df["Label"],
    stratify=new_df["Label"],
    test_size=0.15,
    random_state=42,
)

In [224]:
# Features and labels of the training split must have the same length.
(X_train.shape, y_train.shape)

((37650,), (37650,))

In [225]:
# TF-IDF features over n-grams of length 1 to 3.
#   lowercase=False -> the original casing of the payloads is preserved
#   min_df=0.05     -> a float threshold is a proportion: terms occurring in
#                      fewer than 5% of the documents are dropped
vector = TfidfVectorizer(
    ngram_range=(1, 3),
    min_df=0.05,
    lowercase=False,
)

In [226]:
# Learn the vocabulary/IDF weights from the training split only and vectorize it
# in the same pass (fit_transform avoids tokenizing the training corpus twice).
X_train_vec = vector.fit_transform(X_train)
# `vectorizer` is the same fitted object as `vector`; the alias is kept because
# later cells persist it under this name.
vectorizer = vector
# The test split is only transformed, never fitted on, so no test-set
# statistics leak into the features.
X_test_vec = vectorizer.transform(X_test)

In [227]:
from joblib import dump

# Persist the fitted vectorizer so that inference can reuse exactly the same
# vocabulary and IDF weights. The cell output is the list of written file names.
dump(vectorizer, filename="tfidfvec")

['tfidfvec']

In [228]:
from sklearn.linear_model import LogisticRegression
# Classifier for the three label codes (-1, 0, 1); no arguments are passed, so
# every hyper-parameter is left at the library default.
log_model = LogisticRegression()

In [229]:
# Train the classifier on the TF-IDF features of the training split.
log_model.fit(X=X_train_vec, y=y_train)

In [230]:
# Hard class predictions for the held-out split.
y_pred = log_model.predict(X=X_test_vec)

In [234]:
# Per-class precision / recall / F1 on the held-out split.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

          -1       1.00      0.98      0.99      1106
           0       0.97      1.00      0.99      3838
           1       0.99      0.96      0.97      1701

    accuracy                           0.98      6645
   macro avg       0.99      0.98      0.98      6645
weighted avg       0.98      0.98      0.98      6645



In [239]:
# Multiclass ROC AUC computed from the predicted class probabilities, averaged
# over all pairwise class combinations (one-vs-one).
roc_auc_score(
    y_true=y_test,
    y_score=log_model.predict_proba(X_test_vec),
    multi_class="ovo",
)

0.994918907409311

In [240]:
# Persist the trained classifier; it must be loaded together with the saved
# vectorizer, since it expects features produced by that exact vocabulary.
dump(log_model, filename="log_model")

['log_model']