In [42]:
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from nltk.corpus import stopwords
import pickle

In [2]:
df = pd.read_csv("XSS_dataset.csv", encoding="utf-8-sig")

In [4]:
df = df.drop(["Unnamed: 0"], axis=1)

In [13]:
X = df["Sentence"]
y = df["Label"]

In [14]:
vectorizer = TfidfVectorizer()
X_vec = vectorizer.fit_transform(X)

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X_vec, y, test_size=0.2, random_state=4242)

In [23]:
svc = SVC(kernel="linear")
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)
print("SVC: ", accuracy_score(y_test, y_pred))

SVC:  0.9963476990504018


In [30]:
models = {
    "SVC": SVC(kernel="linear"),
    "Adaboost": AdaBoostClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "KNN": KNeighborsClassifier(),
    "Logistic Regression": LogisticRegression()
}

In [31]:
for key, value in models.items():
    model = value
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(key, accuracy_score(y_test, y_pred))

SVC 0.9963476990504018
Adaboost 0.997808619430241
Decision Tree 0.9963476990504018
KNN 0.9649379108838568
Logistic Regression 0.995982468955442


In [37]:
xgb = XGBClassifier(n_estimators=100)
xgb.fit(X_train, y_train)
y_pred = xgb.predict(X_test)
print("XGB: ", accuracy_score(y_test, y_pred))

XGB:  0.9967129291453616


In [38]:
count_vectorizer = CountVectorizer(max_df=0.7, stop_words=stopwords.words("english"))
X_scaled = count_vectorizer.fit_transform(X.values.astype("U")).toarray()

In [39]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=4243)

In [40]:
for key, value in models.items():
    model = value
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(key, accuracy_score(y_test, y_pred))

SVC 0.9963476990504018
Adaboost 0.995982468955442
Decision Tree 0.9967129291453616
KNN 0.9952520087655223
Logistic Regression 0.9963476990504018


In [41]:
xgb = XGBClassifier(n_estimators=100)
xgb.fit(X_train, y_train)
y_pred = xgb.predict(X_test)
print("XGB: ", accuracy_score(y_test, y_pred))

XGB:  0.995982468955442


In [43]:
test = """<svg onunload svg onunload="javascript:javascript:alert(1)"></svg onunload>"""

In [48]:
test_scaled = count_vectorizer.transform([test])

In [49]:
xgb.predict(test_scaled)

array([1])

In [56]:
test2 = """<img>"""

In [57]:
test_scaled2 = count_vectorizer.transform([test2])

In [58]:
xgb.predict(test_scaled2)

array([0])