In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [2]:
# Load the labelled dataset; the CSV's stored index column comes through as "Unnamed: 0".
df = pd.read_csv("XSS_dataset.csv")

In [3]:
# Preview the first rows together with the frame's shape, shown as one tuple.
df.head(), df.shape

(   Unnamed: 0                                           Sentence  Label
 0           0  <li><a href="/wiki/File:Socrates.png" class="i...      0
 1           1               <tt onmouseover="alert(1)">test</tt>      1
 2           2  \t </span> <span class="reference-text">Steeri...      0
 3           3  \t </span> <span class="reference-text"><cite ...      0
 4           4  \t </span>. <a href="/wiki/Digital_object_iden...      0,
 (13686, 3))

In [4]:
# Drop placeholder rows whose text is exactly one space character.
has_text = df["Sentence"].ne(" ")
df = df[has_text]

In [5]:
# Inspect the frame after the single-space rows were removed.
df

Unnamed: 0.1,Unnamed: 0,Sentence,Label
0,0,"<li><a href=""/wiki/File:Socrates.png"" class=""i...",0
1,1,"<tt onmouseover=""alert(1)"">test</tt>",1
2,2,"\t </span> <span class=""reference-text"">Steeri...",0
3,3,"\t </span> <span class=""reference-text""><cite ...",0
4,4,"\t </span>. <a href=""/wiki/Digital_object_iden...",0
...,...,...,...
13681,13681,<img onpointerenter=alert(1)>XSS</img>,1
13682,13682,"<source onbeforepaste=""alert(1)"" contenteditab...",1
13683,13683,"<div draggable=""true"" contenteditable>drag me<...",1
13684,13684,"<li><cite id=""CITEREFDomingos2015"" class=""cita...",0


In [6]:
# Keep only the text and target columns (frame order preserved); every other
# column, including the saved index column, is dropped.
wanted_columns = {"Sentence", "Label"}
column_mask = [name in wanted_columns for name in df.columns]
df = df.loc[:, column_mask]

In [7]:
# Confirm that only the Sentence and Label columns remain.
df

Unnamed: 0,Sentence,Label
0,"<li><a href=""/wiki/File:Socrates.png"" class=""i...",0
1,"<tt onmouseover=""alert(1)"">test</tt>",1
2,"\t </span> <span class=""reference-text"">Steeri...",0
3,"\t </span> <span class=""reference-text""><cite ...",0
4,"\t </span>. <a href=""/wiki/Digital_object_iden...",0
...,...,...
13681,<img onpointerenter=alert(1)>XSS</img>,1
13682,"<source onbeforepaste=""alert(1)"" contenteditab...",1
13683,"<div draggable=""true"" contenteditable>drag me<...",1
13684,"<li><cite id=""CITEREFDomingos2015"" class=""cita...",0


In [8]:
# Sanity check: this selection is expected to be empty once the single-space rows are gone.
df[df["Sentence"].eq(" ")]

Unnamed: 0,Sentence,Label


In [9]:
# Class balance of the target column.
df["Label"].value_counts()

Label
1    7373
0    6201
Name: count, dtype: int64

In [10]:
# Count missing values per column.
df.isna().sum()

Sentence    0
Label       0
dtype: int64

In [11]:
# Repeats the per-column missing-value count from the previous cell; no new check is performed.
df.isna().sum()

Sentence    0
Label       0
dtype: int64

In [12]:
# TF-IDF feature extractor, configured to lowercase text before tokenization.
vector = TfidfVectorizer(lowercase=True)

In [13]:
# Hold out 30% of the rows for evaluation; the split is stratified on the
# label and seeded so it is reproducible.
sentences = df['Sentence']
labels = df['Label']
X_train, X_test, y_train, y_test = train_test_split(
    sentences,
    labels,
    test_size=0.3,
    stratify=labels,
    random_state=0,
)

In [14]:
# Learn the vocabulary and IDF weights from the training split only and
# vectorize it in the same pass; fit_transform avoids tokenizing the training
# corpus twice (once in fit, once in transform).
X_train_vec = vector.fit_transform(X_train)
# `vectorizer` is the name later cells persist; it refers to the same fitted
# object as `vector`.
vectorizer = vector
# Project the held-out text with the already-fitted vectorizer (no refit).
X_test_vec = vectorizer.transform(X_test)

In [15]:
from joblib import dump
# Persist the fitted TF-IDF vectorizer to disk.
dump(value=vectorizer, filename="tfidfvec")

['tfidfvec']

In [16]:
# Logistic-regression classifier created with no explicit hyperparameters,
# then trained on the TF-IDF features of the training split.
log_model = LogisticRegression()
log_model.fit(X_train_vec, y_train)

In [17]:
# Predict labels for the held-out test features.
y_pred = log_model.predict(X_test_vec)

In [18]:
# Per-class precision, recall and F1 on the held-out split.
report = metrics.classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.99      1.00      1.00      1861
           1       1.00      0.99      1.00      2212

    accuracy                           1.00      4073
   macro avg       1.00      1.00      1.00      4073
weighted avg       1.00      1.00      1.00      4073



In [19]:
# Confusion matrix computed from the true test labels and the predictions.
confusion = metrics.confusion_matrix(y_test, y_pred)
print(confusion)

[[1860    1]
 [  13 2199]]


In [20]:
import joblib

In [22]:
# Persist the trained classifier by itself (without the vectorizer).
joblib.dump(log_model, "XSS_jblib")

['XSS_jblib']

In [24]:
from sklearn.pipeline import make_pipeline

# Chain the already-fitted vectorizer and classifier into one pipeline object
# and persist it; `dump` is the joblib function imported in an earlier cell.
pipe = make_pipeline(vectorizer, log_model)
dump(pipe, "pipe")

['pipe']