In [1]:
import pickle
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from nltk.corpus import stopwords

In [2]:
df = pd.read_csv("sqliv2.csv", encoding="utf-16")

In [4]:
df.isnull().sum()

Sentence    4
Label       0
dtype: int64

In [5]:
df["Label"].value_counts()

0    22305
1    11456
Name: Label, dtype: int64

In [6]:
df = df.dropna(how="any")

In [7]:
X = df["Sentence"]
y = df["Label"]

In [8]:
vectorizer = TfidfVectorizer()
X_vec = vectorizer.fit_transform(X)

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X_vec, y, test_size=0.3, random_state=4242)

In [10]:
models = {
    "SVC": SVC(kernel="linear"),
    "Adaboost": AdaBoostClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "KNN": KNeighborsClassifier(),
    "Logistic Regression": LogisticRegression(),
    "XGBoost": XGBClassifier(n_estimators=100)
}

In [11]:
for key, value in models.items():
    model = value
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(key, accuracy_score(y_test, y_pred))

SVC 0.9935821484992101
Adaboost 0.9942733017377567
Decision Tree 0.5557859399684044
KNN 0.3848736176935229
Logistic Regression 0.9762045813586098
XGBoost 0.9935821484992101
