In [2]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB as MNB
from sklearn.svm import LinearSVC
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
)
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from datetime import datetime

In [6]:
# Load the raw corpus from the CSV next to the notebook, decoded as latin1.
dataset = pd.read_csv("./spam.csv", encoding="latin1")

# Column "v2" becomes the model input and column "v1" the target.
# `.to_numpy()` is the pandas-recommended replacement for `.values`: it always
# returns a NumPy ndarray, whereas `.values` may return an ExtensionArray
# depending on the column dtype.
X = dataset["v2"].to_numpy()
y = dataset["v1"].to_numpy()

In [7]:
# Replace the raw class labels in `y` with integer codes. The encoder is fitted
# first and kept around as `label_encoder`, then used to transform `y`.
label_encoder = LabelEncoder().fit(y)
y = label_encoder.transform(y)
print(y[:5])  # peek at the first five encoded labels

[0 0 1 0 0]


In [8]:
# Hold out 20% of the samples for evaluation. `random_state` pins the shuffle
# so that Restart & Run All reproduces the same partition instead of drawing a
# new random split (and therefore new metrics) on every run.
# NOTE: outputs saved from earlier, unseeded runs may not match a fresh run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print(f"X-train {len(X_train)}")
print(f"X-test {len(X_test)}")

X-train 4457
X-test 1115


In [9]:
# TF-IDF features with English stop words removed. The vectorizer is fitted on
# the training split only; the test split is transformed with that fitted
# vectorizer.
# Gotcha: X_train / X_test are rebound from the split's output to the
# vectorized matrices, so re-running this cell on its own would feed the
# already-vectorized data back into the vectorizer. Re-run the split first.
vectorizer = TfidfVectorizer(stop_words="english")
X_train, X_test = (
    vectorizer.fit_transform(X_train),
    vectorizer.transform(X_test),
)

In [10]:
def eval_res(true, pred):
    """Print the confusion matrix followed by accuracy, precision, recall and F1.

    Args:
        true: Ground-truth labels.
        pred: Predicted labels, passed to each scorer alongside ``true``.

    Returns:
        None. All results are written to stdout.
    """
    print(confusion_matrix(true, pred))

    # (label, scorer) pairs, reported in this fixed order.
    scorers = (
        ("Accuracy", accuracy_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1", f1_score),
    )
    for label, scorer in scorers:
        print(f"{label} = {scorer(true, pred)}")

In [11]:
# Multinomial Naive Bayes: time fitting and prediction together, then report
# the metrics on the held-out split.
start = datetime.now()
model = MNB().fit(X_train, y_train)  # fit() returns the fitted estimator
pred = model.predict(X_test)
print(f"Total time = {datetime.now() - start}")
eval_res(y_test, pred)

Total time = 0:00:00.005892
[[972   0]
 [ 30 113]]
Accuracy = 0.9730941704035875
Precision = 1.0
Recall = 0.7902097902097902
F1 = 0.8828125


In [12]:
# Linear SVM: time fitting and prediction together, then report the metrics on
# the held-out split.
start = datetime.now()
# `random_state` pins the pseudo-random generator LinearSVC uses in its solver
# so repeated runs on the same data fit the same model.
model = LinearSVC(random_state=42)
model.fit(X_train, y_train)
pred = model.predict(X_test)
print(f"Total time = {datetime.now() - start}")
eval_res(y_test, pred)

Total time = 0:00:00.021534
[[968   4]
 [ 19 124]]
Accuracy = 0.979372197309417
Precision = 0.96875
Recall = 0.8671328671328671
F1 = 0.915129151291513
