In [1]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB as MNB
from sklearn.svm import LinearSVC
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
)
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from datetime import datetime

In [2]:
# Load the review dataset from the CSV file that sits next to the notebook.
dataset = pd.read_csv("./IMDB Dataset.csv")

# Pull out the "review" column as model input and the "sentiment" column as
# the target. `to_numpy()` is the accessor pandas recommends over `.values`:
# it always yields a NumPy array, whereas `.values` can hand back an extension
# array for some column dtypes.
X = dataset["review"].to_numpy()
y = dataset["sentiment"].to_numpy()

In [3]:
# Map the sentiment labels to integer class ids. The fitted encoder is kept
# around so the original label values stay available via
# `label_encoder.classes_`.
label_encoder = LabelEncoder().fit(y)
y = label_encoder.transform(y)

# Quick sanity check of the encoded targets.
print(y[:5])

[1 1 1 0 1]


In [4]:
# Hold out 20% of the samples for evaluation. The fixed `random_state` makes
# the shuffle deterministic, so the split -- and every metric computed from
# it -- is reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print(f"X-train {len(X_train)}")
print(f"X-test {len(X_test)}")

X-train 40000
X-test 10000


In [5]:
# Learn the TF-IDF vocabulary and weights from the training split only, then
# project the test split with that same fitted vectorizer.
# NOTE: X_train / X_test are rebound from their previous values to the
# vectorizer's output, so re-run the split cell before re-running this one.
vectorizer = TfidfVectorizer(stop_words="english")
X_train, X_test = vectorizer.fit_transform(X_train), vectorizer.transform(X_test)

In [6]:
def eval_res(true, pred):
    """Print a confusion matrix and summary classification metrics.

    Args:
        true: Ground-truth labels.
        pred: Predicted labels, compared against ``true``.

    The confusion matrix is printed first, followed by accuracy, precision,
    recall and F1, each computed by the matching scikit-learn metric function
    called with only ``true`` and ``pred``. Nothing is returned.
    """
    print(confusion_matrix(true, pred))

    accuracy = accuracy_score(true, pred)
    print(f"Accuracy = {accuracy}")

    precision = precision_score(true, pred)
    print(f"Precision = {precision}")

    recall = recall_score(true, pred)
    print(f"Recall = {recall}")

    f1 = f1_score(true, pred)
    print(f"F1 = {f1}")

In [7]:
# Multinomial Naive Bayes: time fitting plus prediction together.
start = datetime.now()
model = MNB().fit(X_train, y_train)  # fit() returns the estimator itself
pred = model.predict(X_test)
print(f"Total time = {datetime.now() - start}")

# Compare the predictions with the held-out labels.
eval_res(y_test, pred)

Total time = 0:00:00.019897
[[4353  600]
 [ 793 4254]]
Accuracy = 0.8607
Precision = 0.8763906056860321
Recall = 0.8428769566078859
F1 = 0.8593071406928593


In [8]:
# Linear SVM: time fitting plus prediction together.
start = datetime.now()
# LinearSVC's dual coordinate-descent solver shuffles the data with a
# pseudo-random generator; pinning `random_state` keeps the fitted model and
# its reported metrics reproducible from run to run.
model = LinearSVC(random_state=42)
model.fit(X_train, y_train)
pred = model.predict(X_test)
print(f"Total time = {datetime.now() - start}")

# Compare the predictions with the held-out labels.
eval_res(y_test, pred)

Total time = 0:00:00.364483
[[4405  548]
 [ 509 4538]]
Accuracy = 0.8943
Precision = 0.892253244199764
Recall = 0.8991480087180503
F1 = 0.8956873581367808
