In [1]:
from datasets import load_dataset
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
import numpy as np
# Seed NumPy's global random number generator with a fixed value.
np.random.seed(41)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the dataset identified by this Hugging Face Hub repository id.
dataset = load_dataset(path="w11wo/reddit_indonesia_sarcastic")

In [3]:
# Convert the "train" and "test" splits to pandas DataFrames, in that order.
train_df, test_df = (
    dataset[split_name].to_pandas() for split_name in ("train", "test")
)

In [4]:
# TF-IDF features tokenized with NLTK's word_tokenize. token_pattern is set to
# None because a custom tokenizer is supplied instead of the regex pattern.
tfidf_vec = TfidfVectorizer(
    tokenizer=word_tokenize,
    token_pattern=None,
)
# Fit on the training text only; the test split is not passed to fit().
tfidf_vec.fit(train_df["text"])

In [5]:
# Vectorize both splits with the already-fitted TF-IDF vectorizer.
X_train, X_test = (
    tfidf_vec.transform(frame["text"]) for frame in (train_df, test_df)
)

In [6]:
# Logistic regression with default hyperparameters, trained on the TF-IDF
# features of the training split (fit() returns the estimator itself).
logreg = LogisticRegression().fit(X_train, train_df["label"])
# Predicted labels for the test split.
logreg_preds = logreg.predict(X_test)

In [7]:
# Score the logistic-regression predictions against the test labels.
accuracy = accuracy_score(y_true=test_df["label"], y_pred=logreg_preds)
precision, recall, f1, _ = precision_recall_fscore_support(
    y_true=test_df["label"],
    y_pred=logreg_preds,
    average="binary",
)
# Tab-separated output order: accuracy, F1, precision, recall.
print(f"{accuracy}\t{f1}\t{precision}\t{recall}")

0.7893059490084986	0.3756558237145855	0.7246963562753036	0.2535410764872521


In [8]:
# Multinomial naive Bayes with default hyperparameters, trained on the TF-IDF
# features of the training split (fit() returns the estimator itself).
naive_bayes = MultinomialNB().fit(X_train, train_df["label"])
# Predicted labels for the test split.
naive_bayes_preds = naive_bayes.predict(X_test)

In [9]:
# Score the naive-Bayes predictions against the test labels.
accuracy = accuracy_score(y_true=test_df["label"], y_pred=naive_bayes_preds)
precision, recall, f1, _ = precision_recall_fscore_support(
    y_true=test_df["label"],
    y_pred=naive_bayes_preds,
    average="binary",
)
# Tab-separated output order: accuracy, F1, precision, recall.
print(f"{accuracy}\t{f1}\t{precision}\t{recall}")

0.7546033994334278	0.03616133518776078	1.0	0.018413597733711047


In [10]:
# Support vector classifier with default hyperparameters, trained on the
# TF-IDF features of the training split (fit() returns the estimator itself).
svc = SVC().fit(X_train, train_df["label"])
# Predicted labels for the test split.
svc_preds = svc.predict(X_test)

In [11]:
# Score the SVC predictions against the test labels.
accuracy = accuracy_score(y_true=test_df["label"], y_pred=svc_preds)
precision, recall, f1, _ = precision_recall_fscore_support(
    y_true=test_df["label"],
    y_pred=svc_preds,
    average="binary",
)
# Tab-separated output order: accuracy, F1, precision, recall.
print(f"{accuracy}\t{f1}\t{precision}\t{recall}")

0.785056657223796	0.3394994559303591	0.7323943661971831	0.22096317280453256
