In [None]:
import pandas as pd
import numpy as np
import string
import re
import matplotlib.pyplot as plt
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import *
from imblearn.over_sampling import SMOTE

In [None]:
# Read the review dataset from disk and display the resulting frame.
dataset_path = '../../Dataset/dataset_review_aplikasi_ikd.csv'
df = pd.read_csv(dataset_path)
df

In [None]:
# Print the column names, dtypes and non-null counts of the freshly loaded frame.
df.info()

In [None]:
# Remove the review metadata columns that the rest of the notebook never reads.
unused_columns = [
    'reviewId',
    'userName',
    'userImage',
    'thumbsUpCount',
    'reviewCreatedVersion',
    'at',
    'replyContent',
    'repliedAt',
    'appVersion',
]
df = df.drop(columns=unused_columns)
df

In [None]:
# The 'score' column is used as the classification label from here on,
# so it is renamed to 'sentiment'.
df = df.rename(columns={'score': 'sentiment'})
df

In [None]:
# Binarise the rating: 1-3 become 0 (negative), 4-5 become 1 (positive).
print("0 = Negatif")
print("1 = Positif")

# Only the label column is rewritten. A frame-wide replace would also touch any
# other column that happens to contain the numbers 1-5.
# The low ratings are mapped first so that the 1 produced for positive reviews
# is not turned into 0 afterwards.
# NOTE: this cell is not idempotent - running it a second time maps the
# positive label 1 to 0. Reload the dataset before re-running it.
df['sentiment'] = df['sentiment'].replace([1, 2, 3], 0).replace([4, 5], 1)
df

In [None]:
# Re-check column dtypes and non-null counts after the column drop and relabeling.
df.info()

In [None]:
# Case Folding
def lower(text):
    """Return ``text`` converted to lower case (case folding)."""
    return text.lower()

In [None]:
# Case-fold every review text in place of the original column.
df['content'] = df['content'].map(lower)
df

In [None]:
# Remove Punctuation & Number

def remove_punctuation_and_number(text):
    """Return ``text`` with every ASCII punctuation mark and digit deleted.

    Characters are removed, not replaced by a space, so the text on either
    side of a removed character is joined together.
    """
    # A translation table that maps a code point to None deletes that character.
    deletion_table = dict.fromkeys(map(ord, string.punctuation + string.digits))
    return text.translate(deletion_table)

In [None]:
# Strip punctuation marks and digits from every review text.
df['content'] = df['content'].map(remove_punctuation_and_number)
df

In [None]:
# Remove Whitespace
def remove_whitespace(text):
    """Collapse repeated whitespace in ``text`` and trim both ends.

    The input is converted with ``str()`` first. Each literal ``//t`` sequence
    is replaced by a tab character. After that, every run of one repeated
    character (space, newline, carriage return or tab) is reduced to a single
    occurrence. Leading and trailing whitespace is stripped last.
    """
    cleaned = re.sub(r"//t", r"\t", str(text))
    # Each pattern collapses runs of exactly one kind of whitespace character.
    collapse_patterns = (r"( )\1+", r"(\n)\1+", r"(\r)\1+", r"(\t)\1+")
    for pattern in collapse_patterns:
        cleaned = re.sub(pattern, r"\1", cleaned)
    return cleaned.strip()

In [None]:
# Normalise repeated whitespace in every review text.
df['content'] = df['content'].map(remove_whitespace)
df

Perbaikan ejaan (normalisasi kata tidak baku) harus dilakukan sebelum proses penghapusan stopword.

In [None]:
# Load the colloquial-Indonesian lexicon CSV and preview its first rows.
kamus_path = "../../Dataset/Kamus Kata Tidak Baku/kamus-alay/colloquial-indonesian-lexicon.csv"
kamus_kata_tidak_baku = pd.read_csv(kamus_path)
kamus_kata_tidak_baku.head()

In [None]:
# # Spell Checking
# def spell_checking(text, nonformal_word):
#     for index in text:
#         index_nonformal = nonformal_word.slang==text[index]
#         formal = list(set(nonformal_word[index_nonformal].formal))
#         if len(formal) == 1:
#             text[index] = formal[0]
#     return text

In [None]:
# df['content'].apply(lambda text: spell_checking(text, kamus_kata_tidak_baku))

In [None]:
# Stopwords with NLTK
# Indonesian stopword list from the NLTK corpus. It stays a list because other
# cells print it and take its length.
stopword_nltk = stopwords.words('indonesian')
# Set copy used only for membership tests, so each token lookup does not scan
# the whole list.
_stopword_nltk_lookup = frozenset(stopword_nltk)

def remove_stopwords_with_nltk(text):
    """Return ``text`` without the tokens found in the NLTK Indonesian stopword list.

    The text is split on whitespace and the kept tokens are re-joined with a
    single space.
    """
    return ' '.join(word for word in text.split() if word not in _stopword_nltk_lookup)

In [None]:
# df['content'] = df['content'].apply(remove_stopwords_with_nltk)
# df

In [None]:
# Stopwords with Sastrawi
factory = StopWordRemoverFactory()
# get_stop_words() returns the plain list of stopwords. The object returned by
# create_stop_word_remover() is not a container: using `in` or len() on it
# raises TypeError.
stopword_sastrawi = factory.get_stop_words()
# Set copy used only for fast membership tests.
_stopword_sastrawi_lookup = frozenset(stopword_sastrawi)

def remove_stopwords_with_sastrawi(text):
    """Return ``text`` without the tokens found in the Sastrawi stopword list.

    The text is split on whitespace and the kept tokens are re-joined with a
    single space.
    """
    return ' '.join(word for word in text.split() if word not in _stopword_sastrawi_lookup)

Membandingkan jumlah kata stopword yang dimiliki oleh library NLTK dan Sastrawi

In [None]:
# Comparison of the stopword lists shipped with NLTK and Sastrawi.
# The word list is fetched from a fresh factory here, so this cell prints and
# counts actual words and never calls len() on a stopword remover object.
sastrawi_stopword_words = StopWordRemoverFactory().get_stop_words()
print(stopword_nltk)
print(sastrawi_stopword_words)
print(f"Jumlah Stopword NLTK = {len(stopword_nltk)}")
print(f"Jumlah Stopword Sastrawi = {len(sastrawi_stopword_words)}")

In [None]:
# Filter the Sastrawi stopwords out of every review text.
df['content'] = df['content'].map(remove_stopwords_with_sastrawi)
df

In [None]:
# Stemming with Sastrawi
# NOTE: `factory` is rebound here from the stopword-remover factory to the
# stemmer factory.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

def stemming(text):
    """Return ``text`` with every whitespace-separated token stemmed by Sastrawi.

    The string has to be split into tokens first. Iterating over the string
    itself yields single characters, which would turn the text into a
    space-separated sequence of letters.
    """
    return ' '.join(stemmer.stem(token) for token in text.split())

In [None]:
# Stem every review text.
df['content'] = df['content'].map(stemming)
df

In [None]:
# Features are the preprocessed review texts; labels are the sentiment column.
X, y = df['content'], df['sentiment']

In [None]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Learn the vocabulary from the training texts only and turn them into a
# document-term count matrix.
count_vectorizer = CountVectorizer()
X_train_cv = count_vectorizer.fit_transform(X_train)

In [None]:
# Train a support vector classifier with default settings on the training counts.
# fit() returns the estimator itself, so the bare name displays the fitted model.
model = SVC().fit(X_train_cv, y_train)
model

In [None]:
# Vectorise the test texts with the vocabulary fitted on the training split
# (transform only, no refit).
X_test_cv = count_vectorizer.transform(X_test)

In [None]:
# Predict the labels of the held-out test samples and display them.
y_pred = model.predict(X_test_cv)
y_pred

In [None]:
# Compare accuracy on the training and test splits; the difference between the
# two is printed alongside them.
train_predictions = model.predict(X_train_cv)
test_predictions = model.predict(X_test_cv)
training_accuracy = accuracy_score(y_train, train_predictions)
test_accuracy = accuracy_score(y_test, test_predictions)
for label, value in (
    ('Training accuracy :', training_accuracy),
    ('Test accuracy :', test_accuracy),
    ('Difference :', training_accuracy - test_accuracy),
):
    print(label, value)

In [None]:
# Build the confusion matrix for the test predictions and draw it, labelling
# the axes with the classes known to the model.
cm = confusion_matrix(y_test, y_pred)
# plot() returns the display object itself, so `disp` still refers to it.
disp = ConfusionMatrixDisplay(cm, display_labels=model.classes_).plot()
plt.show()

In [None]:
# Text report of the per-class metrics on the test split, printed with 4 decimal digits.
# zero_division=0 sets the value used for a metric whose denominator is zero.
print(classification_report(y_test, y_pred, zero_division=0, digits=4))