# **Analisis Sentimen Kepuasan Pengguna Aplikasi Identitas Kependudukan Digital (IKD) di Play Store**

oleh Ma'mur Zaky Nurrokhman

## Import Library

In [None]:
import pandas as pd
import numpy as np
import string
import re
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.tokenize import word_tokenize
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import *
from imblearn.over_sampling import SMOTE
from skopt import BayesSearchCV
from wordcloud import WordCloud

## Data Preparation

### Dataset Ulasan Aplikasi IKD di Play Store

#### 23 Maret 2024

In [None]:
# Load the IKD Play Store review dataset for March 2024 (file dated
# 23-03-2024). The path is relative to the current working directory.

df_ikd_play_store_maret = pd.read_csv('../../Dataset/review_ikd_play_store_23-03-2024.csv')
df_ikd_play_store_maret

#### 28 Mei 2024

In [None]:
# Load the IKD Play Store review dataset for May 2024 (file dated
# 28-05-2024). The path is relative to the current working directory.

df_ikd_play_store_mei = pd.read_csv('../../Dataset/review_ikd_play_store_28-05-2024.csv')
df_ikd_play_store_mei

### Dataset Ulasan Aplikasi IKD di App Store

#### 24 April 2024

In [None]:
# Load the IKD App Store review dataset for April 2024 (file dated
# 24-04-2024). The path is relative to the current working directory.

df_ikd_app_store_april = pd.read_csv('../../Dataset/review_ikd_app_store_24-04-2024.csv')
df_ikd_app_store_april

#### 28 Mei 2024

In [None]:
# Load the IKD App Store review dataset for May 2024 (file dated
# 28-05-2024). The path is relative to the current working directory.

df_ikd_app_store_mei = pd.read_csv('../../Dataset/review_ikd_app_store_28-05-2024.csv')
df_ikd_app_store_mei

### Menggabungkan Dataset

In [None]:
# Combine all Play Store review datasets into one frame.
# ignore_index=True renumbers the rows 0..n-1 during the concat, so no
# temporary 'index' column has to be created and dropped afterwards.

df_ikd_play_store = pd.concat(
    [df_ikd_play_store_maret, df_ikd_play_store_mei],
    ignore_index=True,
)
df_ikd_play_store

In [None]:
# Combine all App Store review datasets into one frame.
# ignore_index=True renumbers the rows 0..n-1 during the concat, so no
# temporary 'index' column has to be created and dropped afterwards.

df_ikd_app_store = pd.concat(
    [df_ikd_app_store_april, df_ikd_app_store_mei],
    ignore_index=True,
)
df_ikd_app_store

In [None]:
# Drop the attributes that are not used, in place on both store frames.

df_ikd_play_store.drop(
    columns=[
        'reviewId',
        'userName',
        'userImage',
        'thumbsUpCount',
        'reviewCreatedVersion',
        'at',
        'replyContent',
        'repliedAt',
        'appVersion',
    ],
    inplace=True,
)
df_ikd_app_store.drop(
    columns=[
        'date',
        'isEdited',
        'userName',
        'title',
    ],
    inplace=True,
)

In [None]:
# Rename the attributes so both store frames use the labels
# 'review' and 'sentiment'.

df_ikd_play_store.rename(
    columns={
        'content': 'review',
        'score': 'sentiment',
    },
    inplace=True,
)
df_ikd_app_store.rename(
    columns={
        'rating': 'sentiment',
    },
    inplace=True,
)

In [None]:
# Combine the Play Store and App Store frames into a single dataframe.
# ignore_index=True renumbers the rows 0..n-1 during the concat, so no
# temporary 'index' column has to be created and dropped afterwards.

df = pd.concat(
    [df_ikd_play_store, df_ikd_app_store],
    ignore_index=True,
)
df

In [None]:
# Report how many fully duplicated rows exist after merging, drop them in
# place, then report the count again to confirm none remain.

print(f"Jumlah data duplikat setelah proses penggabungan = {df.duplicated().sum()}")
df.drop_duplicates(inplace=True)
print(f"Jumlah data duplikat saat ini = {df.duplicated().sum()}")

In [None]:
# Show the number of non-null values in each column.

df.count()

In [None]:
# Renumber the rows 0..n-1 after the duplicate removal above left gaps in
# the index. drop=True discards the old index instead of materialising it
# as an 'index' column that would then have to be removed.

df.reset_index(drop=True, inplace=True)
df

## Preprocessing

In [None]:
# Group the ratings into two sentiment classes.

print("Rating 4 dan 5 akan dikelompokan ke sentimen positif sedangkan rating 1,2, dan 3 akan dikelompokan ke sentimen negatif")
print("0 = Negatif")
print("1 = Positif")

# Recode only the 'sentiment' column: a frame-wide df.replace(...) would also
# rewrite any cell in another column that happens to hold one of these values.
# Order matters: ratings 1-3 become 0 first, so the 1 written for ratings 4-5
# in the second step is never recoded again.
df['sentiment'] = df['sentiment'].replace([1, 2, 3], 0).replace([4, 5], 1)
df

In [None]:
# Show the dataframe summary (columns, non-null counts, dtypes).

df.info()

In [None]:
# melakukan teknik case folding untuk mengubah semua huruf menjadi lower case

def lower(text):
    """Return ``text`` converted to lower case (the case-folding step)."""
    return text.lower()

# Case-fold every review; the trailing bare `df` is the cell's last expression.
df['review'] = df['review'].apply(lower)
df

In [None]:
# menghapus tanda baca dan angka

# Deletion table for str.translate, built once instead of on every call.
# It covers only ASCII punctuation (string.punctuation) and ASCII digits.
_PUNCTUATION_AND_DIGITS_TABLE = str.maketrans("", "", string.punctuation + string.digits)


def remove_punctuation_and_number(text):
    """Return ``text`` with all ASCII punctuation and digit characters removed.

    Other characters, including whitespace, are left untouched, so removing
    a token can leave adjacent spaces behind.
    """
    return text.translate(_PUNCTUATION_AND_DIGITS_TABLE)

# Strip punctuation and digits from every review.
df['review'] = df['review'].apply(remove_punctuation_and_number)
df

In [None]:
# menghapus whitespace pada teks

def remove_whitespace(text):
    """Return ``text`` with every run of whitespace collapsed to one space.

    The input is converted with ``str()`` first, so non-string values are
    accepted. Leading and trailing whitespace is stripped. Spaces, tabs,
    carriage returns and newlines are all normalised in a single pass, so
    mixed runs such as ``" \\n "`` can no longer leave doubled spaces.
    """
    correct = str(text)
    # NOTE(review): "//t" looks like a typo for an escaped tab ("\\t"). The
    # pattern is kept as written; its match is treated as whitespace.
    correct = re.sub(r"//t", " ", correct)
    correct = re.sub(r"\s+", " ", correct)
    return correct.strip()

# Normalise the whitespace of every review.
df['review'] = df['review'].apply(remove_whitespace)
df

In [None]:
# Text tokenization.

def tokenizing(text):
    """Return the result of NLTK's ``word_tokenize`` applied to ``text``."""
    return word_tokenize(text)

# From this point on each 'review' cell holds the tokenizer output instead
# of a single string.
df['review'] = df['review'].apply(tokenizing)
df

In [None]:
# Remove stop words using Sastrawi's stop-word list.

factory = StopWordRemoverFactory()
stopword_sastrawi = factory.get_stop_words()
# Hashed copy used for the per-token membership test, so each lookup does not
# scan the whole collection. `stopword_sastrawi` itself is left unchanged.
# Assumes the stop words are hashable (strings).
_stopword_lookup = frozenset(stopword_sastrawi)

def remove_stopwords_with_sastrawi(text):
    """Return the tokens of ``text`` that are not Sastrawi stop words.

    ``text`` is an iterable of tokens; the result is a new list that keeps
    the original token order.
    """
    return [word for word in text if word not in _stopword_lookup]

df['review'] = df['review'].apply(remove_stopwords_with_sastrawi)
df

In [None]:
# Stem every token with Sastrawi.

factory = StemmerFactory()
stemmer = factory.create_stemmer()

def stemming(text):
    """Return a list with ``stemmer.stem`` applied to each token of ``text``."""
    return list(map(stemmer.stem, text))

df['review'] = df['review'].apply(stemming)
df

In [None]:
# Spelling normalisation: build a lookup from the "kamus-alay" CSV.

normalized_word = pd.read_csv("../../Corpus/kamus-alay/kamus-alay.csv")

normalized_word_dict = {}

# Map the first column to the second column. Columns are selected by
# position with .iloc (integer keys on a label-indexed row are deprecated in
# recent pandas), and setdefault keeps the first mapping seen for a key.
# NOTE(review): presumably column 0 is the informal spelling and column 1 its
# standard form — confirm against the CSV header.
for source_word, target_word in zip(normalized_word.iloc[:, 0], normalized_word.iloc[:, 1]):
    normalized_word_dict.setdefault(source_word, target_word)

def normalized_term(document):
    """Return ``document`` with each term replaced by its dictionary entry.

    Terms without an entry in ``normalized_word_dict`` are kept as they are.
    """
    return [normalized_word_dict.get(term, term) for term in document]

df['review'] = df['review'].apply(normalized_term)
df

In [None]:
# Remove stop words again with Sastrawi, this time on the terms produced by
# the spelling normalisation step.

df['review'] = df['review'].apply(remove_stopwords_with_sastrawi)
df

In [None]:
# Run Sastrawi stemming again on the normalised terms.

df['review'] = df['review'].apply(stemming)
df

In [None]:
# Join the tokens of each row back into a single space-separated sentence.
# Iterating over the column directly avoids the label lookup
# df['review'][i], which only works while the index is exactly 0..n-1.

corpus = [' '.join(tokens) for tokens in df['review']]

# Assigning a plain list is positional, so the rows keep their order.
df['review'] = corpus
df

In [None]:
# Show word frequencies across the whole dataset as a word cloud.

words = " ".join(df['review'])
# Count whitespace-separated words; len(words) alone would report the number
# of characters, which contradicts the printed message.
print(f'Jumlah kata pada dataset adalah {len(words.split())} kata')

wordcloud = WordCloud(background_color='white').generate(text=words)

plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
# Split out the rows with positive sentiment (label 1).
# drop=True renumbers the subset 0..n-1 without first creating an 'index'
# column that would have to be removed again.

df_positive = df.loc[df['sentiment'] == 1].reset_index(drop=True)
df_positive

In [None]:
# Show the word cloud for the positive-sentiment reviews.

positive = " ".join(df_positive['review'])
# Count whitespace-separated words; len(positive) alone would report the
# number of characters, which contradicts the printed message.
print(f'Jumlah kata pada sentimen positif adalah {len(positive.split())} kata')

wordcloud = WordCloud(background_color='white').generate(text=positive)

plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
# Split out the rows with negative sentiment (label 0).
# drop=True renumbers the subset 0..n-1 without first creating an 'index'
# column that would have to be removed again.

df_negative = df.loc[df['sentiment'] == 0].reset_index(drop=True)
df_negative

In [None]:
# Show the word cloud for the negative-sentiment reviews.

negative = " ".join(df_negative['review'])
# Count whitespace-separated words; len(negative) alone would report the
# number of characters, which contradicts the printed message.
print(f'Jumlah kata pada sentimen negatif adalah {len(negative.split())} kata')

wordcloud = WordCloud(background_color='white').generate(text=negative)

plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
# Split the attributes into the feature (X) and the label (y).

X = df['review']      # preprocessed review text
y = df['sentiment']   # binary label: 0 = negative, 1 = positive

In [None]:
# Plot the number of rows per class in the sentiment column.

print('Keterangan:')
print('0 = Negatif')
print('1 = Positif')

# value_counts() orders classes by frequency, so the bar at x=0 is not
# necessarily class 0. Sorting by label fixes the bar order, and annotating
# by bar position keeps every count on its own bar.
class_counts = y.value_counts().sort_index()
ax = class_counts.plot(kind='bar')
plt.title('Jumlah Data Antar Kelas Sebelum Diseimbangkan')
for position, count in enumerate(class_counts):
    ax.text(position, count, str(count), ha='center', va='bottom')
plt.show()

## Pembobotan Kata Menggunakan TF-IDF

In [None]:
# Weight the terms with TF-IDF using the vectorizer's default settings.
# NOTE(review): the vectorizer is fitted on all reviews, before the
# train/test split performed later in the notebook — confirm this is intended.

tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(X)
X_tfidf

In [None]:
# Inspect the TF-IDF weights of the first review: one row per vocabulary
# term, then the ten highest-weighted terms.
first_vector = X_tfidf[0]
df_tfidf_first = pd.DataFrame(first_vector.T.todense(), index=tfidf.get_feature_names_out(), columns=['tfidf'])
df_tfidf_first.sort_values(by=['tfidf'], ascending=False).head(10)

## Menyeimbangkan Data Menggunakan SMOTE

In [None]:
# Balance the classes of the label (y) by oversampling with SMOTE.
# random_state is pinned (42, the same seed train_test_split uses later in
# this notebook) so the synthetic samples are identical on every run.
# NOTE(review): resampling runs on the full TF-IDF matrix, before the
# train/test split, so synthetic samples can land in the test set. Consider
# resampling only the training portion.

oversampler = SMOTE(random_state=42)
X_tfidf_resampled, y_resampled = oversampler.fit_resample(X_tfidf, y)

In [None]:
# Plot the number of rows per class again, now on the resampled labels.

print('Keterangan:')
print('0 = Negatif')
print('1 = Positif')

# Sort by label so the bar order is fixed, and annotate by bar position so
# every count sits on its own bar.
resampled_counts = y_resampled.value_counts().sort_index()
ax = resampled_counts.plot(kind='bar')
# The data shown here is the SMOTE output, so the title says "Setelah"
# (after); it previously repeated the "Sebelum" (before) title.
plt.title('Jumlah Data Antar Kelas Setelah Diseimbangkan')
for position, count in enumerate(resampled_counts):
    ax.text(position, count, str(count), ha='center', va='bottom')
plt.show()

## Pelatihan Model

In [None]:
# Split the data into training data (80%) and test data (20%) with a fixed
# seed.
# NOTE(review): the inputs are the SMOTE-resampled matrices, so the test
# split may contain synthetic samples — confirm this is intended.

X_train, X_test, y_train, y_test = train_test_split(X_tfidf_resampled, y_resampled, test_size=0.2, random_state=42)

In [None]:
# Search space for hyperparameter tuning.

param_space = {
    'C': (1e-3, 1e+3, 'log-uniform'),      # 0.001 .. 1000, log-uniform prior
    'gamma': (1e-3, 1e+3, 'log-uniform'),  # 0.001 .. 1000, log-uniform prior
    'kernel': ['linear', 'rbf', 'poly'],   # categorical choice
}

In [None]:
# Set up Bayesian optimisation over the SVC hyperparameters:
# 32 sampled configurations, each scored with 3-fold cross-validation.
# random_state is pinned (42, the same seed train_test_split uses in this
# notebook) so the search samples the same configurations on every run.

model = BayesSearchCV(
    estimator=SVC(),
    search_spaces=param_space,
    n_iter=32,
    cv=3,
    random_state=42,
)

In [None]:
# Run the hyperparameter search on the training split, then print the best
# cross-validation score and the score on the test split.
model.fit(X_train, y_train)

print(f'val. score: {model.best_score_}')
print(f'test score: {model.score(X_test, y_test)}')

In [None]:
# Retrieve and print the best parameter combination found by the search.

best_params = model.best_params_
print('Best Parameters:', best_params)

## Evaluation

In [None]:
# Check for overfitting/underfitting by comparing accuracy on the training
# split with accuracy on the test split.

training_accuracy = accuracy_score(y_train, model.predict(X_train))
test_accuracy = accuracy_score(y_test, model.predict(X_test))
print('Training accuracy :', training_accuracy)
print('Test accuracy :', test_accuracy)
print('Difference :', training_accuracy - test_accuracy)

In [None]:
# Display the confusion matrix for the test split.

y_pred = model.predict(X_test)  # reused by the classification report below
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)
disp.plot()
plt.show()

In [None]:
# Print the classification report with 4 decimal digits; zero_division=0
# sets a metric to 0 where its denominator is zero.

print(classification_report(y_test, y_pred, zero_division=0, digits=4))

In [None]:
# Same report as the printed one above, left as a bare expression (not printed).
classification_report(y_test, y_pred, zero_division=0, digits=4)