In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from gensim.utils import simple_preprocess
from gensim import corpora
from gensim.models import LdaModel, CoherenceModel
from pprint import pprint

In [2]:
# Make sure the NLTK stopword corpus is available locally before
# stopwords.words('english') is called further down.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vldth\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Read the review CSV into a DataFrame and preview its first rows.
dataset_path = 'dataset/SentiTaglish_ProductsAndServices.csv'
df = pd.read_csv(dataset_path)
print("Original dataset:", df.head(), sep="\n")

Original dataset:
                                              review  sentiment
0  at first gumagana cya..pero pagnalowbat cya nd...          1
1  grabi pangalawa ko ng order sa shapee pero pur...          1
2  2l gray/black order ko. bakit 850ml lang po pi...          1
3  walang silbing product.. bwesit. di gumagana d...          1
4  d po maganda naman po yung neck fan, pero po n...          4


In [4]:
# Discard the sentiment labels; only the review text is used from here on.
reviews_df = df.drop('sentiment', axis='columns')
print(reviews_df.head())

                                              review
0  at first gumagana cya..pero pagnalowbat cya nd...
1  grabi pangalawa ko ng order sa shapee pero pur...
2  2l gray/black order ko. bakit 850ml lang po pi...
3  walang silbing product.. bwesit. di gumagana d...
4  d po maganda naman po yung neck fan, pero po n...


In [5]:
# Collect the raw review strings, one per row.
# Missing reviews are replaced with '' first: astype(str) alone would turn NaN
# into the literal string "nan", which would then be tokenized and leak into
# the vocabulary. fillna keeps the row count and order unchanged.
documents = reviews_df['review'].fillna('').astype(str).tolist()

In [6]:
# Stopword vocabularies used to filter tokens during preprocessing.
english_stopwords = stopwords.words('english')

# Hand-maintained Tagalog/Filipino stopwords, including common chat-style
# spellings. Add new entries to the whitespace-separated list as needed.
tagalog_stopwords = set("""
    ako ikaw siya tayo kami kayo sila
    ang ng sa para ay na nang at pero kung kasi dahil
    hindi oo huwag wala meron may niya rin din ito iyan
    iyon doon dito kaya lamang lang lng mga
    ko pa po sya xa yung ung un naman nmn nman
    kaso ok okay ganda di nyo nila muna
    sana bago nga kc tnx salamat mag nag
    sna nasa mo nya isa
""".split())

# Single lookup set covering both languages.
combined_stopwords = {*english_stopwords, *tagalog_stopwords}

In [7]:
# Preprocessing function
def preprocess_data(documents):
    """Tokenize each document and remove stopwords.

    Every document is coerced to ``str``, tokenized with gensim's
    ``simple_preprocess``, and filtered against the module-level
    ``combined_stopwords`` set.

    Args:
        documents: Iterable of raw documents.

    Returns:
        A list with one entry per input document; each entry is the list of
        tokens that survived stopword filtering, in their original order.
    """
    cleaned_docs = []
    for raw_doc in documents:
        tokens = simple_preprocess(str(raw_doc))
        kept = [token for token in tokens if token not in combined_stopwords]
        cleaned_docs.append(kept)
    return cleaned_docs

In [8]:
# Tokenize every review and strip stopwords; the result holds one token list
# per entry of `documents`.
processed_texts = preprocess_data(documents)

In [9]:
# Build the token-to-id dictionary from the tokenized reviews, then encode
# each review as a bag-of-words with it.
id2word = corpora.Dictionary(documents=processed_texts)
corpus = list(map(id2word.doc2bow, processed_texts))


In [10]:
# Train the LDA topic model.
# Hyperparameters are gathered in one dict so they are easy to find and tune.
num_topics = 10
lda_settings = {
    'num_topics': num_topics,
    'random_state': 42,        # fixed seed
    'passes': 10,
    'alpha': 'auto',
    'per_word_topics': True,
}
lda_model = LdaModel(corpus=corpus, id2word=id2word, **lda_settings)

In [11]:
# Print the top words of every topic.
# print_topics() defaults to num_topics=20, so a model trained with more than
# 20 topics would only show a subset; num_topics=-1 requests all of them.
print("\nTopics found by LDA:")
pprint(lda_model.print_topics(num_topics=-1))


Topics found by LDA:
[(0,
  '0.044*"maganda" + 0.034*"dumating" + 0.032*"order" + 0.029*"size" + '
  '0.024*"seller" + 0.021*"quality" + 0.016*"good" + 0.014*"color" + '
  '0.014*"maliit" + 0.013*"thank"'),
 (1,
  '0.073*"seller" + 0.062*"thank" + 0.027*"order" + 0.025*"good" + '
  '0.022*"item" + 0.017*"thanks" + 0.016*"rider" + 0.015*"shopee" + '
  '0.014*"ulit" + 0.014*"quality"'),
 (2,
  '0.024*"makapal" + 0.018*"sobrang" + 0.015*"overall" + 0.015*"condition" + '
  '0.014*"talaga" + 0.010*"excellent" + 0.009*"smooth" + 0.009*"quality" + '
  '0.009*"masikip" + 0.009*"ka"'),
 (3,
  '0.022*"order" + 0.016*"kulang" + 0.015*"item" + 0.013*"tapos" + 0.011*"iba" '
  '+ 0.011*"sira" + 0.010*"isang" + 0.009*"wag" + 0.009*"sayang" + '
  '0.009*"pinadala"'),
 (4,
  '0.020*"pag" + 0.014*"pwede" + 0.014*"maganda" + 0.011*"gamitin" + '
  '0.011*"gumagana" + 0.010*"price" + 0.010*"nung" + 0.010*"parang" + '
  '0.009*"pang" + 0.009*"talaga"'),
 (5,
  '0.026*"amoy" + 0.013*"love" + 0.012*"packaged

In [12]:
# Score the trained model with the 'c_v' coherence measure, computed from the
# tokenized reviews and the shared dictionary.
coherence_model_lda = CoherenceModel(
    coherence='c_v',
    dictionary=id2word,
    texts=processed_texts,
    model=lda_model,
)
coherence_lda = coherence_model_lda.get_coherence()
print()
print('Coherence Score:', coherence_lda)


Coherence Score: 0.38468656343648344
