In [36]:
import pandas as pd
from pprint import pprint
from nltk.corpus import stopwords
from gensim.utils import simple_preprocess
from gensim.models import Phrases, LdaModel
from gensim.models.phrases import Phraser
from gensim import corpora
from gensim.models.coherencemodel import CoherenceModel

In [37]:
import nltk
# Fetch the NLTK stopword corpus so stopwords.words('english') can be read later on
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vldth\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [38]:
# Read the reviews CSV into a DataFrame and preview its first rows
df = pd.read_csv('dataset/SentiTaglish_ProductsAndServices.csv')
print("Original dataset:", df.head(), sep="\n")

Original dataset:
                                              review  sentiment
0  at first gumagana cya..pero pagnalowbat cya nd...          1
1  grabi pangalawa ko ng order sa shapee pero pur...          1
2  2l gray/black order ko. bakit 850ml lang po pi...          1
3  walang silbing product.. bwesit. di gumagana d...          1
4  d po maganda naman po yung neck fan, pero po n...          4


In [39]:
# Keep the review text only by discarding the sentiment label column
reviews_df = df.drop('sentiment', axis='columns')
print(reviews_df.head())

                                              review
0  at first gumagana cya..pero pagnalowbat cya nd...
1  grabi pangalawa ko ng order sa shapee pero pur...
2  2l gray/black order ko. bakit 850ml lang po pi...
3  walang silbing product.. bwesit. di gumagana d...
4  d po maganda naman po yung neck fan, pero po n...


In [40]:
# Missing reviews are replaced with '' before the string cast: astype(str) would
# otherwise turn NaN into the literal text "nan", which would later be tokenised
# as if it were a real word. Filling (rather than dropping) keeps one entry per row.
documents = reviews_df['review'].fillna('').astype(str).tolist()

In [41]:
# Load a stopword list (used here for the Tagalog stopwords) from a text file
def load_stopwords(filepath):
    """Read a stopword file and return its entries as a set.

    Each line of the file is treated as one entry. Surrounding whitespace is
    stripped and lines that are empty after stripping are skipped.

    Args:
        filepath: Path to a UTF-8 encoded text file (with or without a BOM).

    Returns:
        set[str]: The distinct, stripped, non-empty lines of the file.

    Raises:
        OSError: If the file cannot be opened.
    """
    # 'utf-8-sig' reads plain UTF-8 unchanged but also drops a leading
    # byte-order mark, so the first entry is not stored as '\ufeff<word>'
    # (str.strip() does not remove that character).
    with open(filepath, 'r', encoding='utf-8-sig') as file:
        stripped_lines = (line.strip() for line in file)
        return {entry for entry in stripped_lines if entry}

In [42]:
# English stopwords come from NLTK; Tagalog/Filipino ones from a local text file
english_stopwords = stopwords.words('english')
tagalog_stopwords = load_stopwords("stopwords-tl.txt")

# One merged set, used for membership checks while filtering tokens
combined_stopwords = set(english_stopwords) | tagalog_stopwords

In [43]:
# Tokenisation + stopword removal
def preprocess_data(documents):
    """Tokenise each document and drop stopwords.

    Args:
        documents: Iterable of raw documents; each one is passed through str()
            before tokenisation.

    Returns:
        list[list[str]]: One token list per document, in input order, holding
        the simple_preprocess tokens that are not in combined_stopwords.
    """
    cleaned = []
    for doc in documents:
        tokens = simple_preprocess(str(doc))
        cleaned.append([tok for tok in tokens if tok not in combined_stopwords])
    return cleaned

In [44]:
# Preprocess the documents: tokenise every review and filter out the combined stopwords
processed_texts = preprocess_data(documents)

In [45]:
# Create bigram and trigram models
# The bigram model is trained on the token lists; the trigram model is trained
# on those same lists after the bigram model has been applied to them.
# NOTE(review): min_count=3 is passed only to the bigram model; the trigram
# model falls back to the library default for min_count — confirm this is intended.
bigram = Phrases(processed_texts, min_count=3, threshold=5)
trigram = Phrases(bigram[processed_texts], threshold=5)

# Wrap both trained models in Phraser objects; these are what make_ngrams applies
bigram_mod = Phraser(bigram)
trigram_mod = Phraser(trigram)

In [46]:
# Apply phrase models
def make_ngrams(texts):
    """Run every token list through the bigram model, then the trigram model.

    Args:
        texts: Iterable of token lists.

    Returns:
        list: One transformed token sequence per input document, in input order.
    """
    merged = []
    for doc in texts:
        with_bigrams = bigram_mod[doc]
        merged.append(trigram_mod[with_bigrams])
    return merged

# NOTE: processed_texts is rebound here, so re-running only this cell feeds the
# already-merged tokens through the phrase models a second time.
processed_texts = make_ngrams(processed_texts)

In [47]:
# Build the gensim dictionary from the token lists, then convert every
# document to its bag-of-words form with doc2bow
id2word = corpora.Dictionary(processed_texts)
corpus = list(map(id2word.doc2bow, processed_texts))


In [48]:
# Define and train the LDA model
# The topic count lives in its own variable so it can be changed in one place.
num_topics = 10
lda_model = LdaModel(
    corpus=corpus,          # bag-of-words documents built with id2word.doc2bow
    id2word=id2word,        # dictionary the corpus ids refer to
    num_topics=num_topics,
    random_state=42,        # fixed seed so repeated runs start from the same state
    passes=20,
    iterations=1000,
    alpha='auto',
    per_word_topics=True
)

In [49]:
# Print the topics as returned by print_topics() called with its default arguments;
# pprint is used so each (topic id, weighted-terms string) pair is wrapped readably
print("\nTopics found by LDA:")
pprint(lda_model.print_topics())


Topics found by LDA:
[(0,
  '0.018*"food" + 0.009*"room" + 0.008*"place" + 0.008*"complete_orders" + '
  '0.008*"staff" + 0.007*"mama" + 0.007*"medyo_matagal" + 0.007*"pcs" + '
  '0.007*"purple" + 0.007*"tubig"'),
 (1,
  '0.033*"gumagana" + 0.017*"working" + 0.016*"kuya_rider" + '
  '0.013*"sayang_pera" + 0.010*"fan" + 0.010*"charge" + 0.008*"gumana" + '
  '0.007*"bilis" + 0.007*"charger" + 0.006*"hangin"'),
 (2,
  '0.067*"ganda" + 0.033*"size" + 0.022*"good_quality" + 0.020*"ulit" + '
  '0.019*"maliit" + 0.016*"worth" + 0.016*"sakto" + 0.012*"manipis" + '
  '0.012*"kasya" + 0.010*"malaki"'),
 (3,
  '0.015*"sulit" + 0.014*"ganda_quality" + 0.012*"expect" + '
  '0.010*"maganda_tela" + 0.008*"laki" + 0.008*"mainit" + 0.007*"pesos" + '
  '0.007*"tumagal" + 0.006*"tumbler" + 0.006*"makapal_tela"'),
 (4,
  '0.019*"battery" + 0.017*"far" + 0.013*"soon" + 0.012*"nagana" + '
  '0.010*"good_condition" + 0.009*"sakto_size" + 0.009*"remote" + '
  '0.007*"nilagay" + 0.007*"pants" + 0.006*"nadeliv

In [50]:
# Compute coherence score
# The 'c_v' measure is evaluated with the phrase-merged token lists and the
# dictionary that were also used to build the LDA training corpus.
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=processed_texts,
    dictionary=id2word,
    coherence='c_v'
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score:', coherence_lda)


Coherence Score: 0.45241004896749104


In [51]:
# Step 8 (Optional): export the detected phrases, most frequent first
from collections import Counter
import itertools

# Flatten all documents, then keep the tokens that contain "_"
# (these are treated as the bigrams/trigrams)
ngram_tokens = [
    token
    for token in itertools.chain.from_iterable(processed_texts)
    if '_' in token
]

# Tally how often each phrase occurs
ngram_counts = Counter(ngram_tokens)

# One "<phrase><TAB><count>" line per phrase
with open('bigrams.txt', 'w', encoding='utf-8') as f:
    f.writelines(
        f'{phrase}\t{count}\n' for phrase, count in ngram_counts.most_common()
    )

print("\nSaved bigrams/trigrams to bigrams.txt")


Saved bigrams/trigrams to bigrams.txt


In [52]:
from collections import Counter
import itertools

# Every token from every processed document, in document order
all_words = list(itertools.chain.from_iterable(processed_texts))

# Tally the tokens one document at a time
word_freq = Counter()
for doc_tokens in processed_texts:
    word_freq.update(doc_tokens)

# Show the 100 most frequent tokens with their counts
top_words = word_freq.most_common(100)
for word, freq in top_words:
    print(f"{word}: {freq}")

maganda: 1504
good: 1033
dumating: 883
ganda: 664
size: 563
wala: 561
items: 483
quality: 458
sira: 448
color: 372
inorder: 372
maayos: 369
mura: 361
price: 352
gumagana: 351
nice: 332
agad: 320
shop: 309
kulay: 304
disappointed: 300
damage: 300
super: 290
medyo: 284
pinadala: 272
sobrang: 270
maliit: 255
much: 251
packaging: 249
cute: 246
black: 243
good_quality: 237
sayang_pera: 237
sakin: 236
ulit: 236
manipis: 233
goods: 232
worth: 231
next_time: 218
white: 216
rider: 216
amoy: 213
tas: 213
received: 212
pwede: 212
like: 209
binigay: 207
ba: 206
akala: 204
tama: 201
ordered: 197
working: 195
man: 194
sayang: 186
picture: 184
ganun: 184
maganda_quality: 184
design: 180
ibang: 178
sakto: 178
mas: 176
box: 175
nagustuhan: 173
time: 169
deliver: 168
mali: 164
naka: 163
malaki: 161
delivery: 157
parcel: 157
baby: 157
satisfied: 154
star: 153
bumili: 152
plastic: 151
alam: 147
binili: 147
buti: 146
mabilis: 143
pink: 142
one: 142
love: 142
buy: 141
need: 139
try: 138
battery: 138
daw: 13