In [1]:
# Important files to upload to make this run on Colab
# raw dataset (switch it to an external link if you like; I'm not sure why that didn't work on Colab)
# stopwords-new.txt (can be found on our GitHub repo)

In [2]:
import pandas as pd
from pprint import pprint
import nltk
from nltk.corpus import stopwords
from gensim.utils import simple_preprocess
from gensim.models import Phrases, LdaModel
from gensim.models.phrases import Phraser
from gensim import corpora
from gensim.models.coherencemodel import CoherenceModel
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Download the NLTK 'stopwords' corpus that is imported above as `stopwords`
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vldth\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Read the raw review CSV from the working directory and preview it
df = pd.read_csv(filepath_or_buffer='SentiTaglish_ProductsAndServices.csv')
print("Original dataset:", df.head(), sep="\n")

Original dataset:
                                              review  sentiment
0  at first gumagana cya..pero pagnalowbat cya nd...          1
1  grabi pangalawa ko ng order sa shapee pero pur...          1
2  2l gray/black order ko. bakit 850ml lang po pi...          1
3  walang silbing product.. bwesit. di gumagana d...          1
4  d po maganda naman po yung neck fan, pero po n...          4


In [4]:
# Keep everything except the sentiment label; `df` itself is left untouched
reviews_df = df.drop(labels=['sentiment'], axis='columns')
print(reviews_df.head(5))

                                              review
0  at first gumagana cya..pero pagnalowbat cya nd...
1  grabi pangalawa ko ng order sa shapee pero pur...
2  2l gray/black order ko. bakit 850ml lang po pi...
3  walang silbing product.. bwesit. di gumagana d...
4  d po maganda naman po yung neck fan, pero po n...


In [5]:
# Turn the review column into a plain list of strings, one per row.
# Missing reviews are replaced with '' first: otherwise astype(str) would
# stringify them to the literal text "nan", which would then be tokenized
# like a real word. The list keeps one entry per row of reviews_df.
documents = reviews_df['review'].fillna('').astype(str).tolist()

In [6]:
# Load tagalog stopwords function
def load_stopwords(filepath):
    """Read a stopword list from a UTF-8 text file with one entry per line.

    Args:
        filepath: Path of the text file to read.

    Returns:
        A set with every line of the file after stripping surrounding
        whitespace; lines that are empty after stripping are skipped.
    """
    entries = set()
    with open(filepath, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            cleaned = raw_line.strip()
            if cleaned:
                entries.add(cleaned)
    return entries

In [7]:
# Build the stopword vocabulary that preprocessing filters against

# English list taken from the NLTK stopwords corpus
english_stopwords = stopwords.words('english')

# Tagalog/Filipino list read from the local text file
tagalog_stopwords = load_stopwords("stopwords-new.txt")

# Merge both sources into one set
combined_stopwords = {*english_stopwords, *tagalog_stopwords}

In [8]:
# Preprocessing function
def preprocess_data(documents):
    """Tokenize each document and drop stopwords.

    Every document is cast to ``str`` and tokenized with gensim's
    ``simple_preprocess``; tokens contained in ``combined_stopwords``
    (a name defined outside this function) are discarded.

    Args:
        documents: Iterable of raw documents.

    Returns:
        A list holding one list of kept tokens per input document,
        in the same order as the input.
    """
    cleaned_docs = []
    for raw_doc in documents:
        tokens = simple_preprocess(str(raw_doc))
        cleaned_docs.append([tok for tok in tokens if tok not in combined_stopwords])
    return cleaned_docs

In [9]:
# Preprocess the documents: tokenize every review and remove stopwords
processed_texts = preprocess_data(documents)

In [10]:
# Train the phrase detectors: one pass over the token lists, then a second
# pass over the same lists after the first model has been applied to them
bigram = Phrases(sentences=processed_texts, min_count=3, threshold=5)
# No min_count is passed here, so the library default applies to this model
trigram = Phrases(sentences=bigram[processed_texts], threshold=5)

# Wrap each trained model in a Phraser for applying it to documents
bigram_mod, trigram_mod = Phraser(bigram), Phraser(trigram)

In [11]:
# Apply phrase models
def make_ngrams(texts):
    """Run every tokenized document through both phrase models.

    Args:
        texts: Iterable of token lists.

    Returns:
        A list with one entry per document: the result of applying
        ``bigram_mod`` and then ``trigram_mod`` to that document.
    """
    merged_docs = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        merged_docs.append(trigram_mod[with_bigrams])
    return merged_docs

# NOTE: this rebinds processed_texts to the phrase-merged token lists, so
# re-running this cell feeds already-merged tokens through the models again.
processed_texts = make_ngrams(processed_texts)

In [12]:
# Build the token dictionary, then encode each document with doc2bow
id2word = corpora.Dictionary(documents=processed_texts)
corpus = list(map(id2word.doc2bow, processed_texts))


In [13]:
# Configure and fit the LDA topic model
num_topics = 12  # number of topics to extract

lda_model = LdaModel(
    # data
    corpus=corpus,
    id2word=id2word,
    # model size and prior
    num_topics=num_topics,
    alpha='auto',
    # training effort
    passes=50,
    iterations=500,
    # fixed seed so repeated runs use the same random state
    random_state=42,
    per_word_topics=True,
)

In [14]:
# Print the topics
# print_topics() is called with its default arguments; pprint formats the
# list of (topic id, weighted-terms string) pairs it returns.
print("\nTopics found by LDA:")
pprint(lda_model.print_topics())


Topics found by LDA:
[(0,
  '0.021*"battery" + 0.014*"part" + 0.014*"phone" + 0.010*"open" + '
  '0.010*"okey" + 0.009*"stand" + 0.009*"ring_light" + 0.008*"case" + '
  '0.008*"konting" + 0.008*"problem"'),
 (1,
  '0.023*"super" + 0.021*"gumagana" + 0.016*"packaging" + 0.013*"working" + '
  '0.012*"delivery" + 0.011*"box" + 0.011*"gamitin" + 0.010*"super_ganda" + '
  '0.010*"plastic" + 0.010*"like"'),
 (2,
  '0.026*"god_bless" + 0.013*"gaganda" + 0.012*"receive" + '
  '0.012*"medyo_manipis" + 0.012*"disappoint" + 0.010*"mag_order" + '
  '0.009*"magaan" + 0.009*"mouse_pad" + 0.008*"excellent_quality" + '
  '0.008*"yet"'),
 (3,
  '0.026*"salamat_seller" + 0.021*"amoy" + 0.020*"still" + 0.016*"matagal" + '
  '0.012*"maganda_tela" + 0.011*"well_packed" + 0.011*"oks" + 0.011*"rubber" + '
  '0.010*"mabango" + 0.010*"malambot"'),
 (4,
  '0.029*"shopee" + 0.026*"bilis" + 0.024*"thankyou" + 0.019*"sobrang_ganda" + '
  '0.014*"lagi" + 0.013*"anyways" + 0.013*"hehehe" + 0.010*"pro" + '
  '0.009*

In [15]:
# Score the trained model with the 'c_v' coherence measure, computed from
# the tokenized texts and the dictionary built from them
coherence_model_lda = CoherenceModel(
    coherence='c_v',
    model=lda_model,
    dictionary=id2word,
    texts=processed_texts,
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score:', coherence_lda)


Coherence Score: 0.520099320759772


In [16]:
# Turn on notebook rendering, build the pyLDAvis data, and show it
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(topic_model=lda_model, corpus=corpus, dictionary=id2word)
pyLDAvis.display(vis)

In [17]:
# Temporary exploratory check: list the most frequent tokens in the corpus
from collections import Counter
import itertools

# Concatenate all tokenized documents into one flat token list
all_words = list(itertools.chain(*processed_texts))

# Tally how often each token occurs
word_freq = Counter()
word_freq.update(all_words)

# Show the 100 most frequent tokens, one "token: count" line each
top_words = word_freq.most_common(100)
for word, freq in top_words:
    print(word, freq, sep=": ")

order: 1686
maganda: 1349
item: 1171
kasi: 1111
seller: 1093
good: 1014
sana: 959
ok: 941
dumating: 888
okay: 838
thank: 695
thank_seller: 692
talaga: 678
ganda: 676
size: 673
lng: 671
product: 616
wala: 550
parang: 511
items: 486
sira: 464
nung: 460
nag: 453
quality: 416
inorder: 384
maayos: 373
color: 370
mura: 344
gumagana: 340
agad: 320
nice: 313
kulay: 311
damage: 310
price: 307
super: 289
mag: 284
medyo: 281
pinadala: 271
salamat: 271
sobrang: 262
maliit: 261
disappointed: 257
black: 245
cute: 241
binigay: 238
good_quality: 235
worth: 234
goods: 233
shop: 232
packaging: 230
white: 229
amoy: 222
sayang_pera: 216
tama: 216
manipis: 215
thanks: 212
pwede: 212
akala: 209
like: 209
thanks_seller: 205
ordered: 204
picture: 200
working: 195
sakto: 193
sayang: 185
ba: 184
design: 180
mali: 178
box: 178
maganda_quality: 177
nagustuhan: 175
ganun: 172
received: 170
naka: 167
hehe: 167
man: 166
malaki: 163
ibang: 162
niyo: 161
delivery: 158
mas: 158
hnd: 157
parcel: 157
next_time: 157
si_se