In [69]:
import pandas as pd
from pprint import pprint
from nltk.corpus import stopwords
from gensim.utils import simple_preprocess
from gensim.models import Phrases, LdaModel
from gensim.models.phrases import Phraser
from gensim import corpora
from gensim.models.coherencemodel import CoherenceModel

In [70]:
# `nltk` has to be imported explicitly: the import cell only brings in
# `nltk.corpus.stopwords`, so on a fresh kernel the bare name `nltk` is
# undefined and the download call raises NameError.
import nltk

# Fetch the stopword corpus that stopwords.words('english') reads later on
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vldth\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [71]:
# Read the review dataset from disk and preview its first rows
df = pd.read_csv('dataset/SentiTaglish_ProductsAndServices.csv')
print("Original dataset:", df.head(), sep="\n")

Original dataset:
                                              review  sentiment
0  at first gumagana cya..pero pagnalowbat cya nd...          1
1  grabi pangalawa ko ng order sa shapee pero pur...          1
2  2l gray/black order ko. bakit 850ml lang po pi...          1
3  walang silbing product.. bwesit. di gumagana d...          1
4  d po maganda naman po yung neck fan, pero po n...          4


In [72]:
# Keep only the review text; the sentiment column is removed before modelling
reviews_df = df.drop('sentiment', axis=1)
print(reviews_df.head())

                                              review
0  at first gumagana cya..pero pagnalowbat cya nd...
1  grabi pangalawa ko ng order sa shapee pero pur...
2  2l gray/black order ko. bakit 850ml lang po pi...
3  walang silbing product.. bwesit. di gumagana d...
4  d po maganda naman po yung neck fan, pero po n...


In [73]:
# Build the list of raw review strings, one per dataset row.
# Missing reviews are replaced with an empty string first: without this, a NaN
# ends up as the literal text "nan" and is later tokenized as a real word.
# fillna (rather than dropping rows) keeps the list aligned with reviews_df.
documents = reviews_df['review'].fillna('').astype(str).tolist()

In [74]:
# Helper for loading a stopword list from disk (used for the Tagalog stopwords)
def load_stopwords(filepath):
    """Read a stopword list from a text file containing one stopword per line.

    Args:
        filepath: Path to a UTF-8 encoded text file.

    Returns:
        set[str]: Every non-blank line with surrounding whitespace removed.
    """
    # 'utf-8-sig' decodes plain UTF-8 exactly like 'utf-8', but also discards a
    # leading byte-order mark. With plain 'utf-8' the mark stays glued to the
    # first stopword (strip() does not remove it), so that word never matches.
    with open(filepath, 'r', encoding='utf-8-sig') as file:
        stripped_lines = (line.strip() for line in file)
        return {word for word in stripped_lines if word}

In [75]:
# Build the stopword collections used to filter tokens during preprocessing
english_stopwords = stopwords.words('english')          # English list from NLTK
tagalog_stopwords = load_stopwords("stopwords-tl.txt")  # Tagalog/Filipino list from a local file

# Merge both languages into a single lookup set
combined_stopwords = {*english_stopwords, *tagalog_stopwords}

In [76]:
# Tokenization + stopword removal
def preprocess_data(documents):
    """Tokenize each document and remove stopwords.

    Every document is converted with str() and tokenized by gensim's
    simple_preprocess; tokens found in combined_stopwords are discarded.

    Args:
        documents: Iterable of raw documents.

    Returns:
        list[list[str]]: One list of kept tokens per input document, in order.
    """
    cleaned = []
    for doc in documents:
        tokens = simple_preprocess(str(doc))
        cleaned.append([tok for tok in tokens if tok not in combined_stopwords])
    return cleaned

In [77]:
# Tokenize every review and strip stopwords
processed_texts = preprocess_data(documents=documents)

In [78]:
# Learn phrase (collocation) models in two passes: first on the plain tokens,
# then on the bigram-merged texts so that longer phrases can form.
bigram = Phrases(processed_texts, min_count=3, threshold=5)
bigram_mod = Phraser(bigram)

# No min_count is passed here, so the trigram pass uses the Phrases default
trigram = Phrases(bigram[processed_texts], threshold=5)
trigram_mod = Phraser(trigram)

In [79]:
# Merge detected phrases into single tokens
def make_ngrams(texts):
    """Run every document through the bigram model, then the trigram model.

    Args:
        texts: Iterable of tokenized documents.

    Returns:
        list: One phrase-merged token sequence per input document, in order.
    """
    merged = []
    for doc in texts:
        with_bigrams = bigram_mod[doc]
        merged.append(trigram_mod[with_bigrams])
    return merged

# NOTE: this rebinds processed_texts, so running the cell a second time would
# apply the phrase models to tokens that have already been merged.
processed_texts = make_ngrams(processed_texts)

In [80]:
# Build the token dictionary, then encode each document as a bag-of-words
# via doc2bow
id2word = corpora.Dictionary(processed_texts)
corpus = list(map(id2word.doc2bow, processed_texts))


In [81]:
# Fit the LDA topic model on the bag-of-words corpus
num_topics = 10  # number of topics to extract
lda_model = LdaModel(
    # data
    corpus=corpus,
    id2word=id2word,
    # model configuration
    num_topics=num_topics,
    alpha='auto',
    per_word_topics=True,
    # training schedule
    passes=20,
    iterations=1000,
    # fixed seed for reproducibility
    random_state=42,
)

In [82]:
# Show the weighted top words of the learned topics
print("\nTopics found by LDA:")
topic_summaries = lda_model.print_topics()
pprint(topic_summaries)


Topics found by LDA:
[(0,
  '0.081*"po" + 0.047*"maganda" + 0.029*"seller" + 0.025*"ganda" + '
  '0.025*"thank_seller" + 0.025*"good" + 0.023*"order" + 0.022*"thank" + '
  '0.019*"item" + 0.016*"quality"'),
 (1,
  '0.013*"ganda_po" + 0.013*"food" + 0.013*"plastic" + 0.012*"time" + '
  '0.012*"bilis_dumating" + 0.009*"dumi" + 0.008*"expected" + '
  '0.008*"god_bless" + 0.008*"malakas" + 0.007*"part"'),
 (2,
  '0.018*"still" + 0.015*"like" + 0.008*"nagustohan" + 0.008*"anyway" + '
  '0.008*"konting" + 0.007*"ringlight" + 0.007*"even" + 0.007*"thank_thank" + '
  '0.007*"excellent_quality" + 0.007*"freebies"'),
 (3,
  '0.011*"magaan" + 0.009*"lakas" + 0.008*"bet" + 0.007*"maingay" + '
  '0.007*"pretty" + 0.006*"super_sulit" + 0.006*"handle" + 0.006*"solid" + '
  '0.006*"nice_product" + 0.005*"matagal_dumating"'),
 (4,
  '0.008*"god_bless_po" + 0.008*"mukhang_matibay" + 0.008*"mouse_pad" + '
  '0.007*"jogger" + 0.007*"secure" + 0.007*"kapal" + 0.007*"basa" + '
  '0.006*"socks" + 0.006*"mou

In [83]:
# Evaluate the trained model with the 'c_v' coherence measure, computed from
# the phrase-merged token lists and the same dictionary used for training
coherence_model_lda = CoherenceModel(
    model=lda_model,
    dictionary=id2word,
    texts=processed_texts,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score:', coherence_lda)


Coherence Score: 0.5354085237039004


In [84]:
# Optional: export how often each detected phrase occurs
from collections import Counter
import itertools

# Phrase tokens are identified by containing "_"; collect them across all
# documents in document order
ngram_tokens = [
    token
    for token in itertools.chain.from_iterable(processed_texts)
    if '_' in token
]

# Tally occurrences of each phrase
ngram_counts = Counter(ngram_tokens)

# One "phrase<TAB>count" line per phrase, most frequent first
with open('bigrams.txt', 'w', encoding='utf-8') as f:
    f.writelines(
        f'{phrase}\t{count}\n' for phrase, count in ngram_counts.most_common()
    )

print("\nSaved bigrams/trigrams to bigrams.txt")


Saved bigrams/trigrams to bigrams.txt
