In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from gensim.utils import simple_preprocess
from gensim import corpora
from gensim.models import LdaModel, CoherenceModel
from pprint import pprint

In [2]:
# Make sure the NLTK stopword corpus is available locally.
# quiet=True keeps the downloader's progress log (which prints the local
# nltk_data directory path) out of the saved cell output; the call still
# returns a boolean status that the cell displays.
nltk.download('stopwords', quiet=True)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vldth\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Read the review CSV into a DataFrame and preview its first rows
df = pd.read_csv('dataset/SentiTaglish_ProductsAndServices.csv')
print("Original dataset:", df.head(), sep="\n")

Original dataset:
                                              review  sentiment
0  at first gumagana cya..pero pagnalowbat cya nd...          1
1  grabi pangalawa ko ng order sa shapee pero pur...          1
2  2l gray/black order ko. bakit 850ml lang po pi...          1
3  walang silbing product.. bwesit. di gumagana d...          1
4  d po maganda naman po yung neck fan, pero po n...          4


In [4]:
# Remove the sentiment label column, leaving only the review text
reviews_df = df.drop('sentiment', axis='columns')
print(reviews_df.head())

                                              review
0  at first gumagana cya..pero pagnalowbat cya nd...
1  grabi pangalawa ko ng order sa shapee pero pur...
2  2l gray/black order ko. bakit 850ml lang po pi...
3  walang silbing product.. bwesit. di gumagana d...
4  d po maganda naman po yung neck fan, pero po n...


In [5]:
# Turn the review column into a plain list of strings. Missing reviews are
# replaced with '' first: astype(str) alone would stringify NaN as 'nan',
# which would then be tokenised as if it were a real word. fillna (rather
# than dropping rows) keeps the list positionally aligned with reviews_df.
documents = reviews_df['review'].fillna('').astype(str).tolist()

In [6]:
# Stopword vocabulary: NLTK's English list plus a hand-maintained
# Tagalog/Filipino list
english_stopwords = stopwords.words('english')

# Starter set of Tagalog/Filipino stopwords (extend as needed)
tagalog_stopwords = [
    'ako', 'ikaw', 'siya', 'tayo', 'kami', 'kayo', 'sila',
    'ang', 'ng', 'sa', 'para', 'ay', 'na', 'nang', 'at',
    'pero', 'kung', 'kasi', 'dahil',
    'hindi', 'oo', 'huwag', 'wala', 'meron', 'may',
    'niya', 'rin', 'din', 'ito', 'iyan', 'iyon',
    'doon', 'dito', 'kaya', 'lamang', 'lang', 'mga',
]

# Union of both lists, held as a set so membership checks are cheap
combined_stopwords = set(english_stopwords).union(tagalog_stopwords)

In [7]:
def preprocess_data(documents):
    """Tokenise every document and strip stopwords.

    Each document is coerced to ``str``, tokenised with gensim's
    ``simple_preprocess``, and filtered against the module-level
    ``combined_stopwords`` set.

    Args:
        documents: Iterable of documents (anything ``str()`` accepts).

    Returns:
        A list with one entry per input document; each entry is the list
        of tokens that survived stopword filtering, in original order.
    """
    cleaned = []
    for doc in documents:
        tokens = simple_preprocess(str(doc))
        cleaned.append([tok for tok in tokens if tok not in combined_stopwords])
    return cleaned

In [8]:
# Tokenise and stopword-filter every review
processed_texts = preprocess_data(documents=documents)

In [9]:
# Build the token-to-id dictionary, then encode each tokenised review
# with the dictionary's doc2bow
id2word = corpora.Dictionary(processed_texts)
corpus = list(map(id2word.doc2bow, processed_texts))


In [10]:
# Fit the LDA topic model on the encoded corpus (fixed seed for repeatability)
num_topics = 10
lda_model = LdaModel(
    corpus=corpus, id2word=id2word, num_topics=num_topics,
    passes=10, alpha='auto', per_word_topics=True,
    random_state=42,
)

In [11]:
# Print every topic with its top-weighted words. num_topics=-1 asks gensim
# for all topics; the default cap of 20 would silently show only a subset
# if the model were ever trained with more than 20 topics.
print("\nTopics found by LDA:")
pprint(lda_model.print_topics(num_topics=-1))


Topics found by LDA:
[(0,
  '0.100*"size" + 0.050*"maliit" + 0.024*"kasya" + 0.022*"amoy" + 0.021*"paa" '
  '+ 0.020*"ko" + 0.017*"sakto" + 0.012*"add" + 0.010*"maraming" + '
  '0.008*"sya"'),
 (1,
  '0.064*"yung" + 0.052*"naman" + 0.036*"ko" + 0.029*"kaso" + 0.029*"sya" + '
  '0.022*"maganda" + 0.019*"di" + 0.019*"okay" + 0.014*"pa" + 0.014*"ok"'),
 (2,
  '0.039*"ko" + 0.027*"pa" + 0.018*"nag" + 0.015*"di" + 0.014*"mag" + '
  '0.011*"kulang" + 0.010*"nila" + 0.010*"sayang" + 0.010*"item" + '
  '0.009*"pera"'),
 (3,
  '0.139*"ung" + 0.078*"lng" + 0.042*"nmn" + 0.038*"ok" + 0.038*"nman" + '
  '0.017*"un" + 0.016*"kc" + 0.015*"ko" + 0.013*"xa" + 0.012*"kaso"'),
 (4,
  '0.021*"meters" + 0.021*"shipped" + 0.020*"happy" + 0.015*"masikip" + '
  '0.014*"immediately" + 0.014*"correct" + 0.013*"perfect" + 0.009*"today" + '
  '0.009*"hotel" + 0.008*"job"'),
 (5,
  '0.039*"wrap" + 0.034*"bubble" + 0.025*"food" + 0.014*"naka" + 0.014*"box" + '
  '0.013*"mouse" + 0.011*"remote" + 0.011*"mystery" +

In [12]:
# Score the fitted model with the 'c_v' coherence measure over the tokenised reviews
coherence_model_lda = CoherenceModel(
    model=lda_model, texts=processed_texts,
    dictionary=id2word, coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score:', coherence_lda)


Coherence Score: 0.47906013215457754
