In [144]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from gensim.utils import simple_preprocess
from gensim import corpora
from gensim.models import LdaModel, CoherenceModel
from pprint import pprint

In [145]:
# Fetch the NLTK stopword corpus needed by stopwords.words() below.
# quiet=True suppresses the progress log, which otherwise writes the local
# nltk_data directory (including the OS user name) into the saved output.
nltk.download('stopwords', quiet=True)

[nltk_data] Downloading package stopwords to C:\Users\Fatima
[nltk_data]     Dy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [146]:
# Read the raw review dataset from disk and preview its first rows.
df = pd.read_csv('dataset/SentiTaglish_ProductsAndServices.csv')
print("Original dataset:", df.head(), sep="\n")

Original dataset:
                                              review  sentiment
0  at first gumagana cya..pero pagnalowbat cya nd...          1
1  grabi pangalawa ko ng order sa shapee pero pur...          1
2  2l gray/black order ko. bakit 850ml lang po pi...          1
3  walang silbing product.. bwesit. di gumagana d...          1
4  d po maganda naman po yung neck fan, pero po n...          4


In [147]:
# Build a copy of the frame without the sentiment label and preview it.
reviews_df = df.drop('sentiment', axis='columns')
print(reviews_df.head())

                                              review
0  at first gumagana cya..pero pagnalowbat cya nd...
1  grabi pangalawa ko ng order sa shapee pero pur...
2  2l gray/black order ko. bakit 850ml lang po pi...
3  walang silbing product.. bwesit. di gumagana d...
4  d po maganda naman po yung neck fan, pero po n...


In [148]:
# Collect the review texts as a list of plain Python strings.
# Missing reviews are dropped before the cast: astype(str) would otherwise
# turn each NaN into the literal text "nan", which would then be tokenised as
# if it were a real word. When the column has no missing values the result is
# unchanged; otherwise the list is shorter than the frame.
documents = reviews_df['review'].dropna().astype(str).tolist()

In [149]:
# Load stopwords (e.g. the Tagalog list) from a text file
def load_stopwords(filepath):
    """Read a stopword list from a text file with one entry per line.

    Leading and trailing whitespace is stripped from every line, and lines
    that are empty after stripping are skipped.

    Args:
        filepath: Path of the UTF-8 encoded stopword file.

    Returns:
        A set containing the stripped, non-empty lines of the file.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # 'utf-8-sig' decodes plain UTF-8 exactly like 'utf-8' but also drops a
    # leading byte-order mark. str.strip() does not remove U+FEFF, so with
    # plain 'utf-8' a BOM would stay attached to the first entry and that
    # stopword would never match a token.
    with open(filepath, 'r', encoding='utf-8-sig') as file:
        return {entry for entry in map(str.strip, file) if entry}

In [None]:
# Build the stopword vocabulary used during preprocessing.
# English entries come from the NLTK stopword corpus.
english_stopwords = stopwords.words('english')
# Tagalog/Filipino entries are read from a local one-entry-per-line text file.
tagalog_stopwords = load_stopwords("stopwords-tl.txt")
# Merge both lists into one set for membership tests.
combined_stopwords = {*english_stopwords, *tagalog_stopwords}

In [151]:
# Preprocessing function
def preprocess_data(documents):
    """Tokenise each document and discard stopwords.

    Every document is cast to ``str``, tokenised with gensim's
    ``simple_preprocess``, and filtered against the module-level
    ``combined_stopwords`` collection.

    Args:
        documents: Iterable of documents; each item is cast with ``str``
            before tokenising.

    Returns:
        A list holding one token list per input document, in input order.
        A document whose tokens are all stopwords yields an empty list.
    """
    token_lists = []
    for doc in documents:
        tokens = simple_preprocess(str(doc))
        kept = [token for token in tokens if token not in combined_stopwords]
        token_lists.append(kept)
    return token_lists

In [152]:
# Tokenise every review and strip stopwords (one token list per review).
processed_texts = preprocess_data(documents=documents)

In [153]:
# Build the token <-> id mapping from the tokenised reviews, then encode
# every review as a bag-of-words vector against that mapping.
id2word = corpora.Dictionary(documents=processed_texts)
corpus = list(map(id2word.doc2bow, processed_texts))


In [154]:
# Configure and fit the LDA topic model on the bag-of-words corpus.
num_topics = 10  # number of latent topics to fit
lda_model = LdaModel(
    corpus=corpus,
    num_topics=num_topics,
    id2word=id2word,
    passes=10,             # full passes over the corpus during training
    alpha='auto',          # let gensim learn the document-topic prior from the corpus
    per_word_topics=True,  # also compute per-word topic assignments
    random_state=42,       # fixed seed so repeated runs give the same topics
)

In [155]:
# Show the weighted top words of each fitted topic, after a blank separator line.
print()
print("Topics found by LDA:")
pprint(lda_model.print_topics())


Topics found by LDA:
[(0,
  '0.036*"godbless" + 0.030*"overall" + 0.021*"arrived" + 0.016*"mystery" + '
  '0.016*"immediately" + 0.013*"mejo" + 0.013*"pouch" + 0.012*"mouse" + '
  '0.011*"better" + 0.009*"ty"'),
 (1,
  '0.054*"size" + 0.044*"order" + 0.031*"yung" + 0.029*"kaso" + 0.029*"color" '
  '+ 0.029*"dumating" + 0.021*"maliit" + 0.020*"black" + 0.017*"kulay" + '
  '0.016*"mali"'),
 (2,
  '0.018*"pink" + 0.018*"kulay" + 0.017*"dumating" + 0.016*"order" + '
  '0.014*"kaso" + 0.013*"color" + 0.013*"blue" + 0.010*"inorder" + 0.009*"nag" '
  '+ 0.009*"mag"'),
 (3,
  '0.086*"yung" + 0.030*"kaso" + 0.020*"item" + 0.017*"kasi" + 0.014*"maganda" '
  '+ 0.013*"wala" + 0.012*"pag" + 0.010*"nung" + 0.010*"nya" + 0.009*"parang"'),
 (4,
  '0.061*"thank" + 0.050*"maganda" + 0.048*"seller" + 0.046*"good" + '
  '0.036*"quality" + 0.030*"price" + 0.021*"nice" + 0.018*"tela" + '
  '0.016*"medyo" + 0.015*"nya"'),
 (5,
  '0.028*"uulitin" + 0.013*"staff" + 0.013*"tumagal" + 0.009*"jogging" + '
  '0.

In [156]:
# Score the fitted topics with the c_v coherence measure, computed from the
# tokenised reviews and the dictionary the model was trained with.
coherence_model_lda = CoherenceModel(
    coherence='c_v',
    dictionary=id2word,
    texts=processed_texts,
    model=lda_model,
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score:', coherence_lda)


Coherence Score: 0.4606332724074339
