In [21]:
import pandas as pd
from pprint import pprint
import nltk
from nltk.corpus import stopwords
from gensim.utils import simple_preprocess
from gensim.models import Phrases, LdaModel
from gensim.models.phrases import Phraser
from gensim import corpora
from gensim.models.coherencemodel import CoherenceModel
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Fetch NLTK's stopword corpus so stopwords.words('english') can be loaded later on
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vldth\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [22]:
# Read the raw review dataset from disk and preview its first rows
DATASET_PATH = 'SentiTaglish_ProductsAndServices.csv'
df = pd.read_csv(DATASET_PATH)
print("Original dataset:", df.head(), sep="\n")

Original dataset:
                                              review  sentiment
0  at first gumagana cya..pero pagnalowbat cya nd...          1
1  grabi pangalawa ko ng order sa shapee pero pur...          1
2  2l gray/black order ko. bakit 850ml lang po pi...          1
3  walang silbing product.. bwesit. di gumagana d...          1
4  d po maganda naman po yung neck fan, pero po n...          4


In [23]:
# Remove the sentiment label; only the review text is used from here on
reviews_df = df.drop(labels=['sentiment'], axis='columns')
reviews_preview = reviews_df.head()
print(reviews_preview)

                                              review
0  at first gumagana cya..pero pagnalowbat cya nd...
1  grabi pangalawa ko ng order sa shapee pero pur...
2  2l gray/black order ko. bakit 850ml lang po pi...
3  walang silbing product.. bwesit. di gumagana d...
4  d po maganda naman po yung neck fan, pero po n...


In [24]:
# Missing reviews are dropped first: astype(str) would otherwise turn NaN into
# the literal text "nan", which would then be tokenized as if it were a word.
documents = reviews_df['review'].dropna().astype(str).tolist()

In [25]:
# Load tagalog stopwords function
def load_stopwords(filepath):
    """Read a one-entry-per-line stopword file into a set.

    Parameters
    ----------
    filepath : str or path-like
        Text file encoded as UTF-8, with or without a byte-order mark.

    Returns
    -------
    set of str
        Every non-blank line with surrounding whitespace removed.

    Raises
    ------
    OSError
        If the file cannot be opened.
    """
    # 'utf-8-sig' discards a leading BOM when one is present. With plain
    # 'utf-8' the BOM character stays attached to the first entry (str.strip()
    # does not treat it as whitespace), so that stopword would never match.
    with open(filepath, 'r', encoding='utf-8-sig') as file:
        entries = (line.strip() for line in file)
        return {entry for entry in entries if entry}

In [26]:
# English stopword list provided by NLTK
english_stopwords = stopwords.words('english')

# Tagalog/Filipino stopwords read from the local word list
tagalog_stopwords = load_stopwords("stopwords-new.txt")

# One merged set, used for membership tests while filtering tokens
combined_stopwords = tagalog_stopwords | set(english_stopwords)

In [27]:
# Preprocessing function
def preprocess_data(documents):
    """Tokenize each document and remove stopwords.

    Every document is coerced to ``str``, tokenized with ``simple_preprocess``
    and filtered against the module-level ``combined_stopwords`` set.

    Returns a list holding one token list per input document, in input order.
    """
    tokenized_docs = []
    for raw_doc in documents:
        kept_tokens = []
        for token in simple_preprocess(str(raw_doc)):
            if token in combined_stopwords:
                continue
            kept_tokens.append(token)
        tokenized_docs.append(kept_tokens)
    return tokenized_docs

In [28]:
# Tokenize every review and strip stopwords; yields one token list per document
processed_texts = preprocess_data(documents)

In [29]:
# Learn phrases in two stages: first on the raw tokens, then on the
# bigram-transformed stream so longer phrases can build on detected pairs.
# NOTE(review): the second stage passes no min_count and therefore relies on
# the Phrases default, unlike the first stage's min_count=3 — confirm intended.
bigram = Phrases(processed_texts, min_count=3, threshold=5)
bigram_stream = bigram[processed_texts]
trigram = Phrases(bigram_stream, threshold=5)

# Wrap both trained models with Phraser for use when transforming documents
bigram_mod, trigram_mod = Phraser(bigram), Phraser(trigram)

In [30]:
# Apply phrase models
def make_ngrams(texts):
    """Run each token list through the bigram model, then the trigram model.

    Uses the module-level ``bigram_mod`` and ``trigram_mod`` objects and
    returns one transformed document per input document, in input order.
    """
    phrased_docs = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        phrased_docs.append(trigram_mod[with_bigrams])
    return phrased_docs

# NOTE(review): this rebinds processed_texts to its own transformed value, so
# re-running the cell feeds already-phrased tokens through the models again.
processed_texts = make_ngrams(processed_texts)

In [31]:
# Build the token dictionary, then encode every document with doc2bow
id2word = corpora.Dictionary(processed_texts)
corpus = list(map(id2word.doc2bow, processed_texts))


In [32]:
# Function to train and evaluate an LDA model
def train_evaluate_lda(corpus, id2word, texts, num_topics=10, passes=10, iterations=100, alpha='auto', random_state=42):
    """Fit one LDA model and score it with 'c_v' coherence.

    Parameters
    ----------
    corpus, id2word
        Passed to ``LdaModel`` as its training corpus and id-to-word mapping.
    texts
        Tokenized documents handed to ``CoherenceModel``.
    num_topics, passes, iterations, alpha, random_state
        Forwarded unchanged to ``LdaModel``.

    Returns
    -------
    tuple
        ``(lda, coherence)``: the fitted model and the value returned by
        ``CoherenceModel.get_coherence()``.
    """
    lda_settings = dict(
        corpus=corpus,
        id2word=id2word,
        num_topics=num_topics,
        random_state=random_state,
        passes=passes,
        iterations=iterations,
        alpha=alpha,
        per_word_topics=True,
    )
    lda = LdaModel(**lda_settings)

    scorer = CoherenceModel(model=lda, texts=texts, dictionary=id2word, coherence='c_v')
    return lda, scorer.get_coherence()

In [33]:
# Hyperparameter combinations to compare; each dict is one LDA training run
configs = [
    dict(num_topics=10, passes=20, iterations=200),
    dict(num_topics=12, passes=50, iterations=500),
    dict(num_topics=15, passes=30, iterations=300),
    dict(num_topics=8, passes=25, iterations=400),
]

# Collects one entry per trained model
results = list()

print("Training LDA models with different configurations...\n")

Training LDA models with different configurations...



In [34]:
# Train and evaluate one model per configuration.
# `results` is rebuilt here so that re-running this cell replaces earlier
# entries instead of appending duplicates to a list left over from a prior run.
results = []
for config in configs:
    print(f"Training config: {config}")
    lda_model, coherence = train_evaluate_lda(
        corpus=corpus,
        id2word=id2word,
        texts=processed_texts,
        **config
    )
    # Keep the fitted model next to its settings and score for later ranking
    results.append({
        'config': config,
        'model': lda_model,
        'coherence': coherence
    })
    print(f"Coherence Score: {coherence:.4f}\n")

Training config: {'num_topics': 10, 'passes': 20, 'iterations': 200}
Coherence Score: 0.5389

Training config: {'num_topics': 12, 'passes': 50, 'iterations': 500}
Coherence Score: 0.5201

Training config: {'num_topics': 15, 'passes': 30, 'iterations': 300}
Coherence Score: 0.4666

Training config: {'num_topics': 8, 'passes': 25, 'iterations': 400}
Coherence Score: 0.4664



In [35]:
# Rank the trained models from highest to lowest coherence
results = list(results)
results.sort(key=lambda entry: entry['coherence'], reverse=True)

In [36]:
# Report every configuration with its coherence score, in the current order of `results`
print("Model comparison:")
for res in results:
    cfg, score = res['config'], res['coherence']
    print(f"Config: {cfg} → Coherence: {score:.4f}")

Model comparison:
Config: {'num_topics': 10, 'passes': 20, 'iterations': 200} → Coherence: 0.5389
Config: {'num_topics': 12, 'passes': 50, 'iterations': 500} → Coherence: 0.5201
Config: {'num_topics': 15, 'passes': 30, 'iterations': 300} → Coherence: 0.4666
Config: {'num_topics': 8, 'passes': 25, 'iterations': 400} → Coherence: 0.4664


In [37]:
# Take the first entry of `results` as the best model and print its topics
top_result = results[0]
best_model = top_result['model']
print("\nBest Model Topics:")
topic_summaries = best_model.print_topics()
pprint(topic_summaries)


Best Model Topics:
[(0,
  '0.010*"xxl" + 0.009*"free" + 0.008*"mganda" + 0.008*"si" + '
  '0.008*"thank_thank" + 0.008*"tho" + 0.007*"zipper" + 0.007*"saktong_sakto" '
  '+ 0.007*"hrs" + 0.006*"nag_deliver"'),
 (1,
  '0.017*"gumagana" + 0.011*"working" + 0.009*"gamitin" + 0.007*"need" + '
  '0.007*"battery" + 0.007*"since" + 0.006*"alam" + 0.006*"complete" + '
  '0.006*"bubble_wrap" + 0.006*"good_condition"'),
 (2,
  '0.012*"well_packaged" + 0.009*"kay" + 0.008*"promise" + 0.007*"magaan" + '
  '0.007*"super_cute" + 0.007*"guys" + 0.007*"ok_price" + '
  '0.007*"excellent_quality" + 0.007*"wow" + 0.007*"sakto_size"'),
 (3,
  '0.018*"sulit" + 0.009*"sna" + 0.008*"always" + 0.007*"medyo_matagal" + '
  '0.007*"although" + 0.006*"recieved" + 0.006*"masarap" + 0.006*"yet" + '
  '0.006*"dents" + 0.005*"labas"'),
 (4,
  '0.023*"order_ulit" + 0.015*"kuya_rider" + 0.011*"well_packed" + '
  '0.011*"sobrang_ganda" + 0.010*"try" + 0.010*"nagdeliver" + 0.009*"malambot" '
  '+ 0.009*"cya" + 0.009*"sh

In [38]:
# Switch pyLDAvis to notebook rendering, then build and show the topic view
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(best_model, corpus, id2word)
pyLDAvis.display(vis)

In [39]:
# Hand-written comparison of each LDA topic with the open-coded aspect labels.
# The table gets its own name rather than rebinding `df`: `df` holds the raw
# dataset, and overwriting it makes re-running the column-drop cell fail
# because the 'sentiment' column no longer exists.
# Each tuple is one topic: (topic label, partial keywords, matching aspects, notes).
topic_aspect_rows = [
    (
        "Topic 0",
        'xxl, free, zipper, saktong_sakto, nag_deliver',
        "PRO#SIZE, DEL#TIME, PRO#MAT",
        "Mentions of sizing, delivery timing, and product construction",
    ),
    (
        "Topic 1",
        'gumagana, working, battery, complete, good_condition',
        "PRO#FUNC, PRO#EFF, PRO#COND",
        "Functionality and condition of items, esp. electronics",
    ),
    (
        "Topic 2",
        'well_packaged, magaan, excellent_quality, ok_price',
        "DEL#COND, PRO#GEN, PRI#VOM",
        "Packaging, general quality, and pricing",
    ),
    (
        "Topic 3",
        'sulit, recieved, masarap, dents',
        "PRI#VOM, DEL#COND, PRO#EFF",
        "Value for money, food quality, and damage",
    ),
    (
        "Topic 4",
        'order_ulit, sobrang_ganda, malambot, shoes',
        "SAT#RECO, PRO#MAT, SAT#EMO",
        "Reorders, satisfaction, and softness of materials",
    ),
    (
        "Topic 5",
        'salamat_seller, amoy, mabango, sana_tumagal',
        "SAT#EMO, PRO#DUR, SER#GEN",
        "Thanks, scent-related comments, and durability",
    ),
    (
        "Topic 6",
        'ganda_quality, makapal, perfect, good_price',
        "PRO#GEN, PRO#MAT, PRI#VOM",
        "General praise, thickness, and pricing satisfaction",
    ),
    (
        "Topic 7",
        'uulitin, thankyou_seller, sulit_price, yupi',
        "SAT#RECO, DEL#COND, PRI#VOM",
        "Repeat order, damage in transit, and value",
    ),
    (
        "Topic 8",
        'maraming_salamat, second_order, pen, hotel',
        "SAT#EMO, SAT#RECO, SER#GEN",
        "Thankfulness, second order = positive emotional feedback",
    ),
    (
        "Topic 9",
        'maganda, item, order, seller, sana',
        "PRO#GEN, SER#GEN, SAT#GEN",
        "Vague praise and seller mentions, low specificity",
    ),
]

topic_aspect_df = pd.DataFrame(
    topic_aspect_rows,
    columns=[
        "LDA Topic",
        "Top Keywords (Partial)",
        "Matching Open-Coded Aspects",
        "Notes",
    ],
)

# Display the table without truncating long cell text
pd.set_option('display.max_colwidth', None)
topic_aspect_df

Unnamed: 0,LDA Topic,Top Keywords (Partial),Matching Open-Coded Aspects,Notes
0,Topic 0,"xxl, free, zipper, saktong_sakto, nag_deliver","PRO#SIZE, DEL#TIME, PRO#MAT","Mentions of sizing, delivery timing, and product construction"
1,Topic 1,"gumagana, working, battery, complete, good_condition","PRO#FUNC, PRO#EFF, PRO#COND","Functionality and condition of items, esp. electronics"
2,Topic 2,"well_packaged, magaan, excellent_quality, ok_price","DEL#COND, PRO#GEN, PRI#VOM","Packaging, general quality, and pricing"
3,Topic 3,"sulit, recieved, masarap, dents","PRI#VOM, DEL#COND, PRO#EFF","Value for money, food quality, and damage"
4,Topic 4,"order_ulit, sobrang_ganda, malambot, shoes","SAT#RECO, PRO#MAT, SAT#EMO","Reorders, satisfaction, and softness of materials"
5,Topic 5,"salamat_seller, amoy, mabango, sana_tumagal","SAT#EMO, PRO#DUR, SER#GEN","Thanks, scent-related comments, and durability"
6,Topic 6,"ganda_quality, makapal, perfect, good_price","PRO#GEN, PRO#MAT, PRI#VOM","General praise, thickness, and pricing satisfaction"
7,Topic 7,"uulitin, thankyou_seller, sulit_price, yupi","SAT#RECO, DEL#COND, PRI#VOM","Repeat order, damage in transit, and value"
8,Topic 8,"maraming_salamat, second_order, pen, hotel","SAT#EMO, SAT#RECO, SER#GEN","Thankfulness, second order = positive emotional feedback"
9,Topic 9,"maganda, item, order, seller, sana","PRO#GEN, SER#GEN, SAT#GEN","Vague praise and seller mentions, low specificity"
