In [26]:
import pandas as pd
from pprint import pprint
from nltk.corpus import stopwords
from gensim.utils import simple_preprocess
from gensim.models import Phrases, LdaModel
from gensim.models.phrases import Phraser
from gensim import corpora
from gensim.models.coherencemodel import CoherenceModel

In [27]:
import nltk

# Fetch the NLTK stopwords corpus only when it is not already installed, so
# that re-running the notebook (or running it offline) does not trigger a
# network round-trip every time.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vldth\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [28]:
# Read the review CSV into a DataFrame and preview its first rows
df = pd.read_csv('dataset/SentiTaglish_ProductsAndServices.csv')
print("Original dataset:", df.head(), sep="\n")

Original dataset:
                                              review  sentiment
0  at first gumagana cya..pero pagnalowbat cya nd...          1
1  grabi pangalawa ko ng order sa shapee pero pur...          1
2  2l gray/black order ko. bakit 850ml lang po pi...          1
3  walang silbing product.. bwesit. di gumagana d...          1
4  d po maganda naman po yung neck fan, pero po n...          4


In [29]:
# Keep everything except the 'sentiment' label column, then preview the result
reviews_df = df.drop('sentiment', axis=1)
print(reviews_df.head())

                                              review
0  at first gumagana cya..pero pagnalowbat cya nd...
1  grabi pangalawa ko ng order sa shapee pero pur...
2  2l gray/black order ko. bakit 850ml lang po pi...
3  walang silbing product.. bwesit. di gumagana d...
4  d po maganda naman po yung neck fan, pero po n...


In [30]:
# Build the list of raw review strings. Missing reviews are replaced with ""
# first: astype(str) alone would turn a missing value into the literal text
# "nan", which would then be tokenized as if it were a real word. Using
# fillna (rather than dropna) keeps one entry per row of reviews_df.
documents = reviews_df['review'].fillna('').astype(str).tolist()

In [31]:
# Load a stopword list from a plain-text file (one word per line)
def load_stopwords(filepath):
    """Read a one-word-per-line stopword file into a set.

    Args:
        filepath: Path to a UTF-8 text file (with or without a BOM).

    Returns:
        A set of the non-blank lines, stripped of surrounding whitespace and
        lower-cased.

    Raises:
        OSError: If the file cannot be opened.
    """
    # 'utf-8-sig' drops a leading byte-order mark if present; with plain
    # 'utf-8' the BOM would stay glued to the first word and that word would
    # never match any token.
    with open(filepath, 'r', encoding='utf-8-sig') as file:
        stripped = (line.strip() for line in file)
        # Lower-case so entries can match tokens from gensim's
        # simple_preprocess, which lower-cases its output.
        return {word.lower() for word in stripped if word}

In [32]:
# Assemble a single stopword set from the two word lists used in this notebook

# Tagalog/Filipino list, read from a local text file
tagalog_stopwords = load_stopwords("stopwords-tl.txt")

# English list, taken from the NLTK stopwords corpus
english_stopwords = stopwords.words('english')

# Merge both lists; duplicates collapse because the result is a set
combined_stopwords = {*english_stopwords, *tagalog_stopwords}

In [33]:
# Tokenize documents and remove stopwords
def preprocess_data(documents):
    """Turn raw documents into stopword-free token lists.

    Each document is coerced to ``str``, tokenized with gensim's
    ``simple_preprocess``, and filtered against the module-level
    ``combined_stopwords`` collection.

    Args:
        documents: Iterable of documents (anything ``str()`` accepts).

    Returns:
        A list with one token list per input document, in input order.
    """
    cleaned = []
    for doc in documents:
        tokens = simple_preprocess(str(doc))
        cleaned.append([tok for tok in tokens if tok not in combined_stopwords])
    return cleaned

In [34]:
# Step 1: tokenize the raw documents and strip stopwords
processed_texts = preprocess_data(documents)

# Step 2: learn bigram phrases from the tokens, then learn a second phrase
# model on top of the bigram-merged corpus (this is what yields trigrams).
# NOTE(review): only the bigram model passes min_count=3; the trigram model
# relies on the library default — confirm this asymmetry is intended.
bigram = Phrases(processed_texts, min_count=3, threshold=5)
trigram = Phrases(bigram[processed_texts], threshold=5)

# Step 3: wrap both trained models in Phraser objects for applying them
bigram_mod = Phraser(bigram)
trigram_mod = Phraser(trigram)


# Step 4: helper that runs every document through both phrase models
def make_ngrams(texts):
    """Apply the bigram model, then the trigram model, to each document.

    Args:
        texts: Iterable of token lists.

    Returns:
        A list with one phrase-merged token sequence per input document.
    """
    return [trigram_mod[bigram_mod[tokens]] for tokens in texts]


ngrammed_texts = make_ngrams(processed_texts)

# Step 5: BERTopic is fed strings, so glue each token list back together
texts_for_bertopic = list(map(' '.join, ngrammed_texts))

In [35]:
# Import BERTopic in its own cell. The print below is a smoke check: it is
# only reached if the import above did not raise.
from bertopic import BERTopic
print("BERTopic is working!")

BERTopic is working!


In [36]:
# Initialize BERTopic with its "multilingual" language setting (the reviews
# are filtered against both English and Tagalog stopword lists earlier on).
# NOTE(review): no random seed is set anywhere in view, so topic assignments
# may differ between runs — confirm whether reproducibility is required.
topic_model = BERTopic(language="multilingual")

# Fit the model on the preprocessed, n-gram-merged review strings.
# fit_transform's two return values are kept as `topics` and `probs`.
topics, probs = topic_model.fit_transform(texts_for_bertopic)

In [38]:
# Lift pandas' display limits so the topic table prints in full:
#   max_columns  -> show all columns
#   max_rows     -> show all rows
#   max_colwidth -> show full cell content instead of truncating it
#   width        -> avoid wrapping to new lines
for display_option in (
    "display.max_columns",
    "display.max_rows",
    "display.max_colwidth",
    "display.width",
):
    pd.set_option(display_option, None)

# Fetch the topic summary table from the fitted model and print it
topic_info = topic_model.get_topic_info()
print(topic_info)

     Topic  Count  \
0       -1   3129   
1        0   1042   
2        1    421   
3        2    376   
4        3    219   
5        4    219   
6        5    216   
7        6    210   
8        7    198   
9        8    180   
10       9    168   
11      10    164   
12      11    153   
13      12    138   
14      13    126   
15      14    116   
16      15    115   
17      16    105   
18      17    105   
19      18    102   
20      19    100   
21      20     92   
22      21     92   
23      22     90   
24      23     87   
25      24     86   
26      25     81   
27      26     78   
28      27     72   
29      28     70   
30      29     69   
31      30     65   
32      31     64   
33      32     61   
34      33     55   
35      34     54   
36      35     53   
37      36     52   
38      37     49   
39      38     46   
40      39     46   
41      40     45   
42      41     44   
43      42     43   
44      43     42   
45      44     42   
46      45   