In [None]:
#!pip install pandas bertopic keybert gensim sentence-transformers scikit-learn hdbscan
#!pip install bertopic==0.16.0 keybert==0.7.0 hdbscan==0.8.33

In [6]:
import pandas as pd
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired

In [8]:
import os

# Disable HuggingFace tokenizers parallelism warning
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [None]:
df = pd.read_csv('../../data/combined_data_scores.csv')
# use line below to load the data if using Kaggle
#df = pd.read_csv('/kaggle/input/combined-data-scores/combined_data_scores.csv')
df = df[['text', 'yearmonth', 'title', 'index', 'average_toxicity_score']]

## Evaluation of BERTopic on sample of data using Coherence Score

Using `yearmonth` = `2023-10`

In [None]:
df_sample = df[df['yearmonth'] == '2023-10']

Initialise BERTopic model

In [None]:
# Create representation model
representation_model = KeyBERTInspired()

# Use the representation model in BERTopic on top of the default pipeline
topic_model = BERTopic(representation_model=representation_model, nr_topics='auto')

# Fit the model on text data
topics, probabilities = topic_model.fit_transform(df_sample['text'])

In [14]:
# Get the top 20 most frequent topics
topic_model.get_topic_info().head(20)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,11293,-1_money_because_life_singapore,"[money, because, life, singapore, say, think, ...",[> Why must Holy and unholy be defined the sa...
1,0,3461,0_gaza_palestinians_hamas_palestinian,"[gaza, palestinians, hamas, palestinian, pales...",[I dont see why people keep having the mindset...
2,1,716,1_cyclist_cyclists_roads_traffic,"[cyclist, cyclists, roads, traffic, lanes, cyc...","[>Actually, this is a huge problem because of ..."
3,2,652,2_bullying_bullied_bully_offensive,"[bullying, bullied, bully, offensive, revenge,...","[That is what I’m saying though, that I’ve “ma..."
4,3,435,3_rice_chicken_meals_meal,"[rice, chicken, meals, meal, food, meat, eatin...","[Chicken rice. My go to post gym meal., Rather..."
5,4,326,4_income_salary_earn_earning,"[income, salary, earn, earning, 5k, rich, 4k, ...",[Yeah i agree with the stats being skewed. But...
6,5,240,5_prices_affordable_btos_afford,"[prices, affordable, btos, afford, cost, bto, ...","[Latest BTO Prices from $364,000 to $509,000 3..."
7,6,228,6_depression_psychiatric_psychiatry_illnesses,"[depression, psychiatric, psychiatry, illnesse...",[Is her mental health ok? Could she be dealing...
8,7,211,7_pap_paps_papbut_pappies,"[pap, paps, papbut, pappies, paiseh, pa, pappy...","[Vote for pap, PAP so good., PAP]"
9,8,203,8_wipe_wiping_wipes_cleaning,"[wipe, wiping, wipes, cleaning, tables, cleanl...","[Need to clear table after eating. In future, ..."


In [None]:
topic_model.visualize_topics()

Coherence Score

In [15]:
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel

# Preprocess documents
cleaned_docs = topic_model._preprocess_text(df_sample['text'])

# Extract vectorizer and tokenizer from BERTopic
vectorizer = topic_model.vectorizer_model
tokenizer = vectorizer.build_tokenizer()

# Tokenize the documents
tokens = [tokenizer(doc) for doc in cleaned_docs]

# Create Gensim dictionary and corpus
dictionary = corpora.Dictionary(tokens)
corpus = [dictionary.doc2bow(token) for token in tokens]

# Get the top topic words for each topic from BERTopic
topic_words = [[word for word, _ in topic_model.get_topic(topic)] 
               for topic in range(len(set(topics))-1)]

# Convert topic words to token IDs using the Gensim dictionary
topic_ids = [[dictionary.token2id[word] for word in topic if word in dictionary.token2id] 
             for topic in topic_words]

# Evaluate coherence score
coherence_model = CoherenceModel(topics=topic_ids, 
                                 texts=tokens, 
                                 corpus=corpus,
                                 dictionary=dictionary, 
                                 coherence='c_v')

# Get coherence score
coherence_score = coherence_model.get_coherence()
print(f"Coherence Score: {coherence_score}")


  self.pid = os.fork()
  self.pid = os.fork()


Coherence Score: 0.33345405750800106


# Possible Improvemnt through Manual Data Cleaning and Parameter Tuning

In [None]:
#!pip install nltk

In [29]:
import pandas as pd
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
from umap import UMAP
import os
from pathlib import Path

Define `preprocess_text()`

In [30]:
def setup_nltk_in_kaggle():
    """
    Setup NLTK specifically for Kaggle environment
    """
    # Create directory in the Kaggle working directory
    nltk_data_dir = Path('/kaggle/working/nltk_data')
    nltk_data_dir.mkdir(parents=True, exist_ok=True)
    
    # Set the NLTK_DATA environment variable
    os.environ['NLTK_DATA'] = str(nltk_data_dir)
    
    # Add the path to NLTK's search paths
    nltk.data.path.append(str(nltk_data_dir))
    
    # Download required resources
    print("Downloading NLTK resources...")
    for resource in ['wordnet', 'stopwords', 'omw-1.4']:
        try:
            nltk.download(resource, download_dir=str(nltk_data_dir), quiet=False)
        except Exception as e:
            print(f"Error downloading {resource}: {e}")
    
    print("\nNLTK data paths:", nltk.data.path)
    
    # Alternative preprocessing function that doesn't rely on WordNet
    def preprocess_text(texts):
        """
        Preprocess text without relying on WordNet lemmatization
        """
        # Basic English stop words
        stop_words = set([
            'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 
            'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 
            'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 
            'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 
            'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 
            'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 
            'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 
            'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 
            'with', 'about', 'against', 'between', 'into', 'through', 'during', 
            'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 
            'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 
            'then', 'once'
        ])
        
        def clean_text(text):
            # Convert to string and lowercase
            text = str(text).lower()
            # Remove special characters and extra whitespace
            text = ' '.join(''.join(c if c.isalnum() else ' ' for c in text).split())
            # Remove stop words and short tokens
            tokens = [token for token in text.split() if token not in stop_words and len(token) > 2]
            return ' '.join(tokens)
        
        return [clean_text(text) for text in texts]
    
    return preprocess_text

# Set up NLTK and get the preprocessing function
preprocess_text = setup_nltk_in_kaggle()

Downloading NLTK resources...
[nltk_data] Downloading package wordnet to
[nltk_data]     /kaggle/working/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /kaggle/working/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /kaggle/working/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!

NLTK data paths: ['/root/nltk_data', '/usr/share/nltk_data', '/usr/local/share/nltk_data', '/usr/lib/nltk_data', '/usr/local/lib/nltk_data', '/kaggle/working/nltk_data', '/kaggle/working/nltk_data']


Initialise BERTopic model with parameters `min_df`, `max_df`, `ngram_range`, `n_neighbors`, `n_components` and `min_dist` tuned

In [None]:
# Example usage:
df = pd.read_csv('../../data/combined_data_scores.csv')
# use the line below to load the data if using Kaggle
#df = pd.read_csv('/kaggle/input/combined-data-scores/combined_data_scores.csv')
yearmonth = '2023-10'
df_sample = df[df['yearmonth'] == yearmonth]

# Preprocess the texts
preprocessed_texts = preprocess_text(df_sample['text'])

# Initialize BERTopic model
representation_model = KeyBERTInspired()
vectorizer_model = CountVectorizer(min_df=5,
                                   max_df=0.9,
                                   ngram_range=(1, 2),
                                   stop_words="english")
umap_model = UMAP(
    n_neighbors=15,
    n_components=10,
    min_dist=0.1,
    metric='cosine'
)
topic_model = BERTopic(
    vectorizer_model=vectorizer_model,
    umap_model=umap_model,
    representation_model=representation_model,  # Use a transformer model for better coherence
    nr_topics=50,                             # Set a target number of topics
    min_topic_size=10                          # Minimum documents per topic
)

# Fit the model
topics, probabilities = topic_model.fit_transform(preprocessed_texts)

# Get topic info
print("\nTop 20 Topics:")
print(topic_model.get_topic_info().head(20))

  pid = os.fork()



Top 20 Topics:
    Topic  Count                                              Name  \
0      -1  13177  -1_singaporeans_countries_government_singaporean   
1       0   1541                               0_yes_yep_yeah_nope   
2       1   1479     1_singapore_singaporeans_singaporean_malaysia   
3       2   1352                   2_peace_war_violence_supporting   
4       3    773                   3_affordable_afford_prices_loan   
5       4    738            4_bus_public transport_driving_vehicle   
6       5    731       5_advertising_marketing_reviews_influencers   
7       6    729                   6_police_report_arrest_arrested   
8       7    599                        7_sinkie_sinkies_rice_meal   
9       8    595                           8_ceca_cai_siao_cai fan   
10      9    522                          9_women_gender_men_woman   
11     10    348                10_patients_doctors_medical_clinic   
12     11    316                      11_ugly_attractive_face_look   
13  

Improved Coherence Score

In [34]:
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel
import numpy as np

def calculate_topic_coherence(topic_model, texts, topics):
    """
    Calculate topic coherence for a BERTopic model
    
    Parameters:
    -----------
    topic_model : BERTopic model
        The fitted BERTopic model
    texts : list
        List of preprocessed text documents
    topics : list
        List of assigned topics from fit_transform
    """
    # Extract vectorizer and build tokenizer
    vectorizer = topic_model.vectorizer_model
    tokenizer = vectorizer.build_tokenizer()
    
    # Tokenize texts
    tokens = [tokenizer(doc) for doc in texts]
    
    # Create dictionary and corpus
    dictionary = corpora.Dictionary(tokens)
    
    # Create mapping of word to id
    word2id = {word: idx for idx, word in dictionary.items()}
    
    # Get topic words for each topic
    unique_topics = sorted(set(topics) - {-1})  # Exclude -1 topic
    topic_words = []
    
    for topic_idx in unique_topics:
        # Get the words for this topic
        topic = topic_model.get_topic(topic_idx)
        if topic:  # Only include non-empty topics
            # Extract just the words (not the weights)
            words = [word for word, _ in topic[:10]]  # Take top 10 words per topic
            # Convert words to dictionary ids
            word_ids = [word2id[word] for word in words if word in word2id]
            if word_ids:  # Only add if we have valid words
                topic_words.append(word_ids)
    
    # Calculate coherence only if we have valid topics
    if topic_words:
        try:
            coherence_model = CoherenceModel(
                topics=topic_words,
                texts=tokens,
                dictionary=dictionary,
                coherence='c_v'
            )
            return coherence_model.get_coherence()
        except Exception as e:
            print(f"Error calculating coherence: {e}")
            return None
    return None

def calculate_per_topic_coherence(topic_model, texts, topics):
    """
    Calculate coherence scores for each individual topic
    """
    # Extract vectorizer and build tokenizer
    vectorizer = topic_model.vectorizer_model
    tokenizer = vectorizer.build_tokenizer()
    
    # Tokenize texts
    tokens = [tokenizer(doc) for doc in texts]
    
    # Create dictionary and corpus
    dictionary = corpora.Dictionary(tokens)
    word2id = {word: idx for idx, word in dictionary.items()}
    
    unique_topics = sorted(set(topics) - {-1})  # Exclude -1 topic
    per_topic_coherence = {}
    
    for topic_idx in unique_topics:
        # Get the words for this topic
        topic = topic_model.get_topic(topic_idx)
        if topic:
            # Extract just the words (not the weights)
            words = [word for word, _ in topic[:10]]  # Take top 10 words
            # Convert words to dictionary ids
            word_ids = [word2id[word] for word in words if word in word2id]
            if word_ids:
                try:
                    coherence_model = CoherenceModel(
                        topics=[word_ids],
                        texts=tokens,
                        dictionary=dictionary,
                        coherence='c_v'
                    )
                    per_topic_coherence[topic_idx] = coherence_model.get_coherence()
                except Exception as e:
                    print(f"Error calculating coherence for topic {topic_idx}: {e}")
                    per_topic_coherence[topic_idx] = None
    
    return per_topic_coherence

# Calculate coherence score
print("Calculating overall coherence score...")
coherence_score = calculate_topic_coherence(topic_model, preprocessed_texts, topics)

if coherence_score is not None:
    print(f"\nTopic Coherence Score (C_v): {coherence_score:.4f}")
else:
    print("\nCould not calculate overall coherence score")

Calculating overall coherence score...


  self.pid = os.fork()
  self.pid = os.fork()



Topic Coherence Score (C_v): 0.4220
