In [4]:
import pandas as pd
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired

In [6]:
import os

# Disable HuggingFace tokenizers parallelism warning
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [7]:
df = pd.read_csv('/kaggle/input/combined-data-scores/combined_data_scores.csv')
df = df[['text', 'yearmonth', 'title', 'index', 'average_toxicity_score']]

In [None]:
df_sample = df[df['yearmonth'] == '2022-10']

In [None]:
# Create representation model
representation_model = KeyBERTInspired()

# Use the representation model in BERTopic on top of the default pipeline
topic_model = BERTopic(representation_model=representation_model, nr_topics='auto')

# Fit the model on text data
topics, probabilities = topic_model.fit_transform(df_sample['text'])

In [None]:
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel

# Preprocess documents
cleaned_docs = topic_model._preprocess_text(df_sample['text'])

# Extract vectorizer and tokenizer from BERTopic
vectorizer = topic_model.vectorizer_model
tokenizer = vectorizer.build_tokenizer()

# Tokenize the documents
tokens = [tokenizer(doc) for doc in cleaned_docs]

# Create Gensim dictionary and corpus
dictionary = corpora.Dictionary(tokens)
corpus = [dictionary.doc2bow(token) for token in tokens]

# Get the top topic words for each topic from BERTopic
topic_words = [[word for word, _ in topic_model.get_topic(topic)] 
               for topic in range(len(set(topics))-1)]

# Convert topic words to token IDs using the Gensim dictionary
topic_ids = [[dictionary.token2id[word] for word in topic if word in dictionary.token2id] 
             for topic in topic_words]

# Evaluate coherence score
coherence_model = CoherenceModel(topics=topic_ids, 
                                 texts=tokens, 
                                 corpus=corpus,
                                 dictionary=dictionary, 
                                 coherence='c_v')

# Get coherence score
coherence_score = coherence_model.get_coherence()
print(f"Coherence Score: {coherence_score}")


Improvemnt in Data Preprocessing and Coherent Score Calculation

In [8]:
import re
import pandas as pd
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora import Dictionary

In [9]:
# Define function for text preprocessing
def preprocess_text(texts):
    # Initialize stop words
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    def clean_text(text):
        # Remove non-alphanumeric characters
        text = re.sub(r'\W+', ' ', text)
        # Tokenize and lemmatize
        tokens = text.lower().split()
        tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
        return ' '.join(tokens)
    
    return [clean_text(text) for text in texts]

# Function to get topics for each yearmonth
def process_topics_by_year(df, year, output_csv, start_month=1):
    """
    Process topics for all yearmonths in the specified year using BERTopic, and output results into a csv file.
    
    Parameters:
    - df: The original DataFrame containing the data.
    - year: The year for which to process the data (e.g., 2022).
    - output_csv: The output CSV file to save the results.
    - start_month: The starting month for processing (default is 1).
    """
    # Filter the DataFrame for the specified year
    df_filtered_year = df[df['yearmonth'].str.startswith(str(year))]

    # Get unique year-month combinations for the specified year
    unique_yearmonths = df_filtered_year['yearmonth'].unique()

    for idx, yearmonth in enumerate(unique_yearmonths):
        # Skip months before the start_month
        if int(yearmonth[-2:]) < start_month:
            continue
        else:
            print(f"Processing {yearmonth}...")
            
            # Filter data for the current year-month
            df_filtered = df[df['yearmonth'] == yearmonth]
            
            # Preprocess the text data
            preprocessed_texts = preprocess_text(df_filtered['text'])
            
            # Initialise representation model
            representation_model = KeyBERTInspired()

            # Initialize BERTopic model with custom vectorizer and parameters
            vectorizer_model = CountVectorizer(min_df=10, stop_words="english")
            topic_model = BERTopic(representation_model=representation_model, nr_topics="auto", 
                                   vectorizer_model=vectorizer_model, min_topic_size=10)

            # Fit the model on the preprocessed text data
            topics, probabilities = topic_model.fit_transform(preprocessed_texts)

            # Save topics per document
            df_topics = pd.DataFrame({
                'index': df_filtered['index'],
                'Topic': topics
            })

            # Save topic information (such as topic words and frequencies)
            topic_info = topic_model.get_topic_info()

            # Merge topics with original data
            df_combined = pd.merge(df_filtered, df_topics, on='index', how='left')

            # Merge topic info with the combined data
            df_final = pd.merge(df_combined, topic_info, on='Topic', how='left')

            # Write the result to the output CSV file
            if idx == 0:
                df_final.to_csv(output_csv, index=False, mode='w')  # Write header for the first batch
            else:
                df_final.to_csv(output_csv, index=False, mode='a', header=False)  # Append mode without header

            print(f"Processing for {yearmonth} is complete.")


In [None]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [26]:
import nltk
import os
from pathlib import Path

def setup_nltk_in_kaggle():
    """
    Setup NLTK specifically for Kaggle environment
    """
    # Create directory in the Kaggle working directory
    nltk_data_dir = Path('/kaggle/working/nltk_data')
    nltk_data_dir.mkdir(parents=True, exist_ok=True)
    
    # Set the NLTK_DATA environment variable
    os.environ['NLTK_DATA'] = str(nltk_data_dir)
    
    # Add the path to NLTK's search paths
    nltk.data.path.append(str(nltk_data_dir))
    
    # Download required resources
    print("Downloading NLTK resources...")
    for resource in ['wordnet', 'stopwords', 'omw-1.4']:
        try:
            nltk.download(resource, download_dir=str(nltk_data_dir), quiet=False)
        except Exception as e:
            print(f"Error downloading {resource}: {e}")
    
    print("\nNLTK data paths:", nltk.data.path)
    
    # Alternative preprocessing function that doesn't rely on WordNet
    def preprocess_text(texts):
        """
        Preprocess text without relying on WordNet lemmatization
        """
        # Basic English stop words
        stop_words = set([
            'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 
            'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 
            'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 
            'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 
            'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 
            'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 
            'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 
            'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 
            'with', 'about', 'against', 'between', 'into', 'through', 'during', 
            'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 
            'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 
            'then', 'once'
        ])
        
        def clean_text(text):
            # Convert to string and lowercase
            text = str(text).lower()
            # Remove special characters and extra whitespace
            text = ' '.join(''.join(c if c.isalnum() else ' ' for c in text).split())
            # Remove stop words and short tokens
            tokens = [token for token in text.split() if token not in stop_words and len(token) > 2]
            return ' '.join(tokens)
        
        return [clean_text(text) for text in texts]
    
    return preprocess_text

# Set up NLTK and get the preprocessing function
preprocess_text = setup_nltk_in_kaggle()

Downloading NLTK resources...
[nltk_data] Downloading package wordnet to
[nltk_data]     /kaggle/working/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /kaggle/working/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /kaggle/working/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!

NLTK data paths: ['/root/nltk_data', '/usr/share/nltk_data', '/usr/local/share/nltk_data', '/usr/lib/nltk_data', '/usr/local/lib/nltk_data', '/kaggle/working/nltk_data', '/kaggle/working/nltk_data', '/kaggle/working/nltk_data', '/kaggle/working/nltk_data', '/kaggle/working/nltk_data', '/kaggle/working/nltk_data']


In [14]:
# Example usage:
df = pd.read_csv('/kaggle/input/combined-data-scores/combined_data_scores.csv')
yearmonth = '2022-10'
df_sample = df[df['yearmonth'] == yearmonth]

# Preprocess the texts
preprocessed_texts = preprocess_text(df_sample['text'])

# Initialize BERTopic model
representation_model = KeyBERTInspired()
vectorizer_model = CountVectorizer(min_df=10, stop_words="english")
topic_model = BERTopic(
    representation_model=representation_model,
    nr_topics="auto",
    vectorizer_model=vectorizer_model,
    min_topic_size=10
)

# Fit the model
topics, probabilities = topic_model.fit_transform(preprocessed_texts)

# Get topic info
print("\nTop 20 Topics:")
print(topic_model.get_topic_info().head(20))

  pid = os.fork()



Top 20 Topics:
    Topic  Count                                        Name  \
0      -1  33232  -1_housing_singaporeans_income_singaporean   
1       0   2345           0_drugs_singaporeans_crimes_crime   
2       1    588           1_teachers_teacher_teaching_teach   
3       2    557                  2_women_females_men_gender   
4       3    547                 3_sauce_recipe_flavour_dish   
5       4    500              4_weather_climate_rainy_winter   
6       5    492           5_covid_pandemic_infection_immune   
7       6    485                      6_jobs_job_hiring_hire   
8       7    474                                       7____   
9       8    440                    8_toilet_pee_drain_dirty   
10      9    417       9_shipping_delivery_pricing_purchases   
11     10    408              10_thanks_thank_welcome_cheers   
12     11    255                        11_yes_yep_nope_yeah   
13     12    253              12_landlord_rents_rent_renting   
14     13    253        

In [19]:
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel
import numpy as np

def calculate_topic_coherence(topic_model, texts, topics):
    """
    Calculate topic coherence for a BERTopic model
    
    Parameters:
    -----------
    topic_model : BERTopic model
        The fitted BERTopic model
    texts : list
        List of preprocessed text documents
    topics : list
        List of assigned topics from fit_transform
    """
    # Extract vectorizer and build tokenizer
    vectorizer = topic_model.vectorizer_model
    tokenizer = vectorizer.build_tokenizer()
    
    # Tokenize texts
    tokens = [tokenizer(doc) for doc in texts]
    
    # Create dictionary and corpus
    dictionary = corpora.Dictionary(tokens)
    
    # Create mapping of word to id
    word2id = {word: idx for idx, word in dictionary.items()}
    
    # Get topic words for each topic
    unique_topics = sorted(set(topics) - {-1})  # Exclude -1 topic
    topic_words = []
    
    for topic_idx in unique_topics:
        # Get the words for this topic
        topic = topic_model.get_topic(topic_idx)
        if topic:  # Only include non-empty topics
            # Extract just the words (not the weights)
            words = [word for word, _ in topic[:10]]  # Take top 10 words per topic
            # Convert words to dictionary ids
            word_ids = [word2id[word] for word in words if word in word2id]
            if word_ids:  # Only add if we have valid words
                topic_words.append(word_ids)
    
    # Calculate coherence only if we have valid topics
    if topic_words:
        try:
            coherence_model = CoherenceModel(
                topics=topic_words,
                texts=tokens,
                dictionary=dictionary,
                coherence='c_v'
            )
            return coherence_model.get_coherence()
        except Exception as e:
            print(f"Error calculating coherence: {e}")
            return None
    return None

def calculate_per_topic_coherence(topic_model, texts, topics):
    """
    Calculate coherence scores for each individual topic
    """
    # Extract vectorizer and build tokenizer
    vectorizer = topic_model.vectorizer_model
    tokenizer = vectorizer.build_tokenizer()
    
    # Tokenize texts
    tokens = [tokenizer(doc) for doc in texts]
    
    # Create dictionary and corpus
    dictionary = corpora.Dictionary(tokens)
    word2id = {word: idx for idx, word in dictionary.items()}
    
    unique_topics = sorted(set(topics) - {-1})  # Exclude -1 topic
    per_topic_coherence = {}
    
    for topic_idx in unique_topics:
        # Get the words for this topic
        topic = topic_model.get_topic(topic_idx)
        if topic:
            # Extract just the words (not the weights)
            words = [word for word, _ in topic[:10]]  # Take top 10 words
            # Convert words to dictionary ids
            word_ids = [word2id[word] for word in words if word in word2id]
            if word_ids:
                try:
                    coherence_model = CoherenceModel(
                        topics=[word_ids],
                        texts=tokens,
                        dictionary=dictionary,
                        coherence='c_v'
                    )
                    per_topic_coherence[topic_idx] = coherence_model.get_coherence()
                except Exception as e:
                    print(f"Error calculating coherence for topic {topic_idx}: {e}")
                    per_topic_coherence[topic_idx] = None
    
    return per_topic_coherence

# Calculate coherence score
print("Calculating overall coherence score...")
coherence_score = calculate_topic_coherence(topic_model, preprocessed_texts, topics)

if coherence_score is not None:
    print(f"\nTopic Coherence Score (C_v): {coherence_score:.4f}")
else:
    print("\nCould not calculate overall coherence score")

# Calculate and print per-topic coherence scores
print("\nCalculating per-topic coherence scores...")
per_topic_scores = calculate_per_topic_coherence(topic_model, preprocessed_texts, topics)
print("\nPer-topic coherence scores:")
for topic, score in per_topic_scores.items():
    if score is not None:
        print(f"Topic {topic}: {score:.4f}")
    else:
        print(f"Topic {topic}: Could not calculate")

Calculating overall coherence score...


  self.pid = os.fork()
  self.pid = os.fork()



Topic Coherence Score (C_v): 0.3890

Calculating per-topic coherence scores...


Exception ignored in: <function _releaseLock at 0x7f7561ae0550>
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/logging/__init__.py", line 228, in _releaseLock
    def _releaseLock():
KeyboardInterrupt: 



Per-topic coherence scores:
Topic 0: 0.5018
Topic 1: 0.8172
Topic 2: 0.7348
Topic 3: 0.4741
Topic 4: 0.6870
Topic 5: 0.5177
Topic 6: 0.5924
Topic 8: 0.3151
Topic 9: 0.2668
Topic 10: 0.3469
Topic 11: 0.3239
Topic 12: 0.7671
Topic 13: 0.5096
Topic 14: 0.7815
Topic 15: 0.5582
Topic 16: 0.2685
Topic 17: 0.6695
Topic 18: 0.3281
Topic 19: 0.6500
Topic 20: 0.2392
Topic 21: 0.6216
Topic 22: 0.8706
Topic 23: 0.3421
Topic 24: 0.7085
Topic 25: 0.3439
Topic 26: 0.5631
Topic 27: 0.3601
Topic 28: 0.3036
Topic 29: 0.5065
Topic 30: 0.3955
Topic 31: 0.4048
Topic 32: 0.5611
Topic 33: 0.6094
Topic 34: 0.7270
Topic 35: 0.2779
Topic 36: 0.4091
Topic 37: 0.3298
Topic 38: 0.4045
Topic 39: 0.2965
Topic 40: 0.2907
Topic 41: 0.2684
Topic 42: 0.4786
Topic 43: 0.3358
Topic 44: 0.3479
Topic 45: 0.3990
Topic 46: 0.4045
Topic 47: 0.6843
Topic 48: 0.5755
Topic 49: 0.3267
Topic 50: 0.6768
Topic 51: 0.4415
Topic 52: 0.4582
Topic 53: 0.3104
Topic 54: 0.3015
Topic 55: 0.3683
Topic 56: 0.2832
Topic 57: 0.3676
Topic 58: 0

In [23]:
# Example usage:
df = pd.read_csv('/kaggle/input/combined-data-scores/combined_data_scores.csv')
yearmonth = '2022-05'
df_sample = df[df['yearmonth'] == yearmonth]

# Preprocess the texts
preprocessed_texts = preprocess_text(df_sample['text'])

# Initialize BERTopic model
representation_model = KeyBERTInspired()
vectorizer_model = CountVectorizer(min_df=10, stop_words="english")
topic_model = BERTopic(
    representation_model=representation_model,
    nr_topics="auto",
    vectorizer_model=vectorizer_model,
    min_topic_size=10
)

# Fit the model
topics, probabilities = topic_model.fit_transform(preprocessed_texts)

# Get topic info
print("\nTop 20 Topics:")
print(topic_model.get_topic_info().head(20))

  pid = os.fork()



Top 20 Topics:
    Topic  Count                                               Name  \
0      -1  44869   -1_singaporeans_singaporean_government_singapore   
1       0   1285  0_singaporeans_singaporean_singapore_singaporeraw   
2       1   1082                    1_driving_accidents_roads_drove   
3       2    854                                              2____   
4       3    676                     3_thank_thanks_welcome_amazing   
5       4    635                      4_covid_vaccines_pandemic_flu   
6       5    606                    5_equality_genders_gender_equal   
7       6    543                 6_crypto_gambling_gamble_investing   
8       7    430                   7_content_messaging_message_blog   
9       8    424                  8_parents_parent_parenting_family   
10      9    371                   9_chicken_frozen_freeze_freezing   
11     10    368            10_depression_mentally_mental_illnesses   
12     11    358                         11_consent_rape_laws

In [24]:
print("Calculating overall coherence score...")
coherence_score = calculate_topic_coherence(topic_model, preprocessed_texts, topics)

if coherence_score is not None:
    print(f"\nTopic Coherence Score (C_v): {coherence_score:.4f}")
else:
    print("\nCould not calculate overall coherence score")

# Calculate and print per-topic coherence scores
print("\nCalculating per-topic coherence scores...")
per_topic_scores = calculate_per_topic_coherence(topic_model, preprocessed_texts, topics)
print("\nPer-topic coherence scores:")
for topic, score in per_topic_scores.items():
    if score is not None:
        print(f"Topic {topic}: {score:.4f}")
    else:
        print(f"Topic {topic}: Could not calculate")

Calculating overall coherence score...


  self.pid = os.fork()
  self.pid = os.fork()



Topic Coherence Score (C_v): 0.3901

Calculating per-topic coherence scores...


Exception ignored in: <function _releaseLock at 0x7f7561ae0550>
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/logging/__init__.py", line 228, in _releaseLock
    def _releaseLock():
KeyboardInterrupt: 



Per-topic coherence scores:
Topic 0: 0.5769
Topic 1: 0.6708
Topic 3: 0.3249
Topic 4: 0.6073
Topic 5: 0.7821
Topic 6: 0.4785
Topic 7: 0.4510
Topic 8: 0.4250
Topic 9: 0.5488
Topic 10: 0.5001
Topic 11: 0.5086
Topic 12: 0.3564
Topic 13: 0.8193
Topic 14: 0.5896
Topic 15: 0.3230
Topic 16: 0.8911
Topic 17: 0.3441
Topic 18: 0.5800
Topic 19: 0.4380
Topic 20: 0.4151
Topic 21: 0.2726
Topic 22: 0.3690
Topic 23: 0.3916
Topic 24: 0.4173
Topic 25: 0.5947
Topic 26: 0.4717
Topic 27: 0.5592
Topic 28: 0.7513
Topic 29: 0.4695
Topic 30: 0.5867
Topic 31: 0.3973
Topic 32: 0.4046
Topic 33: 0.7872
Topic 34: 0.6396
Topic 35: 0.3281
Topic 36: 0.6572
Topic 37: 0.3398
Topic 38: 0.4067
Topic 39: 0.5303
Topic 40: 0.3964
Topic 41: 0.3653
Topic 42: 0.3214
Topic 43: 0.5773
Topic 44: 0.4333
Topic 45: 0.3515
Topic 46: 0.3013
Topic 47: 0.4712
Topic 48: 0.4547
Topic 49: 0.3378
Topic 50: 0.4841
Topic 51: 0.5235
Topic 52: 0.4391
Topic 53: 0.6465
Topic 54: 0.4815
Topic 55: 0.3963
Topic 56: 0.4083
Topic 57: 0.3488
Topic 58: 0

In [27]:
# Example usage:
df = pd.read_csv('/kaggle/input/combined-data-scores/combined_data_scores.csv')
yearmonth = '2022-06'
df_sample = df[df['yearmonth'] == yearmonth]

# Preprocess the texts
preprocessed_texts = preprocess_text(df_sample['text'])

# Initialize BERTopic model
representation_model = KeyBERTInspired()
vectorizer_model = CountVectorizer(min_df=10, stop_words="english")
topic_model = BERTopic(
    representation_model=representation_model,
    nr_topics="auto",
    vectorizer_model=vectorizer_model,
    min_topic_size=10
)

# Fit the model
topics, probabilities = topic_model.fit_transform(preprocessed_texts)

# Get topic info
print("\nTop 20 Topics:")
print(topic_model.get_topic_info().head(20))

  pid = os.fork()



Top 20 Topics:
    Topic  Count                                              Name  \
0      -1  36868      -1_singaporeans_singapore_government_support   
1       0   3174                       0_asians_asian_racial_races   
2       1   1498         1_interviews_interview_interviewed_hiring   
3       2   1028                      2_fat_diet_unhealthy_healthy   
4       3    821                        3_roads_traffic_buses_road   
5       4    708                                          4_yes___   
6       5    693                        5_covid_pandemic_flu_virus   
7       6    548                    6_fight_fighting_fights_combat   
8       7    533                   7_thank_thanks_welcome_grateful   
9       8    424               8_parents_parent_families_childhood   
10      9    411                    9_courts_accused_justice_court   
11     10    396           10_trolls_insult_insulting_conversation   
12     11    391                 11_drugs_drug_medication_medicine   
13  

In [28]:
print("Calculating overall coherence score...")
coherence_score = calculate_topic_coherence(topic_model, preprocessed_texts, topics)

if coherence_score is not None:
    print(f"\nTopic Coherence Score (C_v): {coherence_score:.4f}")
else:
    print("\nCould not calculate overall coherence score")

Calculating overall coherence score...


  self.pid = os.fork()
  self.pid = os.fork()



Topic Coherence Score (C_v): 0.3792
