In [None]:
# Install necessary libraries
# !pip install bertopic

In [2]:
import pandas as pd
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired

In [9]:
import os

# Disable HuggingFace tokenizers parallelism warning
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [3]:
df = pd.read_csv('../data/combined_data_scores.csv')
df = df[['text', 'yearmonth', 'title', 'index', 'average_toxicity_score']]

# BERTopic

## Function to get topics for each yearmonth

In [18]:
def process_topics_by_year(df, year, output_csv, start_month = 1):
    """
    Process topics for all yearmonths in the specified year using BERTopic, and output results into a csv file.
    
    Parameters:
    - df: The original DataFrame containing the data.
    - year: The year for which to process the data (e.g., 2022).
    - output_csv: The output CSV file to save the results.
    - start_month: The starting month for processing (default is 1).
    
    """
    # Filter the DataFrame for the specified year
    df_filtered_year = df[df['yearmonth'].str.startswith(str(year))]

    # Get unique year-month combinations for the specified year
    unique_yearmonths = df_filtered_year['yearmonth'].unique()

    for idx, yearmonth in enumerate(unique_yearmonths):
        # Skip months before the start_month
        if int(yearmonth[-2:]) < start_month:
            continue
        else:
            print(f"Processing {yearmonth}...")
            
            # Filter data for the current year-month
            df_filtered = df[df['yearmonth'] == yearmonth]

            # Initialise representation model
            representation_model = KeyBERTInspired()

            # Initialize BERTopic model
            topic_model = BERTopic(representation_model=representation_model, nr_topics="auto")

            # Fit the model on the text data
            topics, probabilities = topic_model.fit_transform(df_filtered['text'])

            # Save topics per document
            df_topics = pd.DataFrame({
                'index': df_filtered['index'],
                'Topic': topics
            })

            # Save topic information (such as topic words and frequencies)
            topic_info = topic_model.get_topic_info()

            # Merge topics with original data
            df_combined = pd.merge(df_filtered, df_topics, on='index', how='left')

            # Merge topic info with the combined data
            df_final = pd.merge(df_combined, topic_info, on='Topic', how='left')

            # Write the result to the output CSV file
            if idx == 0:
                df_final.to_csv(output_csv, index=False, mode='w')  # Write header for the first batch
            else:
                df_final.to_csv(output_csv, index=False, mode='a', header=False)  # Append mode without header

            print(f"Processing for {yearmonth} is complete.")
            # for testing
            # break  

## Call the batch processing function

In [None]:
process_topics_by_year(df, year=2023, output_csv='../data/topics_2023.csv')

## Combine topics for all years

# Evaluation of BERTopic on sample of data using Coherence Score
Using `yearmonth` = `2023-10`

In [4]:
df_sample = df[df['yearmonth'] == '2023-10']

Initialise BERTopic model

In [None]:
# Create representation model
representation_model = KeyBERTInspired()

# Use the representation model in BERTopic on top of the default pipeline
topic_model = BERTopic(representation_model=representation_model, nr_topics='auto')

# Fit the model on text data
topics, probabilities = topic_model.fit_transform(df_sample['text'])

In [6]:
# Get the top 20 most frequent topics
topic_model.get_topic_info().head(20)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,11183,-1_be_say_life_than,"[be, say, life, than, what, if, money, her, re...","[First up, I want to tell you that it makes me..."
1,0,2093,0_singaporeans_singapores_singaporean_singapore,"[singaporeans, singapores, singaporean, singap...",[While some people (OP included) prefer to liv...
2,1,701,1_gaza_palestinians_hamas_palestinian,"[gaza, palestinians, hamas, palestinian, pales...",[Suddenly everyone is talking about the crimes...
3,2,638,2_roads_lanes_driving_cyclist,"[roads, lanes, driving, cyclist, traffic, cycl...","[As a car driver, the only time I panic is whe..."
4,3,423,3_crimes_raped_rape_accused,"[crimes, raped, rape, accused, crime, murder, ...",[The point I'm making is that he compared two ...
5,4,318,4_rice_chicken_meals_food,"[rice, chicken, meals, food, meal, meat, foodp...","[if chicken expensive eat fish la, Chicken ric..."
6,5,295,5_banking_bank_banks_citibank,"[banking, bank, banks, citibank, apps, account...",[Have a million up votes my friend.\n\nI am 59...
7,6,270,6_siao_lang_sia_si,"[siao, lang, sia, si, siala, langs, siew, kian...","[Knn siao lang, Siao lang, siao lang everywher..."
8,7,255,7_voting_opposition_vote_votes,"[voting, opposition, vote, votes, voters, elec...","[Vote opposition, Vote opposition, Vote opposi..."
9,8,201,8_prices_affordable_btos_afford,"[prices, affordable, btos, afford, cost, bto, ...","[Latest BTO Prices from $364,000 to $509,000 3..."


In [7]:
topic_model.visualize_topics()

## Coherence Score

In [10]:
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel

# Preprocess documents
cleaned_docs = topic_model._preprocess_text(df_sample['text'])

# Extract vectorizer and tokenizer from BERTopic
vectorizer = topic_model.vectorizer_model
tokenizer = vectorizer.build_tokenizer()

# Extract features for Topic Coherence evaluation
# words = vectorizer.get_feature_names()
tokens = [tokenizer(doc) for doc in cleaned_docs]
dictionary = corpora.Dictionary(tokens)
corpus = [dictionary.doc2bow(token) for token in tokens]
topic_words = [[words for words, _ in topic_model.get_topic(topic)] 
               for topic in range(len(set(topics))-1)]

# Evaluate
coherence_model = CoherenceModel(topics=topic_words, 
                                 texts=tokens, 
                                 corpus=corpus,
                                 dictionary=dictionary, 
                                 coherence='c_v')
coherence_model.get_coherence()

0.3329467517487214