# Automated Topic Modelling with BERTopic

## Overview

This Jupyter Notebook demonstrates the process of topic modeling using the BERTopic library. 

## Process

1. **Environment Setup**: 
    - Import the necessary libraries: BERTopic (plus its `KeyBERTInspired` representation model), NLTK stopwords, `os`, `re`, NumPy, and pandas.
    - Set an environment variable to disable parallelism in tokenizers, ensuring thread safety.<br>

2. **Dataset Loading**:
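    - Load the dataset from a CSV file into a pandas DataFrame; the documents to model are read from its `text` column.<br>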

3. **BERTopic Modeling**:
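    - BERTopic: run basic BERTopic for one or more iterations, re-modeling each run's outlier documents in the next run, with topics labelled by a KeyBERT-inspired representation model.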
    


        

In [3]:
import pandas as pd
import re
import os
from bertopic import BERTopic
from nltk.corpus import stopwords
from bertopic.representation import KeyBERTInspired 
import numpy as np

2. **Dataset Loading**


In [4]:
# Input CSV for this notebook; point csv_path at 'biden_df_12_01.csv' to model
# that dataset instead.
csv_path = 'covid_df_20_01.csv'
dataframe = pd.read_csv(csv_path)

3. **BERTopic Modeling**

      - Version 6: basic BERTopic run over multiple iterations with KeyBERT-inspired topic representations; this version fixes the topic labelling.

In [None]:
# Switch off parallelism in the tokenizers library for this process (the
# notebook overview gives thread safety as the reason).
os.environ.update({"TOKENIZERS_PARALLELISM": "false"})

def process_dataset(dataframe: pd.DataFrame, dataset_name: str, num_runs: int) -> None:
    """Fit BERTopic repeatedly, re-modelling each run's outliers in the next run.

    Run 1 fits a BERTopic model on ``dataframe['text']``. Every later run
    reloads the outlier documents (topic ``-1``) that the previous run wrote to
    disk and fits a fresh model on those. The loop stops early when a run
    produces no outliers or when the previous run's outlier file is missing.

    Each run writes into a ``BERTopic_run_<n>`` folder under the current
    working directory:

    * ``BERTopic_run_<n>_TopicLabels.csv`` - the input rows plus ``Topic`` and
      ``Topic Name`` columns.
    * ``BERTopic_run_<n>_TopicNames.npy`` - the ``Topic Name`` column.
    * ``BERTopic_run_<n>_Topics_Results.csv`` - ``get_topic_info()`` output.
    * ``BERTopic_run_<n>_Topic_<id>_<name>.csv`` - the rows of each
      non-outlier topic.
    * ``BERTopic_run_<n>_Outliers.csv`` - the outlier rows, if there are any.
    * ``model_dir`` - the fitted model, saved with safetensors serialization.

    Args:
        dataframe: Documents to model; must contain a ``text`` column. Note
            that in run 1 the ``Topic`` and ``Topic Name`` columns are added
            to this object in place.
        dataset_name: Currently unused; kept so existing callers keep working.
        num_runs: Maximum number of runs. Values below 1 do nothing.
    """
    # One representation model instance is shared by all runs.
    representation_model = KeyBERTInspired()

    # Embedding model name recorded alongside the saved BERTopic model.
    embedding_model = "sentence-transformers/all-MiniLM-L6-v2"

    for run_number in range(1, num_runs + 1):
        run_folder = f"BERTopic_run_{run_number}"
        os.makedirs(run_folder, exist_ok=True)

        if run_number > 1:
            # Later runs work only on what the previous run could not assign.
            previous_run = f"BERTopic_run_{run_number - 1}"
            outliers_filename = os.path.join(previous_run, f"{previous_run}_Outliers.csv")
            try:
                dataframe = pd.read_csv(outliers_filename)
            except FileNotFoundError:
                print(f"File {outliers_filename} not found. Ending the process.")
                break

        topic_model = BERTopic(
            language="english",
            calculate_probabilities=True,
            verbose=True,
            representation_model=representation_model,
        )

        # BERTopic documents its input as a list of strings, so hand it a plain
        # list rather than a Series; row order is unchanged, so the returned
        # topics still line up with the dataframe rows.
        documents = dataframe['text'].tolist()
        topics, _probabilities = topic_model.fit_transform(documents)

        # Label every row with its topic id and the generated topic name.
        topic_info = topic_model.get_topic_info()
        topic_names = dict(zip(topic_info['Topic'], topic_info['Name']))
        dataframe['Topic'] = topics
        dataframe['Topic Name'] = dataframe['Topic'].map(topic_names).fillna('Unknown')

        # Labelled documents.
        labeled_data_filename = os.path.join(run_folder, f"BERTopic_run_{run_number}_TopicLabels.csv")
        dataframe.to_csv(labeled_data_filename, index=False)

        # Topic names on their own, as a NumPy array.
        npy_filename = os.path.join(run_folder, f"BERTopic_run_{run_number}_TopicNames.npy")
        np.save(npy_filename, dataframe['Topic Name'].values)

        # Topic overview table.
        bertopic_results_filename = os.path.join(run_folder, f"BERTopic_run_{run_number}_Topics_Results.csv")
        topic_info.to_csv(bertopic_results_filename, index=False)

        # One CSV per real topic; the outlier topic (-1) is written separately.
        for topic, name in topic_names.items():
            if topic == -1:
                continue
            clean_name = name.replace(" ", "_").replace("/", "_")
            topic_filename = os.path.join(run_folder, f"BERTopic_run_{run_number}_Topic_{topic}_{clean_name}.csv")
            dataframe[dataframe['Topic'] == topic].to_csv(topic_filename, index=False)

        # Outliers become the input of the next run.
        outliers_dataframe = dataframe[dataframe['Topic'] == -1]
        has_outliers = not outliers_dataframe.empty
        if has_outliers:
            outliers_filename = os.path.join(run_folder, f"BERTopic_run_{run_number}_Outliers.csv")
            outliers_dataframe.to_csv(outliers_filename, index=False)

        # Save the model before deciding whether to stop, so that the final run
        # (the one that leaves no outliers) also has its model persisted.
        model_dir = os.path.join(run_folder, "model_dir")
        topic_model.save(model_dir, serialization="safetensors", save_ctfidf=True, save_embedding_model=embedding_model)

        print(f"Run {run_number}: Model and data saved successfully in {model_dir} using safetensors serialization.")

        if not has_outliers:
            print("No outliers found in this run. Ending the process.")
            break


In [6]:
# Usage: process_dataset(dataframe, <name for this dataset>, <number of runs>)
process_dataset(dataframe, dataset_name="BERT_Topics", num_runs=1)

2024-02-25 21:55:06,301 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/165 [00:00<?, ?it/s]

2024-02-25 21:55:10,349 - BERTopic - Embedding - Completed ✓
2024-02-25 21:55:10,350 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-02-25 21:55:38,024 - BERTopic - Dimensionality - Completed ✓
2024-02-25 21:55:38,028 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-02-25 21:55:41,371 - BERTopic - Cluster - Completed ✓
2024-02-25 21:55:41,378 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-02-25 21:55:43,287 - BERTopic - Representation - Completed ✓


Run 1: Model and data saved successfully in BERTopic_run_1/model_dir using safetensors serialization.
