### Python Code Overview

This script performs topic modeling on a dataset of text using the `BERTopic` library with several steps:

1. **Import Required Libraries**:
    - Imports various libraries for data handling (`pandas`, `pickle`), text embedding (`SentenceTransformer`), dimensionality reduction (`UMAP`), clustering (`HDBSCAN`), and topic modeling (`BERTopic`).

2. **Load Data**:
    - `load_data(file_path)`: Loads a tab-separated text file into a pandas DataFrame with columns `["text", "dataset-part", "ground truth"]`.

3. **Configure Topic Model**:
    - `configure_topic_model()`: Sets up a BERTopic model with specific configurations:
        - Uses `SentenceTransformer` for embeddings.
        - Reduces dimensionality with `UMAP`.
        - Clusters data with `HDBSCAN`.
        - Tokenizes text with `CountVectorizer`.
        - Extracts topics with `ClassTfidfTransformer`.
        - Adds optional aspect-based representations.

4. **Fit, Transform, and Save**:
    - `fit_transform_and_save_all(df, topic_model, model_filename, dataset_file_name, topic_info_file_name)`:
        - Fits the BERTopic model to the text data, saves the model and topic information.
        - Updates the DataFrame with topic labels and names.
        - Saves the transformed DataFrame to a CSV file.

5. **Save Outliers**:
    - `save_outliers(df, file_name)`: Identifies and saves rows labeled as outliers (i.e., with topic label `-1`) to a separate CSV file.

6. **Usage Example**:
    - The code at the end demonstrates how to load data, configure the topic model, fit and save the results, and finally, save any outliers.

### Summary
The script is designed for topic modeling on a text corpus, providing a complete pipeline from data loading to model fitting, topic extraction, and saving results, including handling of outliers.


In [8]:
# For TSV files
import pandas as pd
import pickle
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import KeyBERTInspired, OpenAI, PartOfSpeech, MaximalMarginalRelevance

def load_data(file_path):
    df = pd.read_csv(file_path, sep='\t', header=None)
    df.columns = ["text", "dataset-part", "ground truth"]
    return df.copy()

def configure_topic_model():
    embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
    umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine')
    hdbscan_model = HDBSCAN(min_cluster_size=10, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
    vectorizer_model = CountVectorizer(stop_words="english")
    ctfidf_model = ClassTfidfTransformer()
#     Add different aspects
    main_representation = KeyBERTInspired()
    aspect_model1 = PartOfSpeech("en_core_web_sm")
    representation_model = {
    "Main": main_representation,
    "Aspect1": aspect_model1,
    }
    return BERTopic(
      embedding_model=embedding_model,          # Step 1 - Extract embeddings
      umap_model=umap_model,                    # Step 2 - Reduce dimensionality
      hdbscan_model=hdbscan_model,              # Step 3 - Cluster reduced embeddings
      vectorizer_model=vectorizer_model,        # Step 4 - Tokenize topics
      ctfidf_model=ctfidf_model,                # Step 5 - Extract topic words
      representation_model=representation_model # Step 6 - (Optional) Fine-tune topic represenations
    )

def fit_transform_and_save_all(df, topic_model, model_filename, dataset_file_name, topic_info_file_name):
    # Fit the model with your text data
    try:
        topics, _ = topic_model.fit_transform(df['text'])
    except Exception as e:
        print("Error during model fitting:", str(e))
        return
    
    # Save the model after fitting
    with open(model_filename, "wb") as file:
        pickle.dump(topic_model, file)
    print("Model saved successfully after fitting at:", model_filename)
    
    # Save topic info
    try:
        topic_info = topic_model.get_topic_info()
        topic_info.to_csv(topic_info_file_name)  # Save topic info to CSV
        print("Topic info saved successfully at:", topic_info_file_name)
    except Exception as e:
        print("Error during saving topic info:", str(e))
        return

    # Ensure the transformation length matches the dataframe length
    if len(topics) != len(df):
        print(f"Length mismatch: Expected {len(df)}, got {len(topics)} topics.")
        return

    # Update dataset with topics
    topic_names = {row['Topic']: row['Name'] for index, row in topic_info.iterrows()}
    df['Topic Label'] = topics
    df['First_Step_Topic_Name'] = df['Topic Label'].apply(lambda topic_num: topic_names.get(topic_num, 'Unknown'))
    df['First_Step_Topic_Keywords'] = df['Topic Label'].apply(lambda topic_num: ', '.join(term for term, _ in topic_model.get_topic(topic_num)))
    df['First_Step_Topic_Representation'] = df['Topic Label'].apply(lambda topic_num: ', '.join(f"{term} ({score:.2f})" for term, score in topic_model.get_topic(topic_num)))
    df['First_Step_Representative_Docs'] = df['Topic Label'].apply(lambda topic_num: ', '.join(topic_model.get_representative_docs(topic_num)))

    # Save the updated DataFrame
    df.to_csv(dataset_file_name)
    print("Dataset with topics saved successfully at:", dataset_file_name)

def save_outliers(df, file_name):
    outliers_df = df[df['Topic Label'] == -1]
    outliers_df.to_csv(file_name, index=False)
    print("Outliers saved successfully at:", file_name)

# Usage
file_path = 'corpus.tsv'
filtered_df = load_data(file_path)
topic_model = configure_topic_model()
fit_transform_and_save_all(filtered_df, topic_model, "BERTopic_first_step.pkl", 'BERTopic_result.csv', 'BERTopic_topic_info.csv')
save_outliers(filtered_df, 'First_step_outliers.csv')


Model saved successfully after fitting at: BERTopic_first_step.pkl
Topic info saved successfully at: BERTopic_topic_info.csv
Dataset with topics saved successfully at: BERTopic_result.csv
Outliers saved successfully at: First_step_outliers.csv


In [1]:
# For TSV files
import pandas as pd
import pickle
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import KeyBERTInspired, OpenAI, PartOfSpeech, MaximalMarginalRelevance

def load_data(file_path, text_column_name='text', has_header=True, nrows=1000):
    # Load the first `nrows` rows of the CSV file with or without a header based on the `has_header` flag
    df = pd.read_csv(file_path, nrows=nrows)
    # Ensure the correct column is being used for text
    if text_column_name not in df.columns:
        raise ValueError(f"Specified text column '{text_column_name}' not found in the CSV.")
    
    df = df.rename(columns={text_column_name: 'text'})
    
    return df.copy()

def configure_topic_model():
    embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
    umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine')
    hdbscan_model = HDBSCAN(min_cluster_size=10, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
    vectorizer_model = CountVectorizer(stop_words="english")
    ctfidf_model = ClassTfidfTransformer()
    main_representation = KeyBERTInspired()
    aspect_model1 = PartOfSpeech("en_core_web_sm")
    representation_model = {
        "Main": main_representation,
        "Aspect1": aspect_model1,
    }
    return BERTopic(
        embedding_model=embedding_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        vectorizer_model=vectorizer_model,
        ctfidf_model=ctfidf_model,
        representation_model=representation_model
    )

def fit_transform_and_save_all(df, topic_model, model_filename, dataset_file_name, topic_info_file_name):
    try:
        topics, _ = topic_model.fit_transform(df['text'])
    except Exception as e:
        print("Error during model fitting:", str(e))
        return
    
    with open(model_filename, "wb") as file:
        pickle.dump(topic_model, file)
    print("Model saved successfully after fitting at:", model_filename)
    
    try:
        topic_info = topic_model.get_topic_info()
        topic_info.to_csv(topic_info_file_name, index=False)
        print("Topic info saved successfully at:", topic_info_file_name)
    except Exception as e:
        print("Error during saving topic info:", str(e))
        return

    if len(topics) != len(df):
        print(f"Length mismatch: Expected {len(df)}, got {len(topics)} topics.")
        return

    topic_names = {row['Topic']: row['Name'] for index, row in topic_info.iterrows()}
    df['Topic Label'] = topics
    df['First_Step_Topic_Name'] = df['Topic Label'].apply(lambda topic_num: topic_names.get(topic_num, 'Unknown'))
    df['First_Step_Topic_Keywords'] = df['Topic Label'].apply(lambda topic_num: ', '.join(term for term, _ in topic_model.get_topic(topic_num)))
    df['First_Step_Topic_Representation'] = df['Topic Label'].apply(lambda topic_num: ', '.join(f"{term} ({score:.2f})" for term, score in topic_model.get_topic(topic_num)))
    df['First_Step_Representative_Docs'] = df['Topic Label'].apply(lambda topic_num: ', '.join(topic_model.get_representative_docs(topic_num)))

    df.to_csv(dataset_file_name, index=False)
    print("Dataset with topics saved successfully at:", dataset_file_name)

def save_outliers(df, file_name):
    outliers_df = df[df['Topic Label'] == -1]
    outliers_df.to_csv(file_name, index=False)
    print("Outliers saved successfully at:", file_name)

# Usage
file_path = 'filtered_arxiv_metadata_2024.csv'
text_column_name = 'abstract'  # Adjust this to the actual name of your text column
has_header = True  # Set to False if the CSV file does not have a header
filtered_df = load_data(file_path, text_column_name, has_header)
topic_model = configure_topic_model()
fit_transform_and_save_all(filtered_df, topic_model, "BERTopic_first_step.pkl", 'BERTopic_result.csv', 'BERTopic_topic_info.csv')
save_outliers(filtered_df, 'First_step_outliers.csv')


Model saved successfully after fitting at: BERTopic_first_step.pkl
Topic info saved successfully at: BERTopic_topic_info.csv
Dataset with topics saved successfully at: BERTopic_result.csv
Outliers saved successfully at: First_step_outliers.csv


In [1]:
# For CSV files
import pandas as pd
import pickle
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import KeyBERTInspired, OpenAI, PartOfSpeech, MaximalMarginalRelevance
from tqdm import tqdm

def load_data(file_path, text_column_name='text', has_header=True):
    # Load the first `nrows` rows of the CSV file with or without a header based on the `has_header` flag
    df = pd.read_csv(file_path)
    
    # Ensure the correct column is being used for text
    if text_column_name not in df.columns:
        raise ValueError(f"Specified text column '{text_column_name}' not found in the CSV.")
    
    df = df.rename(columns={text_column_name: 'text'})
    
    return df.copy()

def configure_topic_model():
    embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
    umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine')
    hdbscan_model = HDBSCAN(min_cluster_size=10, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
    vectorizer_model = CountVectorizer(stop_words="english")
    ctfidf_model = ClassTfidfTransformer()
    main_representation = KeyBERTInspired()
    aspect_model1 = PartOfSpeech("en_core_web_sm")
    representation_model = {
        "Main": main_representation,
        "Aspect1": aspect_model1,
    }
    return BERTopic(
        embedding_model=embedding_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        vectorizer_model=vectorizer_model,
        ctfidf_model=ctfidf_model,
        representation_model=representation_model
    )

def fit_transform_and_save_all(df, topic_model, model_filename, dataset_file_name, topic_info_file_name):
    # Display progress bar during model fitting
    print("Fitting the model and transforming text data...")
    try:
        topics, _ = topic_model.fit_transform(tqdm(df['text'], desc="Processing text data"))
    except Exception as e:
        print("Error during model fitting:", str(e))
        return
    
    with open(model_filename, "wb") as file:
        pickle.dump(topic_model, file)
    print("Model saved successfully after fitting at:", model_filename)
    
    try:
        print("Saving topic information...")
        topic_info = topic_model.get_topic_info()
        topic_info.to_csv(topic_info_file_name, index=False)
        print("Topic info saved successfully at:", topic_info_file_name)
    except Exception as e:
        print("Error during saving topic info:", str(e))
        return

    if len(topics) != len(df):
        print(f"Length mismatch: Expected {len(df)}, got {len(topics)} topics.")
        return

    topic_names = {row['Topic']: row['Name'] for index, row in topic_info.iterrows()}
    
    # Adding progress bar for DataFrame update
    print("Updating DataFrame with topic labels and keywords...")
    for index in tqdm(range(len(df)), desc="Updating DataFrame"):
        topic_num = topics[index]
        df.at[index, 'Topic Label'] = topic_num
        df.at[index, 'First_Step_Topic_Name'] = topic_names.get(topic_num, 'Unknown')
        df.at[index, 'First_Step_Topic_Keywords'] = ', '.join(term for term, _ in topic_model.get_topic(topic_num))
        df.at[index, 'First_Step_Topic_Representation'] = ', '.join(f"{term} ({score:.2f})" for term, score in topic_model.get_topic(topic_num))
        df.at[index, 'First_Step_Representative_Docs'] = ', '.join(topic_model.get_representative_docs(topic_num))

    df.to_csv(dataset_file_name, index=False)
    print("Dataset with topics saved successfully at:", dataset_file_name)

def save_outliers(df, file_name):
    outliers_df = df[df['Topic Label'] == -1]
    outliers_df.to_csv(file_name, index=False)
    print("Outliers saved successfully at:", file_name)

# Usage
file_path = 'filtered_arxiv_metadata_2024.csv'
text_column_name = 'abstract'  # Adjust this to the actual name of your text column
has_header = True  # Set to False if the CSV file does not have a header
filtered_df = load_data(file_path, text_column_name, has_header)
topic_model = configure_topic_model()
fit_transform_and_save_all(filtered_df, topic_model, "First_Step_BERTopic_first_step.pkl", 'First_Step_BERTopic_result.csv', 'First_Step_BERTopic_topic_info.csv')
save_outliers(filtered_df, 'First_step_outliers.csv')


Fitting the model and transforming text data...


Processing text data: 100%|██████████| 54947/54947 [00:00<00:00, 2623385.57it/s]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after par

Model saved successfully after fitting at: First_Step_BERTopic_first_step.pkl
Saving topic information...
Topic info saved successfully at: First_Step_BERTopic_topic_info.csv
Updating DataFrame with topic labels and keywords...


Updating DataFrame: 100%|██████████| 54947/54947 [00:06<00:00, 8771.41it/s]


Dataset with topics saved successfully at: First_Step_BERTopic_result.csv
Outliers saved successfully at: First_step_outliers.csv


### K-estimation based on the first step

This algorithm is designed to determine the optimal number of topics for a second round of topic modeling, specifically focusing on documents labeled as outliers (`-1`) in the initial run.

#### Steps:

1. **Load the Dataset**:
   - Import a CSV file containing topic modeling results, where each row represents a document and the `Topic Label` column indicates the assigned topic.

2. **Calculate Total Rows**:
   - Count the total number of rows (documents) in the dataset.

3. **Identify Valid Topic Rows**:
   - Filter and count rows with valid topic labels (i.e., not labeled as `-1`).

4. **Identify Outlier Rows**:
   - Filter and count rows labeled as `-1`, representing outliers.

5. **Count Unique Valid Topics**:
   - Calculate the number of unique valid topics identified in the dataset, excluding `-1`.

6. **Calculate Topic Proportion for Outliers**:
   - Determine the proportion of outlier rows relative to valid topic rows.
   - Multiply this proportion by the number of unique valid topics to estimate the ideal number of topics for outliers.
   - Round the result to the nearest whole number.

7. **Output**:
   - Provide the total number of unique valid topics, the percentage of data labeled as outliers, and the recommended number of topics for the outliers in the second modeling run.

### Purpose
This algorithm ensures that the number of topics generated for outliers is proportional to the distribution observed in the valid topics, facilitating consistent and focused topic modeling on previously unclassified documents.


In [2]:
import pandas as pd

def calculate_proportional_topics_for_outliers(file_path, topic_column='Topic Label'):
    """
    Calculate the percentage of data assigned to valid topics and propose an ideal number of topics 
    for the rows labeled as `-1` (outliers).

    Args:
    - file_path (str): Path to the CSV file containing the topic model results.
    - topic_column (str): Name of the column in the CSV file that contains the topic assignments.

    Returns:
    - int: The ideal number of topics for the rows labeled as `-1`.
    """
    # Load dataset
    df = pd.read_csv(file_path)

    # Ensure the topic column exists
    if topic_column not in df.columns:
        raise ValueError(f"Column {topic_column} not found in the dataset.")

    # Calculate total number of rows
    total_rows = len(df)

    # Calculate the number of rows assigned a valid topic (excluding -1)
    valid_topic_rows = df[df[topic_column] != -1].shape[0]

    # Calculate the number of rows labeled as -1
    outlier_rows = df[df[topic_column] == -1].shape[0]

    # Calculate the number of unique topics (excluding -1)
    unique_valid_topics = df[df[topic_column] != -1][topic_column].nunique()

    # Calculate the proportion of the dataset labeled as -1
    percentage_outliers = (outlier_rows / total_rows) * 100

    # Estimate the ideal number of topics for the outliers
    ideal_topics_for_outliers = round((outlier_rows / valid_topic_rows) * unique_valid_topics)
    
    print(f"Total unique valid topics: {unique_valid_topics}")
    print(f"Percentage of data labeled as outliers (-1): {percentage_outliers:.2f}%")
    print(f"Ideal number of topics for the outliers (-1): {ideal_topics_for_outliers}")
    
    return ideal_topics_for_outliers

# Example usage:
file_path = 'First_Step_BERTopic_result.csv'
ideal_topics_for_outliers = calculate_proportional_topics_for_outliers(file_path)


Total unique valid topics: 523
Percentage of data labeled as outliers (-1): 37.41%
Ideal number of topics for the outliers (-1): 313


  df = pd.read_csv(file_path)
