### Python Code Overview

This script performs topic modeling on a dataset of text using the `BERTopic` library with several steps:

1. **Import Required Libraries**:
    - Imports libraries for data handling (`pandas`), model serialization (`pickle`), text embedding (`SentenceTransformer`), dimensionality reduction (`UMAP`), clustering (`HDBSCAN`), tokenization (`CountVectorizer`), and topic modeling (`BERTopic`, `ClassTfidfTransformer`).

2. **Load Data**:
    - `load_data(file_path)`: Loads a tab-separated text file into a pandas DataFrame with columns `["text", "dataset-part", "ground truth"]`.

3. **Configure Topic Model**:
    - `configure_topic_model()`: Sets up a BERTopic model with specific configurations:
        - Uses `SentenceTransformer` for embeddings.
        - Reduces dimensionality with `UMAP`.
        - Clusters data with `HDBSCAN`.
        - Tokenizes text with `CountVectorizer`.
        - Extracts topics with `ClassTfidfTransformer`.
        - Adds optional aspect-based representations (`KeyBERTInspired` as the main representation and `PartOfSpeech` as an additional aspect).

4. **Fit, Transform, and Save**:
    - `fit_transform_and_save_all(df, topic_model, model_filename, dataset_file_name, topic_info_file_name)`:
        - Fits the BERTopic model to the text data, then saves the fitted model and the topic information.
        - Updates the DataFrame with topic labels and names.
        - Saves the transformed DataFrame to a CSV file.

5. **Save Outliers**:
    - `save_outliers(df, file_name)`: Identifies and saves rows labeled as outliers (i.e., with topic label `-1`) to a separate CSV file.

6. **Usage Example**:
    - The code at the end demonstrates how to load data, configure the topic model, fit and save the results, and finally, save any outliers.

### Summary
The script is designed for topic modeling on a text corpus, providing a complete pipeline from data loading to model fitting, topic extraction, and saving results, including handling of outliers.


In [28]:
import pandas as pd
import pickle
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer

def load_data(file_path):
    """Read a headerless tab-separated file into a three-column DataFrame.

    Args:
        file_path: Location of the tab-separated file to read.

    Returns:
        A copy of the loaded DataFrame whose columns are named
        ``"text"``, ``"dataset-part"`` and ``"ground truth"``.
    """
    column_names = ["text", "dataset-part", "ground truth"]
    frame = pd.read_csv(file_path, header=None, sep='\t')
    # Assigning the labels fails if the file does not hold exactly three columns.
    frame.columns = column_names
    return frame.copy()

def configure_topic_model():
    """Build an unfitted BERTopic model with explicit sub-models for each step.

    The pipeline uses a ``SentenceTransformer`` ("all-MiniLM-L6-v2") for
    embeddings, ``UMAP`` for dimensionality reduction, ``HDBSCAN`` for
    clustering, a ``CountVectorizer`` with English stop words for
    tokenization, ``ClassTfidfTransformer`` for topic-word extraction, and
    two representation models: ``KeyBERTInspired`` under the key ``"Main"``
    and ``PartOfSpeech`` under the key ``"Aspect1"``.

    Returns:
        A configured ``BERTopic`` instance that has not been fitted yet.
    """
    # Imported locally because the file's import block does not provide these
    # names; without this the function fails with NameError on a fresh run.
    from bertopic.representation import KeyBERTInspired, PartOfSpeech

    embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
    umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine')
    hdbscan_model = HDBSCAN(min_cluster_size=10, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
    vectorizer_model = CountVectorizer(stop_words="english")
    ctfidf_model = ClassTfidfTransformer()

    # Add different aspects: each key yields its own topic representation.
    main_representation = KeyBERTInspired()
    # NOTE(review): presumably requires the spaCy "en_core_web_sm" model to be
    # installed - confirm in the target environment.
    aspect_model1 = PartOfSpeech("en_core_web_sm")
    representation_model = {
        "Main": main_representation,
        "Aspect1": aspect_model1,
    }

    return BERTopic(
        embedding_model=embedding_model,            # Step 1 - Extract embeddings
        umap_model=umap_model,                      # Step 2 - Reduce dimensionality
        hdbscan_model=hdbscan_model,                # Step 3 - Cluster reduced embeddings
        vectorizer_model=vectorizer_model,          # Step 4 - Tokenize topics
        ctfidf_model=ctfidf_model,                  # Step 5 - Extract topic words
        representation_model=representation_model,  # Step 6 - (Optional) Fine-tune topic representations
    )

def fit_transform_and_save_all(df, topic_model, model_filename, dataset_file_name, topic_info_file_name):
    """Fit the topic model on ``df['text']`` and persist every artefact.

    Steps, in order:
      1. Fit ``topic_model`` on the ``text`` column.
      2. Pickle the fitted model to ``model_filename``.
      3. Write ``topic_model.get_topic_info()`` to ``topic_info_file_name``.
      4. Add the columns ``'Topic Label'``, ``'First_Step_Topic Name'`` and
         ``'First_Step_Topic Representation'`` to ``df`` (in place).
      5. Write the updated ``df`` to ``dataset_file_name``.

    Fitting errors, topic-info errors and a topic/row length mismatch are
    reported with ``print`` and end the function early.

    Args:
        df: DataFrame with a ``text`` column; it is modified in place.
        topic_model: Object exposing ``fit_transform``, ``get_topic_info``
            and ``get_topic`` (a BERTopic model in this file).
        model_filename: Path of the pickle file for the fitted model.
        dataset_file_name: Path of the CSV file for the labelled dataset.
        topic_info_file_name: Path of the CSV file for the topic info table.

    Returns:
        None.
    """
    # Fit the model with your text data
    try:
        topics, _ = topic_model.fit_transform(df['text'])
    except Exception as e:
        print("Error during model fitting:", str(e))
        return

    # Save the model after fitting
    with open(model_filename, "wb") as file:
        pickle.dump(topic_model, file)
    print("Model saved successfully after fitting at:", model_filename)

    # Save topic info
    try:
        topic_info = topic_model.get_topic_info()
        topic_info.to_csv(topic_info_file_name)  # Save topic info to CSV
        print("Topic info saved successfully at:", topic_info_file_name)
    except Exception as e:
        print("Error during saving topic info:", str(e))
        return

    # Ensure the transformation length matches the dataframe length
    if len(topics) != len(df):
        print(f"Length mismatch: Expected {len(df)}, got {len(topics)} topics.")
        return

    # Topic id -> topic name, built column-wise instead of row by row.
    topic_names = dict(zip(topic_info['Topic'], topic_info['Name']))

    # Update dataset with topics
    df['Topic Label'] = topics
    df['First_Step_Topic Name'] = df['Topic Label'].map(
        lambda topic_num: topic_names.get(topic_num, 'Unknown')
    )

    # Query the model once per distinct topic rather than once per row.
    # `get_topic` may return a falsy value for an unknown topic id; treat that
    # as "no terms" instead of failing while iterating over it.
    topic_representations = {}
    for topic_num in df['Topic Label'].unique():
        terms = topic_model.get_topic(topic_num) or []
        topic_representations[topic_num] = ', '.join(term for term, _ in terms)
    df['First_Step_Topic Representation'] = df['Topic Label'].map(topic_representations)

    # Save the updated DataFrame
    df.to_csv(dataset_file_name)
    print("Dataset with topics saved successfully at:", dataset_file_name)


def save_outliers(df, file_name):
    """Write the rows labelled as outliers (topic ``-1``) to a CSV file.

    Args:
        df: DataFrame expected to carry a ``'Topic Label'`` column.
        file_name: Path of the CSV file to write (written without the index).

    Returns:
        None. If ``df`` has no ``'Topic Label'`` column (for example because
        topic fitting ended early), a message is printed and nothing is
        written.
    """
    if 'Topic Label' not in df.columns:
        # Report and return instead of raising KeyError, matching the
        # print-and-return error handling used elsewhere in this file.
        print("No 'Topic Label' column found; outliers were not saved.")
        return

    outliers_df = df[df['Topic Label'] == -1]
    outliers_df.to_csv(file_name, index=False)
    print("Outliers saved successfully at:", file_name)

# Usage: load the corpus, build the model, fit and persist everything,
# then export the rows that were labelled as outliers.
file_path = 'corpus.tsv'
filtered_df = load_data(file_path)
topic_model = configure_topic_model()
fit_transform_and_save_all(
    filtered_df,
    topic_model,
    model_filename="BERTopic_first_step.pkl",
    dataset_file_name='BERTopic_result.csv',
    topic_info_file_name='BERTopic_topic_info.csv',
)
save_outliers(filtered_df, file_name='First_step_outliers.csv')


Model saved successfully after fitting at: BERTopic_first_step.pkl
Topic info saved successfully at: BERTopic_topic_info.csv
Dataset with topics saved successfully at: BERTopic_result.csv
Outliers saved successfully at: First_step_outliers.csv
