### Python Code Overview

This script performs a second round of topic modeling on outliers detected in a previous step using the `BERTopic` library with K-means clustering:

1. **Load Data**:
    - Loads a CSV file (`First_step_outliers.csv`) containing text data identified as outliers in the previous step.

2. **Extract and Reduce Embeddings**:
    - **Step 1**: Uses `SentenceTransformer` to encode text data into embeddings.
    - **Step 2**: Applies `UMAP` to reduce the dimensionality of these embeddings for clustering.

3. **K-means Clustering and Metric Calculation**:
    - **Step 3**: Runs K-means clustering on the reduced embeddings for a range of cluster numbers (from 2 to 20).
    - **Metrics**: Calculates inertia (sum of squared distances to the nearest cluster center) and silhouette scores (how similar an object is to its own cluster compared to other clusters).

4. **Plot Results**:
    - **Step 4**: Plots the Elbow Method graph to help determine the optimal number of clusters based on inertia.
    - **Step 5**: Plots the Silhouette Score graph to evaluate the quality of the clusters.

5. **Configure and Fit BERTopic Model**:
    - Configures `BERTopic` with various components including the embedding model, UMAP for dimensionality reduction, and K-means for clustering.
    - Fits the BERTopic model on the filtered text data to extract topics.

6. **Save and Update Results**:
    - **Topic Information**: Extracts topic information and saves it as a CSV file (`Second_step_clustering_results.csv`).
    - **Update DataFrame**: Adds each row's assigned topic label, topic name, keywords, scored keyword representation, and representative documents to the DataFrame, and saves the updated data as `Second_step_clustering_BERTopic_results.csv`.

7. **Save Model**:
    - Saves the fitted BERTopic model to a pickle file (`Second_step_BERTopic.pkl`) for future use. In the code this happens immediately after fitting, before the result CSV files are written.

### Summary
The script extends the topic modeling pipeline by applying K-means clustering to previously identified outliers, generating additional topic insights, and saving both the model and results for further analysis.


In [1]:
import pandas as pd
import pickle
from umap import UMAP
from sklearn.cluster import KMeans
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import KeyBERTInspired, PartOfSpeech, MaximalMarginalRelevance

def load_data(file_path):
    """Read the CSV at ``file_path`` and return an independent DataFrame copy."""
    return pd.read_csv(file_path).copy()

def configure_topic_model(n_clusters=313, random_state=None):
    """Configure the BERTopic model with K-means clustering.

    Args:
        n_clusters: Number of clusters requested from K-means. Defaults to
            313, the value that used to be hard-coded here.
        random_state: Optional seed forwarded to both UMAP and K-means. The
            default ``None`` keeps the previous unseeded behaviour; pass an
            integer to make repeated runs comparable.

    Returns:
        An unfitted ``BERTopic`` instance wired with the components below.
    """
    embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
    umap_model = UMAP(
        n_neighbors=15,
        n_components=5,
        min_dist=0.0,
        metric='cosine',
        random_state=random_state,
    )
    cluster_model = KMeans(n_clusters=n_clusters, random_state=random_state)
    vectorizer_model = CountVectorizer(stop_words="english")
    ctfidf_model = ClassTfidfTransformer()

    # Two representations per topic: a KeyBERT-style main one plus a
    # part-of-speech based aspect.
    # NOTE(review): PartOfSpeech presumably needs the "en_core_web_sm" spaCy
    # model installed locally - confirm in the target environment.
    representation_model = {
        "Main": KeyBERTInspired(),
        "Aspect1": PartOfSpeech("en_core_web_sm"),
    }

    return BERTopic(
        embedding_model=embedding_model,          # Step 1 - Extract embeddings
        umap_model=umap_model,                    # Step 2 - Reduce dimensionality
        # The clustering slot is named ``hdbscan_model`` but is given the
        # K-means model here.
        hdbscan_model=cluster_model,              # Step 3 - Cluster reduced embeddings using K-means
        vectorizer_model=vectorizer_model,        # Step 4 - Tokenize topics
        ctfidf_model=ctfidf_model,                # Step 5 - Extract topic words
        representation_model=representation_model # Step 6 - (Optional) Fine-tune topic representations
    )

def fit_transform_and_save_all(df, topic_model, model_filename, dataset_file_name, topic_info_file_name):
    """Fit the BERTopic model, save the model, topic information, and the dataset with topics.

    Args:
        df: DataFrame with a ``text`` column holding the documents. It is
            modified in place: ``Topic Label`` and four ``Second_Step_*``
            columns are added.
        topic_model: Unfitted topic model exposing ``fit_transform``,
            ``get_topic_info``, ``get_topic`` and ``get_representative_docs``.
        model_filename: Path of the pickle file the fitted model is written to.
        dataset_file_name: Path of the CSV that receives ``df`` with the
            topic columns.
        topic_info_file_name: Path of the CSV that receives the topic info
            table.

    Returns:
        None. A failure while fitting or while saving the topic info, or a
        length mismatch between topics and rows, is printed and the
        remaining steps are skipped.
    """
    try:
        # Pass the documents as a plain list of strings rather than a Series.
        topics, _ = topic_model.fit_transform(df['text'].tolist())
    except Exception as e:
        print("Error during model fitting:", str(e))
        return

    # Save the model after fitting.
    # NOTE: pickle files should only ever be loaded from trusted sources.
    with open(model_filename, "wb") as file:
        pickle.dump(topic_model, file)
    print("Model saved successfully after fitting at:", model_filename)

    # Save topic info
    try:
        topic_info = topic_model.get_topic_info()
        topic_info.to_csv(topic_info_file_name)
        print("Topic info saved successfully at:", topic_info_file_name)
    except Exception as e:
        print("Error during saving topic info:", str(e))
        return

    # Ensure the transformation length matches the dataframe length
    if len(topics) != len(df):
        print(f"Length mismatch: Expected {len(df)}, got {len(topics)} topics.")
        return

    topic_names = dict(zip(topic_info['Topic'].tolist(), topic_info['Name'].tolist()))

    # Build every per-topic string once per distinct topic instead of once per
    # row; the model is queried len(unique topics) times, not 3 * len(df).
    names = {}
    keywords = {}
    representations = {}
    representative_docs = {}
    for topic_num in sorted({int(topic) for topic in topics}):
        # Fall back to an empty list when the model has no terms or no
        # representative documents for a topic, so join() cannot fail.
        terms = topic_model.get_topic(topic_num) or []
        docs = topic_model.get_representative_docs(topic_num) or []
        names[topic_num] = topic_names.get(topic_num, 'Unknown')
        keywords[topic_num] = ', '.join(term for term, _ in terms)
        representations[topic_num] = ', '.join(f"{term} ({score:.2f})" for term, score in terms)
        representative_docs[topic_num] = ', '.join(docs)

    # Update dataset with topics
    df['Topic Label'] = topics
    labels = df['Topic Label']
    df['Second_Step_Topic_Name'] = labels.map(names)
    df['Second_Step_Topic_Keywords'] = labels.map(keywords)
    df['Second_Step_Topic_Representation'] = labels.map(representations)
    df['Second_Step_Representative_Docs'] = labels.map(representative_docs)

    # Save the updated DataFrame
    df.to_csv(dataset_file_name)
    print("Dataset with topics saved successfully at:", dataset_file_name)


# Usage: run the second-step topic modelling over the first-step outliers.
file_path = 'First_step_outliers.csv'
filtered_df = load_data(file_path)
topic_model = configure_topic_model()
fit_transform_and_save_all(
    df=filtered_df,
    topic_model=topic_model,
    model_filename="Second_step_BERTopic.pkl",
    dataset_file_name='Second_step_clustering_BERTopic_results.csv',
    topic_info_file_name='Second_step_clustering_results.csv',
)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Model saved successfully after fitting at: Second_step_BERTopic.pkl
Topic info saved successfully at: Second_step_clustering_results.csv
Dataset with topics saved successfully at: Second_step_clustering_BERTopic_results.csv
