In [2]:
import pandas as pd
from bertopic import BERTopic
import matplotlib.pyplot as plt
from collections import Counter

def run_bertopic_multiple_times(
    dataframe: pd.DataFrame,
    num_runs: int = 100,
    text_column: str = 'text',
    csv_path: str = 'topic_counts.csv',
    plot_path: str = 'topic_distribution.png',
) -> int:
    """Fit BERTopic repeatedly and return the most frequent topic count.

    A fresh, default-configured ``BERTopic()`` is fitted on the same documents
    ``num_runs`` times. No random seed is fixed here, so the number of topics
    found may differ from run to run; this function records that spread.

    Side effects:
        * writes one row per run (``run_number``, ``topic_count``) to ``csv_path``
        * writes a histogram of the topic counts to ``plot_path``

    Args:
        dataframe: Frame holding the documents to model.
        num_runs: How many independent BERTopic fits to perform (>= 1).
        text_column: Name of the column containing the document text.
        csv_path: Destination of the per-run topic counts.
        plot_path: Destination of the histogram image.

    Returns:
        The topic count (outlier topic ``-1`` excluded) that occurred in the
        most runs. On a tie, the count that was observed first wins, which is
        the ordering ``Counter.most_common`` uses.

    Raises:
        ValueError: If ``num_runs`` is smaller than 1, or if no non-missing
            documents remain in ``text_column``.
        KeyError: If ``text_column`` is not a column of ``dataframe``.
    """
    # Fail before any expensive work instead of with an IndexError at the end.
    if num_runs < 1:
        raise ValueError(f"num_runs must be at least 1, got {num_runs}")

    # Extract the text column only once. Blank CSV cells are read as NaN
    # (a float), which is not a valid document, so missing rows are dropped
    # and everything else is coerced to str.
    documents = dataframe[text_column].dropna().astype(str).tolist()
    if not documents:
        raise ValueError(f"column {text_column!r} contains no documents")

    topic_counts = []
    for _ in range(num_runs):
        # A new model per run so that no fitted state carries over.
        topic_model = BERTopic()
        topics, _probabilities = topic_model.fit_transform(documents)

        # Number of distinct topics, ignoring the outlier label -1.
        num_topics = len({topic for topic in topics if topic != -1})
        topic_counts.append(num_topics)

    # Save topic counts to a CSV file.
    df_topic_counts = pd.DataFrame(
        {'run_number': range(1, num_runs + 1), 'topic_count': topic_counts}
    )
    df_topic_counts.to_csv(csv_path, index=False)

    # Draw on a dedicated figure: the implicit pyplot "current figure" may be
    # one that an earlier notebook cell left open, and must not be drawn on
    # or closed by this function.
    fig, ax = plt.subplots()
    try:
        ax.hist(topic_counts, bins='auto', alpha=0.7, rwidth=0.85)
        ax.set_xlabel('Number of Topics')
        ax.set_ylabel('Frequency')
        ax.set_title('Distribution of Number of Topics Across Runs')
        fig.savefig(plot_path)
    finally:
        # Always release the figure, even if saving fails.
        plt.close(fig)

    # Most common number of topics across all runs.
    most_common_num_topics = Counter(topic_counts).most_common(1)[0][0]
    return most_common_num_topics


In [3]:
# Load the input CSV; run_bertopic_multiple_times reads its 'text' column.
dataframe = pd.read_csv(filepath_or_buffer='biden_df_12_01.csv')

In [None]:
# Run the repeated fits with the default number of runs; the returned value
# (most common topic count) is displayed as the cell output.
run_bertopic_multiple_times(dataframe=dataframe)

In [None]:
import pandas as pd
from bertopic import BERTopic
import matplotlib.pyplot as plt
from collections import Counter
from tqdm import tqdm  # Import tqdm

def run_bertopic_multiple_times(
    dataframe: pd.DataFrame,
    num_runs: int = 100,
    text_column: str = 'text',
    csv_path: str = 'topic_counts.csv',
    plot_path: str = 'topic_distribution.png',
) -> int:
    """Fit BERTopic repeatedly, with a progress bar, and return the modal topic count.

    A fresh, default-configured ``BERTopic()`` is fitted on the same documents
    ``num_runs`` times while a tqdm progress bar tracks the runs. No random
    seed is fixed here, so the number of topics found may differ from run to
    run; this function records that spread.

    Side effects:
        * writes one row per run (``run_number``, ``topic_count``) to ``csv_path``
        * writes a histogram of the topic counts to ``plot_path``

    Args:
        dataframe: Frame holding the documents to model.
        num_runs: How many independent BERTopic fits to perform (>= 1).
        text_column: Name of the column containing the document text.
        csv_path: Destination of the per-run topic counts.
        plot_path: Destination of the histogram image.

    Returns:
        The topic count (outlier topic ``-1`` excluded) that occurred in the
        most runs. On a tie, the count that was observed first wins, which is
        the ordering ``Counter.most_common`` uses.

    Raises:
        ValueError: If ``num_runs`` is smaller than 1, or if no non-missing
            documents remain in ``text_column``.
        KeyError: If ``text_column`` is not a column of ``dataframe``.
    """
    # Fail before any expensive work instead of with an IndexError at the end.
    if num_runs < 1:
        raise ValueError(f"num_runs must be at least 1, got {num_runs}")

    # Extract the text column only once. Blank CSV cells are read as NaN
    # (a float), which is not a valid document, so missing rows are dropped
    # and everything else is coerced to str.
    documents = dataframe[text_column].dropna().astype(str).tolist()
    if not documents:
        raise ValueError(f"column {text_column!r} contains no documents")

    topic_counts = []
    # tqdm wraps the run counter to show progress through the fits.
    for _ in tqdm(range(num_runs), desc="Running BERTopic"):
        # A new model per run so that no fitted state carries over.
        topic_model = BERTopic()
        topics, _probabilities = topic_model.fit_transform(documents)

        # Number of distinct topics, ignoring the outlier label -1.
        num_topics = len({topic for topic in topics if topic != -1})
        topic_counts.append(num_topics)

    # Save topic counts to a CSV file.
    df_topic_counts = pd.DataFrame(
        {'run_number': range(1, num_runs + 1), 'topic_count': topic_counts}
    )
    df_topic_counts.to_csv(csv_path, index=False)

    # Draw on a dedicated figure: the implicit pyplot "current figure" may be
    # one that an earlier notebook cell left open, and must not be drawn on
    # or closed by this function.
    fig, ax = plt.subplots()
    try:
        ax.hist(topic_counts, bins='auto', alpha=0.7, rwidth=0.85)
        ax.set_xlabel('Number of Topics')
        ax.set_ylabel('Frequency')
        ax.set_title('Distribution of Number of Topics Across Runs')
        fig.savefig(plot_path)
    finally:
        # Always release the figure, even if saving fails.
        plt.close(fig)

    # Most common number of topics across all runs.
    most_common_num_topics = Counter(topic_counts).most_common(1)[0][0]
    return most_common_num_topics
