In [None]:
# Project sub-directory used as the base for the data and output paths in this notebook.
here = '2_Application'

## Load Functions

### BERTopic

In [None]:
import os
import pickle
from tqdm import tqdm
tqdm.pandas()

import numpy as np
import pandas as pd

In [None]:
from bertopic import BERTopic

In [None]:
import openai, tiktoken
from bertopic.representation import OpenAI
from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.cluster import KMeans
from bertopic.dimensionality import BaseDimensionalityReduction
from sklearn.cluster import KMeans
import os

# OpenAI client handed to the BERTopic representation model; the API key is read from the
# OPENAI_API_KEY environment variable rather than being hard-coded.
client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))


def run_bertopic(label, output, embedding, docs, reduced_embedding = False, num_rep_docs = 10, **kwargs):
  """Fit BERTopic on precomputed embeddings and store the artefacts in ``output[label]``.

  Args:
    label: Key under which the results are stored in ``output``; also printed as a
      progress marker.
    output: Dict mutated in place. ``output[label]`` is replaced by a dict with the keys
      ``model``, ``topics``, ``probs``, ``topic_info``, ``fig``, ``hierarchy`` and
      ``rep_docs``.
    embedding: One precomputed embedding vector per document, exposed through
      ``.tolist()`` (a pandas Series of vectors in this notebook).
    docs: The documents, in the same order as ``embedding``. Any iterable of strings;
      it is materialised as a list before being handed to BERTopic.
    reduced_embedding: When True and ``umap_model`` is present in ``kwargs``, that model
      is fitted on the embeddings and its output is used for the document plot.
    num_rep_docs: Number of representative documents extracted per topic.
    **kwargs: Forwarded unchanged to the ``BERTopic`` constructor.

  Returns:
    None. All results are written into ``output[label]``.
  """
  # Announce the run before the (long) fit so the progress output is attributable.
  print(f"{label}...")

  # BERTopic is documented to take a list of strings. A pandas Series would be looked up
  # by index label inside the plotting helpers, so normalise to a plain positional list.
  docs = list(docs)

  # embedding_model=None: embeddings are supplied directly instead of being computed.
  topic_model = BERTopic(embedding_model=None, verbose=True, **kwargs)

  embeddings_np = np.array(embedding.tolist())
  topics, probs = topic_model.fit_transform(docs, embeddings_np)

  print("Finished Fit Transform")

  # Results are stored step by step so that whatever was computed survives a failure
  # in one of the later (plotting / representative-document) stages.
  output[label] = {}
  output[label]['model'] = topic_model
  output[label]['topics'] = topics
  output[label]['probs'] = probs
  output[label]['topic_info'] = topic_model.get_topic_info().copy()

  # Get Figures ------------------------------------------------------------

  print("Obtaining Figures")

  if reduced_embedding and 'umap_model' in kwargs:
    # NOTE(review): this re-fits the caller's UMAP model on the full embeddings; the
    # document plot is 2-D, so confirm the model's n_components is suitable for plotting.
    reduced = kwargs['umap_model'].fit_transform(embeddings_np)
    output[label]['fig'] = topic_model.visualize_documents(docs = docs, reduced_embeddings = reduced)
  else:
    output[label]['fig'] = topic_model.visualize_documents(docs = docs, embeddings = embeddings_np)

  # Get Hierarchy ------------------------------------------------------------

  output[label]['hierarchy'] = topic_model.visualize_hierarchy()

  # Get rep docs ------------------------------------------------------------

  print("Obtain Rep Docs")

  # Topics and docs combined in one frame, as required by the internal BERTopic helpers.
  doc_topic = pd.DataFrame({
    'Topic': topic_model.topics_,
    'ID': range(len(topic_model.topics_)),
    'Document': docs,
  })

  # The two calls below use private BERTopic methods (leading underscore); their
  # signatures are not part of the public API and may differ between versions.
  topic_model._create_topic_vectors(doc_topic, embeddings_np) # populate topic embeddings
  repr_docs, _, _, _ = topic_model._extract_representative_docs(
      topic_model.c_tf_idf_,
      doc_topic,
      topic_model.topic_representations_,
      nr_samples=1000,
      nr_repr_docs=num_rep_docs
  )
  topic_model.representative_docs_ = repr_docs

  # Build the topic table once, then spread each topic's list of representative
  # documents over columns rep1, rep2, ...
  topic_info = topic_model.get_topic_info()
  rep_cols = (
    topic_info["Representative_Docs"]
      .apply(pd.Series)
      .rename(columns=lambda i: f"rep{i+1}")
  )

  output[label]['rep_docs'] = pd.concat(
    [topic_info.drop(columns=["Representative_Docs"]), rep_cols],
    axis=1,
  )


## Run BERTopic



In [None]:
# Load the pre-embedded dataset from disk.
# NOTE(review): pickle.load can execute arbitrary code — only load files from a trusted source.
with open(os.path.join(here, 'data/us_dict_embedded.pkl'), 'rb') as f:
  us = pickle.load(f)

# presumably a dict of equal-length columns ('Processed', 'SBERT', 'BERT', 'QWEN' are used below) — TODO confirm
usa = pd.DataFrame(us)

### Summary Statistics

In [None]:
def count_words(text):
  """
  Return how many whitespace-separated tokens ``text`` contains.

  Args:
    text: Value to measure; expected to be a string.

  Returns:
    The token count, or 0 when ``text`` is not a string.
  """
  # Anything that is not a str (None, NaN, numbers, ...) contributes no words.
  return len(text.split()) if isinstance(text, str) else 0

# Add a per-row word count of the 'Processed' text as a new column (mutates usa in place).
usa['num_words'] = usa['Processed'].apply(count_words)

In [None]:
import re

def count_sentences(text):
  """
  Estimate how many sentences ``text`` contains.

  The text is cut at every run of '.', '!' or '?' and the fragments that
  contain something other than whitespace are counted.

  Args:
    text: Value to measure; expected to be a string.

  Returns:
    The number of non-blank fragments, or 0 when ``text`` is not a string.
  """
  if isinstance(text, str):
    # Consecutive terminators ("?!", "...") act as a single boundary; blank
    # fragments left over by the split are not counted.
    return sum(1 for fragment in re.split(r'[.!?]+', text) if fragment.strip())
  return 0

# Example usage:
# text = "This is the first sentence. This is the second! And this is the third?"
# sentence_count = count_sentences(text)
# print(sentence_count)

### Define Prompt

In [None]:
# Accumulates the results of each run_bertopic call, keyed by the run label.
output_usa = {}

In [None]:
# Prompt given to the OpenAI representation model to label each topic.
# [DOCUMENTS] is the placeholder tag the representation model fills with a topic's documents.
# Keep the number of responses stated in the prompt in sync with nr_docs on the representation model.
prompt = """
Here are 12 representative responses depicting the kind of life a person wants to live:
[DOCUMENTS]

→ Give me **only** a short, 2-5 words label that describes the type(s) of ideal life desired. Don't include the specific words "desire", "ideal", or "life" in your response. Be specific and keep it to 2-5 words.
"""

In [None]:
# BERTopic representation model that labels topics through the OpenAI client using `prompt`.
# nr_docs=12 matches the "12 representative responses" wording of the prompt.
# delay_in_seconds presumably spaces out consecutive API calls to avoid rate limits — confirm in the BERTopic docs.
representation_model = OpenAI(
    client,
    model="gpt-4.1-mini",
    prompt=prompt,
    nr_docs=12,
    delay_in_seconds=1
)

### SBERT HDBSCAN

In [None]:
# SBERT embeddings with only the representation model overridden; no umap_model or
# hdbscan_model is passed, so BERTopic falls back to its own defaults for both.
params = {
    'representation_model': representation_model
}

# Results are written to output_usa['SBERT_HDBSCAN_Default'].
run_bertopic(
    label='SBERT_HDBSCAN_Default',
    output=output_usa,
    embedding=usa['SBERT'],
    docs=usa['Processed'],
    num_rep_docs=25,
    **params
)

In [None]:
# UMAP with a very small neighbourhood (n_neighbors=3), 5 output dimensions, cosine
# distance and a fixed random_state so the reduction is repeatable.
umap_model = UMAP(
      n_neighbors=3,
      n_components=5,
      min_dist=0.0,
      metric="cosine",
      random_state=23
  )

params = {
    'representation_model': representation_model,
    'umap_model': umap_model
}


# SBERT embeddings with the custom UMAP model; results go to output_usa['SBERT_HDBSCAN_Sensitive'].
run_bertopic(
    label='SBERT_HDBSCAN_Sensitive',
    output=output_usa,
    embedding=usa['SBERT'],
    docs=usa['Processed'],
    num_rep_docs=25,
    **params
)

In [None]:
# Extract hierarchical topics and their representations
# (requires the 'SBERT_HDBSCAN_Sensitive' run to have been stored in output_usa).
docs = usa['Processed']

hierarchical_topics = output_usa['SBERT_HDBSCAN_Sensitive']['model'].hierarchical_topics(docs)

# Visualize these representations
# The extra-wide figure is stored under the 'h' key instead of being displayed by the cell.
output_usa['SBERT_HDBSCAN_Sensitive']['h'] = output_usa['SBERT_HDBSCAN_Sensitive']['model'].visualize_hierarchy(hierarchical_topics=hierarchical_topics, width=3000, height=1000)

In [None]:
# Extract hierarchical topics and their representations
# NOTE(review): this cell reads output_usa['BERT_HDBSCAN_Sensitive'], which is only created by
# the BERT "Sensitive" run further down the notebook. On a fresh Restart & Run All this cell
# raises KeyError — move it below that run.
docs = usa['Processed']

hierarchical_topics = output_usa['BERT_HDBSCAN_Sensitive']['model'].hierarchical_topics(docs)

# Visualize these representations
output_usa['BERT_HDBSCAN_Sensitive']['model'].visualize_hierarchy(hierarchical_topics=hierarchical_topics)

### BERT HDBSCAN

In [None]:
# BERT embeddings with only the representation model overridden; no umap_model or
# hdbscan_model is passed, so BERTopic falls back to its own defaults for both.
params = {
    'representation_model': representation_model
}

# Results are written to output_usa['BERT_HDBSCAN_Default'].
run_bertopic(
    label='BERT_HDBSCAN_Default',
    output=output_usa,
    embedding=usa['BERT'],
    docs=usa['Processed'],
    num_rep_docs=25,
    **params
)

In [None]:
# UMAP with a very small neighbourhood (n_neighbors=3), 5 output dimensions, cosine
# distance and a fixed random_state so the reduction is repeatable.
umap_model = UMAP(
      n_neighbors=3,
      n_components=5,
      min_dist=0.0,
      metric="cosine",
      random_state=23
  )

params = {
    'representation_model': representation_model,
    'umap_model': umap_model
}


# BERT embeddings with the custom UMAP model; results go to output_usa['BERT_HDBSCAN_Sensitive'].
run_bertopic(
    label='BERT_HDBSCAN_Sensitive',
    output=output_usa,
    embedding=usa['BERT'],
    docs=usa['Processed'],
    num_rep_docs=25,
    **params
)

### QWEN HDBSCAN

In [None]:
# NOTE(review): re-initialising output_usa here discards every run stored before this cell,
# so only runs added after this point are in the dict when it is pickled at the end of the
# notebook — confirm this is intended (e.g. to limit memory / file size).
output_usa = {}

In [None]:
# QWEN embeddings with only the representation model overridden; no umap_model or
# hdbscan_model is passed, so BERTopic falls back to its own defaults for both.
params = {
    'representation_model': representation_model
}

# Results are written to output_usa['QWEN_HDBSCAN_Default'].
run_bertopic(
    label='QWEN_HDBSCAN_Default',
    output=output_usa,
    embedding=usa['QWEN'],
    docs=usa['Processed'],
    num_rep_docs=25,
    **params
)

In [None]:
# UMAP with a very small neighbourhood (n_neighbors=3), 5 output dimensions, cosine
# distance and a fixed random_state so the reduction is repeatable.
umap_model = UMAP(
      n_neighbors=3,
      n_components=5,
      min_dist=0.0,
      metric="cosine",
      random_state=23
  )

params = {
    'representation_model': representation_model,
    'umap_model': umap_model
}


# QWEN embeddings with the custom UMAP model; results go to output_usa['QWEN_HDBSCAN_Sensitive2'].
# NOTE(review): the label ends in "Sensitive2", unlike the SBERT/BERT "Sensitive" labels — confirm intended.
run_bertopic(
    label='QWEN_HDBSCAN_Sensitive2',
    output=output_usa,
    embedding=usa['QWEN'],
    docs=usa['Processed'],
    num_rep_docs=25,
    **params
)

### QWEN KMEANS

In [None]:
# Number of KMeans clusters; defined once so the model and the run label cannot drift apart.
n_clusters = 20

# Placeholder passed as umap_model so that BERTopic clusters the embeddings as they are,
# without a dimensionality-reduction step.
empty_dimensionality_model = BaseDimensionalityReduction()

# Seeded for repeatable cluster assignments (same seed as the UMAP models in this notebook).
cluster_model = KMeans(n_clusters=n_clusters, random_state=23)

# BERTopic takes the clustering model through its hdbscan_model argument.
params = {
  'representation_model': representation_model,
  'hdbscan_model': cluster_model,
  'umap_model': empty_dimensionality_model,
}

# Results are written to output_usa[f'QWEN_KMEANS_{n_clusters}_clusters'].
run_bertopic(
    label=f'QWEN_KMEANS_{n_clusters}_clusters',
    output=output_usa,
    embedding=usa['QWEN'],
    docs=usa['Processed'],
    num_rep_docs=25,
    **params
)

In [None]:
# Extract hierarchical topics and their representations
# (requires the 'QWEN_KMEANS_20_clusters' run to have been stored in output_usa).
docs = usa['Processed']

hierarchical_topics = output_usa['QWEN_KMEANS_20_clusters']['model'].hierarchical_topics(docs)

# Visualize these representations
output_usa['QWEN_KMEANS_20_clusters']['model'].visualize_hierarchy(hierarchical_topics=hierarchical_topics)

### Output

In [None]:
# Persist the collected runs under <here>/output/.
# The path is built with os.path.join: plain concatenation of `here` and 'output/...'
# drops the separator and would write to '2_Applicationoutput/...'.
output_dir = os.path.join(here, 'output')
os.makedirs(output_dir, exist_ok=True)  # create the folder on a fresh checkout

with open(os.path.join(output_dir, 'output_usa_bertopic.pkl'), 'wb') as f:
  pickle.dump(output_usa, f)