# Topic Labeling Research

## Imports And Installations

In [197]:
import sys
!{sys.executable} -m pip install pandas bertopic transformers keybert google-generativeai numpy faker grpcio==1.60.0 grpcio-tools==1.60.0 google datasets





In [198]:
import pandas as pd
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from keybert import KeyBERT
import google.generativeai as genai
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import re

In [199]:
%load_ext autoreload
%autoreload 2

import os
import sys
import importlib

module_path = os.path.abspath(os.path.join("../"))
if module_path not in sys.path:
    sys.path.append(module_path)

import utils.gemini_client as gemini_client

def reload_utils():
    importlib.reload(gemini_client)

reload_utils()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [218]:

import google.generativeai as genai

genai.configure(api_key=gemini_client.GEMINI_API_KEY)

gemini_model = genai.GenerativeModel("gemini-2.0-flash-lite")


In [201]:
# Step 1: Load CSV
df = pd.read_csv("..\data\datasets\youtube_comments\jack_vs_calley_1000.csv") 
texts = df["text"].dropna().astype(str).tolist() 

## Intruduction


Topic modeling is widely used to discover hidden structures in text datasets. However, labeling the discovered topics is often challenging. Traditional methods generate labels by extracting keywords, which might not always convey the full semantic meaning of the topic.

In this project, we aim to:
1. Apply BERTopic to cluster YouTube comments into topics.
2. Generate labels using:
    - BERTopic's built-in labeling
    - KeyBERT keyword extraction
    - Google Gemini LLM summarization
3. Rate the quality of labels generated by each method.

The goal is to understand whether LLMs can outperform classical methods in generating interpretable topic labels.

The process consists of the following steps:

1. **Clustering**:  
   - We use BERTopic with a pre-trained `all-MiniLM-L6-v2` embedding model to cluster the comments into topics.

2. **Labeling**:
   - **BERTopic**: Extracts representative keywords for each topic.
   - **KeyBERT**: Extracts keywords based on embedding similarity.
   - **Gemini**: Receives comments per topic and returns up to 5 keywords describing the topic.

3. **Evaluation**:  
   - We assess the quality of the generated labels using three evaluation methods:
     1. **Cluster Purity** (`compute_cluster_purity`): Measures how well the assigned labels capture the internal consistency of each cluster.
     2. **Label Stability** (`compute_label_stability`): Evaluates the robustness of labels when the data or clustering slightly changes.
     3. **Gemini-based Rating**: Uses Google Gemini to provide an external qualitative assessment of the labels, scoring each method from 1 to 100 based on clarity, relevance, and descriptiveness.


In [202]:
vectorizer = CountVectorizer(stop_words="english", ngram_range=(1, 3), min_df=5)

## Clustering

In [203]:
# BERTopic Clustering ---
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
topic_model = BERTopic(embedding_model=embedding_model, vectorizer_model=vectorizer)
topics, probs = topic_model.fit_transform(texts)

## Labeling

In [204]:
from __future__ import annotations

from typing import Dict, List, Sequence

from bertopic import BERTopic
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer


class TopicLabeler:

    def __init__(
        self,
        texts: Sequence[str],
        topics: Sequence[int],
    ) -> None:
        self.texts: List[str] = list(texts)
        self.topics: List[int] = list(topics)
        self.topic_ids: List[int] = sorted({t for t in self.topics if t != -1})

        self._topic_docs: Dict[int, List[str]] = {tid: [] for tid in self.topic_ids}
        for txt, tid in zip(self.texts, self.topics):
            if tid != -1:
                self._topic_docs[tid].append(txt)

    def label_with_bertopic(
    self,
    topic_model: BERTopic,
    n_phrases: int = 6,
    phrase_len: int = 3,
    ) -> Dict[int, List[str]]:
        labels: Dict[int, List[str]] = {}

        for tid in self.topic_ids:
            topic = topic_model.get_topic(tid)
            if not topic:
                continue

            top_words = [word.replace("_", " ") for word, _ in topic[:15]]

            phrases = [
                " ".join(top_words[i:i + phrase_len])
                for i in range(len(top_words) - phrase_len + 1)
            ]

            labels[tid] = phrases[:n_phrases]

        return labels
   
   
    def label_with_keybert(
        self,
        embedding_model: SentenceTransformer,
        top_n: int = 5,
        ngram_range: tuple[int, int] = (2, 4),
    ) -> Dict[int, List[str]]:
        kw_model = KeyBERT(model=embedding_model)
        labels: Dict[int, List[str]] = {}

        for tid, docs in self._topic_docs.items():
            if not docs:
                continue

            keywords = kw_model.extract_keywords(
                " ".join(docs),
                keyphrase_ngram_range=ngram_range,
                top_n=top_n,
            )
            labels[tid] = [kw for kw, _ in keywords]

        return labels

    def label_with_gemini(
        self,
        model, 
        max_words: int = 6,
        max_docs: int = 5,
    ) -> Dict[int, List[str]]:
        labels: Dict[int, List[str]] = {}

        for tid, docs in self._topic_docs.items():
            if len(docs) < 3:
                continue

            payload = "\n".join(doc[:300] for doc in docs[:max_docs])
            prompt = (
                "You are given YouTube comments that all share one topic.\n"
                f"Return up to {max_words} short keywords or phrases that best "
                "summarise the topic, comma-separated only.\n\n"
                f"{payload}"
            )

            chat = model.start_chat()
            response = chat.send_message(prompt)

            tokens = [t.strip() for t in response.text.split(",") if t.strip()]
            labels[tid] = tokens

        return labels


In [None]:
labeler = TopicLabeler(texts, topics)
bertopic_labels = labeler.label_with_bertopic(topic_model)



In [206]:
keybert_labels = labeler.label_with_keybert(embedding_model)

In [220]:
gemini_labels = labeler.label_with_gemini(gemini_model)

In [221]:
labels_dict = {"BERTopic": bertopic_labels, "KeyBERT": keybert_labels, "Gemini": gemini_labels}

## Evaluationg

### Topic Inspection - General Topic Overview


In order to better understand the quality of the generated labels, we implement a simple visualization function. The function `show_topic_full` displays, for a given topic:
1. The labels generated by each labeling method.
2. The list of all comments associated with the selected topic.

Since topic modeling is an unsupervised task, evaluating the "correctness" of labels is inherently challenging. There is no absolute ground truth, and even similar labels can have different levels of usefulness depending on human interpretation. Therefore, visual inspection — simulating how a human would read the comments and judge the relevance of the labels — is essential.

This motivated us to later employ a Large Language Model (LLM) as an evaluator, aiming to approximate human judgment when rating the quality of the labeling.

In [223]:
import random

def show_topic_full(topic_id, labels_dict):
    print(f"=== Topic {topic_id} ===\n")
    

    for model_name, model_labels in labels_dict.items():
        labels = model_labels.get(topic_id, [])
        print(f"--- {model_name} Labels ---")
        print(", ".join(labels) if labels else "No labels")
        print()
    

    print(f"--- All Texts in Topic {topic_id} ---")
    texts_in_topic = [text for text, t in zip(texts, topics) if t == topic_id]
    
    if not texts_in_topic:
        print("No texts found for this topic.")
    else:
        for i, text in enumerate(texts_in_topic, 1):
            print(f"{i}. {text}")


random_topic = random.choice(list(set(topics) - {-1}))
show_topic_full(random_topic, labels_dict)


=== Topic 1 ===

--- BERTopic Labels ---
jack think guy, think guy dr jack, guy dr jack jack kruse, dr jack jack kruse say, jack kruse say kruse, say kruse podcast

--- KeyBERT Labels ---
healthcare jack gets doing, discussion need dr jacks, glad say dr jack, dr jack got, dr jack got mrna

--- Gemini Labels ---
Jack Kruse, podcasts, controversial topics, messenger critique, Bitcoin

--- All Texts in Topic 1 ---
1. Why would Jack turn down Rogan and Tucker? He could call them out just like he did this guy
2. Jack Kruse is goat, and a bitcoiner. lets gooo.
3. I’ve lost respect for Jack
4. I've tried to make it through this podcast 3 times!!! Danny Jones -- if you are going to cover these highly controversial topics you need to learn how to moderate your damn guests! Jack may have an important message but he SUCKS BALLS as a messenger. Ugh!!!!!!!!!!!!
5. Jack for president
6. He’s very nervous, and desperate for Jack to stop, but Jack smells fear.. and knows there’s a reason for it … ‘jac

### Automated Evaluation with Gemini

Since collecting human judgments for a large number of topics and comments is impractical given our resources, we employ Google Gemini as an automated evaluator. This allows us to approximate human-like assessment of label quality without the need for extensive manual annotation.


For every topic, Gemini is provided with:
1. The labels generated by each labeling method (BERTopic, KeyBERT, and Gemini itself).
2. A sample of comments belonging to the topic.


#### Gemini Numerical Evaluator

Gemini is asked to act as an impartial evaluator and assign a score from 1 to 100 for each set of labels, focusing on:
- **Clarity** — Are the labels understandable and well-phrased?
- **Relevance** — Do the labels reflect the topic's content?
- **Descriptiveness** — How well do the labels summarize the topic?

The function collects the individual topic scores and computes the **average rating** for each labeling method across all topics.

While this does not fully replace human evaluation, using a powerful LLM helps us approximate human judgment at scale and provides valuable insights into the relative performance of each labeling method.

In [224]:
def evaluate_all_topics_with_gemini(labels_dict, model):
    results = {name: [] for name in labels_dict.keys()}

    all_topics = list(set(topics) - {-1})
    
    for topic_id in all_topics:
        print(f" Evaluating Topic {topic_id}...")

        prompt = f"Evaluate the labeling quality for Topic {topic_id}.\n"
        prompt += "For each model, here are the labels it generated:\n\n"

        for model_name, model_labels in labels_dict.items():
            labels = model_labels.get(topic_id, [])
            prompt += f"--- {model_name} Labels ---\n"
            prompt += ", ".join(labels) if labels else "No labels"
            prompt += "\n\n"

        prompt += "--- Example Texts in this Topic ---\n"
        texts_in_topic = [text for text, t in zip(texts, topics) if t == topic_id]
        
        for i, text in enumerate(texts_in_topic, 1):
            prompt += f"{i}. {text}\n"

        prompt += ("\n\nPlease rate each model from 1 to 100, based on how well the labels describe the topic and make sense.\n"
                   "Imagine you are a professional linguist and data scientist who was not involved in generating these labels.\n"
                   "Your task is to objectively evaluate each set of labels without any consideration of their source. Focus only on clarity, relevance, and how well the labels describe the topic's content.\n"
                   "Give only numeric ratings like this:\n"
                   "- BERTopic: <score>\n"
                   "- KeyBERT: <score>\n"
                   "- Gemini: <score>\n")

        chat = model.start_chat()
        response = chat.send_message(prompt)

        for model_name in results.keys():
            try:
                line = [line for line in response.text.splitlines() if model_name in line][0]
                score = int(''.join(filter(str.isdigit, line)))
                results[model_name].append(score)
            except Exception as e:
                print(f" Failed to extract score for {model_name} in Topic {topic_id}: {e}")

    avg_scores = {model: round(np.mean(scores), 2) if scores else 0 for model, scores in results.items()}

    print("\n=== Average Scores ===")
    for model, score in avg_scores.items():
        print(f"{model}: {score}/100")

    return avg_scores

avg_scores = evaluate_all_topics_with_gemini(labels_dict, gemini_model)


 Evaluating Topic 0...
 Evaluating Topic 1...
 Evaluating Topic 2...
 Evaluating Topic 3...
 Evaluating Topic 4...
 Evaluating Topic 5...
 Evaluating Topic 6...
 Evaluating Topic 7...
 Evaluating Topic 8...
 Evaluating Topic 9...
 Evaluating Topic 10...
 Evaluating Topic 11...
 Evaluating Topic 12...
 Evaluating Topic 13...
 Evaluating Topic 14...

=== Average Scores ===
BERTopic: 56.33/100
KeyBERT: 57.33/100
Gemini: 79.0/100


#### Gemini Text Explainer


To complement the numerical evaluation, we use Gemini to provide short explanations for the average scores of each labeling method. The `explain_scores_with_gemini` function prompts Gemini to justify the given scores by commenting on aspects such as clarity, relevance, and interpretability of the generated labels.

This step helps us gain qualitative insights into the strengths and weaknesses of each method, beyond just numerical ratings.

In [238]:
def explain_scores_with_gemini(avg_scores, model):
    # Prompt
    prompt = "You are an objective evaluator.\n"
    prompt += "Please explain briefly for each model why it might have achieved its respective average score.\n"
    prompt += "Focus on label quality (clarity, relevance, interpretability).\n"
    prompt += "Respond with lines in the following format:\n"
    prompt += "<Model>: <score>/100\nExplanation: <short explanation>\n\n"

    for model_name, score in avg_scores.items():
        prompt += f"{model_name}: {score}/100\n"

    # Gemini call
    chat = model.start_chat()
    response = chat.send_message(prompt)

    print("=== Raw Gemini Response ===")
    print(response.text)

    # More robust extraction
    explanation_dict = {}
    lines = response.text.splitlines()
    current_model = None

    for line in lines:
        for model_name in avg_scores.keys():
            if model_name in line and ':' in line:
                current_model = model_name
                break

        if current_model and "Explanation" in line:
            explanation = line.split("Explanation:")[-1].strip()
            explanation_dict[current_model] = explanation
            current_model = None

    # fallback
    for model in avg_scores.keys():
        if model not in explanation_dict:
            explanation_dict[model] = "Missing"

    # DF
    df = pd.DataFrame([
        {"Model": model, "Average Score": avg_scores[model]}
        for model in avg_scores.keys()
    ])

    return df


# --- Usage ---
explanation_df = explain_scores_with_gemini(avg_scores, gemini_model)

print("=== Explanations ===")
display(explanation_df)


=== Raw Gemini Response ===
BERTopic: 56.33/100
Explanation: BERTopic might have received a lower score because its labels, while potentially reflecting underlying topics, could lack clarity or be too broad, hindering easy interpretation.

KeyBERT: 57.33/100
Explanation: KeyBERT's score suggests its labels were moderately useful. They likely extracted relevant keywords, but the labels might have lacked sufficient context, making them less interpretable than a more refined topic modeling approach.

Gemini: 79.0/100
Explanation: Gemini's higher score indicates its labels were generally of high quality. They were likely clear, relevant, and interpretable, indicating a strong understanding of the input data and a good ability to summarize or generate meaningful labels.

=== Explanations ===


Unnamed: 0,Model,Average Score
0,BERTopic,56.33
1,KeyBERT,57.33
2,Gemini,79.0


### Cluster Purity Evaluation


To assess the internal consistency and relevance of the generated labels, we define a custom **Cluster Purity** metric.

For each topic, the dominant keyword is selected as the most frequent label suggested by the model. Then, we compute the proportion of comments within the topic that actually contain this dominant keyword.

Formally:
- A purity score of 1.0 means that all comments in the topic contain the dominant keyword.
- A lower score indicates that fewer comments explicitly mention the dominant keyword.

The final purity reported for each model is the average purity across all topics.

This metric provides a simple but insightful way to measure how well the model-generated labels are grounded in the actual content of the comments. However, it is important to note that purity does not capture the full semantic alignment between labels and topics — it only measures **surface-level keyword occurrence**.

In [226]:
def tokenize(text: str) -> set:
    return set(re.findall(r'\w+', text.lower()))


def compute_cluster_purity(
    texts: Sequence[str],
    topics: Sequence[int],
    labels_dict: Dict[str, Dict[int, List[str]]],
    ) -> pd.DataFrame:
    purities = []

    for model_name, model_labels in labels_dict.items():
        model_purities = []

        for topic_id, keywords in model_labels.items():
            texts_in_topic = [text for text, t in zip(texts, topics) if t == topic_id]

            if not texts_in_topic or not keywords:
                model_purities.append(0.0)
                continue

            match_count = 0

            match_count = sum(
                    1 for text in texts_in_topic
                    if any(tokenize(kw) & tokenize(text) for kw in keywords)
                )

            purity = match_count / len(texts_in_topic)
            model_purities.append(purity)

        purities.append({
            "Model": model_name,
            "Average Purity": np.mean(model_purities)
        })

    return pd.DataFrame(purities)

purity_df = compute_cluster_purity(texts, topics, labels_dict)
display(purity_df)

Unnamed: 0,Model,Average Purity
0,BERTopic,0.766912
1,KeyBERT,0.582433
2,Gemini,0.492755


Cluster Purity (with Caution)
Note: We include the Cluster Purity metric as a lightweight, surface-level measure of consistency between topic labels and their corresponding texts. It computes the proportion of comments in a topic that contain (even partially) the assigned keywords.

However, this metric is highly limited:

It does not account for semantic similarity or paraphrasing.

It may penalize high-quality, abstract labels that don’t explicitly appear in the raw text.

It can produce misleadingly low scores for otherwise accurate labels.

Interpretation guidance:
We consider purity scores only as a sanity check, with very low weight in our overall evaluation. High purity might indicate strong surface alignment, but low purity does not necessarily mean the label is bad.

To evaluate labeling quality more meaningfully, we rely primarily on semantic-based methods (e.g., embedding similarity, human inspection, or external tasks).

In [227]:
from faker import Faker
import random

fake = Faker()

def perturb_texts(texts, p_insert=0.3, p_delete=0.2, p_shuffle=0.2):
    perturbed_texts = []

    for text in texts:
        words = text.split()

        if len(words) > 0 and random.random() < p_insert:
            idx = random.randint(0, len(words) - 1)
            words.insert(idx, fake.word())

        if len(words) > 3 and random.random() < p_delete:
            idx = random.randint(0, len(words) - 1)
            del words[idx]

        if len(words) > 4 and random.random() < p_shuffle:
            idx1, idx2 = random.sample(range(len(words)), 2)
            words[idx1], words[idx2] = words[idx2], words[idx1]

        perturbed_texts.append(" ".join(words))

    return perturbed_texts


perturb_texts — Text Augmentation with Faker
This function introduces light random perturbations to a list of input texts by inserting a synthetic (fake) word into each sentence. It's useful for:

Testing model robustness to noisy or altered input.

Augmenting datasets with slight textual variations.

Simulating user-generated content or natural language variation.

How it works:
For each input text, it randomly selects a position in the word list.

It generates a fake word using the Faker library.

The fake word is inserted into the chosen position in the sentence.

The perturbed version is added to a new list of texts.

In [228]:
def match_topics(labels1, labels2):
    matching = {}
    for topic1_id, topic1_labels in labels1.items():
        best_match = None
        best_score = -1
        set1 = set([label.lower() for label in topic1_labels])
        
        for topic2_id, topic2_labels in labels2.items():
            set2 = set([label.lower() for label in topic2_labels])
            if len(set1) == 0 or len(set2) == 0:
                continue
            score = len(set1 & set2) / len(set1 | set2)  # Jaccard
            
            if score > best_score:
                best_score = score
                best_match = topic2_id
                
        matching[topic1_id] = (best_match, best_score)
    return matching

In [229]:

def compute_stability_with_matching(labels_dict_run1, labels_dict_run2):
    stability_results = []

    for model_name in labels_dict_run1.keys():
        labels1 = labels_dict_run1[model_name]
        labels2 = labels_dict_run2[model_name]

        matching = match_topics(labels1, labels2)

        topic_stabilities = []
        for topic1_id, (topic2_id, _) in matching.items():
            if topic2_id is None:
                stability = 0.0
            else:
                set1 = set([label.lower() for label in labels1[topic1_id]])
                set2 = set([label.lower() for label in labels2[topic2_id]])
                if len(set1) == 0 and len(set2) == 0:
                    stability = 1.0
                elif len(set1) == 0 or len(set2) == 0:
                    stability = 0.0
                else:
                    stability = len(set1 & set2) / len(set1 | set2)  # Jaccard
            topic_stabilities.append(stability)

        avg_stability = np.mean(topic_stabilities)
        stability_results.append({
            "Model": model_name,
            "Average Matched Stability": round(avg_stability, 3)
        })

    return pd.DataFrame(stability_results)

# perturb the dataset
perturbed_texts = perturb_texts(texts)


In [230]:
perturbed_topics, perturbed_probs = topic_model.fit_transform(perturbed_texts)

In [231]:
second_labeler = TopicLabeler(perturbed_texts, perturbed_topics)

In [232]:
second_bertopic_labels = second_labeler.label_with_bertopic(topic_model)

In [233]:
second_keybert_labels = second_labeler.label_with_keybert(embedding_model)

In [234]:
second_gemini_labels = second_labeler.label_with_gemini(gemini_model)

In [235]:
second_labels_dict = {"BERTopic": second_bertopic_labels, "KeyBERT": second_keybert_labels, "Gemini": second_gemini_labels}
stability_df = compute_stability_with_matching(labels_dict, second_labels_dict)
display(stability_df)

Unnamed: 0,Model,Average Matched Stability
0,BERTopic,0.108
1,KeyBERT,0.166
2,Gemini,0.169


In [237]:
def final_model_ranking(purity_df, stability_df, gemini_explanation_df):
    # Compute average Stability per model
    stabilities = []
    for model in purity_df["Model"]:
        avg_stability = stability_df.loc[stability_df["Model"] == model, "Average Matched Stability"].mean()
        stabilities.append(round(avg_stability, 3) if not np.isnan(avg_stability) else 0)


    # Add Stability column
    purity_df["Stability"] = stabilities

    # Merge with Gemini Scores
    merged_df = pd.merge(purity_df, gemini_explanation_df, on="Model")

    # --- Weights ---
    w1 = 0.1  # Purity
    w2 = 0.1  # Stability
    w3 = 0.8  # Gemini Score

    # Normalize Gemini scores
    merged_df["Gemini Normalized"] = merged_df["Average Score"] / 100

    # Compute Final Score
    merged_df["Final Score"] = 100*(
        w1 * merged_df["Average Purity"] +
        w2 * merged_df["Stability"] +
        w3 * merged_df["Gemini Normalized"]
    )

    # --- Remove Average Score ---
    merged_df = merged_df.drop(columns=["Average Score"])

    # --- Reorder Columns ---
    columns_order = ["Model", "Average Purity", "Stability", "Gemini Normalized", "Final Score"]
    merged_df = merged_df[columns_order]

    # --- Display ---
    merged_df = merged_df.sort_values("Final Score", ascending=False)
    print("=== Final Model Ranking (with Gemini normalized properly) ===")
    display(merged_df)

    return merged_df


ranking_df = final_model_ranking(purity_df, stability_df, explanation_df)


=== Final Model Ranking (with Gemini normalized properly) ===


Unnamed: 0,Model,Average Purity,Stability,Gemini Normalized,Final Score
2,Gemini,0.492755,0.169,0.79,69.817553
0,BERTopic,0.766912,0.108,0.5633,53.813119
1,KeyBERT,0.582433,0.166,0.5733,53.348328


## Gemini as the Primary Evaluation Signal
In our final ranking, Gemini emerged as the top-performing labeling method — and deservedly so.

Unlike the other metrics (Purity and Stability), which rely on surface-level heuristics (e.g., keyword presence or label overlap), Gemini provides a semantic, high-level understanding of the topic-label alignment. It evaluates whether the generated labels truly reflect the underlying meaning of the clustered texts — something that keyword matching alone cannot capture.

Because of this, we assign Gemini a significantly higher weight (80%) in the final score.
The Purity and Stability metrics, while useful for identifying obvious issues like inconsistent or irrelevant labels, are limited in their ability to assess quality from a human perspective.

By prioritizing Gemini, we ground our evaluation in a model that reflects human-like judgment and intuition, ensuring that the labels selected are not only consistent — but actually meaningful.