# Topic Labeling Research

## Imports And Installations

In [None]:
# Install the notebook's dependencies with the same interpreter that runs this
# kernel (`sys.executable`).
# NOTE(review): grpcio / grpcio-tools are pinned to 1.60.0; the reason for the
# pin is not recorded here — confirm before changing it.
import sys
!{sys.executable} -m pip install pandas bertopic transformers keybert google-generativeai numpy faker grpcio==1.60.0 grpcio-tools==1.60.0

Collecting grpcio==1.60.0
  Downloading grpcio-1.60.0-cp312-cp312-win_amd64.whl.metadata (4.2 kB)
Collecting grpcio-tools==1.60.0
  Downloading grpcio_tools-1.60.0-cp312-cp312-win_amd64.whl.metadata (6.4 kB)
Collecting protobuf<5.0dev,>=4.21.6 (from grpcio-tools==1.60.0)
  Downloading protobuf-4.25.8-cp310-abi3-win_amd64.whl.metadata (541 bytes)
INFO: pip is looking at multiple versions of grpcio-status to determine which version is compatible with other requirements. This could take a while.
Collecting grpcio-status<2.0.dev0,>=1.33.2 (from google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.10.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,!=2.9.*,<3.0.0dev,>=1.34.1->google-ai-generativelanguage==0.6.15->google-generativeai)
  Downloading grpcio_status-1.74.0-py3-none-any.whl.metadata (1.1 kB)
  Downloading grpcio_status-1.73.1-py3-none-any.whl.metadata (1.1 kB)
  Downloading grpcio_status-1.73.0-py3-none-any.whl.metadata (1.1 kB)
  Downloading grpcio_status-1.72.2-py3-none-any.whl.me

In [None]:
import pandas as pd
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from keybert import KeyBERT
import google.generativeai as genai
import numpy as np

In [None]:
# Enable IPython's autoreload extension (mode 2) for this kernel.
%load_ext autoreload
%autoreload 2

import os
import sys
import importlib

# Add the parent of the current working directory to sys.path so that
# `utils.gemini_client` (imported below) can be resolved.
module_path = os.path.abspath(os.path.join("../"))
if module_path not in sys.path:
    sys.path.append(module_path)

import utils.gemini_client as gemini_client

def reload_utils() -> None:
    """Force a re-import of `utils.gemini_client` in the running kernel."""
    importlib.reload(gemini_client)

reload_utils()

In [None]:

import google.generativeai as genai

# Configure the SDK with the API key exposed by utils.gemini_client.
genai.configure(api_key=gemini_client.GEMINI_API_KEY)

# Shared model handle; the cells below pass it to both the labeling and the
# evaluation helpers.
gemini_model = genai.GenerativeModel("gemini-2.0-flash-lite")


In [None]:
# Step 1: Load CSV
# Forward slashes replace the backslash-separated literal, which contained the
# invalid escape sequences "\d", "\y" and "\j" (SyntaxWarning) and only worked
# on Windows. Forward slashes are accepted on Windows and POSIX alike.
df = pd.read_csv("../data/datasets/youtube_comments/jack_vs_calley_1000.csv")
# Keep only non-null comment bodies, coerced to plain strings.
texts = df["text"].dropna().astype(str).tolist()

  df = pd.read_csv("..\data\datasets\youtube_comments\jack_vs_calley_1000.csv")


## Introduction


Topic modeling is widely used to discover hidden structures in text datasets. However, labeling the discovered topics is often challenging. Traditional methods generate labels by extracting keywords, which might not always convey the full semantic meaning of the topic.

In this project, we aim to:
1. Apply BERTopic to cluster YouTube comments into topics.
2. Generate labels using:
    - BERTopic's built-in labeling
    - KeyBERT keyword extraction
    - Google Gemini LLM summarization
3. Rate the quality of labels generated by each method.

The goal is to understand whether LLMs can outperform classical methods in generating interpretable topic labels.

The process consists of the following steps:

1. **Clustering**:  
   - We use BERTopic with a pre-trained `all-MiniLM-L6-v2` embedding model to cluster the comments into topics.

2. **Labeling**:
   - **BERTopic**: Extracts representative keywords for each topic.
   - **KeyBERT**: Extracts keywords based on embedding similarity.
   - **Gemini**: Receives comments per topic and returns up to 5 keywords describing the topic.

3. **Evaluation**:  
   - We assess the quality of the generated labels using three evaluation methods:
     1. **Cluster Purity** (`compute_cluster_purity`): Measures how well the assigned labels capture the internal consistency of each cluster.
     2. **Label Stability** (`compute_stability_with_matching`): Evaluates the robustness of labels when the data or clustering changes slightly.
     3. **Gemini-based Rating**: Uses Google Gemini to provide an external qualitative assessment of the labels, scoring each method from 1 to 100 based on clarity, relevance, and descriptiveness.


## Clustering

In [None]:
# BERTopic Clustering ---
# Sentence-embedding model; the same instance is later handed to KeyBERT.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
topic_model = BERTopic(embedding_model=embedding_model)
# `topics` is consumed below as one topic id per entry of `texts`; the labeling
# and evaluation code skips topic id -1.
topics, probs = topic_model.fit_transform(texts)

## Labeling

In [None]:
class TopicLabeler:
    """Produce keyword labels for documents that were already clustered.

    Every ``label_with_*`` method returns a dict mapping topic id to a list of
    label strings. Topic id ``-1`` is always skipped.
    """

    # Topics with fewer documents than this are not sent to Gemini.
    MIN_DOCS_FOR_GEMINI = 3
    # At most this many documents are quoted in a Gemini prompt.
    MAX_DOCS_IN_PROMPT = 5
    # Each quoted document is truncated to this many characters.
    MAX_DOC_CHARS = 300

    def __init__(self, texts, topics):
        """Store the documents and their topic assignments.

        Args:
            texts: Sequence of document strings.
            topics: Sequence of topic ids, aligned index-by-index with
                ``texts``.
        """
        self.texts = texts
        self.topics = topics

    def _docs_by_topic(self):
        """Group the documents by topic id in one pass, keeping document order."""
        grouped = {}
        for text, topic in zip(self.texts, self.topics):
            grouped.setdefault(topic, []).append(text)
        return grouped

    @staticmethod
    def _parse_keywords(raw_text, max_words):
        """Turn a comma- or newline-separated reply into at most ``max_words`` labels.

        Blank pieces (trailing commas, empty replies) are dropped so that no
        empty-string label is ever produced.
        """
        pieces = (piece.strip() for piece in raw_text.replace("\n", ",").split(","))
        return [piece for piece in pieces if piece][:max_words]

    def label_with_bertopic(self, topic_model):
        """Return the words ``topic_model.get_topic`` reports for each topic.

        Args:
            topic_model: Object exposing ``get_topic(topic_id)`` that yields
                ``(word, score)`` pairs, or a falsy value for unknown topics.

        Returns:
            dict mapping topic id to a list of words (empty when the model
            reports nothing for that topic).
        """
        return {
            topic: [word for word, _ in topic_model.get_topic(topic) or []]
            for topic in set(self.topics) if topic != -1
        }

    def label_with_keybert(self, embedding_model, top_n=5):
        """Extract ``top_n`` KeyBERT keywords from each topic's concatenated documents.

        Args:
            embedding_model: Embedding model handed to ``KeyBERT(model=...)``.
            top_n: Number of keywords requested per topic.

        Returns:
            dict mapping topic id to a list of keyword strings.
        """
        kw_model = KeyBERT(model=embedding_model)
        docs_by_topic = self._docs_by_topic()
        labels = {}
        for topic in set(self.topics):
            if topic == -1:
                continue
            joined_docs = " ".join(docs_by_topic.get(topic, []))
            keywords = kw_model.extract_keywords(joined_docs, top_n=top_n, stop_words='english')
            labels[topic] = [kw[0] for kw in keywords]
        return labels

    def label_with_gemini(self, model, max_words=5):
        """Ask a Gemini chat model for up to ``max_words`` keywords per topic.

        Topics with fewer than ``MIN_DOCS_FOR_GEMINI`` documents are skipped
        and get no entry in the result. Only the first ``MAX_DOCS_IN_PROMPT``
        documents, each cut to ``MAX_DOC_CHARS`` characters, are sent.

        Args:
            model: Object exposing ``start_chat()`` whose chat has
                ``send_message(prompt)`` returning an object with ``.text``.
            max_words: Upper bound on the number of labels requested from the
                model and kept from its reply.

        Returns:
            dict mapping topic id to a list of non-empty label strings.
        """
        docs_by_topic = self._docs_by_topic()
        labels = {}
        for topic in set(self.topics):
            if topic == -1:
                continue

            docs_in_topic = docs_by_topic.get(topic, [])

            # Skip small topics
            if len(docs_in_topic) < self.MIN_DOCS_FOR_GEMINI:
                continue

            # Limit both the number of comments and the length of each one.
            docs_in_topic = [
                text[:self.MAX_DOC_CHARS]
                for text in docs_in_topic[:self.MAX_DOCS_IN_PROMPT]
            ]

            # Prepare the prompt text
            docs_text = "\n".join(docs_in_topic)

            prompt = f"""
            You are given a group of YouTube comments that share a common topic.
            Provide up to {max_words} keywords or short phrases that best summarize the main topic of these comments.
            Comments:
            {docs_text}
            Return the keywords separated by commas only.
            """

            chat = model.start_chat()
            response = chat.send_message(prompt)

            # The model is only asked for a format; enforce it on the reply.
            labels[topic] = self._parse_keywords(response.text, max_words)

        return labels

In [None]:
# Label every topic with all three methods; each result maps topic id -> labels.
labeler = TopicLabeler(texts, topics)
bertopic_labels = labeler.label_with_bertopic(topic_model)
keybert_labels = labeler.label_with_keybert(embedding_model)
# Sends one chat request per topic that has at least 3 comments.
gemini_labels = labeler.label_with_gemini(gemini_model)


KeyboardInterrupt: 

In [None]:
# Per-method label dicts keyed by display name; the evaluation cells below
# iterate over these keys as the model names.
labels_dict = {"BERTopic": bertopic_labels, "KeyBERT": keybert_labels, "Gemini": gemini_labels}

## Evaluation

### Topic Inspection - General Topic Overview


In order to better understand the quality of the generated labels, we implement a simple visualization function. The function `show_topic_full` displays, for a given topic:
1. The labels generated by each labeling method.
2. The list of all comments associated with the selected topic.

Since topic modeling is an unsupervised task, evaluating the "correctness" of labels is inherently challenging. There is no absolute ground truth, and even similar labels can have different levels of usefulness depending on human interpretation. Therefore, visual inspection — simulating how a human would read the comments and judge the relevance of the labels — is essential.

This motivated us to later employ a Large Language Model (LLM) as an evaluator, aiming to approximate human judgment when rating the quality of the labeling.

In [None]:
import random

def show_topic_full(topic_id, labels_dict):
    """Print each model's labels for one topic, then every text assigned to it.

    Reads the notebook-level ``texts`` and ``topics`` sequences.

    Args:
        topic_id: Topic id to inspect.
        labels_dict: Mapping of model name -> {topic id -> list of labels}.
    """
    print(f"=== Topic {topic_id} ===\n")

    # One labelled section per labeling method.
    for model_name in labels_dict:
        topic_labels = labels_dict[model_name].get(topic_id, [])
        print(f"--- {model_name} Labels ---")
        if topic_labels:
            print(", ".join(topic_labels))
        else:
            print("No labels")
        print()

    print(f"--- All Texts in Topic {topic_id} ---")
    members = [doc for doc, assigned in zip(texts, topics) if assigned == topic_id]

    if members:
        for position, doc in enumerate(members, start=1):
            print(f"{position}. {doc}")
        return

    print("No texts found for this topic.")


# Inspect one randomly chosen topic; topic id -1 is excluded from the draw.
random_topic = random.choice(list(set(topics) - {-1}))
show_topic_full(random_topic, labels_dict)


=== Topic 19 ===

--- BERTopic Labels ---
cali, we, the, how, like, international, sounds, can, of, forget

--- KeyBERT Labels ---
cali, reform, policy, debate, truce

--- Gemini Labels ---
Cali Means, censorship, policy, government, debate

--- All Texts in Topic 19 ---
1. wow kinda losing lots of respect for Cali Meanes....... 100%
2. Another WOW !!!Between the UTUBE censorship and the ridiculous level of hostility of this ‚ÄúDr.‚Äù Cali Means has the patience of Job !!! These two have no idea how policy is made ! You can‚Äôt be so controversial that no one will listen to what your saying! The point is to get to the place where you ACTUALLY have the power to change things ! The deep state is in place till Monday Jan 20 @ 12:00 !
3. It sounds like Cali wants a truce that's what he means by a science reset like just look past all the crimes against humanity and forget about it all. NO WE WONT FORGET! I am starting to wonder about how much this administration will actually get done that

### Automated Evaluation with Gemini

Since collecting human judgments for a large number of topics and comments is impractical given our resources, we employ Google Gemini as an automated evaluator. This allows us to approximate human-like assessment of label quality without the need for extensive manual annotation.


For every topic, Gemini is provided with:
1. The labels generated by each labeling method (BERTopic, KeyBERT, and Gemini itself).
2. A sample of comments belonging to the topic.


#### Gemini Numerical Evaluator

Gemini is asked to act as an impartial evaluator and assign a score from 1 to 100 for each set of labels, focusing on:
- **Clarity** — Are the labels understandable and well-phrased?
- **Relevance** — Do the labels reflect the topic's content?
- **Descriptiveness** — How well do the labels summarize the topic?

The function collects the individual topic scores and computes the **average rating** for each labeling method across all topics.

While this does not fully replace human evaluation, using a powerful LLM helps us approximate human judgment at scale and provides valuable insights into the relative performance of each labeling method.

In [None]:
import re


def _build_rating_prompt(topic_id, labels_dict, topic_texts):
    """Assemble the rating prompt for one topic: labels per model, texts, instructions."""
    parts = [
        f"Evaluate the labeling quality for Topic {topic_id}.\n",
        "For each model, here are the labels it generated:\n\n",
    ]

    for model_name, model_labels in labels_dict.items():
        labels = model_labels.get(topic_id, [])
        parts.append(f"--- {model_name} Labels ---\n")
        parts.append(", ".join(labels) if labels else "No labels")
        parts.append("\n\n")

    parts.append("--- Example Texts in this Topic ---\n")
    parts.extend(f"{i}. {text}\n" for i, text in enumerate(topic_texts, 1))

    parts.append(
        "\n\nPlease rate each model from 1 to 100, based on how well the labels describe the topic and make sense.\n"
        "Imagine you are a professional linguist and data scientist who was not involved in generating these labels.\n"
        "Your task is to objectively evaluate each set of labels without any consideration of their source. Focus only on clarity, relevance, and how well the labels describe the topic's content.\n"
        "Give only numeric ratings like this:\n"
    )
    # The answer template follows the models actually being rated instead of a
    # hard-coded list, so it stays correct when labels_dict changes.
    parts.extend(f"- {model_name}: <score>\n" for model_name in labels_dict)
    return "".join(parts)


def _extract_score(response_text, model_name):
    """Return the first integer rating on the line that starts with ``model_name``.

    Only lines that begin with the model name (after optional bullet or
    markdown characters) are considered, and only the first run of digits after
    the name is read. This keeps "38/100" from being parsed as 38100 and keeps
    a passing mention of the model on another line from being mistaken for its
    rating. Returns ``None`` when no rating in the range 0-100 is found.
    """
    pattern = re.compile(
        rf"^(?:[^\w\n]|_)*{re.escape(model_name)}(?!\w)[^\d\n]*(\d+)",
        re.MULTILINE,
    )
    match = pattern.search(response_text)
    if match is None:
        return None
    score = int(match.group(1))
    return score if 0 <= score <= 100 else None


def evaluate_all_topics_with_gemini(labels_dict, model, texts=None, topics=None):
    """Have a Gemini chat model rate every labeling method on every topic.

    Args:
        labels_dict: Mapping of model name -> {topic id -> list of labels}.
        model: Object exposing ``start_chat()`` whose chat has
            ``send_message(prompt)`` returning an object with ``.text``.
        texts: Documents to quote as examples. Defaults to the notebook-level
            ``texts`` variable, which is what this function always used.
        topics: Topic id per document, aligned with ``texts``. Defaults to the
            notebook-level ``topics`` variable.

    Returns:
        dict mapping model name to its mean rating rounded to 2 decimals, or 0
        when no rating could be extracted for that model.
    """
    # Backward-compatible fallback to the notebook globals.
    if texts is None:
        texts = globals()["texts"]
    if topics is None:
        topics = globals()["topics"]

    results = {name: [] for name in labels_dict}

    # Group once instead of rescanning every text for every topic.
    texts_by_topic = {}
    for text, topic in zip(texts, topics):
        texts_by_topic.setdefault(topic, []).append(text)

    for topic_id in set(topics) - {-1}:
        print(f" Evaluating Topic {topic_id}...")

        prompt = _build_rating_prompt(topic_id, labels_dict, texts_by_topic.get(topic_id, []))

        chat = model.start_chat()
        response_text = chat.send_message(prompt).text

        for model_name, scores in results.items():
            score = _extract_score(response_text, model_name)
            if score is None:
                print(
                    f" Failed to extract score for {model_name} in Topic {topic_id}: "
                    "no rating between 0 and 100 found"
                )
            else:
                scores.append(score)

    avg_scores = {name: round(np.mean(scores), 2) if scores else 0 for name, scores in results.items()}

    print("\n=== Average Scores ===")
    for name, score in avg_scores.items():
        print(f"{name}: {score}/100")

    return avg_scores

# Rate every topic with Gemini and keep the per-method average scores.
avg_scores = evaluate_all_topics_with_gemini(labels_dict, gemini_model)


 Evaluating Topic 0...
 Evaluating Topic 1...
 Evaluating Topic 2...
 Evaluating Topic 3...
 Evaluating Topic 4...
 Evaluating Topic 5...
 Evaluating Topic 6...
 Evaluating Topic 7...
 Evaluating Topic 8...
 Evaluating Topic 9...
 Evaluating Topic 10...
 Evaluating Topic 11...
 Evaluating Topic 12...
 Evaluating Topic 13...
 Evaluating Topic 14...
 Evaluating Topic 15...
 Evaluating Topic 16...
 Evaluating Topic 17...
 Evaluating Topic 18...
 Evaluating Topic 19...

=== Average Scores ===
BERTopic: 43.5/100
KeyBERT: 383.3/100
Gemini: 78.25/100


#### Gemini Text Explainer


To complement the numerical evaluation, we use Gemini to provide short explanations for the average scores of each labeling method. The `explain_scores_with_gemini` function prompts Gemini to justify the given scores by commenting on aspects such as clarity, relevance, and interpretability of the generated labels.

This step helps us gain qualitative insights into the strengths and weaknesses of each method, beyond just numerical ratings.

In [None]:
def explain_scores_with_gemini(avg_scores, model):
    """Ask a Gemini chat model to justify each method's average score.

    The raw reply is printed, then parsed into one explanation per model.

    Args:
        avg_scores: Mapping of model name -> average score (out of 100).
        model: Object exposing ``start_chat()`` whose chat has
            ``send_message(prompt)`` returning an object with ``.text``.

    Returns:
        DataFrame with one row per model and the columns ``Model``,
        ``Average Score`` and ``Explanation`` ("Missing" when the reply held no
        explanation for that model).
    """
    # Prompt
    prompt_parts = [
        "You are an objective evaluator.\n",
        "Please explain briefly for each model why it might have achieved its respective average score.\n",
        "Focus on label quality (clarity, relevance, interpretability).\n",
        "Respond with lines in the following format:\n",
        "<Model>: <score>/100\nExplanation: <short explanation>\n\n",
    ]
    prompt_parts.extend(f"{model_name}: {score}/100\n" for model_name, score in avg_scores.items())
    prompt = "".join(prompt_parts)

    # Gemini call
    chat = model.start_chat()
    response = chat.send_message(prompt)

    print("=== Raw Gemini Response ===")
    print(response.text)

    explanations = {}
    current_model = None

    for line in response.text.splitlines():
        # Ignore leading bullet / markdown decoration when classifying a line.
        cleaned = line.strip().lstrip("-*#_ ").strip()

        if cleaned.lower().startswith("explanation"):
            # Explanation lines are never treated as headers: they often name
            # other models, which previously re-assigned the explanation.
            if current_model is not None:
                body = cleaned.partition(":")[2]
                explanations[current_model] = body.strip().lstrip("*_ ").strip()
                current_model = None
            continue

        if ":" in cleaned:
            header_matches = [name for name in avg_scores if cleaned.startswith(name)]
            if header_matches:
                # Prefer the longest name so e.g. "BERT" cannot claim a
                # "BERTopic" header.
                current_model = max(header_matches, key=len)

    # The parsed explanations are returned alongside the scores; "Missing"
    # marks models the reply did not cover.
    return pd.DataFrame([
        {
            "Model": name,
            "Average Score": avg_scores[name],
            "Explanation": explanations.get(name, "Missing"),
        }
        for name in avg_scores
    ])


# --- Usage ---
explanation_df = explain_scores_with_gemini(avg_scores, gemini_model)

print("=== Explanations ===")
# `display` is not imported in this file; presumably the IPython/Jupyter builtin.
display(explanation_df)


=== Raw Gemini Response ===
BERTopic: 44.44/100
Explanation: BERTopic's lower score likely stems from less clear and sometimes less relevant topic labels. The hierarchical nature might have occasionally produced less interpretable or overly specific labels.

KeyBERT: 60.28/100
Explanation: KeyBERT's score is moderate, possibly due to the simplicity of its keyword-based approach. Keywords can be relevant but may lack the nuanced thematic understanding that yields superior label clarity and interpretability.

Gemini: 77.22/100
Explanation: Gemini's higher score suggests it generated more concise, relevant, and easily understandable labels. Its ability to understand context and relationships likely resulted in better label clarity and interpretability.

=== Explanations ===


Unnamed: 0,Model,Average Score
0,BERTopic,44.44
1,KeyBERT,60.28
2,Gemini,77.22


### Cluster Purity Evaluation


To assess the internal consistency and relevance of the generated labels, we define a custom **Cluster Purity** metric.

For each topic, the dominant keyword is selected as the most frequent label suggested by the model. Then, we compute the proportion of comments within the topic that actually contain this dominant keyword.

Formally:
- A purity score of 1.0 means that all comments in the topic contain the dominant keyword.
- A lower score indicates that fewer comments explicitly mention the dominant keyword.

The final purity reported for each model is the average purity across all topics.

This metric provides a simple but insightful way to measure how well the model-generated labels are grounded in the actual content of the comments. However, it is important to note that purity does not capture the full semantic alignment between labels and topics — it only measures **surface-level keyword occurrence**.

In [None]:
def _topic_purity(keywords, texts_in_topic):
    """Return the share of ``texts_in_topic`` containing the topic's dominant keyword.

    The dominant keyword is the most frequent non-blank entry of ``keywords``.
    Blank keywords are ignored because an empty string is a substring of every
    text and would yield a meaningless purity of 1.0.
    """
    usable_keywords = [keyword for keyword in keywords if keyword.strip()]
    if not usable_keywords or not texts_in_topic:
        return 0

    dominant_keyword = pd.Series(usable_keywords).value_counts().idxmax().lower()
    match_count = sum(1 for text in texts_in_topic if dominant_keyword in text.lower())
    return match_count / len(texts_in_topic)


def compute_cluster_purity(texts, topics, labels_dict):
    """Compute the average keyword purity of each labeling method.

    Args:
        texts: Sequence of document strings.
        topics: Topic id per document, aligned with ``texts``.
        labels_dict: Mapping of model name -> {topic id -> list of labels}.

    Returns:
        DataFrame with the columns ``Model`` and ``Average Purity``. A model
        without any labelled topic gets 0.0 instead of NaN.
    """
    # Group once instead of rescanning every text for every (model, topic) pair.
    texts_by_topic = {}
    for text, topic in zip(texts, topics):
        texts_by_topic.setdefault(topic, []).append(text)

    purities = []
    for model_name, model_labels in labels_dict.items():
        model_purities = [
            _topic_purity(keywords, texts_by_topic.get(topic_id, []))
            for topic_id, keywords in model_labels.items()
        ]

        # np.mean([]) is NaN and emits a RuntimeWarning; report 0.0 instead.
        avg_purity = np.mean(model_purities) if model_purities else 0.0
        purities.append({
            "Model": model_name,
            "Average Purity": avg_purity
        })

    return pd.DataFrame(purities)

# Average keyword purity per labeling method on the original clustering.
purity_df = compute_cluster_purity(texts, topics, labels_dict)
display(purity_df)

Unnamed: 0,Model,Average Purity
0,BERTopic,0.678019
1,KeyBERT,0.3369
2,Gemini,0.299563


In [None]:
from faker import Faker
# Shared Faker instance; perturb_texts() below draws its inserted words from it.
fake = Faker()

def perturb_texts(texts, word_source=None, rng=None):
    """Return a copy of ``texts`` with one extra word inserted into each text.

    Each text is split on whitespace, a new word is inserted before a randomly
    chosen existing word, and the words are re-joined with single spaces.
    Texts without any word are kept unchanged, so the result always has one
    entry per input text (they were previously dropped silently).

    Args:
        texts: Sequence of document strings.
        word_source: Zero-argument callable returning the word to insert.
            Defaults to the notebook-level ``fake.word``.
        rng: Object exposing ``randint(a, b)``, e.g. ``random.Random(seed)``
            for reproducible perturbations. Defaults to the ``random`` module.

    Returns:
        list of perturbed strings, aligned index-by-index with ``texts``.
    """
    if word_source is None:
        word_source = fake.word
    if rng is None:
        rng = random

    perturbed_texts = []

    for text in texts:
        words = text.split()
        if not words:
            # Nothing to perturb; keep the entry so indices stay aligned.
            perturbed_texts.append(text)
            continue

        # Choose the existing word before which the new word is inserted.
        idx = rng.randint(0, len(words) - 1)
        words.insert(idx, word_source())
        perturbed_texts.append(' '.join(words))

    return perturbed_texts

In [None]:
def match_topics(labels1, labels2):
    """Pair each topic of ``labels1`` with its most similar topic of ``labels2``.

    Similarity is the Jaccard index of the two lower-cased label sets. Pairs in
    which either label set is empty are not scored; on ties the candidate that
    comes first in ``labels2`` wins.

    Args:
        labels1: Mapping of topic id -> list of labels (first run).
        labels2: Mapping of topic id -> list of labels (second run).

    Returns:
        dict mapping each ``labels1`` topic id to ``(best_topic2_id, score)``,
        or ``(None, -1)`` when no candidate could be scored.
    """
    if not labels1:
        return {}

    # Lower-case every candidate label set once, keeping labels2's order.
    candidate_sets = {
        candidate_id: {label.lower() for label in candidate_labels}
        for candidate_id, candidate_labels in labels2.items()
    }

    matching = {}
    for source_id, source_labels in labels1.items():
        source_set = {label.lower() for label in source_labels}
        scored = [
            (candidate_id, len(source_set & candidate_set) / len(source_set | candidate_set))
            for candidate_id, candidate_set in candidate_sets.items()
            if source_set and candidate_set
        ]
        # max() returns the first maximal pair, i.e. the earliest candidate on ties.
        matching[source_id] = max(scored, key=lambda pair: pair[1]) if scored else (None, -1)
    return matching

In [None]:

def compute_stability_with_matching(labels_dict_run1, labels_dict_run2):
    """Measure how much each method's labels change between two runs.

    For every model, each topic of the first run is paired with its best match
    in the second run via ``match_topics``; the Jaccard score of that pair is
    the topic's stability (0.0 when nothing could be matched). The model's
    score is the mean over its first-run topics.

    Args:
        labels_dict_run1: Mapping of model name -> {topic id -> labels}, run 1.
        labels_dict_run2: Same structure for run 2; must contain every model
            name present in ``labels_dict_run1``.

    Returns:
        DataFrame with the columns ``Model`` and ``Average Matched Stability``
        (rounded to 3 decimals; 0.0 for a model with no first-run topics).
    """
    stability_results = []

    for model_name, labels1 in labels_dict_run1.items():
        labels2 = labels_dict_run2[model_name]

        matching = match_topics(labels1, labels2)

        # match_topics already returns the Jaccard score of the chosen pair and
        # only ever pairs non-empty label sets, so the score is reused directly.
        topic_stabilities = [
            0.0 if matched_id is None else score
            for matched_id, score in matching.values()
        ]

        # np.mean([]) is NaN and emits a RuntimeWarning; report 0.0 instead.
        avg_stability = np.mean(topic_stabilities) if topic_stabilities else 0.0
        stability_results.append({
            "Model": model_name,
            "Average Matched Stability": round(avg_stability, 3)
        })

    return pd.DataFrame(stability_results)

# Perturb the dataset: perturb_texts() inserts one Faker word into each text.
perturbed_texts = perturb_texts(texts)


In [None]:
# Print both corpora in full for a manual before/after comparison.
print(texts)
print(perturbed_texts)

['Watch this episode fully uncensored & ad-free on Patreon: https://patreon.com/dannyjonesSupport the show by checking out our sponsors:https://bit.ly/viiadannyjones - Try VIIA & use code DANNYhttps://whiterabbitenergy.com/?ref=DJP - Use code DJP for 20% off', 'If they took vaccines off the market 1/2 the country would lose their shit.  These two lunatics want to sacrifice the good for the perfect.', 'Calley how many more babies and parents have to die while youre busy with niceties? Thats the issue', 'Why would Jack turn down Rogan and Tucker? He could call them out just like he did this guy', 'TRUST NOT... anyone who Slowly Measures their speech... Correct Jack... That person is FOS,  just as Jack said in 1st 4 minutes!  A young un-wise guy vs 35+ years of Wisdom.... on display!  jmo', 'Transparency bro', "OMG  Danny thank you for hosting this discussion.  I watched it twice, and the suit guy is so full of double speak, it was hard to sit through.   Scary to think he's headed back to

In [None]:
# Re-cluster the perturbed corpus with the same `topic_model` instance.
# NOTE(review): presumably this replaces the earlier fit, so `topic_model`
# describes the perturbed run from here on — confirm against BERTopic's docs.
perturbed_topics, perturbed_probs = topic_model.fit_transform(perturbed_texts)

In [None]:
# Label the perturbed run. All three calls must go through `second_labeler`:
# calling the original `labeler` re-labels the unperturbed texts/topics, which
# makes deterministic methods look perfectly stable.
second_labeler = TopicLabeler(perturbed_texts, perturbed_topics)
second_bertopic_labels = second_labeler.label_with_bertopic(topic_model)
second_keybert_labels = second_labeler.label_with_keybert(embedding_model)
second_gemini_labels = second_labeler.label_with_gemini(gemini_model)

second_labels_dict = {"BERTopic": second_bertopic_labels, "KeyBERT": second_keybert_labels, "Gemini": second_gemini_labels}

In [None]:
# Compare the labels of the original run with those of the perturbed run.
stability_df = compute_stability_with_matching(labels_dict, second_labels_dict)
display(stability_df)

Unnamed: 0,Model,Average Matched Stability
0,BERTopic,0.465
1,KeyBERT,1.0
2,Gemini,0.504


In [None]:
def final_model_ranking(purity_df, stability_df, gemini_explanation_df, weights=(0.3, 0.3, 0.4)):
    """Combine purity, stability and Gemini score into one ranked table.

    Args:
        purity_df: DataFrame with the columns ``Model`` and ``Average Purity``.
            It is no longer modified in place.
        stability_df: DataFrame with the columns ``Model`` and
            ``Average Matched Stability``.
        gemini_explanation_df: DataFrame with the columns ``Model`` and
            ``Average Score`` (0-100 scale); other columns are ignored.
        weights: ``(purity, stability, gemini)`` weights of the final score.

    Returns:
        DataFrame with the columns ``Model``, ``Average Purity``,
        ``Stability``, ``Gemini Normalized`` and ``Final Score``, sorted by
        ``Final Score`` in descending order. Models missing from
        ``gemini_explanation_df`` are dropped by the inner merge.
    """
    purity_weight, stability_weight, gemini_weight = weights

    # Work on a copy so the caller's purity table keeps its original columns.
    ranking_df = purity_df.copy()

    # Mean stability per model; models absent from stability_df count as 0.
    stabilities = []
    for model in ranking_df["Model"]:
        avg_stability = stability_df.loc[stability_df["Model"] == model, "Average Matched Stability"].mean()
        stabilities.append(round(avg_stability, 3) if not np.isnan(avg_stability) else 0)
    ranking_df["Stability"] = stabilities

    # Merge with Gemini Scores
    merged_df = pd.merge(ranking_df, gemini_explanation_df, on="Model")

    # Bring the 0-100 Gemini score onto the same 0-1 scale as the other metrics.
    merged_df["Gemini Normalized"] = merged_df["Average Score"] / 100

    # Compute Final Score
    merged_df["Final Score"] = (
        purity_weight * merged_df["Average Purity"] +
        stability_weight * merged_df["Stability"] +
        gemini_weight * merged_df["Gemini Normalized"]
    )

    # Keep only the reported columns, in a fixed order (drops "Average Score"
    # and any extra column the Gemini table carried).
    columns_order = ["Model", "Average Purity", "Stability", "Gemini Normalized", "Final Score"]
    merged_df = merged_df[columns_order]

    merged_df = merged_df.sort_values("Final Score", ascending=False)
    print("=== Final Model Ranking (with Gemini normalized properly) ===")

    # `display` exists only inside IPython/Jupyter; fall back to print so the
    # function also works from a plain script.
    try:
        show = display
    except NameError:
        show = print
    show(merged_df)

    return merged_df



# --- Run ---
ranking_df = final_model_ranking(purity_df, stability_df, explanation_df)


NameError: name 'purity_df' is not defined