# ***topic_modeling_sentence_bert***

## Import the Libraries

In [23]:
from sklearn.datasets import fetch_20newsgroups
import pandas as pd
from sentence_transformers import SentenceTransformer
import umap
import numpy as np

In [2]:
# Load dataset
newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))


In [14]:
# View sample data
df = pd.DataFrame(newsgroups.data[:400],columns=["text"])
df

Unnamed: 0,text
0,\n\nI am sure some bashers of Pens fans are pr...
1,My brother is in the market for a high-perform...
2,\n\n\n\n\tFinally you said what you dream abou...
3,\nThink!\n\nIt's the SCSI card doing the DMA t...
4,1) I have an old Jasmine drive which I cann...
...,...
395,\n\nEither the government has force available ...
396,I'm new to the hardware and with a mandate to ...
397,"hi all,\n\nIN SHORT: looking for very fast ass..."
398,"\nThe ""so sacred it's secret"" explanation is a..."


## Load the sentence bert

In [16]:
model = SentenceTransformer('all-MiniLM-L6-v2')

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

## Computer Embeddings

In [24]:
embeddings=  np.array(df["text"].apply(lambda x : model.encode(x)).tolist())

In [26]:
print(embeddings.shape)

(400, 384)


## Apply umap for the Dimensionality Reduction

In [28]:
reducer = umap.UMAP()
reduced_embeddings = reducer.fit_transform(embeddings)



In [34]:
df["red_x"] = reduced_embeddings[:,0]
df["red_y"] = reduced_embeddings[:,1]

## Do HDBSCAN clustering


In [39]:
import hdbscan

In [40]:
clusterer = hdbscan.HDBSCAN(min_cluster_size=2,min_samples=1,metric="euclidean",cluster_selection_method="eom")

In [41]:
df["cluster"] =  clusterer.fit_predict(reduced_embeddings)



## Do K-Means Clustering

In [52]:
from sklearn.cluster import KMeans
num_clusters =  5
clusterer = KMeans(n_clusters=num_clusters, random_state=42, n_init=10)
df['cluster'] = clusterer.fit_predict(reduced_embeddings)

## Using CountVectorizer to find similar topics

In [53]:
from sklearn.feature_extraction.text import CountVectorizer

In [54]:
vectorizer = CountVectorizer(stop_words="english")

In [55]:
def get_top_words_per_cluster(df, vectorizer, num_words=5):
    cluster_keywords = {}

    for cluster in sorted(df['cluster'].unique()):
        if cluster == -1:  # Ignore noise points
            continue

        cluster_texts = df[df['cluster'] == cluster]['text']

        # Check if there is valid text
        if cluster_texts.empty:
            print(f"⚠️ Skipping empty cluster {cluster}")
            continue

        # Convert text to lowercase to ensure consistency
        cluster_texts = cluster_texts.str.lower()

        try:
            X = vectorizer.fit_transform(cluster_texts)  # Transform text to word count matrix

            if not X.shape[1]:  # If no valid words are found
                print(f"⚠️ Skipping cluster {cluster}: No valid words after vectorization.")
                continue

            words = np.array(vectorizer.get_feature_names_out())  # Extract feature names
            word_counts = np.asarray(X.sum(axis=0)).flatten()  # Sum counts for each word

            # Get top words
            top_words = words[np.argsort(word_counts)[-num_words:]][::-1]  # Sort and get top N words
            cluster_keywords[cluster] = top_words.tolist()

        except ValueError:
            print(f"⚠️ Skipping cluster {cluster}: Empty vocabulary detected.")
            continue

    return cluster_keywords

In [56]:
get_top_words_per_cluster(df, vectorizer, num_words=5)

⚠️ Skipping cluster 3: Empty vocabulary detected.


{np.int32(0): ['nyr', 'det', 'bos', 'tor', 'mtl'],
 np.int32(1): ['use', 'windows', 'know', 'edu', 'thanks'],
 np.int32(2): ['people', 'god', 'like', 'said', 'law'],
 np.int32(4): ['water', 'just', 'like', 'use', 'know']}

## Class based TfIDF

In [57]:
from sklearn.feature_extraction.text import TfidfTransformer
# Prepare data for c-TF-IDF
def get_ctfidf_topics(df, num_words=5):
    cluster_topics = {}

    for cluster in sorted(df['cluster'].unique()):
        if cluster == -1:  # Ignore noise points
            continue

        cluster_texts = df[df['cluster'] == cluster]['text']

        if cluster_texts.empty:
            print(f"⚠️ Skipping empty cluster {cluster}")
            continue

        cluster_texts = cluster_texts.str.lower()  # Normalize text

        # Convert text into a single document per cluster
        cluster_document = " ".join(cluster_texts)

        # Ensure the document isn't empty after processing
        if not cluster_document.strip():
            print(f"⚠️ Skipping cluster {cluster}: No valid words to process.")
            continue

        # Vectorize the cluster document
        vectorizer = CountVectorizer(stop_words="english")
        try:
            X = vectorizer.fit_transform([cluster_document])
        except ValueError:
            print(f"⚠️ Skipping cluster {cluster}: Empty vocabulary detected.")
            continue

        # Check if the vocabulary is empty after vectorization
        if not X.shape[1]:
            print(f"⚠️ Skipping cluster {cluster}: No meaningful words found.")
            continue

        # Apply TF-IDF transformation
        transformer = TfidfTransformer()
        tfidf_matrix = transformer.fit_transform(X)

        words = np.array(vectorizer.get_feature_names_out())
        scores = np.asarray(tfidf_matrix.sum(axis=0)).flatten()

        # Get top words based on c-TF-IDF scores
        top_words = words[np.argsort(scores)[-num_words:]][::-1]  # Sort and select top N words
        cluster_topics[cluster] = top_words.tolist()

    return cluster_topics


# Get c-TF-IDF topics
cluster_topics = get_ctfidf_topics(df)

# Print the top words per cluster
print("\n🔹 **Top c-TF-IDF Words per Cluster** 🔹")
for cluster, words in cluster_topics.items():
    print(f"Cluster {cluster}: {', '.join(words)}")

⚠️ Skipping cluster 3: Empty vocabulary detected.

🔹 **Top c-TF-IDF Words per Cluster** 🔹
Cluster 0: nyr, det, bos, tor, mtl
Cluster 1: use, windows, know, edu, thanks
Cluster 2: people, god, like, said, law
Cluster 4: water, just, like, use, know
