In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import HDBSCAN
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.datasets import fetch_20newsgroups
from umap import UMAP
import numpy as np

# Default to an empty corpus; the `if corpus:` check further down then skips
# the pipeline whenever the dataset could not be loaded.
corpus = []
try:
    # Request the "all" subset of 20 Newsgroups and keep only its raw texts.
    newsgroups = fetch_20newsgroups(subset="all")
    corpus = newsgroups.data
except Exception as load_error:
    # Best-effort load: report the problem and continue with the empty corpus.
    print(f"Error loading dataset: {load_error}")

if corpus:
    # Pipeline: embed documents -> reduce dimensionality -> cluster ->
    # describe every cluster by the mean TF-IDF weight of its documents.

    # Encode each document into a dense vector with a pretrained model.
    model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
    embeddings = model.encode(corpus, show_progress_bar=True)

    # Project the embeddings down to 10 components before clustering.
    reducer = UMAP(n_components=10)
    reduced_embeddings = reducer.fit_transform(embeddings)

    # Cluster the reduced vectors; one integer label per document.
    cluster = HDBSCAN(min_samples=10, min_cluster_size=10)
    cluster.fit(reduced_embeddings)
    labels = cluster.labels_

    # TF-IDF over the whole corpus, limited to 5000 terms. X stays sparse:
    # densifying it (documents x 5000 floats) is never needed and is very
    # memory-hungry on the full corpus.
    vectorizer = TfidfVectorizer(max_features=5000)
    X = vectorizer.fit_transform(corpus)

    # The vocabulary is fixed after fitting, so fetch the term names once
    # instead of once per (cluster, term) pair.
    feature_names = vectorizer.get_feature_names_out()

    # Mean TF-IDF vector per cluster, in ascending label order. The rows of X
    # are already the transformed documents, so select them with a boolean
    # mask rather than re-vectorizing the cluster's texts.
    class_tfidf_matrix = []
    for label in np.unique(labels):
        if label == -1:
            # -1 is skipped (HDBSCAN's noise label): it is not a topic.
            continue
        cluster_rows = X[labels == label]
        # Sparse .mean() may return a 1 x n_terms matrix; flatten to 1-D.
        class_tfidf = np.asarray(cluster_rows.mean(axis=0)).ravel()
        class_tfidf_matrix.append(class_tfidf)

    # For every cluster keep the (term, weight) pairs with a positive mean
    # weight, in vocabulary order.
    topics = []
    for class_tfidf in class_tfidf_matrix:
        present = np.flatnonzero(class_tfidf > 0)
        topics.append([(feature_names[i], class_tfidf[i]) for i in present])

    for topic in topics:
        print(topic)
else:
    print("No data to process")