In [None]:
import os
import pickle

import hdbscan
import numpy as np
import openai
import pandas as pd
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired, OpenAI
from bertopic.vectorizers import ClassTfidfTransformer
from nltk.corpus import stopwords
from parallel_pandas import ParallelPandas
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

from utils import create_preumap
from utils import custom_save

# Initialise parallel_pandas before any `p_apply` call later in the notebook.
# NOTE(review): presumably this is what registers the `p_*` methods on pandas
# objects and `n_cpu=8` is the worker count -- confirm in the parallel_pandas docs.
ParallelPandas.initialize(n_cpu=8)

In [2]:
# Two input CSVs: one holding the text and IPC columns, the other holding the
# embedding columns. Only the columns used in this notebook are read.
data_path = "../data_input/data_corpus.csv"
dataemb_path = "../data_input/dataemb_corpus.csv"

df = pd.read_csv(
    data_path,
    usecols=["id", "text_preproc1", "ipc"],
)
df_emb = pd.read_csv(
    dataemb_path,
    usecols=["id", "emb", "emb_reduced"],
)

In [3]:
def ast_eval(x):
    """Convert the string form of a Python list literal into a NumPy array.

    Values that are already NumPy arrays are returned unchanged, so the cell
    that applies this function to a DataFrame column can be re-run without
    failing on the already-converted column.

    Args:
        x: A string containing a (possibly nested) list literal, e.g.
            ``"[0.1, 0.2]"``, or an ``np.ndarray``.

    Returns:
        np.ndarray: The parsed array, or ``x`` itself if it was already an array.

    Raises:
        ValueError, SyntaxError: Propagated from ``ast.literal_eval`` when ``x``
            is not a valid Python literal.
    """
    # Imports are kept local to the function.
    # NOTE(review): presumably so the function is self-contained when
    # parallel_pandas ships it to worker processes -- confirm before hoisting.
    import numpy as np
    import ast

    if isinstance(x, np.ndarray):
        # Already parsed (e.g. the cell was executed a second time).
        return x
    return np.array(ast.literal_eval(x))

# Both embedding columns are read from the CSV as strings; parse each one into
# NumPy arrays using the parallel apply.
for _emb_col in ("emb", "emb_reduced"):
    df_emb[_emb_col] = df_emb[_emb_col].p_apply(ast_eval)

AST_EVAL DONE:   0%|          | 0/123667 [00:00<?, ?it/s]

AST_EVAL DONE:   0%|          | 0/123667 [00:00<?, ?it/s]

In [4]:
# Texts (from `df`) and embeddings (from `df_emb`) are loaded from two separate
# CSVs and are paired purely by row position below. Check that both frames list
# the same ids in the same order; otherwise every document would silently be
# matched with another document's embedding.
if not np.array_equal(df["id"].to_numpy(), df_emb["id"].to_numpy()):
    raise ValueError(
        "df and df_emb are not row-aligned on 'id'; "
        "texts and embeddings would be mismatched"
    )

texts = df["text_preproc1"].tolist()

# embeddings as numpy array (np.vstack stacks the per-row arrays vertically)
embs = np.vstack(df_emb["emb"].values)
embs_reduced = np.vstack(df_emb["emb_reduced"].values)

In [5]:
# IPC value of every document as a plain list; passed as `y` when the topic
# model is fitted further down.
ipc_kategories = df["ipc"].to_list()

In [6]:
# c-TF-IDF transformer with the `reduce_frequent_words` option switched on.
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

# Sentence-embedding model; handed to BERTopic as its `embedding_model` in
# `get_model` below.
model = SentenceTransformer('distiluse-base-multilingual-cased-v1')

In [7]:
# Build the object passed to BERTopic as `umap_model` from the already reduced
# embeddings.
# NOTE(review): `create_preumap` lives in utils and is not visible here; presumably
# it returns the precomputed reduction instead of fitting UMAP again -- confirm.
umap_model = create_preumap(embs_reduced)

In [8]:
# Stop-word sources: NLTK's English and German lists, plus robot-related terms
# (presumably too frequent in this corpus to be informative -- confirm).
english_stop_words = set(stopwords.words('english'))
german_stop_words = set(stopwords.words('german'))
custom_stop_words = {"robot", "robotics", "roboter", "robotic", "robots", "robotically"}

# Union of all three sets.
all_stop_words = english_stop_words | german_stop_words | custom_stop_words

# Vectorizer handed to BERTopic as `vectorizer_model`: unigrams and bigrams,
# dropping terms with document frequency above 0.7 or seen in fewer than 5 documents.
# NOTE(review): BERTopic documents `vectorizer_model` as a CountVectorizer; since a
# ClassTfidfTransformer is also supplied, confirm TF-IDF weighting here is intended.
tfidf_vectorizer = TfidfVectorizer(
    stop_words=list(all_stop_words),
    max_df=0.7,
    min_df=5,
    ngram_range=(1, 2),
)

In [9]:
# Create a KeyBERT-inspired representation model.
# NOTE: `representation_model` is reassigned to an OpenAI-based representation in
# the next cell, before `get_model` reads it, so this instance is never used.
representation_model = KeyBERTInspired()

In [10]:
# OpenAI client used to generate topic labels.
# The API key is a secret and must not be stored in the notebook, so it is read
# from the OPENAI_API_KEY environment variable (a KeyError here means it is unset).
# Organization and project can be overridden through OPENAI_ORG_ID and
# OPENAI_PROJECT_ID; the previously hard-coded identifiers remain the defaults.
client = openai.OpenAI(
    api_key=os.environ["OPENAI_API_KEY"],
    organization=os.environ.get("OPENAI_ORG_ID", 'org-1Ka58tWTHUlKbvvaagf5gr3A'),
    project=os.environ.get("OPENAI_PROJECT_ID", 'proj_dbYcRUygXNty6uPMjX5hdhki'),
)

# Prompt template; the [DOCUMENTS] and [KEYWORDS] placeholders are part of the
# template text handed to the BERTopic OpenAI representation.
prompt = """
I have a topic that contains the following documents: 
[DOCUMENTS]
The topic is described by the following keywords: [KEYWORDS]

Based on the information above, extract a short topic label in the following format:
topic: <topic label>
"""

# This rebinds `representation_model`; it is the value `get_model` picks up.
representation_model = OpenAI(client, prompt=prompt, model="gpt-3.5-turbo", delay_in_seconds=0.1, chat=True)


In [13]:
def get_model(**params):
    """Build an HDBSCAN clusterer and a BERTopic model that uses it.

    The BERTopic model is assembled from the notebook-level objects
    `tfidf_vectorizer`, `representation_model`, `model`, `umap_model` and
    `ctfidf_model`, which are looked up when this function is called.

    Args:
        **params: Keyword arguments forwarded to `hdbscan.HDBSCAN`.
            `min_cluster_size` defaults to 30 and may be overridden here
            (previously passing it raised a duplicate-keyword TypeError).

    Returns:
        tuple: `(hdbscan_model, topic_model)` -- the clusterer and the
        unfitted BERTopic instance configured with it.
    """
    # Caller-supplied values take precedence over the default cluster size.
    hdbscan_kwargs = {"min_cluster_size": 30, **params}
    hdbscan_model = hdbscan.HDBSCAN(**hdbscan_kwargs)
    topic_model = BERTopic(
        vectorizer_model=tfidf_vectorizer,
        representation_model=representation_model,
        embedding_model=model,
        hdbscan_model=hdbscan_model,
        umap_model=umap_model,
        ctfidf_model=ctfidf_model,
        verbose=True,
    )
    return hdbscan_model, topic_model

In [14]:
# Build the clusterer and topic model with this run's HDBSCAN settings.
hdbscan_model, topic_model = get_model(
    min_samples=6,
    cluster_selection_epsilon=0.16,
)

# Fit on the texts using the embeddings loaded from disk; the IPC values are
# supplied as `y`.
topics, probabilities = topic_model.fit_transform(
    documents=texts,
    embeddings=embs,
    y=ipc_kategories,
)

2024-06-28 15:08:44,726 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-06-28 15:08:44,747 - BERTopic - Dimensionality - Completed ✓
2024-06-28 15:08:44,764 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-06-28 15:08:47,647 - BERTopic - Cluster - Completed ✓
2024-06-28 15:08:47,664 - BERTopic - Representation - Extracting topics from clusters using representation models.
100%|██████████| 172/172 [02:33<00:00,  1.12it/s]
2024-06-28 15:11:36,700 - BERTopic - Representation - Completed ✓


In [21]:
# Remove the column-width limit so long topic names are shown in full.
pd.set_option('display.max_colwidth', None)

# Topic overview restricted to id, size and name.
topic_model.get_topic_info().loc[:, ['Topic', 'Count', 'Name']]

Unnamed: 0,Topic,Count,Name
0,-1,29550,-1_Robot Pose Calibration and Autonomous Guidance with Deep Learning for Robotic Systems
1,0,23177,0_Mobile Robot Mapping and Navigation-Methods and Systems
2,1,14336,1_Robotic warehousing system with clamping jaw and speed reducer
3,2,13302,2_Cleaning robot water circulation system and base station
4,3,4602,3_Welding robot system and automatic equipment
...,...,...,...
167,166,31,166_Variable Stiffness Robot Joints
168,167,30,167_Pressure Detection and Measurement Devices for Robotic Systems
169,168,30,168_Modular Charging and Power System for Vehicles and Robotics
170,169,30,169_Anti Toppling Device for Service Robots


In [None]:
from scipy.cluster import hierarchy as sch


def linkage_function(x):
    """Apply single-linkage clustering with optimal leaf ordering to ``x``."""
    return sch.linkage(x, 'single', optimal_ordering=True)


# Hierarchical topics, computed with the custom linkage defined above.
hierarchical_topics = topic_model.hierarchical_topics(
    texts,
    linkage_function=linkage_function,
)

In [None]:
# Render the topic hierarchy from the `hierarchical_topics` computed in the previous cell.
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

In [None]:
# Render BERTopic's topic overview visualisation for the fitted model.
topic_model.visualize_topics()

In [None]:
# Save the fitted topic model with the project's `custom_save` helper from utils.
# NOTE(review): the helper is not visible here; what it writes under
# "output/bertopic_model_v1" and whether it creates the directory should be confirmed.
custom_save(topic_model, "output/bertopic_model_v1")

In [None]:
# Save the hierarchical-topics result to disk.
# Create the target directory first: `open(..., 'wb')` raises FileNotFoundError
# when a parent directory is missing, which would discard the result of the
# expensive computation above.
hierarch_path = 'output/big_topics/hierarch.pkl'
os.makedirs(os.path.dirname(hierarch_path), exist_ok=True)
with open(hierarch_path, 'wb') as f:
    pickle.dump(hierarchical_topics, f)