In [608]:
import pandas as pd

data_path = "D:/studium/analyse/data_prep/output/data_corpus_sampled.csv"
df = pd.read_csv(data_path, usecols=["id", "text_preproc1", "ipc", "emb"])

In [615]:
from parallel_pandas import ParallelPandas
ParallelPandas.initialize(n_cpu=8)

def ast_eval(x):
    import numpy as np
    import ast
    return np.array(ast.literal_eval(x))

df["emb"] = df["emb"].p_apply(ast_eval)

AST_EVAL DONE:   0%|          | 0/46673 [00:00<?, ?it/s]

In [33]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46673 entries, 0 to 46672
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   id             46673 non-null  object
 1   emb            46673 non-null  object
 2   text_preproc1  46673 non-null  object
dtypes: object(3)
memory usage: 1.1+ MB


In [35]:
df.head()

Unnamed: 0,id,emb,text_preproc1
0,US000011970344B2,"[0.011253111064434052, -0.00603919941931963, 0...",Apparatus and method for controlling robot arm...
1,US000011958200B2,"[0.06428353488445282, -0.029208185151219368, -...",Automatic robotic arm system and coordinating ...
2,US000011958552B2,"[0.004194360692054033, -0.08026488870382309, 0...",Bionic robot for all terrains. A bionic robot ...
3,US000011958687B2,"[0.08360709995031357, -0.04210223630070686, 0....","High position robot, method for calibrating re..."
4,US000011958688B2,"[0.0564134456217289, -0.02820468507707119, 0.0...",Area based operation by autonomous robots in a...


In [None]:
import numpy as np

texts = df["text_preproc1"].tolist()

# embeddings as numpy array
embs = np.vstack(df["emb"].values)

In [23]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from bertopic.vectorizers import ClassTfidfTransformer

model = SentenceTransformer('distiluse-base-multilingual-cased-v1')
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

In [574]:
import hdbscan

# Define your HDBSCAN model with adjusted parameters
hdbscan_model = hdbscan.HDBSCAN(min_cluster_size=50, min_samples=4, cluster_selection_epsilon=0.15)

In [575]:
from umap import UMAP

umap_model = UMAP(n_neighbors=20, n_components=5, random_state=42)

In [600]:
reduced_embeddings = umap_model.fit_transform(embs)

In [603]:
df['emb_reduced'] = reduced_embeddings.tolist()

In [604]:
df['emb_reduced']

0        [-0.1231079176068306, 0.5581996440887451, 4.18...
1        [1.215340495109558, 0.6582039594650269, 2.7411...
2        [1.6027783155441284, 2.3087809085845947, 3.742...
3        [0.31770941615104675, 0.5081982016563416, 4.43...
4        [-0.2442733645439148, 0.8733708262443542, 2.94...
                               ...                        
46668    [1.0200252532958984, 1.0609917640686035, 3.737...
46669    [0.3853992223739624, 1.2509907484054565, 3.565...
46670    [0.5257968306541443, 1.2463735342025757, 3.378...
46671    [0.7987768650054932, 1.1554850339889526, 3.310...
46672    [1.642073392868042, 1.1747041940689087, 3.9171...
Name: emb_reduced, Length: 46673, dtype: object

In [576]:
topic_model = BERTopic(embedding_model=model, hdbscan_model=hdbscan_model, umap_model=umap_model, ctfidf_model=ctfidf_model, verbose=True)

In [617]:
topics = set(df["ipc"].tolist())

In [622]:
topics, probabilities = topic_model.fit_transform(texts, embeddings=embs)

2024-05-28 16:27:40,044 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm


2024-05-28 16:28:05,623 - BERTopic - Dimensionality - Completed ✓
2024-05-28 16:28:05,625 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-05-28 16:28:06,897 - BERTopic - Cluster - Completed ✓
2024-05-28 16:28:06,898 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-05-28 16:28:08,935 - BERTopic - Representation - Completed ✓
2024-05-28 16:28:08,935 - BERTopic - Topic reduction - Reducing number of topics
2024-05-28 16:28:10,937 - BERTopic - Topic reduction - Reduced number of topics from 90 to 20


In [623]:
# Assign the topics to the DataFrame
df['berttopic'] = topics

In [624]:
topic_model.get_topic_info(-1).Count[0]

17982

#### 1
```
Topic -1 Count: 1991
umap_model = UMAP(n_neighbors=10, min_dist=0.0, n_components=8, random_state=42)
hdbscan_model = hdbscan.HDBSCAN(min_cluster_size=60, min_samples=2, cluster_selection_epsilon=0.1)
topic_model = BERTopic(hdbscan_model=hdbscan_model, umap_model=umap_model, embedding_model=model)
```

In [625]:
freq = topic_model.get_topic_info();

0	-1	23358	-1_der_die_und_object
1	0	2304	0_exoskeleton_limb_rehabilitation_waist
2	1	1400	1_surgical_instrument_surgery_patient
3	2	970	2_welding_seam_gun_weld
4	3	602	3_pipeline_pipe_dredging_oil
5	4	508	4_charging_pile_recharging_charged
6	5	429	5_planning_path_global_algorithm
7	6	398	6_image_images_captured_camera

0	-1	17982	-1_der_und_die_eine
1	0	3375	0_underwater_cleaning_water_dust
2	1	3332	1_surgical_instrument_effector_surgery
3	2	2580	2_cloud_path_map_planning
4	3	2293	3_exoskeleton_limb_rehabilitation_waist
5	4	1219	4_map_apparatus_program_trajectory
6	5	1045	5_chassis_carrying_tray_goods
7	6	997	6_welding_seam_gun_weld

In [626]:
freq[['Topic', 'Count', 'Name']].head(30)

Unnamed: 0,Topic,Count,Name
0,-1,17982,-1_system_for_an_in
1,0,19473,0_cleaning_body_robot_welding
2,1,3661,1_surgical_instrument_robotic_effector
3,2,2293,2_exoskeleton_limb_rehabilitation_joint
4,3,1014,3_substrate_sorting_transfer_chamber
5,4,619,4_rpa_voice_automation_dialogue
6,5,247,5_aerial_wind_unmanned_flying
7,6,239,6_food_cooking_kitchen_processor
8,7,159,7_cup_glass_suction_fluid
9,8,143,8_agv_gait_biped_carrying


In [627]:
topic_model.get_topic(5)

[('aerial', 0.5583696692712207),
 ('wind', 0.4762240339759627),
 ('unmanned', 0.4664865895009757),
 ('flying', 0.4525764447657768),
 ('turbine', 0.41122533731574307),
 ('wing', 0.3922356590535333),
 ('blade', 0.3767734351352035),
 ('flight', 0.34517307599310126),
 ('flapping', 0.34396367955885493),
 ('air', 0.34376343103044876)]

In [628]:
fig = topic_model.visualize_topics(); fig

In [629]:
from scipy.cluster import hierarchy as sch

# Hierarchical topics
linkage_function = lambda x: sch.linkage(x, 'single', optimal_ordering=True)
hierarchical_topics = topic_model.hierarchical_topics(texts, linkage_function=linkage_function)

100%|██████████| 18/18 [00:00<00:00, 237.40it/s]


In [630]:
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

In [631]:
topic_model.get_topic_tree(hierarchical_topics)

'.\n├─■──der_einer_werkstückträgervorrichtung_des_eines ── Topic: 17\n└─is_with_body_cleaning_comprises\n     ├─is_cleaning_body_with_comprises\n     │    ├─■──neural_network_training_neuronalen_netzes ── Topic: 14\n     │    └─is_body_cleaning_with_comprises\n     │         ├─■──vehicle_passenger_autonomous_reality_service ── Topic: 18\n     │         └─is_body_cleaning_with_comprises\n     │              ├─is_cleaning_body_with_comprises\n     │              │    ├─■──fruit_picking_fruits_vegetable_vine ── Topic: 13\n     │              │    └─is_cleaning_body_with_comprises\n     │              │         ├─is_cleaning_body_with_comprises\n     │              │         │    ├─is_cleaning_body_with_comprises\n     │              │         │    │    ├─■──brake_braking_band_electromagnetic_brakes ── Topic: 11\n     │              │         │    │    └─cleaning_is_body_with_comprises\n     │              │         │    │         ├─■──snakelike_snake_shaped_section_bionic ── Topic: 16\n  

In [632]:
# topics_to_merge = [[1, 2],
#                    [3, 4]]
# topic_model.merge_topics(docs, topics_to_merge)

In [633]:
topic_model.visualize_heatmap()

In [634]:
topic_model.visualize_term_rank()

In [635]:
# topic_model.update_topics(docs, n_gram_range=(1, 3))

In [636]:
topic_model.topic_aspects_

{}

In [637]:
topic_model.visualize_barchart()

In [638]:
# topic_model.visualize_documents(texts)

In [639]:
# topic_model.visualize_distribution(probabilities[200], min_probability=0.015)

In [640]:
topic_model.reduce_topics(texts, nr_topics=20)

2024-05-28 16:28:13,258 - BERTopic - Topic reduction - Reducing number of topics
2024-05-28 16:28:13,259 - BERTopic - Topic reduction - Reduced number of topics from 20 to 20


<bertopic._bertopic.BERTopic at 0x15bbee22820>

In [641]:
topic_model.topics_

[-1,
 0,
 0,
 -1,
 -1,
 5,
 -1,
 0,
 -1,
 -1,
 -1,
 0,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 0,
 -1,
 -1,
 14,
 -1,
 1,
 -1,
 -1,
 -1,
 1,
 -1,
 -1,
 -1,
 0,
 0,
 -1,
 -1,
 0,
 0,
 -1,
 -1,
 -1,
 0,
 -1,
 -1,
 0,
 3,
 -1,
 1,
 1,
 -1,
 -1,
 1,
 1,
 2,
 1,
 -1,
 1,
 -1,
 0,
 -1,
 -1,
 -1,
 1,
 1,
 -1,
 0,
 0,
 -1,
 1,
 1,
 -1,
 -1,
 0,
 6,
 0,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 0,
 -1,
 -1,
 0,
 -1,
 1,
 1,
 1,
 -1,
 1,
 1,
 1,
 -1,
 -1,
 1,
 -1,
 2,
 1,
 1,
 1,
 -1,
 -1,
 -1,
 1,
 1,
 1,
 -1,
 -1,
 2,
 1,
 3,
 3,
 -1,
 -1,
 11,
 -1,
 1,
 -1,
 0,
 1,
 -1,
 -1,
 -1,
 -1,
 3,
 -1,
 -1,
 3,
 -1,
 -1,
 0,
 -1,
 -1,
 1,
 0,
 4,
 1,
 -1,
 1,
 3,
 -1,
 2,
 0,
 -1,
 4,
 -1,
 -1,
 -1,
 10,
 1,
 1,
 0,
 6,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 3,
 12,
 12,
 -1,
 -1,
 1,
 1,
 1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 0,
 -1,
 1,
 0,
 0,
 -1,
 -1,
 0,
 -1,
 2,
 2,
 0,
 -1,
 1,
 -1,
 1,
 1,
 -1,
 -1,
 0,
 13,
 -1,
 0,
 0,
 0,
 1,
 -1,
 1,
 1,
 2,
 2,
 1,
 0,
 -1,
 -1,
 4,
 3,
 -1,
 -1,
 0,
 1,
 4,
 0,
 -1,
 -1,
 0,
 0,
 0,
 

In [642]:
topic_model.generate_topic_labels()

['-1_system_for_an',
 '0_cleaning_body_robot',
 '1_surgical_instrument_robotic',
 '2_exoskeleton_limb_rehabilitation',
 '3_substrate_sorting_transfer',
 '4_rpa_voice_automation',
 '5_aerial_wind_unmanned',
 '6_food_cooking_kitchen',
 '7_cup_glass_suction',
 '8_agv_gait_biped',
 '9_catheter_interventional_guidewire',
 '10_printing_print_printer',
 '11_brake_braking_band',
 '12_tactile_sensor_layer',
 '13_fruit_picking_fruits',
 '14_neural_network_training',
 '15_gear_planetary_wave',
 '16_snakelike_snake_shaped',
 '17_der_einer_werkstückträgervorrichtung',
 '18_vehicle_passenger_autonomous']

In [643]:
topic_model.reduce_outliers(texts, topics)

100%|██████████| 18/18 [00:10<00:00,  1.79it/s]


[3,
 0,
 0,
 3,
 3,
 5,
 18,
 0,
 12,
 4,
 0,
 0,
 1,
 0,
 14,
 18,
 4,
 18,
 0,
 18,
 4,
 14,
 16,
 1,
 4,
 1,
 7,
 1,
 18,
 1,
 1,
 0,
 0,
 4,
 1,
 0,
 0,
 1,
 3,
 1,
 0,
 18,
 3,
 0,
 3,
 3,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 11,
 0,
 0,
 4,
 1,
 1,
 0,
 1,
 0,
 6,
 0,
 4,
 1,
 4,
 1,
 0,
 3,
 0,
 11,
 1,
 0,
 4,
 1,
 1,
 1,
 9,
 1,
 1,
 1,
 2,
 0,
 1,
 6,
 2,
 1,
 1,
 1,
 3,
 1,
 4,
 1,
 1,
 1,
 3,
 0,
 2,
 1,
 3,
 3,
 3,
 1,
 11,
 18,
 1,
 3,
 0,
 1,
 1,
 1,
 1,
 11,
 3,
 18,
 13,
 3,
 3,
 3,
 0,
 1,
 15,
 1,
 0,
 4,
 1,
 15,
 1,
 3,
 0,
 2,
 0,
 18,
 4,
 3,
 0,
 3,
 10,
 1,
 1,
 0,
 6,
 0,
 1,
 1,
 1,
 1,
 3,
 3,
 12,
 12,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 3,
 5,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 4,
 0,
 1,
 2,
 2,
 0,
 1,
 1,
 3,
 1,
 1,
 7,
 3,
 0,
 13,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 2,
 2,
 1,
 0,
 3,
 0,
 4,
 3,
 3,
 18,
 0,
 1,
 4,
 0,
 18,
 1,
 0,
 0,
 0,
 4,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 10,
 3,
 13,
 5,
 1,
 4,
 0,
 0,
 18,
 0,
 0,
 0,
 3,

In [644]:
topic_model.save("../output/bertopic_model_v1")

