In [1]:
!pip install top2vec
!pip install top2vec[sentence_encoders]
!pip install top2vec[sentence_transformers]
!pip install top2vec[indexing]

/bin/bash: /opt/conda/lib/libtinfo.so.6: no version information available (required by /bin/bash)
Collecting top2vec
  Downloading top2vec-1.0.28-py3-none-any.whl (25 kB)
  Downloading top2vec-1.0.27-py3-none-any.whl (25 kB)
Collecting hdbscan>=0.8.27
  Downloading hdbscan-0.8.29.tar.gz (5.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m33.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: hdbscan
  Building wheel for hdbscan (pyproject.toml) ... [?25ldone
[?25h  Created wheel for hdbscan: filename=hdbscan-0.8.29-cp37-cp37m-linux_x86_64.whl size=3426690 sha256=c4478dade0a046ecea74c85edd737bb25a21423fe694d89427707481772eea3e
  Stored in directory: /root/.cache/pip/wheels/93/78/2e/03ee191669a772e9653260aa3bd53e0b1a768751a9676e8c82
Success

In [2]:
import numpy as np
import pandas as pd

import os
from tqdm import tqdm
from top2vec import Top2Vec

In [3]:
report_pages_ds_filename = '/kaggle/input/rspp-reports-pages/report_pages_ds.csv'
pages_df = pd.read_csv(report_pages_ds_filename, index_col=0)
pages_df.head()

Unnamed: 0,report_num,page_num,text
0,341,187,период летний обычный период существенный влия...
1,1173,9,основной направление природоохранный экология ...
2,49,75,мнение заинтересованный сторона внутренний вне...
3,1253,99,действие сфера культура спор тот г. средство в...
4,942,64,л


In [4]:
pages_df.shape[0]

118157

In [5]:
report_pages = list(pages_df.text)

In [6]:
hdbscan_args = {'min_cluster_size': 30,
                'min_samples': 15,
                'metric': 'euclidean'}
umap_args = {"low_memory": True,
             "random_state": 42}
model = Top2Vec(documents=report_pages, 
                speed='deep-learn', 
                workers=8, 
                min_count=0, 
                embedding_model='universal-sentence-encoder-multilingual',
                hdbscan_args=hdbscan_args,
                umap_args=umap_args)

2023-03-01 10:56:13,708 - top2vec - INFO - Pre-processing documents for training
2023-03-01 10:58:04,870 - top2vec - INFO - Downloading universal-sentence-encoder-multilingual model
2023-03-01 10:58:16,279 - top2vec - INFO - Creating joint document/word embedding
2023-03-01 11:01:04,970 - top2vec - INFO - Creating lower dimension embedding of documents
2023-03-01 11:04:23,005 - top2vec - INFO - Finding dense areas of documents
2023-03-01 11:04:28,683 - top2vec - INFO - Finding topics


In [None]:
# model.add_documents(report_pages[20000:40000])
# model.add_documents(report_pages[40000:60000])
# model.add_documents(report_pages[60000:80000])
# model.add_documents(report_pages[80000:100000])
# model.add_documents(report_pages[100000:])

In [8]:
num_of_topics = model.get_num_topics()
num_of_topics

325

In [9]:
topic_sizes, topic_nums = model.get_topic_sizes()
topic_nums, topic_sizes

(array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
         13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
         26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
         39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
         52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
         65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
         78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
         91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
        104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
        117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
        130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
        143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
        156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
        169, 170, 171, 172, 173, 174, 175, 176, 177

In [12]:
model.save('model_pages_2')

In [13]:
topic_words, word_scores, topic_nums = model.get_topics(num_of_topics)

In [14]:
pages_topics = {'topic_id': [],
                'topic_words': []}
for topic_id, topic_w in tqdm(zip(topic_nums, topic_words)):
    pages_topics['topic_id'].append(topic_id)
    pages_topics['topic_words'].append(', '.join(topic_w))

325it [00:00, 40231.05it/s]


In [15]:
pages_topics_df = pd.DataFrame.from_dict(pages_topics)
pages_topics_df.head()

Unnamed: 0,topic_id,topic_words
0,0,"работник, персонал, сотрудник, кадровыи, зарпл..."
1,1,"областнои, волонтерство, фонд, спонсорство, фо..."
2,2,"эколог, экологическии, экологичныи, экологичес..."
3,3,"ценность, рентабельность, объективность, преем..."
4,4,"инвестиция, атомпроект, инвестиционныи, атомэн..."


In [16]:
pages_topics_df.to_excel('pages_topics.xlsx', index=False)

In [None]:
for topic in topic_nums:
    model.generate_topic_wordcloud(topic)