In [9]:
from bertopic import BERTopic
from loguru import logger
import pandas as pd

DEFAULT_ENCODER_PATH = "distiluse-base-multilingual-cased-v1"


class Clusterizer:
    """Wrapper around a BERTopic model that groups texts into topics.

    The configured model is stored on ``self.topic_model``.
    """

    def __init__(
        self, encoder_path: str = DEFAULT_ENCODER_PATH, min_topic_size: int = 10
    ):
        """Create the underlying BERTopic model.

        Args:
            encoder_path: Passed to BERTopic as ``embedding_model``.
            min_topic_size: Passed to BERTopic as ``min_topic_size``.
        """
        self.topic_model = BERTopic(
            embedding_model=encoder_path,
            verbose=True,
            min_topic_size=min_topic_size,
            calculate_probabilities=True,
        )

        logger.info(f"Created topic model: {self.topic_model}")

    def predict_topics(self, texts: list[str]) -> tuple[list[int], dict[int, str]]:
        """Fit the topic model on ``texts`` and describe the resulting topics.

        Args:
            texts: Documents handed to ``fit_transform`` of the topic model.

        Returns:
            A ``(topics, topic_info)`` pair. ``topics`` is the first value
            returned by ``fit_transform``. ``topic_info`` maps every value of
            the ``Topic`` column of ``get_topic_info()`` to a keyword string
            derived from the matching ``Name`` value.
        """
        logger.info(f"Predicting topics for {len(texts)} texts...")
        # The second return value of fit_transform is not used here.
        topics, _ = self.topic_model.fit_transform(texts)
        logger.info(f"Predicted distinct topics: {len(set(topics))}")

        info = self.topic_model.get_topic_info()

        # Build the mapping straight from the two columns instead of renaming
        # and reassigning columns on a column-subset copy of the frame.
        # Each name is split on "_"; the first field is dropped (presumably
        # the topic id prefix - confirm against BERTopic's naming) and the
        # remaining fields are joined with ", ".
        topic_info = {
            topic_id: ", ".join(name.split("_")[1:])
            for topic_id, name in zip(info["Topic"], info["Name"])
        }

        return topics, topic_info


if __name__ == "__main__":
    # Step 1: read the reviews dataset (JSON Lines: one JSON record per line).
    data = pd.read_json(
        "notebooks/data/geo-reviews-dataset-2023.jsonl",
        lines=True,
    )

    # Step 2: extract the "text" column as a plain Python list.
    texts = data["text"].to_list()

    # Step 3: create a Clusterizer with its default settings.
    clusterizer = Clusterizer()

    # Step 4: run topic prediction on the texts.
    topics, topic_info = clusterizer.predict_topics(texts)

    # Step 5: inspect the results.
    print(f"Predicted Topics: {topics}")
    print(f"Topic Information: {topic_info}")

2024-02-13 19:10:05.970 | INFO     | __main__:__init__:19 - Created topic model: BERTopic(calculate_probabilities=True, ctfidf_model=ClassTfidfTransformer(...), embedding_model=distiluse-base-multilingual-cased-v1, hdbscan_model=HDBSCAN(...), language=None, low_memory=False, min_topic_size=10, n_gram_range=(1, 1), nr_topics=None, representation_model=None, seed_topic_list=None, top_n_words=10, umap_model=UMAP(...), vectorizer_model=CountVectorizer(...), verbose=True, zeroshot_min_similarity=0.7, zeroshot_topic_list=None)
2024-02-13 19:10:05.987 | INFO     | __main__:predict_topics:22 - Predicting topics for 10000 texts...
2024-02-13 19:10:05,991 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

2024-02-13 19:10:27,625 - BERTopic - Embedding - Completed ✓
2024-02-13 19:10:27,626 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-02-13 19:10:48,830 - BERTopic - Dimensionality - Completed ✓
2024-02-13 19:10:48,831 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-02-13 19:10:49,255 - BERTopic - Cluster - Completed ✓
2024-02-13 19:10:49,258 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-02-13 19:10:49,520 - BERTopic - Representation - Completed ✓
2024-02-13 19:10:49.547 | INFO     | __main__:predict_topics:24 - Predicted distinct topics: 2


Predicted Topics: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

{0, 1}