In [8]:
import logging
import os
import uuid

import pandas as pd
from sklearn.model_selection import train_test_split

from autotm.base import AutoTM


# Emit INFO-level records so the AutoTM stage messages are shown in the cell output.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger()


# Input CSV and the name of the optimisation algorithm passed to AutoTM.
path_to_dataset = "data/lenta/sample_dataset_lenta.csv"
alg_name = "ga"

# Fixed seed so that the train/test split is identical on every
# "Restart Kernel -> Run All"; change the value to draw a different split.
RANDOM_SEED = 42

df = pd.read_csv(path_to_dataset)
# Hold out 10% of the rows for the post-load prediction check.
train_df, test_df = train_test_split(df, test_size=0.1, random_state=RANDOM_SEED)

# Each run gets its own uniquely named working directory; the saved model
# lives in a sub-path of it.
working_dir_path = f"./autotm_workdir_{uuid.uuid4()}"
model_path = os.path.join(working_dir_path, "autotm_model")

# Text preprocessing options handed to AutoTM.
preprocessing_options = {"lang": "ru", "min_tokens_count": 3}

# Settings for the optimisation algorithm selected by `alg_name`.
optimizer_options = {
    "num_iterations": 10,
    "num_individuals": 10,
    "use_nelder_mead_in_mutation": False,
    "use_nelder_mead_in_crossover": False,
    "use_nelder_mead_in_selector": False,
    "train_option": "offline",
}

autotm = AutoTM(
    preprocessing_params=preprocessing_options,
    alg_name=alg_name,
    alg_params=optimizer_options,
    working_dir_path=working_dir_path,
)

# Fit on the training split and keep the returned mixtures for inspection.
mixtures = autotm.fit_predict(train_df)

# Log the shape and the first ten rows rather than the whole frame.
logger.info(f"Calculated train mixtures: {mixtures.shape}\n\n{mixtures.head(10).to_string()}")

# Save the fitted model under `model_path`.
autotm.save(model_path)

# Reload the saved model and run it on the held-out split to check that the
# save/load round trip still yields a model that can predict.
autotm_loaded = AutoTM.load(model_path)
mixtures = autotm_loaded.predict(test_df)

# These mixtures are computed from `test_df`, so the message must say "test"
# (it previously repeated the "train" wording of the earlier log call).
logger.info(f"Calculated test mixtures: {mixtures.shape}\n\n{mixtures.head(10).to_string()}")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
INFO:autotm.base:Stage 0: Create working dir ./autotm_workdir_45c9ac46-20e9-4c51-b47e-bf6989e0569d if not exists
INFO:autotm.base:Stage 1: Dataset preparation
INFO:autotm.preprocessing.dictionaries_preparation: batches ./autotm_workdir_45c9ac46-20e9-4c51-b47e-bf6989e0569d/78033323-ba5e-4959-88c0-85a55dddee09/batches 
 vocabulary ./autotm_workdir_45c9ac46-20e9-4c51-b47e-bf6989e0569d/78033323-ba5e-4959-88c0-85a55dddee09/test_set_data_voc.txt 
 are ready


Saved to ./autotm_workdir_45c9ac46-20e9-4c51-b47e-bf6989e0569d/78033323-ba5e-4959-88c0-85a55dddee09/ppp.csv
Starting...
part 1/1


E1029 18:27:01.817061    17 dictionary_operations.cc:381] Error at line 1, file ./autotm_workdir_45c9ac46-20e9-4c51-b47e-bf6989e0569d/78033323-ba5e-4959-88c0-85a55dddee09/test_set_data_voc.txt. Expected format: <token> [<class_id>], dictionary will be gathered in random token order


./autotm_workdir_45c9ac46-20e9-4c51-b47e-bf6989e0569d/78033323-ba5e-4959-88c0-85a55dddee09/cooc_df.txt is ready!
./autotm_workdir_45c9ac46-20e9-4c51-b47e-bf6989e0569d/78033323-ba5e-4959-88c0-85a55dddee09/cooc_tf.txt is ready!
Calculating pPMI...
Calculating pPMI...
./autotm_workdir_45c9ac46-20e9-4c51-b47e-bf6989e0569d/78033323-ba5e-4959-88c0-85a55dddee09/ppmi_tf.txt is ready!
./autotm_workdir_45c9ac46-20e9-4c51-b47e-bf6989e0569d/78033323-ba5e-4959-88c0-85a55dddee09/ppmi_df.txt is ready!


E1029 18:27:04.433176    17 dictionary_operations.cc:452] Error at line 1, file ./autotm_workdir_45c9ac46-20e9-4c51-b47e-bf6989e0569d/78033323-ba5e-4959-88c0-85a55dddee09/ppmi_tf.txt. Number of values in all lines should be equal to 3, dictionary will be gathered without cooc info
INFO:autotm.base:Stage 2: Tuning the topic model
2023-10-29 18:27:04,822 - GA_algo - INFO - Starting experiment: 1698604024
2023-10-29 18:27:04,823 - GA_algo - INFO - ALGORITHM PARAMS  number of individuals 10; number of fitness evals unlimited; number of early stopping iterations 500; crossover prob None
2023-10-29 18:27:04,829 - root - INFO - Calculating fitness...


1 validation error for IndividualDTO
fitness_value
  Input should be a valid dictionary [type=dict_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.4/v/dict_type


Exception: Some exception

In [7]:
!pip3 install en-core-web-sm==3.7.0

