In [1]:
import os
import logging
import pickle
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from gensim.models import Phrases, LdaModel
from gensim.corpora import Dictionary
from gensim.models import CoherenceModel
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
from pprint import pprint
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pandas as pd

class TopicModeler:
    """End-to-end LDA topic-modelling pipeline over a folder of ``.txt`` files.

    The pipeline stages are: load raw text, tokenize/clean/lemmatize, append
    detected bigrams, drop the highest- and lowest-scoring TF-IDF terms, build
    a gensim dictionary and bag-of-words corpus, train an ``LdaModel``, and
    report / persist / visualise the result.

    Attributes are populated progressively by the stage methods:
        docs: raw strings after ``load_documents``; token lists after
            ``preprocess_documents``.
        filtered_docs: token lists after TF-IDF filtering.
        dictionary, corpus, model: gensim objects built by later stages.
        coherence_scores: ``(num_topics, coherence)`` tuples, one per run.
    """

    def __init__(self, input_path):
        """Store the input folder and create the tokenizer and lemmatizer.

        Args:
            input_path: Directory that contains the ``.txt`` documents.
        """
        self.input_path = input_path
        self.docs = []
        self.filtered_docs = []
        self.dictionary = None
        self.corpus = None
        self.model = None
        self.coherence_scores = []
        self.tokenizer = RegexpTokenizer(r'\w+')
        self.lemmatizer = WordNetLemmatizer()
        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    @staticmethod
    def _ensure_parent_dir(file_path) -> None:
        """Create the parent directory of *file_path* if it has one.

        ``os.makedirs('')`` raises ``FileNotFoundError``, so a bare filename
        (no directory component) must be skipped rather than passed through.
        """
        parent = os.path.dirname(file_path)
        if parent:
            os.makedirs(parent, exist_ok=True)

    @staticmethod
    def _extreme_indices(scores, top_n, bottom_n):
        """Return indices of the *top_n* largest and *bottom_n* smallest scores.

        A plain ``order[-top_n:]`` slice would select *every* index when
        ``top_n == 0``; non-positive counts are therefore treated as
        "select nothing". Counts larger than ``len(scores)`` select all.

        Args:
            scores: 1-D numpy array of per-term scores.
            top_n: Number of highest-scoring indices to return.
            bottom_n: Number of lowest-scoring indices to return.

        Returns:
            1-D numpy array: top indices followed by bottom indices.
        """
        order = np.argsort(scores)
        top = order[-top_n:] if top_n > 0 else order[:0]
        bottom = order[:bottom_n] if bottom_n > 0 else order[:0]
        return np.concatenate([top, bottom])

    def load_documents(self) -> None:
        """Read every ``.txt`` file in ``input_path`` into ``self.docs``.

        Previously loaded or preprocessed documents are discarded first, so
        the method can be called once per run without accumulating stale
        entries. Files are read in sorted filename order so the document
        order does not depend on the filesystem.
        """
        self.docs = []
        for filename in sorted(os.listdir(self.input_path)):
            file_path = os.path.join(self.input_path, filename)
            if not (os.path.isfile(file_path) and filename.endswith('.txt')):
                continue
            with open(file_path, 'r', encoding='utf-8') as f:
                self.docs.append(f.read().strip())
        print(f"Loaded {len(self.docs)} documents")
        if self.docs:
            print(f"Sample document: {self.docs[0][:100]}...")

    def preprocess_documents(self) -> None:
        """Turn raw strings in ``self.docs`` into cleaned token lists.

        Each document is tokenized on ``\\w+``, lowercased, stripped of purely
        numeric tokens, single-character tokens and English stop words, and
        then lemmatized. Entries that are not strings are dropped.
        """
        stop_words = set(stopwords.words('english'))
        processed = []
        for doc in self.docs:
            if not isinstance(doc, str):
                continue
            lowered = (token.lower() for token in self.tokenizer.tokenize(doc))
            processed.append([
                self.lemmatizer.lemmatize(token)
                for token in lowered
                if not token.isnumeric() and len(token) > 1 and token not in stop_words
            ])
        self.docs = processed
        print(f"After preprocessing: {len(self.docs)} documents")
        if self.docs:
            print(f"Sample preprocessed document: {self.docs[0][:10]}...")

    def add_bigrams(self) -> None:
        """Append detected bigram tokens (``word_word``) to each document.

        Unigrams are kept; bigrams are added in addition to them.
        """
        bigram = Phrases(self.docs, min_count=5, threshold=100)
        for doc in self.docs:
            # Materialise first so the document is not extended while the
            # phrase model is still reading it.
            joined_tokens = [token for token in bigram[doc] if '_' in token]
            doc.extend(joined_tokens)
        if self.docs:
            print(f"Added bigrams. Sample document: {self.docs[0][:15]}...")

    def filter_with_tfidf(self, top_n=200, bottom_n=50) -> None:
        """Remove the terms with the highest and lowest mean TF-IDF score.

        The result is stored in ``self.filtered_docs``; ``self.docs`` is left
        untouched.

        Args:
            top_n: Number of highest-scoring terms to remove (<= 0 removes none).
            bottom_n: Number of lowest-scoring terms to remove (<= 0 removes none).
        """
        if not self.docs:
            # Nothing to vectorize; later stages report the empty corpus.
            self.filtered_docs = []
            return

        texts = [' '.join(doc) for doc in self.docs]
        tfidf_vectorizer = TfidfVectorizer()
        tfidf_matrix = tfidf_vectorizer.fit_transform(texts)
        feature_names = tfidf_vectorizer.get_feature_names_out()
        mean_tfidf_scores = np.asarray(tfidf_matrix.mean(axis=0)).flatten()

        removal_indices = self._extreme_indices(mean_tfidf_scores, top_n, bottom_n)
        words_to_remove = {feature_names[i] for i in removal_indices}

        self.filtered_docs = [[word for word in doc if word not in words_to_remove] for doc in self.docs]
        print(f"Filtered documents using TF-IDF. Sample filtered document: {self.filtered_docs[0][:10]}...")

    def create_dictionary_and_corpus(self) -> None:
        """Build the gensim dictionary and bag-of-words corpus.

        Tokens appearing in fewer than 2 documents or in more than 90% of
        documents are dropped from the dictionary.
        """
        self.dictionary = Dictionary(self.filtered_docs)
        original_size = len(self.dictionary)
        self.dictionary.filter_extremes(no_below=2, no_above=0.9)
        self.corpus = [self.dictionary.doc2bow(doc) for doc in self.filtered_docs]
        print(f"Dictionary size: {len(self.dictionary)} (reduced from {original_size})")
        print(f"Corpus size: {len(self.corpus)}")
        if self.corpus:
            print(f"Sample corpus entry: {self.corpus[0][:10]}")

    def train_lda_model(self, num_topics=6, chunksize=2000, passes=20, iterations=400, alpha='auto', eta='auto'):
        """Train an ``LdaModel`` on the current corpus and store it in ``self.model``.

        Args:
            num_topics: Number of latent topics.
            chunksize: Documents per training chunk.
            passes: Passes over the whole corpus.
            iterations: Maximum inference iterations per document.
            alpha: Document-topic prior, forwarded to ``LdaModel``.
            eta: Topic-word prior, forwarded to ``LdaModel``.

        Raises:
            ValueError: If the corpus or dictionary is missing or empty.
        """
        if not self.corpus or not self.dictionary:
            raise ValueError("Corpus or dictionary is empty. Check your preprocessing steps.")

        print(f"Training LDA model with num_topics={num_topics}, chunksize={chunksize}, passes={passes}, iterations={iterations}, alpha={alpha}, eta={eta}")

        self.model = LdaModel(
            corpus=self.corpus,
            id2word=self.dictionary,
            chunksize=chunksize,
            alpha=alpha,
            eta=eta,
            iterations=iterations,
            num_topics=num_topics,
            passes=passes,
            eval_every=None,
            random_state=42
        )
        print("LDA model trained successfully")

    def print_model_info(self):
        """Print summary statistics for the trained model.

        Returns:
            The ``c_v`` coherence score computed over ``self.filtered_docs``.
        """
        print('Number of unique tokens: %d' % len(self.dictionary))
        print('Number of documents: %d' % len(self.corpus))
        top_topics = self.model.top_topics(self.corpus)
        avg_topic_coherence = sum(t[1] for t in top_topics) / self.model.num_topics
        print('Average topic coherence: %.4f.' % avg_topic_coherence)
        print("\nTop topics:")
        pprint(self.model.print_topics())
        print('\nPerplexity: ', self.model.log_perplexity(self.corpus))
        coherence_model_lda = CoherenceModel(model=self.model, texts=self.filtered_docs, dictionary=self.dictionary, coherence='c_v')
        coherence_lda = coherence_model_lda.get_coherence()
        print('\nCoherence Score: ', coherence_lda)
        return coherence_lda

    def visualize_topics(self):
        """Return the pyLDAvis prepared data for the trained model."""
        return pyLDAvis.gensim.prepare(self.model, self.corpus, self.dictionary)

    def analyze_documents(self) -> None:
        """Print the topic mix and dominant topic of the first five documents."""
        for i, doc in enumerate(self.filtered_docs[:5]):
            bow = self.dictionary.doc2bow(doc)
            doc_topics = self.model.get_document_topics(bow)
            print(f"\nDocument {i} topics:")
            pprint(doc_topics)
            if not doc_topics:
                # No topic cleared the model's probability cut-off, so there
                # is no dominant topic to report for this document.
                print("No topic assigned to this document.")
                print()
                continue
            best_topic = max(doc_topics, key=lambda x: x[1])
            print(f"Best topic: {best_topic[0]}")
            print("Top words in this topic:")
            pprint(self.model.show_topic(best_topic[0]))
            # NOTE: the text shown is the filtered token list, not the raw file.
            print(f"Original document: {' '.join(doc[:30])}...")
            print()

    def save_model(self, file_path) -> None:
        """Persist the trained model to *file_path*.

        Raises:
            ValueError: If no model has been trained or loaded.
        """
        if self.model is None:
            raise ValueError("No model has been trained yet.")
        self._ensure_parent_dir(file_path)
        self.model.save(file_path)
        print(f"Model saved to {file_path}")

    def load_model(self, file_path) -> None:
        """Load a previously saved model from *file_path*.

        Raises:
            FileNotFoundError: If *file_path* does not exist.
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"No model file found at {file_path}")
        self.model = LdaModel.load(file_path)
        print(f"Model loaded from {file_path}")

    def save_dictionary_and_corpus(self, dict_path, corpus_path) -> None:
        """Persist the dictionary (gensim format) and the corpus (pickle).

        Raises:
            ValueError: If the dictionary or corpus has not been created.
        """
        if self.dictionary is None or self.corpus is None:
            raise ValueError("Dictionary and corpus have not been created yet.")
        self._ensure_parent_dir(dict_path)
        self._ensure_parent_dir(corpus_path)
        self.dictionary.save(dict_path)
        with open(corpus_path, 'wb') as f:
            pickle.dump(self.corpus, f)
        print(f"Dictionary saved to {dict_path}")
        print(f"Corpus saved to {corpus_path}")

    def load_dictionary_and_corpus(self, dict_path, corpus_path) -> None:
        """Load the dictionary and corpus written by ``save_dictionary_and_corpus``.

        Raises:
            FileNotFoundError: If either file is missing.
        """
        if not os.path.exists(dict_path) or not os.path.exists(corpus_path):
            raise FileNotFoundError("Dictionary or corpus file not found.")
        self.dictionary = Dictionary.load(dict_path)
        # SECURITY: pickle executes arbitrary code on load; only open corpus
        # files that this pipeline produced itself.
        with open(corpus_path, 'rb') as f:
            self.corpus = pickle.load(f)
        print(f"Dictionary loaded from {dict_path}")
        print(f"Corpus loaded from {corpus_path}")

    def plot_coherence_scores(self, iteration) -> None:
        """Plot recorded coherence scores and save them as a PNG.

        The image is written to ``coherence_scores_iteration_<iteration>.png``
        in the current working directory. Nothing is written when no scores
        have been recorded yet.

        Args:
            iteration: Value used in the output filename.
        """
        if not self.coherence_scores:
            print("No coherence scores recorded yet; nothing to plot.")
            return

        num_topics = [x[0] for x in self.coherence_scores]
        coherences = [x[1] for x in self.coherence_scores]

        plt.figure(figsize=(10, 6))
        plt.plot(num_topics, coherences, marker='o')
        plt.xlabel('Number of Topics')
        plt.ylabel('Coherence Score')
        plt.title('Coherence Score by Number of Topics')
        plt.grid(True)

        plt.xticks(num_topics)
        upper = max(coherences) * 1.1
        if upper > 0:
            # Anchoring at zero only makes sense for positive scores; a
            # non-positive upper bound would invert or collapse the axis.
            plt.ylim(0, upper)

        plt.savefig(f'coherence_scores_iteration_{iteration}.png')
        plt.close()

    def get_word_frequencies(self) -> dict:
        """Return each dictionary word's topic-word probability averaged over topics.

        Returns:
            Mapping of word to its mean probability across all topics.

        Raises:
            ValueError: If no model has been trained or loaded.
        """
        if self.model is None:
            raise ValueError("No model has been trained yet.")
        topics = self.model.get_topics()
        word_frequencies = np.sum(topics, axis=0)
        word_frequencies /= self.model.num_topics
        return {self.dictionary[i]: freq for i, freq in enumerate(word_frequencies)}

    def get_low_frequency_words(self, threshold=0.01) -> dict:
        """Return words whose averaged frequency is strictly below *threshold*."""
        word_frequencies = self.get_word_frequencies()
        return {word: freq for word, freq in word_frequencies.items() if freq < threshold}

    def get_high_frequency_words(self, threshold=0.00016) -> dict:
        """Return words whose averaged frequency is strictly above *threshold*.

        A message is printed when no word qualifies.
        """
        word_frequencies = self.get_word_frequencies()
        high_freq_words = {word: freq for word, freq in word_frequencies.items() if freq > threshold}

        if not high_freq_words:
            print(f"No words found with a frequency higher than the threshold of {threshold}.")

        return high_freq_words

    def save_frequency_words(self, low_freq_file, high_freq_file, low_threshold=0.01, high_threshold=0.1) -> None:
        """Write low- and high-frequency words to two CSV files.

        Each CSV has ``Word`` and ``Frequency`` columns; low-frequency rows are
        sorted ascending, high-frequency rows descending. Missing parent
        directories are created.

        Note that the default ``high_threshold`` here (0.1) differs from the
        default of ``get_high_frequency_words`` (0.00016).

        Args:
            low_freq_file: Output path for the low-frequency CSV.
            high_freq_file: Output path for the high-frequency CSV.
            low_threshold: Upper bound (exclusive) for low-frequency words.
            high_threshold: Lower bound (exclusive) for high-frequency words.
        """
        low_freq_words = self.get_low_frequency_words(low_threshold)
        high_freq_words = self.get_high_frequency_words(high_threshold)

        self._ensure_parent_dir(low_freq_file)
        df_low = pd.DataFrame(list(low_freq_words.items()), columns=['Word', 'Frequency'])
        df_low = df_low.sort_values('Frequency', ascending=True)
        df_low.to_csv(low_freq_file, index=False)
        print(f"Low frequency words saved to {low_freq_file}")

        self._ensure_parent_dir(high_freq_file)
        df_high = pd.DataFrame(list(high_freq_words.items()), columns=['Word', 'Frequency'])
        df_high = df_high.sort_values('Frequency', ascending=False)
        df_high.to_csv(high_freq_file, index=False)
        print(f"High frequency words saved to {high_freq_file}")

    def run_with_params(self, num_topics, chunksize, passes, iterations):
        """Run the full pipeline once for one hyper-parameter combination.

        Artifacts (model, dictionary, corpus, pyLDAvis HTML) are written under
        ``lda_models/topics_<n>_passes_<p>_iterations_<i>``; the coherence plot
        is written to the current working directory.

        Args:
            num_topics: Number of topics to train.
            chunksize: Documents per training chunk.
            passes: Passes over the corpus.
            iterations: Maximum inference iterations per document.

        Returns:
            The pyLDAvis prepared visualisation object.

        Raises:
            Exception: Any error from a pipeline stage is printed and re-raised.
        """
        try:
            self.load_documents()
            self.preprocess_documents()
            self.add_bigrams()
            self.filter_with_tfidf(top_n=200, bottom_n=50)
            self.create_dictionary_and_corpus()
            self.train_lda_model(num_topics=num_topics, chunksize=chunksize, passes=passes, iterations=iterations)
            model_dir = f"lda_models/topics_{num_topics}_passes_{passes}_iterations_{iterations}"
            self.save_model(os.path.join(model_dir, 'trained_model'))
            self.save_dictionary_and_corpus(os.path.join(model_dir, 'dictionary'), os.path.join(model_dir, 'corpus'))
            coherence_lda = self.print_model_info()
            self.coherence_scores.append((num_topics, coherence_lda))

            # Plot and save coherence score after each run
            self.plot_coherence_scores(len(self.coherence_scores))

            vis = self.visualize_topics()
            pyLDAvis.save_html(vis, os.path.join(model_dir, 'lda_visualization.html'))
            self.analyze_documents()

            return vis
        except Exception as e:
            print(f"An error occurred: {str(e)}")
            raise

if __name__ == "__main__":
    # Hyper-parameter grid; each tuple is
    # (num_topics, chunksize, passes, iterations).
    param_combinations = [
        (10, 2000, 10, 200),
        (20, 2000, 10, 400),
        (30, 2000, 20, 200),
        (40, 2000, 20, 400),
        (50, 2000, 20, 400),
        (60, 2000, 20, 400),
    ]

    # Destination folder for the word-frequency CSV exports.
    csv_folder = 'csv_output'
    os.makedirs(csv_folder, exist_ok=True)

    # Documents are read from the local 'txt' folder.
    modeler = TopicModeler('txt')

    # Train, persist and report one model per parameter combination.
    for num_topics, chunksize, passes, iterations in param_combinations:
        print(f"\nRunning model with parameters: num_topics={num_topics}, chunksize={chunksize}, passes={passes}, iterations={iterations}")
        modeler.run_with_params(num_topics, chunksize, passes, iterations)

    # The export reflects the model from the final loop iteration, since each
    # run replaces the modeler's current model.
    low_freq_csv = os.path.join(csv_folder, 'low_freq_words.csv')
    high_freq_csv = os.path.join(csv_folder, 'high_freq_words.csv')
    modeler.save_frequency_words(
        low_freq_csv,
        high_freq_csv,
        low_threshold=0.01,
        high_threshold=0.00016,
    )


Running model with parameters: num_topics=10, chunksize=2000, passes=10, iterations=200
Loaded 440 documents
Sample document: Pricing for interconnection related to the provision of number portability, as referred to in Articl...


2024-08-06 18:00:12,173 : INFO : collecting all words and their counts
2024-08-06 18:00:12,173 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types
2024-08-06 18:00:12,196 : INFO : collected 23445 token types (unigram + bigrams) from a corpus of 44718 words and 440 sentences
2024-08-06 18:00:12,196 : INFO : merged Phrases<23445 vocab, min_count=5, threshold=100, max_vocab_size=40000000>
2024-08-06 18:00:12,197 : INFO : Phrases lifecycle event {'msg': 'built Phrases<23445 vocab, min_count=5, threshold=100, max_vocab_size=40000000> in 0.02s', 'datetime': '2024-08-06T18:00:12.197228', 'gensim': '4.3.2', 'python': '3.11.9 (main, Jul 16 2024, 11:07:54) [Clang 14.0.0 (clang-1400.0.29.202)]', 'platform': 'macOS-12.5-arm64-arm-64bit', 'event': 'created'}
2024-08-06 18:00:12,352 : INFO : adding document #0 to Dictionary<0 unique tokens: []>
2024-08-06 18:00:12,361 : INFO : built Dictionary<3416 unique tokens: ['abstract', 'adoption', 'advance', 'appeal', 'body']...> from 440 do

After preprocessing: 440 documents
Sample preprocessed document: ['pricing', 'interconnection', 'related', 'provision', 'number', 'portability', 'referred', 'article', 'directive', 'ec']...
Added bigrams. Sample document: ['pricing', 'interconnection', 'related', 'provision', 'number', 'portability', 'referred', 'article', 'directive', 'ec', 'european', 'parliament', 'council', 'march', 'universal']...
Filtered documents using TF-IDF. Sample filtered document: ['pricing', 'interconnection', 'related', 'number', 'portability', 'concern', 'traffic', 'number', 'ported', 'set']...
Dictionary size: 1835 (reduced from 3416)
Corpus size: 440
Sample corpus entry: [(0, 1), (1, 1), (2, 3), (3, 2), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1)]
Training LDA model with num_topics=10, chunksize=2000, passes=10, iterations=200, alpha=auto, eta=auto


2024-08-06 18:00:12,546 : INFO : optimized alpha [0.07259606, 0.071897574, 0.07156579, 0.07379289, 0.073421955, 0.06923073, 0.07034618, 0.0695245, 0.070727736, 0.07583554]
2024-08-06 18:00:12,548 : INFO : topic #5 (0.069): 0.015*"currency" + 0.012*"foreign" + 0.007*"exchange" + 0.007*"provide" + 0.007*"kingdom_spain" + 0.007*"spain" + 0.006*"foreign_currency" + 0.005*"financial" + 0.005*"year" + 0.005*"limitation"
2024-08-06 18:00:12,548 : INFO : topic #7 (0.070): 0.009*"guarantee" + 0.007*"charge" + 0.006*"examine" + 0.005*"mortgage" + 0.005*"year" + 0.005*"device" + 0.005*"event" + 0.005*"unfairness" + 0.004*"associated" + 0.004*"required"
2024-08-06 18:00:12,549 : INFO : topic #4 (0.073): 0.005*"mean" + 0.005*"damage" + 0.005*"ingredient" + 0.005*"marketed" + 0.005*"currency" + 0.005*"fact" + 0.005*"germany" + 0.005*"natural" + 0.005*"conformity" + 0.004*"list"
2024-08-06 18:00:12,549 : INFO : topic #3 (0.074): 0.011*"le" + 0.010*"et" + 0.008*"la" + 0.007*"en" + 0.007*"reference" + 

LDA model trained successfully
Model saved to lda_models/topics_10_passes_10_iterations_200/trained_model
Dictionary saved to lda_models/topics_10_passes_10_iterations_200/dictionary
Corpus saved to lda_models/topics_10_passes_10_iterations_200/corpus
Number of unique tokens: 1835
Number of documents: 440
Average topic coherence: -10.2713.

Top topics:
[(0,
  '0.008*"call" + 0.007*"emergency" + 0.007*"advertiser" + 0.006*"comparative" '
  '+ 0.006*"3a" + 0.006*"number" + 0.006*"chain" + 0.006*"property" + '
  '0.006*"misleading" + 0.005*"advertisement"'),
 (1,
  '0.012*"producer" + 0.008*"television" + 0.008*"grand" + '
  '0.007*"registration" + 0.007*"grand_duchy" + 0.007*"duchy" + '
  '0.007*"luxembourg" + 0.007*"protected_designation" + 0.007*"fact" + '
  '0.007*"used"'),
 (2,
  '0.008*"framework" + 0.007*"ensure" + 0.007*"conjunction" + 0.007*"annex" + '
  '0.007*"ass" + 0.007*"january" + 0.006*"water" + 0.006*"public" + '
  '0.006*"creditor" + 0.006*"procedure"'),
 (3,
  '0.012*"l

2024-08-06 18:00:15,294 : INFO : 9 accumulators retrieved from output queue
2024-08-06 18:00:15,316 : INFO : accumulated word occurrence stats for 1774 virtual documents



Coherence Score:  0.37098735710118286


2024-08-06 18:00:16,838 : INFO : collecting all words and their counts
2024-08-06 18:00:16,839 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types
2024-08-06 18:00:16,858 : INFO : collected 23445 token types (unigram + bigrams) from a corpus of 44718 words and 440 sentences
2024-08-06 18:00:16,859 : INFO : merged Phrases<23445 vocab, min_count=5, threshold=100, max_vocab_size=40000000>
2024-08-06 18:00:16,859 : INFO : Phrases lifecycle event {'msg': 'built Phrases<23445 vocab, min_count=5, threshold=100, max_vocab_size=40000000> in 0.02s', 'datetime': '2024-08-06T18:00:16.859403', 'gensim': '4.3.2', 'python': '3.11.9 (main, Jul 16 2024, 11:07:54) [Clang 14.0.0 (clang-1400.0.29.202)]', 'platform': 'macOS-12.5-arm64-arm-64bit', 'event': 'created'}
2024-08-06 18:00:16,913 : INFO : adding document #0 to Dictionary<0 unique tokens: []>
2024-08-06 18:00:16,922 : INFO : built Dictionary<3416 unique tokens: ['abstract', 'adoption', 'advance', 'appeal', 'body']...> from 440 do


Document 0 topics:
[(8, 0.9939143)]
Best topic: 8
Top words in this topic:
[('provider', 0.01037252),
 ('directory', 0.009033843),
 ('data', 0.00893654),
 ('property', 0.0077048033),
 ('number', 0.0076728943),
 ('regulatory', 0.0072216517),
 ('framework', 0.0066446466),
 ('subscriber', 0.0066177933),
 ('determine', 0.0060400628),
 ('hellenic_republic', 0.0060101715)]
Original document: pricing interconnection related number portability concern traffic number ported set incurred mobile telephone operator implement request number porting adoption laying method used calculating fix advance abstract model maximum charged donor...


Document 1 topics:
[(9, 0.9860133)]
Best topic: 9
Top words in this topic:
[('conjunction', 0.008863642),
 ('read_conjunction', 0.008858949),
 ('vehicle', 0.0070285834),
 ('device', 0.00699981),
 ('substance', 0.0068654343),
 ('required', 0.006795449),
 ('laying', 0.0063763787),
 ('society', 0.0061538955),
 ('information_society', 0.0061527737),
 ('electronic_c

2024-08-06 18:00:16,931 : INFO : using autotuned alpha, starting with [0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05]
2024-08-06 18:00:16,932 : INFO : using serial LDA version on this node
2024-08-06 18:00:16,934 : INFO : running online (multi-pass) LDA training, 20 topics, 10 passes over the supplied corpus of 440 documents, updating model once every 440 documents, evaluating perplexity every 0 documents, iterating 400x with a convergence threshold of 0.001000
2024-08-06 18:00:16,934 : INFO : PROGRESS: pass 0, at document #440/440
2024-08-06 18:00:17,117 : INFO : optimized alpha [0.04268983, 0.043067373, 0.042179897, 0.04309466, 0.04332435, 0.0435642, 0.042610034, 0.042235922, 0.04293841, 0.04337061, 0.042375095, 0.041859925, 0.044037703, 0.041844063, 0.041711316, 0.04465974, 0.044842917, 0.042253695, 0.041696712, 0.042087767]
2024-08-06 18:00:17,119 : INFO : topic #14 (0.042): 0.009*"misleading" + 0.008*"propert

LDA model trained successfully
Model saved to lda_models/topics_20_passes_10_iterations_400/trained_model
Dictionary saved to lda_models/topics_20_passes_10_iterations_400/dictionary
Corpus saved to lda_models/topics_20_passes_10_iterations_400/corpus
Number of unique tokens: 1835
Number of documents: 440
Average topic coherence: -10.2272.

Top topics:
[(0,
  '0.022*"call" + 0.019*"number" + 0.018*"emergency" + 0.012*"telephone" + '
  '0.011*"caller" + 0.011*"single" + 0.011*"available" + 0.010*"make" + '
  '0.009*"location" + 0.009*"emergency_call"'),
 (1,
  '0.013*"television" + 0.011*"comparative" + 0.009*"channel" + '
  '0.008*"concept" + 0.008*"paid" + 0.007*"television_programme" + '
  '0.007*"programme" + 0.007*"lender" + 0.007*"fact" + '
  '0.007*"protected_designation"'),
 (2,
  '0.016*"taking" + 0.015*"taking_account" + 0.013*"framework" + '
  '0.012*"registration" + 0.010*"designation_origin" + 0.009*"taken" + '
  '0.009*"january" + 0.009*"domestic" + 0.009*"ass" + 0.008*"cr

2024-08-06 18:00:19,374 : INFO : 9 accumulators retrieved from output queue
2024-08-06 18:00:19,449 : INFO : accumulated word occurrence stats for 1774 virtual documents



Coherence Score:  0.4194202993059162


2024-08-06 18:00:20,425 : INFO : collecting all words and their counts
2024-08-06 18:00:20,426 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types
2024-08-06 18:00:20,446 : INFO : collected 23445 token types (unigram + bigrams) from a corpus of 44718 words and 440 sentences
2024-08-06 18:00:20,447 : INFO : merged Phrases<23445 vocab, min_count=5, threshold=100, max_vocab_size=40000000>
2024-08-06 18:00:20,447 : INFO : Phrases lifecycle event {'msg': 'built Phrases<23445 vocab, min_count=5, threshold=100, max_vocab_size=40000000> in 0.02s', 'datetime': '2024-08-06T18:00:20.447430', 'gensim': '4.3.2', 'python': '3.11.9 (main, Jul 16 2024, 11:07:54) [Clang 14.0.0 (clang-1400.0.29.202)]', 'platform': 'macOS-12.5-arm64-arm-64bit', 'event': 'created'}
2024-08-06 18:00:20,505 : INFO : adding document #0 to Dictionary<0 unique tokens: []>
2024-08-06 18:00:20,514 : INFO : built Dictionary<3416 unique tokens: ['abstract', 'adoption', 'advance', 'appeal', 'body']...> from 440 do


Document 0 topics:
[(8, 0.99068063)]
Best topic: 8
Top words in this topic:
[('provider', 0.020539593),
 ('directory', 0.014955666),
 ('data', 0.013219105),
 ('subscriber', 0.010091525),
 ('property', 0.010086545),
 ('pre', 0.009465895),
 ('operator', 0.008564912),
 ('making', 0.008430571),
 ('telephone', 0.008194654),
 ('regulatory', 0.007323974)]
Original document: pricing interconnection related number portability concern traffic number ported set incurred mobile telephone operator implement request number porting adoption laying method used calculating fix advance abstract model maximum charged donor...


Document 1 topics:
[(3, 0.97846067)]
Best topic: 3
Top words in this topic:
[('le', 0.023002684),
 ('et', 0.020615796),
 ('la', 0.014840674),
 ('en', 0.014018476),
 ('mortgage', 0.013165824),
 ('assurance', 0.011551846),
 ('nature', 0.009156242),
 ('reference', 0.008915772),
 ('characteristic', 0.008470905),
 ('que', 0.008263032)]
Original document: natural becomes scheme impleme

2024-08-06 18:00:20,514 : INFO : Dictionary lifecycle event {'msg': "built Dictionary<3416 unique tokens: ['abstract', 'adoption', 'advance', 'appeal', 'body']...> from 440 documents (total 19243 corpus positions)", 'datetime': '2024-08-06T18:00:20.514605', 'gensim': '4.3.2', 'python': '3.11.9 (main, Jul 16 2024, 11:07:54) [Clang 14.0.0 (clang-1400.0.29.202)]', 'platform': 'macOS-12.5-arm64-arm-64bit', 'event': 'created'}
2024-08-06 18:00:20,515 : INFO : discarding 1581 tokens: [('abstract', 1), ('confidential', 1), ('confidentiality', 1), ('dissuaded', 1), ('donor', 1), ('ported', 1), ('porting', 1), ('reaching', 1), ('becomes', 1), ('referral', 1)]...
2024-08-06 18:00:20,516 : INFO : keeping 1835 tokens which were in no less than 2 and no more than 396 (=90.0%) documents
2024-08-06 18:00:20,517 : INFO : resulting dictionary: Dictionary<1835 unique tokens: ['adoption', 'advance', 'appeal', 'body', 'calculating']...>
2024-08-06 18:00:20,523 : INFO : using autotuned alpha, starting with

Dictionary size: 1835 (reduced from 3416)
Corpus size: 440
Sample corpus entry: [(0, 1), (1, 1), (2, 3), (3, 2), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1)]
Training LDA model with num_topics=30, chunksize=2000, passes=20, iterations=200, alpha=auto, eta=auto


2024-08-06 18:00:20,744 : INFO : optimized alpha [0.029575475, 0.028505024, 0.028430996, 0.02836432, 0.02922635, 0.028145883, 0.028201472, 0.027520724, 0.02865073, 0.029611778, 0.028334647, 0.02798051, 0.029205281, 0.028757432, 0.028504597, 0.028950289, 0.028913341, 0.028940096, 0.029624186, 0.028596211, 0.028186448, 0.0282866, 0.029578691, 0.030141218, 0.028447092, 0.02817058, 0.029302282, 0.029109, 0.02936383, 0.027907657]
2024-08-06 18:00:20,747 : INFO : topic #7 (0.028): 0.023*"charge" + 0.020*"authorisation" + 0.020*"licence" + 0.016*"equipment" + 0.013*"subscription" + 0.013*"mobile" + 0.013*"radio" + 0.010*"framework" + 0.010*"contains" + 0.010*"contact"
2024-08-06 18:00:20,747 : INFO : topic #29 (0.028): 0.018*"property" + 0.012*"data" + 0.009*"immovable_property" + 0.009*"present" + 0.009*"electronic_commerce" + 0.009*"processing" + 0.009*"intended" + 0.009*"determine" + 0.009*"course" + 0.009*"damage"
2024-08-06 18:00:20,747 : INFO : topic #22 (0.030): 0.020*"kingdom_spain" +

LDA model trained successfully
Model saved to lda_models/topics_30_passes_20_iterations_200/trained_model
Dictionary saved to lda_models/topics_30_passes_20_iterations_200/dictionary
Corpus saved to lda_models/topics_30_passes_20_iterations_200/corpus
Number of unique tokens: 1835
Number of documents: 440
Average topic coherence: -10.4319.

Top topics:
[(7,
  '0.024*"charge" + 0.020*"licence" + 0.020*"authorisation" + '
  '0.017*"equipment" + 0.014*"subscription" + 0.014*"mobile" + 0.014*"radio" + '
  '0.010*"paid" + 0.010*"framework" + 0.010*"government"'),
 (29,
  '0.018*"property" + 0.012*"data" + 0.009*"course" + 0.009*"processing" + '
  '0.009*"electronic_commerce" + 0.009*"commerce" + 0.009*"immovable_property" '
  '+ 0.009*"damage" + 0.009*"intended" + 0.009*"determine"'),
 (25,
  '0.018*"energy" + 0.013*"provides" + 0.012*"objection" + 0.012*"building" + '
  '0.012*"creditor" + 0.012*"medium" + 0.009*"apartment" + 0.009*"request" + '
  '0.009*"failure" + 0.009*"ass"'),
 (11,
  

2024-08-06 18:00:23,537 : INFO : 9 accumulators retrieved from output queue
2024-08-06 18:00:23,609 : INFO : accumulated word occurrence stats for 1774 virtual documents



Coherence Score:  0.39563301127189926


2024-08-06 18:00:24,421 : INFO : collecting all words and their counts
2024-08-06 18:00:24,422 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types
2024-08-06 18:00:24,443 : INFO : collected 23445 token types (unigram + bigrams) from a corpus of 44718 words and 440 sentences
2024-08-06 18:00:24,443 : INFO : merged Phrases<23445 vocab, min_count=5, threshold=100, max_vocab_size=40000000>
2024-08-06 18:00:24,443 : INFO : Phrases lifecycle event {'msg': 'built Phrases<23445 vocab, min_count=5, threshold=100, max_vocab_size=40000000> in 0.02s', 'datetime': '2024-08-06T18:00:24.443680', 'gensim': '4.3.2', 'python': '3.11.9 (main, Jul 16 2024, 11:07:54) [Clang 14.0.0 (clang-1400.0.29.202)]', 'platform': 'macOS-12.5-arm64-arm-64bit', 'event': 'created'}
2024-08-06 18:00:24,498 : INFO : adding document #0 to Dictionary<0 unique tokens: []>
2024-08-06 18:00:24,507 : INFO : built Dictionary<3416 unique tokens: ['abstract', 'adoption', 'advance', 'appeal', 'body']...> from 440 do


Document 0 topics:
[(6, 0.99054193)]
Best topic: 6
Top words in this topic:
[('netherlands', 0.01858591),
 ('misleading', 0.014588516),
 ('appeal', 0.012410392),
 ('guarantee', 0.011422765),
 ('number', 0.009888056),
 ('investment', 0.00935293),
 ('spain', 0.009351912),
 ('agricultural', 0.009351911),
 ('premise', 0.009351807),
 ('away', 0.009351785)]
Original document: pricing interconnection related number portability concern traffic number ported set incurred mobile telephone operator implement request number porting adoption laying method used calculating fix advance abstract model maximum charged donor...


Document 1 topics:
[(24, 0.9781087)]
Best topic: 24
Top words in this topic:
[('production', 0.016320156),
 ('natural', 0.015349838),
 ('di', 0.013507826),
 ('natural_person', 0.013167158),
 ('charge', 0.010985823),
 ('building', 0.010985823),
 ('activity', 0.010984957),
 ('austria', 0.010958249),
 ('relied', 0.010955505),
 ('region', 0.010877161)]
Original document: natural b

2024-08-06 18:00:24,517 : INFO : using autotuned alpha, starting with [0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.025]
2024-08-06 18:00:24,518 : INFO : using serial LDA version on this node
2024-08-06 18:00:24,522 : INFO : running online (multi-pass) LDA training, 40 topics, 20 passes over the supplied corpus of 440 documents, updating model once every 440 documents, evaluating perplexity every 0 documents, iterating 400x with a convergence threshold of 0.001000
2024-08-06 18:00:24,522 : INFO : PROGRESS: pass 0, at document #440/440
2024-08-06 18:00:24,703 : INFO : optimized alpha [0.02349356, 0.023303235, 0.023301981, 0.02324872, 0.023735337, 0.022992503, 0.022927295, 0.022742668, 0.023425113, 0.023486177, 0.02305684, 0.022992043, 0.023240326, 0.022863349, 

LDA model trained successfully
Model saved to lda_models/topics_40_passes_20_iterations_400/trained_model
Dictionary saved to lda_models/topics_40_passes_20_iterations_400/dictionary
Corpus saved to lda_models/topics_40_passes_20_iterations_400/corpus
Number of unique tokens: 1835
Number of documents: 440
Average topic coherence: -9.2060.

Top topics:
[(7,
  '0.028*"charge" + 0.028*"licence" + 0.024*"authorisation" + '
  '0.024*"equipment" + 0.019*"radio" + 0.019*"mobile" + 0.019*"subscription" + '
  '0.014*"paid" + 0.014*"government" + 0.014*"generic"'),
 (39,
  '0.018*"shop" + 0.016*"chain" + 0.016*"retail" + 0.014*"replacement" + '
  '0.014*"size" + 0.014*"conformity" + 0.012*"intelligible" + 0.012*"factual" '
  '+ 0.012*"instrument" + 0.009*"breach"'),
 (6,
  '0.020*"defect" + 0.016*"link" + 0.014*"damage" + 0.013*"supplied" + '
  '0.013*"hand" + 0.013*"according" + 0.013*"victim" + '
  '0.013*"july_approximation" + 0.013*"liability_defective" + '
  '0.013*"evidence"'),
 (25,
  '0.

2024-08-06 18:00:27,647 : INFO : 9 accumulators retrieved from output queue
2024-08-06 18:00:27,791 : INFO : accumulated word occurrence stats for 1774 virtual documents



Coherence Score:  0.45781310076842285

Document 0 topics:
[(8, 0.9893978)]
Best topic: 8
Top words in this topic:
[('property', 0.020966977),
 ('territory', 0.017985748),
 ('provider', 0.017983776),
 ('tax', 0.0149988895),
 ('rental', 0.0149988895),
 ('collection', 0.0149988895),
 ('intermediation', 0.0149988895),
 ('appeal', 0.014956511),
 ('maximum', 0.013815023),
 ('operator', 0.012192103)]
Original document: pricing interconnection related number portability concern traffic number ported set incurred mobile telephone operator implement request number porting adoption laying method used calculating fix advance abstract model maximum charged donor...


Document 1 topics:
[(9, 0.9755253)]
Best topic: 9
Top words in this topic:
[('consequence', 0.016474137),
 ('removal', 0.012382831),
 ('annulment', 0.012371701),
 ('finding', 0.010641414),
 ('found', 0.010609204),
 ('unfavourable', 0.010488412),
 ('taking', 0.009814256),
 ('ingredient', 0.009716375),
 ('take', 0.009501444),
 ('element

2024-08-06 18:00:29,225 : INFO : collecting all words and their counts
2024-08-06 18:00:29,225 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types
2024-08-06 18:00:29,253 : INFO : collected 23445 token types (unigram + bigrams) from a corpus of 44718 words and 440 sentences
2024-08-06 18:00:29,254 : INFO : merged Phrases<23445 vocab, min_count=5, threshold=100, max_vocab_size=40000000>
2024-08-06 18:00:29,254 : INFO : Phrases lifecycle event {'msg': 'built Phrases<23445 vocab, min_count=5, threshold=100, max_vocab_size=40000000> in 0.03s', 'datetime': '2024-08-06T18:00:29.254830', 'gensim': '4.3.2', 'python': '3.11.9 (main, Jul 16 2024, 11:07:54) [Clang 14.0.0 (clang-1400.0.29.202)]', 'platform': 'macOS-12.5-arm64-arm-64bit', 'event': 'created'}
2024-08-06 18:00:29,340 : INFO : adding document #0 to Dictionary<0 unique tokens: []>
2024-08-06 18:00:29,350 : INFO : built Dictionary<3416 unique tokens: ['abstract', 'adoption', 'advance', 'appeal', 'body']...> from 440 do

After preprocessing: 440 documents
Sample preprocessed document: ['pricing', 'interconnection', 'related', 'provision', 'number', 'portability', 'referred', 'article', 'directive', 'ec']...
Added bigrams. Sample document: ['pricing', 'interconnection', 'related', 'provision', 'number', 'portability', 'referred', 'article', 'directive', 'ec', 'european', 'parliament', 'council', 'march', 'universal']...
Filtered documents using TF-IDF. Sample filtered document: ['pricing', 'interconnection', 'related', 'number', 'portability', 'concern', 'traffic', 'number', 'ported', 'set']...
Dictionary size: 1835 (reduced from 3416)
Corpus size: 440
Sample corpus entry: [(0, 1), (1, 1), (2, 3), (3, 2), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1)]
Training LDA model with num_topics=50, chunksize=2000, passes=20, iterations=400, alpha=auto, eta=auto


2024-08-06 18:00:29,571 : INFO : optimized alpha [0.018944805, 0.018849317, 0.018939106, 0.0189471, 0.018892381, 0.01874829, 0.018892094, 0.01865193, 0.018797796, 0.018750213, 0.018650336, 0.018891875, 0.018942794, 0.01865032, 0.018602738, 0.019093635, 0.01889918, 0.01908968, 0.018893685, 0.018746495, 0.018747656, 0.018796807, 0.018891048, 0.018845234, 0.019041177, 0.018798603, 0.018897804, 0.018941784, 0.01909139, 0.018895622, 0.01909204, 0.018949687, 0.01894904, 0.01874521, 0.01894558, 0.018649394, 0.01913552, 0.018797265, 0.01874665, 0.018844029, 0.0187508, 0.019136578, 0.01899879, 0.0187478, 0.018896656, 0.01874477, 0.01884812, 0.018797081, 0.018699378, 0.018794477]
2024-08-06 18:00:29,575 : INFO : topic #14 (0.019): 0.026*"human" + 0.025*"medicinal" + 0.018*"classification" + 0.014*"taken" + 0.014*"ingredient" + 0.013*"damage" + 0.012*"consideration" + 0.011*"need" + 0.011*"safety" + 0.011*"supplement"
2024-08-06 18:00:29,576 : INFO : topic #13 (0.019): 0.019*"substance" + 0.017*"

LDA model trained successfully
Model saved to lda_models/topics_50_passes_20_iterations_400/trained_model
Dictionary saved to lda_models/topics_50_passes_20_iterations_400/dictionary
Corpus saved to lda_models/topics_50_passes_20_iterations_400/corpus
Number of unique tokens: 1835
Number of documents: 440
Average topic coherence: -9.6679.

Top topics:
[(13,
  '0.032*"substance" + 0.020*"intermediary" + 0.020*"arbitration" + '
  '0.013*"providing" + 0.013*"since" + 0.013*"natural" + 0.013*"acting" + '
  '0.013*"purchaser" + 0.013*"annulment" + 0.013*"award"'),
 (7,
  '0.025*"void" + 0.017*"according" + 0.016*"year" + 0.015*"though" + '
  '0.015*"even_though" + 0.012*"event" + 0.012*"conclusion" + '
  '0.012*"annulment" + 0.012*"supplementary" + 0.012*"performance"'),
 (47,
  '0.046*"trade_mark" + 0.023*"operator" + 0.019*"conformity" + '
  '0.017*"proprietor" + 0.017*"lack_conformity" + 0.017*"lack" + '
  '0.015*"online" + 0.011*"offer" + 0.011*"bearing" + 0.009*"latter"'),
 (19,
  '0.0

2024-08-06 18:00:32,489 : INFO : 9 accumulators retrieved from output queue
2024-08-06 18:00:32,634 : INFO : accumulated word occurrence stats for 1774 virtual documents



Coherence Score:  0.415372202021166


2024-08-06 18:00:33,923 : INFO : collecting all words and their counts
2024-08-06 18:00:33,924 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types
2024-08-06 18:00:33,944 : INFO : collected 23445 token types (unigram + bigrams) from a corpus of 44718 words and 440 sentences
2024-08-06 18:00:33,945 : INFO : merged Phrases<23445 vocab, min_count=5, threshold=100, max_vocab_size=40000000>
2024-08-06 18:00:33,945 : INFO : Phrases lifecycle event {'msg': 'built Phrases<23445 vocab, min_count=5, threshold=100, max_vocab_size=40000000> in 0.02s', 'datetime': '2024-08-06T18:00:33.945263', 'gensim': '4.3.2', 'python': '3.11.9 (main, Jul 16 2024, 11:07:54) [Clang 14.0.0 (clang-1400.0.29.202)]', 'platform': 'macOS-12.5-arm64-arm-64bit', 'event': 'created'}
2024-08-06 18:00:34,002 : INFO : adding document #0 to Dictionary<0 unique tokens: []>
2024-08-06 18:00:34,012 : INFO : built Dictionary<3416 unique tokens: ['abstract', 'adoption', 'advance', 'appeal', 'body']...> from 440 do


Document 0 topics:
[(28, 0.9885939)]
Best topic: 28
Top words in this topic:
[('appeal', 0.013519115),
 ('taken', 0.013306002),
 ('misleading', 0.011443252),
 ('purchase', 0.011166524),
 ('taking', 0.010150058),
 ('taking_account', 0.010150058),
 ('particularly', 0.010150057),
 ('consequence', 0.010150057),
 ('take', 0.010060665),
 ('set', 0.009889121)]
Original document: pricing interconnection related number portability concern traffic number ported set incurred mobile telephone operator implement request number porting adoption laying method used calculating fix advance abstract model maximum charged donor...


Document 1 topics:
[(24, 0.9736317)]
Best topic: 24
Top words in this topic:
[('reasonably', 0.014118347),
 ('registration', 0.014073485),
 ('evocation', 0.012107571),
 ('mandatory', 0.012107482),
 ('additional', 0.012107317),
 ('designation_origin', 0.012106311),
 ('protected_designation', 0.012094261),
 ('operator', 0.010096376),
 ('natural', 0.009436778),
 ('informed', 0.

2024-08-06 18:00:34,012 : INFO : Dictionary lifecycle event {'msg': "built Dictionary<3416 unique tokens: ['abstract', 'adoption', 'advance', 'appeal', 'body']...> from 440 documents (total 19243 corpus positions)", 'datetime': '2024-08-06T18:00:34.012768', 'gensim': '4.3.2', 'python': '3.11.9 (main, Jul 16 2024, 11:07:54) [Clang 14.0.0 (clang-1400.0.29.202)]', 'platform': 'macOS-12.5-arm64-arm-64bit', 'event': 'created'}
2024-08-06 18:00:34,014 : INFO : discarding 1581 tokens: [('abstract', 1), ('confidential', 1), ('confidentiality', 1), ('dissuaded', 1), ('donor', 1), ('ported', 1), ('porting', 1), ('reaching', 1), ('becomes', 1), ('referral', 1)]...
2024-08-06 18:00:34,014 : INFO : keeping 1835 tokens which were in no less than 2 and no more than 396 (=90.0%) documents
2024-08-06 18:00:34,015 : INFO : resulting dictionary: Dictionary<1835 unique tokens: ['adoption', 'advance', 'appeal', 'body', 'calculating']...>
2024-08-06 18:00:34,022 : INFO : using autotuned alpha, starting with

Dictionary size: 1835 (reduced from 3416)
Corpus size: 440
Sample corpus entry: [(0, 1), (1, 1), (2, 3), (3, 2), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1)]
Training LDA model with num_topics=60, chunksize=2000, passes=20, iterations=400, alpha=auto, eta=auto


2024-08-06 18:00:34,258 : INFO : optimized alpha [0.01572199, 0.015574523, 0.015293504, 0.015712138, 0.0158083, 0.015344188, 0.015269442, 0.015427192, 0.015315065, 0.015658578, 0.015416786, 0.015454848, 0.015815957, 0.0153155485, 0.015234684, 0.015476436, 0.015056683, 0.015493409, 0.015532221, 0.015396251, 0.015232749, 0.015315992, 0.015476498, 0.015676275, 0.015314964, 0.015568478, 0.015296631, 0.015637089, 0.015714444, 0.01549731, 0.01555155, 0.0154390475, 0.015534139, 0.0152602475, 0.015495106, 0.015057095, 0.01569443, 0.0155749265, 0.01529721, 0.015554561, 0.015314582, 0.015596207, 0.015675724, 0.015374306, 0.01569544, 0.015531749, 0.015669001, 0.015495119, 0.0153922355, 0.01565641, 0.015481092, 0.015612102, 0.015712969, 0.015299165, 0.015517615, 0.015515272, 0.015416058, 0.015235112, 0.015628088, 0.01535451]
2024-08-06 18:00:34,263 : INFO : topic #35 (0.015): 0.029*"significant" + 0.029*"least" + 0.029*"obtained" + 0.028*"novel" + 0.016*"country" + 0.016*"continued" + 0.015*"confi

LDA model trained successfully
Model saved to lda_models/topics_60_passes_20_iterations_400/trained_model
Dictionary saved to lda_models/topics_60_passes_20_iterations_400/dictionary
Corpus saved to lda_models/topics_60_passes_20_iterations_400/corpus
Number of unique tokens: 1835
Number of documents: 440
Average topic coherence: -9.7719.

Top topics:
[(16,
  '0.148*"ingredient" + 0.096*"list" + 0.096*"list_ingredient" + '
  '0.044*"vitamin" + 0.043*"used" + 0.028*"added" + 0.028*"include" + '
  '0.015*"formulation" + 0.015*"hungarian" + 0.015*"addition"'),
 (35,
  '0.054*"german_selected" + 0.054*"german" + 0.044*"least" + '
  '0.029*"significant" + 0.029*"obtained" + 0.029*"novel" + 0.016*"content" + '
  '0.015*"event" + 0.015*"country" + 0.015*"continued"'),
 (14,
  '0.028*"building" + 0.028*"element" + 0.023*"apartment" + '
  '0.023*"establishing" + 0.017*"agent" + 0.017*"area" + 0.017*"registered" + '
  '0.011*"co" + 0.011*"profession" + 0.011*"ownership"'),
 (57,
  '0.023*"yoghur

2024-08-06 18:00:37,412 : INFO : 9 accumulators retrieved from output queue
2024-08-06 18:00:37,564 : INFO : accumulated word occurrence stats for 1774 virtual documents



Coherence Score:  0.40525430666257733

Document 0 topics:
[(49, 0.9879781)]
Best topic: 49
Top words in this topic:
[('framework', 0.054647855),
 ('regulatory', 0.042722065),
 ('common_regulatory', 0.02592592),
 ('common', 0.024889586),
 ('access', 0.023246186),
 ('operator', 0.020700432),
 ('appeal', 0.014435746),
 ('conjunction', 0.013808917),
 ('telecommunication', 0.013297873),
 ('number', 0.012348304)]
Original document: pricing interconnection related number portability concern traffic number ported set incurred mobile telephone operator implement request number porting adoption laying method used calculating fix advance abstract model maximum charged donor...


Document 1 topics:
[(13, 0.9722075)]
Best topic: 13
Top words in this topic:
[('according', 0.022028448),
 ('year', 0.02176572),
 ('purchase', 0.01642598),
 ('annulment', 0.016346829),
 ('contained', 0.016346829),
 ('void', 0.016346829),
 ('natural', 0.016346827),
 ('natural_person', 0.016346827),
 ('gift', 0.016346825),