In [2]:
import os
import logging
import pickle
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from gensim.models import Phrases, LdaModel
from gensim.corpora import Dictionary
from gensim.models import CoherenceModel
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
from pprint import pprint
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pandas as pd

class TopicModeler:
    """LDA topic-modelling pipeline over a folder of ``.txt`` documents.

    The pipeline stages are exposed as separate methods and chained by
    :meth:`run_with_params`: load -> preprocess -> add bigrams -> TF-IDF
    filter -> dictionary/corpus -> train LDA -> report/plot/visualise.

    Attributes:
        input_path: Directory scanned for ``.txt`` files.
        docs: Raw document strings after loading; token lists after
            preprocessing.
        filtered_docs: Token lists after TF-IDF based word removal.
        dictionary: gensim ``Dictionary`` built from ``filtered_docs``.
        corpus: Bag-of-words corpus derived from ``dictionary``.
        model: The most recently trained or loaded ``LdaModel``.
        coherence_scores: ``(num_topics, c_v coherence)`` per completed run.
        perplexity_scores: ``(num_topics, log_perplexity value)`` per run.
    """

    def __init__(self, input_path):
        """Set up empty pipeline state and configure INFO-level logging.

        Args:
            input_path: Directory containing the ``.txt`` documents.
        """
        self.input_path = input_path
        self.docs = []
        self.filtered_docs = []
        self.dictionary = None
        self.corpus = None
        self.model = None
        self.coherence_scores = []
        self.perplexity_scores = []
        self.tokenizer = RegexpTokenizer(r'\w+')
        self.lemmatizer = WordNetLemmatizer()
        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    @staticmethod
    def _ensure_parent_dir(file_path):
        """Create the parent directory of *file_path* when it has one."""
        parent = os.path.dirname(file_path)
        # os.makedirs('') raises FileNotFoundError, so bare filenames
        # (which live in the current directory) must be skipped.
        if parent:
            os.makedirs(parent, exist_ok=True)

    def _require_model(self):
        """Raise ``ValueError`` if no model has been trained or loaded."""
        if self.model is None:
            raise ValueError("No model has been trained yet.")

    def load_documents(self):
        """Read every ``.txt`` file in ``input_path`` into ``self.docs``.

        Any previously held documents are discarded first, so calling this
        once per run does not accumulate duplicates or mix raw strings with
        already-tokenised documents. Files are read in sorted filename order
        so that document indices (and seeded training) are reproducible.
        """
        self.docs = []
        for filename in sorted(os.listdir(self.input_path)):
            if not filename.endswith('.txt'):
                continue
            file_path = os.path.join(self.input_path, filename)
            if not os.path.isfile(file_path):
                continue
            with open(file_path, 'r', encoding='utf-8') as f:
                self.docs.append(f.read().strip())
        print(f"Loaded {len(self.docs)} documents")
        if self.docs:
            print(f"Sample document: {self.docs[0][:100]}...")

    def preprocess_documents(self):
        """Tokenise, lowercase, filter and lemmatise the loaded documents.

        Tokens that are purely numeric, a single character long, or English
        stop words are dropped; the rest are lemmatised. Entries of
        ``self.docs`` that are not strings (e.g. already-tokenised
        documents) are skipped, so calling this twice without reloading
        leaves ``self.docs`` empty.
        """
        stop_words = set(stopwords.words('english'))
        processed = []
        for doc in self.docs:
            if not isinstance(doc, str):
                continue
            lowered = (token.lower() for token in self.tokenizer.tokenize(doc))
            processed.append([
                self.lemmatizer.lemmatize(token)
                for token in lowered
                if len(token) > 1 and not token.isnumeric() and token not in stop_words
            ])
        self.docs = processed
        print(f"After preprocessing: {len(self.docs)} documents")
        if self.docs:
            print(f"Sample preprocessed document: {self.docs[0][:10]}...")

    def add_bigrams(self):
        """Append detected bigram tokens (``word_word``) to each document.

        The unigrams are kept; bigrams found by gensim ``Phrases`` are added
        on top of them.
        """
        bigram = Phrases(self.docs, min_count=5, threshold=100)
        for doc in self.docs:
            # Materialise first so the document is never extended while the
            # phrase transformation is still reading from it.
            phrase_tokens = [token for token in bigram[doc] if '_' in token]
            doc.extend(phrase_tokens)
        if self.docs:
            print(f"Added bigrams. Sample document: {self.docs[0][:15]}...")

    def filter_with_tfidf(self, top_n=200, bottom_n=50):
        """Drop the words with the highest and lowest mean TF-IDF scores.

        Args:
            top_n: Number of highest-scoring words to remove (0 removes none).
            bottom_n: Number of lowest-scoring words to remove (0 removes
                none).

        The result is stored in ``self.filtered_docs``; ``self.docs`` is left
        unchanged.
        """
        texts = [' '.join(doc) for doc in self.docs]
        tfidf_vectorizer = TfidfVectorizer()
        tfidf_matrix = tfidf_vectorizer.fit_transform(texts)
        feature_names = tfidf_vectorizer.get_feature_names_out()
        mean_tfidf_scores = np.asarray(tfidf_matrix.mean(axis=0)).flatten()

        # Sort once (ascending) and slice both ends from the same ordering.
        order = mean_tfidf_scores.argsort()
        top_n = max(top_n, 0)
        bottom_n = max(bottom_n, 0)
        # order[-0:] would select *every* word, so compute the start index
        # explicitly instead of using a negative slice.
        top_indices = order[order.size - min(top_n, order.size):]
        bottom_indices = order[:bottom_n]

        words_to_remove = {feature_names[i] for i in np.concatenate([top_indices, bottom_indices])}

        self.filtered_docs = [[word for word in doc if word not in words_to_remove] for doc in self.docs]
        if self.filtered_docs:
            print(f"Filtered documents using TF-IDF. Sample filtered document: {self.filtered_docs[0][:10]}...")

    def create_dictionary_and_corpus(self):
        """Build the gensim dictionary and bag-of-words corpus.

        Words appearing in fewer than 2 documents or in more than 90% of the
        documents are removed from the dictionary before the corpus is built.
        """
        self.dictionary = Dictionary(self.filtered_docs)
        original_size = len(self.dictionary)
        self.dictionary.filter_extremes(no_below=2, no_above=0.9)
        self.corpus = [self.dictionary.doc2bow(doc) for doc in self.filtered_docs]
        print(f"Dictionary size: {len(self.dictionary)} (reduced from {original_size})")
        print(f"Corpus size: {len(self.corpus)}")
        if self.corpus:
            print(f"Sample corpus entry: {self.corpus[0][:10]}")

    def train_lda_model(self, num_topics=6, chunksize=2000, passes=20, iterations=400, alpha='auto', eta='auto'):
        """Train an ``LdaModel`` on the current corpus and store it.

        Args:
            num_topics: Number of latent topics.
            chunksize: Documents per training chunk.
            passes: Full passes over the corpus.
            iterations: Maximum inference iterations per document.
            alpha: Document-topic prior (``'auto'`` learns it).
            eta: Topic-word prior (``'auto'`` learns it).

        Raises:
            ValueError: If the corpus or dictionary is missing or empty.
        """
        if not self.corpus or not self.dictionary:
            raise ValueError("Corpus or dictionary is empty. Check your preprocessing steps.")

        print(f"Training LDA model with num_topics={num_topics}, chunksize={chunksize}, passes={passes}, iterations={iterations}, alpha={alpha}, eta={eta}")

        self.model = LdaModel(
            corpus=self.corpus,
            id2word=self.dictionary,
            chunksize=chunksize,
            alpha=alpha,
            eta=eta,
            iterations=iterations,
            num_topics=num_topics,
            passes=passes,
            eval_every=None,
            random_state=42
        )
        print("LDA model trained successfully")

    def print_model_info(self):
        """Print summary statistics of the trained model.

        Returns:
            tuple: ``(coherence, perplexity)`` where ``coherence`` is the
            ``c_v`` coherence of the model and ``perplexity`` is the value
            returned by ``LdaModel.log_perplexity`` on the training corpus
            (gensim's per-word likelihood bound, not the exponentiated
            perplexity).

        Raises:
            ValueError: If no model has been trained or loaded.
        """
        self._require_model()
        print('Number of unique tokens: %d' % len(self.dictionary))
        print('Number of documents: %d' % len(self.corpus))
        top_topics = self.model.top_topics(self.corpus)
        avg_topic_coherence = sum(t[1] for t in top_topics) / self.model.num_topics
        print('Average topic coherence: %.4f.' % avg_topic_coherence)
        print("\nTop topics:")
        pprint(self.model.print_topics())
        # Evaluate once: this is a full inference pass over the corpus.
        perplexity = self.model.log_perplexity(self.corpus)
        print('\nPerplexity: ', perplexity)
        coherence_model_lda = CoherenceModel(model=self.model, texts=self.filtered_docs, dictionary=self.dictionary, coherence='c_v')
        coherence_lda = coherence_model_lda.get_coherence()
        print('\nCoherence Score: ', coherence_lda)
        return coherence_lda, perplexity

    def visualize_topics(self):
        """Return the pyLDAvis prepared data for the current model.

        Raises:
            ValueError: If no model has been trained or loaded.
        """
        self._require_model()
        return pyLDAvis.gensim.prepare(self.model, self.corpus, self.dictionary)

    def analyze_documents(self):
        """Print the topic assignment of the first five filtered documents.

        Raises:
            ValueError: If no model has been trained or loaded.
        """
        self._require_model()
        for i, doc in enumerate(self.filtered_docs[:5]):
            bow = self.dictionary.doc2bow(doc)
            doc_topics = self.model.get_document_topics(bow)
            print(f"\nDocument {i} topics:")
            pprint(doc_topics)
            if not doc_topics:
                # gensim omits topics below its minimum probability, so the
                # list can be empty (e.g. for an empty document).
                print("No topic assigned to this document.")
                print()
                continue
            best_topic = max(doc_topics, key=lambda x: x[1])
            print(f"Best topic: {best_topic[0]}")
            print("Top words in this topic:")
            pprint(self.model.show_topic(best_topic[0]))
            print(f"Original document: {' '.join(doc[:30])}...")
            print()

    def save_model(self, file_path):
        """Save the trained model to *file_path*, creating parent folders.

        Raises:
            ValueError: If no model has been trained or loaded.
        """
        self._require_model()
        self._ensure_parent_dir(file_path)
        self.model.save(file_path)
        print(f"Model saved to {file_path}")

    def load_model(self, file_path):
        """Load a previously saved ``LdaModel`` from *file_path*.

        Raises:
            FileNotFoundError: If *file_path* does not exist.
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"No model file found at {file_path}")
        self.model = LdaModel.load(file_path)
        print(f"Model loaded from {file_path}")

    def save_dictionary_and_corpus(self, dict_path, corpus_path):
        """Persist the dictionary (gensim format) and the corpus (pickle).

        Raises:
            ValueError: If the dictionary or corpus has not been created.
        """
        if self.dictionary is None or self.corpus is None:
            raise ValueError("Dictionary and corpus have not been created yet.")
        self._ensure_parent_dir(dict_path)
        self._ensure_parent_dir(corpus_path)
        self.dictionary.save(dict_path)
        with open(corpus_path, 'wb') as f:
            pickle.dump(self.corpus, f)
        print(f"Dictionary saved to {dict_path}")
        print(f"Corpus saved to {corpus_path}")

    def load_dictionary_and_corpus(self, dict_path, corpus_path):
        """Load a dictionary and corpus written by ``save_dictionary_and_corpus``.

        Raises:
            FileNotFoundError: If either file does not exist.
        """
        if not os.path.exists(dict_path) or not os.path.exists(corpus_path):
            raise FileNotFoundError("Dictionary or corpus file not found.")
        self.dictionary = Dictionary.load(dict_path)
        # SECURITY: pickle.load can execute arbitrary code; only load corpus
        # files that this pipeline produced itself.
        with open(corpus_path, 'rb') as f:
            self.corpus = pickle.load(f)
        print(f"Dictionary loaded from {dict_path}")
        print(f"Corpus loaded from {corpus_path}")

    def plot_scores(self, iteration):
        """Plot recorded coherence and perplexity against topic count.

        The figure is written to
        ``coherence_perplexity_scores_iteration_<iteration>.png`` in the
        current working directory.

        Args:
            iteration: Value embedded in the output filename.

        Raises:
            ValueError: If no scores have been recorded yet.
        """
        if not self.coherence_scores or not self.perplexity_scores:
            # Fail before a figure is created so nothing is leaked.
            raise ValueError("No coherence/perplexity scores recorded yet.")

        num_topics = [x[0] for x in self.coherence_scores]
        coherences = [x[1] for x in self.coherence_scores]
        perplexities = [x[1] for x in self.perplexity_scores]

        fig = plt.figure(figsize=(12, 6))
        try:
            plt.subplot(1, 2, 1)
            plt.plot(num_topics, coherences, marker='o', label='Coherence')
            plt.xlabel('Number of Topics')
            plt.ylabel('Coherence Score')
            plt.title('Coherence Score by Number of Topics')
            plt.grid(True)
            plt.xticks(num_topics)
            upper = max(coherences) * 1.1
            # A non-positive upper limit would collapse or invert the axis;
            # fall back to matplotlib's automatic limits in that case.
            if upper > 0:
                plt.ylim(0, upper)

            plt.subplot(1, 2, 2)
            plt.plot(num_topics, perplexities, marker='o', color='r', label='Perplexity')
            plt.xlabel('Number of Topics')
            plt.ylabel('Perplexity')
            plt.title('Perplexity by Number of Topics')
            plt.grid(True)
            plt.xticks(num_topics)

            plt.tight_layout()
            plt.savefig(f'coherence_perplexity_scores_iteration_{iteration}.png')
        finally:
            # Always release the figure, even if plotting or saving fails.
            plt.close(fig)

    def get_word_frequencies(self):
        """Return each word's probability averaged over all topics.

        Returns:
            dict: word -> mean of the word's per-topic probabilities.

        Raises:
            ValueError: If no model has been trained or loaded.
        """
        self._require_model()
        topics = self.model.get_topics()
        word_frequencies = np.sum(topics, axis=0)
        word_frequencies /= self.model.num_topics
        # After load_model() alone there is no separate dictionary; the
        # model carries its own id -> word mapping.
        id2word = self.dictionary if self.dictionary is not None else self.model.id2word
        return {id2word[i]: freq for i, freq in enumerate(word_frequencies)}

    def get_low_frequency_words(self, threshold=0.01):
        """Return words whose mean topic probability is below *threshold*."""
        word_frequencies = self.get_word_frequencies()
        return {word: freq for word, freq in word_frequencies.items() if freq < threshold}

    def get_high_frequency_words(self, threshold=0.00016):
        """Return words whose mean topic probability is above *threshold*.

        Prints a notice when no word exceeds the threshold.
        """
        word_frequencies = self.get_word_frequencies()
        high_freq_words = {word: freq for word, freq in word_frequencies.items() if freq > threshold}

        if not high_freq_words:
            print(f"No words found with a frequency higher than the threshold of {threshold}.")

        return high_freq_words

    def save_frequency_words(self, low_freq_file, high_freq_file, low_threshold=0.01, high_threshold=0.1):
        """Write low- and high-frequency word tables to CSV files.

        Args:
            low_freq_file: Output CSV for words below ``low_threshold``
                (sorted ascending by frequency).
            high_freq_file: Output CSV for words above ``high_threshold``
                (sorted descending by frequency).
            low_threshold: Upper bound for the low-frequency table.
            high_threshold: Lower bound for the high-frequency table.

        Raises:
            ValueError: If no model has been trained or loaded.
        """
        low_freq_words = self.get_low_frequency_words(low_threshold)
        high_freq_words = self.get_high_frequency_words(high_threshold)

        self._ensure_parent_dir(low_freq_file)
        df_low = pd.DataFrame(list(low_freq_words.items()), columns=['Word', 'Frequency'])
        df_low = df_low.sort_values('Frequency', ascending=True)
        df_low.to_csv(low_freq_file, index=False)
        print(f"Low frequency words saved to {low_freq_file}")

        self._ensure_parent_dir(high_freq_file)
        df_high = pd.DataFrame(list(high_freq_words.items()), columns=['Word', 'Frequency'])
        df_high = df_high.sort_values('Frequency', ascending=False)
        df_high.to_csv(high_freq_file, index=False)
        print(f"High frequency words saved to {high_freq_file}")

    def run_with_params(self, num_topics, chunksize, passes, iterations):
        """Run the whole pipeline once for one hyper-parameter setting.

        Documents are reloaded and reprocessed, a model is trained and saved
        under ``lda_models/topics_<n>_passes_<p>_iterations_<i>/`` together
        with the dictionary, corpus and pyLDAvis HTML, and the coherence and
        perplexity of the run are appended to the score histories and
        plotted.

        Args:
            num_topics: Number of latent topics.
            chunksize: Documents per training chunk.
            passes: Full passes over the corpus.
            iterations: Maximum inference iterations per document.

        Returns:
            The pyLDAvis prepared data for the trained model.

        Raises:
            Exception: Any error from a pipeline stage is reported on stdout
                and re-raised unchanged.
        """
        try:
            self.load_documents()
            self.preprocess_documents()
            self.add_bigrams()
            self.filter_with_tfidf(top_n=200, bottom_n=50)
            self.create_dictionary_and_corpus()
            self.train_lda_model(num_topics=num_topics, chunksize=chunksize, passes=passes, iterations=iterations)
            model_dir = f"lda_models/topics_{num_topics}_passes_{passes}_iterations_{iterations}"
            self.save_model(os.path.join(model_dir, 'trained_model'))
            self.save_dictionary_and_corpus(os.path.join(model_dir, 'dictionary'), os.path.join(model_dir, 'corpus'))
            coherence_lda, perplexity = self.print_model_info()
            self.coherence_scores.append((num_topics, coherence_lda))
            self.perplexity_scores.append((num_topics, perplexity))

            # Re-plot after every run so a partial sweep still leaves a chart.
            self.plot_scores(len(self.coherence_scores))

            vis = self.visualize_topics()
            pyLDAvis.save_html(vis, os.path.join(model_dir, 'lda_visualization.html'))
            self.analyze_documents()

            return vis
        except Exception as e:
            print(f"An error occurred: {str(e)}")
            raise

if __name__ == "__main__":
    # Sweep several LDA configurations over the documents in ./txt, then
    # export word-frequency tables for the model trained last.
    modeler = TopicModeler('txt')

    # Folder receiving the frequency CSV exports.
    output_dir = 'csv_output'
    os.makedirs(output_dir, exist_ok=True)

    # Each entry is (num_topics, chunksize, passes, iterations).
    sweep = (
        (15, 2000, 10, 200),
        (30, 2000, 10, 400),
        (45, 2000, 20, 200),
        (60, 2000, 20, 400),
        (75, 2000, 20, 400),
        (90, 2000, 20, 400),
    )

    for settings in sweep:
        num_topics, chunksize, passes, iterations = settings
        print(f"\nRunning model with parameters: num_topics={num_topics}, chunksize={chunksize}, passes={passes}, iterations={iterations}")
        modeler.run_with_params(*settings)

    low_freq_csv = os.path.join(output_dir, 'low_freq_words.csv')
    high_freq_csv = os.path.join(output_dir, 'high_freq_words.csv')
    modeler.save_frequency_words(
        low_freq_csv,
        high_freq_csv,
        low_threshold=0.01,
        high_threshold=0.00016,
    )


Running model with parameters: num_topics=15, chunksize=2000, passes=10, iterations=200
Loaded 440 documents
Sample document: Pricing for interconnection related to the provision of number portability, as referred to in Articl...


2024-08-06 18:19:24,839 : INFO : collecting all words and their counts
2024-08-06 18:19:24,840 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types
2024-08-06 18:19:24,862 : INFO : collected 23445 token types (unigram + bigrams) from a corpus of 44718 words and 440 sentences
2024-08-06 18:19:24,862 : INFO : merged Phrases<23445 vocab, min_count=5, threshold=100, max_vocab_size=40000000>
2024-08-06 18:19:24,863 : INFO : Phrases lifecycle event {'msg': 'built Phrases<23445 vocab, min_count=5, threshold=100, max_vocab_size=40000000> in 0.02s', 'datetime': '2024-08-06T18:19:24.863347', 'gensim': '4.3.2', 'python': '3.11.9 (main, Jul 16 2024, 11:07:54) [Clang 14.0.0 (clang-1400.0.29.202)]', 'platform': 'macOS-12.5-arm64-arm-64bit', 'event': 'created'}
2024-08-06 18:19:24,938 : INFO : adding document #0 to Dictionary<0 unique tokens: []>
2024-08-06 18:19:24,948 : INFO : built Dictionary<3416 unique tokens: ['abstract', 'adoption', 'advance', 'appeal', 'body']...> from 440 do

After preprocessing: 440 documents
Sample preprocessed document: ['pricing', 'interconnection', 'related', 'provision', 'number', 'portability', 'referred', 'article', 'directive', 'ec']...
Added bigrams. Sample document: ['pricing', 'interconnection', 'related', 'provision', 'number', 'portability', 'referred', 'article', 'directive', 'ec', 'european', 'parliament', 'council', 'march', 'universal']...
Filtered documents using TF-IDF. Sample filtered document: ['pricing', 'interconnection', 'related', 'number', 'portability', 'concern', 'traffic', 'number', 'ported', 'set']...
Dictionary size: 1835 (reduced from 3416)
Corpus size: 440
Sample corpus entry: [(0, 1), (1, 1), (2, 3), (3, 2), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1)]
Training LDA model with num_topics=15, chunksize=2000, passes=10, iterations=200, alpha=auto, eta=auto


2024-08-06 18:19:25,119 : INFO : optimized alpha [0.05601553, 0.055268753, 0.0526802, 0.054346085, 0.054080088, 0.053848427, 0.053853672, 0.051989175, 0.05495826, 0.056449115, 0.05256947, 0.0520408, 0.05585058, 0.054048866, 0.05386136]
2024-08-06 18:19:25,121 : INFO : topic #7 (0.052): 0.012*"device" + 0.009*"charge" + 0.008*"equipment" + 0.008*"licence" + 0.007*"paid" + 0.006*"authorisation" + 0.006*"vehicle" + 0.006*"damage" + 0.006*"year" + 0.005*"however"
2024-08-06 18:19:25,121 : INFO : topic #11 (0.052): 0.012*"operator" + 0.010*"damage" + 0.010*"wine" + 0.008*"notification" + 0.007*"established" + 0.007*"accommodation" + 0.007*"description" + 0.007*"used" + 0.006*"property" + 0.006*"third"
2024-08-06 18:19:25,122 : INFO : topic #12 (0.056): 0.008*"read_conjunction" + 0.008*"conjunction" + 0.007*"set" + 0.007*"take" + 0.007*"required" + 0.006*"water" + 0.006*"conformity" + 0.005*"place" + 0.005*"packaging" + 0.005*"second"
2024-08-06 18:19:25,122 : INFO : topic #0 (0.056): 0.009*

LDA model trained successfully
Model saved to lda_models/topics_15_passes_10_iterations_200/trained_model
Dictionary saved to lda_models/topics_15_passes_10_iterations_200/dictionary
Corpus saved to lda_models/topics_15_passes_10_iterations_200/corpus
Number of unique tokens: 1835
Number of documents: 440
Average topic coherence: -10.8608.

Top topics:
[(0,
  '0.014*"call" + 0.012*"emergency" + 0.011*"number" + 0.008*"single" + '
  '0.007*"telephone" + 0.007*"caller" + 0.007*"lender" + 0.007*"available" + '
  '0.007*"make" + 0.006*"provide"'),
 (1,
  '0.010*"protected_designation" + 0.010*"registration" + 0.010*"agricultural" '
  '+ 0.009*"designation_origin" + 0.009*"grand" + 0.009*"grand_duchy" + '
  '0.009*"luxembourg" + 0.009*"duchy" + 0.008*"trade_mark" + 0.008*"fact"'),
 (2,
  '0.009*"framework" + 0.009*"ensure" + 0.008*"procedure" + 0.008*"regulatory" '
  '+ 0.008*"january" + 0.007*"registration" + 0.007*"domestic" + '
  '0.007*"defect" + 0.007*"drink" + 0.007*"spirit"'),
 (3,
 

2024-08-06 18:19:27,674 : INFO : 9 accumulators retrieved from output queue
2024-08-06 18:19:27,708 : INFO : accumulated word occurrence stats for 1774 virtual documents



Coherence Score:  0.3833122839884157


2024-08-06 18:19:29,281 : INFO : collecting all words and their counts
2024-08-06 18:19:29,282 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types
2024-08-06 18:19:29,304 : INFO : collected 23445 token types (unigram + bigrams) from a corpus of 44718 words and 440 sentences
2024-08-06 18:19:29,305 : INFO : merged Phrases<23445 vocab, min_count=5, threshold=100, max_vocab_size=40000000>
2024-08-06 18:19:29,305 : INFO : Phrases lifecycle event {'msg': 'built Phrases<23445 vocab, min_count=5, threshold=100, max_vocab_size=40000000> in 0.02s', 'datetime': '2024-08-06T18:19:29.305826', 'gensim': '4.3.2', 'python': '3.11.9 (main, Jul 16 2024, 11:07:54) [Clang 14.0.0 (clang-1400.0.29.202)]', 'platform': 'macOS-12.5-arm64-arm-64bit', 'event': 'created'}



Document 0 topics:
[(8, 0.99196386)]
Best topic: 8
Top words in this topic:
[('directory', 0.014993575),
 ('data', 0.014855327),
 ('property', 0.010726822),
 ('subscriber', 0.010717523),
 ('provider', 0.010100084),
 ('telephone', 0.008550041),
 ('dispute', 0.008044642),
 ('territory', 0.008028072),
 ('procedure', 0.007948749),
 ('number', 0.0066399556)]
Original document: pricing interconnection related number portability concern traffic number ported set incurred mobile telephone operator implement request number porting adoption laying method used calculating fix advance abstract model maximum charged donor...


Document 1 topics:
[(13, 0.9814167)]
Best topic: 13
Top words in this topic:
[('natural', 0.01735919),
 ('acting', 0.010139426),
 ('natural_person', 0.009203744),
 ('profession', 0.009188015),
 ('producer', 0.0083213225),
 ('brought', 0.008177817),
 ('manufacturer', 0.008069699),
 ('substance', 0.007156122),
 ('holiday', 0.00696203),
 ('german', 0.0069040526)]
Original docum

2024-08-06 18:19:29,365 : INFO : adding document #0 to Dictionary<0 unique tokens: []>
2024-08-06 18:19:29,376 : INFO : built Dictionary<3416 unique tokens: ['abstract', 'adoption', 'advance', 'appeal', 'body']...> from 440 documents (total 19243 corpus positions)
2024-08-06 18:19:29,376 : INFO : Dictionary lifecycle event {'msg': "built Dictionary<3416 unique tokens: ['abstract', 'adoption', 'advance', 'appeal', 'body']...> from 440 documents (total 19243 corpus positions)", 'datetime': '2024-08-06T18:19:29.376567', 'gensim': '4.3.2', 'python': '3.11.9 (main, Jul 16 2024, 11:07:54) [Clang 14.0.0 (clang-1400.0.29.202)]', 'platform': 'macOS-12.5-arm64-arm-64bit', 'event': 'created'}
2024-08-06 18:19:29,377 : INFO : discarding 1581 tokens: [('abstract', 1), ('confidential', 1), ('confidentiality', 1), ('dissuaded', 1), ('donor', 1), ('ported', 1), ('porting', 1), ('reaching', 1), ('becomes', 1), ('referral', 1)]...
2024-08-06 18:19:29,377 : INFO : keeping 1835 tokens which were in no les

Dictionary size: 1835 (reduced from 3416)
Corpus size: 440
Sample corpus entry: [(0, 1), (1, 1), (2, 3), (3, 2), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1)]
Training LDA model with num_topics=30, chunksize=2000, passes=10, iterations=400, alpha=auto, eta=auto


2024-08-06 18:19:29,804 : INFO : optimized alpha [0.02937443, 0.028386412, 0.028165247, 0.028358344, 0.029134613, 0.028022382, 0.02801169, 0.027512934, 0.028642654, 0.029534196, 0.02818097, 0.027826682, 0.028844334, 0.028637776, 0.028375862, 0.02875994, 0.028816748, 0.028932728, 0.029548159, 0.028515493, 0.028108556, 0.0282774, 0.02942291, 0.029997833, 0.02839255, 0.028161807, 0.029253058, 0.029024916, 0.02932307, 0.027899768]
2024-08-06 18:19:29,806 : INFO : topic #7 (0.028): 0.023*"charge" + 0.020*"authorisation" + 0.020*"licence" + 0.016*"equipment" + 0.013*"subscription" + 0.013*"mobile" + 0.013*"radio" + 0.010*"framework" + 0.010*"contains" + 0.010*"contact"
2024-08-06 18:19:29,807 : INFO : topic #11 (0.028): 0.020*"wine" + 0.018*"notification" + 0.017*"substance" + 0.013*"classification" + 0.013*"control" + 0.013*"set" + 0.013*"annex" + 0.010*"adaptation" + 0.010*"progress" + 0.010*"technical"
2024-08-06 18:19:29,807 : INFO : topic #9 (0.030): 0.013*"country" + 0.010*"beer" + 0.0

LDA model trained successfully
Model saved to lda_models/topics_30_passes_10_iterations_400/trained_model
Dictionary saved to lda_models/topics_30_passes_10_iterations_400/dictionary
Corpus saved to lda_models/topics_30_passes_10_iterations_400/corpus
Number of unique tokens: 1835
Number of documents: 440
Average topic coherence: -10.9244.

Top topics:
[(7,
  '0.023*"charge" + 0.020*"authorisation" + 0.020*"licence" + '
  '0.017*"equipment" + 0.013*"subscription" + 0.013*"mobile" + 0.013*"radio" + '
  '0.010*"framework" + 0.010*"contains" + 0.010*"paid"'),
 (29,
  '0.018*"property" + 0.012*"data" + 0.009*"immovable_property" + '
  '0.009*"electronic_commerce" + 0.009*"processing" + 0.009*"course" + '
  '0.009*"present" + 0.009*"intended" + 0.009*"commerce" + 0.009*"determine"'),
 (11,
  '0.019*"wine" + 0.019*"substance" + 0.017*"notification" + '
  '0.015*"preliminary_ruling" + 0.015*"preliminary" + 0.014*"ruling" + '
  '0.014*"classification" + 0.012*"set" + 0.012*"control" + 0.012*"a

2024-08-06 18:19:32,505 : INFO : 9 accumulators retrieved from output queue
2024-08-06 18:19:32,574 : INFO : accumulated word occurrence stats for 1774 virtual documents



Coherence Score:  0.3864210857122564


2024-08-06 18:19:33,822 : INFO : collecting all words and their counts
2024-08-06 18:19:33,822 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types
2024-08-06 18:19:33,844 : INFO : collected 23445 token types (unigram + bigrams) from a corpus of 44718 words and 440 sentences
2024-08-06 18:19:33,845 : INFO : merged Phrases<23445 vocab, min_count=5, threshold=100, max_vocab_size=40000000>
2024-08-06 18:19:33,845 : INFO : Phrases lifecycle event {'msg': 'built Phrases<23445 vocab, min_count=5, threshold=100, max_vocab_size=40000000> in 0.02s', 'datetime': '2024-08-06T18:19:33.845386', 'gensim': '4.3.2', 'python': '3.11.9 (main, Jul 16 2024, 11:07:54) [Clang 14.0.0 (clang-1400.0.29.202)]', 'platform': 'macOS-12.5-arm64-arm-64bit', 'event': 'created'}



Document 0 topics:
[(6, 0.98917544)]
Best topic: 6
Top words in this topic:
[('netherlands', 0.018356325),
 ('misleading', 0.0143250795),
 ('number', 0.013241817),
 ('appeal', 0.0123210875),
 ('investment', 0.009293395),
 ('spain', 0.009293393),
 ('agricultural', 0.009293388),
 ('operator', 0.009293386),
 ('document', 0.009284686),
 ('away', 0.009282228)]
Original document: pricing interconnection related number portability concern traffic number ported set incurred mobile telephone operator implement request number porting adoption laying method used calculating fix advance abstract model maximum charged donor...


Document 1 topics:
[(24, 0.9749858)]
Best topic: 24
Top words in this topic:
[('natural', 0.017951218),
 ('natural_person', 0.015394619),
 ('building', 0.012864882),
 ('charge', 0.012864875),
 ('reasonably', 0.010328706),
 ('activity', 0.010317606),
 ('evocation', 0.010308915),
 ('apartment', 0.010308912),
 ('packet', 0.010308908)]
Original document: natural becomes scheme

2024-08-06 18:19:33,903 : INFO : adding document #0 to Dictionary<0 unique tokens: []>
2024-08-06 18:19:33,913 : INFO : built Dictionary<3416 unique tokens: ['abstract', 'adoption', 'advance', 'appeal', 'body']...> from 440 documents (total 19243 corpus positions)
2024-08-06 18:19:33,913 : INFO : Dictionary lifecycle event {'msg': "built Dictionary<3416 unique tokens: ['abstract', 'adoption', 'advance', 'appeal', 'body']...> from 440 documents (total 19243 corpus positions)", 'datetime': '2024-08-06T18:19:33.913948', 'gensim': '4.3.2', 'python': '3.11.9 (main, Jul 16 2024, 11:07:54) [Clang 14.0.0 (clang-1400.0.29.202)]', 'platform': 'macOS-12.5-arm64-arm-64bit', 'event': 'created'}
2024-08-06 18:19:33,914 : INFO : discarding 1581 tokens: [('abstract', 1), ('confidential', 1), ('confidentiality', 1), ('dissuaded', 1), ('donor', 1), ('ported', 1), ('porting', 1), ('reaching', 1), ('becomes', 1), ('referral', 1)]...
2024-08-06 18:19:33,915 : INFO : keeping 1835 tokens which were in no les

Dictionary size: 1835 (reduced from 3416)
Corpus size: 440
Sample corpus entry: [(0, 1), (1, 1), (2, 3), (3, 2), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1)]
Training LDA model with num_topics=45, chunksize=2000, passes=20, iterations=200, alpha=auto, eta=auto


2024-08-06 18:19:34,128 : INFO : topic #7 (0.020): 0.025*"licence" + 0.025*"charge" + 0.021*"authorisation" + 0.021*"equipment" + 0.017*"subscription" + 0.017*"mobile" + 0.017*"radio" + 0.017*"instrument" + 0.014*"bilateral" + 0.013*"milk"
2024-08-06 18:19:34,129 : INFO : topic #5 (0.020): 0.019*"finding" + 0.016*"nature" + 0.013*"full" + 0.013*"exchange" + 0.011*"limitation" + 0.011*"posse" + 0.011*"run" + 0.011*"prevents" + 0.011*"kingdom_spain" + 0.011*"seised"
2024-08-06 18:19:34,129 : INFO : topic #28 (0.021): 0.011*"taking" + 0.010*"consequence" + 0.010*"particularly" + 0.010*"taking_account" + 0.009*"take" + 0.009*"unfavourable" + 0.009*"taken" + 0.009*"mean" + 0.009*"misleading" + 0.008*"wine"
2024-08-06 18:19:34,129 : INFO : topic #15 (0.021): 0.013*"organic" + 0.010*"notification" + 0.009*"annex" + 0.009*"replacement" + 0.009*"agricultural" + 0.009*"production" + 0.008*"system" + 0.008*"nature" + 0.007*"significant" + 0.006*"domestic"
2024-08-06 18:19:34,130 : INFO : topic #3

LDA model trained successfully
Model saved to lda_models/topics_45_passes_20_iterations_200/trained_model
Dictionary saved to lda_models/topics_45_passes_20_iterations_200/dictionary
Corpus saved to lda_models/topics_45_passes_20_iterations_200/corpus
Number of unique tokens: 1835
Number of documents: 440
Average topic coherence: -9.6677.

Top topics:
[(7,
  '0.022*"charge" + 0.022*"licence" + 0.019*"authorisation" + '
  '0.019*"equipment" + 0.019*"instrument" + 0.015*"mobile" + '
  '0.015*"subscription" + 0.015*"radio" + 0.015*"bilateral" + 0.015*"system"'),
 (2,
  '0.018*"position" + 0.017*"consequence" + 0.015*"whereby" + 0.015*"body" + '
  '0.015*"judicial" + 0.015*"absence" + 0.013*"understand" + 0.011*"found" + '
  '0.011*"dispute" + 0.011*"enabling"'),
 (25,
  '0.040*"device" + 0.035*"vehicle" + 0.019*"defeat" + 0.019*"defeat_device" + '
  '0.017*"access" + 0.014*"water" + 0.012*"installed" + 0.012*"emission" + '
  '0.010*"pre" + 0.010*"supply"'),
 (11,
  '0.024*"wine" + 0.021*"

2024-08-06 18:19:35,161 : INFO : -7.518 per-word bound, 183.3 perplexity estimate based on a held-out corpus of 440 documents with 16915 words
2024-08-06 18:19:35,164 : INFO : using ParallelWordOccurrenceAccumulator<processes=9, batch_size=64> to estimate probabilities from sliding windows
2024-08-06 18:19:37,289 : INFO : 9 accumulators retrieved from output queue
2024-08-06 18:19:37,415 : INFO : accumulated word occurrence stats for 1774 virtual documents



Coherence Score:  0.40886857422732503


2024-08-06 18:19:38,848 : INFO : collecting all words and their counts
2024-08-06 18:19:38,849 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types
2024-08-06 18:19:38,870 : INFO : collected 23445 token types (unigram + bigrams) from a corpus of 44718 words and 440 sentences
2024-08-06 18:19:38,871 : INFO : merged Phrases<23445 vocab, min_count=5, threshold=100, max_vocab_size=40000000>
2024-08-06 18:19:38,871 : INFO : Phrases lifecycle event {'msg': 'built Phrases<23445 vocab, min_count=5, threshold=100, max_vocab_size=40000000> in 0.02s', 'datetime': '2024-08-06T18:19:38.871524', 'gensim': '4.3.2', 'python': '3.11.9 (main, Jul 16 2024, 11:07:54) [Clang 14.0.0 (clang-1400.0.29.202)]', 'platform': 'macOS-12.5-arm64-arm-64bit', 'event': 'created'}
2024-08-06 18:19:38,926 : INFO : adding document #0 to Dictionary<0 unique tokens: []>



Document 0 topics:
[(8, 0.9890106)]
Best topic: 8
Top words in this topic:
[('pre', 0.02138557),
 ('directory', 0.019200975),
 ('data', 0.019200975),
 ('grand', 0.01918888),
 ('subscriber', 0.014413992),
 ('number', 0.0120205),
 ('appeal', 0.012019179),
 ('registration', 0.010739128),
 ('used', 0.010442514),
 ('telephone', 0.009627009)]
Original document: pricing interconnection related number portability concern traffic number ported set incurred mobile telephone operator implement request number porting adoption laying method used calculating fix advance abstract model maximum charged donor...


Document 1 topics:
[(3, 0.97453934)]
Best topic: 3
Top words in this topic:
[('et', 0.05041293),
 ('le', 0.048398744),
 ('la', 0.03691957),
 ('en', 0.034295086),
 ('assurance', 0.028250886),
 ('que', 0.021753293),
 ('reference', 0.019926973),
 ('nature', 0.01817723),
 ('characteristic', 0.016162498),
 ('used', 0.014147069)]
Original document: natural becomes scheme implemented company allowi

2024-08-06 18:19:38,936 : INFO : built Dictionary<3416 unique tokens: ['abstract', 'adoption', 'advance', 'appeal', 'body']...> from 440 documents (total 19243 corpus positions)
2024-08-06 18:19:38,936 : INFO : Dictionary lifecycle event {'msg': "built Dictionary<3416 unique tokens: ['abstract', 'adoption', 'advance', 'appeal', 'body']...> from 440 documents (total 19243 corpus positions)", 'datetime': '2024-08-06T18:19:38.936601', 'gensim': '4.3.2', 'python': '3.11.9 (main, Jul 16 2024, 11:07:54) [Clang 14.0.0 (clang-1400.0.29.202)]', 'platform': 'macOS-12.5-arm64-arm-64bit', 'event': 'created'}
2024-08-06 18:19:38,937 : INFO : discarding 1581 tokens: [('abstract', 1), ('confidential', 1), ('confidentiality', 1), ('dissuaded', 1), ('donor', 1), ('ported', 1), ('porting', 1), ('reaching', 1), ('becomes', 1), ('referral', 1)]...
2024-08-06 18:19:38,938 : INFO : keeping 1835 tokens which were in no less than 2 and no more than 396 (=90.0%) documents
2024-08-06 18:19:38,939 : INFO : resul

Dictionary size: 1835 (reduced from 3416)
Corpus size: 440
Sample corpus entry: [(0, 1), (1, 1), (2, 3), (3, 2), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1)]
Training LDA model with num_topics=60, chunksize=2000, passes=20, iterations=400, alpha=auto, eta=auto


2024-08-06 18:19:39,164 : INFO : optimized alpha [0.01572199, 0.015574523, 0.015293504, 0.015712138, 0.0158083, 0.015344188, 0.015269442, 0.015427192, 0.015315065, 0.015658578, 0.015416786, 0.015454848, 0.015815957, 0.0153155485, 0.015234684, 0.015476436, 0.015056683, 0.015493409, 0.015532221, 0.015396251, 0.015232749, 0.015315992, 0.015476498, 0.015676275, 0.015314964, 0.015568478, 0.015296631, 0.015637089, 0.015714444, 0.01549731, 0.01555155, 0.0154390475, 0.015534139, 0.0152602475, 0.015495106, 0.015057095, 0.01569443, 0.0155749265, 0.01529721, 0.015554561, 0.015314582, 0.015596207, 0.015675724, 0.015374306, 0.01569544, 0.015531749, 0.015669001, 0.015495119, 0.0153922355, 0.01565641, 0.015481092, 0.015612102, 0.015712969, 0.015299165, 0.015517615, 0.015515272, 0.015416058, 0.015235112, 0.015628088, 0.01535451]
2024-08-06 18:19:39,169 : INFO : topic #35 (0.015): 0.029*"significant" + 0.029*"least" + 0.029*"obtained" + 0.028*"novel" + 0.016*"country" + 0.016*"continued" + 0.015*"confi

LDA model trained successfully
Model saved to lda_models/topics_60_passes_20_iterations_400/trained_model
Dictionary saved to lda_models/topics_60_passes_20_iterations_400/dictionary
Corpus saved to lda_models/topics_60_passes_20_iterations_400/corpus
Number of unique tokens: 1835
Number of documents: 440
Average topic coherence: -9.7719.

Top topics:
[(16,
  '0.148*"ingredient" + 0.096*"list" + 0.096*"list_ingredient" + '
  '0.044*"vitamin" + 0.043*"used" + 0.028*"added" + 0.028*"include" + '
  '0.015*"formulation" + 0.015*"hungarian" + 0.015*"addition"'),
 (35,
  '0.054*"german_selected" + 0.054*"german" + 0.044*"least" + '
  '0.029*"significant" + 0.029*"obtained" + 0.029*"novel" + 0.016*"content" + '
  '0.015*"event" + 0.015*"country" + 0.015*"continued"'),
 (14,
  '0.028*"building" + 0.028*"element" + 0.023*"apartment" + '
  '0.023*"establishing" + 0.017*"agent" + 0.017*"area" + 0.017*"registered" + '
  '0.011*"co" + 0.011*"profession" + 0.011*"ownership"'),
 (57,
  '0.023*"yoghur

2024-08-06 18:19:41,981 : INFO : 9 accumulators retrieved from output queue
2024-08-06 18:19:42,179 : INFO : accumulated word occurrence stats for 1774 virtual documents



Coherence Score:  0.40525430666257733


2024-08-06 18:19:43,806 : INFO : collecting all words and their counts
2024-08-06 18:19:43,807 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types
2024-08-06 18:19:43,828 : INFO : collected 23445 token types (unigram + bigrams) from a corpus of 44718 words and 440 sentences



Document 0 topics:
[(49, 0.9879781)]
Best topic: 49
Top words in this topic:
[('framework', 0.054647855),
 ('regulatory', 0.042722065),
 ('common_regulatory', 0.02592592),
 ('common', 0.024889586),
 ('access', 0.023246186),
 ('operator', 0.020700432),
 ('appeal', 0.014435746),
 ('conjunction', 0.013808917),
 ('telecommunication', 0.013297873),
 ('number', 0.012348304)]
Original document: pricing interconnection related number portability concern traffic number ported set incurred mobile telephone operator implement request number porting adoption laying method used calculating fix advance abstract model maximum charged donor...


Document 1 topics:
[(13, 0.9722075)]
Best topic: 13
Top words in this topic:
[('according', 0.022028448),
 ('year', 0.02176572),
 ('purchase', 0.01642598),
 ('annulment', 0.016346829),
 ('contained', 0.016346829),
 ('void', 0.016346829),
 ('natural', 0.016346827),
 ('natural_person', 0.016346827),
 ('gift', 0.016346825),
 ('purchaser', 0.01634379)]
Original d

2024-08-06 18:19:43,829 : INFO : merged Phrases<23445 vocab, min_count=5, threshold=100, max_vocab_size=40000000>
2024-08-06 18:19:43,830 : INFO : Phrases lifecycle event {'msg': 'built Phrases<23445 vocab, min_count=5, threshold=100, max_vocab_size=40000000> in 0.02s', 'datetime': '2024-08-06T18:19:43.830123', 'gensim': '4.3.2', 'python': '3.11.9 (main, Jul 16 2024, 11:07:54) [Clang 14.0.0 (clang-1400.0.29.202)]', 'platform': 'macOS-12.5-arm64-arm-64bit', 'event': 'created'}
2024-08-06 18:19:43,890 : INFO : adding document #0 to Dictionary<0 unique tokens: []>
2024-08-06 18:19:43,900 : INFO : built Dictionary<3416 unique tokens: ['abstract', 'adoption', 'advance', 'appeal', 'body']...> from 440 documents (total 19243 corpus positions)
2024-08-06 18:19:43,900 : INFO : Dictionary lifecycle event {'msg': "built Dictionary<3416 unique tokens: ['abstract', 'adoption', 'advance', 'appeal', 'body']...> from 440 documents (total 19243 corpus positions)", 'datetime': '2024-08-06T18:19:43.90063

Added bigrams. Sample document: ['pricing', 'interconnection', 'related', 'provision', 'number', 'portability', 'referred', 'article', 'directive', 'ec', 'european', 'parliament', 'council', 'march', 'universal']...
Filtered documents using TF-IDF. Sample filtered document: ['pricing', 'interconnection', 'related', 'number', 'portability', 'concern', 'traffic', 'number', 'ported', 'set']...
Dictionary size: 1835 (reduced from 3416)
Corpus size: 440
Sample corpus entry: [(0, 1), (1, 1), (2, 3), (3, 2), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1)]
Training LDA model with num_topics=75, chunksize=2000, passes=20, iterations=400, alpha=auto, eta=auto


2024-08-06 18:19:49,749 : INFO : optimized alpha [0.012793851, 0.01279405, 0.012762488, 0.012985252, 0.012888289, 0.012857254, 0.012729078, 0.01273002, 0.012857234, 0.012794197, 0.012920316, 0.012826434, 0.012825834, 0.012761364, 0.012889173, 0.012889426, 0.012697535, 0.012792024, 0.013048324, 0.012697911, 0.012760736, 0.012730366, 0.012889898, 0.012730356, 0.012729738, 0.012793123, 0.012954483, 0.0129536055, 0.012793119, 0.012761212, 0.012825, 0.012921884, 0.01298343, 0.0127295405, 0.01279421, 0.012919031, 0.012823909, 0.012825692, 0.012792917, 0.012794032, 0.012697884, 0.012857181, 0.012858876, 0.012761688, 0.012859076, 0.012761202, 0.01288878, 0.012826726, 0.012825145, 0.012856586, 0.012763296, 0.012920044, 0.012888667, 0.012761553, 0.012888854, 0.0127301635, 0.012761657, 0.012697643, 0.012857027, 0.01279337, 0.012794489, 0.012792399, 0.012793031, 0.012825594, 0.01301743, 0.012697911, 0.012889824, 0.012794014, 0.012826157, 0.0129844695, 0.012889803, 0.013016931, 0.012857368, 0.01276

LDA model trained successfully
Model saved to lda_models/topics_75_passes_20_iterations_400/trained_model
Dictionary saved to lda_models/topics_75_passes_20_iterations_400/dictionary
Corpus saved to lda_models/topics_75_passes_20_iterations_400/corpus
Number of unique tokens: 1835
Number of documents: 440
Average topic coherence: -9.0964.

Top topics:
[(40,
  '0.055*"hand" + 0.027*"plain" + 0.027*"judicial" + 0.027*"definition" + '
  '0.027*"drafted" + 0.027*"adequacy" + 0.027*"supplied" + 0.027*"relate" + '
  '0.027*"drafted_plain" + 0.027*"unfairness"'),
 (57,
  '0.042*"currency" + 0.028*"set" + 0.028*"sum" + 0.028*"foreign" + '
  '0.028*"paid" + 0.014*"determining" + 0.014*"although" + 0.014*"stipulates" '
  '+ 0.014*"corresponding" + 0.014*"finance"'),
 (33,
  '0.030*"advantage" + 0.029*"comparative" + 0.024*"reputation" + 0.023*"3a" + '
  '0.018*"transaction" + 0.018*"financial" + 0.018*"comparative_advertising" + '
  '0.018*"character" + 0.018*"distinctive" + 0.018*"distinctive_c

2024-08-06 18:19:57,522 : INFO : -7.598 per-word bound, 193.7 perplexity estimate based on a held-out corpus of 440 documents with 16915 words



Perplexity:  -7.5975731065049175


2024-08-06 18:19:57,861 : INFO : -7.598 per-word bound, 193.7 perplexity estimate based on a held-out corpus of 440 documents with 16915 words
2024-08-06 18:19:57,870 : INFO : using ParallelWordOccurrenceAccumulator<processes=9, batch_size=64> to estimate probabilities from sliding windows
2024-08-06 18:20:00,610 : INFO : 9 accumulators retrieved from output queue
2024-08-06 18:20:00,857 : INFO : accumulated word occurrence stats for 1774 virtual documents



Coherence Score:  0.41167847056534124

Document 0 topics:
[(55, 0.9872941)]
Best topic: 55
Top words in this topic:
[('organic', 0.070412025),
 ('organic_production', 0.033185273),
 ('production', 0.033185273),
 ('september', 0.030568538),
 ('substance', 0.027976045),
 ('appeal', 0.027964095),
 ('operator', 0.022390729),
 ('laying', 0.02224804),
 ('number', 0.016815327),
 ('scheme', 0.016815323)]
Original document: pricing interconnection related number portability concern traffic number ported set incurred mobile telephone operator implement request number porting adoption laying method used calculating fix advance abstract model maximum charged donor...


Document 1 topics:
[(24, 0.9706964)]
Best topic: 24
Top words in this topic:
[('registration', 0.10228581),
 ('designation_origin', 0.09633664),
 ('natural_person', 0.05174477),
 ('agricultural', 0.04934919),
 ('natural', 0.046896324),
 ('procedure', 0.033632834),
 ('force', 0.030252604),
 ('registered', 0.027177606),
 ('date', 0.0

2024-08-06 18:20:03,518 : INFO : collecting all words and their counts
2024-08-06 18:20:03,518 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types
2024-08-06 18:20:03,539 : INFO : collected 23445 token types (unigram + bigrams) from a corpus of 44718 words and 440 sentences
2024-08-06 18:20:03,540 : INFO : merged Phrases<23445 vocab, min_count=5, threshold=100, max_vocab_size=40000000>
2024-08-06 18:20:03,540 : INFO : Phrases lifecycle event {'msg': 'built Phrases<23445 vocab, min_count=5, threshold=100, max_vocab_size=40000000> in 0.02s', 'datetime': '2024-08-06T18:20:03.540279', 'gensim': '4.3.2', 'python': '3.11.9 (main, Jul 16 2024, 11:07:54) [Clang 14.0.0 (clang-1400.0.29.202)]', 'platform': 'macOS-12.5-arm64-arm-64bit', 'event': 'created'}
2024-08-06 18:20:03,598 : INFO : adding document #0 to Dictionary<0 unique tokens: []>
2024-08-06 18:20:03,607 : INFO : built Dictionary<3416 unique tokens: ['abstract', 'adoption', 'advance', 'appeal', 'body']...> from 440 do

After preprocessing: 440 documents
Sample preprocessed document: ['pricing', 'interconnection', 'related', 'provision', 'number', 'portability', 'referred', 'article', 'directive', 'ec']...
Added bigrams. Sample document: ['pricing', 'interconnection', 'related', 'provision', 'number', 'portability', 'referred', 'article', 'directive', 'ec', 'european', 'parliament', 'council', 'march', 'universal']...
Filtered documents using TF-IDF. Sample filtered document: ['pricing', 'interconnection', 'related', 'number', 'portability', 'concern', 'traffic', 'number', 'ported', 'set']...
Dictionary size: 1835 (reduced from 3416)
Corpus size: 440
Sample corpus entry: [(0, 1), (1, 1), (2, 3), (3, 2), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1)]
Training LDA model with num_topics=90, chunksize=2000, passes=20, iterations=400, alpha=auto, eta=auto


2024-08-06 18:20:11,265 : INFO : optimized alpha [0.010759895, 0.010838881, 0.010760144, 0.010786135, 0.010838663, 0.010680746, 0.010785938, 0.010680689, 0.010785876, 0.010786106, 0.01075992, 0.010654261, 0.010681186, 0.010785956, 0.010732136, 0.010680603, 0.010812175, 0.0107338475, 0.010759259, 0.010654641, 0.010759681, 0.010785073, 0.010864587, 0.010707409, 0.010681111, 0.010786151, 0.01081354, 0.010786327, 0.010760175, 0.010811981, 0.010813093, 0.010760175, 0.010786491, 0.010706808, 0.010812376, 0.01070753, 0.010759406, 0.01073386, 0.010707249, 0.010786971, 0.010785873, 0.010812482, 0.010839155, 0.010732801, 0.010786754, 0.010786051, 0.010734225, 0.010813283, 0.010680475, 0.010654567, 0.010760196, 0.010785513, 0.010812476, 0.010759555, 0.0107078, 0.010918377, 0.010707021, 0.010732694, 0.010681115, 0.010707213, 0.010813477, 0.010786006, 0.010706863, 0.010680741, 0.010812678, 0.010813289, 0.010813492, 0.010707433, 0.010811947, 0.010733162, 0.010785832, 0.010786066, 0.010786095, 0.0107

LDA model trained successfully
Model saved to lda_models/topics_90_passes_20_iterations_400/trained_model
Dictionary saved to lda_models/topics_90_passes_20_iterations_400/dictionary
Corpus saved to lda_models/topics_90_passes_20_iterations_400/corpus
Number of unique tokens: 1835
Number of documents: 440
Average topic coherence: -8.1622.

Top topics:
[(84,
  '0.001*"judicial" + 0.001*"body" + 0.001*"following" + 0.001*"previously" + '
  '0.001*"serious" + 0.001*"includes" + 0.001*"place" + 0.001*"position" + '
  '0.001*"subsequent" + 0.001*"selection"'),
 (86,
  '0.048*"element" + 0.039*"finding" + 0.027*"annulment" + 0.024*"consequence" '
  '+ 0.020*"amendment" + 0.020*"restoration" + 0.020*"objective" + '
  '0.020*"removing" + 0.020*"lead" + 0.020*"altering"'),
 (48,
  '0.042*"ceiling" + 0.042*"instrument" + 0.021*"exceed" + 0.021*"set" + '
  '0.021*"signed" + 0.021*"civil" + 0.021*"convention" + 0.021*"behalf" + '
  '0.021*"transposing" + 0.021*"determined"'),
 (11,
  '0.068*"exten

2024-08-06 18:20:17,965 : INFO : -7.614 per-word bound, 195.8 perplexity estimate based on a held-out corpus of 440 documents with 16915 words



Perplexity:  -7.613544123660979


2024-08-06 18:20:18,378 : INFO : -7.614 per-word bound, 195.8 perplexity estimate based on a held-out corpus of 440 documents with 16915 words
2024-08-06 18:20:18,422 : INFO : using ParallelWordOccurrenceAccumulator<processes=9, batch_size=64> to estimate probabilities from sliding windows
2024-08-06 18:20:20,828 : INFO : 9 accumulators retrieved from output queue
2024-08-06 18:20:21,116 : INFO : accumulated word occurrence stats for 1774 virtual documents



Coherence Score:  0.4538564207186806

Document 0 topics:
[(69, 0.9868045)]
Best topic: 69
Top words in this topic:
[('appeal', 0.039537888),
 ('operator', 0.02376595),
 ('number', 0.02376595),
 ('examine', 0.023765948),
 ('hearing', 0.023765946),
 ('dispute', 0.015873138),
 ('set', 0.015873138),
 ('portability', 0.015873138),
 ('body', 0.015873138),
 ('regulatory', 0.015873138)]
Original document: pricing interconnection related number portability concern traffic number ported set incurred mobile telephone operator implement request number porting adoption laying method used calculating fix advance abstract model maximum charged donor...


Document 1 topics:
[(87, 0.9696061)]
Best topic: 87
Top words in this topic:
[('united', 0.04347172),
 ('united_kingdom', 0.043471716),
 ('milk', 0.037284274),
 ('di', 0.037265968),
 ('natural', 0.031074615),
 ('natural_person', 0.031074615),
 ('pdo', 0.0248791),
 ('northern', 0.024871359),
 ('ireland', 0.02365773),
 ('system', 0.021143978)]
Origina