In [2]:
import os
import logging
import pickle
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from gensim.models import Phrases, LdaModel
from gensim.corpora import Dictionary
from gensim.models import CoherenceModel
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
from pprint import pprint
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pandas as pd

class TopicModeler:
    """End-to-end LDA topic-modelling pipeline over a folder of ``.txt`` files.

    The pipeline stages are: load -> tokenise/lemmatise -> add bigrams ->
    TF-IDF based word removal -> dictionary/corpus -> LDA training ->
    reporting, persistence and visualisation.

    Attributes are populated progressively by the stage methods:
    ``docs`` (raw strings, then token lists), ``filtered_docs``,
    ``dictionary``, ``corpus``, ``model`` and ``coherence_scores`` (a list of
    ``(num_topics, coherence)`` tuples, one per completed run).
    """

    def __init__(self, input_path):
        """Store the input folder and set up the NLP helpers and logging.

        Args:
            input_path: Directory that contains the ``.txt`` documents.
        """
        self.input_path = input_path
        self.docs = []
        self.filtered_docs = []
        self.dictionary = None
        self.corpus = None
        self.model = None
        self.coherence_scores = []
        self.tokenizer = RegexpTokenizer(r'\w+')
        self.lemmatizer = WordNetLemmatizer()
        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    @staticmethod
    def _ensure_parent_dir(file_path):
        """Create the parent directory of ``file_path`` if it has one."""
        parent = os.path.dirname(file_path)
        # os.makedirs('') raises FileNotFoundError, so a bare filename
        # (target in the current directory) must skip directory creation.
        if parent:
            os.makedirs(parent, exist_ok=True)

    @staticmethod
    def _top_scoring_indices(scores, top_n):
        """Return the indices of the ``top_n`` largest values in ``scores``.

        A non-positive ``top_n`` selects nothing. (A plain ``[-top_n:]`` slice
        would select *everything* for ``top_n == 0``.)
        """
        if top_n <= 0:
            return np.array([], dtype=int)
        return np.argsort(scores)[-top_n:]

    @staticmethod
    def _write_frequency_csv(frequencies, file_path, ascending):
        """Write a ``{word: frequency}`` mapping to CSV sorted by frequency."""
        frame = pd.DataFrame(list(frequencies.items()), columns=['Word', 'Frequency'])
        frame = frame.sort_values('Frequency', ascending=ascending)
        frame.to_csv(file_path, index=False)

    def load_documents(self):
        """Read every ``.txt`` file in ``input_path`` into ``self.docs``.

        ``self.docs`` is replaced (not appended to), so calling this method
        repeatedly never accumulates duplicates or mixes raw strings with
        previously tokenised documents. Files are read in sorted name order so
        that the document order is reproducible across runs and platforms.
        """
        documents = []
        for filename in sorted(os.listdir(self.input_path)):
            if not filename.endswith('.txt'):
                continue
            file_path = os.path.join(self.input_path, filename)
            if not os.path.isfile(file_path):
                continue
            with open(file_path, 'r', encoding='utf-8') as f:
                documents.append(f.read().strip())
        self.docs = documents
        print(f"Loaded {len(self.docs)} documents")
        if self.docs:
            print(f"Sample document: {self.docs[0][:100]}...")

    def preprocess_documents(self):
        """Turn each raw document string in ``self.docs`` into a token list.

        Per document: tokenise on ``\\w+``, lowercase, drop purely numeric and
        single-character tokens, drop English stop words, then lemmatise.
        Entries of ``self.docs`` that are not strings are skipped.
        """
        stop_words = set(stopwords.words('english'))
        lemmatize = self.lemmatizer.lemmatize
        processed = []
        for doc in self.docs:
            if not isinstance(doc, str):
                continue
            lowered = (token.lower() for token in self.tokenizer.tokenize(doc))
            processed.append([
                lemmatize(token)
                for token in lowered
                if not token.isnumeric() and len(token) > 1 and token not in stop_words
            ])
        self.docs = processed
        print(f"After preprocessing: {len(self.docs)} documents")
        if self.docs:
            print(f"Sample preprocessed document: {self.docs[0][:10]}...")

    def add_bigrams(self, min_count=5, threshold=100):
        """Append detected bigram tokens (``word_word``) to each document.

        Args:
            min_count: Passed to ``Phrases`` as ``min_count``.
            threshold: Passed to ``Phrases`` as ``threshold``.
        """
        if not self.docs:
            print("No documents available; skipping bigram detection.")
            return
        bigram = Phrases(self.docs, min_count=min_count, threshold=threshold)
        for doc in self.docs:
            # Materialise the extra tokens first so the document is not
            # extended while the phrase model is still reading it.
            extra_tokens = [token for token in bigram[doc] if '_' in token]
            doc.extend(extra_tokens)
        print(f"Added bigrams. Sample document: {self.docs[0][:15]}...")

    def filter_with_tfidf(self, top_n=200, bottom_n=50):
        """Drop the ``top_n`` words with the highest mean TF-IDF score.

        The result is stored in ``self.filtered_docs``; ``self.docs`` is left
        untouched.

        Args:
            top_n: Number of highest-scoring words to remove. Values <= 0
                remove nothing.
            bottom_n: Accepted for backward compatibility only. The lowest
                scoring words are not removed by this method.
        """
        if not self.docs:
            self.filtered_docs = []
            print("No documents available; skipping TF-IDF filtering.")
            return

        texts = [' '.join(doc) for doc in self.docs]
        tfidf_vectorizer = TfidfVectorizer()
        tfidf_matrix = tfidf_vectorizer.fit_transform(texts)
        feature_names = tfidf_vectorizer.get_feature_names_out()
        mean_tfidf_scores = np.asarray(tfidf_matrix.mean(axis=0)).flatten()

        top_indices = self._top_scoring_indices(mean_tfidf_scores, top_n)
        words_to_remove = {feature_names[i] for i in top_indices}

        self.filtered_docs = [[word for word in doc if word not in words_to_remove] for doc in self.docs]
        print(f"Filtered documents using TF-IDF. Sample filtered document: {self.filtered_docs[0][:10]}...")

    def create_dictionary_and_corpus(self):
        """Build the gensim dictionary and bag-of-words corpus.

        Tokens that occur in fewer than 2 documents or in more than 90% of
        the documents are removed from the dictionary before the corpus is
        built from ``self.filtered_docs``.
        """
        self.dictionary = Dictionary(self.filtered_docs)
        original_size = len(self.dictionary)
        self.dictionary.filter_extremes(no_below=2, no_above=0.9)
        self.corpus = [self.dictionary.doc2bow(doc) for doc in self.filtered_docs]
        print(f"Dictionary size: {len(self.dictionary)} (reduced from {original_size})")
        print(f"Corpus size: {len(self.corpus)}")
        if self.corpus:
            print(f"Sample corpus entry: {self.corpus[0][:10]}")

    def train_lda_model(self, num_topics=6, chunksize=2000, passes=20, iterations=400, alpha='auto', eta='auto'):
        """Train an ``LdaModel`` on the current corpus and keep it in ``self.model``.

        Args:
            num_topics, chunksize, passes, iterations, alpha, eta: Forwarded
                unchanged to ``LdaModel``.

        Raises:
            ValueError: If the corpus or the dictionary is missing or empty.
        """
        if not self.corpus or not self.dictionary:
            raise ValueError("Corpus or dictionary is empty. Check your preprocessing steps.")

        print(f"Training LDA model with num_topics={num_topics}, chunksize={chunksize}, passes={passes}, iterations={iterations}, alpha={alpha}, eta={eta}")

        self.model = LdaModel(
            corpus=self.corpus,
            id2word=self.dictionary,
            chunksize=chunksize,
            alpha=alpha,
            eta=eta,
            iterations=iterations,
            num_topics=num_topics,
            passes=passes,
            eval_every=None,
            random_state=42
        )
        print("LDA model trained successfully")

    def print_model_info(self):
        """Print summary statistics for the trained model.

        Returns:
            The ``c_v`` coherence score computed by ``CoherenceModel`` over
            ``self.filtered_docs``.
        """
        print(f"Number of unique tokens: {len(self.dictionary)}")
        print(f"Number of documents: {len(self.corpus)}")
        top_topics = self.model.top_topics(self.corpus)
        avg_topic_coherence = sum(topic[1] for topic in top_topics) / self.model.num_topics
        print(f"Average topic coherence: {avg_topic_coherence:.4f}.")
        print("\nTop topics:")
        pprint(self.model.print_topics())
        print('\nPerplexity: ', self.model.log_perplexity(self.corpus))
        coherence_model_lda = CoherenceModel(model=self.model, texts=self.filtered_docs, dictionary=self.dictionary, coherence='c_v')
        coherence_lda = coherence_model_lda.get_coherence()
        print('\nCoherence Score: ', coherence_lda)
        return coherence_lda

    def visualize_topics(self):
        """Return the pyLDAvis prepared data for the current model."""
        return pyLDAvis.gensim.prepare(self.model, self.corpus, self.dictionary)

    def analyze_documents(self, num_docs=5):
        """Print the topic mixture of the first ``num_docs`` filtered documents.

        Args:
            num_docs: How many leading documents to report on.
        """
        for i, doc in enumerate(self.filtered_docs[:num_docs]):
            bow = self.dictionary.doc2bow(doc)
            doc_topics = self.model.get_document_topics(bow)
            print(f"\nDocument {i} topics:")
            pprint(doc_topics)
            # max() raises on an empty sequence, so a document for which the
            # model reports no topics is skipped instead of aborting the loop.
            if not doc_topics:
                print("No topic assigned to this document.")
                print()
                continue
            best_topic = max(doc_topics, key=lambda x: x[1])
            print(f"Best topic: {best_topic[0]}")
            print("Top words in this topic:")
            pprint(self.model.show_topic(best_topic[0]))
            print(f"Original document: {' '.join(doc[:30])}...")
            print()

    def save_model(self, file_path):
        """Save the trained model to ``file_path``.

        Raises:
            ValueError: If no model has been trained or loaded yet.
        """
        if self.model is None:
            raise ValueError("No model has been trained yet.")
        self._ensure_parent_dir(file_path)
        self.model.save(file_path)
        print(f"Model saved to {file_path}")

    def load_model(self, file_path):
        """Load a previously saved model from ``file_path``.

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"No model file found at {file_path}")
        self.model = LdaModel.load(file_path)
        print(f"Model loaded from {file_path}")

    def save_dictionary_and_corpus(self, dict_path, corpus_path):
        """Persist the dictionary (gensim format) and the corpus (pickle).

        Raises:
            ValueError: If the dictionary or corpus has not been created.
        """
        if self.dictionary is None or self.corpus is None:
            raise ValueError("Dictionary and corpus have not been created yet.")
        self._ensure_parent_dir(dict_path)
        self._ensure_parent_dir(corpus_path)
        self.dictionary.save(dict_path)
        with open(corpus_path, 'wb') as f:
            pickle.dump(self.corpus, f)
        print(f"Dictionary saved to {dict_path}")
        print(f"Corpus saved to {corpus_path}")

    def load_dictionary_and_corpus(self, dict_path, corpus_path):
        """Load a dictionary and corpus written by ``save_dictionary_and_corpus``.

        Raises:
            FileNotFoundError: If either file does not exist.
        """
        if not os.path.exists(dict_path) or not os.path.exists(corpus_path):
            raise FileNotFoundError("Dictionary or corpus file not found.")
        self.dictionary = Dictionary.load(dict_path)
        # SECURITY: pickle.load executes arbitrary code embedded in the file.
        # Only load corpus files that this pipeline itself produced.
        with open(corpus_path, 'rb') as f:
            self.corpus = pickle.load(f)
        print(f"Dictionary loaded from {dict_path}")
        print(f"Corpus loaded from {corpus_path}")

    def plot_coherence_scores(self, iteration):
        """Plot coherence against topic count and save it as a PNG.

        The file is written to ``coherence_scores_iteration_<iteration>.png``
        in the current working directory. Nothing is plotted when no scores
        have been recorded yet.

        Args:
            iteration: Value embedded in the output file name.
        """
        if not self.coherence_scores:
            print("No coherence scores recorded yet; skipping plot.")
            return

        num_topics = [score[0] for score in self.coherence_scores]
        coherences = [score[1] for score in self.coherence_scores]

        plt.figure(figsize=(10, 6))
        plt.plot(num_topics, coherences, marker='o')
        plt.xlabel('Number of Topics')
        plt.ylabel('Coherence Score')
        plt.title('Coherence Score by Number of Topics')
        plt.grid(True)

        plt.xticks(num_topics)
        # A fixed 0-based range only makes sense for a positive maximum;
        # otherwise the axis would be inverted or collapsed, so matplotlib's
        # automatic limits are kept instead.
        upper = max(coherences)
        if upper > 0:
            plt.ylim(0, upper * 1.1)

        plt.savefig(f'coherence_scores_iteration_{iteration}.png')
        plt.close()

    def get_word_frequencies(self):
        """Return ``{word: weight}`` for every dictionary word.

        The weight is the column sum of ``model.get_topics()`` divided by
        ``model.num_topics``.
        """
        topics = self.model.get_topics()
        word_frequencies = np.sum(topics, axis=0) / self.model.num_topics
        return {self.dictionary[i]: freq for i, freq in enumerate(word_frequencies)}

    def get_low_frequency_words(self, threshold=0.01):
        """Return the words whose weight is strictly below ``threshold``."""
        word_frequencies = self.get_word_frequencies()
        return {word: freq for word, freq in word_frequencies.items() if freq < threshold}

    def get_high_frequency_words(self, threshold=0.00016):
        """Return the words whose weight is strictly above ``threshold``.

        A notice is printed when no word qualifies.
        """
        word_frequencies = self.get_word_frequencies()
        high_freq_words = {word: freq for word, freq in word_frequencies.items() if freq > threshold}

        if not high_freq_words:
            print(f"No words found with a frequency higher than the threshold of {threshold}.")

        return high_freq_words

    def save_frequency_words(self, low_freq_file, high_freq_file, low_threshold=0.01, high_threshold=0.1):
        """Write the low- and high-weight words of the current model to CSV.

        Args:
            low_freq_file: Output path for words below ``low_threshold``
                (sorted ascending by weight).
            high_freq_file: Output path for words above ``high_threshold``
                (sorted descending by weight).
            low_threshold: Upper bound (exclusive) for the low list.
            high_threshold: Lower bound (exclusive) for the high list.
        """
        low_freq_words = self.get_low_frequency_words(low_threshold)
        high_freq_words = self.get_high_frequency_words(high_threshold)

        self._ensure_parent_dir(low_freq_file)
        self._write_frequency_csv(low_freq_words, low_freq_file, ascending=True)
        print(f"Low frequency words saved to {low_freq_file}")

        self._ensure_parent_dir(high_freq_file)
        self._write_frequency_csv(high_freq_words, high_freq_file, ascending=False)
        print(f"High frequency words saved to {high_freq_file}")

    def run_with_params(self, num_topics, chunksize, passes, iterations):
        """Run the whole pipeline once for a single parameter combination.

        Artifacts (model, dictionary, corpus, pyLDAvis HTML) are written to
        ``lda_models/topics_<n>_passes_<p>_iterations_<i>``; the coherence
        plot is written to the current working directory.

        Returns:
            The pyLDAvis prepared data for the trained model.

        Raises:
            Exception: Any error from a pipeline stage is reported on stdout
                and then re-raised unchanged.
        """
        try:
            self.load_documents()
            self.preprocess_documents()
            self.add_bigrams()
            self.filter_with_tfidf(top_n=200, bottom_n=50)
            self.create_dictionary_and_corpus()
            self.train_lda_model(num_topics=num_topics, chunksize=chunksize, passes=passes, iterations=iterations)
            model_dir = f"lda_models/topics_{num_topics}_passes_{passes}_iterations_{iterations}"
            self.save_model(os.path.join(model_dir, 'trained_model'))
            self.save_dictionary_and_corpus(os.path.join(model_dir, 'dictionary'), os.path.join(model_dir, 'corpus'))
            coherence_lda = self.print_model_info()
            self.coherence_scores.append((num_topics, coherence_lda))

            # Plot and save coherence score after each run
            self.plot_coherence_scores(len(self.coherence_scores))

            vis = self.visualize_topics()
            pyLDAvis.save_html(vis, os.path.join(model_dir, 'lda_visualization.html'))
            self.analyze_documents()

            return vis
        except Exception as e:
            print(f"An error occurred: {str(e)}")
            raise

if __name__ == "__main__":
    # Driver: train one LDA model per parameter set on the documents in
    # 'txt', then export word-weight CSVs for the model trained last.
    modeler = TopicModeler('txt')

    # Each tuple is (num_topics, chunksize, passes, iterations).
    param_combinations = [
        (6, 2000, 10, 200),
        (8, 2000, 10, 400),
        (10, 2000, 20, 200),
        (12, 2000, 20, 400),
        (14, 2000, 20, 400),
        (16, 2000, 20, 400),
    ]

    csv_folder = 'csv_output'
    os.makedirs(csv_folder, exist_ok=True)

    for params in param_combinations:
        num_topics, chunksize, passes, iterations = params
        print(f"\nRunning model with parameters: num_topics={num_topics}, chunksize={chunksize}, passes={passes}, iterations={iterations}")
        modeler.run_with_params(*params)

    # Only the most recently trained model is still held by the modeler here.
    modeler.save_frequency_words(
        os.path.join(csv_folder, 'low_freq_words.csv'),
        os.path.join(csv_folder, 'high_freq_words.csv'),
        low_threshold=0.01,
        high_threshold=0.00016,
    )

2024-08-06 18:31:30,914 : INFO : collecting all words and their counts
2024-08-06 18:31:30,914 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types
2024-08-06 18:31:30,935 : INFO : collected 23445 token types (unigram + bigrams) from a corpus of 44718 words and 440 sentences



Running model with parameters: num_topics=6, chunksize=2000, passes=10, iterations=200
Loaded 440 documents
Sample document: Pricing for interconnection related to the provision of number portability, as referred to in Articl...
After preprocessing: 440 documents
Sample preprocessed document: ['pricing', 'interconnection', 'related', 'provision', 'number', 'portability', 'referred', 'article', 'directive', 'ec']...


2024-08-06 18:31:30,936 : INFO : merged Phrases<23445 vocab, min_count=5, threshold=100, max_vocab_size=40000000>
2024-08-06 18:31:30,936 : INFO : Phrases lifecycle event {'msg': 'built Phrases<23445 vocab, min_count=5, threshold=100, max_vocab_size=40000000> in 0.02s', 'datetime': '2024-08-06T18:31:30.936710', 'gensim': '4.3.2', 'python': '3.11.9 (main, Jul 16 2024, 11:07:54) [Clang 14.0.0 (clang-1400.0.29.202)]', 'platform': 'macOS-12.5-arm64-arm-64bit', 'event': 'created'}
2024-08-06 18:31:31,010 : INFO : adding document #0 to Dictionary<0 unique tokens: []>
2024-08-06 18:31:31,021 : INFO : built Dictionary<3466 unique tokens: ['abstract', 'adoption', 'advance', 'appeal', 'body']...> from 440 documents (total 19293 corpus positions)
2024-08-06 18:31:31,022 : INFO : Dictionary lifecycle event {'msg': "built Dictionary<3466 unique tokens: ['abstract', 'adoption', 'advance', 'appeal', 'body']...> from 440 documents (total 19293 corpus positions)", 'datetime': '2024-08-06T18:31:31.02232

Added bigrams. Sample document: ['pricing', 'interconnection', 'related', 'provision', 'number', 'portability', 'referred', 'article', 'directive', 'ec', 'european', 'parliament', 'council', 'march', 'universal']...
Filtered documents using TF-IDF. Sample filtered document: ['pricing', 'interconnection', 'related', 'number', 'portability', 'concern', 'traffic', 'number', 'ported', 'set']...
Dictionary size: 1835 (reduced from 3466)
Corpus size: 440
Sample corpus entry: [(0, 1), (1, 1), (2, 3), (3, 2), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1)]
Training LDA model with num_topics=6, chunksize=2000, passes=10, iterations=200, alpha=auto, eta=auto


2024-08-06 18:31:31,210 : INFO : optimized alpha [0.08766382, 0.0900195, 0.08876704, 0.09682601, 0.10300999, 0.09258881]
2024-08-06 18:31:31,211 : INFO : topic #0 (0.088): 0.007*"property" + 0.006*"emergency" + 0.005*"number" + 0.005*"call" + 0.005*"advertiser" + 0.005*"advertisement" + 0.005*"trade_mark" + 0.004*"single" + 0.004*"immovable" + 0.004*"data"
2024-08-06 18:31:31,212 : INFO : topic #2 (0.089): 0.006*"set" + 0.005*"established" + 0.005*"january" + 0.005*"consequence" + 0.004*"procedure" + 0.004*"ass" + 0.004*"operator" + 0.004*"ensure" + 0.004*"determine" + 0.004*"result"
2024-08-06 18:31:31,212 : INFO : topic #5 (0.093): 0.008*"currency" + 0.007*"foreign" + 0.007*"spain" + 0.006*"kingdom_spain" + 0.006*"device" + 0.006*"exchange" + 0.004*"production" + 0.004*"financial" + 0.004*"vehicle" + 0.004*"provide"
2024-08-06 18:31:31,213 : INFO : topic #3 (0.097): 0.008*"le" + 0.007*"et" + 0.007*"take" + 0.006*"la" + 0.005*"en" + 0.005*"reference" + 0.004*"nature" + 0.004*"conseque

LDA model trained successfully
Model saved to lda_models/topics_6_passes_10_iterations_200/trained_model
Dictionary saved to lda_models/topics_6_passes_10_iterations_200/dictionary
Corpus saved to lda_models/topics_6_passes_10_iterations_200/corpus
Number of unique tokens: 1835
Number of documents: 440
Average topic coherence: -10.7836.

Top topics:
[(0,
  '0.009*"property" + 0.007*"comparative" + 0.007*"number" + 0.007*"3a" + '
  '0.007*"trade_mark" + 0.007*"advertiser" + 0.006*"call" + '
  '0.006*"advertisement" + 0.005*"misleading" + 0.005*"emergency"'),
 (1,
  '0.013*"trade_mark" + 0.007*"grand" + 0.007*"agricultural" + '
  '0.006*"producer" + 0.006*"protected_designation" + 0.005*"proprietor" + '
  '0.005*"fact" + 0.005*"third" + 0.005*"television" + 0.005*"luxembourg"'),
 (2,
  '0.006*"damage" + 0.006*"set" + 0.006*"january" + 0.006*"established" + '
  '0.006*"procedure" + 0.005*"ensure" + 0.005*"consequence" + 0.005*"based" + '
  '0.005*"laying" + 0.005*"ass"'),
 (3,
  '0.009*"t

2024-08-06 18:31:33,688 : INFO : 9 accumulators retrieved from output queue
2024-08-06 18:31:33,700 : INFO : accumulated word occurrence stats for 1824 virtual documents



Coherence Score:  0.3583036255904722


2024-08-06 18:31:34,122 : INFO : collecting all words and their counts
2024-08-06 18:31:34,123 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types
2024-08-06 18:31:34,144 : INFO : collected 23445 token types (unigram + bigrams) from a corpus of 44718 words and 440 sentences
2024-08-06 18:31:34,144 : INFO : merged Phrases<23445 vocab, min_count=5, threshold=100, max_vocab_size=40000000>
2024-08-06 18:31:34,144 : INFO : Phrases lifecycle event {'msg': 'built Phrases<23445 vocab, min_count=5, threshold=100, max_vocab_size=40000000> in 0.02s', 'datetime': '2024-08-06T18:31:34.144886', 'gensim': '4.3.2', 'python': '3.11.9 (main, Jul 16 2024, 11:07:54) [Clang 14.0.0 (clang-1400.0.29.202)]', 'platform': 'macOS-12.5-arm64-arm-64bit', 'event': 'created'}



Document 0 topics:
[(1, 0.21231648), (2, 0.78424114)]
Best topic: 2
Top words in this topic:
[('damage', 0.006495846),
 ('set', 0.006147506),
 ('january', 0.0057626157),
 ('established', 0.005616492),
 ('procedure', 0.005580976),
 ('ensure', 0.0050364644),
 ('consequence', 0.005009744),
 ('based', 0.004977024),
 ('laying', 0.0048813173),
 ('ass', 0.0047441856)]
Original document: pricing interconnection related number portability concern traffic number ported set incurred mobile telephone operator implement request number porting adoption laying method used calculating fix advance abstract model maximum charged donor...


Document 1 topics:
[(3, 0.5696721), (4, 0.42285165)]
Best topic: 3
Top words in this topic:
[('take', 0.008505372),
 ('le', 0.007880409),
 ('et', 0.006602751),
 ('taking', 0.006124346),
 ('la', 0.005991514),
 ('taken', 0.0056198034),
 ('nature', 0.0056108716),
 ('required', 0.0049055726),
 ('reference', 0.004830231),
 ('mortgage', 0.004754349)]
Original document: nat

2024-08-06 18:31:34,203 : INFO : adding document #0 to Dictionary<0 unique tokens: []>
2024-08-06 18:31:34,212 : INFO : built Dictionary<3466 unique tokens: ['abstract', 'adoption', 'advance', 'appeal', 'body']...> from 440 documents (total 19293 corpus positions)
2024-08-06 18:31:34,213 : INFO : Dictionary lifecycle event {'msg': "built Dictionary<3466 unique tokens: ['abstract', 'adoption', 'advance', 'appeal', 'body']...> from 440 documents (total 19293 corpus positions)", 'datetime': '2024-08-06T18:31:34.213168', 'gensim': '4.3.2', 'python': '3.11.9 (main, Jul 16 2024, 11:07:54) [Clang 14.0.0 (clang-1400.0.29.202)]', 'platform': 'macOS-12.5-arm64-arm-64bit', 'event': 'created'}
2024-08-06 18:31:34,214 : INFO : discarding 1631 tokens: [('abstract', 1), ('confidential', 1), ('confidentiality', 1), ('dissuaded', 1), ('donor', 1), ('ported', 1), ('porting', 1), ('reaching', 1), ('becomes', 1), ('referral', 1)]...
2024-08-06 18:31:34,214 : INFO : keeping 1835 tokens which were in no les

Filtered documents using TF-IDF. Sample filtered document: ['pricing', 'interconnection', 'related', 'number', 'portability', 'concern', 'traffic', 'number', 'ported', 'set']...
Dictionary size: 1835 (reduced from 3466)
Corpus size: 440
Sample corpus entry: [(0, 1), (1, 1), (2, 3), (3, 2), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1)]
Training LDA model with num_topics=8, chunksize=2000, passes=10, iterations=400, alpha=auto, eta=auto


2024-08-06 18:31:34,448 : INFO : optimized alpha [0.073119625, 0.06654704, 0.06868109, 0.07154452, 0.07593994, 0.06860237, 0.065195076, 0.06821899]
2024-08-06 18:31:34,449 : INFO : topic #6 (0.065): 0.014*"trade_mark" + 0.006*"proprietor" + 0.005*"unfairness" + 0.005*"number" + 0.004*"also" + 0.004*"consideration" + 0.004*"absence" + 0.004*"operator" + 0.004*"conclusion" + 0.004*"manufacturer"
2024-08-06 18:31:34,450 : INFO : topic #1 (0.067): 0.009*"trade_mark" + 0.009*"producer" + 0.006*"fact" + 0.005*"agricultural" + 0.005*"used" + 0.005*"television" + 0.005*"grand" + 0.005*"comparative" + 0.005*"system" + 0.005*"grand_duchy"
2024-08-06 18:31:34,450 : INFO : topic #3 (0.072): 0.010*"le" + 0.009*"et" + 0.008*"take" + 0.007*"la" + 0.006*"en" + 0.006*"mortgage" + 0.005*"assurance" + 0.005*"taking" + 0.005*"provider" + 0.005*"nature"
2024-08-06 18:31:34,450 : INFO : topic #0 (0.073): 0.007*"property" + 0.007*"number" + 0.006*"emergency" + 0.006*"call" + 0.006*"advertiser" + 0.005*"adver

LDA model trained successfully
Model saved to lda_models/topics_8_passes_10_iterations_400/trained_model
Dictionary saved to lda_models/topics_8_passes_10_iterations_400/dictionary
Corpus saved to lda_models/topics_8_passes_10_iterations_400/corpus
Number of unique tokens: 1835
Number of documents: 440
Average topic coherence: -11.3216.

Top topics:
[(0,
  '0.009*"property" + 0.008*"number" + 0.008*"advertiser" + 0.007*"call" + '
  '0.006*"3a" + 0.006*"advertisement" + 0.006*"comparative" + '
  '0.006*"telephone" + 0.006*"emergency" + 0.006*"competitor"'),
 (1,
  '0.011*"producer" + 0.007*"agricultural" + 0.007*"luxembourg" + '
  '0.007*"television" + 0.007*"grand" + 0.007*"fact" + 0.007*"trade_mark" + '
  '0.007*"grand_duchy" + 0.007*"duchy" + 0.006*"french"'),
 (2,
  '0.007*"procedure" + 0.007*"set" + 0.007*"ass" + 0.006*"consequence" + '
  '0.006*"january" + 0.006*"registration" + 0.006*"established" + '
  '0.005*"ensure" + 0.005*"annex" + 0.005*"place"'),
 (3,
  '0.010*"le" + 0.009

2024-08-06 18:31:36,911 : INFO : 9 accumulators retrieved from output queue
2024-08-06 18:31:36,929 : INFO : accumulated word occurrence stats for 1824 virtual documents



Coherence Score:  0.3759477806944998


2024-08-06 18:31:37,415 : INFO : collecting all words and their counts
2024-08-06 18:31:37,415 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types
2024-08-06 18:31:37,438 : INFO : collected 23445 token types (unigram + bigrams) from a corpus of 44718 words and 440 sentences
2024-08-06 18:31:37,439 : INFO : merged Phrases<23445 vocab, min_count=5, threshold=100, max_vocab_size=40000000>
2024-08-06 18:31:37,441 : INFO : Phrases lifecycle event {'msg': 'built Phrases<23445 vocab, min_count=5, threshold=100, max_vocab_size=40000000> in 0.03s', 'datetime': '2024-08-06T18:31:37.440975', 'gensim': '4.3.2', 'python': '3.11.9 (main, Jul 16 2024, 11:07:54) [Clang 14.0.0 (clang-1400.0.29.202)]', 'platform': 'macOS-12.5-arm64-arm-64bit', 'event': 'created'}



Document 0 topics:
[(0, 0.10452548), (2, 0.33347225), (6, 0.55829585)]
Best topic: 6
Top words in this topic:
[('trade_mark', 0.021743305),
 ('proprietor', 0.011138862),
 ('operator', 0.0065387497),
 ('reputation', 0.006310357),
 ('unfairness', 0.0052320594),
 ('affect', 0.004789331),
 ('taken', 0.004778463),
 ('consideration', 0.0046713366),
 ('number', 0.004647177),
 ('absence', 0.0044199377)]
Original document: pricing interconnection related number portability concern traffic number ported set incurred mobile telephone operator implement request number porting adoption laying method used calculating fix advance abstract model maximum charged donor...


Document 1 topics:
[(3, 0.9881533)]
Best topic: 3
Top words in this topic:
[('le', 0.010377274),
 ('et', 0.009048616),
 ('take', 0.008410632),
 ('assurance', 0.0069454703),
 ('la', 0.00632705),
 ('travel', 0.0059740264),
 ('en', 0.005971945),
 ('mortgage', 0.005948711),
 ('taking', 0.00581333),
 ('conjunction', 0.005530167)]
Origina

2024-08-06 18:31:37,502 : INFO : adding document #0 to Dictionary<0 unique tokens: []>
2024-08-06 18:31:37,511 : INFO : built Dictionary<3466 unique tokens: ['abstract', 'adoption', 'advance', 'appeal', 'body']...> from 440 documents (total 19293 corpus positions)
2024-08-06 18:31:37,511 : INFO : Dictionary lifecycle event {'msg': "built Dictionary<3466 unique tokens: ['abstract', 'adoption', 'advance', 'appeal', 'body']...> from 440 documents (total 19293 corpus positions)", 'datetime': '2024-08-06T18:31:37.511840', 'gensim': '4.3.2', 'python': '3.11.9 (main, Jul 16 2024, 11:07:54) [Clang 14.0.0 (clang-1400.0.29.202)]', 'platform': 'macOS-12.5-arm64-arm-64bit', 'event': 'created'}
2024-08-06 18:31:37,512 : INFO : discarding 1631 tokens: [('abstract', 1), ('confidential', 1), ('confidentiality', 1), ('dissuaded', 1), ('donor', 1), ('ported', 1), ('porting', 1), ('reaching', 1), ('becomes', 1), ('referral', 1)]...
2024-08-06 18:31:37,513 : INFO : keeping 1835 tokens which were in no les

Dictionary size: 1835 (reduced from 3466)
Corpus size: 440
Sample corpus entry: [(0, 1), (1, 1), (2, 3), (3, 2), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1)]
Training LDA model with num_topics=10, chunksize=2000, passes=20, iterations=200, alpha=auto, eta=auto


2024-08-06 18:31:37,726 : INFO : optimized alpha [0.064869426, 0.064030215, 0.06322545, 0.065602474, 0.06541661, 0.061697606, 0.06146709, 0.060490623, 0.06259955, 0.06729898]
2024-08-06 18:31:37,727 : INFO : topic #7 (0.060): 0.009*"guarantee" + 0.007*"charge" + 0.007*"examine" + 0.006*"mortgage" + 0.005*"event" + 0.005*"year" + 0.005*"unfairness" + 0.004*"required" + 0.004*"mortgage_enforcement" + 0.004*"associated"
2024-08-06 18:31:37,727 : INFO : topic #6 (0.061): 0.016*"trade_mark" + 0.007*"proprietor" + 0.006*"unfairness" + 0.005*"set" + 0.004*"procedure" + 0.004*"water" + 0.004*"la" + 0.004*"existence" + 0.004*"also" + 0.004*"form"
2024-08-06 18:31:37,728 : INFO : topic #4 (0.065): 0.006*"ingredient" + 0.006*"marketed" + 0.006*"mean" + 0.006*"damage" + 0.005*"germany" + 0.005*"natural" + 0.005*"fact" + 0.005*"federal" + 0.005*"federal_republic" + 0.005*"conformity"
2024-08-06 18:31:37,728 : INFO : topic #3 (0.066): 0.012*"le" + 0.011*"et" + 0.008*"la" + 0.007*"used" + 0.007*"refe

LDA model trained successfully
Model saved to lda_models/topics_10_passes_20_iterations_200/trained_model
Dictionary saved to lda_models/topics_10_passes_20_iterations_200/dictionary
Corpus saved to lda_models/topics_10_passes_20_iterations_200/corpus
Number of unique tokens: 1835
Number of documents: 440
Average topic coherence: -10.1929.

Top topics:
[(0,
  '0.009*"comparative" + 0.009*"3a" + 0.008*"call" + 0.008*"advertiser" + '
  '0.007*"emergency" + 0.007*"misleading" + 0.006*"advertisement" + '
  '0.006*"comparative_advertising" + 0.006*"chain" + 0.006*"competitor"'),
 (1,
  '0.013*"producer" + 0.009*"registration" + 0.008*"designation_origin" + '
  '0.008*"television" + 0.008*"grand" + 0.008*"duchy" + 0.008*"grand_duchy" + '
  '0.008*"luxembourg" + 0.007*"protected_designation" + 0.007*"fact"'),
 (2,
  '0.009*"annex" + 0.008*"framework" + 0.007*"ensure" + 0.007*"ass" + '
  '0.007*"conjunction" + 0.007*"january" + 0.007*"water" + 0.006*"procedure" + '
  '0.006*"description" + 0.0

2024-08-06 18:31:40,561 : INFO : 9 accumulators retrieved from output queue
2024-08-06 18:31:40,587 : INFO : accumulated word occurrence stats for 1824 virtual documents



Coherence Score:  0.3788098961279306


2024-08-06 18:31:41,214 : INFO : collecting all words and their counts
2024-08-06 18:31:41,215 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types
2024-08-06 18:31:41,236 : INFO : collected 23445 token types (unigram + bigrams) from a corpus of 44718 words and 440 sentences
2024-08-06 18:31:41,236 : INFO : merged Phrases<23445 vocab, min_count=5, threshold=100, max_vocab_size=40000000>
2024-08-06 18:31:41,237 : INFO : Phrases lifecycle event {'msg': 'built Phrases<23445 vocab, min_count=5, threshold=100, max_vocab_size=40000000> in 0.02s', 'datetime': '2024-08-06T18:31:41.237232', 'gensim': '4.3.2', 'python': '3.11.9 (main, Jul 16 2024, 11:07:54) [Clang 14.0.0 (clang-1400.0.29.202)]', 'platform': 'macOS-12.5-arm64-arm-64bit', 'event': 'created'}
2024-08-06 18:31:41,293 : INFO : adding document #0 to Dictionary<0 unique tokens: []>



Document 0 topics:
[(8, 0.995089)]
Best topic: 8
Top words in this topic:
[('provider', 0.01137076),
 ('directory', 0.00975163),
 ('data', 0.009127536),
 ('number', 0.007894769),
 ('property', 0.0077764527),
 ('regulatory', 0.0073230793),
 ('framework', 0.006753634),
 ('subscriber', 0.0067521925),
 ('hellenic', 0.0061429753),
 ('hellenic_republic', 0.0061429753)]
Original document: pricing interconnection related number portability concern traffic number ported set incurred mobile telephone operator implement request number porting adoption laying method used calculating fix advance abstract model maximum charged donor...


Document 1 topics:
[(9, 0.988732)]
Best topic: 9
Top words in this topic:
[('conjunction', 0.008740721),
 ('read_conjunction', 0.008734049),
 ('substance', 0.0072487597),
 ('laying', 0.007225366),
 ('vehicle', 0.006892279),
 ('device', 0.006890655),
 ('required', 0.00665859),
 ('electronic_commerce', 0.0063260077),
 ('commerce', 0.006300902),
 ('society', 0.0061005

2024-08-06 18:31:41,302 : INFO : built Dictionary<3466 unique tokens: ['abstract', 'adoption', 'advance', 'appeal', 'body']...> from 440 documents (total 19293 corpus positions)
2024-08-06 18:31:41,303 : INFO : Dictionary lifecycle event {'msg': "built Dictionary<3466 unique tokens: ['abstract', 'adoption', 'advance', 'appeal', 'body']...> from 440 documents (total 19293 corpus positions)", 'datetime': '2024-08-06T18:31:41.303378', 'gensim': '4.3.2', 'python': '3.11.9 (main, Jul 16 2024, 11:07:54) [Clang 14.0.0 (clang-1400.0.29.202)]', 'platform': 'macOS-12.5-arm64-arm-64bit', 'event': 'created'}
2024-08-06 18:31:41,304 : INFO : discarding 1631 tokens: [('abstract', 1), ('confidential', 1), ('confidentiality', 1), ('dissuaded', 1), ('donor', 1), ('ported', 1), ('porting', 1), ('reaching', 1), ('becomes', 1), ('referral', 1)]...
2024-08-06 18:31:41,304 : INFO : keeping 1835 tokens which were in no less than 2 and no more than 396 (=90.0%) documents
2024-08-06 18:31:41,305 : INFO : resul

Dictionary size: 1835 (reduced from 3466)
Corpus size: 440
Sample corpus entry: [(0, 1), (1, 1), (2, 3), (3, 2), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1)]
Training LDA model with num_topics=12, chunksize=2000, passes=20, iterations=400, alpha=auto, eta=auto


2024-08-06 18:31:41,532 : INFO : optimized alpha [0.05866857, 0.05653394, 0.055403393, 0.057497937, 0.058518127, 0.055966236, 0.056930613, 0.05416912, 0.05512538, 0.061643116, 0.05579259, 0.055817783]
2024-08-06 18:31:41,534 : INFO : topic #7 (0.054): 0.010*"guarantee" + 0.009*"charge" + 0.007*"manufacturer" + 0.006*"reference" + 0.005*"july_approximation" + 0.005*"equipment" + 0.005*"damage" + 0.005*"informed" + 0.005*"purchase" + 0.005*"reasonably"
2024-08-06 18:31:41,534 : INFO : topic #8 (0.055): 0.012*"provider" + 0.011*"data" + 0.009*"property" + 0.008*"individual" + 0.007*"directory" + 0.006*"regulatory" + 0.006*"territory" + 0.006*"fact" + 0.006*"framework" + 0.005*"subscriber"
2024-08-06 18:31:41,534 : INFO : topic #4 (0.059): 0.010*"marketed" + 0.008*"packaging" + 0.007*"conformity" + 0.007*"labelling_presentation" + 0.006*"federal" + 0.006*"germany" + 0.006*"reference" + 0.006*"ingredient" + 0.006*"federal_republic" + 0.005*"designation_origin"
2024-08-06 18:31:41,535 : INFO

LDA model trained successfully
Model saved to lda_models/topics_12_passes_20_iterations_400/trained_model
Dictionary saved to lda_models/topics_12_passes_20_iterations_400/dictionary
Corpus saved to lda_models/topics_12_passes_20_iterations_400/corpus
Number of unique tokens: 1835
Number of documents: 440
Average topic coherence: -10.6570.

Top topics:
[(0,
  '0.010*"call" + 0.010*"number" + 0.009*"emergency" + 0.007*"advertiser" + '
  '0.007*"advertisement" + 0.007*"property" + 0.006*"single" + 0.005*"chain" + '
  '0.005*"caller" + 0.005*"retail"'),
 (1,
  '0.010*"television" + 0.010*"luxembourg" + 0.010*"grand" + '
  '0.009*"grand_duchy" + 0.009*"duchy" + 0.009*"designation_origin" + '
  '0.008*"comparative" + 0.008*"used" + 0.007*"protected_designation" + '
  '0.007*"system"'),
 (2,
  '0.009*"framework" + 0.009*"conjunction" + 0.008*"procedure" + '
  '0.007*"dispute" + 0.007*"read_conjunction" + 0.007*"access" + 0.007*"pre" + '
  '0.007*"january" + 0.007*"annex" + 0.007*"judicial"')

2024-08-06 18:31:44,339 : INFO : 9 accumulators retrieved from output queue
2024-08-06 18:31:44,370 : INFO : accumulated word occurrence stats for 1824 virtual documents



Coherence Score:  0.3688219526204162


2024-08-06 18:31:44,902 : INFO : collecting all words and their counts
2024-08-06 18:31:44,902 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types
2024-08-06 18:31:44,923 : INFO : collected 23445 token types (unigram + bigrams) from a corpus of 44718 words and 440 sentences
2024-08-06 18:31:44,923 : INFO : merged Phrases<23445 vocab, min_count=5, threshold=100, max_vocab_size=40000000>
2024-08-06 18:31:44,924 : INFO : Phrases lifecycle event {'msg': 'built Phrases<23445 vocab, min_count=5, threshold=100, max_vocab_size=40000000> in 0.02s', 'datetime': '2024-08-06T18:31:44.924127', 'gensim': '4.3.2', 'python': '3.11.9 (main, Jul 16 2024, 11:07:54) [Clang 14.0.0 (clang-1400.0.29.202)]', 'platform': 'macOS-12.5-arm64-arm-64bit', 'event': 'created'}
2024-08-06 18:31:44,980 : INFO : adding document #0 to Dictionary<0 unique tokens: []>
2024-08-06 18:31:44,990 : INFO : built Dictionary<3466 unique tokens: ['abstract', 'adoption', 'advance', 'appeal', 'body']...> from 440 do


Document 0 topics:
[(8, 0.9942098)]
Best topic: 8
Top words in this topic:
[('provider', 0.017940998),
 ('directory', 0.0118467305),
 ('property', 0.011166428),
 ('data', 0.011052412),
 ('belgium', 0.00960188),
 ('regulatory', 0.009363188),
 ('kingdom_belgium', 0.008880924),
 ('operator', 0.0083980225),
 ('subscriber', 0.008163488),
 ('individual', 0.007936826)]
Original document: pricing interconnection related number portability concern traffic number ported set incurred mobile telephone operator implement request number porting adoption laying method used calculating fix advance abstract model maximum charged donor...


Document 1 topics:
[(9, 0.5778544), (10, 0.41000023)]
Best topic: 9
Top words in this topic:
[('conjunction', 0.009253995),
 ('read_conjunction', 0.008897175),
 ('laying', 0.007744883),
 ('vehicle', 0.0075044567),
 ('consequence', 0.007087648),
 ('required', 0.0067844735),
 ('device', 0.006722809),
 ('second', 0.006593387),
 ('need', 0.0061250706),
 ('procedure', 0.

2024-08-06 18:31:45,002 : INFO : using autotuned alpha, starting with [0.071428575, 0.071428575, 0.071428575, 0.071428575, 0.071428575, 0.071428575, 0.071428575, 0.071428575, 0.071428575, 0.071428575, 0.071428575, 0.071428575, 0.071428575, 0.071428575]
2024-08-06 18:31:45,004 : INFO : using serial LDA version on this node
2024-08-06 18:31:45,007 : INFO : running online (multi-pass) LDA training, 14 topics, 20 passes over the supplied corpus of 440 documents, updating model once every 440 documents, evaluating perplexity every 0 documents, iterating 400x with a convergence threshold of 0.001000
2024-08-06 18:31:45,008 : INFO : PROGRESS: pass 0, at document #440/440
2024-08-06 18:31:45,195 : INFO : optimized alpha [0.057320576, 0.05690696, 0.05664213, 0.057131078, 0.05807572, 0.05643965, 0.057230853, 0.054467447, 0.05643899, 0.057639763, 0.05634466, 0.054417424, 0.058205582, 0.055246614]
2024-08-06 18:31:45,198 : INFO : topic #11 (0.054): 0.013*"wine" + 0.011*"damage" + 0.009*"request" +

LDA model trained successfully
Model saved to lda_models/topics_14_passes_20_iterations_400/trained_model
Dictionary saved to lda_models/topics_14_passes_20_iterations_400/dictionary
Corpus saved to lda_models/topics_14_passes_20_iterations_400/corpus
Number of unique tokens: 1835
Number of documents: 440
Average topic coherence: -10.7466.

Top topics:
[(0,
  '0.014*"call" + 0.012*"emergency" + 0.011*"number" + 0.008*"single" + '
  '0.007*"caller" + 0.007*"retail" + 0.007*"telephone" + 0.007*"available" + '
  '0.006*"property" + 0.006*"make"'),
 (1,
  '0.011*"television" + 0.010*"grand" + 0.010*"duchy" + 0.010*"grand_duchy" + '
  '0.010*"luxembourg" + 0.008*"protected_designation" + 0.008*"fact" + '
  '0.008*"channel" + 0.007*"mean" + 0.007*"registration"'),
 (2,
  '0.009*"established" + 0.008*"reasonably" + 0.008*"framework" + '
  '0.008*"evocation" + 0.008*"covered" + 0.008*"january" + 0.007*"ass" + '
  '0.007*"establishing" + 0.007*"procedure" + 0.007*"consequence"'),
 (3,
  '0.018*

2024-08-06 18:31:48,016 : INFO : 9 accumulators retrieved from output queue
2024-08-06 18:31:48,043 : INFO : accumulated word occurrence stats for 1824 virtual documents



Coherence Score:  0.3932196160674044


2024-08-06 18:31:48,542 : INFO : collecting all words and their counts
2024-08-06 18:31:48,542 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types
2024-08-06 18:31:48,562 : INFO : collected 23445 token types (unigram + bigrams) from a corpus of 44718 words and 440 sentences
2024-08-06 18:31:48,562 : INFO : merged Phrases<23445 vocab, min_count=5, threshold=100, max_vocab_size=40000000>
2024-08-06 18:31:48,563 : INFO : Phrases lifecycle event {'msg': 'built Phrases<23445 vocab, min_count=5, threshold=100, max_vocab_size=40000000> in 0.02s', 'datetime': '2024-08-06T18:31:48.563083', 'gensim': '4.3.2', 'python': '3.11.9 (main, Jul 16 2024, 11:07:54) [Clang 14.0.0 (clang-1400.0.29.202)]', 'platform': 'macOS-12.5-arm64-arm-64bit', 'event': 'created'}
2024-08-06 18:31:48,617 : INFO : adding document #0 to Dictionary<0 unique tokens: []>
2024-08-06 18:31:48,626 : INFO : built Dictionary<3466 unique tokens: ['abstract', 'adoption', 'advance', 'appeal', 'body']...> from 440 do


Document 0 topics:
[(8, 0.9936895)]
Best topic: 8
Top words in this topic:
[('directory', 0.0147787165),
 ('data', 0.012977489),
 ('property', 0.01212701),
 ('provider', 0.010293842),
 ('making', 0.010158941),
 ('subscriber', 0.010024913),
 ('pre', 0.009290173),
 ('telephone', 0.00925566),
 ('number', 0.0075557875),
 ('operator', 0.0075398507)]
Original document: pricing interconnection related number portability concern traffic number ported set incurred mobile telephone operator implement request number porting adoption laying method used calculating fix advance abstract model maximum charged donor...


Document 1 topics:
[(9, 0.27961), (13, 0.70689225)]
Best topic: 13
Top words in this topic:
[('natural', 0.014889866),
 ('purchaser', 0.011159046),
 ('understood', 0.008210618),
 ('paid', 0.007855486),
 ('mortgage', 0.00719655),
 ('scheme', 0.0069272364),
 ('holiday', 0.006533294),
 ('natural_person', 0.006389067),
 ('travel', 0.006293065),
 ('manufacturer', 0.0061782184)]
Original d

2024-08-06 18:31:48,635 : INFO : using autotuned alpha, starting with [0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625]
2024-08-06 18:31:48,636 : INFO : using serial LDA version on this node
2024-08-06 18:31:48,638 : INFO : running online (multi-pass) LDA training, 16 topics, 20 passes over the supplied corpus of 440 documents, updating model once every 440 documents, evaluating perplexity every 0 documents, iterating 400x with a convergence threshold of 0.001000
2024-08-06 18:31:48,638 : INFO : PROGRESS: pass 0, at document #440/440
2024-08-06 18:31:48,811 : INFO : optimized alpha [0.052009016, 0.052649535, 0.051226173, 0.05046125, 0.053348713, 0.050358627, 0.05136916, 0.04964234, 0.05160689, 0.05274506, 0.04967682, 0.04969019, 0.051447958, 0.050722785, 0.050538506, 0.053171054]
2024-08-06 18:31:48,813 : INFO : topic #7 (0.050): 0.010*"charge" + 0.008*"authorisation" + 0.008*"licence" + 0.008*"device" + 0.0

LDA model trained successfully
Model saved to lda_models/topics_16_passes_20_iterations_400/trained_model
Dictionary saved to lda_models/topics_16_passes_20_iterations_400/dictionary
Corpus saved to lda_models/topics_16_passes_20_iterations_400/corpus
Number of unique tokens: 1835
Number of documents: 440
Average topic coherence: -9.6909.

Top topics:
[(0,
  '0.014*"call" + 0.014*"property" + 0.012*"emergency" + 0.010*"premise" + '
  '0.010*"number" + 0.008*"cancellation" + 0.008*"away" + 0.007*"available" + '
  '0.007*"caller" + 0.007*"telephone"'),
 (1,
  '0.013*"protected_designation" + 0.011*"television" + 0.009*"system" + '
  '0.009*"fact" + 0.008*"agricultural" + 0.008*"registration" + 0.008*"used" + '
  '0.008*"channel" + 0.007*"wine" + 0.007*"specification"'),
 (2,
  '0.007*"pre" + 0.007*"registration" + 0.007*"failure" + 0.007*"ass" + '
  '0.007*"brought" + 0.006*"following" + 0.006*"kg" + 0.006*"body" + '
  '0.006*"ensure" + 0.005*"appropriate"'),
 (3,
  '0.019*"le" + 0.018*"

2024-08-06 18:31:51,715 : INFO : 9 accumulators retrieved from output queue
2024-08-06 18:31:51,750 : INFO : accumulated word occurrence stats for 1824 virtual documents



Coherence Score:  0.42060175306907976

Document 0 topics:
[(8, 0.99313956)]
Best topic: 8
Top words in this topic:
[('directory', 0.016729774),
 ('data', 0.015686158),
 ('property', 0.013388031),
 ('provider', 0.011527475),
 ('subscriber', 0.01047814),
 ('telephone', 0.010249694),
 ('number', 0.009823517),
 ('individual', 0.008216942),
 ('territory', 0.008039163),
 ('finland', 0.0073567485)]
Original document: pricing interconnection related number portability concern traffic number ported set incurred mobile telephone operator implement request number porting adoption laying method used calculating fix advance abstract model maximum charged donor...


Document 1 topics:
[(13, 0.98400766)]
Best topic: 13
Top words in this topic:
[('natural', 0.019501213),
 ('natural_person', 0.01035301),
 ('fact', 0.00921383),
 ('profession', 0.00886794),
 ('brought', 0.008130042),
 ('acting', 0.008122334),
 ('substance', 0.008073993),
 ('also', 0.0071520647),
 ('defendant', 0.0069301594),
 ('company'