In [1]:
import os
import logging
import pickle
import json
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from gensim.models import Phrases, LdaModel
from gensim.corpora import Dictionary
from gensim.models import CoherenceModel
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
from pprint import pprint
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pandas as pd

class TopicModeler:
    """LDA topic-modelling pipeline over a folder of ``.txt`` documents.

    The stages, in the order ``run_with_params`` executes them, are:
    load -> tokenise/clean/lemmatise -> add bigrams -> TF-IDF based word
    removal -> dictionary/bag-of-words corpus -> LDA training -> reporting,
    persistence and pyLDAvis export.

    State is kept on the instance so that ``coherence_scores`` accumulates
    across successive ``run_with_params`` calls on the same object.
    """

    def __init__(self, input_path):
        """Store the input folder and create the (empty) pipeline state.

        Args:
            input_path: Directory that ``load_documents`` scans for ``.txt`` files.
        """
        self.input_path = input_path
        self.docs = []               # raw strings after loading, token lists after preprocessing
        self.filtered_docs = []      # token lists after TF-IDF filtering
        self.dictionary = None
        self.corpus = None
        self.model = None
        self.coherence_scores = []   # (num_topics, c_v coherence) per completed run
        self.tokenizer = RegexpTokenizer(r'\w+')
        self.lemmatizer = WordNetLemmatizer()
        self.high_freq_words = set()
        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    @staticmethod
    def _ensure_parent_dir(file_path):
        """Create the parent directory of ``file_path`` if the path has one."""
        parent = os.path.dirname(file_path)
        # dirname('model') == '' and os.makedirs('') raises, so only create real parents.
        if parent:
            os.makedirs(parent, exist_ok=True)

    def load_documents(self):
        """Read every ``.txt`` file in ``self.input_path`` into ``self.docs``.

        ``self.docs`` is rebuilt from scratch on each call, so calling this
        method repeatedly (as ``run_with_params`` does) never accumulates
        duplicates or leftovers from a previous run.
        """
        self.docs = []
        # os.listdir() returns entries in arbitrary order; sorting keeps the
        # document order stable between runs and machines.
        for filename in sorted(os.listdir(self.input_path)):
            file_path = os.path.join(self.input_path, filename)
            if os.path.isfile(file_path) and filename.endswith('.txt'):
                with open(file_path, 'r', encoding='utf-8') as f:
                    self.docs.append(f.read().strip())
        print(f"Loaded {len(self.docs)} documents")
        if self.docs:
            print(f"Sample document: {self.docs[0][:100]}...")

    def load_and_filter_high_frequency_words(self, csv_folder='.'):
        """Build ``self.high_freq_words`` from ``sorted_terms_topic_{1..5}.csv``.

        The ``Total`` column is summed per ``Term`` across all files found;
        terms whose summed total exceeds 10 are marked for removal. When no
        file is present (e.g. the very first run) the set is simply empty.

        Args:
            csv_folder: Folder containing the ``sorted_terms_topic_<i>.csv`` files.
        """
        frames = []
        for i in range(1, 6):
            file_path = os.path.join(csv_folder, f'sorted_terms_topic_{i}.csv')
            if not os.path.exists(file_path):
                continue
            try:
                frames.append(pd.read_csv(file_path))
            except pd.errors.EmptyDataError:
                # A zero-byte CSV carries no terms; skip it instead of aborting.
                continue

        if frames:
            # Concatenate once instead of growing a DataFrame inside the loop.
            all_terms = pd.concat(frames, ignore_index=True)
            totals = all_terms.groupby('Term')['Total'].sum()
            self.high_freq_words = set(totals[totals > 10].index)
        else:
            self.high_freq_words = set()
        print(f"Identified {len(self.high_freq_words)} high-frequency words to remove")

    def preprocess_documents(self):
        """Tokenise, clean and lemmatise ``self.docs`` in place.

        Steps per document: lowercase regex tokenisation, drop numeric and
        one-character tokens, drop English stop words and the words in
        ``self.high_freq_words``, then lemmatise. Entries of ``self.docs``
        that are not strings are discarded.
        """
        self.load_and_filter_high_frequency_words()

        excluded = set(stopwords.words('english')) | self.high_freq_words
        processed = []
        for doc in self.docs:
            if not isinstance(doc, str):
                continue
            tokens = [token.lower() for token in self.tokenizer.tokenize(doc)]
            tokens = [token for token in tokens if not token.isnumeric() and len(token) > 1]
            # Stop-word / high-frequency filtering happens before lemmatisation.
            tokens = [token for token in tokens if token not in excluded]
            processed.append([self.lemmatizer.lemmatize(token) for token in tokens])
        self.docs = processed

        print(f"After preprocessing: {len(self.docs)} documents")
        if self.docs:
            print(f"Sample preprocessed document: {self.docs[0][:10]}...")

    def add_bigrams(self):
        """Append detected bigram tokens (those containing ``_``) to each document."""
        bigram = Phrases(self.docs, min_count=5, threshold=100)
        for doc in self.docs:
            # Materialise the additions first so the list is not extended while
            # the phrase model is still reading it.
            doc.extend([token for token in bigram[doc] if '_' in token])
        if self.docs:
            print(f"Added bigrams. Sample document: {self.docs[0][:15]}...")

    def filter_with_tfidf(self, top_n=200, bottom_n=50):
        """Remove the words with the highest and lowest mean TF-IDF score.

        Args:
            top_n: Number of highest-scoring words to remove (0 removes none).
            bottom_n: Number of lowest-scoring words to remove (0 removes none).

        The result is stored in ``self.filtered_docs``; ``self.docs`` is left
        untouched.
        """
        if not self.docs:
            self.filtered_docs = []
            return

        texts = [' '.join(doc) for doc in self.docs]
        tfidf_vectorizer = TfidfVectorizer()
        tfidf_matrix = tfidf_vectorizer.fit_transform(texts)
        feature_names = tfidf_vectorizer.get_feature_names_out()
        mean_tfidf_scores = np.asarray(tfidf_matrix.mean(axis=0)).flatten()

        ranked = mean_tfidf_scores.argsort()  # ascending; computed once
        top_n = max(int(top_n), 0)
        bottom_n = max(int(bottom_n), 0)
        # ranked[-0:] would select *everything*, so zero needs its own branch.
        top_indices = ranked[-top_n:] if top_n > 0 else ranked[:0]
        bottom_indices = ranked[:bottom_n]

        words_to_remove = set(feature_names[i] for i in np.concatenate([top_indices, bottom_indices]))

        self.filtered_docs = [[word for word in doc if word not in words_to_remove] for doc in self.docs]
        if self.filtered_docs:
            print(f"Filtered documents using TF-IDF. Sample filtered document: {self.filtered_docs[0][:10]}...")

    def create_dictionary_and_corpus(self):
        """Build the gensim dictionary and bag-of-words corpus from ``self.filtered_docs``.

        Tokens appearing in fewer than 2 documents or in more than 90% of the
        documents are pruned from the dictionary before the corpus is built.
        """
        self.dictionary = Dictionary(self.filtered_docs)
        original_size = len(self.dictionary)
        self.dictionary.filter_extremes(no_below=2, no_above=0.9)
        self.corpus = [self.dictionary.doc2bow(doc) for doc in self.filtered_docs]
        print(f"Dictionary size: {len(self.dictionary)} (reduced from {original_size})")
        print(f"Corpus size: {len(self.corpus)}")
        if self.corpus:
            print(f"Sample corpus entry: {self.corpus[0][:10]}")

    def train_lda_model(self, num_topics=6, chunksize=2000, passes=20, iterations=400, alpha='auto', eta='auto'):
        """Train an ``LdaModel`` on ``self.corpus`` and store it in ``self.model``.

        Raises:
            ValueError: If the corpus or the dictionary is missing or empty.
        """
        if not self.corpus or not self.dictionary:
            raise ValueError("Corpus or dictionary is empty. Check your preprocessing steps.")

        print(f"Training LDA model with num_topics={num_topics}, chunksize={chunksize}, passes={passes}, iterations={iterations}, alpha={alpha}, eta={eta}")

        self.model = LdaModel(
            corpus=self.corpus,
            id2word=self.dictionary,
            chunksize=chunksize,
            alpha=alpha,
            eta=eta,
            iterations=iterations,
            num_topics=num_topics,
            passes=passes,
            eval_every=None,
            random_state=42
        )
        print("LDA model trained successfully")

    def print_model_info(self):
        """Print vocabulary/corpus sizes, topics, perplexity and coherence.

        Returns:
            The ``c_v`` coherence score computed by ``CoherenceModel`` on
            ``self.filtered_docs``.
        """
        print('Number of unique tokens: %d' % len(self.dictionary))
        print('Number of documents: %d' % len(self.corpus))
        top_topics = self.model.top_topics(self.corpus)
        avg_topic_coherence = sum(t[1] for t in top_topics) / self.model.num_topics
        print('Average topic coherence: %.4f.' % avg_topic_coherence)
        print("\nTop topics:")
        pprint(self.model.print_topics())
        print('\nPerplexity: ', self.model.log_perplexity(self.corpus))
        coherence_model_lda = CoherenceModel(model=self.model, texts=self.filtered_docs, dictionary=self.dictionary, coherence='c_v')
        coherence_lda = coherence_model_lda.get_coherence()
        print('\nCoherence Score: ', coherence_lda)
        return coherence_lda

    def visualize_topics(self):
        """Prepare the pyLDAvis view and export per-topic term tables.

        Writes ``sorted_terms_topic_{1..5}.csv`` to the working directory;
        these are the files ``load_and_filter_high_frequency_words`` looks for
        by default.

        Returns:
            The prepared pyLDAvis object.
        """
        vis = pyLDAvis.gensim.prepare(self.model, self.corpus, self.dictionary)
        for i in range(1, 6):
            df = vis.sorted_terms(topic=i, _lambda=0)
            df.to_csv(f'sorted_terms_topic_{i}.csv', index=False)
        return vis

    def analyze_documents(self):
        """Print the topic mixture and dominant topic of the first five filtered documents."""
        for i, doc in enumerate(self.filtered_docs[:5]):
            bow = self.dictionary.doc2bow(doc)
            doc_topics = self.model.get_document_topics(bow)
            print(f"\nDocument {i} topics:")
            pprint(doc_topics)
            if not doc_topics:
                # max() on an empty list would raise; report and move on.
                print("No topic assigned to this document.")
                print()
                continue
            best_topic = max(doc_topics, key=lambda x: x[1])
            print(f"Best topic: {best_topic[0]}")
            print(f"Top words in this topic:")
            pprint(self.model.show_topic(best_topic[0]))
            # Note: this shows the filtered tokens, not the raw file content.
            print(f"Original document: {' '.join(doc[:30])}...")
            print()

    def save_model(self, file_path):
        """Save the trained model to ``file_path``, creating its folder if needed.

        Raises:
            ValueError: If no model has been trained or loaded yet.
        """
        if self.model is None:
            raise ValueError("No model has been trained yet.")
        self._ensure_parent_dir(file_path)
        self.model.save(file_path)
        print(f"Model saved to {file_path}")

    def load_model(self, file_path):
        """Load a previously saved ``LdaModel`` from ``file_path``.

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"No model file found at {file_path}")
        self.model = LdaModel.load(file_path)
        print(f"Model loaded from {file_path}")

    def save_dictionary_and_corpus(self, dict_path, corpus_path):
        """Persist the dictionary (gensim format) and the corpus (pickle).

        Raises:
            ValueError: If the dictionary or corpus has not been created yet.
        """
        if self.dictionary is None or self.corpus is None:
            raise ValueError("Dictionary and corpus have not been created yet.")
        self._ensure_parent_dir(dict_path)
        self._ensure_parent_dir(corpus_path)
        self.dictionary.save(dict_path)
        with open(corpus_path, 'wb') as f:
            pickle.dump(self.corpus, f)
        print(f"Dictionary saved to {dict_path}")
        print(f"Corpus saved to {corpus_path}")

    def load_dictionary_and_corpus(self, dict_path, corpus_path):
        """Load a dictionary and a pickled corpus written by ``save_dictionary_and_corpus``.

        Raises:
            FileNotFoundError: If either file is missing.
        """
        if not os.path.exists(dict_path) or not os.path.exists(corpus_path):
            raise FileNotFoundError("Dictionary or corpus file not found.")
        self.dictionary = Dictionary.load(dict_path)
        # SECURITY: pickle.load executes arbitrary code embedded in the file.
        # Only load corpus files that this pipeline produced itself.
        with open(corpus_path, 'rb') as f:
            self.corpus = pickle.load(f)
        print(f"Dictionary loaded from {dict_path}")
        print(f"Corpus loaded from {corpus_path}")

    def plot_coherence_scores(self, iteration):
        """Plot coherence against number of topics and save it as a PNG.

        Args:
            iteration: Suffix used in the output name
                ``coherence_scores_iteration_<iteration>.png``.

        Nothing is plotted when no coherence score has been recorded yet.
        """
        if not self.coherence_scores:
            print("No coherence scores recorded yet; skipping plot.")
            return

        num_topics = [x[0] for x in self.coherence_scores]
        coherences = [x[1] for x in self.coherence_scores]

        plt.figure(figsize=(10, 6))
        plt.plot(num_topics, coherences, marker='o')
        plt.xlabel('Number of Topics')
        plt.ylabel('Coherence Score')
        plt.title('Coherence Score by Number of Topics')
        plt.grid(True)

        plt.xticks(num_topics)
        plt.ylim(0, max(coherences) * 1.1)

        plt.savefig(f'coherence_scores_iteration_{iteration}.png')
        # Close the figure so repeated runs do not pile up open figures.
        plt.close()

    def run_with_params(self, num_topics, chunksize, passes, iterations):
        """Run the whole pipeline once and persist its artefacts.

        The model, dictionary, corpus and the pyLDAvis HTML are written to
        ``lda_models/topics_<num_topics>_passes_<passes>_iterations_<iterations>``;
        the coherence plot and the per-topic CSVs go to the working directory.

        Returns:
            The prepared pyLDAvis object.

        Raises:
            Exception: Any error from a pipeline stage is printed and re-raised.
        """
        try:
            self.load_documents()
            self.preprocess_documents()
            self.add_bigrams()
            self.filter_with_tfidf(top_n=200, bottom_n=50)
            self.create_dictionary_and_corpus()
            self.train_lda_model(num_topics=num_topics, chunksize=chunksize, passes=passes, iterations=iterations)
            model_dir = f"lda_models/topics_{num_topics}_passes_{passes}_iterations_{iterations}"
            self.save_model(os.path.join(model_dir, 'trained_model'))
            self.save_dictionary_and_corpus(os.path.join(model_dir, 'dictionary'), os.path.join(model_dir, 'corpus'))
            coherence_lda = self.print_model_info()
            self.coherence_scores.append((num_topics, coherence_lda))

            self.plot_coherence_scores(len(self.coherence_scores))

            vis = self.visualize_topics()
            pyLDAvis.save_html(vis, os.path.join(model_dir, 'lda_visualization.html'))
            self.analyze_documents()

            return vis
        except Exception as e:
            print(f"An error occurred: {str(e)}")
            raise

if __name__ == "__main__":
    # A single modeler is reused for the whole sweep so that its
    # coherence_scores list grows with every completed run.
    modeler = TopicModeler('txt')

    # Every entry is (num_topics, chunksize, passes, iterations); the chunk
    # size is the same (2000) for all runs.
    param_combinations = [
        (topic_count, 2000, pass_count, iteration_count)
        for topic_count, pass_count, iteration_count in (
            (4, 10, 200),
            (8, 10, 400),
            (12, 20, 200),
            (16, 20, 400),
            (20, 20, 400),
            (24, 20, 400),
        )
    ]

    for params in param_combinations:
        num_topics, chunksize, passes, iterations = params
        print(f"\nRunning model with parameters: num_topics={num_topics}, chunksize={chunksize}, passes={passes}, iterations={iterations}")
        modeler.run_with_params(*params)


Running model with parameters: num_topics=4, chunksize=2000, passes=10, iterations=200
Loaded 440 documents
Sample document: Pricing for interconnection related to the provision of number portability, as referred to in Articl...
Identified 132 high-frequency words to remove


2024-08-10 12:14:40,024 : INFO : collecting all words and their counts
2024-08-10 12:14:40,024 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types
2024-08-10 12:14:40,043 : INFO : collected 21968 token types (unigram + bigrams) from a corpus of 41938 words and 440 sentences
2024-08-10 12:14:40,043 : INFO : merged Phrases<21968 vocab, min_count=5, threshold=100, max_vocab_size=40000000>
2024-08-10 12:14:40,044 : INFO : Phrases lifecycle event {'msg': 'built Phrases<21968 vocab, min_count=5, threshold=100, max_vocab_size=40000000> in 0.02s', 'datetime': '2024-08-10T12:14:40.044280', 'gensim': '4.3.2', 'python': '3.11.9 (main, Jul 16 2024, 11:07:54) [Clang 14.0.0 (clang-1400.0.29.202)]', 'platform': 'macOS-12.5-arm64-arm-64bit', 'event': 'created'}
2024-08-10 12:14:40,180 : INFO : adding document #0 to Dictionary<0 unique tokens: []>
2024-08-10 12:14:40,187 : INFO : built Dictionary<3302 unique tokens: ['abstract', 'adoption', 'advance', 'appeal', 'calculating']...> from

After preprocessing: 440 documents
Sample preprocessed document: ['pricing', 'interconnection', 'related', 'provision', 'portability', 'referred', 'article', 'directive', 'ec', 'european']...
Added bigrams. Sample document: ['pricing', 'interconnection', 'related', 'provision', 'portability', 'referred', 'article', 'directive', 'ec', 'european', 'parliament', 'council', 'march', 'universal', 'service']...
Filtered documents using TF-IDF. Sample filtered document: ['pricing', 'interconnection', 'related', 'portability', 'concern', 'traffic', 'number', 'ported', 'incurred', 'mobile']...
Dictionary size: 1715 (reduced from 3302)
Corpus size: 440
Sample corpus entry: [(0, 1), (1, 1), (2, 3), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1)]
Training LDA model with num_topics=4, chunksize=2000, passes=10, iterations=200, alpha=auto, eta=auto


2024-08-10 12:14:40,353 : INFO : optimized alpha [0.12164946, 0.10884525, 0.11331521, 0.121583626]
2024-08-10 12:14:40,354 : INFO : topic #0 (0.122): 0.005*"property" + 0.005*"le" + 0.005*"currency" + 0.005*"provider" + 0.004*"data" + 0.004*"also" + 0.004*"et" + 0.004*"foreign" + 0.004*"established" + 0.004*"purchase"
2024-08-10 12:14:40,354 : INFO : topic #1 (0.109): 0.006*"mortgage" + 0.005*"make" + 0.005*"provider" + 0.005*"whose" + 0.004*"compensation" + 0.004*"determine" + 0.004*"intended" + 0.004*"creditor" + 0.004*"property" + 0.004*"consequence"
2024-08-10 12:14:40,355 : INFO : topic #2 (0.113): 0.008*"trade_mark" + 0.005*"significant" + 0.004*"ascertain" + 0.004*"ass" + 0.004*"travel" + 0.004*"mean" + 0.004*"also" + 0.004*"currency" + 0.003*"concept" + 0.003*"unfairness"
2024-08-10 12:14:40,355 : INFO : topic #3 (0.122): 0.005*"damage" + 0.004*"call" + 0.004*"device" + 0.004*"italian" + 0.004*"producer" + 0.004*"form" + 0.004*"vehicle" + 0.004*"trade_mark" + 0.004*"however" + 

LDA model trained successfully
Model saved to lda_models/topics_4_passes_10_iterations_200/trained_model
Dictionary saved to lda_models/topics_4_passes_10_iterations_200/dictionary
Corpus saved to lda_models/topics_4_passes_10_iterations_200/corpus
Number of unique tokens: 1715
Number of documents: 440
Average topic coherence: -11.6718.

Top topics:
[(0,
  '0.008*"property" + 0.007*"le" + 0.007*"provider" + 0.007*"et" + '
  '0.005*"purchase" + 0.005*"data" + 0.005*"assurance" + 0.005*"established" + '
  '0.005*"la" + 0.005*"premise"'),
 (1,
  '0.006*"mortgage" + 0.005*"compensation" + 0.005*"creditor" + 0.005*"make" + '
  '0.005*"consequence" + 0.005*"lawyer" + 0.004*"whose" + 0.004*"provider" + '
  '0.004*"organic" + 0.004*"based"'),
 (2,
  '0.009*"trade_mark" + 0.008*"currency" + 0.006*"foreign" + 0.006*"travel" + '
  '0.005*"significant" + 0.005*"unfairness" + 0.005*"ass" + 0.005*"informed" + '
  '0.005*"financial" + 0.005*"exchange"'),
 (3,
  '0.007*"damage" + 0.006*"trade_mark" + 

2024-08-10 12:14:42,377 : INFO : 9 accumulators retrieved from output queue
2024-08-10 12:14:42,393 : INFO : accumulated word occurrence stats for 1232 virtual documents



Coherence Score:  0.45385340097226046


  all_terms = pd.concat([all_terms, df])
2024-08-10 12:14:43,602 : INFO : collecting all words and their counts
2024-08-10 12:14:43,603 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types
2024-08-10 12:14:43,622 : INFO : collected 21918 token types (unigram + bigrams) from a corpus of 42073 words and 440 sentences
2024-08-10 12:14:43,623 : INFO : merged Phrases<21918 vocab, min_count=5, threshold=100, max_vocab_size=40000000>
2024-08-10 12:14:43,623 : INFO : Phrases lifecycle event {'msg': 'built Phrases<21918 vocab, min_count=5, threshold=100, max_vocab_size=40000000> in 0.02s', 'datetime': '2024-08-10T12:14:43.623571', 'gensim': '4.3.2', 'python': '3.11.9 (main, Jul 16 2024, 11:07:54) [Clang 14.0.0 (clang-1400.0.29.202)]', 'platform': 'macOS-12.5-arm64-arm-64bit', 'event': 'created'}
2024-08-10 12:14:43,676 : INFO : adding document #0 to Dictionary<0 unique tokens: []>
2024-08-10 12:14:43,684 : INFO : built Dictionary<3291 unique tokens: ['abstract', 'adoption', 'ad


Document 0 topics:
[(1, 0.9957869)]
Best topic: 1
Top words in this topic:
[('mortgage', 0.005942852),
 ('compensation', 0.0054136394),
 ('creditor', 0.0052232095),
 ('make', 0.0050510997),
 ('consequence', 0.004794514),
 ('lawyer', 0.0046952358),
 ('whose', 0.0044377493),
 ('provider', 0.0042149774),
 ('organic', 0.004154805),
 ('based', 0.004142763)]
Original document: pricing interconnection related portability concern traffic number ported incurred mobile telephone operator implement request porting adoption method calculating fix advance abstract model maximum charged donor recipient way dissuaded making facility...


Document 1 topics:
[(0, 0.9919724)]
Best topic: 0
Top words in this topic:
[('property', 0.007910582),
 ('le', 0.0070549557),
 ('provider', 0.0067683184),
 ('et', 0.0065358127),
 ('purchase', 0.005188866),
 ('data', 0.004994633),
 ('assurance', 0.0049597435),
 ('established', 0.0049349484),
 ('la', 0.0048986995),
 ('premise', 0.004810404)]
Original document: natural

2024-08-10 12:14:43,695 : INFO : using autotuned alpha, starting with [0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125]
2024-08-10 12:14:43,697 : INFO : using serial LDA version on this node
2024-08-10 12:14:43,698 : INFO : running online (multi-pass) LDA training, 8 topics, 10 passes over the supplied corpus of 440 documents, updating model once every 440 documents, evaluating perplexity every 0 documents, iterating 400x with a convergence threshold of 0.001000
2024-08-10 12:14:43,699 : INFO : PROGRESS: pass 0, at document #440/440
2024-08-10 12:14:43,854 : INFO : optimized alpha [0.09324013, 0.08343071, 0.0847777, 0.080188826, 0.07997089, 0.08076963, 0.08064345, 0.080683544]
2024-08-10 12:14:43,859 : INFO : topic #4 (0.080): 0.007*"place" + 0.007*"judicial" + 0.006*"take" + 0.005*"date" + 0.005*"prohibits" + 0.005*"following" + 0.005*"content" + 0.005*"reference" + 0.005*"premise" + 0.005*"presentation"
2024-08-10 12:14:43,860 : INFO : topic #3 (0.080): 0.006*"third" + 0.006*"

LDA model trained successfully
Model saved to lda_models/topics_8_passes_10_iterations_400/trained_model
Dictionary saved to lda_models/topics_8_passes_10_iterations_400/dictionary
Corpus saved to lda_models/topics_8_passes_10_iterations_400/corpus
Number of unique tokens: 1705
Number of documents: 440
Average topic coherence: -8.8380.

Top topics:
[(0,
  '0.009*"risk" + 0.008*"conjunction" + 0.008*"read_conjunction" + '
  '0.006*"required" + 0.006*"set" + 0.005*"reference" + 0.005*"guarantee" + '
  '0.005*"ingredient" + 0.005*"conclusion" + 0.005*"used"'),
 (1,
  '0.013*"le" + 0.009*"nature" + 0.007*"set" + 0.007*"consequence" + '
  '0.007*"provide" + 0.006*"economic" + 0.006*"supply" + 0.006*"relevant" + '
  '0.006*"operator" + 0.005*"framework"'),
 (2,
  '0.008*"misleading" + 0.008*"prescribed" + 0.008*"necessary_comply" + '
  '0.007*"september" + 0.006*"advertisement" + 0.005*"fee" + '
  '0.005*"competitor" + 0.005*"number" + 0.005*"comparative_advertising" + '
  '0.005*"comparativ

2024-08-10 12:14:45,908 : INFO : 9 accumulators retrieved from output queue
2024-08-10 12:14:45,938 : INFO : accumulated word occurrence stats for 1329 virtual documents



Coherence Score:  0.3933750463362915


2024-08-10 12:14:46,764 : INFO : collecting all words and their counts
2024-08-10 12:14:46,764 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types
2024-08-10 12:14:46,783 : INFO : collected 22038 token types (unigram + bigrams) from a corpus of 42033 words and 440 sentences
2024-08-10 12:14:46,784 : INFO : merged Phrases<22038 vocab, min_count=5, threshold=100, max_vocab_size=40000000>
2024-08-10 12:14:46,784 : INFO : Phrases lifecycle event {'msg': 'built Phrases<22038 vocab, min_count=5, threshold=100, max_vocab_size=40000000> in 0.02s', 'datetime': '2024-08-10T12:14:46.784384', 'gensim': '4.3.2', 'python': '3.11.9 (main, Jul 16 2024, 11:07:54) [Clang 14.0.0 (clang-1400.0.29.202)]', 'platform': 'macOS-12.5-arm64-arm-64bit', 'event': 'created'}
2024-08-10 12:14:46,838 : INFO : adding document #0 to Dictionary<0 unique tokens: []>
2024-08-10 12:14:46,846 : INFO : built Dictionary<3311 unique tokens: ['abstract', 'adoption', 'advance', 'appeal', 'body']...> from 440 do


Document 0 topics:
[(2, 0.99475443)]
Best topic: 2
Top words in this topic:
[('misleading', 0.008472045),
 ('prescribed', 0.008060303),
 ('necessary_comply', 0.008007722),
 ('september', 0.007268893),
 ('advertisement', 0.0064812484),
 ('fee', 0.0054242574),
 ('competitor', 0.0053932364),
 ('number', 0.0053213155),
 ('comparative_advertising', 0.005091209),
 ('comparative', 0.0050534443)]
Original document: pricing interconnection related number portability concern traffic number ported set incurred mobile telephone operator implement request number porting adoption laying method used calculating fix advance abstract model maximum charged donor...


Document 1 topics:
[(1, 0.984201)]
Best topic: 1
Top words in this topic:
[('le', 0.013033843),
 ('nature', 0.00915156),
 ('set', 0.0071712355),
 ('consequence', 0.0066857785),
 ('provide', 0.006572717),
 ('economic', 0.0064683896),
 ('supply', 0.0060850885),
 ('relevant', 0.00578286),
 ('operator', 0.005740503),
 ('framework', 0.005238419

2024-08-10 12:14:46,856 : INFO : using autotuned alpha, starting with [0.083333336, 0.083333336, 0.083333336, 0.083333336, 0.083333336, 0.083333336, 0.083333336, 0.083333336, 0.083333336, 0.083333336, 0.083333336, 0.083333336]
2024-08-10 12:14:46,856 : INFO : using serial LDA version on this node
2024-08-10 12:14:46,857 : INFO : running online (multi-pass) LDA training, 12 topics, 20 passes over the supplied corpus of 440 documents, updating model once every 440 documents, evaluating perplexity every 0 documents, iterating 200x with a convergence threshold of 0.001000
2024-08-10 12:14:46,858 : INFO : PROGRESS: pass 0, at document #440/440
2024-08-10 12:14:46,991 : INFO : optimized alpha [0.06361442, 0.06741089, 0.06216605, 0.061661042, 0.06747192, 0.06806576, 0.06165421, 0.06501994, 0.066665694, 0.06691929, 0.0606103, 0.066071175]
2024-08-10 12:14:46,993 : INFO : topic #10 (0.061): 0.011*"long" + 0.011*"adopt" + 0.009*"extension" + 0.008*"germany" + 0.007*"mg" + 0.007*"hellenic_republi

LDA model trained successfully
Model saved to lda_models/topics_12_passes_20_iterations_200/trained_model
Dictionary saved to lda_models/topics_12_passes_20_iterations_200/dictionary
Corpus saved to lda_models/topics_12_passes_20_iterations_200/corpus
Number of unique tokens: 1724
Number of documents: 440
Average topic coherence: -11.5361.

Top topics:
[(0,
  '0.012*"lawyer" + 0.009*"email" + 0.008*"packaging" + 0.008*"shop" + '
  '0.008*"retail" + 0.008*"substance" + 0.008*"informed" + 0.007*"chain" + '
  '0.007*"united_kingdom" + 0.007*"united"'),
 (1,
  '0.036*"trade_mark" + 0.010*"registration" + 0.010*"registered" + '
  '0.008*"property" + 0.007*"belgium" + 0.007*"established" + '
  '0.006*"ascertain" + 0.006*"bearing" + 0.006*"kingdom_belgium" + '
  '0.006*"sign"'),
 (2,
  '0.019*"call" + 0.010*"prohibits" + 0.009*"caller" + 0.009*"telephone" + '
  '0.008*"limit" + 0.008*"single" + 0.008*"request" + 0.008*"location" + '
  '0.007*"date" + 0.007*"travel"'),
 (3,
  '0.018*"vehicle" 

2024-08-10 12:14:49,582 : INFO : 9 accumulators retrieved from output queue
2024-08-10 12:14:49,612 : INFO : accumulated word occurrence stats for 1223 virtual documents



Coherence Score:  0.3924581930761028


2024-08-10 12:14:50,119 : INFO : collecting all words and their counts
2024-08-10 12:14:50,119 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types
2024-08-10 12:14:50,138 : INFO : collected 22066 token types (unigram + bigrams) from a corpus of 42324 words and 440 sentences
2024-08-10 12:14:50,138 : INFO : merged Phrases<22066 vocab, min_count=5, threshold=100, max_vocab_size=40000000>
2024-08-10 12:14:50,139 : INFO : Phrases lifecycle event {'msg': 'built Phrases<22066 vocab, min_count=5, threshold=100, max_vocab_size=40000000> in 0.02s', 'datetime': '2024-08-10T12:14:50.139032', 'gensim': '4.3.2', 'python': '3.11.9 (main, Jul 16 2024, 11:07:54) [Clang 14.0.0 (clang-1400.0.29.202)]', 'platform': 'macOS-12.5-arm64-arm-64bit', 'event': 'created'}
2024-08-10 12:14:50,193 : INFO : adding document #0 to Dictionary<0 unique tokens: []>
2024-08-10 12:14:50,200 : INFO : built Dictionary<3308 unique tokens: ['abstract', 'adoption', 'advance', 'appeal', 'body']...> from 440 do


Document 0 topics:
[(6, 0.99252945)]
Best topic: 6
Top words in this topic:
[('regulatory', 0.023111211),
 ('access', 0.02077478),
 ('television', 0.014647277),
 ('common_regulatory', 0.012881898),
 ('common', 0.012710135),
 ('channel', 0.0104859015),
 ('programme', 0.008407411),
 ('television_programme', 0.008407411),
 ('public', 0.008208427),
 ('based', 0.007912938)]
Original document: pricing interconnection related portability concern traffic number ported incurred mobile telephone operator implement request porting adoption calculating fix advance abstract model maximum charged donor recipient fixed way dissuaded making facility...


Document 1 topics:
[(11, 0.980572)]
Best topic: 11
Top words in this topic:
[('travel', 0.008105164),
 ('context', 0.0073837433),
 ('brand', 0.00663907),
 ('domestic', 0.0066241357),
 ('date', 0.006606864),
 ('profession', 0.006598139),
 ('property', 0.0065561505),
 ('additional', 0.0063055),
 ('organiser', 0.00591125),
 ('repayment', 0.0058921827)]


2024-08-10 12:14:50,210 : INFO : using autotuned alpha, starting with [0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625]
2024-08-10 12:14:50,211 : INFO : using serial LDA version on this node
2024-08-10 12:14:50,212 : INFO : running online (multi-pass) LDA training, 16 topics, 20 passes over the supplied corpus of 440 documents, updating model once every 440 documents, evaluating perplexity every 0 documents, iterating 400x with a convergence threshold of 0.001000
2024-08-10 12:14:50,212 : INFO : PROGRESS: pass 0, at document #440/440
2024-08-10 12:14:50,363 : INFO : optimized alpha [0.051415548, 0.051788498, 0.052400034, 0.052111633, 0.053168178, 0.05074266, 0.051281683, 0.050347053, 0.051218797, 0.05159027, 0.052637074, 0.053045593, 0.050375924, 0.052405264, 0.053231172, 0.05068567]
2024-08-10 12:14:50,365 : INFO : topic #12 (0.050): 0.009*"hellenic" + 0.009*"passenger" + 0.009*"hellenic_republic" + 0.009*"

LDA model trained successfully
Model saved to lda_models/topics_16_passes_20_iterations_400/trained_model
Dictionary saved to lda_models/topics_16_passes_20_iterations_400/dictionary
Corpus saved to lda_models/topics_16_passes_20_iterations_400/corpus
Number of unique tokens: 1724
Number of documents: 440
Average topic coherence: -11.5509.

Top topics:
[(0,
  '0.014*"natural" + 0.011*"ingredient" + 0.010*"defendant" + '
  '0.009*"reference" + 0.009*"seised" + 0.008*"profession" + 0.008*"cover" + '
  '0.008*"united_kingdom" + 0.008*"united" + 0.007*"debt"'),
 (1,
  '0.011*"dispute" + 0.009*"procedure" + 0.009*"human" + 0.008*"regulatory" + '
  '0.008*"denmark" + 0.007*"email" + 0.007*"net" + 0.007*"medicinal" + '
  '0.007*"kingdom_denmark" + 0.006*"annex"'),
 (2,
  '0.009*"ass" + 0.009*"creditor" + 0.009*"date" + 0.009*"prize" + '
  '0.008*"country" + 0.008*"telephone" + 0.008*"second" + 0.008*"organic" + '
  '0.007*"sixth_chamber" + 0.007*"sixth"'),
 (3,
  '0.019*"laying" + 0.018*"safe

2024-08-10 12:14:52,929 : INFO : 9 accumulators retrieved from output queue
2024-08-10 12:14:52,972 : INFO : accumulated word occurrence stats for 1333 virtual documents



Coherence Score:  0.3746713782086865


2024-08-10 12:14:53,505 : INFO : collecting all words and their counts
2024-08-10 12:14:53,506 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types
2024-08-10 12:14:53,524 : INFO : collected 22147 token types (unigram + bigrams) from a corpus of 42255 words and 440 sentences
2024-08-10 12:14:53,525 : INFO : merged Phrases<22147 vocab, min_count=5, threshold=100, max_vocab_size=40000000>
2024-08-10 12:14:53,525 : INFO : Phrases lifecycle event {'msg': 'built Phrases<22147 vocab, min_count=5, threshold=100, max_vocab_size=40000000> in 0.02s', 'datetime': '2024-08-10T12:14:53.525428', 'gensim': '4.3.2', 'python': '3.11.9 (main, Jul 16 2024, 11:07:54) [Clang 14.0.0 (clang-1400.0.29.202)]', 'platform': 'macOS-12.5-arm64-arm-64bit', 'event': 'created'}
2024-08-10 12:14:53,579 : INFO : adding document #0 to Dictionary<0 unique tokens: []>
2024-08-10 12:14:53,587 : INFO : built Dictionary<3309 unique tokens: ['abstract', 'adoption', 'advance', 'calculating', 'charged']...> fro


Document 0 topics:
[(10, 0.060666908), (13, 0.9324622)]
Best topic: 13
Top words in this topic:
[('travel', 0.01940025),
 ('package_travel', 0.011663827),
 ('germany', 0.011589071),
 ('federal_republic', 0.010643392),
 ('federal', 0.010643392),
 ('beer', 0.009723941),
 ('additional', 0.009181434),
 ('characteristic', 0.008766854),
 ('used', 0.008228681),
 ('reference', 0.006832072)]
Original document: pricing interconnection related number portability concern traffic number ported set incurred mobile telephone operator implement request number porting adoption laying method used calculating fix advance abstract model maximum charged donor...


Document 1 topics:
[(0, 0.4379787), (7, 0.37576762), (14, 0.1675093)]
Best topic: 0
Top words in this topic:
[('natural', 0.013723555),
 ('ingredient', 0.010708065),
 ('defendant', 0.0097056795),
 ('reference', 0.009492301),
 ('seised', 0.008538524),
 ('profession', 0.008360534),
 ('cover', 0.0076397667),
 ('united_kingdom', 0.0075626615),
 ('un

2024-08-10 12:14:53,740 : INFO : optimized alpha [0.04459978, 0.042618528, 0.044469707, 0.04370908, 0.04488026, 0.042470112, 0.04272468, 0.04402936, 0.043509465, 0.042008847, 0.04375573, 0.04351907, 0.042060677, 0.04281096, 0.043346226, 0.04245895, 0.043118417, 0.04340581, 0.04258882, 0.04258984]
2024-08-10 12:14:53,741 : INFO : topic #9 (0.042): 0.023*"beer" + 0.015*"licence" + 0.015*"equipment" + 0.015*"charge" + 0.013*"luxembourg" + 0.013*"authorisation" + 0.012*"grand" + 0.012*"duchy" + 0.012*"grand_duchy" + 0.010*"tfeu"
2024-08-10 12:14:53,742 : INFO : topic #12 (0.042): 0.015*"consequence" + 0.010*"competent" + 0.008*"italian_republic" + 0.008*"assurance" + 0.008*"taking" + 0.008*"italian" + 0.007*"point" + 0.007*"conduct" + 0.006*"pre" + 0.006*"extent"
2024-08-10 12:14:53,742 : INFO : topic #2 (0.044): 0.029*"trade_mark" + 0.018*"et" + 0.014*"le" + 0.013*"la" + 0.012*"en" + 0.010*"assurance" + 0.008*"wine" + 0.007*"que" + 0.007*"individual" + 0.006*"country"
2024-08-10 12:14:53,

LDA model trained successfully
Model saved to lda_models/topics_20_passes_20_iterations_400/trained_model
Dictionary saved to lda_models/topics_20_passes_20_iterations_400/dictionary
Corpus saved to lda_models/topics_20_passes_20_iterations_400/corpus
Number of unique tokens: 1726
Number of documents: 440
Average topic coherence: -11.9806.

Top topics:
[(0,
  '0.021*"property" + 0.011*"ruling" + 0.010*"provider" + '
  '0.009*"preliminary_ruling" + 0.009*"preliminary" + 0.009*"substance" + '
  '0.009*"establishment" + 0.009*"tax" + 0.009*"territory" + 0.009*"acting"'),
 (1,
  '0.015*"brought" + 0.012*"conformity" + 0.012*"dispute" + '
  '0.011*"consequence" + 0.010*"assessment" + 0.010*"hearing" + '
  '0.009*"ascertain" + 0.009*"fee" + 0.009*"pre" + 0.008*"examine"'),
 (2,
  '0.034*"trade_mark" + 0.022*"et" + 0.017*"le" + 0.016*"la" + 0.015*"en" + '
  '0.012*"assurance" + 0.010*"wine" + 0.010*"que" + 0.008*"individual" + '
  '0.007*"country"'),
 (3,
  '0.014*"production" + 0.013*"provid

2024-08-10 12:14:56,351 : INFO : 9 accumulators retrieved from output queue
2024-08-10 12:14:56,394 : INFO : accumulated word occurrence stats for 1265 virtual documents



Coherence Score:  0.39506415755233965


2024-08-10 12:14:57,054 : INFO : collecting all words and their counts
2024-08-10 12:14:57,055 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types
2024-08-10 12:14:57,074 : INFO : collected 22074 token types (unigram + bigrams) from a corpus of 42370 words and 440 sentences
2024-08-10 12:14:57,074 : INFO : merged Phrases<22074 vocab, min_count=5, threshold=100, max_vocab_size=40000000>
2024-08-10 12:14:57,075 : INFO : Phrases lifecycle event {'msg': 'built Phrases<22074 vocab, min_count=5, threshold=100, max_vocab_size=40000000> in 0.02s', 'datetime': '2024-08-10T12:14:57.075210', 'gensim': '4.3.2', 'python': '3.11.9 (main, Jul 16 2024, 11:07:54) [Clang 14.0.0 (clang-1400.0.29.202)]', 'platform': 'macOS-12.5-arm64-arm-64bit', 'event': 'created'}
2024-08-10 12:14:57,129 : INFO : adding document #0 to Dictionary<0 unique tokens: []>
2024-08-10 12:14:57,136 : INFO : built Dictionary<3319 unique tokens: ['abstract', 'adoption', 'advance', 'appeal', 'body']...> from 440 do


Document 0 topics:
[(18, 0.9881489)]
Best topic: 18
Top words in this topic:
[('directory', 0.025856007),
 ('telephone', 0.017536804),
 ('subscriber', 0.01618872),
 ('request', 0.016156826),
 ('making', 0.01585196),
 ('data', 0.014585703),
 ('energy', 0.012974871),
 ('instrument', 0.012870217),
 ('label', 0.009751267),
 ('enquiry', 0.0097427685)]
Original document: pricing interconnection related portability concern traffic number ported incurred telephone operator implement request porting adoption method calculating fix advance abstract model maximum charged donor recipient fixed way dissuaded making facility...


Document 1 topics:
[(5, 0.9790661)]
Best topic: 5
Top words in this topic:
[('building', 0.016450152),
 ('passenger', 0.01579408),
 ('natural', 0.014967524),
 ('long', 0.013274875),
 ('flight', 0.011989219),
 ('common', 0.011787071),
 ('energy', 0.010504),
 ('procedure', 0.010503147),
 ('cancellation', 0.010498826),
 ('need', 0.01049444)]
Original document: natural becomes

2024-08-10 12:14:57,145 : INFO : using autotuned alpha, starting with [0.041666668, 0.041666668, 0.041666668, 0.041666668, 0.041666668, 0.041666668, 0.041666668, 0.041666668, 0.041666668, 0.041666668, 0.041666668, 0.041666668, 0.041666668, 0.041666668, 0.041666668, 0.041666668, 0.041666668, 0.041666668, 0.041666668, 0.041666668, 0.041666668, 0.041666668, 0.041666668, 0.041666668]
2024-08-10 12:14:57,146 : INFO : using serial LDA version on this node
2024-08-10 12:14:57,148 : INFO : running online (multi-pass) LDA training, 24 topics, 20 passes over the supplied corpus of 440 documents, updating model once every 440 documents, evaluating perplexity every 0 documents, iterating 400x with a convergence threshold of 0.001000
2024-08-10 12:14:57,148 : INFO : PROGRESS: pass 0, at document #440/440
2024-08-10 12:14:57,294 : INFO : optimized alpha [0.037308037, 0.03676772, 0.036728077, 0.03640121, 0.037278034, 0.036961325, 0.036847305, 0.036535025, 0.036751486, 0.037272654, 0.03770063, 0.03694

LDA model trained successfully
Model saved to lda_models/topics_24_passes_20_iterations_400/trained_model
Dictionary saved to lda_models/topics_24_passes_20_iterations_400/dictionary
Corpus saved to lda_models/topics_24_passes_20_iterations_400/corpus
Number of unique tokens: 1724
Number of documents: 440
Average topic coherence: -10.6526.

Top topics:
[(23,
  '0.011*"note" + 0.011*"access" + 0.011*"compensation" + 0.010*"required" + '
  '0.008*"set" + 0.008*"arising" + 0.008*"technical" + 0.008*"complaint" + '
  '0.008*"slovak" + 0.008*"communication_network"'),
 (7,
  '0.017*"reasonably" + 0.012*"provide" + 0.010*"travel" + 0.010*"operation" + '
  '0.008*"reimbursement" + 0.008*"brought" + 0.007*"context" + '
  '0.007*"unfairness" + 0.007*"cheese" + 0.007*"force"'),
 (13,
  '0.017*"compensation" + 0.014*"appeal" + 0.012*"defect" + 0.012*"additional" '
  '+ 0.012*"mandatory" + 0.010*"disease" + 0.010*"link" + 0.008*"existence" + '
  '0.008*"considered" + 0.008*"operator"'),
 (2,
  '0.

2024-08-10 12:14:59,894 : INFO : 9 accumulators retrieved from output queue
2024-08-10 12:14:59,962 : INFO : accumulated word occurrence stats for 1368 virtual documents



Coherence Score:  0.42170033573309224

Document 0 topics:
[(6, 0.9910919)]
Best topic: 6
Top words in this topic:
[('email', 0.016154146),
 ('number', 0.010794199),
 ('january', 0.010794195),
 ('regulatory', 0.009007549),
 ('operator', 0.009007549),
 ('insurance', 0.009000349),
 ('covered', 0.008835581),
 ('appeal', 0.007583737),
 ('framework', 0.0072208983),
 ('display', 0.0072208983)]
Original document: pricing interconnection related number portability concern traffic number ported set incurred mobile telephone operator implement request number porting adoption laying method used calculating fix advance abstract model maximum charged donor...


Document 1 topics:
[(22, 0.9725119)]
Best topic: 22
Top words in this topic:
[('vehicle', 0.021714468),
 ('device', 0.020505369),
 ('cancellation', 0.020483933),
 ('energy', 0.016916452),
 ('passenger', 0.01690911),
 ('protect', 0.0131267635),
 ('outside', 0.011961081),
 ('investment', 0.01175638),
 ('event', 0.011729252),
 ('flight', 0.0096

In [None]:
docs/
├── .vitepress/
│   └── config.ts
├── index.md
├── guide/
│   ├── installation.md
│   ├── getting-started.md
│   └── configuration.md
├── methodology/
│   ├── preprocessing.md
│   ├── filtering.md
│   ├── model-training.md
│   └── visualization.md
└── api/
    └── topic-modeler.md

In [None]:
docs/
├── .vitepress/
│   └── config.ts
├── index.md
├── guide/
│   ├── installation.md
│   ├── getting-started.md
│   └── configuration.md
├── methodology/
│   ├── preprocessing.md
│   ├── filtering.md
│   ├── model-training.md
│   └── visualization.md
└── api/
    └── topic-modeler.md