In [11]:
import os
import logging
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from gensim.models import Phrases, LdaModel
from gensim.models import phrases
from gensim.corpora import Dictionary
from gensim.models import CoherenceModel
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
from pprint import pprint

class TopicModeler:
    def __init__(self, input_path):
        self.input_path = input_path
        self.docs = []
        self.dictionary = None
        self.corpus = None
        self.model = None
        self.tokenizer = RegexpTokenizer(r'\w+')
        self.lemmatizer = WordNetLemmatizer()
        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    def load_documents(self):
        for filename in os.listdir(self.input_path):
            with open(os.path.join(self.input_path, filename), 'r', encoding='utf-8') as f:
                self.docs.append(f.read())
        print(f"Loaded {len(self.docs)} documents")
        if self.docs:
            print(f"Sample document: {self.docs[0][:100]}...")  

    def preprocess_documents(self):
        self.docs = [[token.lower() for token in self.tokenizer.tokenize(doc)] for doc in self.docs]
        
        self.docs = [[token for token in doc if not token.isnumeric() and len(token) > 1] for doc in self.docs]
        
        stop_words = set(stopwords.words('english'))
        self.docs = [[word for word in doc if word not in stop_words] for doc in self.docs]
        
        self.docs = [[self.lemmatizer.lemmatize(token) for token in doc] for doc in self.docs]

        print(f"After preprocessing: {len(self.docs)} documents")
        if self.docs:
            print(f"Sample preprocessed document: {self.docs[0][:10]}...") 

    def add_bigrams(self):
        bigram = Phrases(self.docs, min_count=5, threshold=100)
        for idx in range(len(self.docs)):
            for token in bigram[self.docs[idx]]:
                if '_' in token:
                    self.docs[idx].append(token)
        print(f"Added bigrams. Sample document: {self.docs[0][:15]}...")

    def create_dictionary_and_corpus(self):
        self.dictionary = Dictionary(self.docs)
        original_size = len(self.dictionary)
        self.dictionary.filter_extremes(no_below=2, no_above=0.9)
        self.corpus = [self.dictionary.doc2bow(doc) for doc in self.docs]
        print(f"Dictionary size: {len(self.dictionary)} (reduced from {original_size})")
        print(f"Corpus size: {len(self.corpus)}")
        if self.corpus:
            print(f"Sample corpus entry: {self.corpus[0][:10]}") 

    def train_lda_model(self, num_topics=6, chunksize=2000, passes=20, iterations=400):
        if not self.corpus or not self.dictionary:
            raise ValueError("Corpus or dictionary is empty. Check your preprocessing steps.")
        self.model = LdaModel(
            corpus=self.corpus,
            id2word=self.dictionary,
            chunksize=chunksize,
            alpha='auto',
            eta='auto',
            iterations=iterations,
            num_topics=num_topics,
            passes=passes,
            eval_every=None,
            random_state=42
        )
        print("LDA model trained successfully")

    def print_model_info(self):
        print('Number of unique tokens: %d' % len(self.dictionary))
        print('Number of documents: %d' % len(self.corpus))
        
        top_topics = self.model.top_topics(self.corpus)
        avg_topic_coherence = sum([t[1] for t in top_topics]) / self.model.num_topics
        print('Average topic coherence: %.4f.' % avg_topic_coherence)
        
        print("\nTop topics:")
        pprint(self.model.print_topics())
        
        print('\nPerplexity: ', self.model.log_perplexity(self.corpus))
        
        coherence_model_lda = CoherenceModel(model=self.model, texts=self.docs, dictionary=self.dictionary, coherence='c_v')
        coherence_lda = coherence_model_lda.get_coherence()
        print('\nCoherence Score: ', coherence_lda)

    def visualize_topics(self):
        vis = pyLDAvis.gensim.prepare(self.model, self.corpus, self.dictionary)
        return vis

    def analyze_documents(self):
        for i, doc in enumerate(self.docs[:5]): 
            bow = self.dictionary.doc2bow(doc)
            doc_topics = self.model.get_document_topics(bow)
            print(f"\nDocument {i} topics:")
            pprint(doc_topics)
            best_topic = max(doc_topics, key=lambda x: x[1])
            print(f"Best topic: {best_topic[0]}")
            print(f"Top words in this topic:")
            pprint(self.model.show_topic(best_topic[0]))
            print(f"Original document: {' '.join(doc[:30])}...")  # Print first 30 words
            print()

    def run(self):
        try:
            self.load_documents()
            self.preprocess_documents()
            self.add_bigrams()
            self.create_dictionary_and_corpus()
            self.train_lda_model()
            self.print_model_info()
            vis = self.visualize_topics()
            self.analyze_documents()
            return vis
        except Exception as e:
            print(f"An error occurred: {str(e)}")
            raise

# Usage
if __name__ == "__main__":
    modeler = TopicModeler('txt')  
    vis = modeler.run()
    
    # Save visualization
    pyLDAvis.save_html(vis, 'lda_visualization.html')
    print("Visualization saved as 'lda_visualization.html'")


Running model with parameters: num_topics=5, chunksize=2000, passes=10, iterations=200
An error occurred: [Errno 2] No such file or directory: 'input'


FileNotFoundError: [Errno 2] No such file or directory: 'input'

In [13]:
import os
import logging
import pickle
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from gensim.models import Phrases, LdaModel
from gensim.models import phrases
from gensim.corpora import Dictionary
from gensim.models import CoherenceModel
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
from pprint import pprint

class TopicModeler:
    def __init__(self, input_path):
        self.input_path = input_path
        self.docs = []
        self.dictionary = None
        self.corpus = None
        self.model = None
        self.tokenizer = RegexpTokenizer(r'\w+')
        self.lemmatizer = WordNetLemmatizer()
        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    def load_documents(self):
        for filename in os.listdir(self.input_path):
            file_path = os.path.join(self.input_path, filename)
            if os.path.isfile(file_path) and filename.endswith('.txt'):  # Ensure it's a text file
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = f.read().strip()
                    if isinstance(content, str):
                        self.docs.append(content)
        print(f"Loaded {len(self.docs)} documents")
        if self.docs:
            print(f"Sample document: {self.docs[0][:100]}...")

    def preprocess_documents(self):
        self.docs = [[token.lower() for token in self.tokenizer.tokenize(doc)] for doc in self.docs if isinstance(doc, str)]
        self.docs = [[token for token in doc if not token.isnumeric() and len(token) > 1] for doc in self.docs]
        stop_words = set(stopwords.words('english'))
        self.docs = [[word for word in doc if word not in stop_words] for doc in self.docs]
        self.docs = [[self.lemmatizer.lemmatize(token) for token in doc] for doc in self.docs]
        print(f"After preprocessing: {len(self.docs)} documents")
        if self.docs:
            print(f"Sample preprocessed document: {self.docs[0][:10]}...")

    def add_bigrams(self):
        bigram = Phrases(self.docs, min_count=5, threshold=100)
        for idx in range(len(self.docs)):
            for token in bigram[self.docs[idx]]:
                if '_' in token:
                    self.docs[idx].append(token)
        print(f"Added bigrams. Sample document: {self.docs[0][:15]}...")

    def create_dictionary_and_corpus(self):
        self.dictionary = Dictionary(self.docs)
        original_size = len(self.dictionary)
        self.dictionary.filter_extremes(no_below=2, no_above=0.9)
        self.corpus = [self.dictionary.doc2bow(doc) for doc in self.docs]
        print(f"Dictionary size: {len(self.dictionary)} (reduced from {original_size})")
        print(f"Corpus size: {len(self.corpus)}")
        if self.corpus:
            print(f"Sample corpus entry: {self.corpus[0][:10]}")

    def train_lda_model(self, num_topics=6, chunksize=2000, passes=20, iterations=400, alpha='auto', eta='auto'):
        if not self.corpus or not self.dictionary:
            raise ValueError("Corpus or dictionary is empty. Check your preprocessing steps.")
        
        print(f"Training LDA model with num_topics={num_topics}, chunksize={chunksize}, passes={passes}, iterations={iterations}, alpha={alpha}, eta={eta}")
        
        self.model = LdaModel(
            corpus=self.corpus,
            id2word=self.dictionary,
            chunksize=chunksize,
            alpha=alpha,
            eta=eta,
            iterations=iterations,
            num_topics=num_topics,
            passes=passes,
            eval_every=None,
            random_state=42
        )
        print("LDA model trained successfully")

    def print_model_info(self):
        print('Number of unique tokens: %d' % len(self.dictionary))
        print('Number of documents: %d' % len(self.corpus))
        top_topics = self.model.top_topics(self.corpus)
        avg_topic_coherence = sum([t[1] for t in top_topics]) / self.model.num_topics
        print('Average topic coherence: %.4f.' % avg_topic_coherence)
        print("\nTop topics:")
        pprint(self.model.print_topics())
        print('\nPerplexity: ', self.model.log_perplexity(self.corpus))
        coherence_model_lda = CoherenceModel(model=self.model, texts=self.docs, dictionary=self.dictionary, coherence='c_v')
        coherence_lda = coherence_model_lda.get_coherence()
        print('\nCoherence Score: ', coherence_lda)

    def visualize_topics(self):
        vis = pyLDAvis.gensim.prepare(self.model, self.corpus, self.dictionary)
        return vis

    def analyze_documents(self):
        for i, doc in enumerate(self.docs[:5]):
            bow = self.dictionary.doc2bow(doc)
            doc_topics = self.model.get_document_topics(bow)
            print(f"\nDocument {i} topics:")
            pprint(doc_topics)
            best_topic = max(doc_topics, key=lambda x: x[1])
            print(f"Best topic: {best_topic[0]}")
            print(f"Top words in this topic:")
            pprint(self.model.show_topic(best_topic[0]))
            print(f"Original document: {' '.join(doc[:30])}...")
            print()

    def save_model(self, file_path):
        if self.model is None:
            raise ValueError("No model has been trained yet.")
        os.makedirs(os.path.dirname(file_path), exist_ok=True)
        self.model.save(file_path)
        print(f"Model saved to {file_path}")

    def load_model(self, file_path):
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"No model file found at {file_path}")
        self.model = LdaModel.load(file_path)
        print(f"Model loaded from {file_path}")

    def save_dictionary_and_corpus(self, dict_path, corpus_path):
        if self.dictionary is None or self.corpus is None:
            raise ValueError("Dictionary and corpus have not been created yet.")
        os.makedirs(os.path.dirname(dict_path), exist_ok=True)
        os.makedirs(os.path.dirname(corpus_path), exist_ok=True)
        self.dictionary.save(dict_path)
        with open(corpus_path, 'wb') as f:
            pickle.dump(self.corpus, f)
        print(f"Dictionary saved to {dict_path}")
        print(f"Corpus saved to {corpus_path}")

    def load_dictionary_and_corpus(self, dict_path, corpus_path):
        if not os.path.exists(dict_path) or not os.path.exists(corpus_path):
            raise FileNotFoundError(f"Dictionary or corpus file not found.")
        self.dictionary = Dictionary.load(dict_path)
        with open(corpus_path, 'rb') as f:
            self.corpus = pickle.load(f)
        print(f"Dictionary loaded from {dict_path}")
        print(f"Corpus loaded from {corpus_path}")

    def run_with_params(self, num_topics, chunksize, passes, iterations):
        try:
            self.load_documents()
            self.preprocess_documents()
            self.add_bigrams()
            self.create_dictionary_and_corpus()
            self.train_lda_model(num_topics=num_topics, chunksize=chunksize, passes=passes, iterations=iterations)
            model_dir = f"lda_models/topics_{num_topics}_passes_{passes}_iterations_{iterations}"
            self.save_model(os.path.join(model_dir, 'trained_model'))
            self.save_dictionary_and_corpus(os.path.join(model_dir, 'dictionary'), os.path.join(model_dir, 'corpus'))
            self.print_model_info()
            vis = self.visualize_topics()
            pyLDAvis.save_html(vis, os.path.join(model_dir, 'lda_visualization.html'))
            self.analyze_documents()
            return vis
        except Exception as e:
            print(f"An error occurred: {str(e)}")
            raise

if __name__ == "__main__":
    modeler = TopicModeler('txt')

    # Train and save models with different parameters
    param_combinations = [
        (5, 2000, 10, 200),
        (10, 2000, 10, 400),
        (15, 2000, 20, 200),
        (10, 2000, 20, 400)
    ]

    for num_topics, chunksize, passes, iterations in param_combinations:
        print(f"\nRunning model with parameters: num_topics={num_topics}, chunksize={chunksize}, passes={passes}, iterations={iterations}")
        modeler.run_with_params(num_topics, chunksize, passes, iterations)

2024-08-02 13:17:07,341 : INFO : collecting all words and their counts
2024-08-02 13:17:07,341 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types



Running model with parameters: num_topics=5, chunksize=2000, passes=10, iterations=200
Loaded 440 documents
Sample document: Pricing for interconnection related to the provision of number portability, as referred to in Articl...
After preprocessing: 440 documents
Sample preprocessed document: ['pricing', 'interconnection', 'related', 'provision', 'number', 'portability', 'referred', 'article', 'directive', 'ec']...


2024-08-02 13:17:07,366 : INFO : collected 23445 token types (unigram + bigrams) from a corpus of 44718 words and 440 sentences
2024-08-02 13:17:07,367 : INFO : merged Phrases<23445 vocab, min_count=5, threshold=100, max_vocab_size=40000000>
2024-08-02 13:17:07,368 : INFO : Phrases lifecycle event {'msg': 'built Phrases<23445 vocab, min_count=5, threshold=100, max_vocab_size=40000000> in 0.03s', 'datetime': '2024-08-02T13:17:07.368002', 'gensim': '4.3.2', 'python': '3.11.9 (main, Jul 16 2024, 11:07:54) [Clang 14.0.0 (clang-1400.0.29.202)]', 'platform': 'macOS-12.5-arm64-arm-64bit', 'event': 'created'}
2024-08-02 13:17:07,407 : INFO : adding document #0 to Dictionary<0 unique tokens: []>
2024-08-02 13:17:07,428 : INFO : built Dictionary<3666 unique tokens: ['abstract', 'account', 'adoption', 'advance', 'appeal']...> from 440 documents (total 46957 corpus positions)
2024-08-02 13:17:07,429 : INFO : Dictionary lifecycle event {'msg': "built Dictionary<3666 unique tokens: ['abstract', 'acc

Added bigrams. Sample document: ['pricing', 'interconnection', 'related', 'provision', 'number', 'portability', 'referred', 'article', 'directive', 'ec', 'european', 'parliament', 'council', 'march', 'universal']...
Dictionary size: 2035 (reduced from 3666)
Corpus size: 440
Sample corpus entry: [(0, 1), (1, 1), (2, 1), (3, 3), (4, 3), (5, 2), (6, 2), (7, 2), (8, 1), (9, 1)]
Training LDA model with num_topics=5, chunksize=2000, passes=10, iterations=200, alpha=auto, eta=auto


2024-08-02 13:17:07,809 : INFO : optimized alpha [0.13063243, 0.11703815, 0.06443559, 0.13090336, 0.02856037]
2024-08-02 13:17:07,810 : INFO : topic #0 (0.131): 0.028*"article" + 0.028*"directive" + 0.019*"ec" + 0.019*"council" + 0.018*"european" + 0.017*"regulation" + 0.014*"consumer" + 0.014*"must" + 0.013*"parliament" + 0.013*"service"
2024-08-02 13:17:07,810 : INFO : topic #1 (0.117): 0.034*"directive" + 0.022*"article" + 0.017*"must" + 0.016*"national" + 0.013*"interpreted" + 0.013*"consumer" + 0.013*"service" + 0.012*"council" + 0.011*"court" + 0.011*"law"
2024-08-02 13:17:07,811 : INFO : topic #2 (0.064): 0.032*"directive" + 0.031*"consumer" + 0.020*"council" + 0.018*"contract" + 0.017*"article" + 0.014*"term" + 0.012*"must" + 0.011*"european" + 0.010*"ec" + 0.010*"interpreted"
2024-08-02 13:17:07,811 : INFO : topic #3 (0.131): 0.027*"directive" + 0.021*"article" + 0.021*"consumer" + 0.021*"council" + 0.017*"ec" + 0.016*"term" + 0.015*"court" + 0.015*"must" + 0.014*"unfair" + 0.

LDA model trained successfully
Model saved to lda_models/topics_5_passes_10_iterations_200/trained_model
Dictionary saved to lda_models/topics_5_passes_10_iterations_200/dictionary
Corpus saved to lda_models/topics_5_passes_10_iterations_200/corpus
Number of unique tokens: 2035
Number of documents: 440
Average topic coherence: -1.0921.

Top topics:
[(0,
  '0.032*"directive" + 0.030*"ec" + 0.027*"article" + 0.027*"council" + '
  '0.026*"european" + 0.022*"regulation" + 0.022*"parliament" + 0.014*"must" + '
  '0.013*"meaning" + 0.012*"interpreted"'),
 (1,
  '0.037*"directive" + 0.023*"article" + 0.022*"service" + 0.016*"must" + '
  '0.014*"national" + 0.013*"interpreted" + 0.012*"state" + 0.012*"cost" + '
  '0.011*"council" + 0.010*"law"'),
 (2,
  '0.036*"consumer" + 0.036*"directive" + 0.020*"contract" + 0.018*"article" + '
  '0.017*"council" + 0.015*"term" + 0.013*"must" + 0.011*"eec" + '
  '0.011*"interpreted" + 0.010*"agreement"'),
 (3,
  '0.033*"consumer" + 0.029*"term" + 0.026*"dir

2024-08-02 13:17:09,781 : INFO : 2 batches submitted to accumulate stats from 128 documents (347 virtual)
2024-08-02 13:17:09,784 : INFO : 3 batches submitted to accumulate stats from 192 documents (1274 virtual)
2024-08-02 13:17:10,834 : INFO : 9 accumulators retrieved from output queue
2024-08-02 13:17:10,844 : INFO : accumulated word occurrence stats for 11447 virtual documents



Coherence Score:  0.4517129413820314

Document 0 topics:
[(1, 0.9984667)]
Best topic: 1
Top words in this topic:
[('directive', 0.037215818),
 ('article', 0.023045152),
 ('service', 0.021931587),
 ('must', 0.01592759),
 ('national', 0.013697454),
 ('interpreted', 0.012828682),
 ('state', 0.011819984),
 ('cost', 0.011662337),
 ('council', 0.010960338),
 ('law', 0.010464187)]
Original document: pricing interconnection related provision number portability referred article directive ec european parliament council march universal service user right relating electronic communication network service universal service directive concern traffic cost number...


Document 1 topics:
[(0, 0.694997), (2, 0.30318424)]
Best topic: 0
Top words in this topic:
[('directive', 0.032075357),
 ('ec', 0.029609142),
 ('article', 0.027314719),
 ('council', 0.026978357),
 ('european', 0.025506597),
 ('regulation', 0.0223595),
 ('parliament', 0.022057863),
 ('must', 0.014074753),
 ('meaning', 0.013009776),
 ('in

TypeError: expected string or bytes-like object, got 'list'