In [45]:
import sys
sys.path.append("..")

from ris_evaluation.evaluator import Evaluator

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

import pandas as pd
import numpy as np

from bertopic import BERTopic
from umap import UMAP

In [46]:
class Model:

    def __init__(self, documents_df) -> None:
        self.documents_df = documents_df.copy()
        self.documents = self.documents_df['document'].tolist()
        self.labels = self.documents_df['class_name'].tolist()

    def train(self):
        raise NotImplementedError
    
    def get_output(self):
        raise NotImplementedError
    
    def get_results_df(self):
        raise NotImplementedError
    
    def get_words_for_topics(self, topics):
        words_by_topics = {}
        for idx, topic in enumerate(topics):
            words = self.documents_df.iloc[idx]['document'].split()

            if topic not in words_by_topics:
                words_by_topics[topic] = {}

            for word in words:
                if word not in words_by_topics[topic]:
                    words_by_topics[topic][word] = 0

                words_by_topics[topic][word] += 1

        return words_by_topics

In [47]:
class LDAModel(Model):

    def __init__(self, documents_df) -> None:
        super().__init__(documents_df)
        self.n_topics = len(set(self.labels))

    def train(self):
        self.vectorizer = CountVectorizer()
        X = self.vectorizer.fit_transform(self.documents)

        self.lda = LatentDirichletAllocation(n_components=self.n_topics)
        self.lda.fit(X)

    def get_results_df(self):
        results_df = pd.DataFrame()
        results_df['document'] = self.documents
        results_df['y_true'] = self.labels

        X = self.vectorizer.transform(self.documents)
        results_df['y_pred'] = self.lda.transform(X).argmax(axis=1)
        results_df['y_pred_highest_proba'] = self.lda.transform(X).max(axis=1)
        return results_df

    def get_output(self):
        topics = []
        for topic in self.lda.components_:
            topic_words = []
            for i in topic.argsort()[-10:]:
                topic_words.append(self.vectorizer.get_feature_names_out()[i])
            topics.append(topic_words)

        return {
            "topics": topics,
            "topic-document-matrix": None,
            "topic-word-matrix": None,
            "test-topic-document-matrix": None
        }

In [48]:
class BERTopicModel(Model):

    def __init__(self, documents_df) -> None:
        super().__init__(documents_df)
        self.n_topics = len(set(self.labels)) + 1  # +1 for outliers

    def train(self):
        self.bert_model = BERTopic(language="english", calculate_probabilities=True, nr_topics=self.n_topics)
        self.topics, self.probs = self.bert_model.fit_transform(self.documents)

    def get_results_df(self):
        results_df = pd.DataFrame()
        results_df['document'] = self.documents
        results_df['y_true'] = self.labels

        results_df['y_pred'] = self.topics
        results_df['y_pred_highest_proba'] = np.max(self.probs, axis=1)

        relevant_results_df = results_df[results_df['y_pred'] != -1]
        return relevant_results_df
    
    def get_output(self):
        return {
            "topics": [item for item in self.bert_model.get_topic_info()["Representation"]],
            "topic-document-matrix": self.probs.transpose(),
            "topic-word-matrix": self.bert_model.c_tf_idf_,
            "test-topic-document-matrix": self.probs.transpose()
        }

In [49]:
def compute_metrics_df(dataset_names, models, n_iterations=10):
    metrics_df = pd.DataFrame()
    idx = 0

    for dataset in dataset_names:
        # -- Get dataset
        documents_df = pd.read_csv(f'../datasets/data/{dataset}/documents.csv')

        for model in models:
            avg_results = {}

            for i in range(n_iterations):
                print(f'Iteration {i+1}/{n_iterations} for {dataset} and {model.__name__}', end='\r')

                # -- Train model
                trained_model = model(documents_df)
                trained_model.train()

                model_output = trained_model.get_output()

                # -- Evaluate model
                evaluator = Evaluator(model_output)
                results_df = trained_model.get_results_df()

                words_by_extracted_topics = trained_model.get_words_for_topics(results_df['y_pred'].tolist())
                words_by_class = trained_model.get_words_for_topics(results_df['y_true'].tolist())

                coherence = evaluator.compute_coherence()
                diversity = evaluator.compute_diversity()
                supervised_correlation = evaluator.compute_supervised_correlation(words_by_extracted_topics, words_by_class)

                # -- Average results
                for coherence_type, coherence_value in coherence.items():
                    if coherence_type not in avg_results:
                        avg_results[f'coherence_{coherence_type}'] = []
                    avg_results[f'coherence_{coherence_type}'].append(coherence_value)
                
                avg_results['diversity'] = avg_results.get('diversity', []) + [diversity]
                avg_results['supervised_correlation'] = avg_results.get('supervised_correlation', []) + [supervised_correlation]

            for key, value in avg_results.items():
                avg_results[key] = np.mean(value)

            metrics_results = {}
            metrics_results['dataset'] = dataset
            metrics_results['model'] = trained_model.__class__.__name__
            for key, value in avg_results.items():
                metrics_results[key] = [value]

            metrics_df = pd.concat([metrics_df, pd.DataFrame(metrics_results, index=[idx])])
            idx += 1

    return metrics_df

In [50]:
metrics_df = compute_metrics_df(dataset_names=[
    'BBC_News',
    '20NewsGroup',
    'DBLP',
    'M10',
], models=[
    LDAModel,
    BERTopicModel
], n_iterations=5)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Iteration 5/5 for M10 and BERTopicModel

In [51]:
metrics_df

Unnamed: 0,dataset,model,coherence_c_v,coherence_c_uci,coherence_c_npmi,coherence_u_mass,diversity,supervised_correlation
0,BBC_News,LDAModel,0.507245,-0.224381,0.026653,-2.237404,0.92,0.824987
1,BBC_News,BERTopicModel,0.480479,-0.194804,0.021948,-2.220241,0.856667,0.701449
2,20NewsGroup,LDAModel,0.600076,0.660461,0.091718,-1.767074,0.743,0.611222
3,20NewsGroup,BERTopicModel,0.510362,0.175887,0.067621,-2.255179,0.827619,0.382447
4,DBLP,LDAModel,0.566795,0.094171,0.038986,-1.931211,0.83,0.612566
5,DBLP,BERTopicModel,0.555418,-0.304531,0.022946,-2.24664,0.76,0.299963
6,M10,LDAModel,0.457851,-0.061817,0.016673,-2.270753,0.838,0.520814
7,M10,BERTopicModel,0.515526,-2.204118,-0.050275,-2.546092,0.914545,0.626878


In [52]:
print(metrics_df.to_latex(index=False))

\begin{tabular}{llrrrrrr}
\toprule
dataset & model & coherence_c_v & coherence_c_uci & coherence_c_npmi & coherence_u_mass & diversity & supervised_correlation \\
\midrule
BBC_News & LDAModel & 0.507245 & -0.224381 & 0.026653 & -2.237404 & 0.920000 & 0.824987 \\
BBC_News & BERTopicModel & 0.480479 & -0.194804 & 0.021948 & -2.220241 & 0.856667 & 0.701449 \\
20NewsGroup & LDAModel & 0.600076 & 0.660461 & 0.091718 & -1.767074 & 0.743000 & 0.611222 \\
20NewsGroup & BERTopicModel & 0.510362 & 0.175887 & 0.067621 & -2.255179 & 0.827619 & 0.382447 \\
DBLP & LDAModel & 0.566795 & 0.094171 & 0.038986 & -1.931211 & 0.830000 & 0.612566 \\
DBLP & BERTopicModel & 0.555418 & -0.304531 & 0.022946 & -2.246640 & 0.760000 & 0.299963 \\
M10 & LDAModel & 0.457851 & -0.061817 & 0.016673 & -2.270753 & 0.838000 & 0.520814 \\
M10 & BERTopicModel & 0.515526 & -2.204118 & -0.050275 & -2.546092 & 0.914545 & 0.626878 \\
\bottomrule
\end{tabular}

