In [1]:
import sys
sys.path.append("..")

from ris_evaluation.evaluator import Evaluator

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

import pandas as pd
import numpy as np

from bertopic import BERTopic
from umap import UMAP

from gensim import corpora
import gensim

import warnings
warnings.filterwarnings("ignore")

  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()


In [2]:
class Model:

    def __init__(self, dataset_name: str) -> None:
        self.dataset_name = dataset_name

        self.documents_df = pd.read_csv(f'../../datasets/data/{dataset_name}/documents.csv')
        self.documents_df = self.documents_df

        self.documents = self.documents_df['document'].tolist()
        self.labels = self.documents_df['class_name'].tolist()

        self.labels_df = pd.read_csv(f'../../datasets/data/{dataset_name}/labels.csv')
        self.defined_keywords = [keywords.split(' ') for keywords in self.labels_df['class_keywords'].tolist()]

        self.num_topics = len(set(self.labels))

    def train(self):
        """ Train the model """
        raise NotImplementedError
    
    def get_output(self):
        """ Get the output of the model on the OCTIS format """
        raise NotImplementedError
    
    def get_results_df(self):
        """ Get the results of the model on a DataFrame """
        raise NotImplementedError
    
    def get_words_for_topics(self, topics):
        """ Get the words for each topic from the documents

        Args:
            topics (list): The topics for each document

        Returns:
            dict: The words for each topic
        """
        words_by_topics = {}
        for idx, topic in enumerate(topics):
            words = self.documents_df.iloc[idx]['document'].split()

            if topic not in words_by_topics:
                words_by_topics[topic] = {}

            for word in words:
                if word not in words_by_topics[topic]:
                    words_by_topics[topic][word] = 0

                words_by_topics[topic][word] += 1

        return words_by_topics

In [3]:
class LDAModel(Model):

    def __init__(self, dataset_name: str) -> None:
        super().__init__(dataset_name)

    def train(self):
        """ See the documentation of the parent class """
        self.vectorizer = CountVectorizer()
        X = self.vectorizer.fit_transform(self.documents)

        self.lda = LatentDirichletAllocation(n_components=self.num_topics)
        self.lda.fit(X)

    def get_results_df(self):
        """ See the documentation of the parent class """
        results_df = pd.DataFrame()
        results_df['document'] = self.documents
        results_df['y_true'] = self.labels

        X = self.vectorizer.transform(self.documents)
        results_df['y_pred'] = self.lda.transform(X).argmax(axis=1)
        results_df['y_pred_highest_proba'] = self.lda.transform(X).max(axis=1)
        return results_df

    def get_output(self):
        """ See the documentation of the parent class """
        topics = []
        for topic in self.lda.components_:
            topic_words = []
            for i in topic.argsort()[-10:]:
                topic_words.append(self.vectorizer.get_feature_names_out()[i])
            topics.append(topic_words)

        return {
            "topics": topics,
            "topic-document-matrix": None,
            "topic-word-matrix": None,
            "test-topic-document-matrix": None
        }

In [4]:
class BERTopicModel(Model):

    def __init__(self, dataset_name: str) -> None:
        super().__init__(dataset_name)
        self.num_topics = len(set(self.labels)) + 1  # +1 for outliers

    def train(self):
        """ See the documentation of the parent class """
        self.bert_model = BERTopic(language="english", calculate_probabilities=True, nr_topics=self.num_topics)
        self.topics, self.probs = self.bert_model.fit_transform(self.documents)

    def get_results_df(self):
        """ See the documentation of the parent class """
        results_df = pd.DataFrame()
        results_df['document'] = self.documents
        results_df['y_true'] = self.labels

        results_df['y_pred'] = self.topics
        results_df['y_pred_highest_proba'] = np.max(self.probs, axis=1)

        relevant_results_df = results_df[results_df['y_pred'] != -1]
        return relevant_results_df
    
    def get_output(self):
        """ See the documentation of the parent class """
        return {
            "topics": [item for item in self.bert_model.get_topic_info()["Representation"]],
            "topic-document-matrix": self.probs.transpose(),
            "topic-word-matrix": self.bert_model.c_tf_idf_,
            "test-topic-document-matrix": self.probs.transpose()
        }

In [5]:
class GuidedLDAModel(Model):

    def __init__(self, dataset_name: str) -> None:
        super().__init__(dataset_name)

    def train(self) -> None:
        """ See the documentation of the parent class """
        self.texts = [document.split(' ') for document in self.documents]
        
        self.dictionary = corpora.Dictionary(self.texts)
        self.corpus = [self.dictionary.doc2bow(text) for text in self.texts]

        priors = {}
        for idx, keywords in enumerate(self.defined_keywords):
            for keyword in keywords:
                priors[keyword] = idx

        eta = np.full(shape=(self.num_topics, len(self.dictionary)), fill_value=1) # create a (ntopics, nterms) matrix and fill with 1
        for word, topic in priors.items(): # for each word in the list of priors
            keyindex = [index for index,term in self.dictionary.items() if term == word] # look up the word in the dictionary
            if (len(keyindex) > 0): # if it's in the dictionary
                eta[topic,keyindex[0]] = 1e7  # put a large number in there
        eta = np.divide(eta, eta.sum(axis=0)) # normalize so that the probabilities sum to 1 over all topics

        with (np.errstate(divide='ignore')):  # ignore divide-by-zero warnings
            self.model = gensim.models.ldamodel.LdaModel(
                corpus=self.corpus, id2word=self.dictionary, num_topics=self.num_topics,
                random_state=42, chunksize=100, eta=eta,
                eval_every=-1, update_every=1,
                passes=150, alpha='auto', per_word_topics=True)

    def get_results_df(self):
        """ See the documentation of the parent class """
        self.results_df = pd.DataFrame()
        self.results_df['document'] = self.documents
        self.results_df['y_true'] = self.documents_df['class_name'].tolist()

        scores = [[value[1] for value in score_values[0]] for score_values in self.model[self.corpus]]
        self.results_df['y_pred'] = [np.argmax(score) for score in scores]
        self.results_df['y_pred_highest_proba'] = [np.max(score) for score in scores]
        return self.results_df
    
    def get_output(self):
        """ See the documentation of the parent class """
        topics = [self.model.show_topic(topicid, topn=10) for topicid in range(self.num_topics)]
        topics = [[word for word, _ in topic] for topic in topics]

        return {
            "topics": topics,
            "topic-document-matrix": None,
            "topic-word-matrix": None,
            "test-topic-document-matrix": None
        }

In [6]:
class GuidedBERTopicModel(BERTopicModel):

    def __init__(self, dataset_name: str) -> None:
        super().__init__(dataset_name)

    def train(self) -> None:
        """ See the documentation of the parent class """
        umap = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', low_memory=False, random_state=0)
        self.bert_model = BERTopic(language="english", calculate_probabilities=True, nr_topics=self.num_topics, umap_model=umap, seed_topic_list=self.defined_keywords)
        self.topics, self.probs = self.bert_model.fit_transform(self.documents)

In [7]:
def compute_metrics_df(dataset_name: str, models: list, n_iterations:int=10, load_from_file:bool=False):
    """ Compute the metrics for the given datasets and models

    Args:
        dataset_name (str): The name of the dataset
        models (list): The list of models
        n_iterations (int, optional): The number of iterations. Defaults to 10.
        load_from_file (bool, optional): Whether to load the results from file or not. Defaults to False.

    Returns:
        pd.DataFrame: The metrics DataFrame
    """
    if load_from_file:
        return pd.read_csv(f'./results/{dataset_name}_metrics.csv')

    metrics_df = pd.DataFrame()
    idx = 0

    for model in models:
        avg_results = {}

        for i in range(n_iterations):
            print(f'Iteration {i+1}/{n_iterations} for {dataset_name} and {model.__name__}', end='\r')

            # -- Train model
            trained_model = model(dataset_name)
            trained_model.train()

            model_output = trained_model.get_output()

            # -- Evaluate model
            evaluator = Evaluator(model_output)
            results_df = trained_model.get_results_df()

            words_by_extracted_topics = trained_model.get_words_for_topics(results_df['y_pred'].tolist())
            words_by_class = trained_model.get_words_for_topics(results_df['y_true'].tolist())

            coherence = evaluator.compute_coherence()
            diversity = evaluator.compute_diversity()
            supervised_correlation = evaluator.compute_supervised_correlation(words_by_extracted_topics, words_by_class)

            # -- Average results
            for coherence_type, coherence_value in coherence.items():
                if coherence_type not in avg_results:
                    avg_results[f'coherence_{coherence_type}'] = []
                avg_results[f'coherence_{coherence_type}'].append(coherence_value)
            
            avg_results['diversity'] = avg_results.get('diversity', []) + [diversity]
            avg_results['supervised_correlation'] = avg_results.get('supervised_correlation', []) + [supervised_correlation]

        for key, value in avg_results.items():
            avg_results[key] = np.mean(value)

        metrics_results = {}
        metrics_results['dataset'] = dataset_name
        metrics_results['model'] = trained_model.__class__.__name__
        for key, value in avg_results.items():
            metrics_results[key] = [value]

        metrics_df = pd.concat([metrics_df, pd.DataFrame(metrics_results, index=[idx])])
        idx += 1

    metrics_df.to_csv(f'./results/{dataset_name}_metrics.csv', index=False)
    return metrics_df

In [8]:
models = [LDAModel, BERTopicModel, GuidedLDAModel, GuidedBERTopicModel]

## BBC News

In [9]:
bbc_news_metrics_df = compute_metrics_df(dataset_name='BBC_News', models=models, n_iterations=5, load_from_file=True)
print(bbc_news_metrics_df.to_latex(index=False))
bbc_news_metrics_df

\begin{tabular}{llrrrrrr}
\toprule
dataset & model & coherence_c_v & coherence_c_uci & coherence_c_npmi & coherence_u_mass & diversity & supervised_correlation \\
\midrule
BBC_News & LDAModel & 0.493955 & 0.279265 & 0.040366 & -2.368924 & 0.920000 & 0.760189 \\
BBC_News & BERTopicModel & 0.433929 & -0.419129 & 0.002830 & -2.499619 & 0.840000 & 0.627419 \\
BBC_News & GuidedLDAModel & 0.498162 & 0.082306 & 0.037583 & -2.235177 & 0.920000 & 0.588504 \\
BBC_News & GuidedBERTopicModel & 0.490397 & -0.188388 & 0.026225 & -2.232617 & 0.883333 & 0.922348 \\
\bottomrule
\end{tabular}



Unnamed: 0,dataset,model,coherence_c_v,coherence_c_uci,coherence_c_npmi,coherence_u_mass,diversity,supervised_correlation
0,BBC_News,LDAModel,0.493955,0.279265,0.040366,-2.368924,0.92,0.760189
1,BBC_News,BERTopicModel,0.433929,-0.419129,0.00283,-2.499619,0.84,0.627419
2,BBC_News,GuidedLDAModel,0.498162,0.082306,0.037583,-2.235177,0.92,0.588504
3,BBC_News,GuidedBERTopicModel,0.490397,-0.188388,0.026225,-2.232617,0.883333,0.922348


## 20NewsGroup

In [12]:
newsgroups_metrics_df = compute_metrics_df(dataset_name='20NewsGroup', models=models, n_iterations=5, load_from_file=False)
print(newsgroups_metrics_df.to_latex(index=False))
newsgroups_metrics_df

\begin{tabular}{llrrrrrr}roup and GuidedBERTopicModel
\toprule
dataset & model & coherence_c_v & coherence_c_uci & coherence_c_npmi & coherence_u_mass & diversity & supervised_correlation \\
\midrule
20NewsGroup & LDAModel & 0.603940 & 0.673476 & 0.096662 & -1.883472 & 0.761000 & 0.401317 \\
20NewsGroup & BERTopicModel & 0.472277 & 0.125097 & 0.053755 & -2.306378 & 0.803810 & 0.268840 \\
20NewsGroup & GuidedLDAModel & 0.462860 & -0.205873 & 0.025873 & -2.436411 & 0.985000 & 0.361511 \\
20NewsGroup & GuidedBERTopicModel & 0.528895 & 0.203075 & 0.071613 & -2.263779 & 0.814286 & 0.318357 \\
\bottomrule
\end{tabular}



Unnamed: 0,dataset,model,coherence_c_v,coherence_c_uci,coherence_c_npmi,coherence_u_mass,diversity,supervised_correlation
0,20NewsGroup,LDAModel,0.60394,0.673476,0.096662,-1.883472,0.761,0.401317
1,20NewsGroup,BERTopicModel,0.472277,0.125097,0.053755,-2.306378,0.80381,0.26884
2,20NewsGroup,GuidedLDAModel,0.46286,-0.205873,0.025873,-2.436411,0.985,0.361511
3,20NewsGroup,GuidedBERTopicModel,0.528895,0.203075,0.071613,-2.263779,0.814286,0.318357


## DBLP

In [None]:
dblp_metrics_df = compute_metrics_df(dataset_name='DBLP', models=models, n_iterations=5, load_from_file=False)
print(dblp_metrics_df.to_latex(index=False))
dblp_metrics_df

## M10

In [None]:
m10_metrics_df = compute_metrics_df(dataset_name='M10', models=models, n_iterations=5, load_from_file=False)
print(m10_metrics_df.to_latex(index=False))
m10_metrics_df