In [None]:
import octis
from octis.models.LDA import LDA
from octis.models.ProdLDA import ProdLDA
from octis.models.ETM import ETM
from octis.dataset.dataset import Dataset
from octis.evaluation_metrics.diversity_metrics import TopicDiversity
from octis.evaluation_metrics.coherence_metrics import Coherence
from octis.dataset.dataset import Dataset
import pandas as pd
import json
import os

In [None]:
def prepare_octis_corpus(output_folder, docs):
    """Write the corpus and vocabulary files for an OCTIS custom dataset.

    Two files are created inside ``output_folder``:

    * ``corpus.tsv`` -- one document per line, tokens separated by a single
      space.
    * ``vocab.json`` -- a sorted JSON list of the distinct whitespace-separated
      tokens that are purely alphabetic and longer than two characters.

    Args:
        output_folder: Directory to write into; it is created when missing.
        docs: Iterable of documents, each a string of whitespace-separated
            tokens.

    Returns:
        None. The function is called for its side effect of writing the files.

    NOTE(review): the OCTIS documentation describes the vocabulary file of a
    custom dataset as ``vocabulary.txt`` (one word per line) -- confirm that
    ``vocab.json`` is actually picked up when the folder is loaded.
    """
    # On a fresh checkout the folder does not exist yet and open() would fail.
    os.makedirs(output_folder, exist_ok=True)

    # Split once and reuse: this also lets `docs` be a one-shot iterable.
    tokenized_docs = [doc.split() for doc in docs]

    # Write corpus.tsv. Re-joining the tokens with single spaces removes any
    # tab or newline inside a document, so every document stays on exactly one
    # line and in exactly one TSV column.
    with open(os.path.join(output_folder, "corpus.tsv"), "w", encoding="utf-8") as f:
        for tokens in tokenized_docs:
            f.write(" ".join(tokens) + "\n")

    # Build the vocabulary from all tokens, keeping alphabetic words of 3+ chars.
    vocab = set()
    for tokens in tokenized_docs:
        vocab.update(tokens)
    vocab = sorted(w for w in vocab if w.isalpha() and len(w) > 2)  # sorted for stable output

    # Save vocab.json
    with open(os.path.join(output_folder, "vocab.json"), "w", encoding="utf-8") as f:
        json.dump(vocab, f)

In [None]:
def evaluate_metrics(output, topk=10, measure='c_npmi', texts=None):
    """Score a trained topic model's output for coherence and diversity.

    Args:
        output: Result returned by a model's ``train_model`` call; it is passed
            unchanged to each metric's ``score`` method.
        topk: Number of words per topic considered by both metrics.
        measure: Coherence measure name handed to ``Coherence``.
        texts: Tokenized reference corpus for the coherence metric. When
            omitted, the notebook-level ``dataset`` is used, so that cell must
            have been run first.

    Returns:
        dict with the keys ``'coherence_score'`` and ``'diversity_score'``.
    """
    if texts is None:
        # Backward-compatible fallback for existing call sites, which rely on
        # the global `dataset` created further down the notebook.
        texts = dataset.get_corpus()

    diversity_metric = TopicDiversity(topk=topk)  # Initialize metric
    diversity_score = diversity_metric.score(output)  # Compute score of the metric

    # Pass the same topk to the coherence metric so both scores are computed
    # over the same number of words per topic.
    coherence_metric = Coherence(texts=texts, topk=topk, measure=measure)
    coherence_score = coherence_metric.score(output)  # Compute score of the metric

    return {'coherence_score': coherence_score, 'diversity_score': diversity_score}

In [None]:
# Read the ChiLit paragraph table, replace missing values with empty strings,
# and keep only the rows whose 'tokens' column is non-empty.
df_chilit = (
    pd.read_csv("./data/ChiLit_Paragraphs.csv")
    .fillna("")
    .loc[lambda frame: frame['tokens'] != '']
)

In [None]:
# Preview the first rows of the filtered frame as a sanity check.
df_chilit.head()

In [None]:
# Folder the OCTIS corpus files are written to and the dataset is later loaded from.
octis_folder = "./octis/"

### Create OCTIS dataset

In [None]:

# Collect the 'tokens' column as a plain list of documents. Taking the column
# directly avoids building a Series for every row, which iterrows() does.
docs = df_chilit["tokens"].tolist()

prepare_octis_corpus(octis_folder, docs)

In [None]:
# Load the files written to octis_folder into an OCTIS Dataset; the models and
# evaluate_metrics() below all use this `dataset` object.
dataset = Dataset()
dataset.load_custom_dataset_from_folder(octis_folder)

### Test models with default settings

#### LDA

In [None]:
# Train LDA with 20 topics and 10 passes; `output` is the result that the next
# cells print topics from and score.
# NOTE(review): no seed is set here, so results may differ between runs --
# consider passing a fixed random_state if reproducibility matters.
model = LDA(num_topics=20, passes=10)
output = model.train_model(dataset)

In [None]:
# Show every topic as one space-separated line of its words.
for topic_words in output['topics']:
    line = " ".join(topic_words)
    print(line)

In [None]:
# Coherence and diversity for the LDA output, using the function's default
# topk and measure.
scores = evaluate_metrics(output)
print(scores)

#### ProdLDA

In [None]:
# Train ProdLDA with 20 topics; this rebinds `model` and `output`, replacing
# the LDA results from the cells above.
# NOTE(review): use_partitions=False presumably trains on the whole corpus
# instead of a train/validation/test split -- confirm against the OCTIS docs.
model = ProdLDA(num_topics=20, use_partitions=False)
output = model.train_model(dataset)

In [None]:
# Show every topic as one space-separated line of its words.
for topic_words in output['topics']:
    line = " ".join(topic_words)
    print(line)

In [None]:
# Coherence and diversity for the ProdLDA output, using the function's default
# topk and measure.
scores = evaluate_metrics(output)
print(scores)

#### ETM

In [None]:
# Train ETM; this rebinds `model` and `output`, replacing the ProdLDA results.
# NOTE(review): 15 topics here versus 20 for LDA and ProdLDA above -- confirm
# this is intentional, since the scores are then not directly comparable.
# NOTE(review): embeddings_path points to a .txt file -- confirm that ETM's
# defaults for the embeddings file format and for training embeddings match
# this file, otherwise the pretrained vectors may not be used as intended.
# NOTE(review): device='gpu' -- confirm the behaviour on machines without CUDA.
model = ETM(num_topics=15, use_partitions=False, device = 'gpu', embeddings_path='./data/chilit-19th-century-averaged-embeddings.txt')
output = model.train_model(dataset)

In [None]:
# Show every topic as one space-separated line of its words.
for topic_words in output['topics']:
    line = " ".join(topic_words)
    print(line)

In [None]:
# Coherence and diversity for the ETM output, using the function's default
# topk and measure.
scores = evaluate_metrics(output)
print(scores)